This adds an exception to holding o_valid low, when the ALU is idle.
If a write to the ALU just occurred, allow o_valid to become high, in
the same cycle.
*.il
**/*.gtkw
.eggs
-
+formal_test_temp
.vscode/*
build
gen
.noseids
nosetests.xml
+test-out
- ccache
- .cache/pip
- apt-cache
+ when: 'always'
variables:
PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
GIT_SUBMODULE_STRATEGY: recursive
+ GIT_DEPTH: "500"
build:
stage: build
- apt-get -o dir::cache::archives="$(pwd)/apt-cache" update
- >-
apt-get -o dir::cache::archives="$(pwd)/apt-cache" -y install
- build-essential git python3-dev python3-pip
- python3-setuptools python3-wheel pkg-config tcl-dev
- libreadline-dev bison flex libffi-dev ccache python3-venv
- binutils-powerpc64-linux-gnu binutils-powerpc64le-linux-gnu
- autoconf gperf libgmp-dev libmpfr-dev libssl-dev curl
+ build-essential
+ git
+ python3-dev
+ python3-pip
+ python3-setuptools
+ python3-setuptools-scm
+ python3-wheel
+ pkg-config
+ tcl-dev
+ libreadline-dev
+ bison
+ flex
+ libffi-dev
+ ccache
+ python3-venv
+ binutils-powerpc64-linux-gnu
+ binutils-powerpc64le-linux-gnu
+ autoconf
+ gperf
+ libgmp-dev
+ libmpfr-dev
+ libssl-dev
+ curl
- export PATH="/usr/lib/ccache:$PATH"
- export CCACHE_BASEDIR="$PWD"
- export CCACHE_DIR="$PWD/ccache"
- ccache --show-stats || true
- curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
- source $HOME/.cargo/env
- after_script:
- - export CCACHE_DIR="$PWD/ccache"
- - ccache --show-stats
script:
- - python3 -m venv .env
+ - python3 -m venv --system-site-packages .env
- . .env/bin/activate
- - pip install nose
+ - pip install pytest-xdist==3.3.1 pytest==7.3.1
+
+ - git clone --depth 1 -b v0.1.1 https://github.com/cocotb/cocotb-bus.git cocotb-bus
+ - pushd cocotb-bus
+ - pip install . --no-deps
+ - popd
+
+ - git clone --depth 1 -b v1.5.2 https://github.com/cocotb/cocotb.git cocotb
+ - pushd cocotb
+ - pip install .
+ - popd
+
+ - git clone --depth 1 https://git.libre-soc.org/git/pytest-output-to-files.git pytest-output-to-files
+ - pushd pytest-output-to-files
+ - git rev-parse HEAD
+ - python3 setup.py develop
+ - popd
- - git clone --depth 1 https://github.com/SRI-CSL/yices2.git yices2
+ - git clone --depth 1 -b Yices-2.6.4 https://github.com/SRI-CSL/yices2.git yices2
- pushd yices2
- autoconf
- ./configure
- - make -j$(nproc) > /dev/null
+ - make -j$(nproc)
- make install
- popd
- - git clone --depth 1 https://github.com/YosysHQ/yosys.git yosys
+ - git clone --depth 1 -b yosys-0.17 https://github.com/YosysHQ/yosys.git yosys
- pushd yosys
- make config-gcc
- - make -j$(nproc) > /dev/null
+ - make -j$(nproc)
- make install
- popd
- yosys -V
- - git clone --depth 1 https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
+ - git clone https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
- pushd SymbiYosys
- - make install > /dev/null
+ - git checkout d10e472edf4ea9be3aa6347b264ba575fbea933a
+ - make install
- popd
- - git clone --depth 1 https://github.com/nmigen/nmigen.git nmigen
+ - git clone --depth 1 https://gitlab.com/nmigen/nmigen.git nmigen
- pushd nmigen
- - python setup.py develop
+ - git rev-parse HEAD
+ - python3 setup.py develop
+ - popd
+
+ - git clone --depth 1 https://git.libre-soc.org/git/mdis.git mdis
+ - pushd mdis
+ - git rev-parse HEAD
+ - python3 setup.py develop
- popd
- git clone --depth 1 https://git.libre-soc.org/git/nmutil.git nmutil
- pushd nmutil
- - python setup.py develop
+ - git rev-parse HEAD
+ - python3 setup.py develop
- popd
- git clone --depth 1 https://git.libre-soc.org/git/nmigen-soc.git nmigen-soc
- git clone --depth 1 https://git.libre-soc.org/git/openpower-isa.git openpower-isa
- pushd openpower-isa
- python3 setup.py develop
- - make -j$(nproc) svanalysis > /dev/null
- - make -j$(nproc) pyfnwriter > /dev/null 2>&1
- - make -j$(nproc) pywriter > /dev/null 2>&1
+ - if ! out="$(make 2>&1)"; then echo "$out"; exit 1; fi
- popd
- git clone --depth 1 https://git.libre-soc.org/git/c4m-jtag.git c4m-jtag
- popd
- IEEE754FPU_PATH="$(pwd)"/ieee754fpu
- - git clone --depth 1 --recursive https://github.com/billzorn/sfpy.git sfpy
+ - git clone --depth 1 --recursive -b v0.6.0 https://github.com/billzorn/sfpy.git sfpy
- pushd sfpy
+ - git apply "$IEEE754FPU_PATH"/sfpy.patch
- pushd berkeley-softfloat-3
- git apply "$IEEE754FPU_PATH"/berkeley-softfloat.patch
- popd
- git apply ../softposit_sfpy_build.patch
- git apply "$IEEE754FPU_PATH"/SoftPosit.patch
- popd
- - pip install --upgrade -r requirements.txt
+ - pip install -r requirements.txt
- make lib -j$(nproc)
- make cython -j$(nproc)
- make wheel -j$(nproc)
- - pip install dist/sfpy*.whl
+ - pip install --force-reinstall dist/sfpy*.whl
- popd
- - cargo install maturin
+ - python3 -m pip install 'maturin>=0.11,<0.12'
- git clone --depth 1 https://git.libre-soc.org/git/power-instruction-analyzer.git pia
- pushd pia
- maturin build --cargo-extra-args=--features=python-extension
- popd
- python setup.py develop
- - nosetests -v --processes=-1 --process-timeout=120
+ - SILENCELOG='!*,default' pytest -v --maxfail=20
cp pinmux/ls180/ls180_pins.py src/soc/debug
cp pinmux/ls180/ls180_pins.py src/soc/litex/florent/libresoc
-install: gitupdate develop mkpinmux svanalysis
+install: gitupdate develop mkpinmux
# this is now actually part of openpower-isa repository
pywriter:
- pywriter
+ echo "pywriter is part of openpower-isa, run that instead"
# this is now actually part of openpower-isa repository
svanalysis:
- svanalysis
+ echo "sv_analysis is part of openpower-isa, run that instead"
develop:
python3 setup.py develop # yes, develop, not install
--enable-xics --enable-sram4x4kblock --disable-svp64 \
src/soc/litex/florent/libresoc/libresoc.v
+# build microwatt "external core", note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat --enable-mmu \
+ external_core_top.v
+
+# build microwatt "external core" with fixed 64-bit width SVP64
+# note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core_svp64:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat-svp64 --enable-mmu \
+ external_core_top.v
+
+microwatt_external_core_spi:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+ --small-cache \
+ --enable-mmu \
+ --pc-reset 0x10000000 \
+ external_core_top.v
+
+# microwatt-compatible core with smaller cache size (quick. VERSA_ECP5. just)
+microwatt_external_core_bram:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+ --small-cache \
+ --enable-mmu \
+ --pc-reset 0xFF000000 \
+ external_core_top.v
+
+# microwatt-compatible core with larger cache size (experiment on arty)
+microwatt_external_core_bram_arty:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+ --enable-mmu \
+ --pc-reset 0xFF000000 \
+ external_core_top.v
+
# build the litex libresoc SoC without 4k SRAMs
ls180_verilog_build: ls180_verilog
make -C soc/soc/litex/florent ls180
'sphinx.ext.coverage',
'recommonmark',
#'symbolator_sphinx',
- 'sphinxcontrib_verilog_diagrams',
+ #'sphinxcontrib_verilog_diagrams', # XXX now spinxcontrib-hdl-diagrams
'sphinx_rtd_theme',
#'sphinx_tabs.tabs',
]
--- /dev/null
+{
+ "nodes": {
+ "c4m-jtag": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1619101523,
+ "narHash": "sha256-y1OY8URcE1lnu5L7IDFcJ8zT8sqlrfMP9VPNmVvACGk=",
+ "ref": "master",
+ "rev": "c2bf4810f9f91ced7fcda777b92b86ab353da288",
+ "revCount": 146,
+ "type": "git",
+ "url": "https://git.libre-soc.org/git/c4m-jtag.git"
+ },
+ "original": {
+ "type": "git",
+ "url": "https://git.libre-soc.org/git/c4m-jtag.git"
+ }
+ },
+ "migen": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1631614362,
+ "narHash": "sha256-BgYf4e7O/rbS5P1ZpDlcgCEUh2h2vK3FyHADdzyaMg0=",
+ "owner": "m-labs",
+ "repo": "migen",
+ "rev": "7bc4eb1387b39159a74c1dbd1b820728e0bfbbaa",
+ "type": "github"
+ },
+ "original": {
+ "owner": "m-labs",
+ "repo": "migen",
+ "type": "github"
+ }
+ },
+ "nix-litex": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1632150297,
+ "narHash": "sha256-ghlAJBZxLVkQB+9tXEOBOF1FfdT5Pn4292khF4iKCNA=",
+ "ref": "main",
+ "rev": "5ab6984eb1efad0c91d808c9b7b79e00e50ccc05",
+ "revCount": 31,
+ "type": "git",
+ "url": "https://git.sr.ht/~lschuermann/nix-litex"
+ },
+ "original": {
+ "ref": "main",
+ "type": "git",
+ "url": "https://git.sr.ht/~lschuermann/nix-litex"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1631723418,
+ "narHash": "sha256-Sbey1S81fXUKcEHVCMwlXMju/IoCQxMwP1PPkVYpGrc=",
+ "owner": "L-as",
+ "repo": "nixpkgs",
+ "rev": "8bfc1026477692b933df6eeec27bd494cac3e436",
+ "type": "github"
+ },
+ "original": {
+ "owner": "L-as",
+ "ref": "libresoc",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "nmigen": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1618220900,
+ "narHash": "sha256-Ol2SMZLUTikZWDLmK7F5lZuKBfGO71WmisATPNMTpHQ=",
+ "ref": "master",
+ "rev": "d824795c2c7cb43dcbc8ed8fac6d309d77284913",
+ "revCount": 1056,
+ "type": "git",
+ "url": "https://git.libre-soc.org/git/nmigen.git"
+ },
+ "original": {
+ "type": "git",
+ "url": "https://git.libre-soc.org/git/nmigen.git"
+ }
+ },
+ "nmigen-soc": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1601572554,
+ "narHash": "sha256-v9SH+KuIPydXCr363RUsMg9/tabuu+GjKPJOKq2Jze0=",
+ "ref": "master",
+ "rev": "692017c7eaf21ff37302790c4422db6bd08667be",
+ "revCount": 48,
+ "type": "git",
+ "url": "https://git.libre-soc.org/git/nmigen-soc.git"
+ },
+ "original": {
+ "type": "git",
+ "url": "https://git.libre-soc.org/git/nmigen-soc.git"
+ }
+ },
+ "root": {
+ "inputs": {
+ "c4m-jtag": "c4m-jtag",
+ "migen": "migen",
+ "nix-litex": "nix-litex",
+ "nixpkgs": "nixpkgs",
+ "nmigen": "nmigen",
+ "nmigen-soc": "nmigen-soc",
+ "yosys": "yosys"
+ }
+ },
+ "yosys": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1617979565,
+ "narHash": "sha256-M8ppe+lL/pgd2sXh7bM6/sbk1099KKECeWA5mXtqE6Y=",
+ "owner": "YosysHQ",
+ "repo": "yosys",
+ "rev": "a58571d0fe8971cb7d3a619a31b2c21be6d75bac",
+ "type": "github"
+ },
+ "original": {
+ "owner": "YosysHQ",
+ "repo": "yosys",
+ "rev": "a58571d0fe8971cb7d3a619a31b2c21be6d75bac",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
--- /dev/null
+{
+ description = "FOSS CPU/GPU/VPU/SoC all in one, see https://libre-soc.org/";
+
+ inputs.nixpkgs.url = "github:L-as/nixpkgs?ref=libresoc"; # for alliance and migen
+ inputs.c4m-jtag.url = "git+https://git.libre-soc.org/git/c4m-jtag.git";
+ inputs.c4m-jtag.flake = false;
+ inputs.nmigen.url = "git+https://git.libre-soc.org/git/nmigen.git";
+ inputs.nmigen.flake = false;
+ inputs.nmigen-soc.url = "git+https://git.libre-soc.org/git/nmigen-soc.git";
+ inputs.nmigen-soc.flake = false;
+ inputs.migen.url = "github:m-labs/migen";
+ inputs.migen.flake = false;
+ inputs.yosys.url = "github:YosysHQ/yosys?rev=a58571d0fe8971cb7d3a619a31b2c21be6d75bac";
+ inputs.yosys.flake = false;
+ # submodules needed
+ inputs.nix-litex.url = "git+https://git.sr.ht/~lschuermann/nix-litex?ref=main";
+ inputs.nix-litex.flake = false;
+
+ outputs = { self, nixpkgs, c4m-jtag, nmigen, nmigen-soc, nix-litex, migen, yosys }:
+ let
+ getv = x: builtins.substring 0 8 x.lastModifiedDate;
+
+ supportedSystems = [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];
+
+ forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
+
+ litex = pkgs: import "${nix-litex}/pkgs" {
+ inherit pkgs;
+ pkgMetas = builtins.fromTOML (builtins.readFile ./nix/litex.toml);
+ skipChecks = true; # FIXME: remove once checks work
+ };
+
+ nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; overlays = [ self.overlay ]; });
+
+ lib = nixpkgs.lib;
+ in
+ {
+ overlay = final: prev: {
+ python37 = prev.python37.override {
+ packageOverrides = lib.composeExtensions (litex final).pythonOverlay (pfinal: pprev: {
+ libresoc-ieee754fpu = pfinal.callPackage ./nix/ieee754fpu.nix {};
+ libresoc-openpower-isa = pfinal.callPackage ./nix/openpower-isa.nix {};
+ c4m-jtag = pfinal.callPackage (import ./nix/c4m-jtag.nix { src = c4m-jtag; version = getv c4m-jtag; }) {};
+ bigfloat = pfinal.callPackage ./nix/bigfloat.nix {};
+ modgrammar = pfinal.callPackage ./nix/modgrammar.nix {};
+ libresoc-nmutil = pfinal.callPackage ./nix/nmutil.nix {};
+ libresoc-soc = pfinal.callPackage (import ./nix/soc.nix { version = getv self; }) {};
+
+ nmigen-soc = pprev.nmigen-soc.overrideAttrs (_: {
+ doCheck = false;
+ src = nmigen-soc;
+ setuptoolsCheckPhase = "true";
+ });
+
+ nmigen = pprev.nmigen.overrideAttrs (_: {
+ src = nmigen;
+ });
+
+ migen = pprev.migen.overrideAttrs (_: {
+ src = migen;
+ });
+ });
+ };
+
+ yosys = prev.yosys.overrideAttrs (_: {
+ version = "0.9+4052";
+ src = yosys;
+ });
+
+ libresoc-verilog = final.callPackage (import ./nix/verilog.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+ libresoc-ls180 = final.callPackage (import ./nix/ls180.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+ libresoc-ecp5 = final.callPackage (import ./nix/ecp5.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+ libresoc-ecp5-program = final.callPackage (import ./nix/ecp5-program.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+ libresoc-pinmux = final.callPackage (import ./nix/pinmux.nix { version = getv self; }) {};
+ };
+
+ apps = forAllSystems (system: {
+ ecp5 = {
+ type = "app";
+ program = "${nixpkgsFor.${system}.libresoc-ecp5-program}";
+ };
+ });
+ defaultApp = forAllSystems (system: self.apps.${system}.ecp5);
+
+ packages = forAllSystems (system: {
+ soc = nixpkgsFor.${system}.python37Packages.libresoc-soc;
+ verilog = nixpkgsFor.${system}.libresoc-verilog;
+ pinmux = nixpkgsFor.${system}.libresoc-pinmux;
+ ls180 = nixpkgsFor.${system}.libresoc-ls180;
+ ecp5 = nixpkgsFor.${system}.libresoc-ecp5;
+ ecp5-program = nixpkgsFor.${system}.libresoc-ecp5-program;
+ openpower-isa = nixpkgsFor.${system}.python37Packages.libresoc-openpower-isa;
+ debugNixpkgs = nixpkgsFor.${system};
+ });
+
+ defaultPackage = forAllSystems (system: self.packages.${system}.verilog);
+ };
+}
#!/bin/sh
cd pinmux
python2 src/pinmux_generator.py -v -s ls180 -o ls180
+# temporary - return to older version of pinmux
+#python2 src/pinmux_generator.py -v -s ngi_router -o ngi_router
--- /dev/null
+{ lib, buildPythonPackage, fetchPypi, gmp, mpfr, six }:
+
+buildPythonPackage rec {
+ pname = "bigfloat";
+ version = "0.4.0";
+
+ buildInputs = [ gmp mpfr ];
+ propagatedBuildInputs = [ six ];
+
+ src = fetchPypi {
+ inherit pname version;
+ sha256 = "WLlr3ocqylmJ0T2C66Os8qoblOIhF91yoWulkRsMDLg=";
+ };
+
+ doCheck = false;
+
+ meta = with lib; {
+ homepage = "https://pypi.org/project/bigfloat/";
+ license = licenses.lgpl3Plus;
+ };
+}
--- /dev/null
+{ version, src }:
+
+{ lib, python, buildPythonPackage, nmigen-soc, nmigen, modgrammar, setuptools-scm }:
+
+buildPythonPackage {
+ pname = "c4m-jtag";
+ inherit src version;
+
+ nativeBuildInputs = [ setuptools-scm ];
+ propagatedBuildInputs = [ nmigen-soc nmigen modgrammar ];
+
+ doCheck = false;
+
+ pythonImportsCheck = [ "c4m.nmigen.jtag.tap" ];
+
+ prePatch = ''
+ export SETUPTOOLS_SCM_PRETEND_VERSION=${version}
+ '';
+
+ meta = with lib; {
+ homepage = "https://pypi.org/project/libresoc-openpower-isa/";
+ license = licenses.lgpl3Plus;
+ };
+}
--- /dev/null
+{ version }:
+
+{ writeShellScript, openocd, python3Packages, libresoc-ecp5, nextpnr, trellis }:
+
+let
+ pythonWithEnv = python3Packages.python.withPackages (ps: with ps; [
+ requests migen libresoc-soc litex-boards litex litedram liteeth liteiclink litescope litesdcard
+ ]);
+in
+writeShellScript "program-ecp5-libresoc" ''
+ export PATH="${openocd}/bin:${pythonWithEnv}/bin:${trellis}/bin:${nextpnr}/bin:$PATH"
+
+ dir="$(mktemp -d)"
+ pushd "$dir"
+ echo "$dir"
+
+ export PYTHONPATH="${../src/soc/litex/florent}:$PYTHONPATH"
+
+ python ${../src/soc/litex/florent/versa_ecp5.py} --sys-clk-freq=55e6 --load-from ${libresoc-ecp5}
+
+ popd
+ rm -rf "$dir"
+ exit 0
+''
--- /dev/null
+{ version }:
+
+{ stdenv, python3Packages, yosys, libresoc-verilog, libresoc-pinmux, pkgsCross
+, nextpnr, trellis }:
+
+stdenv.mkDerivation {
+ pname = "libresoc-versa-ecp5.v";
+ inherit version;
+
+ src = ../src/soc/litex/florent;
+
+ nativeBuildInputs =
+ (with python3Packages; [
+ python libresoc-soc litex-boards litex litedram liteeth liteiclink litescope litesdcard
+ ])
+ ++ [ trellis nextpnr pkgsCross.powernv.buildPackages.gcc ];
+
+ postPatch = ''
+ patchShebangs --build .
+ '';
+
+ configurePhase = "true";
+
+ buildPhase = ''
+ runHook preBuild
+ export PINMUX="$(mktemp -d)"
+ ln -s ${libresoc-pinmux} "$PINMUX/ls180"
+ cp ${libresoc-verilog} libresoc/libresoc.v
+ ./versa_ecp5.py --sys-clk-freq=55e6 --build
+ runHook postBuild
+ '';
+
+ installPhase = ''
+ runHook preInstall
+ mv /build/florent/build/versa_ecp5/gateware/versa_ecp5.svf $out
+ runHook postInstall
+ '';
+
+ fixupPhase = "true";
+}
--- /dev/null
+{ lib, buildPythonPackage, libresoc-nmutil, bigfloat, fetchgit }:
+
+buildPythonPackage {
+ pname = "libresoc-ieee754fpu";
+ version = "unstable-2021-06-05";
+
+ src = fetchgit {
+ url = "https://git.libre-soc.org/git/ieee754fpu.git";
+ rev = "c62fa3a7ee95832587d7725729dcdb9a002ae015";
+ sha256 = "wbr1vGFzUlUtBT6IcRsykADYeksiVoq/LacU/dbRQ0o=";
+ };
+
+ propagatedBuildInputs = [ libresoc-nmutil bigfloat ];
+
+ doCheck = false;
+
+ prePatch = ''
+ touch ./src/ieee754/part/__init__.py
+ '';
+
+ pythonImportsCheck = [ "ieee754.part" ];
+
+ meta = with lib; {
+ homepage = "https://pypi.org/project/libresoc-ieee754fpu/";
+ license = licenses.lgpl3Plus;
+ };
+}
--- /dev/null
+[litex]
+github_user = "enjoy-digital"
+github_repo = "litex"
+git_revision = "42d8fc226a4f4e8dfef104257a95f98eb9b10da7"
+github_archive_nix_hash = "16zb7mci2a09jc5bbr4342pn95iyl84705n566alpx696xk2l0zr"
+
+[litex-boards]
+github_user = "litex-hub"
+github_repo = "litex-boards"
+git_revision = "1781be166aee867421e0d943f6a62c3397524563"
+github_archive_nix_hash = "0ar41ibs6si03iyhcjn3blw1rkdsazn5rsa95ph8v061kg2yjbjh"
+
+[liteeth]
+github_user = "enjoy-digital"
+github_repo = "liteeth"
+git_revision = "64b85e621e740b9b7a9bdb03749758c703fea6e1"
+github_archive_nix_hash = "1gbscl36n6mgaz1y1b27nzhykrhrccl6ls5vp7dd6divpqdf328i"
+
+[litedram]
+github_user = "enjoy-digital"
+github_repo = "litedram"
+git_revision = "ac825e51124e926c67455292cd2b949954fc6f65"
+github_archive_nix_hash = "1acs4kgbsv8pgml1q7709afh46f8mpy8b1nw0p9n8a1zih8ang1r"
+
+[litehyperbus]
+github_user = "litex-hub"
+github_repo = "litehyperbus"
+git_revision = "c4b64d2c992cedf3e03ffdf87f389feb5ddfff52"
+github_archive_nix_hash = "1iwjwzz4wa9zzm6yqa7rkag9igmsawp8wpmkj6fqia20b7xjglnb"
+
+[liteiclink]
+github_user = "enjoy-digital"
+github_repo = "liteiclink"
+git_revision = "efd200fa9e625144131a310fc09fd1fecf1682e6"
+github_archive_nix_hash = "0g643ryfzc6iq0p80rhq116n5w6mh4fv4yg4adyy5i1vy2grlg8s"
+
+[litepcie]
+github_user = "enjoy-digital"
+github_repo = "litepcie"
+git_revision = "0718fd135fc30e0a3598eaf66ce2fcb54b62193c"
+github_archive_nix_hash = "1m3i4hv49438ik4qhdp7rx9nan5rddrqp7nzvya9xfbh7lfc59hl"
+
+[litescope]
+github_user = "enjoy-digital"
+github_repo = "litescope"
+git_revision = "2739d5a069386c8e834c7f660dce9f93dc2b4598"
+github_archive_nix_hash = "08r7dzlmlfs9pmfz4xkf61sal5zy3caby88bcb4993c43nzpw8a3"
+
+[litesdcard]
+github_user = "enjoy-digital"
+github_repo = "litesdcard"
+git_revision = "edee2467fcabc62c4b34e3daa2271a71e52ba09f"
+github_archive_nix_hash = "0n5x9cx61xij0hc61slabxa05pzmw8i5fyg54ydmxi2fl2p5p0rs"
+
+[litespi]
+github_user = "litex-hub"
+github_repo = "litespi"
+git_revision = "c0730ebdb3c976618bf24e9ec04911e7c9934adf"
+github_archive_nix_hash = "015irjdpii514aj4av02pglvvq0wgxkplyy09435crzy9j5i5v04"
+
+[pythondata-misc-tapcfg]
+github_user = "litex-hub"
+github_repo = "pythondata-misc-tapcfg"
+git_revision = "25c97a4a9ff9af85248028fe01e2c65b2e3640ee"
+github_archive_nix_hash = "0zr6d5giqzsjmqpfyf1b25r0y70bj09xjbfinfxcdc6s8cwwwz71"
+
+[pythondata-software-compiler_rt]
+github_user = "litex-hub"
+github_repo = "pythondata-software-compiler_rt"
+git_revision = "7cfcaed2e726027fd622650b58dd77e47c495ee0"
+github_archive_nix_hash = "0b65dj95418j4pjqqkqjq5npnn1ih1789ba9575kxcljgj7r8xb7"
+
+[pythondata-cpu-serv]
+github_user = "litex-hub"
+github_repo = "pythondata-cpu-serv"
+git_revision = "915cdf793395ab48cc52c0225660eb6eeff41133"
+github_archive_nix_hash = "1ndkjhh7r521cc9g63pmjvgvv9sa3s8n2mkdli91nr7ns3q3lxmk"
+
+[litevideo]
+github_user = "enjoy-digital"
+github_repo = "litevideo"
+git_revision = "41f30143075ece3fff5c33a332ed067d1837cbb3"
+github_archive_nix_hash = "06vw4rn8xby8is13275bmkrxlwp3wlznbdqfay78a5m8bp73kypy"
+
+[valentyusb-hw_cdc_eptri]
+github_user = "litex-hub"
+github_repo = "valentyusb"
+git_revision = "a0526ad053c394306ad7a585a7ddd463831ad09d"
+github_archive_nix_hash = "0nad2x5j5rnjyciwm0abxhzng8nrv06ri8g9qdi39zk8n9zy7cmf"
--- /dev/null
+{ version }:
+
+{ stdenv, python3Packages, yosys, libresoc-verilog, libresoc-pinmux, pkgsCross }:
+
+stdenv.mkDerivation {
+ pname = "libresoc-ls1804k";
+ inherit version;
+
+ src = ../src/soc/litex/florent;
+
+ nativeBuildInputs =
+ (with python3Packages; [
+ python libresoc-soc litex litedram liteeth liteiclink litescope litesdcard
+ ])
+ ++ [ pkgsCross.powernv.buildPackages.gcc ];
+
+ postPatch = ''
+ patchShebangs --build .
+ '';
+
+ configurePhase = "true";
+
+ buildPhase = ''
+ runHook preBuild
+ export PINMUX="$(mktemp -d)"
+ ln -s ${libresoc-pinmux} "$PINMUX/ls180"
+ cp ${libresoc-verilog} libresoc/libresoc.v
+ ./ls180soc.py --build --platform=ls180sram4k --num-srams=2 --srams4k
+ runHook postBuild
+ '';
+
+ installPhase = ''
+ runHook preInstall
+ mkdir $out
+ mv build/ls180sram4k/gateware/ls180sram4k.v $out/ls180.v
+ mv build/ls180sram4k/gateware/mem.init $out
+ mv build/ls180sram4k/gateware/mem_1.init $out
+ mv libresoc/libresoc.v $out
+ mv libresoc/SPBlock_512W64B8W.v $out
+ runHook postInstall
+ '';
+
+ fixupPhase = "true";
+}
--- /dev/null
+{ lib, buildPythonPackage, fetchFromGitHub }:
+
+buildPythonPackage rec {
+ pname = "modgrammar";
+ version = "unstable-2020-09-20";
+
+ src = fetchFromGitHub {
+ owner = "bloerwald";
+ repo = "modgrammar";
+ rev = "d363ad5a86584e560a8b03cbe11c0168d7610691";
+ sha256 = "SO2qjfEVaJfgbA5HLJYwXlaeUzt5EFoljYQ2SsdDCbc=";
+ };
+
+ doCheck = false;
+
+ meta = with lib; {
+ homepage = "https://pypi.org/project/modgrammar/";
+ # license = licenses.bsd; # FIXME: Which BSD?
+ };
+}
--- /dev/null
+{ lib, buildPythonPackage, bigfloat, fetchgit, pyvcd }:
+
+buildPythonPackage {
+ pname = "libresoc-nmutil";
+ version = "unstable-2021-08-24";
+
+ propagatedBuildInputs = [ pyvcd ];
+
+ src = fetchgit {
+ url = "https://git.libre-soc.org/git/nmutil.git";
+ rev = "efda080db6978d249a23003bec734f1cc07de329";
+ sha256 = "nTgUiZc4CC0VoUND29kHSIyMlP9IB3oZfehutoNK07w=";
+ };
+
+ doCheck = false;
+
+ meta = with lib; {
+ homepage = "https://pypi.org/project/libresoc-ieee754fpu/";
+ license = licenses.lgpl3Plus;
+ };
+}
--- /dev/null
+{ lib, python, buildPythonPackage, fetchgit, libresoc-nmutil, astor, nmigen, ply, pygdbmi }:
+
+buildPythonPackage {
+ pname = "libresoc-openpower-isa";
+ version = "unstable-2021-09-04";
+
+ src = fetchgit {
+ url = "https://git.libre-soc.org/git/openpower-isa.git";
+ rev = "6e43a194f3d07ed5a8daa297187a32746c4c4d3c";
+ sha256 = "0EekUouTQruTXGO5jlPJtqh0DOudghILy0nca5eaZz8=";
+ };
+
+ propagatedBuildInputs = [ libresoc-nmutil astor nmigen ply pygdbmi ];
+
+ doCheck = false;
+
+ prePatch = ''
+ touch ./src/openpower/sv/__init__.py # TODO: fix upstream
+ '';
+
+ postInstall = ''
+ cp -rT ./openpower $out/${python.sitePackages}/../openpower/
+ '';
+
+ pythonImportsCheck = [ "openpower.decoder.power_decoder2" "openpower" ];
+
+ meta = with lib; {
+ homepage = "https://pypi.org/project/libresoc-openpower-isa/";
+ license = licenses.lgpl3Plus;
+ };
+}
--- /dev/null
+{ version }:
+
+{ stdenv, python2 }:
+
+stdenv.mkDerivation {
+ pname = "libresoc-pinmux";
+ inherit version;
+
+ src = ../pinmux;
+
+ nativeBuildInputs = [ python2 ];
+
+ configurePhase = "true";
+
+ buildPhase = ''
+ runHook preBuild
+ python src/pinmux_generator.py -v -s ls180 -o ls180
+ runHook postBuild
+ '';
+
+ installPhase = ''
+ runHook preInstall
+ mv ls180 $out
+ runHook postInstall
+ '';
+
+ fixupPhase = "true";
+}
--- /dev/null
+{ version }:
+
+{ lib, buildPythonPackage, yosys, runCommand, c4m-jtag, nmigen-soc
+, libresoc-ieee754fpu, libresoc-openpower-isa, python }:
+
+let
+ # If we use ../. as source, then any change to
+ # any unrelated Nix file would cause a rebuild,
+ # since the build would have access to it.
+ src = runCommand "libresoc-soc-source" {} ''
+ mkdir $out
+ cp -r ${../src} -T $out/src
+ cp -r ${../setup.py} -T $out/setup.py
+ cp -r ${../README.md} -T $out/README.md
+ cp -r ${../NEWS.txt} -T $out/NEWS.txt
+ '';
+in
+buildPythonPackage {
+ pname = "libresoc-soc";
+ inherit version src;
+
+ propagatedBuildInputs = [
+ c4m-jtag nmigen-soc python libresoc-ieee754fpu libresoc-openpower-isa yosys
+ ];
+
+ doCheck = false;
+
+ prePatch = ''
+ rm -r src/soc/litex
+ '';
+
+ pythonImportsCheck = [ "soc" ];
+
+ meta = with lib; {
+ homepage = "https://libre-soc.org/";
+ license = licenses.lgpl3Plus;
+ };
+}
--- /dev/null
+{ version }:
+
+{ runCommand, python3Packages, libresoc-pinmux }:
+
+let script = ''
+ mkdir pinmux
+ ln -s ${libresoc-pinmux} pinmux/ls180
+ export PINMUX="$(realpath ./pinmux)"
+ python3 -m soc.simple.issuer_verilog \
+ --debug=jtag --enable-core --enable-pll \
+ --enable-xics --enable-sram4x4kblock --disable-svp64 \
+ $out
+''; in
+runCommand "libresoc.v" {
+ inherit version;
+
+ nativeBuildInputs = (with python3Packages; [
+ libresoc-soc
+ ]) ++ [ libresoc-pinmux ];
+} script
-Subproject commit 096caad8418250693c93ccf90047750704adcaa7
+Subproject commit 7cbf0e2a54448f549243cd602ebafd10de8d32f0
--- /dev/null
+[tool.pytest.ini_options]
+minversion = "6.0"
+python_classes = ""
+python_functions = ""
+testpaths = ["src/soc"]
+required_plugins = ["pytest-xdist>=1.0.0", "pytest-output-to-files>=0.1.0"]
+addopts = [
+ "-n",
+ "auto",
+ "--shorten-output-dir=test-out",
+]
version = '0.0.1'
+# the only reason this is added is because it's become a part of python 3.9.
+# the project standard is python 3.7 however in future that will be updated.
+# for now, cached_property is RELUCTANTLY added but a *copy* is added so
+# that the generation of HDL is not critically dependent on random crap
+# off the internet. you're spending USD 16 *MILLION* on masks, you better
+# be absolutely paranoid-level certain you know where every piece of the
+# chain creating the HDL comes from.
+cprop = "git+https://git.libre-soc.org/git/cached-property.git@1.5.2" \
+ "#egg=cached-property-1.5.2"
+
# using pip3 for ongoing development is a royal pain. seriously not
# recommended. therefore a number of these dependencies have been
# commented out. *they are still required* - they will need installing
# manually.
+# XXX UNDER NO CIRCUMSTANCES ADD ARBITRARY DEPENDENCIES HERE. XXX
+# as this is HDL, not software, every dependency added is
+# a serious maintenance and reproducible-build problem.
+# dropping USD 16 million on 7nm Mask Charges when the
+# HDL can be compromised - accidentally or deliberately -
+# by pip3 going out and randomly downloading complete
+# shite is not going to do anyone any favours.
+
+# TODO: make *all* of these be from libre-soc git repo only
+# (which means updating the nmigen-soc one to mirror gitlab)
+
install_requires = [
# 'sfpy', # needs manual patching
'libresoc-ieee754fpu', # uploaded (successfully, whew) to pip
'libresoc-openpower-isa', # uploaded (successfully, whew) to pip
# 'nmigen-soc', # install manually from git.libre-soc.org
+
+ # git url needed for having `pip3 install -e .` install from libre-soc git
+ "cached-property@"+cprop,
+]
+
+# git url needed for having `setup.py develop` install from libre-soc git
+dependency_links = [
+ cprop,
]
test_requires = [
long_description_content_type='text/markdown',
classifiers=[
"Topic :: Software Development",
- "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
+ "License :: OSI Approved :: " \
+ "GNU Lesser General Public License v3 or later (LGPLv3+)",
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
],
include_package_data=True,
zip_safe=False,
install_requires=install_requires,
+ dependency_links=dependency_links,
tests_require=test_requires,
test_suite='nose.collector',
)
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the external_core_top.v verilog module
+# which allows for faster development iteration (oh and microwatt or
+# other core to be dropped into a peripheral fabric)
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Const)
+from nmigen.cli import rtlil, verilog
+
+from soc.debug.dmi import DMIInterface
+from nmigen_soc.wishbone.bus import Interface
+import os
+
+__all__ = ["ExternalCore"]
+
+
+class ExternalCore(Elaboratable):
+ """External Core verilog wrapper for microwatt and libre-soc
+ (actually, anything prepared to map to the Signals defined below)
+ remember to call ExternalCore.add_verilog_source
+ """
+
+ def __init__(self, ibus=None, dbus=None, features=None, name=None):
+
+ # set up the icache wishbone bus
+ if features is None:
+ features = frozenset(("stall",))
+ if ibus is None:
+ ibus = Interface(addr_width=32,
+ data_width=64,
+ features=features,
+ granularity=8,
+ name="core_ibus")
+ if dbus is None:
+ dbus = Interface(addr_width=32,
+ data_width=64,
+ features=features,
+ granularity=8,
+ name="core_dbus")
+ self.dmi = DMIInterface(name="dmi")
+ self.ibus = ibus
+ self.dbus = dbus
+
+ assert len(self.ibus.dat_r) == 64, "bus width must be 64"
+ assert len(self.dbus.dat_r) == 64, "bus width must be 64"
+
+ # IRQ for data buffer receive/xmit
+ self.irq = Signal()
+
+ # debug monitoring signals
+ self.nia = Signal(64)
+ self.nia_req = Signal()
+ self.msr = Signal(64)
+ self.ldst_addr = Signal(64)
+ self.ldst_req = Signal()
+
+ # alternative reset and termination indicator
+ self.alt_reset = Signal()
+ self.terminated_o = Signal()
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['external_core_top.v',
+ ]:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+
+ # create definition of external core here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ ibus, dbus, dmi = self.ibus, self.dbus, self.dmi
+
+ # sigh, microwatt wishbone address is borked, it contains the 3 LSBs
+ ibus_adr = Signal(32)
+ dbus_adr = Signal(32)
+ m.d.comb += ibus.adr.eq(ibus_adr[3:])
+ m.d.comb += dbus.adr.eq(dbus_adr[3:])
+
+ kwargs = {
+ # clock/reset signals
+ 'i_clk': ClockSignal(),
+ 'i_rst': ResetSignal(),
+ # DMI interface
+ 'i_dmi_addr': dmi.addr_i,
+ 'i_dmi_req': dmi.req_i,
+ 'i_dmi_wr': dmi.we_i,
+ 'i_dmi_din': dmi.din,
+ 'o_dmi_dout': dmi.dout,
+ 'o_dmi_ack': dmi.ack_o,
+ # debug/monitor signals
+ 'o_nia': self.nia,
+ 'o_nia_req': self.nia_req,
+ 'o_msr_o': self.msr,
+ 'o_ldst_addr': self.ldst_addr,
+ 'o_ldst_req': self.ldst_req,
+ 'i_alt_reset': self.alt_reset,
+ 'o_terminated_out': self.terminated_o,
+ # wishbone instruction bus
+ 'o_wishbone_insn_out.adr': ibus_adr,
+ 'o_wishbone_insn_out.dat': ibus.dat_w,
+ 'o_wishbone_insn_out.sel': ibus.sel,
+ 'o_wishbone_insn_out.cyc': ibus.cyc,
+ 'o_wishbone_insn_out.stb': ibus.stb,
+ 'o_wishbone_insn_out.we': ibus.we,
+ 'i_wishbone_insn_in.dat': ibus.dat_r,
+ 'i_wishbone_insn_in.ack': ibus.ack,
+ 'i_wishbone_insn_in.stall': ibus.stall,
+ # wishbone data bus
+ 'o_wishbone_data_out.adr': dbus_adr,
+ 'o_wishbone_data_out.dat': dbus.dat_w,
+ 'o_wishbone_data_out.sel': dbus.sel,
+ 'o_wishbone_data_out.cyc': dbus.cyc,
+ 'o_wishbone_data_out.stb': dbus.stb,
+ 'o_wishbone_data_out.we': dbus.we,
+ 'i_wishbone_data_in.dat': dbus.dat_r,
+ 'i_wishbone_data_in.ack': dbus.ack,
+ 'i_wishbone_data_in.stall': dbus.stall,
+ # external interrupt request
+ 'i_ext_irq': self.irq,
+ }
+ core = Instance("external_core_top", **kwargs)
+ m.submodules['core_top'] = core
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ core = ExternalCore(name="core")
+ create_ilang(core, [
+ core.ibus.cyc, core.ibus.stb, core.ibus.ack,
+ core.ibus.dat_r, core.ibus.dat_w, core.ibus.adr,
+ core.ibus.we, core.ibus.sel, core.ibus.stall,
+ core.dbus.cyc, core.dbus.stb, core.dbus.ack,
+ core.dbus.dat_r, core.dbus.dat_w, core.dbus.adr,
+ core.dbus.we, core.dbus.sel,
+ core.irq, core.alt_reset, core.terminated_o,
+ core.msr, core.nia, core.nia_req,
+ core.ldst_addr, core.ldst_req,
+ core.dmi.addr_i, core.dmi.req_i, core.dmi.we_i,
+ core.dmi.din, core.dmi.dout, core.dmi.ack_o,
+ ], "core_0")
+
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog 10/100 MAC
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["EthMAC"]
+
+
+class EthMAC(Elaboratable):
+ """Ethernet MAC from opencores, nmigen wrapper.
+ remember to call EthMAC.add_verilog_source
+ """
+
+ def __init__(self, master_bus=None, slave_bus=None, name=None,
+ irq=None, pins=None):
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "eth_0"
+ self.granularity = 8
+ self.data_width = 32
+ self.dsize = log2_int(self.data_width//self.granularity)
+
+ # set up the wishbone busses
+ features = frozenset()
+ if master_bus is None:
+ master_bus = Interface(addr_width=30,
+ data_width=32,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_0" % self.idx)
+ if slave_bus is None:
+ slave_bus = Interface(addr_width=12,
+ data_width=32,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_1" % self.idx)
+ self.master_bus = master_bus
+ self.slave_bus = slave_bus
+ if irq is None:
+ irq = Signal()
+ self.irq = irq
+
+ slave_mmap = MemoryMap(addr_width=12+self.dsize,
+ data_width=self.granularity)
+
+ self.slave_bus.memory_map = slave_mmap
+
+ # RMII TX signals
+ self.mtx_clk = Signal()
+ self.mtxd = Signal(4)
+ self.mtxen = Signal()
+ self.mtxerr = Signal()
+
+ # RMII RX signals
+ self.mrx_clk = Signal()
+ self.mrxd = Signal(4)
+ self.mrxdv = Signal()
+ self.mrxerr = Signal()
+
+ # RMII common signals
+ self.mcoll = Signal()
+ self.mcrs = Signal()
+
+ # RMII management interface signals
+ self.mdc = Signal()
+ self.md_in = Signal()
+ self.md_out = Signal()
+ self.md_direction = Signal()
+
+ # pins resource
+ self.pins = pins
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['eth_clockgen.v', 'eth_cop.v', 'eth_crc.v',
+ 'eth_fifo.v', 'eth_maccontrol.v', 'ethmac_defines.v',
+ 'eth_macstatus.v', 'ethmac.v', 'eth_miim.v',
+ 'eth_outputcontrol.v', 'eth_random.v',
+ 'eth_receivecontrol.v', 'eth_registers.v',
+ 'eth_register.v', 'eth_rxaddrcheck.v',
+ 'eth_rxcounters.v', 'eth_rxethmac.v',
+ 'eth_rxstatem.v', 'eth_shiftreg.v',
+ 'eth_spram_256x32.v', 'eth_top.v',
+ 'eth_transmitcontrol.v', 'eth_txcounters.v',
+ 'eth_txethmac.v', 'eth_txstatem.v', 'eth_wishbone.v',
+ 'timescale.v']:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+ idx = self.idx
+
+ # Calculate arbiter bus address
+ wb_master_bus_adr = Signal(32)
+ # arbiter address is in words, ethernet master address is in bytes
+ comb += self.master_bus.adr.eq(wb_master_bus_adr >> 2)
+
+ # create definition of external verilog EthMAC code here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ ethmac = Instance("eth_top",
+ # Clock/reset (use DomainRenamer if needed)
+ i_wb_clk_i=ClockSignal(),
+ i_wb_rst_i=ResetSignal(),
+
+ # Master Wishbone bus signals
+ o_m_wb_adr_o=wb_master_bus_adr,
+ i_m_wb_dat_i=self.master_bus.dat_r,
+ o_m_wb_sel_o=self.master_bus.sel,
+ o_m_wb_dat_o=self.master_bus.dat_w,
+ o_m_wb_we_o=self.master_bus.we,
+ o_m_wb_stb_o=self.master_bus.stb,
+ o_m_wb_cyc_o=self.master_bus.cyc,
+ i_m_wb_ack_i=self.master_bus.ack,
+
+ # Slave Wishbone bus signals
+ i_wb_adr_i=self.slave_bus.adr,
+ i_wb_dat_i=self.slave_bus.dat_w,
+ i_wb_sel_i=self.slave_bus.sel,
+ o_wb_dat_o=self.slave_bus.dat_r,
+ i_wb_we_i=self.slave_bus.we,
+ i_wb_stb_i=self.slave_bus.stb,
+ i_wb_cyc_i=self.slave_bus.cyc,
+ o_wb_ack_o=self.slave_bus.ack,
+
+ o_int_o=self.irq,
+
+ # RMII TX
+ i_mtx_clk_pad_i=self.mtx_clk,
+ o_mtxd_pad_o=self.mtxd,
+ o_mtxen_pad_o=self.mtxen,
+ o_mtxerr_pad_o=self.mtxerr,
+
+ # RMII RX
+ i_mrx_clk_pad_i=self.mrx_clk,
+ i_mrxd_pad_i=self.mrxd,
+ i_mrxdv_pad_i=self.mrxdv,
+ i_mrxerr_pad_i=self.mrxerr,
+
+ # RMII common
+ i_mcoll_pad_i=self.mcoll,
+ i_mcrs_pad_i=self.mcrs,
+
+ # Management Interface
+ o_mdc_pad_o=self.mdc,
+ i_md_pad_i=self.md_in,
+ o_md_pad_o=self.md_out,
+ o_md_padoe_o=self.md_direction
+ );
+
+ m.submodules['ethmac_%d' % self.idx] = ethmac
+
+ if self.pins is not None:
+ comb += self.mtx_clk.eq(self.pins.mtx_clk.i)
+ comb += self.pins.mtxd.o.eq(self.mtxd)
+ comb += self.pins.mtxen.o.eq(self.mtxen)
+ comb += self.pins.mtxerr.o.eq(self.mtxerr)
+
+ comb += self.mrx_clk.eq(self.pins.mrx_clk.i)
+ comb += self.mrxd.eq(self.pins.mrxd.i)
+ comb += self.mrxdv.eq(self.pins.mrxdv.i)
+ comb += self.mrxerr.eq(self.pins.mrxerr.i)
+ comb += self.mcoll.eq(self.pins.mcoll.i)
+ comb += self.mcrs.eq(self.pins.mcrs.i)
+
+ comb += self.pins.mdc.o.eq(self.mdc)
+
+ comb += self.pins.md.o.eq(self.md_out)
+ comb += self.pins.md.oe.eq(self.md_direction)
+ comb += self.md_in.eq(self.pins.md.i)
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+if __name__ == "__main__":
+ ethmac = EthMAC(name="eth_0")
+ create_ilang(ethmac, [ethmac.master_bus.cyc, ethmac.master_bus.stb,
+ ethmac.master_bus.ack, ethmac.master_bus.dat_r,
+ ethmac.master_bus.dat_w, ethmac.master_bus.adr,
+ ethmac.master_bus.we, ethmac.master_bus.sel,
+ ethmac.slave_bus.cyc, ethmac.slave_bus.stb,
+ ethmac.slave_bus.ack,
+ ethmac.slave_bus.dat_r, ethmac.slave_bus.dat_w,
+ ethmac.slave_bus.adr,
+ ethmac.slave_bus.we, ethmac.slave_bus.sel,
+ ethmac.mtx_clk, ethmac.mtxd, ethmac.mtxen,
+ ethmac.mtxerr, ethmac.mrx_clk, ethmac.mrxd,
+ ethmac.mrxdv, ethmac.mrxerr, ethmac.mcoll,
+ ethmac.mcrs, ethmac.mdc, ethmac.md_in,
+ ethmac.md_out, ethmac.md_direction
+ ], "eth_0")
+
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Record)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["SDRAM", "SDRAMConfig"]
+
+ """
+ class MT48LC16M16(SDRModule):
+ # geometry
+ nbanks = 4
+ nrows = 8192
+ ncols = 512
+ # timings
+ technology_timings = _TechnologyTimings(tREFI=64e6/8192,
+ tWTR=(2, None),
+ tCCD=(1, None),
+ tRRD=(None, 15))
+ speedgrade_timings = {"default": _SpeedgradeTimings(tRP=20,
+ tRCD=20,
+ tWR=15,
+ tRFC=(None, 66),
+ tFAW=None,
+ tRAS=44)}
+ # for MT48LC16M16-75 part
+ comb += self.cfg.sdr_en.eq(1)
+ comb += self.cfg.sdr_mode_reg.eq(0x033)
+ comb += self.cfg.req_depth.eq(3) # max
+ comb += self.cfg.sdr_tras_d.eq(44) # Active to precharge delay
+ comb += self.cfg.sdr_trp_d.eq(20) # Precharge to active delay
+ comb += self.cfg.sdr_trcd_d.eq(20) # Active to R/W delay
+ comb += self.cfg.sdr_cas.eq(3) # CAS latency
+ comb += self.cfg.sdr_trcar_d.eq(66) # tRFC auto-refresh period
+ comb += self.cfg.sdr_twr_d.eq(15) # clock + 7.5ns
+ comb += self.cfg.sdr_rfsh.eq(0x100)
+ comb += self.cfg.sdr_rfmax.eq(6)
+ """
+
+
+class SDRAMConfig(Record):
+ def __init__(self, refresh_timer_sz, refresh_row_count, name=None):
+ super().__init__(name=name, layout=[
+ # configuration parameters, these need to match the SDRAM IC datasheet
+ ('req_depth', 2), # max request accepted
+ ('sdr_en', 1), # Enable SDRAM controller
+ ('sdr_mode_reg', 13),
+ ('sdr_tras_d', 4), # Active to precharge delay
+ ('sdr_trp_d', 4), # Precharge to active delay
+ ('sdr_trcd_d', 4), # Active to R/W delay
+ ('sdr_cas', 3), # SDRAM CAS Latency
+ ('sdr_trcar_d', 4), # Auto-refresh period
+ ('sdr_twr_d', 4), # Write recovery delay
+ ('sdr_rfsh', refresh_timer_sz),
+ ('sdr_rfmax', refresh_row_count)
+ ])
+
+
+class SDRAM(Elaboratable):
+ """SDRAM controller from opencores, nmigen wrapper. remember to call
+ SDRAM.add_verilog_source.
+
+ * the SDRAM IC will be accessible over the Wishbone Bus
+ * sdr_* signals must be wired to the IC
+ * cfg parameters must match those listed in the SDRAM IC's datasheet
+ """
+
+ def __init__(self, bus=None, features=None, name=None,
+ data_width=32, addr_width=26,
+ sdr_data_width=16,
+ cfg=None,
+ pins=None):
+ if name is not None:
+ name = "sdram"
+ self.data_width = data_width
+ self.sdr_data_width = sdr_data_width
+ self.addr_width = addr_width
+ self.refresh_timer_sz = 12
+ self.refresh_row_count = 3
+
+ # set up the wishbone bus
+ if features is None:
+ features = frozenset({'cti'})
+ if bus is None:
+ bus = Interface(addr_width=addr_width,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name)
+ self.bus = bus
+ assert len(self.bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ byte_width = sdr_data_width // 8 # for individual byte masks/enables
+
+ # SDRAM signals
+ self.sdram_clk = Signal() # sdram phy clock
+ self.sdram_resetn = Signal(reset_less=True) # sdram reset (low)
+ self.sdr_cs_n = Signal() # chip select
+ self.sdr_cke = Signal() # clock-enable
+ self.sdr_ras_n = Signal() # read-address strobe
+ self.sdr_cas_n = Signal() # cas
+ self.sdr_we_n = Signal() # write-enable
+ self.sdr_dqm = Signal(byte_width) # data mask
+ self.sdr_ba = Signal(2) # bank enable
+ self.sdr_addr = Signal(13) # sdram address, 13 bits
+ # these combine to create a bi-direction inout, sdr_dq
+ # note, each bit of sdr_den_n covers a *byte* of sdr_din/sdr_dout
+ self.sdr_den_n = Signal(byte_width)
+ self.sdr_din = Signal(data_width)
+ self.sdr_dout = Signal(data_width)
+
+ # configuration parameters, these need to match the SDRAM IC datasheet
+ self.sdr_init_done = Signal() # Indicate SDRAM init Done
+ if cfg is None:
+ cfg = SDRAMConfig(self.refresh_timer_sz,
+ self.refresh_row_count, name="sdr_cfg")
+
+ # config and pins resource
+ self.pins = pins
+ self.cfg = cfg
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in [ './core/sdrc_bank_ctl.v', './core/sdrc_bank_fsm.v',
+ './core/sdrc_bs_convert.v', './core/sdrc_core.v',
+ './core/sdrc_req_gen.v', './core/sdrc_xfr_ctl.v',
+ './core/sdrc_define.v',
+ './lib/async_fifo.v', './lib/sync_fifo.v',
+ './top/sdrc_top.v', './wb2sdrc/wb2sdrc.v',
+ ]:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+
+ # create definition of external verilog 16550 uart here, so that # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ bus = self.bus
+
+ params = {
+ # clock/reset (use DomainRenamer if needed)
+ 'i_wb_clk_i' : ClockSignal(),
+ 'i_wb_rst_i' : ResetSignal(),
+
+ # wishbone bus signals
+ 'i_wb_adr_i' : bus.adr,
+ 'i_wb_dat_i' : bus.dat_w,
+ 'i_wb_sel_i' : bus.sel,
+ 'o_wb_dat_o' : bus.dat_r,
+ 'i_wb_we_i' : bus.we,
+ 'i_wb_stb_i' : bus.stb,
+ 'i_wb_cyc_i' : bus.cyc,
+ 'o_wb_ack_o' : bus.ack,
+
+ # SDRAM signals
+ 'i_sdram_clk' : self.sdram_clk,
+ 'i_sdram_resetn' : self.sdram_resetn,
+ 'o_sdr_cs_n' : self.sdr_cs_n,
+ 'o_sdr_cke' : self.sdr_cke,
+ 'o_sdr_ras_n' : self.sdr_ras_n,
+ 'o_sdr_cas_n' : self.sdr_cas_n,
+ 'o_sdr_we_n' : self.sdr_we_n,
+ 'o_sdr_dqm' : self.sdr_dqm,
+ 'o_sdr_ba' : self.sdr_ba,
+ 'o_sdr_addr' : self.sdr_addr,
+ 'o_sdr_den_n' : self.sdr_den_n,
+ 'i_sdr_din' : self.sdr_din,
+ 'o_sdr_dout' : self.sdr_dout,
+
+ # configuration parameters (from the SDRAM IC datasheet)
+ 'o_sdr_init_done' : self.sdr_init_done ,
+ 'i_cfg_req_depth' : self.cfg.req_depth ,
+ 'i_cfg_sdr_en' : self.cfg.sdr_en ,
+ 'i_cfg_sdr_mode_reg' : self.cfg.sdr_mode_reg ,
+ 'i_cfg_sdr_tras_d' : self.cfg.sdr_tras_d ,
+ 'i_cfg_sdr_trp_d' : self.cfg.sdr_trp_d ,
+ 'i_cfg_sdr_trcd_d' : self.cfg.sdr_trcd_d ,
+ 'i_cfg_sdr_cas' : self.cfg.sdr_cas ,
+ 'i_cfg_sdr_trcar_d' : self.cfg.sdr_trcar_d ,
+ 'i_cfg_sdr_twr_d' : self.cfg.sdr_twr_d ,
+ 'i_cfg_sdr_rfsh' : self.cfg.sdr_rfsh ,
+ 'i_cfg_sdr_rfmax' : self.cfg.sdr_rfmax,
+
+ # verilog parameters
+ 'p_APP_AW' : self.addr_width, # Application Address Width
+ 'p_APP_DW' : self.data_width, # Application Data Width
+ 'p_APP_BW' : self.addr_width//8, # Application Byte Width
+ 'p_APP_RW' : 9, # Application Request Width
+ 'p_SDR_DW' : self.sdr_data_width, # SDR Data Width
+ 'p_SDR_BW' : self.sdr_data_width//8, # SDR Byte Width
+ 'p_dw' : self.data_width, # data width
+ 'p_tw' : 8, # tag id width
+ 'p_bl' : 9, # burst_length_width
+ }
+ m.submodules['sdrc_top'] = Instance("sdrc_top", **params)
+
+ return m
+
+ if self.pins is not None:
+ comb += self.pins.tx.eq(self.tx_o)
+ comb += self.rx_i.eq(self.pins.rx)
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ sdram = SDRAM(name="sdram", data_width=8)
+ create_ilang(sdram, [sdram.bus.cyc, sdram.bus.stb, sdram.bus.ack,
+ sdram.bus.dat_r, sdram.bus.dat_w, sdram.bus.adr,
+ sdram.bus.we, sdram.bus.sel,
+ sdram.sdram_clk, sdram.sdram_resetn,
+ sdram.sdr_cs_n, sdram.sdr_cke,
+ sdram.sdr_ras_n, sdram.sdr_cas_n, sdram.sdr_we_n,
+ sdram.sdr_dqm, sdram.sdr_ba, sdram.sdr_addr,
+ sdram.sdr_den_n, sdram.sdr_din, sdram.sdr_dout,
+ sdram.sdr_init_done, sdram.cfg.req_depth,
+ sdram.cfg.sdr_en, sdram.cfg.sdr_mode_reg,
+ sdram.cfg.sdr_tras_d, sdram.cfg.sdr_trp_d,
+ sdram.cfg.sdr_trcd_d, sdram.cfg.sdr_cas,
+ sdram.cfg.sdr_trcar_d, sdram.cfg.sdr_twr_d,
+ sdram.cfg.sdr_rfsh, sdram.cfg.sdr_rfmax,
+ ], "sdram")
+
data_width=self.memory.width,
granularity=granularity,
features=features,
- alignment=0,
+ #alignment=0,
name=None)
self.bus = bus
self.granularity = bus.granularity
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a System Console peripheral compatible with microwatt
+# https://github.com/antonblanchard/microwatt/blob/master/syscon.vhdl
+
+from nmigen import (Elaboratable, Cat, Module, Signal)
+from nmigen.cli import rtlil, verilog
+
+from lambdasoc.periph import Peripheral
+
+__all__ = ["MicrowattSYSCON"]
+
+
+class MicrowattSYSCON(Peripheral, Elaboratable):
+ """Microwatt-compatible (Sys)tem (Con)figuration module
+ """
+
+ def __init__(self, *, sys_clk_freq=100e6,
+ core_clk_freq=100e6,
+ mem_clk_freq=100e6,
+ spi_offset=None,
+ dram_addr=None,
+ has_uart=True,
+ uart_is_16550=True
+ ):
+ super().__init__(name="syscon")
+ self.sys_clk_freq = sys_clk_freq
+ self.core_clk_freq = core_clk_freq
+ self.mem_clk_freq = mem_clk_freq
+ self.has_uart = has_uart
+ self.spi_offset = spi_offset
+ self.dram_addr = dram_addr
+ self.uart_is_16550 = uart_is_16550
+
+ # System control ports
+ self.dram_at_0 = Signal()
+ self.core_reset = Signal()
+ self.soc_reset = Signal()
+
+ # set up a CSR Bank and associated bridge. has to be in this order
+ # (declare bank, declare bridge) for some unknown reason.
+ # (r)ead regs will have a r_stb and r_data Record entry
+ # (w)rite regs will have a w_stb and w_data Record entry
+ bank = self.csr_bank()
+ self._reg_sig_r = bank.csr(64, "r") # signature
+ self._reg_info_r = bank.csr(64, "r") # info
+ self._bram_info_r = bank.csr(64, "r") # bram info
+ self._dram_info_r = bank.csr(64, "r") # dram info
+ self._clk_info_r = bank.csr(64, "r") # nest clock frequency
+ self._ctrl_info_r = bank.csr(64, "rw") # control info
+ self._dram_init_r = bank.csr(64, "r") # dram initialisation info
+ self._spiflash_info_r = bank.csr(64, "r") # spi flash info
+ self._uart0_info_r = bank.csr(64, "r") # UART0 info (baud etc.)
+ self._uart1_info_r = bank.csr(64, "r") # UART1 info (baud etc.)
+ self._bram_bootaddr_r = bank.csr(64, "r") # BRAM boot address
+ self._core_clk_info_r = bank.csr(64, "r") # core clock frequency
+ self._mem_clk_info_r = bank.csr(64, "r") # memory clock frequency
+
+ # bridge the above-created CSRs over wishbone. ordering and size
+ # above mattered, the bridge automatically packs them together
+ # as memory-addressable "things" for us
+ self._bridge = self.bridge(data_width=32, granularity=8, alignment=3)
+ self.bus = self._bridge.bus
+
+ def elaborate(self, platform):
+ m = Module()
+ comb, sync = m.d.comb, m.d.comb
+ m.submodules.bridge = self._bridge
+
+ # enter data into the CSRs. r_data can be left live all the time,
+ # w_data obviously has to be set only when w_stb triggers.
+
+ # identifying signature
+ comb += self._reg_sig_r.r_data.eq(0xf00daa5500010001)
+
+ # nest clock rate (hz)
+ comb += self._clk_info_r.r_data.eq(int(self.sys_clk_freq)) # in hz
+
+ # core clock rate (hz)
+ comb += self._core_clk_info_r.r_data.eq(int(self.core_clk_freq)) # in hz
+
+ # memory clock rate (hz)
+ comb += self._mem_clk_info_r.r_data.eq(int(self.mem_clk_freq)) # in hz
+
+ # detect peripherals
+ has_spi = self.spi_offset is not None
+ has_dram = self.dram_addr is not None
+
+ # uart peripheral clock rate, currently assumed to be system clock
+ # 0 ..31 : UART clock freq (in HZ)
+ # 32 : UART is 16550 (otherwise pp)
+ comb += self._uart0_info_r.r_data[0:32].eq(int(self.sys_clk_freq))
+ comb += self._uart0_info_r.r_data[32].eq(1)
+
+ # Reg Info, defines what peripherals and characteristics are present
+ comb += self._reg_info_r.r_data[0].eq(self.has_uart) # has UART0
+ comb += self._reg_info_r.r_data[1].eq(has_dram) # has DDR DRAM
+ comb += self._reg_info_r.r_data[3].eq(has_spi) # has SPI Flash
+ comb += self._reg_info_r.r_data[5].eq(1) # Large SYSCON
+
+ # system control
+ sysctrl = Cat(self.dram_at_0, self.core_reset, self.soc_reset)
+ with m.If(self._ctrl_info_r.w_stb):
+ sync += sysctrl.eq(self._ctrl_info_r.w_data)
+ comb += self._ctrl_info_r.r_data.eq(sysctrl)
+
+ # SPI Flash Address
+ comb += self._spiflash_info_r.r_data.eq(self.spi_offset or 0)
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ from nmigen_soc import wishbone
+ class QuickDemo(Elaboratable):
+ def elaborate(self, platform):
+ m = Module()
+ arbiter = wishbone.Arbiter(addr_width=30, data_width=32,
+ granularity=8)
+ decoder = wishbone.Decoder(addr_width=30, data_width=32,
+ granularity=8)
+ m.submodules.syscon = syscon = MicrowattSYSCON()
+ m.submodules.decoder = decoder
+ m.submodules.arbiter = arbiter
+ decoder.add(syscon.bus, addr=0xc0000000)
+ m.d.comb += arbiter.bus.connect(decoder.bus)
+ return m
+ m = QuickDemo()
+ create_ilang(m, None, "syscondemo")
+
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog tercel module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["Tercel"]
+
+
+class Tercel(Elaboratable):
+ """Tercel SPI controller from Raptor Engineering, nmigen wrapper.
+ remember to call Tercel.add_verilog_source
+ """
+
+ def __init__(self, bus=None, cfg_bus=None, features=None, name=None,
+ data_width=32, spi_region_addr_width=28, pins=None,
+ clk_freq=None,
+ lattice_ecp5_usrmclk=False,
+ adr_offset=0): # address offset (bytes)
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "spi_0"
+ self.granularity = 8
+ self.data_width = data_width
+ self.dsize = log2_int(self.data_width//self.granularity)
+ self.adr_offset = adr_offset
+ self.lattice_ecp5_usrmclk = lattice_ecp5_usrmclk
+
+ # TODO, sort this out.
+ assert clk_freq is not None
+ clk_freq = round(clk_freq)
+ self.clk_freq = Const(clk_freq, 32) #clk_freq.bit_length())
+
+ # set up the wishbone busses
+ if features is None:
+ #features = frozenset({'err'}) # sigh
+ features = frozenset()
+ if bus is None:
+ bus = Interface(addr_width=spi_region_addr_width,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_0" % self.idx)
+ if cfg_bus is None:
+ cfg_bus = Interface(addr_width=6,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_1" % self.idx)
+ self.bus = bus
+ assert len(self.bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+ self.cfg_bus = cfg_bus
+ assert len(self.cfg_bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ mmap = MemoryMap(addr_width=spi_region_addr_width+self.dsize,
+ data_width=self.granularity)
+ cfg_mmap = MemoryMap(addr_width=6+self.dsize,
+ data_width=self.granularity)
+
+ self.bus.memory_map = mmap
+ self.cfg_bus.memory_map = cfg_mmap
+
+ # QSPI signals
+ self.dq_out = Signal(4) # Data
+ self.dq_direction = Signal(4)
+ self.dq_in = Signal(4)
+ self.cs_n_out = Signal() # Slave select
+ self.spi_clk = Signal() # Clock
+ self.dbg_port = Signal(8) # debug info
+
+ # pins resource
+ self.pins = pins
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['wishbone_spi_master.v', 'phy.v']:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+ pins, bus, cfg_bus = self.pins, self.bus, self.cfg_bus
+
+ # Calculate SPI flash address
+ spi_bus_adr = Signal(30)
+ # wb address is in words, offset is in bytes
+ comb += spi_bus_adr.eq(bus.adr - (self.adr_offset >> 2))
+
+ # urrr.... byte-reverse the config bus and data bus read/write
+ cdat_w = Signal.like(cfg_bus.dat_w)
+ cdat_r = Signal.like(cfg_bus.dat_r)
+ dat_w = Signal.like(bus.dat_w)
+ dat_r = Signal.like(bus.dat_r)
+ comb += cdat_w.eq(byte_reverse(m, "rv_cdat_w", cfg_bus.dat_w, 4))
+ comb += cfg_bus.dat_r.eq(byte_reverse(m, "rv_cdat_r", cdat_r, 4))
+ comb += dat_w.eq(byte_reverse(m, "rv_dat_w", bus.dat_w, 4))
+ comb += bus.dat_r.eq(byte_reverse(m, "rv_dat_r", dat_r, 4))
+
+ # create definition of external verilog Tercel code here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ idx, bus = self.idx, self.bus
+ tercel = Instance("tercel_core",
+ # System parameters
+ i_sys_clk_freq = self.clk_freq,
+
+ # Clock/reset (use DomainRenamer if needed)
+ i_peripheral_clock=ClockSignal(),
+ i_peripheral_reset=ResetSignal(),
+
+ # SPI region Wishbone bus signals
+ i_wishbone_adr=spi_bus_adr,
+ i_wishbone_dat_w=dat_w,
+ i_wishbone_sel=bus.sel,
+ o_wishbone_dat_r=dat_r,
+ i_wishbone_we=bus.we,
+ i_wishbone_stb=bus.stb,
+ i_wishbone_cyc=bus.cyc,
+ o_wishbone_ack=bus.ack,
+ #o_wishbone_err=bus.err,
+
+ # Configuration region Wishbone bus signals
+ i_cfg_wishbone_adr=cfg_bus.adr,
+ i_cfg_wishbone_dat_w=cdat_w,
+ i_cfg_wishbone_sel=cfg_bus.sel,
+ o_cfg_wishbone_dat_r=cdat_r,
+ i_cfg_wishbone_we=cfg_bus.we,
+ i_cfg_wishbone_stb=cfg_bus.stb,
+ i_cfg_wishbone_cyc=cfg_bus.cyc,
+ o_cfg_wishbone_ack=cfg_bus.ack,
+ #o_cfg_wishbone_err=cfg_bus.err,
+
+ # QSPI signals
+ o_spi_d_out=self.dq_out,
+ o_spi_d_direction=self.dq_direction,
+ i_spi_d_in=self.dq_in,
+ o_spi_ss_n=self.cs_n_out,
+ o_spi_clock=self.spi_clk,
+
+ # debug port
+ o_debug_port=self.dbg_port
+ );
+
+ m.submodules['tercel_%d' % self.idx] = tercel
+
+ if pins is not None:
+ for i in range(4):
+ pad = getattr(pins, "dq%d" % i)
+ comb += pad.o.eq(self.dq_out[i])
+ comb += pad.oe.eq(self.dq_direction[i])
+ comb += self.dq_in[i].eq(pad.i)
+ # ECP5 needs special handling for the SPI clock, sigh.
+ if self.lattice_ecp5_usrmclk:
+ comb += pad.o_clk.eq(ClockSignal())
+ comb += pad.i_clk.eq(ClockSignal())
+ # XXX invert handled by SPIFlashResource
+ comb += pins.cs_n.eq(self.cs_n_out)
+ # ECP5 needs special handling for the SPI clock, sigh.
+ if self.lattice_ecp5_usrmclk:
+ m.submodules += Instance("USRMCLK",
+ i_USRMCLKI = self.spi_clk,
+ i_USRMCLKTS = 0
+ )
+ else:
+ comb += pins.clk.eq(self.spi_clk)
+
+ return m
+
+ def ports(self):
+ return [self.bus.cyc, self.bus.stb, self.bus.ack,
+ self.bus.dat_r, self.bus.dat_w, self.bus.adr,
+ self.bus.we, self.bus.sel,
+ self.cfg_bus.cyc, self.cfg_bus.stb,
+ self.cfg_bus.ack,
+ self.cfg_bus.dat_r, self.cfg_bus.dat_w,
+ self.cfg_bus.adr,
+ self.cfg_bus.we, self.cfg_bus.sel,
+ self.dq_out, self.dq_direction, self.dq_in,
+ self.cs_n_out, self.spi_clk
+ ]
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ tercel = Tercel(name="spi_0", data_width=32, clk_freq=100e6)
+ create_ilang(tercel, tercel.ports(), "spi_0")
+
"""
-def wb_write(bus, addr, data, sel=True):
+def wb_write(bus, addr, data, sel=0b1111):
# write wb
yield bus.we.eq(1)
yield bus.cyc.eq(1)
yield bus.stb.eq(1)
- yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ yield bus.sel.eq(sel)
yield bus.adr.eq(addr)
yield bus.dat_w.eq(data)
yield bus.dat_w.eq(0)
-def wb_read(bus, addr, sel=True):
+def wb_read(bus, addr, sel=0b1111):
# read wb
yield bus.cyc.eq(1)
yield bus.stb.eq(1)
yield bus.we.eq(0)
- yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ yield bus.sel.eq(sel)
yield bus.adr.eq(addr)
# wait for ack to go high
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+import tempfile
+
+__all__ = ["UART16550"]
+
+
+class UART16550(Elaboratable):
+ """16550 UART from opencores, nmigen wrapper. remember to call
+ UART16550.add_verilog_source
+ """
+
+ def __init__(self, bus=None, features=None, name=None, data_width=32,
+ pins=None, irq=None):
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "uart_0"
+ self.data_width = data_width
+
+ # set up the wishbone bus
+ if features is None:
+ features = frozenset()
+ if bus is None:
+ bus = Interface(addr_width=5,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d" % self.idx)
+ self.bus = bus
+ assert len(self.bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ # IRQ for data buffer receive/xmit
+ if irq is None:
+ irq = Signal()
+ self.irq = irq
+
+ # 9-pin UART signals (if anyone still remembers those...)
+ self.tx_o = Signal() # transmit
+ self.rx_i = Signal() # receive
+ self.rts_o = Signal() # ready to send
+ self.cts_i = Signal() # clear to send
+ self.dtr_o = Signal() # data terminal ready
+ self.dsr_i = Signal() # data send ready
+ self.ri_i = Signal() # can't even remember what this is!
+ self.dcd_i = Signal() # or this!
+
+ # pins resource
+ self.pins = pins
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # create a temp file containing "`define DATA_BUS_WIDTH_8"
+ t = tempfile.NamedTemporaryFile(delete=False, suffix=".v")
+ t.write("`define DATA_BUS_WIDTH_8\n".encode())
+ t.flush()
+ t.seek(0)
+ platform.add_file(t.name, t)
+
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['raminfr.v', 'uart_defines.v', 'uart_rfifo.v',
+ 'uart_top.v', 'timescale.v', 'uart_receiver.v',
+ 'uart_sync_flops.v', 'uart_transmitter.v',
+ 'uart_debug_if.v', 'uart_regs.v',
+ 'uart_tfifo.v', 'uart_wb.v'
+ ]:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+
+ # create definition of external verilog 16550 uart here, so that # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ idx, bus = self.idx, self.bus
+ uart = Instance("uart_top",
+ # clock/reset (use DomainRenamer if needed)
+ i_wb_clk_i=ClockSignal(),
+ i_wb_rst_i=ResetSignal(),
+ # wishbone bus signals
+ i_wb_adr_i=bus.adr,
+ i_wb_dat_i=bus.dat_w,
+ i_wb_sel_i=bus.sel,
+ o_wb_dat_o=bus.dat_r,
+ i_wb_we_i=bus.we,
+ i_wb_stb_i=bus.stb,
+ i_wb_cyc_i=bus.cyc,
+ o_wb_ack_o=bus.ack,
+ # interrupt line
+ o_int_o=self.irq,
+ # 9-pin RS232/UART signals
+ o_stx_pad_o=self.tx_o,
+ i_srx_pad_i=self.rx_i,
+ o_rts_pad_o=self.rts_o,
+ i_cts_pad_i=self.cts_i,
+ o_dtr_pad_o=self.dtr_o,
+ i_dsr_pad_i=self.dsr_i,
+ i_ri_pad_i=self.ri_i,
+ i_dcd_pad_i=self.dcd_i
+ );
+
+ m.submodules['uart16550_%d' % self.idx] = uart
+
+ if self.pins is not None:
+ comb += self.pins.tx.eq(self.tx_o)
+ comb += self.rx_i.eq(self.pins.rx)
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ uart = UART16550(name="uart_0", data_width=8)
+ create_ilang(uart, [uart.bus.cyc, uart.bus.stb, uart.bus.ack,
+ uart.bus.dat_r, uart.bus.dat_w, uart.bus.adr,
+ uart.bus.we, uart.bus.sel,
+ uart.irq,
+ uart.tx_o, uart.rx_i, uart.rts_o, uart.cts_i,
+ uart.dtr_o, uart.dsr_i, uart.ri_i, uart.dcd_i
+ ], "uart_0")
+
--- /dev/null
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+#
+# Based partly on code from LibreSoC
+#
+# Modifications for the Libre-SOC Project funded by NLnet and NGI POINTER
+# under EU Grants 871528 and 957073, under the LGPLv3+ License
+#
+# this is a wrapper around the Verilog Wishbone Components wb_async_reg module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["WBAsyncBridge"]
+
+
+class WBAsyncBridge(Elaboratable):
+ """Verilog Wishbone Components wb_async_reg module, nmigen wrapper.
+ remember to call WBAsyncBridge.add_verilog_source
+ """
+
+ def __init__(self, master_bus=None, slave_bus=None, master_features=None,
+ slave_features=None, name=None,
+ address_width=30, data_width=32, granularity=8,
+ master_clock_domain=None, slave_clock_domain=None):
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "wbasyncbridge_0"
+ self.address_width = address_width
+ self.data_width = data_width
+ self.granularity = granularity
+ self.dsize = log2_int(self.data_width//self.granularity)
+
+ # set up the clock domains
+ if master_clock_domain is None:
+ self.wb_mclk = ClockSignal()
+ self.wb_mrst = ResetSignal()
+ else:
+ self.wb_mclk = ClockSignal(master_clock_domain)
+ self.wb_mrst = ResetSignal(master_clock_domain)
+ if slave_clock_domain is None:
+ self.wb_sclk = ClockSignal()
+ self.wb_srst = ResetSignal()
+ else:
+ self.wb_sclk = ClockSignal(slave_clock_domain)
+ self.wb_srst = ResetSignal(slave_clock_domain)
+
+ # set up the wishbone busses
+ if master_features is None:
+ master_features = frozenset()
+ if slave_features is None:
+ slave_features = frozenset()
+ if master_bus is None:
+ master_bus = Interface(addr_width=self.address_width,
+ data_width=self.data_width,
+ features=master_features,
+ granularity=self.granularity,
+ name=name+"_wb_%d_master" % self.idx)
+ if slave_bus is None:
+ slave_bus = Interface(addr_width=self.address_width,
+ data_width=self.data_width,
+ features=slave_features,
+ granularity=self.granularity,
+ name=name+"_wb_%d_slave" % self.idx)
+ self.master_bus = master_bus
+ assert len(self.master_bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+ self.slave_bus = slave_bus
+ assert len(self.slave_bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['wb_async_reg.v']:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+ master_bus, slave_bus = self.master_bus, self.slave_bus
+ slave_err = Signal()
+ slave_rty = Signal()
+
+ # create definition of external verilog bridge code here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ idx = self.idx
+ wb_async_bridge = Instance("wb_async_reg",
+ # Parameters
+ p_ADDR_WIDTH=self.address_width,
+ p_DATA_WIDTH=self.data_width,
+ # width of select is the data width
+ # *divided* by the data granularity.
+ # data_width=32-bit, data granularity=8-bit,
+ # select_width ==> 32/8 ==> 4
+ p_SELECT_WIDTH=self.data_width//self.granularity,
+
+ # Clocks/resets
+ i_wbm_clk=self.wb_mclk,
+ i_wbm_rst=self.wb_mrst,
+ i_wbs_clk=self.wb_sclk,
+ i_wbs_rst=self.wb_srst,
+
+ # Master Wishbone bus signals
+ i_wbm_adr_i=self.master_bus.adr,
+ i_wbm_dat_i=self.master_bus.dat_w,
+ o_wbm_dat_o=self.master_bus.dat_r,
+ i_wbm_we_i=self.master_bus.we,
+ i_wbm_sel_i=self.master_bus.sel,
+ i_wbm_stb_i=self.master_bus.stb,
+ i_wbm_cyc_i=self.master_bus.cyc,
+ o_wbm_ack_o=self.master_bus.ack,
+ #o_wbm_err=self.master_bus.err,
+ #o_wbm_rty_i=self.master_bus.rty,
+
+ # Slave Wishbone bus signals
+ o_wbs_adr_o=self.slave_bus.adr,
+ i_wbs_dat_i=self.slave_bus.dat_r,
+ o_wbs_dat_o=self.slave_bus.dat_w,
+ o_wbs_we_o=self.slave_bus.we,
+ o_wbs_sel_o=self.slave_bus.sel,
+ o_wbs_stb_o=self.slave_bus.stb,
+ o_wbs_cyc_o=self.slave_bus.cyc,
+ i_wbs_ack_i=self.slave_bus.ack,
+ i_wbs_err_i=slave_err,
+ i_wbs_rty_i=slave_rty
+ );
+
+ # Wire unused signals to 0
+ comb += slave_err.eq(0)
+ comb += slave_rty.eq(0)
+
+ m.submodules['wb_async_bridge_%d' % self.idx] = wb_async_bridge
+
+ return m
+
+ def ports(self):
+ return [self.master_bus.adr, self.master_bus.dat_w,
+ self.master_bus.dat_r,
+ self.master_bus.we, self.master_bus.sel,
+ self.master_bus.stb,
+ self.master_bus.cyc, self.master_bus.ack,
+ self.master_bus.err,
+ self.master_bus.rty,
+ self.slave_bus.adr, self.slave_bus.dat_w,
+ self.slave_bus.dat_r,
+ self.slave_bus.we, self.slave_bus.sel,
+ self.slave_bus.stb,
+ self.slave_bus.cyc, self.slave_bus.ack,
+ self.slave_bus.err,
+ self.slave_bus.rty
+ ]
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ wbasyncbridge = WBAsyncBridge(name="wbasyncbridge_0", address_width=30, data_width=32, granularity=8)
+ create_ilang(wbasyncbridge, wbasyncbridge.ports(), "wbasyncbridge_0")
shift_reg = Signal(dw_from)
counter = Signal(log2_int(ratio, False))
- counter_reset = Signal()
- counter_ce = Signal()
- with m.If(counter_reset):
- sync += counter.eq(0)
- with m.Elif(counter_ce):
- sync += counter.eq(counter + 1)
+ cur_counter = Signal(log2_int(ratio, False))
counter_done = Signal()
comb += counter_done.eq(counter == ratio-1)
+ comb += cur_counter.eq(counter)
+ skip = Signal()
# Main FSM
with m.FSM() as fsm:
with m.State("IDLE"):
- comb += counter_reset.eq(1)
+ sync += counter.eq(0)
sync += cached_data.eq(0)
with m.If(master.stb & master.cyc):
with m.If(master.we):
with m.State("WRITE"):
comb += write.eq(1)
- comb += slave.we.eq(1)
- comb += slave.cyc.eq(1)
with m.If(master.stb & master.cyc):
+ comb += skip.eq(slave.sel == 0)
+ comb += slave.we.eq(1)
+ comb += slave.cyc.eq(1)
comb += slave.stb.eq(1)
- with m.If(slave.ack):
- comb += counter_ce.eq(1)
+ with m.If(slave.ack | skip):
+ sync += counter.eq(counter + 1)
with m.If(counter_done):
comb += master.ack.eq(1)
m.next = "IDLE"
with m.State("READ"):
comb += read.eq(1)
- comb += slave.cyc.eq(1)
with m.If(master.stb & master.cyc):
+ comb += skip.eq(slave.sel == 0)
+ comb += slave.cyc.eq(1)
comb += slave.stb.eq(1)
- with m.If(slave.ack):
- comb += counter_ce.eq(1)
+ with m.If(slave.ack | skip):
+ comb += cur_counter.eq(counter + 1) # TODO use Picker
+ sync += counter.eq(cur_counter)
with m.If(counter_done):
comb += master.ack.eq(1)
comb += master.dat_r.eq(shift_reg)
comb += slave.cti.eq(7) # indicate end of burst
with m.Else():
comb += slave.cti.eq(2)
- comb += slave.adr.eq(Cat(counter, master.adr))
+ comb += slave.adr.eq(Cat(cur_counter, master.adr))
# write Datapath - select fragments of data, depending on "counter"
with m.Switch(counter):
# read Datapath - uses cached_data and master.dat_r as a shift-register.
# by the time "counter" is done (counter_done) this is complete
comb += shift_reg.eq(Cat(cached_data[dw_to:], slave.dat_r))
- with m.If(read & counter_ce):
+ with m.If(read & (slave.ack | skip)):
sync += cached_data.eq(shift_reg)
'bare_wb': BareFetchUnit,
#'test_cache_wb': TestCacheFetchUnit
}
+ self.pspec = pspec
+ if self.pspec.imem_ifacetype in ['mmu_cache_wb', 'test_mmu_cache_wb']:
+ # XXX BLECH! use pspec to transfer the I-Cache which is
+ # created down inside LoadStore1!
+ self.fu = icache = pspec.icache # ICache already FetchUnitInterface
+ # tell I-Cache to connect up to its FetchUnitInterface
+ icache.use_fetch_interface()
+ return
+
fukls = fudict[pspec.imem_ifacetype]
self.fu = fukls(pspec)
+ def wb_bus(self):
+ return self.fu.ibus
+
import json
from pprint import pprint
from collections import OrderedDict
+from openpower.util import log
+from nmigen.build.dsl import Resource, Subsignal, Pins
def _byteify(data, ignore_dicts = False):
return data
+def get_pinspec_resources(chipname=None, subset=None, conn=None):
+ """get_pinspec_resources - returns an auto-generated list of resources
+ """
+ chip = load_pinouts(chipname)
+ pinmap = chip['pins.map']
+ specs = []
+ for k, bus in chip['pins.specs'].items():
+ k, num = k.lower().split(":")
+ name = '%s%s' % (k, num)
+ if subset is None or name in subset:
+ io = []
+ for pin in bus:
+ pin = pin.lower()
+ pin, pin_dir = pin[:-1], pin[-1] # split pin+ into pin, +
+ pname = '%s_%s' % (name, pin)
+ if pname in pinmap:
+ newpin = pinmap[pname][2:]
+ newpin = '_'.join(newpin.split("_")[1:])
+ # turn direction into nmigen Pins direction format
+ dirn = {'-': 'i', '+': 'o', '*': 'io'}[pin_dir]
+ # TODO: make assert_width not have to be 1
+ p = Pins(newpin, dir=dirn, conn=conn, assert_width=1)
+ io.append(Subsignal(pin, p))
+ spec = Resource.family(name, num, default_name=name, ios=io)
+ log("pinspec", name, repr(spec))
+ specs.append(spec)
+ return specs
+
+
def get_pinspecs(chipname=None, subset=None):
+ """get_pinspecs - returns a dictionary of lists of pins for an IO function
+ example: {'uart': ['tx+', 'rx-'],
+ 'i2c': ['sda*', 'scl+']}
+ """
chip = load_pinouts(chipname)
pinmap = chip['pins.map']
specs = OrderedDict() # preserve order
pth = os.path.split(pth)[0]
# path is relative to this filename, in the pinmux submodule
- fname = "%s/../../../pinmux/%s/litex_pinpads.json" % (pth, chipname)
+ pinmux = os.getenv("PINMUX", "%s/../../../pinmux" % pth)
+ fname = "%s/%s/fabric_pinpads.json" % (pinmux, chipname)
with open(fname) as f:
txt = f.read()
return chip
if __name__ == '__main__':
- if sys.argv == 2:
+ # run this with:
+ # git submodule update --init --remote --recursive
+ # make mkpinmux
+ # python3 soc/config/pinouts.py ngi_pointer (or ls180, or other)
+ # it will print out a stack of debug stuff
+ if len(sys.argv) == 2:
chipname = sys.argv[1]
else:
chipname = None
for k, v in chip.items():
print ("\n****", k, "****")
pprint(v)
+ print ("chipname pinspec resources", sys.argv, chipname)
+ specs = get_pinspec_resources(chipname, subset=None)
sys.setrecursionlimit(10**6)
-def read_from_addr(dut, addr):
+def read_from_addr(dut, addr, stall=True):
yield dut.a_pc_i.eq(addr)
- yield dut.a_valid_i.eq(1)
- yield dut.f_valid_i.eq(1)
- yield dut.a_stall_i.eq(1)
- yield
- yield dut.a_stall_i.eq(0)
+ yield dut.a_i_valid.eq(1)
+ yield dut.f_i_valid.eq(1)
+ if stall:
+ yield dut.a_stall_i.eq(1)
+ yield
+ yield dut.a_stall_i.eq(0)
yield
yield Settle()
while (yield dut.f_busy_o):
yield
res = (yield dut.f_instr_o)
- yield dut.a_valid_i.eq(0)
- yield dut.f_valid_i.eq(0)
+ yield dut.a_i_valid.eq(0)
+ yield dut.f_i_valid.eq(0)
yield
return res
yield dut.x_st_data_i.eq(value)
yield dut.x_st_i.eq(1)
yield dut.x_mask_i.eq(-1)
- yield dut.x_valid_i.eq(1)
+ yield dut.x_i_valid.eq(1)
yield dut.x_stall_i.eq(1)
- yield dut.m_valid_i.eq(1)
+ yield dut.m_i_valid.eq(1)
yield
yield
def read_from_addr(dut, addr):
yield dut.x_addr_i.eq(addr)
yield dut.x_ld_i.eq(1)
- yield dut.x_valid_i.eq(1)
+ yield dut.x_i_valid.eq(1)
yield dut.x_stall_i.eq(1)
yield
yield dut.x_stall_i.eq(0)
yield Settle()
while (yield dut.x_busy_o):
yield
- assert (yield dut.x_valid_i)
+ assert (yield dut.x_i_valid)
return (yield dut.m_ld_data_o)
yield dut.x_st_i.eq(1)
yield dut.x_mask_i.eq(1 << offset)
print("write_byte", addr, bin(1 << offset), hex(val << (offset*8)))
- yield dut.x_valid_i.eq(1)
- yield dut.m_valid_i.eq(1)
+ yield dut.x_i_valid.eq(1)
+ yield dut.m_i_valid.eq(1)
yield
yield dut.x_st_i.eq(0)
offset = addr & 0x3
yield dut.x_addr_i.eq(addr)
yield dut.x_ld_i.eq(1)
- yield dut.x_valid_i.eq(1)
+ yield dut.x_i_valid.eq(1)
yield
yield dut.x_ld_i.eq(0)
yield Settle()
while (yield dut.x_busy_o):
yield
- assert (yield dut.x_valid_i)
+ assert (yield dut.x_i_valid)
val = (yield dut.m_ld_data_o)
print("read_byte", addr, offset, hex(val))
return (val >> (offset * 8)) & 0xff
import unittest
from soc.config.test.test_loadstore import TestMemPspec
from soc.config.loadstore import ConfigMemoryPortInterface
+from openpower.exceptions import LDSTExceptionTuple
-def wait_busy(port, no=False):
+def wait_busy(port, no=False, debug=None):
+ cnt = 0
while True:
busy = yield port.busy_o
- print("busy", no, busy)
+ print("busy", no, busy, cnt, debug)
if bool(busy) == no:
break
yield
+ cnt += 1
-def wait_addr(port):
+def wait_addr(port,debug=None):
+ cnt = 0
while True:
addr_ok = yield port.addr_ok_o
- print("addrok", addr_ok)
- if addr_ok:
+ exc_happened = yield port.exc_o.happened
+ print("addrok", addr_ok,cnt,debug,exc_happened)
+ if addr_ok or exc_happened:
break
yield
+ cnt += 1
def wait_ldok(port):
yield
-def pi_st(port1, addr, data, datalen, msr_pr=0):
+def pi_st(port1, addr, data, datalen, msr, is_dcbz=0):
# have to wait until not busy
- yield from wait_busy(port1, no=False) # wait until not busy
+ yield from wait_busy(port1,debug="pi_st_A") # wait while busy
# set up a ST on the port. address first:
+ yield port1.is_dcbz_i.eq(is_dcbz) # reset dcbz too
yield port1.is_st_i.eq(1) # indicate ST
yield port1.data_len.eq(datalen) # ST length (1/2/4/8)
- yield port1.msr_pr.eq(msr_pr) # MSR PR bit (1==>virt, 0==>real)
+ yield port1.priv_mode.eq(~msr.pr)
+ yield port1.virt_mode.eq(msr.dr)
+ yield port1.mode_32bit.eq(~msr.sf)
yield port1.addr.data.eq(addr) # set address
yield port1.addr.ok.eq(1) # set ok
yield Settle()
+
+ # must check exception even before waiting for address.
+ # XXX TODO: wait_addr should check for exception
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ print("print fast ST exception happened")
+ yield # MUST wait for one clock cycle before de-asserting these
+ yield port1.is_st_i.eq(0) # end
+ yield port1.addr.ok.eq(0) # set !ok
+ yield port1.is_dcbz_i.eq(0) # reset dcbz too
+ return "fast", exc_info
+
yield from wait_addr(port1) # wait until addr ok
+
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ print("print fast ST exception happened")
+ yield # MUST wait for one clock cycle before de-asserting these
+ yield port1.is_st_i.eq(0) # end
+ yield port1.addr.ok.eq(0) # set !ok
+ yield port1.is_dcbz_i.eq(0) # reset dcbz too
+ return "fast", exc_info
+
+
# yield # not needed, just for checking
# yield # not needed, just for checking
# assert "ST" for one cycle (required by the API)
yield port1.st.ok.eq(1)
yield
yield port1.st.ok.eq(0)
- yield from wait_busy(port1, True) # wait while busy
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ print("print fast ST exception happened")
+ yield # MUST wait for one clock cycle before de-asserting these
+ yield port1.is_st_i.eq(0) # end
+ yield port1.addr.ok.eq(0) # set !ok
+ yield port1.is_dcbz_i.eq(0) # reset dcbz too
+ return "fast", exc_info
+
+ yield from wait_busy(port1,debug="pi_st_E") # wait while busy
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ yield # needed if mmu/dache is used
+ yield port1.is_st_i.eq(0) # end
+ yield port1.addr.ok.eq(0) # set !ok
+ yield port1.is_dcbz_i.eq(0) # reset dcbz too
+ yield # needed if mmu/dache is used
+ return "slow", exc_info
# can go straight to reset.
yield port1.is_st_i.eq(0) # end
yield port1.addr.ok.eq(0) # set !ok
+ yield port1.is_dcbz_i.eq(0) # reset dcbz too
+ yield # needed if mmu/dache is used
+
+ return None, None
+
+def get_exception_info(exc_o):
+ attrs = []
+ for fname in LDSTExceptionTuple._fields:
+ attr = getattr(exc_o, fname)
+ val = yield attr
+ attrs.append(val)
+ return LDSTExceptionTuple(*attrs)
+
+# copy of pi_st removed
-def pi_ld(port1, addr, datalen, msr_pr=0):
+def pi_ld(port1, addr, datalen, msr):
# have to wait until not busy
- yield from wait_busy(port1, no=False) # wait until not busy
+ yield from wait_busy(port1,debug="pi_ld_A") # wait while busy
# set up a LD on the port. address first:
yield port1.is_ld_i.eq(1) # indicate LD
yield port1.data_len.eq(datalen) # LD length (1/2/4/8)
- yield port1.msr_pr.eq(msr_pr) # MSR PR bit (1==>virt, 0==>real)
+ yield port1.priv_mode.eq(~msr.pr)
+ yield port1.virt_mode.eq(msr.dr)
+ yield port1.mode_32bit.eq(~msr.sf)
yield port1.addr.data.eq(addr) # set address
yield port1.addr.ok.eq(1) # set ok
yield Settle()
yield from wait_addr(port1) # wait until addr ok
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ print("print fast LD exception happened")
+ yield # MUST wait for one clock cycle before de-asserting these
+ yield port1.is_ld_i.eq(0) # end
+ yield port1.addr.ok.eq(0) # set !ok
+ return None, "fast", exc_info
+
yield
yield from wait_ldok(port1) # wait until ld ok
data = yield port1.ld.data
+ exc_info = yield from get_exception_info(port1.exc_o)
exc_happened = yield port1.exc_o.happened
+ exc_happened = exc_info.happened
# cleanup
yield port1.is_ld_i.eq(0) # end
yield port1.addr.ok.eq(0) # set !ok
if exc_happened:
- return 0
+ return None, "slow", exc_info
- yield from wait_busy(port1, no=False) # wait while not busy
+ yield from wait_busy(port1, debug="pi_ld_E") # wait while busy
- return data
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ return None, "slow", exc_info
+ return data, None, None
-def pi_ldst(arg, dut, msr_pr=0):
+
+def pi_ldst(arg, dut, msr):
# do two half-word stores at consecutive addresses, then two loads
addr1 = 0x04
data = 0xbeef
data2 = 0xf00f
#data = 0x4
- yield from pi_st(dut, addr1, data, 2, msr_pr)
- yield from pi_st(dut, addr2, data2, 2, msr_pr)
- result = yield from pi_ld(dut, addr1, 2, msr_pr)
- result2 = yield from pi_ld(dut, addr2, 2, msr_pr)
+ assert(yield from pi_st(dut, addr1, data, 2, msr) is None)
+ assert(yield from pi_st(dut, addr2, data2, 2, msr) is None)
+ result, exc = yield from pi_ld(dut, addr1, 2, msr)
+ result2, exc2 = yield from pi_ld(dut, addr2, 2, msr)
+ assert(exc is None)
+ assert(exc2 is None)
arg.assertEqual(data, result, "data %x != %x" % (result, data))
arg.assertEqual(data2, result2, "data2 %x != %x" % (result2, data2))
# now load both in a 32-bit load to make sure they're really consecutive
data3 = data | (data2 << 16)
- result3 = yield from pi_ld(dut, addr1, 4, msr_pr)
+ result3, exc3 = yield from pi_ld(dut, addr1, 4, msr)
+ assert(exc3 is None)
arg.assertEqual(data3, result3, "data3 %x != %x" % (result3, data3))
dut = Module()
pspec = TestMemPspec(ldst_ifacetype=ifacetype,
imem_ifacetype='',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
cmpi = ConfigMemoryPortInterface(pspec)
vcd_name='test_pi_%s.vcd' % ifacetype)
+# FIXME: TypeError: pi_ldst() missing 1 required positional argument: 'msr'
+@unittest.skip('broken')
class TestPIMem(unittest.TestCase):
-
def test_pi_mem(self):
tst_config_pi(self, 'testpi')
--- /dev/null
+ls180_pins.py
from nmigen.utils import log2_int
from nmigen.cli import rtlil
from soc.config.state import CoreState
+from openpower.consts import FastRegsEnum
# DMI register addresses
class DBGCore:
- CTRL = 0b0000
- STAT = 0b0001
+ CTRL = 0b0000 # Control: start/stop/reset
+ STAT = 0b0001 # Status (read started/stopped/stopping)
NIA = 0b0010 # NIA register (read only for now)
MSR = 0b0011 # MSR (read only)
GSPR_IDX = 0b0100 # GSPR register index
CR = 0b1000 # CR (read only)
XER = 0b1001 # XER (read only) - note this is a TEMPORARY hack
SVSTATE = 0b1010 # SVSTATE register (read only for now)
+ STOPADDR = 0b1011 # Address at which the core automatically stops
# CTRL register (direct actions, write 1 to act, read back 0)
self.core_stop_o = Signal()
self.core_rst_o = Signal()
self.icache_rst_o = Signal()
+ self.stopping_o = Signal(name="stopping")
# Core status inputs
self.terminate_i = Signal()
self.core_stopped_i = Signal()
self.state = CoreState("core_dbg")
- # GSPR register read port
- self.d_gpr = DbgReg("d_gpr")
-
- # CR register read port
- self.d_cr = DbgReg("d_cr")
-
- # XER register read port
- self.d_xer = DbgReg("d_xer")
+ self.d_gpr = DbgReg("d_gpr") # GSPR register read port
+ self.d_fast = DbgReg("d_fast") # GSPR register read port
+ self.d_cr = DbgReg("d_cr") # CR register read port
+ self.d_xer = DbgReg("d_xer") # XER register read port
# Core logging data
self.log_data_i = Signal(256)
self.log_read_data_o = Signal(64)
self.log_write_addr_o = Signal(32)
+ # address at which the processor stops automatically
+ # set to 0xffffffffffffffff by default (impossible to reach)
+ self.stop_addr_o = Signal(64, reset=-1)
+
# Misc
self.terminated_o = Signal()
m = Module()
comb, sync = m.d.comb, m.d.sync
dmi, d_gpr, d_cr, d_xer, = self.dmi, self.d_gpr, self.d_cr, self.d_xer
+ d_fast = self.d_fast
# DMI needs fixing... make a one clock pulse
dmi_req_i_1 = Signal()
stat_reg = Signal(64)
# Some internal latches
- stopping = Signal()
+ stopping = self.stopping_o
do_step = Signal()
do_reset = Signal()
do_icreset = Signal()
terminated = Signal()
do_gspr_rd = Signal()
+ # select either GPRs or FAST regs to read, based on GSPR_IDX
gspr_index = Signal.like(d_gpr.addr)
+ fast_index = Signal.like(d_gpr.addr)
+ gspr_en = Signal()
+ fast_en = Signal()
log_dmi_addr = Signal(32)
log_dmi_data = Signal(64)
LOG_INDEX_BITS = log2_int(self.LOG_LENGTH)
- # Single cycle register accesses on DMI except for GSPR data
+ # Single cycle register accesses on DMI except for registers
with m.Switch(dmi.addr_i):
with m.Case(DBGCore.GSPR_DATA):
- comb += dmi.ack_o.eq(d_gpr.ack)
- comb += d_gpr.req.eq(dmi.req_i)
+ with m.If(gspr_en): # GPR requested, acknowledge GPR
+ comb += dmi.ack_o.eq(d_gpr.ack)
+ comb += d_gpr.req.eq(dmi.req_i)
+ with m.If(fast_en): # FAST requested
+ comb += dmi.ack_o.eq(d_fast.ack)
+ comb += d_fast.req.eq(dmi.req_i)
with m.Case(DBGCore.CR):
comb += dmi.ack_o.eq(d_cr.ack)
comb += d_cr.req.eq(dmi.req_i)
comb += dmi.ack_o.eq(d_xer.ack)
comb += d_xer.req.eq(dmi.req_i)
with m.Default():
+ # everything else is immediate-acknowledgement (combinatorial)
comb += dmi.ack_o.eq(dmi.req_i)
# Status register read composition (DBUG_CORE_STAT_xxx)
# DMI read data mux
with m.Switch(dmi.addr_i):
- with m.Case( DBGCore.STAT):
+ with m.Case( DBGCore.STAT): # Status register
comb += dmi.dout.eq(stat_reg)
- with m.Case( DBGCore.NIA):
+ with m.Case( DBGCore.NIA): # NIA (PC)
comb += dmi.dout.eq(self.state.pc)
- with m.Case( DBGCore.MSR):
+ with m.Case( DBGCore.MSR): # MSR
comb += dmi.dout.eq(self.state.msr)
- with m.Case( DBGCore.SVSTATE):
+ with m.Case( DBGCore.SVSTATE): # SVSTATE
comb += dmi.dout.eq(self.state.svstate)
- with m.Case( DBGCore.GSPR_DATA):
- comb += dmi.dout.eq(d_gpr.data)
- with m.Case( DBGCore.LOG_ADDR):
+ with m.Case( DBGCore.GSPR_DATA): # GPR/FAST regs
+ with m.If(gspr_en):
+ comb += dmi.dout.eq(d_gpr.data) # GPR data selected
+ with m.If(fast_en):
+ comb += dmi.dout.eq(d_fast.data) # FAST reg read selected
+ with m.Case( DBGCore.LOG_ADDR): # Logging
comb += dmi.dout.eq(Cat(log_dmi_addr, self.log_write_addr_o))
with m.Case( DBGCore.LOG_DATA):
comb += dmi.dout.eq(log_dmi_data)
- with m.Case(DBGCore.CR):
+ with m.Case(DBGCore.CR): # CR
comb += dmi.dout.eq(d_cr.data)
- with m.Case(DBGCore.XER):
+ with m.Case(DBGCore.XER): # XER
comb += dmi.dout.eq(d_xer.data)
+ with m.Case(DBGCore.STOPADDR): # Halt PC
+ comb += dmi.dout.eq(self.stop_addr_o)
# DMI writes
# Reset the 1-cycle "do" signals
# GSPR address
with m.Elif(dmi.addr_i == DBGCore.GSPR_IDX):
- sync += gspr_index.eq(dmi.din)
+ sync += gspr_index.eq(0)
+ sync += fast_index.eq(0)
+ sync += gspr_en.eq(0)
+ sync += fast_en.eq(0)
+ with m.If(dmi.din <= 31):
+ sync += gspr_index.eq(dmi.din)
+ sync += gspr_en.eq(1)
+ # cover the FastRegs LR, CTR, SRR0, SRR1 etc.
+ # numbering is from microwatt
+ for x, i in FastRegsEnum.__dict__.items():
+ if not isinstance(i, int) or x == 'N_REGS':
+ continue
+ with m.If(dmi.din == 32+i):
+ sync += fast_index.eq(i)
+ sync += fast_en.eq(1)
# Log address
with m.Elif(dmi.addr_i == DBGCore.LOG_ADDR):
sync += log_dmi_addr.eq(dmi.din)
sync += do_dmi_log_rd.eq(1)
+
+ # set PC Halt address
+ with m.Elif(dmi.addr_i == DBGCore.STOPADDR):
+ sync += self.stop_addr_o.eq(dmi.din)
+
with m.Else():
# sync += Display("DMI read from " & to_string(dmi_addr))
pass
sync += terminated.eq(1)
comb += d_gpr.addr.eq(gspr_index)
+ comb += d_fast.addr.eq(fast_index)
# Core control signals generated by the debug module
- comb += self.core_stop_o.eq(stopping & ~do_step)
+ # Note: make stop and terminated synchronous, to help with timing
+ # however this *may* interfere with some of the DMI-based unit tests
+ # so has to be kept an eye on
+ sync += self.core_stop_o.eq((stopping & ~do_step) | self.terminate_i)
+ sync += self.terminated_o.eq(terminated | self.terminate_i)
comb += self.core_rst_o.eq(do_reset)
comb += self.icache_rst_o.eq(do_icreset)
- comb += self.terminated_o.eq(terminated)
# Logging RAM (none)
yield from self.d_gpr
yield from self.d_cr
yield from self.d_xer
+ yield from self.d_fast
yield self.log_data_i
yield self.log_read_addr_i
yield self.log_read_data_o
def jtagremote_server_recv(self, tdo):
data = self.get_data(1, 0) # read 1 byte, non-blocking
- if data is None:
+ if data is None or len(data) == 0:
return None # no data read
data = bytes.decode(data)
if self.debug:
yield
yield dut.bus.tms.eq(0)
+def tms_data_getset(dut, tms, d_len, d_in=0, reverse=False):
+ if reverse:
+ # Reverse the for loop to transmit MSB-first
+ bit_range = range(d_len-1, -1, -1)
+ else:
+ bit_range = range(d_len)
-def tms_data_getset(dut, tms, d_len, d_in=0):
res = 0
yield dut.bus.tms.eq(tms)
- for i in range(d_len):
+ for i in bit_range:
tdi = 1 if (d_in & (1<<i)) else 0
yield dut.bus.tck.eq(1)
res |= (1<<i) if (yield dut.bus.tdo) else 0
yield from tms_state_set(dut, [1, 1, 0])
-def jtag_read_write_reg(dut, addr, d_len, d_in=0):
+def jtag_read_write_reg(dut, addr, d_len, d_in=0, reverse=False):
yield from jtag_set_run(dut)
yield from jtag_set_shift_ir(dut)
yield from tms_data_getset(dut, 0, dut._ir_width, addr)
yield from jtag_set_idle(dut)
yield from jtag_set_shift_dr(dut)
- result = yield from tms_data_getset(dut, 0, d_len, d_in)
+ result = yield from tms_data_getset(dut, 0, d_len, d_in, reverse)
yield from jtag_set_idle(dut)
return result
# read DMI CTRL register
status = yield from jtag_read_write_reg(dut, DMI_READ, 64)
print ("dmi ctrl status", hex(status))
- assert status == 0
+ assert status == 6
# write DMI MSR address
yield from jtag_read_write_reg(dut, DMI_ADDR, 8, DBGCore.MSR)
# read DMI CTRL register
status = yield from jtag_read_write_reg(dut, DMI_READ, 64)
print ("dmi ctrl status", hex(status))
- assert status == 0
+ assert status == 6
# write DMI MSR address
yield from jtag_read_write_reg(dut, DMI_ADDR, 8, DBGCore.MSR)
if __name__ == '__main__':
- dut = JTAG(test_pinset(), wb_data_wid=64)
+ dut = JTAG(test_pinset(), wb_data_wid=64, domain="sync")
dut.stop = False
# rather than the client access the JTAG bus directly
cdut.c = JTAGClient()
dut.s.get_connection()
else:
+ print ("running server only as requested, use openocd remote to test")
+ sys.stdout.flush()
dut.s.get_connection(None) # block waiting for connection
# take copy of ir_width and scan_len
sim.add_sync_process(wrap(jtag_srv(dut))) # jtag server
if len(sys.argv) != 2 or sys.argv[1] != 'server':
sim.add_sync_process(wrap(jtag_sim(cdut, dut))) # actual jtag tester
- else:
- print ("running server only as requested, use openocd remote to test")
sim.add_sync_process(wrap(dmi_sim(dut))) # handles (pretends to be) DMI
with sim.write_vcd("dmi2jtag_test_srv.vcd"):
The basic rules are:
-1) p.ready_o is asserted on the initial ("Idle") state, otherwise it keeps low.
-2) n.valid_o is asserted on the final ("Done") state, otherwise it keeps low.
-3) The FSM stays in the Idle state while p.valid_i is low, otherwise
+1) p.o_ready is asserted on the initial ("Idle") state, otherwise it keeps low.
+2) n.o_valid is asserted on the final ("Done") state, otherwise it keeps low.
+3) The FSM stays in the Idle state while p.i_valid is low, otherwise
it accepts the input data and moves on.
-4) The FSM stays in the Done state while n.ready_i is low, otherwise
+4) The FSM stays in the Done state while n.i_ready is low, otherwise
it releases the output data and goes back to the Idle state.
"""
"""Simple sequential shifter
Prev port data:
- * p.data_i.data: value to be shifted
- * p.data_i.shift: shift amount
+ * p.i_data.data: value to be shifted
+ * p.i_data.shift: shift amount
* When zero, no shift occurs.
* On POWER, range is 0 to 63 for 32-bit,
* and 0 to 127 for 64-bit.
* op.sdir: shift direction (0 = left, 1 = right)
Next port data:
- * n.data_o.data: shifted value
+ * n.o_data.data: shifted value
"""
class PrevData:
def __init__(self, width):
- self.data = Signal(width, name="p_data_i")
+ self.data = Signal(width, name="p_i_data")
self.shift = Signal(width, name="p_shift_i")
self.ctx = Dummy() # comply with CompALU API
class NextData:
def __init__(self, width):
- self.data = Signal(width, name="n_data_o")
+ self.data = Signal(width, name="n_o_data")
def _get_data(self):
return [self.data]
self.width = width
self.p = PrevControl()
self.n = NextControl()
- self.p.data_i = Shifter.PrevData(width)
- self.n.data_o = Shifter.NextData(width)
+ self.p.i_data = Shifter.PrevData(width)
+ self.n.o_data = Shifter.NextData(width)
# more pieces to make this example class comply with the CompALU API
self.op = CompFSMOpSubset(name="op")
- self.p.data_i.ctx.op = self.op
- self.i = self.p.data_i._get_data()
- self.out = self.n.data_o._get_data()
+ self.p.i_data.ctx.op = self.op
+ self.i = self.p.i_data._get_data()
+ self.out = self.n.o_data._get_data()
def elaborate(self, platform):
m = Module()
# build the data flow
m.d.comb += [
# connect input and output
- shift_in.eq(self.p.data_i.data),
- self.n.data_o.data.eq(shift_reg),
+ shift_in.eq(self.p.i_data.data),
+ self.n.o_data.data.eq(shift_reg),
# generate shifted views of the register
shift_left_by_1.eq(Cat(0, shift_reg[:-1])),
shift_right_by_1.eq(Cat(shift_reg[1:], 0)),
with m.FSM():
with m.State("IDLE"):
m.d.comb += [
- # keep p.ready_o active on IDLE
- self.p.ready_o.eq(1),
+ # keep p.o_ready active on IDLE
+ self.p.o_ready.eq(1),
# keep loading the shift register and shift count
load.eq(1),
- next_count.eq(self.p.data_i.shift),
+ next_count.eq(self.p.i_data.shift),
]
# capture the direction bit as well
m.d.sync += direction.eq(self.op.sdir)
- with m.If(self.p.valid_i):
+ with m.If(self.p.i_valid):
# Leave IDLE when data arrives
with m.If(next_count == 0):
# short-circuit for zero shift
# exit when shift counter goes to zero
m.next = "DONE"
with m.State("DONE"):
- # keep n.valid_o active while the data is not accepted
- m.d.comb += self.n.valid_o.eq(1)
- with m.If(self.n.ready_i):
+ # keep n.o_valid active while the data is not accepted
+ m.d.comb += self.n.o_valid.eq(1)
+ with m.If(self.n.i_ready):
# go back to IDLE when the data is accepted
m.next = "IDLE"
def __iter__(self):
yield self.op.sdir
- yield self.p.data_i.data
- yield self.p.data_i.shift
- yield self.p.valid_i
- yield self.p.ready_o
- yield self.n.ready_i
- yield self.n.valid_o
- yield self.n.data_o.data
+ yield self.p.i_data.data
+ yield self.p.i_data.shift
+ yield self.p.i_valid
+ yield self.p.o_ready
+ yield self.n.i_ready
+ yield self.n.o_valid
+ yield self.n.o_data.data
def ports(self):
return list(self)
{'comment': 'Shifter Demonstration'},
('prev port', [
('op__sdir', 'in'),
- ('p_data_i[7:0]', 'in'),
+ ('p_i_data[7:0]', 'in'),
('p_shift_i[7:0]', 'in'),
({'submodule': 'p'}, [
- ('p_valid_i', 'in'),
- ('p_ready_o', 'out')])]),
+ ('p_i_valid', 'in'),
+ ('p_o_ready', 'out')])]),
('internal', [
'fsm_state' if is_engine_pysim() else 'fsm_state[1:0]',
'count[3:0]',
'shift_reg[7:0]']),
('next port', [
- ('n_data_o[7:0]', 'out'),
+ ('n_o_data[7:0]', 'out'),
({'submodule': 'n'}, [
- ('n_valid_o', 'out'),
- ('n_ready_i', 'in')])])]
+ ('n_o_valid', 'out'),
+ ('n_i_ready', 'in')])])]
write_gtkw("test_shifter.gtkw", "test_shifter.vcd",
gtkwave_desc, gtkwave_style,
sim.add_clock(1e-6)
def send(data, shift, direction):
- # present input data and assert valid_i
- yield dut.p.data_i.data.eq(data)
- yield dut.p.data_i.shift.eq(shift)
+ # present input data and assert i_valid
+ yield dut.p.i_data.data.eq(data)
+ yield dut.p.i_data.shift.eq(shift)
yield dut.op.sdir.eq(direction)
- yield dut.p.valid_i.eq(1)
+ yield dut.p.i_valid.eq(1)
yield
- # wait for p.ready_o to be asserted
- while not (yield dut.p.ready_o):
+ # wait for p.o_ready to be asserted
+ while not (yield dut.p.o_ready):
yield
- # clear input data and negate p.valid_i
- yield dut.p.valid_i.eq(0)
- yield dut.p.data_i.data.eq(0)
- yield dut.p.data_i.shift.eq(0)
+ # clear input data and negate p.i_valid
+ yield dut.p.i_valid.eq(0)
+ yield dut.p.i_data.data.eq(0)
+ yield dut.p.i_data.shift.eq(0)
yield dut.op.sdir.eq(0)
def receive(expected):
# signal readiness to receive data
- yield dut.n.ready_i.eq(1)
+ yield dut.n.i_ready.eq(1)
yield
- # wait for n.valid_o to be asserted
- while not (yield dut.n.valid_o):
+ # wait for n.o_valid to be asserted
+ while not (yield dut.n.o_valid):
yield
# read result
- result = yield dut.n.data_o.data
- # negate n.ready_i
- yield dut.n.ready_i.eq(0)
+ result = yield dut.n.o_data.data
+ # negate n.i_ready
+ yield dut.n.i_ready.eq(0)
# check result
assert result == expected
only one cycle (sync)
"""
-from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
+from nmigen import Elaboratable, Signal, Module, Const, Mux
from nmigen.hdl.rec import Record, Layout
from nmigen.cli import main
from nmigen.cli import verilog, rtlil
from soc.fu.alu.alu_input_record import CompALUOpSubset
from soc.fu.cr.cr_input_record import CompCROpSubset
+from soc.fu.pipe_data import FUBaseData
+from soc.fu.alu.pipe_data import CommonPipeSpec
+from soc.fu.compunits.compunits import FunctionUnitBaseSingle
+
import operator
class DummyALU(Elaboratable):
def __init__(self, width):
self.p = Dummy() # make look like nmutil pipeline API
- self.p.data_i = Dummy()
- self.p.data_i.ctx = Dummy()
+ self.p.i_data = Dummy()
+ self.p.i_data.ctx = Dummy()
self.n = Dummy() # make look like nmutil pipeline API
- self.n.data_o = Dummy()
- self.p.valid_i = Signal()
- self.p.ready_o = Signal()
- self.n.ready_i = Signal()
- self.n.valid_o = Signal()
+ self.n.o_data = Dummy()
+ self.p.i_valid = Signal()
+ self.p.o_ready = Signal()
+ self.n.i_ready = Signal()
+ self.n.o_valid = Signal()
self.counter = Signal(4)
self.op = CompCROpSubset()
i = []
i.append(Signal(width, name="i1"))
i.append(Signal(width, name="i2"))
i.append(Signal(width, name="i3"))
- self.i = Array(i)
+ self.i = i
self.a, self.b, self.c = i[0], i[1], i[2]
- self.out = Array([Signal(width, name="alu_o")])
+ self.out = tuple([Signal(width, name="alu_o")])
self.o = self.out[0]
self.width = width
# more "look like nmutil pipeline API"
- self.p.data_i.ctx.op = self.op
- self.p.data_i.a = self.a
- self.p.data_i.b = self.b
- self.p.data_i.c = self.c
- self.n.data_o.o = self.o
+ self.p.i_data.ctx.op = self.op
+ self.p.i_data.a = self.a
+ self.p.i_data.b = self.b
+ self.p.i_data.c = self.c
+ self.n.o_data.o = self.o
def elaborate(self, platform):
m = Module()
go_now = Signal(reset_less=True) # testing no-delay ALU
- with m.If(self.p.valid_i):
+ with m.If(self.p.i_valid):
# input is valid. next check, if we already said "ready" or not
- with m.If(~self.p.ready_o):
+ with m.If(~self.p.o_ready):
# we didn't say "ready" yet, so say so and initialise
- m.d.sync += self.p.ready_o.eq(1)
+ m.d.sync += self.p.o_ready.eq(1)
m.d.sync += self.o.eq(self.a)
m.d.comb += go_now.eq(1)
with m.Else():
# input says no longer valid, so drop ready as well.
# a "proper" ALU would have had to sync in the opcode and a/b ops
- m.d.sync += self.p.ready_o.eq(0)
+ m.d.sync += self.p.o_ready.eq(0)
# ok so the counter's running: when it gets to 1, fire the output
with m.If((self.counter == 1) | go_now):
# set the output as valid if the recipient is ready for it
- m.d.sync += self.n.valid_o.eq(1)
- with m.If(self.n.ready_i & self.n.valid_o):
- m.d.sync += self.n.valid_o.eq(0)
+ m.d.sync += self.n.o_valid.eq(1)
+ with m.If(self.n.i_ready & self.n.o_valid):
+ m.d.sync += self.n.o_valid.eq(0)
# recipient said it was ready: reset back to known-good.
m.d.sync += self.counter.eq(0) # reset the counter
m.d.sync += self.o.eq(0) # clear the output for tidiness sake
def ports(self):
return list(self)
+#####################
+# converting even this dummy ALU over to the FunctionUnit RegSpecs API
+# which, errr, note that the regspecs are totally ignored below, but
+# at least the widths are all 64-bit so it's okay.
+#####################
+
+# input (and output) for logical initial stage (common input)
+
+
+class ALUInputData(FUBaseData):
+ regspec = [('INT', 'a', '0:63'), # RA
+ ('INT', 'b', '0:63'), # RB/immediate
+ ]
+
+ def __init__(self, pspec):
+ super().__init__(pspec, False)
+
+
+# output from ALU final stage
+class ALUOutputData(FUBaseData):
+ regspec = [('INT', 'o', '0:63'), # RT
+ ]
+
+ def __init__(self, pspec):
+ super().__init__(pspec, True)
+
+
+# ALU pipe specification class
+class ALUPipeSpec(CommonPipeSpec):
+ regspec = (ALUInputData.regspec, ALUOutputData.regspec)
+ opsubsetkls = CompALUOpSubset
+
+
+class ALUFunctionUnit(FunctionUnitBaseSingle):
+ # class ALUFunctionUnit(FunctionUnitBaseMulti):
+ fnunit = Function.ALU
+
+ def __init__(self, idx, parent_pspec):
+ super().__init__(ALUPipeSpec, ALU, 1, parent_pspec)
+
class ALU(Elaboratable):
def __init__(self, width):
+ # XXX major temporary hack: attempting to convert
+ # ALU over to RegSpecs API, FunctionUnitBaseSingle passes in
+ # a regspec here which we can't cope with. therefore, errr...
+ # just throw it away and set the width to 64
+ if not isinstance(width, int):
+ width = 64
+ # TODO, really this should just inherit from ControlBase it would
+ # be a lot less messy.
self.p = Dummy() # make look like nmutil pipeline API
- self.p.data_i = Dummy()
- self.p.data_i.ctx = Dummy()
+ self.p.i_data = Dummy()
+ self.p.i_data.ctx = Dummy()
self.n = Dummy() # make look like nmutil pipeline API
- self.n.data_o = Dummy()
- self.p.valid_i = Signal()
- self.p.ready_o = Signal()
- self.n.ready_i = Signal()
- self.n.valid_o = Signal()
+ self.n.o_data = Dummy()
+ self.p.i_valid = Signal()
+ self.p.o_ready = Signal()
+ self.n.i_ready = Signal()
+ self.n.o_valid = Signal()
self.counter = Signal(4)
self.op = CompALUOpSubset(name="op")
i = []
i.append(Signal(width, name="i1"))
i.append(Signal(width, name="i2"))
- self.i = Array(i)
+ self.i = i
self.a, self.b = i[0], i[1]
out = []
out.append(Data(width, name="alu_o"))
out.append(Data(width, name="alu_cr"))
- self.out = Array(out)
+ self.out = tuple(out)
self.o = self.out[0]
self.cr = self.out[1]
self.width = width
- # more "look like nmutil pipeline API"
- self.p.data_i.ctx.op = self.op
- self.p.data_i.a = self.a
- self.p.data_i.b = self.b
- self.n.data_o.o = self.o
- self.n.data_o.cr = self.cr
+ # more "look like nmutil ControlBase pipeline API" stuff
+ self.p.i_data.ctx.op = self.op
+ self.p.i_data.a = self.a
+ self.p.i_data.b = self.b
+ self.n.o_data.o = self.o
+ self.n.o_data.cr = self.cr
def elaborate(self, platform):
m = Module()
with m.If(go_now):
# with a combinatorial, no-delay ALU, just pass through
# the handshake signals to the other side
- m.d.comb += self.p.ready_o.eq(self.n.ready_i)
- m.d.comb += self.n.valid_o.eq(self.p.valid_i)
+ m.d.comb += self.p.o_ready.eq(self.n.i_ready)
+ m.d.comb += self.n.o_valid.eq(self.p.i_valid)
with m.Else():
# sequential ALU handshake:
- # ready_o responds to valid_i, but only if the ALU is idle
- m.d.comb += self.p.ready_o.eq(alu_idle)
- # select the internally generated valid_o, above
- m.d.comb += self.n.valid_o.eq(alu_done)
+ # o_ready responds to i_valid, but only if the ALU is idle
+ m.d.comb += self.p.o_ready.eq(alu_idle)
+ # select the internally generated o_valid, above
+ m.d.comb += self.n.o_valid.eq(alu_done)
- # hold the ALU result until ready_o is asserted
+ # hold the ALU result until o_ready is asserted
alu_r = Signal(self.width)
# output masks
m.d.comb += self.cr.ok.eq(self.op.rc.rc)
with m.If(alu_idle):
- with m.If(self.p.valid_i):
+ with m.If(self.p.i_valid):
# as this is a "fake" pipeline, just grab the output right now
with m.If(self.op.insn_type == MicrOp.OP_ADD):
with m.Else():
m.d.comb += go_now.eq(1)
- with m.Elif(~alu_done | self.n.ready_i):
+ with m.Elif(~alu_done | self.n.i_ready):
# decrement the counter while the ALU is neither idle nor finished
m.d.sync += self.counter.eq(self.counter - 1)
yield self.a
yield self.b
yield from self.o.ports()
- yield self.p.valid_i
- yield self.p.ready_o
- yield self.n.valid_o
- yield self.n.ready_i
+ yield self.p.i_valid
+ yield self.p.o_ready
+ yield self.n.o_valid
+ yield self.n.i_ready
def ports(self):
return list(self)
class BranchALU(Elaboratable):
def __init__(self, width):
self.p = Dummy() # make look like nmutil pipeline API
- self.p.data_i = Dummy()
- self.p.data_i.ctx = Dummy()
+ self.p.i_data = Dummy()
+ self.p.i_data.ctx = Dummy()
self.n = Dummy() # make look like nmutil pipeline API
- self.n.data_o = Dummy()
- self.p.valid_i = Signal()
- self.p.ready_o = Signal()
- self.n.ready_i = Signal()
- self.n.valid_o = Signal()
+ self.n.o_data = Dummy()
+ self.p.i_valid = Signal()
+ self.p.o_ready = Signal()
+ self.n.i_ready = Signal()
+ self.n.o_valid = Signal()
self.counter = Signal(4)
self.op = Signal(2)
i = []
i.append(Signal(width, name="i1"))
i.append(Signal(width, name="i2"))
- self.i = Array(i)
+ self.i = i
self.a, self.b = i[0], i[1]
- self.out = Array([Signal(width)])
+ self.out = tuple([Signal(width)])
self.o = self.out[0]
self.width = width
]
go_now = Signal(reset_less=True) # testing no-delay ALU
- with m.If(self.p.valid_i):
+ with m.If(self.p.i_valid):
# input is valid. next check, if we already said "ready" or not
- with m.If(~self.p.ready_o):
+ with m.If(~self.p.o_ready):
# we didn't say "ready" yet, so say so and initialise
- m.d.sync += self.p.ready_o.eq(1)
+ m.d.sync += self.p.o_ready.eq(1)
# as this is a "fake" pipeline, just grab the output right now
with m.Switch(self.op):
with m.Else():
# input says no longer valid, so drop ready as well.
# a "proper" ALU would have had to sync in the opcode and a/b ops
- m.d.sync += self.p.ready_o.eq(0)
+ m.d.sync += self.p.o_ready.eq(0)
# ok so the counter's running: when it gets to 1, fire the output
with m.If((self.counter == 1) | go_now):
# set the output as valid if the recipient is ready for it
- m.d.sync += self.n.valid_o.eq(1)
- with m.If(self.n.ready_i & self.n.valid_o):
- m.d.sync += self.n.valid_o.eq(0)
+ m.d.sync += self.n.o_valid.eq(1)
+ with m.If(self.n.i_ready & self.n.o_valid):
+ m.d.sync += self.n.o_valid.eq(0)
# recipient said it was ready: reset back to known-good.
m.d.sync += self.counter.eq(0) # reset the counter
m.d.sync += self.o.eq(0) # clear the output for tidiness sake
yield dut.b.eq(b)
yield dut.op.insn_type.eq(op)
yield dut.op.invert_in.eq(inv_a)
- yield dut.n.ready_i.eq(0)
- yield dut.p.valid_i.eq(1)
- yield dut.n.ready_i.eq(1)
+ yield dut.n.i_ready.eq(0)
+ yield dut.p.i_valid.eq(1)
+ yield dut.n.i_ready.eq(1)
yield
# wait for the ALU to accept our input data
- while not (yield dut.p.ready_o):
+ while not (yield dut.p.o_ready):
yield
- yield dut.p.valid_i.eq(0)
+ yield dut.p.i_valid.eq(0)
yield dut.a.eq(0)
yield dut.b.eq(0)
yield dut.op.insn_type.eq(0)
yield dut.op.invert_in.eq(0)
# wait for the ALU to present the output data
- while not (yield dut.n.valid_o):
+ while not (yield dut.n.o_valid):
yield
# latch the result and lower read_i
result = yield dut.o.data
- yield dut.n.ready_i.eq(0)
+ yield dut.n.i_ready.eq(0)
return result
sim.add_clock(1e-6)
def send(a, b, op, inv_a=0, rc=0):
- # present input data and assert valid_i
+ # present input data and assert i_valid
yield dut.a.eq(a)
yield dut.b.eq(b)
yield dut.op.insn_type.eq(op)
yield dut.op.invert_in.eq(inv_a)
yield dut.op.rc.rc.eq(rc)
- yield dut.p.valid_i.eq(1)
+ yield dut.p.i_valid.eq(1)
yield
- # wait for ready_o to be asserted
- while not (yield dut.p.ready_o):
+ # wait for o_ready to be asserted
+ while not (yield dut.p.o_ready):
yield
- # clear input data and negate valid_i
+ # clear input data and negate i_valid
# if send is called again immediately afterwards, there will be no
# visible transition (they will not be negated, after all)
- yield dut.p.valid_i.eq(0)
+ yield dut.p.i_valid.eq(0)
yield dut.a.eq(0)
yield dut.b.eq(0)
yield dut.op.insn_type.eq(0)
def receive():
# signal readiness to receive data
- yield dut.n.ready_i.eq(1)
+ yield dut.n.i_ready.eq(1)
yield
- # wait for valid_o to be asserted
- while not (yield dut.n.valid_o):
+ # wait for o_valid to be asserted
+ while not (yield dut.n.o_valid):
yield
# read results
result = yield dut.o.data
cr = yield dut.cr.data
- # negate ready_i
+ # negate i_ready
# if receive is called again immediately afterwards, there will be no
# visible transition (it will not be negated, after all)
- yield dut.n.ready_i.eq(0)
+ yield dut.n.i_ready.eq(0)
return result, cr
def producer():
'i2[15:0]',
'op__insn_type' if pysim else 'op__insn_type[6:0]',
'op__invert_in',
- 'valid_i',
- 'ready_o',
- 'valid_o',
- 'ready_i',
+ 'i_valid',
+ 'o_ready',
+ 'o_valid',
+ 'i_ready',
'alu_o[15:0]',
'alu_o_ok',
'alu_cr[15:0]',
# TODO: replace with Memory at some point
-from nmigen import Elaboratable, Signal, Array, Module
+from nmigen import Elaboratable, Signal, Array, Module, Memory
from nmutil.util import Display
+
class CacheRam(Elaboratable):
def __init__(self, ROW_BITS=16, WIDTH = 64, TRACE=True, ADD_BUF=False,
ADD_BUF = self.ADD_BUF
SIZE = 2**ROW_BITS
- ram = Array(Signal(WIDTH) for i in range(SIZE))
+ # set up the Cache RAM Memory and create one read and one write port
+ # the read port is *not* transparent (does not pass write-thru-read)
#attribute ram_style of ram : signal is "block";
-
- rd_data0 = Signal(WIDTH)
-
+ ram = Memory(depth=SIZE, width=WIDTH,
+ attrs={'syn_ramstyle': "block_ram"})
+ m.submodules.rdport = rdport = ram.read_port(transparent=False)
+ m.submodules.wrport = wrport = ram.write_port(granularity=8)
+
with m.If(TRACE):
with m.If(self.wr_sel.bool()):
sync += Display( "write ramno %d a: %%x "
"sel: %%x dat: %%x" % self.ram_num,
self.wr_addr,
self.wr_sel, self.wr_data)
- for i in range(WIDTH//8):
- lbit = i * 8;
- mbit = lbit + 8;
- with m.If(self.wr_sel[i]):
- sync += ram[self.wr_addr][lbit:mbit].eq(self.wr_data[lbit:mbit])
- with m.If(self.rd_en):
- sync += rd_data0.eq(ram[self.rd_addr])
- if TRACE:
+
+ # read data output and a latched copy. behaves like microwatt cacheram
+ rd_data0 = Signal(WIDTH)
+ rd_data0l = Signal(WIDTH)
+
+ # delay on read address/en
+ rd_delay = Signal()
+ rd_delay_addr = Signal.like(self.rd_addr)
+ sync += rd_delay_addr.eq(self.rd_addr)
+ sync += rd_delay.eq(self.rd_en)
+
+ # write port
+ comb += wrport.addr.eq(self.wr_addr)
+ comb += wrport.en.eq(self.wr_sel)
+ comb += wrport.data.eq(self.wr_data)
+
+ # read port (include a latch on the output, for microwatt compatibility)
+ comb += rdport.addr.eq(self.rd_addr)
+ comb += rdport.en.eq(self.rd_en)
+ with m.If(rd_delay):
+ comb += rd_data0.eq(rdport.data)
+ sync += rd_data0l.eq(rd_data0) # preserve latched data
+ with m.Else():
+ comb += rd_data0.eq(rd_data0l) # output latched (last-read)
+
+ if TRACE:
+ with m.If(rd_delay):
sync += Display("read ramno %d a: %%x dat: %%x" % self.ram_num,
- self.rd_addr, ram[self.rd_addr])
+ rd_delay_addr, rd_data0)
pass
-
+ # extra delay requested?
if ADD_BUF:
sync += self.rd_data_o.eq(rd_data0)
else:
self.src2_i = Signal(rwid, reset_less=True) # oper2 in
self.busy_o = Signal(reset_less=True) # fn busy out
- self.data_o = Signal(rwid, reset_less=True) # Dest out
+ self.o_data = Signal(rwid, reset_less=True) # Dest out
self.rd_rel_o = Signal(reset_less=True) # release src1/src2 request
- # release request out (valid_o)
+ # release request out (o_valid)
self.req_rel_o = Signal(reset_less=True)
self.done_o = self.req_rel_o # 'normalise' API
# NOTE: this spells TROUBLE if the ALU isn't ready!
# go_read is only valid for one clock!
with m.If(self.go_rd_i): # src operands ready, GO!
- with m.If(~self.alu.p_ready_o): # no ACK yet
- m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid
+ with m.If(~self.alu.p_o_ready): # no ACK yet
+ m.d.comb += self.alu.p_i_valid.eq(1) # so indicate valid
# only proceed if ALU says its output is valid
- with m.If(self.alu.n_valid_o):
+ with m.If(self.alu.n_o_valid):
# when ALU ready, write req release out. waits for shadow
m.d.comb += self.req_rel_o.eq(req_l.q & busy_o & self.shadown_i)
# when output latch is ready, and ALU says ready, accept ALU output
with m.If(self.req_rel_o & self.go_wr_i):
# tells ALU "thanks got it"
- m.d.comb += self.alu.n_ready_i.eq(1)
+ m.d.comb += self.alu.n_i_ready.eq(1)
# output the data from the latch on go_write
with m.If(self.go_wr_i):
- m.d.comb += self.data_o.eq(data_r)
+ m.d.comb += self.o_data.eq(data_r)
return m
yield self.busy_o
yield self.rd_rel_o
yield self.req_rel_o
- yield self.data_o
+ yield self.o_data
def ports(self):
return list(self)
yield
yield dut.go_rd_i.eq(0)
req_rel_o = yield dut.req_rel_o
- result = yield dut.data_o
+ result = yield dut.o_data
print("req_rel", req_rel_o, result)
while True:
req_rel_o = yield dut.req_rel_o
- result = yield dut.data_o
+ result = yield dut.o_data
print("req_rel", req_rel_o, result)
if req_rel_o:
break
yield
yield dut.go_wr_i.eq(1)
yield
- result = yield dut.data_o
+ result = yield dut.o_data
print("result", result)
yield dut.go_wr_i.eq(0)
yield
# output (busy/done)
self.busy_o = Signal(name="cu_busy_o", reset_less=True) # fn busy out
self.done_o = Signal(name="cu_done_o", reset_less=True)
+ self.alu_done_o = Signal(name="cu_alu_done_o", reset_less=True)
class MultiCompUnit(RegSpecALUAPI, Elaboratable):
- def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1, name=None):
+ def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1, name=None,
+ sync_rw=True):
"""MultiCompUnit
* :rwid: width of register latches (TODO: allocate per regspec)
* :n_dst: number of destination operands
"""
RegSpecALUAPI.__init__(self, rwid, alu)
+ self.sync_rw = sync_rw
self.alu_name = name or "alu"
self.opsubsetkls = opsubsetkls
self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst,
self.wr = cu.wr
self.rdmaskn = cu.rdmaskn
self.wrmask = cu.wrmask
+ self.alu_done_o = cu.alu_done_o
self.go_rd_i = self.rd.go_i # temporary naming
self.go_wr_i = self.wr.go_i # temporary naming
self.rd_rel_o = self.rd.rel_o # temporary naming
self.busy_o = cu.busy_o
self.dest = cu._dest
- self.data_o = self.dest[0] # Dest out
+ self.o_data = self.dest[0] # Dest out
self.done_o = cu.done_o
def _mux_op(self, m, sl, op_is_imm, imm, i):
def elaborate(self, platform):
m = Module()
- setattr(m.submodules, self.alu_name, self.alu)
+ if self.sync_rw:
+ rw_domain = m.d.sync
+ else:
+ rw_domain = m.d.comb
+ # generate a pulse on system reset, to reset any latches, if needed
+ system_reset = Signal(reset=1)
+ m.d.sync += system_reset.eq(0)
+
+ # add the ALU to the MultiCompUnit only if it is a "real" ALU
+ # see AllFunctionUnits as to why: a FunctionUnitBaseMulti
+ # only has one "real" ALU but multiple pseudo front-ends,
+ # aka "ReservationStations" (ALUProxy "fronts")
+ if isinstance(self.alu, Elaboratable):
+ setattr(m.submodules, self.alu_name, self.alu)
m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
# ALU only proceeds when all src are ready. rd_rel_o is delayed
# so combine it with go_rd_i. if all bits are set we're good
all_rd = Signal(reset_less=True)
- m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
+ m.d.comb += all_rd.eq(self.busy_o & # rok_l.q & # XXX LOOP
(((~self.rd.rel_o) | self.rd.go_i).all()))
# generate read-done pulse
all_rd_pulse = Signal(reset_less=True)
- m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd))
+ m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd)) # XXX LOOP
# create rising pulse from alu valid condition.
- alu_done = Signal(reset_less=True)
+ alu_done = self.cu.alu_done_o
alu_pulse = Signal(reset_less=True)
alu_pulsem = Signal(self.n_dst, reset_less=True)
- m.d.comb += alu_done.eq(self.alu.n.valid_o)
+ m.d.comb += alu_done.eq(self.alu.n.o_valid)
m.d.comb += alu_pulse.eq(rising_edge(m, alu_done))
m.d.comb += alu_pulsem.eq(Repl(alu_pulse, self.n_dst))
# is enough, when combined with when read-phase is done (rst_l.q)
wr_any = Signal(reset_less=True)
req_done = Signal(reset_less=True)
- m.d.comb += self.done_o.eq(self.busy_o &
- ~((self.wr.rel_o & ~self.wrmask).bool()))
+ m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel_o).bool())
m.d.comb += wr_any.eq(self.wr.go_i.bool() | prev_wr_go.bool())
- m.d.comb += req_done.eq(wr_any & ~self.alu.n.ready_i &
- ((req_l.q & self.wrmask) == 0))
+ m.d.comb += req_done.eq(wr_any & ~self.alu.n.i_ready & (req_l.q == 0))
# argh, complicated hack: if there are no regs to write,
# instead of waiting for regs that are never going to happen,
# we indicate "done" when the ALU is "done"
with m.If((self.wrmask == 0) &
- self.alu.n.ready_i & self.alu.n.valid_o & self.busy_o):
+ self.alu.n.i_ready & self.alu.n.o_valid & self.busy_o):
m.d.comb += req_done.eq(1)
# shadow/go_die
m.d.comb += reset.eq(req_done | self.go_die_i)
m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
m.d.comb += reset_w.eq(self.wr.go_i | Repl(self.go_die_i, self.n_dst))
- m.d.comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
+ m.d.comb += reset_r.eq(self.rd.go_i | Repl(rst_r, self.n_src))
# read-done,wr-proceed latch
- m.d.sync += rok_l.s.eq(self.issue_i) # set up when issue starts
- m.d.sync += rok_l.r.eq(self.alu.n.valid_o & self.busy_o) # ALU done
+ rw_domain += rok_l.s.eq(self.issue_i) # set up when issue starts
+ rw_domain += rok_l.r.eq(self.alu.n.o_valid & self.busy_o) # ALUdone LOOP
# wr-done, back-to-start latch
- m.d.sync += rst_l.s.eq(all_rd) # set when read-phase is fully done
- m.d.sync += rst_l.r.eq(rst_r) # *off* on issue
+ rw_domain += rst_l.s.eq(all_rd) # set when read-phase is fully done
+ rw_domain += rst_l.r.eq(rst_r) # *off* on issue
# opcode latch (not using go_rd_i) - inverted so that busy resets to 0
m.d.sync += opc_l.s.eq(self.issue_i) # set on issue
m.d.sync += opc_l.r.eq(req_done) # reset on ALU
- # src operand latch (not using go_wr_i)
- m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
+ # src operand latch (not using go_wr_i) ANDed with rdmask
+ rdmaskn = Signal(self.n_src)
+ latchregister(m, self.rdmaskn, rdmaskn, self.issue_i, name="rdmask_l")
+ m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
m.d.sync += src_l.r.eq(reset_r)
# dest operand latch (not using issue_i)
- m.d.sync += req_l.s.eq(alu_pulsem & self.wrmask)
- m.d.sync += req_l.r.eq(reset_w | prev_wr_go)
+ rw_domain += req_l.s.eq(alu_pulsem & self.wrmask)
+ m.d.comb += req_l.r.eq(reset_w | prev_wr_go |
+ Repl(system_reset, self.n_dst))
# pass operation to the ALU (sync: plenty time to wait for src reads)
op = self.get_op()
name = "data_r%d" % i
lro = self.get_out(i)
ok = Const(1, 1)
+ data_r_ok = Const(1, 1)
if isinstance(lro, Record):
+ print("wr fields", i, lro, lro.fields)
data_r = Record.like(lro, name=name)
- print("wr fields", i, lro, data_r.fields)
# bye-bye abstract interface design..
- fname = find_ok(data_r.fields)
+ fname = find_ok(lro.fields)
if fname:
ok = getattr(lro, fname)
+ data_r_ok = getattr(data_r, fname)
+ # write-ok based on incoming output *and* whether the latched
+ # data was ok.
+ # XXX fails - wrok.append((ok|data_r_ok) & self.busy_o)
+ wrok.append(ok & self.busy_o)
else:
- data_r = Signal.like(lro, name=name, reset_less=True)
- wrok.append(ok & self.busy_o)
- with m.If(alu_pulse):
- m.d.sync += data_r.eq(lro)
+ data_r = Signal.like(lro, name=name)
+ # really should retire this but it's part of unit tests
+ wrok.append(ok & self.busy_o)
+ #latchregister(m, lro, data_r, ok & self.busy_o, name=name)
+ latchregister(m, lro, data_r, alu_pulse, name=name)
with m.If(self.issue_i):
- m.d.sync += data_r.eq(0)
+ m.d.comb += data_r.eq(0)
drl.append(data_r)
# ok, above we collated anything with an "ok" on the output side
# create a latch/register for src1/src2 (even if it is a copy of imm)
for i in range(self.n_src):
src, alusrc, latch, _ = sl[i]
- latchregister(m, src, alusrc, latch, name="src_r%d" % i)
+ reg = latchregister(m, src, alusrc, latch, name="src_r%d" % i)
+ # rdmask stops src latches from being set. clear all if not busy
+ with m.If(~self.busy_o):
+ m.d.sync += reg.eq(0)
# -----
# ALU connection / interaction
# on a go_read, tell the ALU we're accepting data.
m.submodules.alui_l = alui_l = SRLatch(False, name="alui")
- m.d.comb += self.alu.p.valid_i.eq(alui_l.q)
- m.d.sync += alui_l.r.eq(self.alu.p.ready_o & alui_l.q)
+ m.d.comb += self.alu.p.i_valid.eq(alui_l.q)
+ m.d.sync += alui_l.r.eq(self.alu.p.o_ready & alui_l.q)
m.d.comb += alui_l.s.eq(all_rd_pulse)
# ALU output "ready" side. alu "ready" indication stays hi until
# ALU says "valid".
m.submodules.alu_l = alu_l = SRLatch(False, name="alu")
- m.d.comb += self.alu.n.ready_i.eq(alu_l.q)
- m.d.sync += alu_l.r.eq(self.alu.n.valid_o & alu_l.q)
- m.d.comb += alu_l.s.eq(all_rd_pulse)
+ m.d.comb += self.alu.n.i_ready.eq(alu_l.q)
+ m.d.sync += alu_l.r.eq(self.alu.n.o_valid & alu_l.q)
+ m.d.comb += alu_l.s.eq(all_rd_pulse) # XXX LOOP
# -----
# outputs
m.d.comb += self.busy_o.eq(opc_l.q) # busy out
# read-release gated by busy (and read-mask)
- bro = Repl(self.busy_o, self.n_src)
- m.d.comb += self.rd.rel_o.eq(src_l.q & bro & slg & ~self.rdmaskn)
+ if True: #self.sync_rw: - experiment (doesn't work)
+ bro = Repl(self.busy_o, self.n_src)
+ else:
+ bro = Repl(self.busy_o|self.issue_i, self.n_src)
+ m.d.comb += self.rd.rel_o.eq(src_l.q & bro & slg)
# write-release gated by busy and by shadow (and write-mask)
brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
- m.d.comb += self.wr.rel_o.eq(req_l.q & brd & self.wrmask)
+ m.d.comb += self.wr.rel_o.eq(req_l.q_int & brd)
# output the data from the latch on go_write
for i in range(self.n_dst):
yield self.busy_o
yield self.rd.rel_o
yield self.wr.rel_o
- yield self.data_o
+ yield self.o_data
def ports(self):
return list(self)
and (as long as there was no exception) the data comes out (at any
time from the PortInterface), and is captured by the LDCompSTUnit.
+TODO: dcbz, yes, that's going to be complicated, has to be done
+ with great care, to detect the case when dcbz is set
+ and *not* expect to read any data, just the address.
+ so, wait for RA but not RB.
+
Both LD and ST may request that the address be computed from summing
operand1 (src[0]) with operand2 (src[1]) *or* by summing operand1 with
the immediate (from the opcode).
* A third FSM activates to cover ST. it activates if op_is_st is true
+ * TODO document DCBZ (not complete yet)
+
* The "overall" (fourth) FSM coordinates the progression and completion
of the three other FSMs, firing "WR_RESET" which switches off "busy"
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl, C
from nmigen.hdl.rec import Record, Layout
from nmutil.latch import SRLatch, latchregister
from soc.fu.ldst.ldst_input_record import CompLDSTOpSubset
from openpower.decoder.power_decoder2 import Data
from openpower.consts import MSR
+from soc.config.test.test_loadstore import TestMemPspec
+
+# for debugging dcbz
+from nmutil.util import Display
# TODO: LDSTInputData and LDSTOutputData really should be used
Data (outputs)
--------------
- * :data_o: Dest out (LD) - managed by wr[0] go/req
+ * :o_data: Dest out (LD) - managed by wr[0] go/req
* :addr_o: Address out (LD or ST) - managed by wr[1] go/req
* :exc_o: Address/Data Exception occurred. LD/ST must terminate
TODO: use one module for the byte-reverse as it's quite expensive in gates
"""
- def __init__(self, pi=None, rwid=64, awid=48, opsubset=CompLDSTOpSubset,
+ def __init__(self, pi=None, rwid=64, awid=64, opsubset=CompLDSTOpSubset,
debugtest=False, name=None):
super().__init__(rwid)
self.awid = awid
self.pi = pi
self.cu = cu = LDSTCompUnitRecord(rwid, opsubset, name=name)
- self.debugtest = debugtest
+ self.debugtest = debugtest # enable debug output for unit testing
# POWER-compliant LD/ST has index and update: *fixed* number of ports
self.n_src = n_src = 3 # RA, RB, RT/RS
- self.n_dst = n_dst = 2 # RA, RT/RS
+ self.n_dst = n_dst = 3 # RA, RT/RS, CR0
# set up array of src and dest signals
for i in range(n_src):
self.oper_i = cu.oper_i
self.src_i = cu._src_i
- self.data_o = Data(self.data_wid, name="o") # Dest1 out: RT
+ self.o_data = Data(self.data_wid, name="o") # Dest1 out: RT
self.addr_o = Data(self.data_wid, name="ea") # Addr out: Update => RA
+ self.cr_o = Data(4, name="cr0") # CR0 (for stdcx etc)
self.exc_o = cu.exc_o
self.done_o = cu.done_o
self.busy_o = cu.busy_o
#####################
# latches for the FSM.
- m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
+ m.submodules.opc_l = opc_l = SRLatch(sync=True, name="opc")
m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
m.submodules.alu_l = alu_l = SRLatch(sync=False, name="alu")
m.submodules.adr_l = adr_l = SRLatch(sync=False, name="adr")
m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
m.submodules.wri_l = wri_l = SRLatch(sync=False, name="wri")
m.submodules.upd_l = upd_l = SRLatch(sync=False, name="upd")
+ m.submodules.cr0_l = cr0_l = SRLatch(sync=False, name="cr0")
m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
m.submodules.lsd_l = lsd_l = SRLatch(sync=False, name="lsd") # done
# opcode decode
op_is_ld = Signal(reset_less=True)
op_is_st = Signal(reset_less=True)
+ op_is_dcbz = Signal(reset_less=True)
+ op_is_st_or_dcbz = Signal(reset_less=True)
+ op_is_atomic = Signal(reset_less=True)
# ALU/LD data output control
alu_valid = Signal(reset_less=True) # ALU operands are valid
rda_any = Signal(reset_less=True) # any read for address ops
rd_done = Signal(reset_less=True) # all *necessary* operands read
wr_reset = Signal(reset_less=True) # final reset condition
+ canceln = Signal(reset_less=True) # cancel (active low)
+ store_done = Signal(reset_less=True) # store has been actioned
# LD and ALU out
alu_o = Signal(self.data_wid, reset_less=True)
reset_o = Signal(reset_less=True) # reset opcode
reset_w = Signal(reset_less=True) # reset write
reset_u = Signal(reset_less=True) # reset update
+ reset_c = Signal(reset_less=True) # reset cr0
reset_a = Signal(reset_less=True) # reset adr latch
reset_i = Signal(reset_less=True) # issue|die (use a lot)
reset_r = Signal(self.n_src, reset_less=True) # reset src
reset_s = Signal(reset_less=True) # reset store
- comb += reset_i.eq(issue_i | self.go_die_i) # various
- comb += reset_o.eq(self.done_o | self.go_die_i) # opcode reset
- comb += reset_w.eq(self.wr.go_i[0] | self.go_die_i) # write reg 1
- comb += reset_u.eq(self.wr.go_i[1] | self.go_die_i) # update (reg 2)
- comb += reset_s.eq(self.go_st_i | self.go_die_i) # store reset
- comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
- comb += reset_a.eq(self.go_ad_i | self.go_die_i)
+ # end execution when a terminating condition is detected:
+ # - go_die_i: a speculative operation was cancelled
+ # - exc_o.happened: an exception has occurred
+ terminate = Signal()
+ comb += terminate.eq(self.go_die_i | self.exc_o.happened)
+
+ comb += reset_i.eq(issue_i | terminate) # various
+ comb += reset_o.eq(self.done_o | terminate) # opcode reset
+ comb += reset_w.eq(self.wr.go_i[0] | terminate) # write reg 1
+ comb += reset_u.eq(self.wr.go_i[1] | terminate) # update (reg 2)
+ comb += reset_c.eq(self.wr.go_i[2] | terminate) # cr0 (reg 3)
+ comb += reset_s.eq(self.go_st_i | terminate) # store reset
+ comb += reset_r.eq(self.rd.go_i | Repl(terminate, self.n_src))
+ comb += reset_a.eq(self.go_ad_i | terminate)
p_st_go = Signal(reset_less=True)
sync += p_st_go.eq(self.st.go_i)
# decode bits of operand (latched)
oper_r = CompLDSTOpSubset(name="oper_r") # Dest register
- comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE) # ST
- comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD) # LD
+ comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE) # ST
+ comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD) # LD
+ comb += op_is_dcbz.eq(oper_r.insn_type == MicrOp.OP_DCBZ) # DCBZ
+ comb += op_is_atomic.eq(oper_r.reserve) # atomic LR/SC
+ comb += op_is_st_or_dcbz.eq(op_is_st | op_is_dcbz)
+ # dcbz is special case of store
+ #uncomment if needed
+ #comb += Display("compldst_multi: op_is_dcbz = %i",
+ # (oper_r.insn_type == MicrOp.OP_DCBZ))
op_is_update = oper_r.ldst_mode == LDSTMode.update # UPDATE
op_is_cix = oper_r.ldst_mode == LDSTMode.cix # cache-inhibit
comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
# - alu_l : looks after add of src1/2/imm (EA)
# - adr_l : waits for add (EA)
# - upd_l : waits for adr and Regfile (port 2)
+ # - cr0_l : waits for Rc=1 and CR0 Regfile (port 3)
# - src_l[2] : ST
# - lod_l : waits for adr (EA) and for LD Data
# - wri_l : waits for LD Data and Regfile (port 1)
# opcode latch - inverted so that busy resets to 0
# note this MUST be sync so as to avoid a combinatorial loop
# between busy_o and issue_i on the reset latch (rst_l)
- sync += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book!
- sync += opc_l.r.eq(reset_o) # XXX NOTE: INVERTED FROM book!
+ comb += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book!
+ comb += opc_l.r.eq(reset_o) # XXX NOTE: INVERTED FROM book!
# src operand latch
- sync += src_l.s.eq(Repl(issue_i, self.n_src))
+ sync += src_l.s.eq(Repl(issue_i, self.n_src) & ~self.rdmaskn)
sync += src_l.r.eq(reset_r)
+ #### sync += Display("reset_r = %i",reset_r)
# alu latch. use sync-delay between alu_ok and valid to generate pulse
comb += alu_l.s.eq(reset_i)
#self.done_o | (self.pi.busy_o & op_is_update),
self.n_dst))
+ # CR0 operand latch (CR0 written to reg 3 if Rc=1)
+ op_is_rc1 = self.oper_i.rc.rc & self.oper_i.rc.ok
+ comb += cr0_l.s.eq(issue_i & op_is_rc1)
+ sync += cr0_l.r.eq(reset_c)
+
# update-mode operand latch (EA written to reg 2)
sync += upd_l.s.eq(reset_i)
sync += upd_l.r.eq(reset_u)
# store latch
- comb += sto_l.s.eq(addr_ok & op_is_st)
+ comb += sto_l.s.eq(addr_ok & op_is_st_or_dcbz)
sync += sto_l.r.eq(reset_s | p_st_go)
# ld/st done. needed to stop LD/ST from activating repeatedly
# create a latch/register for the operand
with m.If(self.issue_i):
sync += oper_r.eq(self.oper_i)
- with m.If(self.done_o):
+ with m.If(self.done_o | terminate):
sync += oper_r.eq(0)
- # and for LD
+ # and for LD and store-done
ldd_r = Signal(self.data_wid, reset_less=True) # Dest register
latchregister(m, ldd_o, ldd_r, ld_ok, name="ldo_r")
+ # store actioned, communicate through CR0 (for atomic LR/SC)
+ latchregister(m, self.pi.store_done.data, store_done,
+ self.pi.store_done.ok,
+ name="std_r")
+
# and for each input from the incoming src operands
srl = []
for i in range(self.n_src):
# now do the ALU addr add: one cycle, and say "ready" (next cycle, too)
comb += alu_o.eq(src1_or_z + src2_or_imm) # actual EA
- m.d.sync += alu_ok.eq(alu_valid) # keep ack in sync with EA
+ m.d.sync += alu_ok.eq(alu_valid & canceln) # keep ack in sync with EA
############################
# Control Signal calculation
# 1st operand read-request only when zero not active
# 2nd operand only needed when immediate is not active
- slg = Cat(op_is_z, op_is_imm)
+ slg = Cat(op_is_z, op_is_imm) #is this correct ?
bro = Repl(self.busy_o, self.n_src)
- comb += self.rd.rel_o.eq(src_l.q & bro & ~slg & ~self.rdmaskn)
+ comb += self.rd.rel_o.eq(src_l.q & bro & ~slg)
# note when the address-related read "go" signals are active
comb += rda_any.eq(self.rd.go_i[0] | self.rd.go_i[1])
# alu input valid when 1st and 2nd ops done (or imm not active)
- comb += alu_valid.eq(busy_o & ~(self.rd.rel_o[0] | self.rd.rel_o[1]))
+ comb += alu_valid.eq(busy_o & ~(self.rd.rel_o[0] | self.rd.rel_o[1]) &
+ canceln)
# 3rd operand only needed when operation is a store
comb += self.rd.rel_o[2].eq(src_l.q[2] & busy_o & op_is_st)
comb += self.adr_rel_o.eq(alu_valid & adr_l.q & busy_o)
# the write/store (etc) all must be cancelled if an exception occurs
- cancel = Signal(reset_less=True)
- comb += cancel.eq(self.exc_o.happened | self.shadown_i)
+ # note: cancel is active low, like shadown_i,
+ # while exc_o.happpened is active high
+ comb += canceln.eq(~self.exc_o.happened & self.shadown_i)
# store release when st ready *and* all operands read (and no shadow)
- comb += self.st.rel_o.eq(sto_l.q & busy_o & rd_done & op_is_st &
- cancel)
+ # dcbz is special case of store -- TODO verify shadows
+ comb += self.st.rel_o.eq(sto_l.q & busy_o & rd_done & op_is_st_or_dcbz &
+ canceln)
# request write of LD result. waits until shadow is dropped.
comb += self.wr.rel_o[0].eq(rd_done & wri_l.q & busy_o & lod_l.qn &
- op_is_ld & cancel)
+ op_is_ld & canceln)
# request write of EA result only in update mode
comb += self.wr.rel_o[1].eq(upd_l.q & busy_o & op_is_update &
- alu_valid & cancel)
+ alu_valid & canceln)
+
+ # request write of CR0 result only in reserve and Rc=1
+ comb += self.wr.rel_o[2].eq(cr0_l.q & busy_o & op_is_atomic &
+ alu_valid & canceln)
# provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
comb += wr_any.eq(self.st.go_i | p_st_go |
- self.wr.go_i[0] | self.wr.go_i[1])
- comb += wr_reset.eq(rst_l.q & busy_o & cancel &
- ~(self.st.rel_o | self.wr.rel_o[0] |
- self.wr.rel_o[1]) &
- (lod_l.qn | op_is_st)
+ self.wr.go_i.bool())
+ comb += wr_reset.eq(rst_l.q & busy_o & canceln &
+ ~(self.st.rel_o | self.wr.rel_o.bool()) &
+ (lod_l.qn | op_is_st_or_dcbz)
)
comb += self.done_o.eq(wr_reset & (~self.pi.busy_o | op_is_ld))
# Data/Address outputs
# put the LD-output register directly onto the output bus on a go_write
- comb += self.data_o.data.eq(self.dest[0])
+ comb += self.o_data.data.eq(self.dest[0])
+ comb += self.o_data.ok.eq(self.wr.rel_o[0])
with m.If(self.wr.go_i[0]):
comb += self.dest[0].eq(ldd_r)
# "update" mode, put address out on 2nd go-write
comb += self.addr_o.data.eq(self.dest[1])
+ comb += self.addr_o.ok.eq(self.wr.rel_o[1])
with m.If(op_is_update & self.wr.go_i[1]):
comb += self.dest[1].eq(addr_r)
+ # fun-fun-fun, calculate CR0 when Rc=1 requested.
+ cr0 = self.dest[2]
+ comb += self.cr_o.data.eq(cr0)
+ comb += self.cr_o.ok.eq(self.wr.rel_o[2])
+ with m.If(cr0_l.q):
+ comb += cr0.eq(Cat(C(0, 1), store_done, C(0, 2)))
+
# need to look like MultiCompUnit: put wrmask out.
# XXX may need to make this enable only when write active
- comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update))
+ comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update, cr0_l.q))
###########################
# PortInterface connections
# connect to LD/ST PortInterface.
comb += pi.is_ld_i.eq(op_is_ld & busy_o) # decoded-LD
- comb += pi.is_st_i.eq(op_is_st & busy_o) # decoded-ST
+ comb += pi.is_nc.eq(op_is_cix & busy_o) # cache-inhibited
+ comb += pi.is_st_i.eq(op_is_st_or_dcbz & busy_o) # decoded-ST
+ comb += pi.is_dcbz_i.eq(op_is_dcbz & busy_o) # decoded-DCBZ
+ comb += pi.reserve.eq(oper_r.reserve & busy_o) # atomic LR/SC
comb += pi.data_len.eq(oper_r.data_len) # data_len
# address: use sync to avoid long latency
sync += pi.addr.data.eq(addr_r) # EA from adder
+ with m.If(op_is_dcbz):
+ sync += Display("LDSTCompUnit.DCBZ: EA from adder %x", addr_r)
+
sync += pi.addr.ok.eq(alu_ok & lsd_l.q) # "do address stuff" (once)
comb += self.exc_o.eq(pi.exc_o) # exception occurred
comb += addr_ok.eq(self.pi.addr_ok_o) # no exc, address fine
- # connect MSR.PR for priv/virt operation
- comb += pi.msr_pr.eq(oper_r.msr[MSR.PR])
+ # connect MSR.PR etc. for priv/virt operation
+ comb += pi.priv_mode.eq(~oper_r.msr[MSR.PR])
+ comb += pi.virt_mode.eq(oper_r.msr[MSR.DR])
+ comb += pi.mode_32bit.eq(~oper_r.msr[MSR.SF])
+ with m.If(self.issue_i): # display this only once
+ sync += Display("LDSTCompUnit: oper_r.msr %x pr=%x dr=%x sf=%x",
+ oper_r.msr,
+ oper_r.msr[MSR.PR],
+ oper_r.msr[MSR.DR],
+ oper_r.msr[MSR.SF])
# byte-reverse on LD
revnorev = Signal(64, reset_less=True)
comb += pi.st.data.eq(stdata_r)
with m.Else():
comb += pi.st.data.eq(op3)
+
# store - data goes in based on go_st
comb += pi.st.ok.eq(self.st.go_i) # go store signals st data valid
to LDSTOutputData o and o1 respectively.
"""
if i == 0:
- return self.data_o # LDSTOutputData.regspec o
+ return self.o_data # LDSTOutputData.regspec o
if i == 1:
return self.addr_o # LDSTOutputData.regspec o1
+ if i == 2:
+ return self.cr_o # LDSTOutputData.regspec cr_a
# return self.dest[i]
def get_fu_out(self, i):
yield self.adr_rel_o
yield self.sto_rel_o
yield self.wr.rel_o
- yield from self.data_o.ports()
+ yield from self.o_data.ports()
yield from self.addr_o.ports()
+ yield from self.cr_o.ports()
yield self.load_mem_o
yield self.stwd_mem_o
yield dut.src1_i.eq(src1)
yield dut.src2_i.eq(src2)
yield dut.src3_i.eq(src3)
- yield dut.oper_i.imm_data.imm.eq(imm)
+ yield dut.oper_i.imm_data.data.eq(imm)
yield dut.oper_i.imm_data.ok.eq(imm_ok)
- yield dut.oper_i.update.eq(update)
+ #guess: this one was removed -- yield dut.oper_i.update.eq(update)
yield dut.issue_i.eq(1)
yield
yield dut.issue_i.eq(0)
if rel == active_rel:
break
yield
- yield dut.rd.go.eq(active_rel)
+ yield dut.rd.go_i.eq(active_rel)
yield
- yield dut.rd.go.eq(0)
+ yield dut.rd.go_i.eq(0)
yield from wait_for(dut.adr_rel_o, False, test1st=True)
# yield from wait_for(dut.adr_rel_o)
yield dut.src1_i.eq(src1)
yield dut.src2_i.eq(src2)
yield dut.oper_i.zero_a.eq(zero_a)
- yield dut.oper_i.imm_data.imm.eq(imm)
+ yield dut.oper_i.imm_data.data.eq(imm)
yield dut.oper_i.imm_data.ok.eq(imm_ok)
yield dut.issue_i.eq(1)
yield
# wait for the operands (RA, RB, or both)
if rd:
- yield dut.rd.go.eq(rd)
+ yield dut.rd.go_i.eq(rd)
yield from wait_for(dut.rd.rel_o)
- yield dut.rd.go.eq(0)
+ yield dut.rd.go_i.eq(0)
yield from wait_for(dut.adr_rel_o, False, test1st=True)
# yield dut.ad.go.eq(1)
if update:
yield from wait_for(dut.wr.rel_o[1])
- yield dut.wr.go.eq(0b10)
+ yield dut.wr.go_i.eq(0b10)
yield
addr = yield dut.addr_o
print("addr", addr)
- yield dut.wr.go.eq(0)
+ yield dut.wr.go_i.eq(0)
else:
addr = None
yield from wait_for(dut.wr.rel_o[0], test1st=True)
- yield dut.wr.go.eq(1)
+ yield dut.wr.go_i.eq(1)
yield
- data = yield dut.data_o
- print(data)
- yield dut.wr.go.eq(0)
+ data = yield dut.o_data.o
+ data_ok = yield dut.o_data.o_ok
+ yield dut.wr.go_i.eq(0)
yield from wait_for(dut.busy_o)
yield
# wait_for(dut.stwd_mem_o)
- return data, addr
+ return data, data_ok, addr
def ldst_sim(dut):
class TestLDSTCompUnit(LDSTCompUnit):
- def __init__(self, rwid):
+ def __init__(self, rwid, pspec):
from soc.experiment.l0_cache import TstL0CacheBuffer
- self.l0 = l0 = TstL0CacheBuffer()
- pi = l0.l0.dports[0].pi
+ self.l0 = l0 = TstL0CacheBuffer(pspec)
+ pi = l0.l0.dports[0]
LDSTCompUnit.__init__(self, pi, rwid, 4)
def elaborate(self, platform):
m = LDSTCompUnit.elaborate(self, platform)
m.submodules.l0 = self.l0
- m.d.comb += self.ad.go.eq(self.ad.rel) # link addr-go direct to rel
+ # link addr-go direct to rel
+ m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
return m
def test_scoreboard():
- dut = TestLDSTCompUnit(16)
+ units = {}
+ pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=64,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+
+ dut = TestLDSTCompUnit(16,pspec)
vl = rtlil.convert(dut, ports=dut.ports())
with open("test_ldst_comp.il", "w") as f:
f.write(vl)
class TestLDSTCompUnitRegSpec(LDSTCompUnit):
- def __init__(self):
+ def __init__(self, pspec):
from soc.experiment.l0_cache import TstL0CacheBuffer
from soc.fu.ldst.pipe_data import LDSTPipeSpec
regspec = LDSTPipeSpec.regspec
- self.l0 = l0 = TstL0CacheBuffer()
- pi = l0.l0.dports[0].pi
+ self.l0 = l0 = TstL0CacheBuffer(pspec)
+ pi = l0.l0.dports[0]
LDSTCompUnit.__init__(self, pi, regspec, 4)
def elaborate(self, platform):
m = LDSTCompUnit.elaborate(self, platform)
m.submodules.l0 = self.l0
- m.d.comb += self.ad.go.eq(self.ad.rel) # link addr-go direct to rel
+ # link addr-go direct to rel
+ m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
return m
def test_scoreboard_regspec():
- dut = TestLDSTCompUnitRegSpec()
+ units = {}
+ pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=64,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+
+ dut = TestLDSTCompUnitRegSpec(pspec)
vl = rtlil.convert(dut, ports=dut.ports())
with open("test_ldst_comp.il", "w") as f:
f.write(vl)
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
+from nmigen import Module, Const, Signal, Cat, Elaboratable
from regfile.regfile import RegFileArray, treereduce
from scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit
int_src2_pend_v.append(fu.src2_pend_o)
int_rd_pend_v.append(fu.int_rd_pend_o)
int_wr_pend_v.append(fu.int_wr_pend_o)
- int_fus = Array(if_l)
+ int_fus = if_l
# Count of number of FUs
n_int_fus = len(if_l)
# merge (OR) all integer FU / ALU outputs to a single value
# bit of a hack: treereduce needs a list with an item named "dest_o"
dest_o = treereduce(int_alus)
- m.d.sync += int_dest.data_i.eq(dest_o)
+ m.d.sync += int_dest.i_data.eq(dest_o)
# connect ALUs
for i, alu in enumerate(int_alus):
m.d.comb += alu.go_wr_i.eq(intpick1.go_wr_o[i])
m.d.comb += alu.issue_i.eq(fn_issue_l[i])
# m.d.comb += fn_busy_l[i].eq(alu.busy_o) # XXX ignore, use fnissue
- m.d.comb += alu.src1_i.eq(int_src1.data_o)
- m.d.comb += alu.src2_i.eq(int_src2.data_o)
+ m.d.comb += alu.src1_i.eq(int_src1.o_data)
+ m.d.comb += alu.src2_i.eq(int_src2.o_data)
m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o) # pipe out ready
return m
src2 = self.regs[src2]
if op == IADD:
val = (src1 + src2) & ((1 << (self.rwidth))-1)
+ print ("RegSim op: ADD", hex(src1), hex(src2), hex(val))
elif op == ISUB:
val = (src1 - src2) & ((1 << (self.rwidth))-1)
+ print ("RegSim op: SUB", hex(src1), hex(src2), hex(val))
+ else:
+ print ("RegSim op: UNSUPPORTED", op)
self.regs[dest] = val
def setval(self, dest, val):
+#!/usr/bin/env python3
+#
+# Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2020 Cole Poirier
+# Copyright (C) 2020,2021 Cesar Strauss
+# Copyright (C) 2021 Tobias Platen
+#
+# Original dcache.vhdl Copyright of its authors and licensed
+# by IBM under CC-BY 4.0
+# https://github.com/antonblanchard/microwatt
+#
+# Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
+# 871528 and 957073, under the LGPL-v3+ License
+
"""DCache
based on Anton Blanchard microwatt dcache.vhdl
* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
* https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+ (discussion about brams for ECP5)
"""
from enum import Enum, unique
-from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
+ Record, Memory)
from nmutil.util import Display
+from nmigen.lib.coding import Decoder
from copy import deepcopy
from random import randint, seed
+from nmigen_soc.wishbone.bus import Interface
+
from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
WBIOMasterOut, WBIOSlaveOut)
from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
# for test
from soc.bus.sram import SRAM
from nmutil.util import wrap
-
-# TODO: make these parameters of DCache at some point
-LINE_SIZE = 64 # Line size in bytes
-NUM_LINES = 16 # Number of lines in a set
-NUM_WAYS = 4 # Number of ways
-TLB_SET_SIZE = 64 # L1 DTLB entries per set
-TLB_NUM_WAYS = 2 # L1 DTLB number of sets
-TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
LOG_LENGTH = 0 # Non-zero to enable log data collection
-# BRAM organisation: We never access more than
-# -- WB_DATA_BITS at a time so to save
-# -- resources we make the array only that wide, and
-# -- use consecutive indices for to make a cache "line"
-# --
-# -- ROW_SIZE is the width in bytes of the BRAM
-# -- (based on WB, so 64-bits)
-ROW_SIZE = WB_DATA_BITS // 8;
-
-# ROW_PER_LINE is the number of row (wishbone
-# transactions) in a line
-ROW_PER_LINE = LINE_SIZE // ROW_SIZE
-
-# BRAM_ROWS is the number of rows in BRAM needed
-# to represent the full dcache
-BRAM_ROWS = NUM_LINES * ROW_PER_LINE
-
-print ("ROW_SIZE", ROW_SIZE)
-print ("ROW_PER_LINE", ROW_PER_LINE)
-print ("BRAM_ROWS", BRAM_ROWS)
-print ("NUM_WAYS", NUM_WAYS)
-
-# Bit fields counts in the address
-
-# REAL_ADDR_BITS is the number of real address
-# bits that we store
-REAL_ADDR_BITS = 56
-
-# ROW_BITS is the number of bits to select a row
-ROW_BITS = log2_int(BRAM_ROWS)
-
-# ROW_LINE_BITS is the number of bits to select
-# a row within a line
-ROW_LINE_BITS = log2_int(ROW_PER_LINE)
-
-# LINE_OFF_BITS is the number of bits for
-# the offset in a cache line
-LINE_OFF_BITS = log2_int(LINE_SIZE)
-
-# ROW_OFF_BITS is the number of bits for
-# the offset in a row
-ROW_OFF_BITS = log2_int(ROW_SIZE)
-
-# INDEX_BITS is the number if bits to
-# select a cache line
-INDEX_BITS = log2_int(NUM_LINES)
-
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
-
-# TAG_BITS is the number of bits of
-# the tag part of the address
-TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
-
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS = log2_int(NUM_WAYS)
-
-# Example of layout for 32 lines of 64 bytes:
-layout = """\
- .. tag |index| line |
- .. | row | |
- .. | |---| | ROW_LINE_BITS (3)
- .. | |--- - --| LINE_OFF_BITS (6)
- .. | |- --| ROW_OFF_BITS (3)
- .. |----- ---| | ROW_BITS (8)
- .. |-----| | INDEX_BITS (5)
- .. --------| | TAG_BITS (45)
-"""
-print (layout)
-print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
- (TAG_BITS, INDEX_BITS, ROW_BITS,
- ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
-print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
-print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
-print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
-
-TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
-
-print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
-
-def CacheTagArray():
- return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
- for x in range(NUM_LINES))
-
-def CacheValidBitsArray():
- return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
- for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
- return Array(Signal(name="rows_valid%d" % x) \
- for x in range(ROW_PER_LINE))
-
-# L1 TLB
-TLB_SET_BITS = log2_int(TLB_SET_SIZE)
-TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
-TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
-TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
-TLB_PTE_BITS = 64
-TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
-
def ispow2(x):
return (1<<log2_int(x, False)) == x
-assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
-assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
- "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
- "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
- "geometry bits don't add up"
-assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
-assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
-
-
-def TLBValidBitsArray():
- return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
- for x in range(TLB_SET_SIZE))
-
-def TLBTagEAArray():
- return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
- for x in range (TLB_NUM_WAYS))
-
-def TLBTagsArray():
- return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
- for x in range (TLB_SET_SIZE))
-
-def TLBPtesArray():
- return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
- for x in range(TLB_SET_SIZE))
-
-def HitWaySet():
- return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
- for x in range(TLB_NUM_WAYS))
-
-# Cache RAM interface
-def CacheRamOut():
- return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
- for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
- return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
- for x in range(NUM_LINES))
-
-# TLB PLRU output interface
-def TLBPLRUOut():
- return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
- for x in range(TLB_SET_SIZE))
-
-# Helper functions to decode incoming requests
-#
-# Return the cache line index (tag index) for an address
-def get_index(addr):
- return addr[LINE_OFF_BITS:SET_SIZE_BITS]
-# Return the cache row index (data memory) for an address
-def get_row(addr):
- return addr[ROW_OFF_BITS:SET_SIZE_BITS]
+class DCacheConfig:
+ def __init__(self, LINE_SIZE = 64, # Line size in bytes
+ NUM_LINES = 64, # Number of lines in a set
+ NUM_WAYS = 2, # Number of ways
+ TLB_SET_SIZE = 64, # L1 DTLB entries per set
+ TLB_NUM_WAYS = 2, # L1 DTLB number of sets
+ TLB_LG_PGSZ = 12): # L1 DTLB log_2(page_size)
+ self.LINE_SIZE = LINE_SIZE
+ self.NUM_LINES = NUM_LINES
+ self.NUM_WAYS = NUM_WAYS
+ self.TLB_SET_SIZE = TLB_SET_SIZE
+ self.TLB_NUM_WAYS = TLB_NUM_WAYS
+ self.TLB_LG_PGSZ = TLB_LG_PGSZ
+
+ # BRAM organisation: We never access more than
+ # -- WB_DATA_BITS at a time so to save
+ # -- resources we make the array only that wide, and
+ # -- use consecutive indices to make a cache "line"
+ # --
+ # -- ROW_SIZE is the width in bytes of the BRAM
+ # -- (based on WB, so 64-bits)
+ self.ROW_SIZE = WB_DATA_BITS // 8;
+
+ # ROW_PER_LINE is the number of row (wishbone
+ # transactions) in a line
+ self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
+
+ # BRAM_ROWS is the number of rows in BRAM needed
+ # to represent the full dcache
+ self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
+
+ print ("ROW_SIZE", self.ROW_SIZE)
+ print ("ROW_PER_LINE", self.ROW_PER_LINE)
+ print ("BRAM_ROWS", self.BRAM_ROWS)
+ print ("NUM_WAYS", self.NUM_WAYS)
+
+ # Bit fields counts in the address
+
+ # REAL_ADDR_BITS is the number of real address
+ # bits that we store
+ self.REAL_ADDR_BITS = 56
+
+ # ROW_BITS is the number of bits to select a row
+ self.ROW_BITS = log2_int(self.BRAM_ROWS)
+
+ # ROW_LINE_BITS is the number of bits to select
+ # a row within a line
+ self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
+
+ # LINE_OFF_BITS is the number of bits for
+ # the offset in a cache line
+ self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
+
+ # ROW_OFF_BITS is the number of bits for
+ # the offset in a row
+ self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
+
+ # INDEX_BITS is the number if bits to
+ # select a cache line
+ self.INDEX_BITS = log2_int(self.NUM_LINES)
+
+ # SET_SIZE_BITS is the log base 2 of the set size
+ self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
+
+ # TAG_BITS is the number of bits of
+ # the tag part of the address
+ self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+
+ # TAG_WIDTH is the width in bits of each way of the tag RAM
+ self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+ # WAY_BITS is the number of bits to select a way
+ self.WAY_BITS = log2_int(self.NUM_WAYS)
+
+ # Example of layout for 32 lines of 64 bytes:
+ layout = f"""\
+ DCache Layout:
+ |.. -----------------------| REAL_ADDR_BITS ({self.REAL_ADDR_BITS})
+ .. |--------------| SET_SIZE_BITS ({self.SET_SIZE_BITS})
+ .. tag |index| line |
+ .. | row | |
+ .. | |---| | ROW_LINE_BITS ({self.ROW_LINE_BITS})
+ .. | |--- - --| LINE_OFF_BITS ({self.LINE_OFF_BITS})
+ .. | |- --| ROW_OFF_BITS ({self.ROW_OFF_BITS})
+ .. |----- ---| | ROW_BITS ({self.ROW_BITS})
+ .. |-----| | INDEX_BITS ({self.INDEX_BITS})
+ .. --------| | TAG_BITS ({self.TAG_BITS})
+ """
+ print (layout)
+ print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
+ (self.TAG_BITS, self.INDEX_BITS, self.ROW_BITS,
+ self.ROW_OFF_BITS, self.LINE_OFF_BITS, self.ROW_LINE_BITS))
+ print ("index @: %d-%d" % (self.LINE_OFF_BITS, self.SET_SIZE_BITS))
+ print ("row @: %d-%d" % (self.LINE_OFF_BITS, self.ROW_OFF_BITS))
+ print ("tag @: %d-%d width %d" % (self.SET_SIZE_BITS,
+ self.REAL_ADDR_BITS, self.TAG_WIDTH))
+
+ self.TAG_RAM_WIDTH = self.TAG_WIDTH * self.NUM_WAYS
+
+ print ("TAG_RAM_WIDTH", self.TAG_RAM_WIDTH)
+ print (" TAG_WIDTH", self.TAG_WIDTH)
+ print (" NUM_WAYS", self.NUM_WAYS)
+ print (" NUM_LINES", self.NUM_LINES)
+
+ # L1 TLB
+ self.TLB_SET_BITS = log2_int(self.TLB_SET_SIZE)
+ self.TLB_WAY_BITS = log2_int(self.TLB_NUM_WAYS)
+ self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TLB_SET_BITS)
+ self.TLB_TAG_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_EA_TAG_BITS
+ self.TLB_PTE_BITS = 64
+ self.TLB_PTE_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_PTE_BITS;
+
+ assert (self.LINE_SIZE % self.ROW_SIZE) == 0, \
+ "LINE_SIZE not multiple of ROW_SIZE"
+ assert ispow2(self.LINE_SIZE), "LINE_SIZE not power of 2"
+ assert ispow2(self.NUM_LINES), "NUM_LINES not power of 2"
+ assert ispow2(self.ROW_PER_LINE), "ROW_PER_LINE not power of 2"
+ assert self.ROW_BITS == \
+ (self.INDEX_BITS + self.ROW_LINE_BITS), \
+ "geometry bits don't add up"
+ assert (self.LINE_OFF_BITS == \
+ self.ROW_OFF_BITS + self.ROW_LINE_BITS), \
+ "geometry bits don't add up"
+ assert self.REAL_ADDR_BITS == \
+ (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS), \
+ "geometry bits don't add up"
+ assert self.REAL_ADDR_BITS == \
+ (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS), \
+ "geometry bits don't add up"
+ assert 64 == WB_DATA_BITS, \
+ "Can't yet handle wb width that isn't 64-bits"
+ assert self.SET_SIZE_BITS <= self.TLB_LG_PGSZ, \
+ "Set indexed by virtual address"
+
+ def CacheTagArray(self):
+ return Array(Signal(self.TAG_RAM_WIDTH, name="tag%d" % x) \
+ for x in range(self.NUM_LINES))
+
+ def CacheValidsArray(self):
+ return Array(Signal(self.NUM_WAYS, name="tag_valids%d" % x)
+ for x in range(self.NUM_LINES))
+
+ def RowPerLineValidArray(self):
+ return Array(Signal(name="rows_valid%d" % x) \
+ for x in range(self.ROW_PER_LINE))
+
+ def TLBHit(self, name):
+ return Record([('valid', 1),
+ ('way', self.TLB_WAY_BITS)], name=name)
+
+ def TLBTagEAArray(self):
+ return Array(Signal(self.TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
+ for x in range (self.TLB_NUM_WAYS))
+
+ def TLBRecord(self, name):
+ tlb_layout = [('valid', self.TLB_NUM_WAYS),
+ ('tag', self.TLB_TAG_WAY_BITS),
+ ('pte', self.TLB_PTE_WAY_BITS)
+ ]
+ return Record(tlb_layout, name=name)
+
+ def TLBValidArray(self):
+ return Array(Signal(self.TLB_NUM_WAYS, name="tlb_valid%d" % x)
+ for x in range(self.TLB_SET_SIZE))
+
+ def HitWaySet(self):
+ return Array(Signal(self.WAY_BITS, name="hitway_%d" % x) \
+ for x in range(self.TLB_NUM_WAYS))
+
+ # Cache RAM interface
+ def CacheRamOut(self):
+ return Array(Signal(self.WB_DATA_BITS, name="cache_out%d" % x) \
+ for x in range(self.NUM_WAYS))
+
+ # PLRU output interface
+ def PLRUOut(self):
+ return Array(Signal(self.WAY_BITS, name="plru_out%d" % x) \
+ for x in range(self.NUM_LINES))
+
+ # TLB PLRU output interface
+ def TLBPLRUOut(self):
+ return Array(Signal(self.TLB_WAY_BITS, name="tlbplru_out%d" % x) \
+ for x in range(self.TLB_SET_SIZE))
+
+ # Helper functions to decode incoming requests
+ #
+ # Return the cache line index (tag index) for an address
+ def get_index(self, addr):
+ return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+ # Return the cache row index (data memory) for an address
+ def get_row(self, addr):
+ return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
-# Return the index of a row within a line
-def get_row_of_line(row):
- return row[:ROW_BITS][:ROW_LINE_BITS]
+ # Return the index of a row within a line
+ def get_row_of_line(self, row):
+ return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
- return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
+ # Returns whether this is the last row of a line
+ def is_last_row_addr(self, addr, last):
+ return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
- return get_row_of_line(row) == last
+ # Returns whether this is the last row of a line
+ def is_last_row(self, row, last):
+ return self.get_row_of_line(row) == last
-# Return the next row in the current cache line. We use a
-# dedicated function in order to limit the size of the
-# generated adder to be only the bits within a cache line
-# (3 bits with default settings)
-def next_row(row):
- row_v = row[0:ROW_LINE_BITS] + 1
- return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
+ # Return the next row in the current cache line. We use a
+ # dedicated function in order to limit the size of the
+ # generated adder to be only the bits within a cache line
+ # (3 bits with default settings)
+ def next_row(self, row):
+ row_v = row[0:self.ROW_LINE_BITS] + 1
+ return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
-# Get the tag value from the address
-def get_tag(addr):
- return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
+ # Get the tag value from the address
+ def get_tag(self, addr):
+ return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
- return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
+ # Read a tag from a tag memory row
+ def read_tag(self, way, tagset):
+ return tagset.word_select(way, self.TAG_WIDTH)[:self.TAG_BITS]
-# Read a TLB tag from a TLB tag memory row
-def read_tlb_tag(way, tags):
- return tags.word_select(way, TLB_EA_TAG_BITS)
+ # Read a TLB tag from a TLB tag memory row
+ def read_tlb_tag(self, way, tags):
+ return tags.word_select(way, self.TLB_EA_TAG_BITS)
-# Write a TLB tag to a TLB tag memory row
-def write_tlb_tag(way, tags, tag):
- return read_tlb_tag(way, tags).eq(tag)
+ # Write a TLB tag to a TLB tag memory row
+ def write_tlb_tag(self, way, tags, tag):
+ return self.read_tlb_tag(way, tags).eq(tag)
-# Read a PTE from a TLB PTE memory row
-def read_tlb_pte(way, ptes):
- return ptes.word_select(way, TLB_PTE_BITS)
+ # Read a PTE from a TLB PTE memory row
+ def read_tlb_pte(self, way, ptes):
+ return ptes.word_select(way, self.TLB_PTE_BITS)
-def write_tlb_pte(way, ptes, newpte):
- return read_tlb_pte(way, ptes).eq(newpte)
+ def write_tlb_pte(self, way, ptes, newpte):
+ return self.read_tlb_pte(way, ptes).eq(newpte)
# Record for storing permission, attribute, etc. bits from a PTE
class MemAccessRequest(RecordObject):
- def __init__(self, name=None):
+ def __init__(self, cfg, name=None):
super().__init__(name=name)
self.op = Signal(Op)
self.valid = Signal()
self.dcbz = Signal()
- self.real_addr = Signal(REAL_ADDR_BITS)
+ self.real_addr = Signal(cfg.REAL_ADDR_BITS)
self.data = Signal(64)
self.byte_sel = Signal(8)
- self.hit_way = Signal(WAY_BITS)
+ self.hit_way = Signal(cfg.WAY_BITS)
self.same_tag = Signal()
self.mmu_req = Signal()
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
- def __init__(self, name=None):
+ def __init__(self, cfg, name=None):
super().__init__(name=name)
# Info about the request
self.full = Signal() # have uncompleted request
self.mmu_req = Signal() # request is from MMU
- self.req = MemAccessRequest(name="reqmem")
+ self.req = MemAccessRequest(cfg, name="reqmem")
# Cache hit state
- self.hit_way = Signal(WAY_BITS)
+ self.hit_way = Signal(cfg.WAY_BITS)
self.hit_load_valid = Signal()
- self.hit_index = Signal(INDEX_BITS)
+ self.hit_index = Signal(cfg.INDEX_BITS)
self.cache_hit = Signal()
# TLB hit state
- self.tlb_hit = Signal()
- self.tlb_hit_way = Signal(TLB_NUM_WAYS)
- self.tlb_hit_index = Signal(TLB_WAY_BITS)
+ self.tlb_hit = cfg.TLBHit("tlb_hit")
+ self.tlb_hit_index = Signal(cfg.TLB_SET_BITS)
# 2-stage data buffer for data forwarded from writes to reads
self.forward_data1 = Signal(64)
self.forward_data2 = Signal(64)
self.forward_sel1 = Signal(8)
self.forward_valid1 = Signal()
- self.forward_way1 = Signal(WAY_BITS)
- self.forward_row1 = Signal(ROW_BITS)
+ self.forward_way1 = Signal(cfg.WAY_BITS)
+ self.forward_row1 = Signal(cfg.ROW_BITS)
self.use_forward1 = Signal()
self.forward_sel = Signal(8)
self.write_tag = Signal()
self.slow_valid = Signal()
self.wb = WBMasterOut("wb")
- self.reload_tag = Signal(TAG_BITS)
- self.store_way = Signal(WAY_BITS)
- self.store_row = Signal(ROW_BITS)
- self.store_index = Signal(INDEX_BITS)
- self.end_row_ix = Signal(ROW_LINE_BITS)
- self.rows_valid = RowPerLineValidArray()
+ self.reload_tag = Signal(cfg.TAG_BITS)
+ self.store_way = Signal(cfg.WAY_BITS)
+ self.store_row = Signal(cfg.ROW_BITS)
+ self.store_index = Signal(cfg.INDEX_BITS)
+ self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
+ self.rows_valid = cfg.RowPerLineValidArray()
self.acks_pending = Signal(3)
self.inc_acks = Signal()
self.dec_acks = Signal()
# Reservation information
class Reservation(RecordObject):
- def __init__(self):
- super().__init__()
+ def __init__(self, cfg, name=None):
+ super().__init__(name=name)
self.valid = Signal()
- self.addr = Signal(64-LINE_OFF_BITS)
+ self.addr = Signal(64-cfg.LINE_OFF_BITS)
class DTLBUpdate(Elaboratable):
- def __init__(self):
+ def __init__(self, cfg):
+ self.cfg = cfg
self.tlbie = Signal()
self.tlbwe = Signal()
self.doall = Signal()
- self.updated = Signal()
- self.v_updated = Signal()
- self.tlb_hit = Signal()
- self.tlb_req_index = Signal(TLB_SET_BITS)
+ self.tlb_hit = cfg.TLBHit("tlb_hit")
+ self.tlb_req_index = Signal(cfg.TLB_SET_BITS)
- self.tlb_hit_way = Signal(TLB_WAY_BITS)
- self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
- self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
- self.repl_way = Signal(TLB_WAY_BITS)
- self.eatag = Signal(TLB_EA_TAG_BITS)
- self.pte_data = Signal(TLB_PTE_BITS)
+ self.repl_way = Signal(cfg.TLB_WAY_BITS)
+ self.eatag = Signal(cfg.TLB_EA_TAG_BITS)
+ self.pte_data = Signal(cfg.TLB_PTE_BITS)
- self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
-
- self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
- self.db_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
- self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+ # read from dtlb array
+ self.tlb_read = Signal()
+ self.tlb_read_index = Signal(cfg.TLB_SET_BITS)
+ self.tlb_way = cfg.TLBRecord("o_tlb_way")
def elaborate(self, platform):
m = Module()
comb = m.d.comb
sync = m.d.sync
-
- tagset = Signal(TLB_TAG_WAY_BITS)
- pteset = Signal(TLB_PTE_WAY_BITS)
-
- tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
- comb += db_out.eq(self.dv)
+ cfg = self.cfg
+
+ # there are 3 parts to this:
+ # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+ # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+ # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs. these cannot
+ # be a Memory because they can all be cleared (tlbie, doall), i mean,
+ # we _could_, in theory, by overriding the Reset Signal of the Memory,
+ # hmmm....
+
+ dtlb_valid = cfg.TLBValidArray()
+ tlb_req_index = self.tlb_req_index
+
+ print ("TLB_TAG_WAY_BITS", cfg.TLB_TAG_WAY_BITS)
+ print (" TLB_EA_TAG_BITS", cfg.TLB_EA_TAG_BITS)
+ print (" TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+ print ("TLB_PTE_WAY_BITS", cfg.TLB_PTE_WAY_BITS)
+ print (" TLB_PTE_BITS", cfg.TLB_PTE_BITS)
+ print (" TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+
+ # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+ tagway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_TAG_WAY_BITS,
+ attrs={'syn_ramstyle': "block_ram"})
+ m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+ m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+ granularity=cfg.TLB_EA_TAG_BITS)
+
+ pteway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_PTE_WAY_BITS,
+ attrs={'syn_ramstyle': "block_ram"})
+ m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+ m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+ granularity=cfg.TLB_PTE_BITS)
+
+ # commented out for now, can be put in if Memory.reset can be
+ # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+ #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+ #m.submodules.rd_valid = rd_valid = validm.read_port()
+ #m.submodules.wr_valid = wr_valid = validm.write_port(
+ #granularity=1)
+
+ # connect up read and write addresses to Valid/PTE/TAG SRAMs
+ m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+ m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+ #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+ m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+ m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+ #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+ updated = Signal()
+ v_updated = Signal()
+ tb_out = Signal(cfg.TLB_TAG_WAY_BITS) # tlb_way_tags_t
+ db_out = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
+ pb_out = Signal(cfg.TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+ dv = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
+
+ comb += dv.eq(dtlb_valid[tlb_req_index])
+ comb += db_out.eq(dv)
with m.If(self.tlbie & self.doall):
- pass # clear all back in parent
+ # clear all valid bits at once
+ # XXX hmmm, validm _could_ use Memory reset here...
+ for i in range(cfg.TLB_SET_SIZE):
+ sync += dtlb_valid[i].eq(0)
with m.Elif(self.tlbie):
- with m.If(self.tlb_hit):
- comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
- comb += self.v_updated.eq(1)
-
+ # invalidate just the hit_way
+ with m.If(self.tlb_hit.valid):
+ comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
+ comb += v_updated.eq(1)
with m.Elif(self.tlbwe):
-
- comb += tagset.eq(self.tlb_tag_way)
- comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
- comb += tb_out.eq(tagset)
-
- comb += pteset.eq(self.tlb_pte_way)
- comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
- comb += pb_out.eq(pteset)
-
+ # write to the requested tag and PTE
+ comb += cfg.write_tlb_tag(self.repl_way, tb_out, self.eatag)
+ comb += cfg.write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+ # set valid bit
comb += db_out.bit_select(self.repl_way, 1).eq(1)
- comb += self.updated.eq(1)
- comb += self.v_updated.eq(1)
+ comb += updated.eq(1)
+ comb += v_updated.eq(1)
+
+ # above, sometimes valid is requested to be updated but data not
+ # therefore split them out, here. note the granularity thing matches
+ # with the shift-up of the eatag/pte_data into the correct TLB way.
+ # thus is it not necessary to write the entire lot, just the portion
+ # being altered: hence writing the *old* copy of the row is not needed
+ with m.If(updated): # PTE and TAG to be written
+ comb += wr_pteway.data.eq(pb_out)
+ comb += wr_pteway.en.eq(1<<self.repl_way)
+ comb += wr_tagway.data.eq(tb_out)
+ comb += wr_tagway.en.eq(1<<self.repl_way)
+ with m.If(v_updated): # Valid to be written
+ sync += dtlb_valid[tlb_req_index].eq(db_out)
+ #comb += wr_valid.data.eq(db_out)
+ #comb += wr_valid.en.eq(1<<self.repl_way)
+
+ # select one TLB way, use a register here
+ r_delay = Signal()
+ sync += r_delay.eq(self.tlb_read)
+ # first deal with the valids, which are not in a Memory.
+ # tlb way valid is output on a 1 clock delay with sync,
+ # but have to explicitly deal with "forwarding" here
+ with m.If(self.tlb_read):
+ with m.If(v_updated): # write *and* read in same cycle: forward
+ sync += self.tlb_way.valid.eq(db_out)
+ with m.Else():
+ sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+ # now deal with the Memory-read case. the output must remain
+ # valid (stable) even when a read-request is not made, but stable
+ # on a one-clock delay, hence the register
+ r_tlb_way = cfg.TLBRecord("r_tlb_way")
+ with m.If(r_delay):
+ # on one clock delay, capture the contents of the read port(s)
+ comb += self.tlb_way.tag.eq(rd_tagway.data)
+ comb += self.tlb_way.pte.eq(rd_pteway.data)
+ sync += r_tlb_way.tag.eq(rd_tagway.data)
+ sync += r_tlb_way.pte.eq(rd_pteway.data)
+ with m.Else():
+ # ... so that the register can output it when no read is requested
+ # it's rather overkill but better to be safe than sorry
+ comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+ comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+ #comb += self.tlb_way.eq(r_tlb_way)
return m
class DCachePendingHit(Elaboratable):
- def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
- cache_valid_idx, cache_tag_set,
- req_addr,
- hit_set):
+ def __init__(self, cfg, tlb_way,
+ cache_i_validdx, cache_tag_set,
+ req_addr):
self.go = Signal()
self.virt_mode = Signal()
self.is_hit = Signal()
- self.tlb_hit = Signal()
- self.hit_way = Signal(WAY_BITS)
+ self.tlb_hit = cfg.TLBHit("tlb_hit")
+ self.hit_way = Signal(cfg.WAY_BITS)
self.rel_match = Signal()
- self.req_index = Signal(INDEX_BITS)
- self.reload_tag = Signal(TAG_BITS)
+ self.req_index = Signal(cfg.INDEX_BITS)
+ self.reload_tag = Signal(cfg.TAG_BITS)
- self.tlb_hit_way = tlb_hit_way
- self.tlb_pte_way = tlb_pte_way
- self.tlb_valid_way = tlb_valid_way
- self.cache_valid_idx = cache_valid_idx
+ self.tlb_way = tlb_way
+ self.cache_i_validdx = cache_i_validdx
self.cache_tag_set = cache_tag_set
self.req_addr = req_addr
- self.hit_set = hit_set
+ self.cfg = cfg
def elaborate(self, platform):
m = Module()
go = self.go
virt_mode = self.virt_mode
is_hit = self.is_hit
- tlb_pte_way = self.tlb_pte_way
- tlb_valid_way = self.tlb_valid_way
- cache_valid_idx = self.cache_valid_idx
+ tlb_way = self.tlb_way
+ cache_i_validdx = self.cache_i_validdx
cache_tag_set = self.cache_tag_set
req_addr = self.req_addr
- tlb_hit_way = self.tlb_hit_way
tlb_hit = self.tlb_hit
- hit_set = self.hit_set
hit_way = self.hit_way
rel_match = self.rel_match
req_index = self.req_index
reload_tag = self.reload_tag
+ cfg = self.cfg
+ hit_set = Array(Signal(name="hit_set_%d" % i) \
+ for i in range(cfg.TLB_NUM_WAYS))
rel_matches = Array(Signal(name="rel_matches_%d" % i) \
- for i in range(TLB_NUM_WAYS))
- hit_way_set = HitWaySet()
+ for i in range(cfg.TLB_NUM_WAYS))
+ hit_way_set = cfg.HitWaySet()
# Test if pending request is a hit on any way
# In order to make timing in virtual mode,
# the TLB, and then decide later which match to use.
with m.If(virt_mode):
- for j in range(TLB_NUM_WAYS): # tlb_num_way_t
- s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
- s_hit = Signal()
- s_pte = Signal(TLB_PTE_BITS)
- s_ra = Signal(REAL_ADDR_BITS)
- comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
- comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
- s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
- comb += s_tag.eq(get_tag(s_ra))
-
- for i in range(NUM_WAYS): # way_t
+ for j in range(cfg.TLB_NUM_WAYS): # tlb_num_way_t
+ s_tag = Signal(cfg.TAG_BITS, name="s_tag%d" % j)
+ s_hit = Signal(name="s_hit%d" % j)
+ s_pte = Signal(cfg.TLB_PTE_BITS, name="s_pte%d" % j)
+ s_ra = Signal(cfg.REAL_ADDR_BITS, name="s_ra%d" % j)
+ # read the PTE, calc the Real Address, get tge tag
+ comb += s_pte.eq(cfg.read_tlb_pte(j, tlb_way.pte))
+ comb += s_ra.eq(Cat(req_addr[0:cfg.TLB_LG_PGSZ],
+ s_pte[cfg.TLB_LG_PGSZ:cfg.REAL_ADDR_BITS]))
+ comb += s_tag.eq(cfg.get_tag(s_ra))
+ # for each way check tge tag against the cache tag set
+ for i in range(cfg.NUM_WAYS): # way_t
is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
- comb += is_tag_hit.eq(go & cache_valid_idx[i] &
- (read_tag(i, cache_tag_set) == s_tag)
- & tlb_valid_way[j])
+ comb += is_tag_hit.eq(go & cache_i_validdx[i] &
+ (cfg.read_tag(i, cache_tag_set) == s_tag)
+ & (tlb_way.valid[j]))
with m.If(is_tag_hit):
comb += hit_way_set[j].eq(i)
comb += s_hit.eq(1)
comb += hit_set[j].eq(s_hit)
- with m.If(s_tag == reload_tag):
- comb += rel_matches[j].eq(1)
- with m.If(tlb_hit):
- comb += is_hit.eq(hit_set[tlb_hit_way])
- comb += hit_way.eq(hit_way_set[tlb_hit_way])
- comb += rel_match.eq(rel_matches[tlb_hit_way])
+ comb += rel_matches[j].eq(s_tag == reload_tag)
+ with m.If(tlb_hit.valid):
+ comb += is_hit.eq(hit_set[tlb_hit.way])
+ comb += hit_way.eq(hit_way_set[tlb_hit.way])
+ comb += rel_match.eq(rel_matches[tlb_hit.way])
with m.Else():
- s_tag = Signal(TAG_BITS)
- comb += s_tag.eq(get_tag(req_addr))
- for i in range(NUM_WAYS): # way_t
+ s_tag = Signal(cfg.TAG_BITS)
+ comb += s_tag.eq(cfg.get_tag(req_addr))
+ for i in range(cfg.NUM_WAYS): # way_t
is_tag_hit = Signal(name="is_tag_hit_%d" % i)
- comb += is_tag_hit.eq(go & cache_valid_idx[i] &
- (read_tag(i, cache_tag_set) == s_tag))
+ comb += is_tag_hit.eq(go & cache_i_validdx[i] &
+ (cfg.read_tag(i, cache_tag_set) == s_tag))
with m.If(is_tag_hit):
comb += hit_way.eq(i)
comb += is_hit.eq(1)
return m
-class DCache(Elaboratable):
+class DCache(Elaboratable, DCacheConfig):
"""Set associative dcache write-through
TODO (in no specific order):
at the end of line (this requires dealing with requests coming in
while not idle...)
"""
- def __init__(self):
+ def __init__(self, pspec=None):
self.d_in = LoadStore1ToDCacheType("d_in")
self.d_out = DCacheToLoadStore1Type("d_out")
self.m_out = DCacheToMMUType("m_out")
self.stall_out = Signal()
-
- self.wb_out = WBMasterOut("wb_out")
- self.wb_in = WBSlaveOut("wb_in")
+ self.any_stall_out = Signal()
+ self.dreq_when_stall = Signal()
+ self.mreq_when_stall = Signal()
+
+ # standard naming (wired to non-standard for compatibility)
+ self.bus = Interface(addr_width=32,
+ data_width=64,
+ granularity=8,
+ features={'stall'},
+ #alignment=0,
+ name="dcache")
self.log_out = Signal(20)
+ # test if small cache to be enabled
+ self.small_cache = (hasattr(pspec, "small_cache") and
+ (pspec.small_cache == True))
+ # test if microwatt compatibility is to be enabled
+ self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+ (pspec.microwatt_compat == True))
+ # test if fabric compatibility is to be enabled
+ self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+ (pspec.fabric_compat == True))
+
+ XLEN = pspec.XLEN
+ TLB_SET_SIZE = 8
+ TLB_NUM_WAYS = 2
+ NUM_LINES = 8
+ NUM_WAYS = 2
+
+ if self.small_cache:
+ # reduce way sizes and num lines to ridiculously small
+ TLB_SET_SIZE = 2
+ TLB_NUM_WAYS = 1
+ NUM_LINES = 2
+ NUM_WAYS = 1
+ if self.microwatt_compat or self.fabric_compat:
+ # reduce way sizes
+ NUM_WAYS = 1
+ TLB_NUM_WAYS = 1
+
+ super().__init__(TLB_SET_SIZE=TLB_SET_SIZE,
+ # XLEN=XLEN, # TODO
+ TLB_NUM_WAYS = TLB_NUM_WAYS,
+ NUM_LINES = NUM_LINES,
+ NUM_WAYS = NUM_WAYS
+ )
+
def stage_0(self, m, r0, r1, r0_full):
"""Latch the request in r0.req as long as we're not stalling
"""
comb += r.doall.eq(m_in.doall)
comb += r.tlbld.eq(m_in.tlbld)
comb += r.mmu_req.eq(1)
+ comb += r.d_valid.eq(1)
m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
m_in.addr, m_in.pte, r.req.load)
comb += r.doall.eq(0)
comb += r.tlbld.eq(0)
comb += r.mmu_req.eq(0)
+ comb += r.d_valid.eq(0)
+
+ sync += r0_full.eq(0)
with m.If((~r1.full & ~d_in.hold) | ~r0_full):
sync += r0.eq(r)
sync += r0_full.eq(r.req.valid)
+ with m.Elif(~r0.d_valid):
# Sample data the cycle after a request comes in from loadstore1.
# If another request has come in already then the data will get
# put directly into req.data below.
- with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
- ~r0.mmu_req):
- sync += r0.req.data.eq(d_in.data)
- sync += r0.d_valid.eq(1)
+ sync += r0.req.data.eq(d_in.data)
+ sync += r0.d_valid.eq(1)
with m.If(d_in.valid):
m.d.sync += Display(" DCACHE req cache "
"virt %d addr %x data %x ld %d",
r.req.virt_mode, r.req.addr,
r.req.data, r.req.load)
- def tlb_read(self, m, r0_stall, tlb_valid_way,
- tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
- dtlb_tags, dtlb_ptes):
+ def tlb_read(self, m, r0_stall, tlb_way):
"""TLB
Operates in the second cycle on the request latched in r0.req.
TLB updates write the entry at the end of the second cycle.
sync = m.d.sync
m_in, d_in = self.m_in, self.d_in
- index = Signal(TLB_SET_BITS)
- addrbits = Signal(TLB_SET_BITS)
+ addrbits = Signal(self.TLB_SET_BITS)
- amin = TLB_LG_PGSZ
- amax = TLB_LG_PGSZ + TLB_SET_BITS
+ amin = self.TLB_LG_PGSZ
+ amax = self.TLB_LG_PGSZ + self.TLB_SET_BITS
with m.If(m_in.valid):
comb += addrbits.eq(m_in.addr[amin : amax])
with m.Else():
comb += addrbits.eq(d_in.addr[amin : amax])
- comb += index.eq(addrbits)
# If we have any op and the previous op isn't finished,
# then keep the same output for next cycle.
- with m.If(~r0_stall):
- sync += tlb_valid_way.eq(dtlb_valid_bits[index])
- sync += tlb_tag_way.eq(dtlb_tags[index])
- sync += tlb_pte_way.eq(dtlb_ptes[index])
+ d = self.dtlb_update
+ comb += d.tlb_read_index.eq(addrbits)
+ comb += d.tlb_read.eq(~r0_stall)
+ comb += tlb_way.eq(d.tlb_way)
- def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+ def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
"""Generate TLB PLRUs
"""
comb = m.d.comb
sync = m.d.sync
- if TLB_NUM_WAYS == 0:
+ if self.TLB_NUM_WAYS == 0:
return
- for i in range(TLB_SET_SIZE):
- # TLB PLRU interface
- tlb_plru = PLRU(TLB_WAY_BITS)
- setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
- tlb_plru_acc_en = Signal()
- comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
- comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
- comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
- comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+ # suite of PLRUs with a selection and output mechanism
+ tlb_plrus = PLRUs("d_tlb", self.TLB_SET_SIZE, self.TLB_WAY_BITS)
+ m.submodules.tlb_plrus = tlb_plrus
+ comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+ comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+ comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+ comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+ comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
def tlb_search(self, m, tlb_req_index, r0, r0_valid,
- tlb_valid_way, tlb_tag_way, tlb_hit_way,
- tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
+ tlb_way,
+ pte, tlb_hit, valid_ra, perm_attr, ra):
comb = m.d.comb
- hitway = Signal(TLB_WAY_BITS)
+ hitway = Signal(self.TLB_WAY_BITS)
hit = Signal()
- eatag = Signal(TLB_EA_TAG_BITS)
+ eatag = Signal(self.TLB_EA_TAG_BITS)
- TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
- comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
- comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
+ self.TLB_LG_END = self.TLB_LG_PGSZ + self.TLB_SET_BITS
+ r0_req_addr = r0.req.addr[self.TLB_LG_PGSZ : self.TLB_LG_END]
+ comb += tlb_req_index.eq(r0_req_addr)
+ comb += eatag.eq(r0.req.addr[self.TLB_LG_END : 64 ])
- for i in range(TLB_NUM_WAYS):
+ for i in range(self.TLB_NUM_WAYS):
is_tag_hit = Signal(name="is_tag_hit%d" % i)
- tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
- comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
- comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
+ tlb_tag = Signal(self.TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
+ comb += tlb_tag.eq(self.read_tlb_tag(i, tlb_way.tag))
+ comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
with m.If(is_tag_hit):
comb += hitway.eq(i)
comb += hit.eq(1)
- comb += tlb_hit.eq(hit & r0_valid)
- comb += tlb_hit_way.eq(hitway)
+ comb += tlb_hit.valid.eq(hit & r0_valid)
+ comb += tlb_hit.way.eq(hitway)
- with m.If(tlb_hit):
- comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
- comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+ with m.If(tlb_hit.valid):
+ comb += pte.eq(self.read_tlb_pte(hitway, tlb_way.pte))
+ comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
with m.If(r0.req.virt_mode):
- comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
- r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
- pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
+ comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+ r0.req.addr[self.ROW_OFF_BITS:self.TLB_LG_PGSZ],
+ pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
comb += perm_attr.reference.eq(pte[8])
comb += perm_attr.changed.eq(pte[7])
comb += perm_attr.nocache.eq(pte[5])
comb += perm_attr.rd_perm.eq(pte[2])
comb += perm_attr.wr_perm.eq(pte[1])
with m.Else():
- comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
- r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
+ comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+ r0.req.addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS]))
comb += perm_attr.reference.eq(1)
comb += perm_attr.changed.eq(1)
comb += perm_attr.nocache.eq(0)
with m.If(valid_ra):
m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
- r0.req.virt_mode, tlb_hit, ra, pte)
+ r0.req.virt_mode, tlb_hit.valid, ra, pte)
m.d.sync += Display(" perm ref=%d", perm_attr.reference)
m.d.sync += Display(" perm chg=%d", perm_attr.changed)
m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)
- def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
- tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
- dtlb_tags, tlb_pte_way, dtlb_ptes):
-
- dtlb_valids = TLBValidBitsArray()
+ def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+ tlb_hit, tlb_plru_victim):
comb = m.d.comb
sync = m.d.sync
comb += tlbie.eq(r0_valid & r0.tlbie)
comb += tlbwe.eq(r0_valid & r0.tlbld)
- m.submodules.tlb_update = d = DTLBUpdate()
- with m.If(tlbie & r0.doall):
- # clear all valid bits at once
- for i in range(TLB_SET_SIZE):
- sync += dtlb_valid_bits[i].eq(0)
- with m.If(d.updated):
- sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
- sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
- with m.If(d.v_updated):
- sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
-
- comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+ d = self.dtlb_update
comb += d.tlbie.eq(tlbie)
comb += d.tlbwe.eq(tlbwe)
comb += d.doall.eq(r0.doall)
comb += d.tlb_hit.eq(tlb_hit)
- comb += d.tlb_hit_way.eq(tlb_hit_way)
- comb += d.tlb_tag_way.eq(tlb_tag_way)
- comb += d.tlb_pte_way.eq(tlb_pte_way)
comb += d.tlb_req_index.eq(tlb_req_index)
- with m.If(tlb_hit):
- comb += d.repl_way.eq(tlb_hit_way)
+ with m.If(tlb_hit.valid):
+ comb += d.repl_way.eq(tlb_hit.way)
with m.Else():
- comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
- comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
+ comb += d.repl_way.eq(tlb_plru_victim)
+ comb += d.eatag.eq(r0.req.addr[self.TLB_LG_PGSZ + self.TLB_SET_BITS:64])
comb += d.pte_data.eq(r0.req.data)
def maybe_plrus(self, m, r1, plru_victim):
comb = m.d.comb
sync = m.d.sync
- if TLB_NUM_WAYS == 0:
+ if self.TLB_NUM_WAYS == 0:
return
- for i in range(NUM_LINES):
- # PLRU interface
- plru = PLRU(WAY_BITS)
- setattr(m.submodules, "plru%d" % i, plru)
- plru_acc_en = Signal()
-
- comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
- comb += plru.acc_en.eq(plru_acc_en)
- comb += plru.acc_i.eq(r1.hit_way)
- comb += plru_victim[i].eq(plru.lru_o)
+ # suite of PLRUs with a selection and output mechanism
+ m.submodules.plrus = plrus = PLRUs("dtag", self.NUM_LINES,
+ self.WAY_BITS)
+ comb += plrus.way.eq(r1.hit_way)
+ comb += plrus.valid.eq(r1.cache_hit)
+ comb += plrus.index.eq(r1.hit_index)
+ comb += plrus.isel.eq(r1.store_index) # select victim
+ comb += plru_victim.eq(plrus.o_index) # selected victim
- def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
+ def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set):
"""Cache tag RAM read port
"""
comb = m.d.comb
sync = m.d.sync
+
m_in, d_in = self.m_in, self.d_in
- index = Signal(INDEX_BITS)
+ # synchronous tag read-port: NOT TRANSPARENT (cannot pass through
+ # write-to-a-read at the same time), seems to pass tests ok
+ m.submodules.rd_tag = rd_tag = self.tagmem.read_port(transparent=False)
+
+ index = Signal(self.INDEX_BITS)
with m.If(r0_stall):
comb += index.eq(req_index)
with m.Elif(m_in.valid):
- comb += index.eq(get_index(m_in.addr))
+ comb += index.eq(self.get_index(m_in.addr))
with m.Else():
- comb += index.eq(get_index(d_in.addr))
- sync += cache_tag_set.eq(cache_tags[index])
+ comb += index.eq(self.get_index(d_in.addr))
+ comb += rd_tag.addr.eq(index)
+ comb += cache_tag_set.eq(rd_tag.data) # read-port is a 1-clock delay
def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
r0_valid, r1, cache_valids, replace_way,
use_forward1_next, use_forward2_next,
req_hit_way, plru_victim, rc_ok, perm_attr,
valid_ra, perm_ok, access_ok, req_op, req_go,
- tlb_pte_way,
- tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+ tlb_hit, tlb_way, cache_tag_set,
cancel_store, req_same_tag, r0_stall, early_req_row):
"""Cache request parsing and hit detection
"""
m_in, d_in = self.m_in, self.d_in
is_hit = Signal()
- hit_way = Signal(WAY_BITS)
+ hit_way = Signal(self.WAY_BITS)
op = Signal(Op)
opsel = Signal(3)
go = Signal()
nc = Signal()
- hit_set = Array(Signal(name="hit_set_%d" % i) \
- for i in range(TLB_NUM_WAYS))
- cache_valid_idx = Signal(NUM_WAYS)
+ cache_i_validdx = Signal(self.NUM_WAYS)
# Extract line, row and tag from request
- comb += req_index.eq(get_index(r0.req.addr))
- comb += req_row.eq(get_row(r0.req.addr))
- comb += req_tag.eq(get_tag(ra))
+ comb += req_index.eq(self.get_index(r0.req.addr))
+ comb += req_row.eq(self.get_row(r0.req.addr))
+ comb += req_tag.eq(self.get_tag(ra))
if False: # display on comb is a bit... busy.
comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
r0.req.addr, ra, req_index, req_tag, req_row)
comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
- comb += cache_valid_idx.eq(cache_valids[req_index])
-
- m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
- tlb_valid_way, tlb_hit_way,
- cache_valid_idx, cache_tag_set,
- r0.req.addr,
- hit_set)
+ comb += cache_i_validdx.eq(cache_valids[req_index])
+ m.submodules.dcache_pend = dc = DCachePendingHit(self, tlb_way,
+ cache_i_validdx, cache_tag_set,
+ r0.req.addr)
comb += dc.tlb_hit.eq(tlb_hit)
comb += dc.reload_tag.eq(r1.reload_tag)
comb += dc.virt_mode.eq(r0.req.virt_mode)
comb += dc.go.eq(go)
comb += dc.req_index.eq(req_index)
+
comb += is_hit.eq(dc.is_hit)
comb += hit_way.eq(dc.hit_way)
comb += req_same_tag.eq(dc.rel_match)
# For a store, consider this a hit even if the row isn't
# valid since it will be by the time we perform the store.
# For a load, check the appropriate row valid bit.
- rrow = Signal(ROW_LINE_BITS)
+ rrow = Signal(self.ROW_LINE_BITS)
comb += rrow.eq(req_row)
valid = r1.rows_valid[rrow]
comb += is_hit.eq((~r0.req.load) | valid)
comb += hit_way.eq(replace_way)
# Whether to use forwarded data for a load or not
- with m.If((get_row(r1.req.real_addr) == req_row) &
+ with m.If((self.get_row(r1.req.real_addr) == req_row) &
(r1.req.hit_way == hit_way)):
# Only need to consider r1.write_bram here, since if we
# are writing refill data here, then we don't have a
# The way to replace on a miss
with m.If(r1.write_tag):
- comb += replace_way.eq(plru_victim[r1.store_index])
+ comb += replace_way.eq(plru_victim)
with m.Else():
comb += replace_way.eq(r1.store_way)
(perm_attr.wr_perm |
(r0.req.load & perm_attr.rd_perm)))
comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
+
# Combine the request and cache hit status to decide what
# operation needs to be done
comb += nc.eq(r0.req.nc | perm_attr.nocache)
# row requested.
with m.If(~r0_stall):
with m.If(m_in.valid):
- comb += early_req_row.eq(get_row(m_in.addr))
+ comb += early_req_row.eq(self.get_row(m_in.addr))
with m.Else():
- comb += early_req_row.eq(get_row(d_in.addr))
+ comb += early_req_row.eq(self.get_row(d_in.addr))
with m.Else():
comb += early_req_row.eq(req_row)
with m.Else():
comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
with m.If((~reservation.valid) |
- (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
+ (r0.req.addr[self.LINE_OFF_BITS:64] !=
+ reservation.addr)):
comb += cancel_store.eq(1)
def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
reservation, r0):
-
comb = m.d.comb
sync = m.d.sync
sync += reservation.valid.eq(0)
with m.Elif(set_rsrv):
sync += reservation.valid.eq(1)
- sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
+ sync += reservation.addr.eq(r0.req.addr[self.LINE_OFF_BITS:64])
def writeback_control(self, m, r1, cache_out_row):
"""Return data for loads & completion control logic
dsel = data_fwd.word_select(i, 8)
comb += data_out.word_select(i, 8).eq(dsel)
+ # DCache output to LoadStore
comb += d_out.valid.eq(r1.ls_valid)
comb += d_out.data.eq(data_out)
comb += d_out.store_done.eq(~r1.stcx_fail)
# error cases complete without stalling
with m.If(r1.ls_error):
- sync += Display("completing ld/st with error")
+ with m.If(r1.dcbz):
+ sync += Display("completing dcbz with error")
+ with m.Else():
+ sync += Display("completing ld/st with error")
# Slow ops (load miss, NC, stores)
with m.If(r1.slow_valid):
account by using 1-cycle delayed signals for load hits.
"""
comb = m.d.comb
- wb_in = self.wb_in
+ bus = self.bus
+
+ # a Binary-to-Unary one-hots here. replace-way one-hot is gated
+ # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+ m.submodules.rams_replace_way_e = rwe = Decoder(self.NUM_WAYS)
+ comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+ ~r1.write_bram))
+ comb += rwe.i.eq(replace_way)
+
+ m.submodules.rams_hit_way_e = hwe = Decoder(self.NUM_WAYS)
+ comb += hwe.i.eq(r1.hit_way)
+
+ # this one is gated with write_bram, and replace_way_e can never be
+ # set at the same time. that means that do_write can OR the outputs
+ m.submodules.rams_hit_req_way_e = hre = Decoder(self.NUM_WAYS)
+ comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+ comb += hre.i.eq(r1.req.hit_way)
+
+ # common Signals
+ do_read = Signal()
+ wr_addr = Signal(self.ROW_BITS)
+ wr_data = Signal(WB_DATA_BITS)
+ wr_sel = Signal(self.ROW_SIZE)
+ rd_addr = Signal(self.ROW_BITS)
+
+ comb += do_read.eq(1) # always enable
+ comb += rd_addr.eq(early_req_row)
+
+ # Write mux:
+ #
+ # Defaults to wishbone read responses (cache refill)
+ #
+ # For timing, the mux on wr_data/sel/addr is not
+ # dependent on anything other than the current state.
- for i in range(NUM_WAYS):
- do_read = Signal(name="do_rd%d" % i)
- rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
+ with m.If(r1.write_bram):
+ # Write store data to BRAM. This happens one
+ # cycle after the store is in r0.
+ comb += wr_data.eq(r1.req.data)
+ comb += wr_sel.eq(r1.req.byte_sel)
+ comb += wr_addr.eq(self.get_row(r1.req.real_addr))
+
+ with m.Else():
+ # Otherwise, we might be doing a reload or a DCBZ
+ with m.If(r1.dcbz):
+ comb += wr_data.eq(0)
+ with m.Else():
+ comb += wr_data.eq(bus.dat_r)
+ comb += wr_addr.eq(r1.store_row)
+ comb += wr_sel.eq(~0) # all 1s
+
+ # set up Cache Rams
+ for i in range(self.NUM_WAYS):
do_write = Signal(name="do_wr%d" % i)
- wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
- wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
- wr_sel = Signal(ROW_SIZE)
- wr_sel_m = Signal(ROW_SIZE)
- _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+ wr_sel_m = Signal(self.ROW_SIZE, name="wr_sel_m_%d" % i)
+ d_out= Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
- way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
- setattr(m.submodules, "cacheram_%d" % i, way)
+ way = CacheRam(self.ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
+ m.submodules["cacheram_%d" % i] = way
comb += way.rd_en.eq(do_read)
comb += way.rd_addr.eq(rd_addr)
- comb += _d_out.eq(way.rd_data_o)
+ comb += d_out.eq(way.rd_data_o)
comb += way.wr_sel.eq(wr_sel_m)
comb += way.wr_addr.eq(wr_addr)
comb += way.wr_data.eq(wr_data)
# Cache hit reads
- comb += do_read.eq(1)
- comb += rd_addr.eq(early_req_row)
- with m.If(r1.hit_way == i):
- comb += cache_out_row.eq(_d_out)
-
- # Write mux:
- #
- # Defaults to wishbone read responses (cache refill)
- #
- # For timing, the mux on wr_data/sel/addr is not
- # dependent on anything other than the current state.
-
- with m.If(r1.write_bram):
- # Write store data to BRAM. This happens one
- # cycle after the store is in r0.
- comb += wr_data.eq(r1.req.data)
- comb += wr_sel.eq(r1.req.byte_sel)
- comb += wr_addr.eq(get_row(r1.req.real_addr))
-
- with m.If(i == r1.req.hit_way):
- comb += do_write.eq(1)
- with m.Else():
- # Otherwise, we might be doing a reload or a DCBZ
- with m.If(r1.dcbz):
- comb += wr_data.eq(0)
- with m.Else():
- comb += wr_data.eq(wb_in.dat)
- comb += wr_addr.eq(r1.store_row)
- comb += wr_sel.eq(~0) # all 1s
+ with m.If(hwe.o[i]):
+ comb += cache_out_row.eq(d_out)
- with m.If((r1.state == State.RELOAD_WAIT_ACK)
- & wb_in.ack & (replace_way == i)):
- comb += do_write.eq(1)
+ # these are mutually-exclusive via their Decoder-enablers
+ # (note: Decoder-enable is inverted)
+ comb += do_write.eq(hre.o[i] | rwe.o[i])
# Mask write selects with do_write since BRAM
# doesn't have a global write-enable
# It also handles error cases (TLB miss, cache paradox)
def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
req_hit_way, req_index, req_tag, access_ok,
- tlb_hit, tlb_hit_way, tlb_req_index):
-
+ tlb_hit, tlb_req_index):
comb = m.d.comb
sync = m.d.sync
sync += r1.hit_way.eq(req_hit_way)
sync += r1.hit_index.eq(req_index)
- with m.If(req_op == Op.OP_LOAD_HIT):
- sync += r1.hit_load_valid.eq(1)
- with m.Else():
- sync += r1.hit_load_valid.eq(0)
-
- with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
- sync += r1.cache_hit.eq(1)
- with m.Else():
- sync += r1.cache_hit.eq(0)
+ sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
+ sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
+ (req_op == Op.OP_STORE_HIT))
with m.If(req_op == Op.OP_BAD):
sync += Display("Signalling ld/st error "
sync += r1.ls_error.eq(~r0.mmu_req)
sync += r1.mmu_error.eq(r0.mmu_req)
sync += r1.cache_paradox.eq(access_ok)
-
with m.Else():
sync += r1.ls_error.eq(0)
sync += r1.mmu_error.eq(0)
sync += r1.cache_paradox.eq(0)
- with m.If(req_op == Op.OP_STCX_FAIL):
- sync += r1.stcx_fail.eq(1)
- with m.Else():
- sync += r1.stcx_fail.eq(0)
+ sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
# Record TLB hit information for updating TLB PLRU
sync += r1.tlb_hit.eq(tlb_hit)
- sync += r1.tlb_hit_way.eq(tlb_hit_way)
sync += r1.tlb_hit_index.eq(tlb_req_index)
# Memory accesses are handled by this state machine:
# All wishbone requests generation is done here.
# This machine operates at stage 1.
def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
- cache_valids, r0, replace_way,
+ r0, replace_way,
req_hit_way, req_same_tag,
- r0_valid, req_op, cache_tags, req_go, ra):
+ r0_valid, req_op, cache_valids, req_go, ra):
comb = m.d.comb
sync = m.d.sync
- wb_in = self.wb_in
+ bus = self.bus
d_in = self.d_in
- req = MemAccessRequest("mreq_ds")
+ m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+ granularity=self.TAG_WIDTH)
+
+ req = MemAccessRequest(self, "mreq_ds")
- req_row = Signal(ROW_BITS)
- req_idx = Signal(INDEX_BITS)
- req_tag = Signal(TAG_BITS)
- comb += req_idx.eq(get_index(req.real_addr))
- comb += req_row.eq(get_row(req.real_addr))
- comb += req_tag.eq(get_tag(req.real_addr))
+ r1_next_cycle = Signal()
+ req_row = Signal(self.ROW_BITS)
+ req_idx = Signal(self.INDEX_BITS)
+ req_tag = Signal(self.TAG_BITS)
+ comb += req_idx.eq(self.get_index(req.real_addr))
+ comb += req_row.eq(self.get_row(req.real_addr))
+ comb += req_tag.eq(self.get_tag(req.real_addr))
sync += r1.use_forward1.eq(use_forward1_next)
sync += r1.forward_sel.eq(0)
sync += r1.forward_data1.eq(r1.req.data)
sync += r1.forward_sel1.eq(r1.req.byte_sel)
sync += r1.forward_way1.eq(r1.req.hit_way)
- sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
+ sync += r1.forward_row1.eq(self.get_row(r1.req.real_addr))
sync += r1.forward_valid1.eq(1)
with m.Else():
with m.If(r1.dcbz):
sync += r1.forward_data1.eq(0)
with m.Else():
- sync += r1.forward_data1.eq(wb_in.dat)
+ sync += r1.forward_data1.eq(bus.dat_r)
sync += r1.forward_sel1.eq(~0) # all 1s
sync += r1.forward_way1.eq(replace_way)
sync += r1.forward_row1.eq(r1.store_row)
sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
- with m.If(~r0.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(r0.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
with m.If(r1.write_tag):
# Store new tag in selected way
- for i in range(NUM_WAYS):
- with m.If(i == replace_way):
- ct = Signal(TAG_RAM_WIDTH)
- comb += ct.eq(cache_tags[r1.store_index])
- """
-TODO: check this
-cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
- (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
- """
- comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
- sync += cache_tags[r1.store_index].eq(ct)
+ replace_way_onehot = Signal(self.NUM_WAYS)
+ comb += replace_way_onehot.eq(1<<replace_way)
+ ct = Signal(self.TAG_RAM_WIDTH)
+ comb += ct.eq(r1.reload_tag << (replace_way*self.TAG_WIDTH))
+ comb += wr_tag.en.eq(replace_way_onehot)
+ comb += wr_tag.addr.eq(r1.store_index)
+ comb += wr_tag.data.eq(ct)
+
sync += r1.store_way.eq(replace_way)
sync += r1.write_tag.eq(0)
| (req_op == Op.OP_STORE_HIT)):
sync += r1.req.eq(req)
sync += r1.full.eq(1)
+ # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
+ # destroy r1.req by overwriting r1.full back to zero
+ comb += r1_next_cycle.eq(1)
# Main state machine
with m.Switch(r1.state):
with m.Case(State.IDLE):
- sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
+ sync += r1.wb.adr.eq(req.real_addr[self.ROW_OFF_BITS:])
sync += r1.wb.sel.eq(req.byte_sel)
sync += r1.wb.dat.eq(req.data)
sync += r1.dcbz.eq(req.dcbz)
# for subsequent stores.
sync += r1.store_index.eq(req_idx)
sync += r1.store_row.eq(req_row)
- sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
+ sync += r1.end_row_ix.eq(self.get_row_of_line(req_row)-1)
sync += r1.reload_tag.eq(req_tag)
sync += r1.req.same_tag.eq(1)
with m.If(req.op == Op.OP_STORE_HIT):
sync += r1.store_way.eq(req.hit_way)
+ #with m.If(r1.dec_acks):
+ # sync += r1.acks_pending.eq(r1.acks_pending - 1)
+
# Reset per-row valid bits,
# ready for handling OP_LOAD_MISS
- for i in range(ROW_PER_LINE):
+ for i in range(self.ROW_PER_LINE):
sync += r1.rows_valid[i].eq(0)
with m.If(req_op != Op.OP_NONE):
sync += r1.state.eq(State.STORE_WAIT_ACK)
sync += r1.acks_pending.eq(1)
sync += r1.full.eq(0)
+ comb += r1_next_cycle.eq(0)
sync += r1.slow_valid.eq(1)
- with m.If(~req.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(req.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
with m.If(req.op == Op.OP_STORE_HIT):
sync += r1.write_bram.eq(1)
pass
with m.Case(State.RELOAD_WAIT_ACK):
- ld_stbs_done = Signal()
- # Requests are all sent if stb is 0
- comb += ld_stbs_done.eq(~r1.wb.stb)
# If we are still sending requests, was one accepted?
- with m.If((~wb_in.stall) & r1.wb.stb):
- # That was the last word? We are done sending.
- # Clear stb and set ld_stbs_done so we can handle an
- # eventual last ack on the same cycle.
+ with m.If((~bus.stall) & r1.wb.stb):
+ # That was the last word? We are done sending. Clear stb
# sigh - reconstruct wb adr with 3 extra 0s at front
- wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
- with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
+ wb_adr = Cat(Const(0, self.ROW_OFF_BITS), r1.wb.adr)
+ with m.If(self.is_last_row_addr(wb_adr, r1.end_row_ix)):
sync += r1.wb.stb.eq(0)
- comb += ld_stbs_done.eq(1)
# Calculate the next row address in the current cache line
- row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
+ rlen = self.LINE_OFF_BITS-self.ROW_OFF_BITS
+ row = Signal(rlen)
comb += row.eq(r1.wb.adr)
- sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
+ sync += r1.wb.adr[:rlen].eq(row+1)
# Incoming acks processing
- sync += r1.forward_valid1.eq(wb_in.ack)
- with m.If(wb_in.ack):
- srow = Signal(ROW_LINE_BITS)
+ sync += r1.forward_valid1.eq(bus.ack)
+ with m.If(bus.ack):
+ srow = Signal(self.ROW_LINE_BITS)
comb += srow.eq(r1.store_row)
sync += r1.rows_valid[srow].eq(1)
# Compare the whole address in case the
# request in r1.req is not the one that
# started this refill.
- with m.If(req.valid & r1.req.same_tag &
- ((r1.dcbz & r1.req.dcbz) |
- (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
- (r1.store_row == get_row(req.real_addr))):
- sync += r1.full.eq(0)
+ rowmatch = Signal()
+ lastrow = Signal()
+ comb += rowmatch.eq(r1.store_row ==
+ self.get_row(r1.req.real_addr))
+ comb += lastrow.eq(self.is_last_row(r1.store_row,
+ r1.end_row_ix))
+ with m.If(r1.full & r1.req.same_tag &
+ ((r1.dcbz & req.dcbz) |
+ (r1.req.op == Op.OP_LOAD_MISS)) & rowmatch):
+ sync += r1.full.eq(r1_next_cycle)
sync += r1.slow_valid.eq(1)
- with m.If(~r1.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(r1.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
sync += r1.forward_sel.eq(~0) # all 1s
sync += r1.use_forward1.eq(1)
# Check for completion
- with m.If(ld_stbs_done & is_last_row(r1.store_row,
- r1.end_row_ix)):
+ with m.If(lastrow):
# Complete wishbone cycle
sync += r1.wb.cyc.eq(0)
# Cache line is now valid
- cv = Signal(INDEX_BITS)
+ cv = Signal(self.INDEX_BITS)
comb += cv.eq(cache_valids[r1.store_index])
comb += cv.bit_select(r1.store_way, 1).eq(1)
sync += cache_valids[r1.store_index].eq(cv)
cv, r1.store_index, r1.store_way)
# Increment store row counter
- sync += r1.store_row.eq(next_row(r1.store_row))
+ sync += r1.store_row.eq(self.next_row(r1.store_row))
with m.Case(State.STORE_WAIT_ACK):
st_stbs_done = Signal()
- acks = Signal(3)
adjust_acks = Signal(3)
comb += st_stbs_done.eq(~r1.wb.stb)
- comb += acks.eq(r1.acks_pending)
with m.If(r1.inc_acks != r1.dec_acks):
with m.If(r1.inc_acks):
- comb += adjust_acks.eq(acks + 1)
+ comb += adjust_acks.eq(r1.acks_pending + 1)
with m.Else():
- comb += adjust_acks.eq(acks - 1)
+ comb += adjust_acks.eq(r1.acks_pending - 1)
with m.Else():
- comb += adjust_acks.eq(acks)
+ comb += adjust_acks.eq(r1.acks_pending)
sync += r1.acks_pending.eq(adjust_acks)
# Clear stb when slave accepted request
- with m.If(~wb_in.stall):
+ with m.If(~bus.stall):
# See if there is another store waiting
# to be done which is in the same real page.
+ # (this is when same_tsg is true)
with m.If(req.valid):
- _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
- sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
+ _ra = req.real_addr[self.ROW_OFF_BITS:
+ self.SET_SIZE_BITS]
+ alen = self.SET_SIZE_BITS-self.ROW_OFF_BITS
+ sync += r1.wb.adr[0:alen].eq(_ra)
sync += r1.wb.dat.eq(req.data)
sync += r1.wb.sel.eq(req.byte_sel)
with m.If((adjust_acks < 7) & req.same_tag &
- ((req.op == Op.OP_STORE_MISS)
- | (req.op == Op.OP_STORE_HIT))):
+ ((req.op == Op.OP_STORE_MISS) |
+ (req.op == Op.OP_STORE_HIT))):
sync += r1.wb.stb.eq(1)
comb += st_stbs_done.eq(0)
+ sync += r1.store_way.eq(req.hit_way)
+ sync += r1.store_row.eq(self.get_row(req.real_addr))
with m.If(req.op == Op.OP_STORE_HIT):
sync += r1.write_bram.eq(1)
- sync += r1.full.eq(0)
+ sync += r1.full.eq(r1_next_cycle)
sync += r1.slow_valid.eq(1)
# Store requests never come from the MMU
comb += st_stbs_done.eq(1)
# Got ack ? See if complete.
- with m.If(wb_in.ack):
+ sync += Display("got ack %d %d stbs %d adjust_acks %d",
+ bus.ack, bus.ack, st_stbs_done, adjust_acks)
+ with m.If(bus.ack):
with m.If(st_stbs_done & (adjust_acks == 1)):
sync += r1.state.eq(State.IDLE)
sync += r1.wb.cyc.eq(0)
with m.Case(State.NC_LOAD_WAIT_ACK):
# Clear stb when slave accepted request
- with m.If(~wb_in.stall):
+ with m.If(~bus.stall):
sync += r1.wb.stb.eq(0)
# Got ack ? complete.
- with m.If(wb_in.ack):
+ with m.If(bus.ack):
sync += r1.state.eq(State.IDLE)
- sync += r1.full.eq(0)
+ sync += r1.full.eq(r1_next_cycle)
sync += r1.slow_valid.eq(1)
- with m.If(~r1.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(r1.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
sync += r1.forward_sel.eq(~0) # all 1s
sync += r1.use_forward1.eq(1)
sync += r1.wb.cyc.eq(0)
sync += r1.wb.stb.eq(0)
- def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
+ def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
sync = m.d.sync
- d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
+ d_out, bus, log_out = self.d_out, self.bus, self.log_out
- sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
+ sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
stall_out, req_op[:3], d_out.valid, d_out.error,
- r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
+ r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
r1.real_adr[3:6]))
def elaborate(self, platform):
m = Module()
- comb = m.d.comb
- d_in = self.d_in
+ comb, sync = m.d.comb, m.d.sync
+ m_in, d_in = self.m_in, self.d_in
# Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
- cache_tags = CacheTagArray()
- cache_tag_set = Signal(TAG_RAM_WIDTH)
- cache_valids = CacheValidBitsArray()
+ cache_valids = self.CacheValidsArray()
+ cache_tag_set = Signal(self.TAG_RAM_WIDTH)
- # TODO attribute ram_style : string;
- # TODO attribute ram_style of cache_tags : signal is "distributed";
+ self.tagmem = Memory(depth=self.NUM_LINES, width=self.TAG_RAM_WIDTH,
+ attrs={'syn_ramstyle': "block_ram"})
"""note: these are passed to nmigen.hdl.Memory as "attributes".
don't know how, just that they are.
"""
- dtlb_valid_bits = TLBValidBitsArray()
- dtlb_tags = TLBTagsArray()
- dtlb_ptes = TLBPtesArray()
# TODO attribute ram_style of
# dtlb_tags : signal is "distributed";
# TODO attribute ram_style of
r0 = RegStage0("r0")
r0_full = Signal()
- r1 = RegStage1("r1")
+ r1 = RegStage1(self, "r1")
- reservation = Reservation()
+ reservation = Reservation(self, "rsrv")
# Async signals on incoming request
- req_index = Signal(INDEX_BITS)
- req_row = Signal(ROW_BITS)
- req_hit_way = Signal(WAY_BITS)
- req_tag = Signal(TAG_BITS)
+ req_index = Signal(self.INDEX_BITS)
+ req_row = Signal(self.ROW_BITS)
+ req_hit_way = Signal(self.WAY_BITS)
+ req_tag = Signal(self.TAG_BITS)
req_op = Signal(Op)
req_data = Signal(64)
req_same_tag = Signal()
req_go = Signal()
- early_req_row = Signal(ROW_BITS)
+ early_req_row = Signal(self.ROW_BITS)
cancel_store = Signal()
set_rsrv = Signal()
cache_out_row = Signal(WB_DATA_BITS)
- plru_victim = PLRUOut()
- replace_way = Signal(WAY_BITS)
+ plru_victim = Signal(self.WAY_BITS)
+ replace_way = Signal(self.WAY_BITS)
# Wishbone read/write/cache write formatting signals
bus_sel = Signal(8)
# TLB signals
- tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
- tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
- tlb_valid_way = Signal(TLB_NUM_WAYS)
- tlb_req_index = Signal(TLB_SET_BITS)
- tlb_hit = Signal()
- tlb_hit_way = Signal(TLB_WAY_BITS)
- pte = Signal(TLB_PTE_BITS)
- ra = Signal(REAL_ADDR_BITS)
+ tlb_way = self.TLBRecord("tlb_way")
+ tlb_req_index = Signal(self.TLB_SET_BITS)
+ tlb_hit = self.TLBHit("tlb_hit")
+ pte = Signal(self.TLB_PTE_BITS)
+ ra = Signal(self.REAL_ADDR_BITS)
valid_ra = Signal()
perm_attr = PermAttr("dc_perms")
rc_ok = Signal()
perm_ok = Signal()
access_ok = Signal()
- tlb_plru_victim = TLBPLRUOut()
+ tlb_plru_victim = Signal(self.TLB_WAY_BITS)
# we don't yet handle collisions between loadstore1 requests
# and MMU requests
comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
comb += self.stall_out.eq(r0_stall)
-
- # Wire up wishbone request latch out of stage 1
- comb += self.wb_out.eq(r1.wb)
+ # debugging: detect if any stall ever requested, which is fine,
+ # but if a request comes in when stall requested, that's bad.
+ with m.If(r0_stall):
+ sync += self.any_stall_out.eq(1)
+ with m.If(d_in.valid):
+ sync += self.dreq_when_stall.eq(1)
+ with m.If(m_in.valid):
+ sync += self.mreq_when_stall.eq(1)
# deal with litex not doing wishbone pipeline mode
# XXX in wrong way. FIFOs are needed in the SRAM test
- # so that stb/ack match up
- comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
+ # so that stb/ack match up. same thing done in icache.py
+ if not self.microwatt_compat or self.fabric_compat:
+ comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
+ # Wire up wishbone request latch out of stage 1
+ comb += self.bus.we.eq(r1.wb.we)
+ comb += self.bus.adr.eq(r1.wb.adr)
+ comb += self.bus.sel.eq(r1.wb.sel)
+ comb += self.bus.stb.eq(r1.wb.stb)
+ comb += self.bus.dat_w.eq(r1.wb.dat)
+ comb += self.bus.cyc.eq(r1.wb.cyc)
+
+ # create submodule TLBUpdate
+ m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate(self)
# call sub-functions putting everything together, using shared
# signals established above
self.stage_0(m, r0, r1, r0_full)
- self.tlb_read(m, r0_stall, tlb_valid_way,
- tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
- dtlb_tags, dtlb_ptes)
+ self.tlb_read(m, r0_stall, tlb_way)
self.tlb_search(m, tlb_req_index, r0, r0_valid,
- tlb_valid_way, tlb_tag_way, tlb_hit_way,
- tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
- self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
- tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
- dtlb_tags, tlb_pte_way, dtlb_ptes)
+ tlb_way,
+ pte, tlb_hit, valid_ra, perm_attr, ra)
+ self.tlb_update(m, r0_valid, r0, tlb_req_index,
+ tlb_hit, tlb_plru_victim)
self.maybe_plrus(m, r1, plru_victim)
- self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
- self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
+ self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
+ self.cache_tag_read(m, r0_stall, req_index, cache_tag_set)
self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
r0_valid, r1, cache_valids, replace_way,
use_forward1_next, use_forward2_next,
req_hit_way, plru_victim, rc_ok, perm_attr,
valid_ra, perm_ok, access_ok, req_op, req_go,
- tlb_pte_way,
- tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+ tlb_hit, tlb_way, cache_tag_set,
cancel_store, req_same_tag, r0_stall, early_req_row)
self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
r0_valid, r0, reservation)
self.rams(m, r1, early_req_row, cache_out_row, replace_way)
self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
req_hit_way, req_index, req_tag, access_ok,
- tlb_hit, tlb_hit_way, tlb_req_index)
+ tlb_hit, tlb_req_index)
self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
- cache_valids, r0, replace_way,
+ r0, replace_way,
req_hit_way, req_same_tag,
- r0_valid, req_op, cache_tags, req_go, ra)
- #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
+ r0_valid, req_op, cache_valids, req_go, ra)
+ #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
return m
# liveness counter
live_cnt = Signal(5)
# keep data and valid stable, until accepted
- with m.If(Past(dut.p.valid_i) & ~Past(dut.p.ready_o)):
+ with m.If(Past(dut.p.i_valid) & ~Past(dut.p.o_ready)):
comb += [
Assume(Stable(dut.op.sdir)),
Assume(Stable(dut.p.data_i.data)),
Assume(Stable(dut.p.data_i.shift)),
- Assume(Stable(dut.p.valid_i)),
+ Assume(Stable(dut.p.i_valid)),
]
# force reading the output in a reasonable time,
# necessary to pass induction
- with m.If(Past(dut.n.valid_o) & ~Past(dut.n.ready_i)):
- comb += Assume(dut.n.ready_i)
+ with m.If(Past(dut.n.o_valid) & ~Past(dut.n.i_ready)):
+ comb += Assume(dut.n.i_ready)
# capture transferred input data
- with m.If(dut.p.ready_o & dut.p.valid_i):
+ with m.If(dut.p.o_ready & dut.p.i_valid):
sync += [
data_i.eq(dut.p.data_i.data),
shift_i.eq(dut.p.data_i.shift),
# one work item ever in flight at any given time.
# Whenever the unit is busy (not ready) the read and write counters
# will differ by exactly one unit.
- m.d.comb += Assert((read_cnt + ~dut.p.ready_o) & 0xF == write_cnt)
+ m.d.comb += Assert((read_cnt + ~dut.p.o_ready) & 0xF == write_cnt)
# Check for liveness. It will ensure that the FSM is not stuck, and
# will eventually produce some result.
- # In this case, the delay between ready_o being negated and valid_o
+ # In this case, the delay between o_ready being negated and o_valid
# being asserted has to be less than 16 cycles.
- with m.If(~dut.p.ready_o & ~dut.n.valid_o):
+ with m.If(~dut.p.o_ready & ~dut.n.o_valid):
m.d.sync += live_cnt.eq(live_cnt + 1)
with m.Else():
m.d.sync += live_cnt.eq(0)
m.d.comb += Assert(live_cnt < 16)
# check coverage as output data is accepted
- with m.If(dut.n.ready_i & dut.n.valid_o):
+ with m.If(dut.n.i_ready & dut.n.o_valid):
# increment read counter
sync += read_cnt.eq(read_cnt + 1)
# check result
traces = [
'clk',
'p_data_i[7:0]', 'p_shift_i[7:0]', 'op__sdir',
- 'p_valid_i', 'p_ready_o',
+ 'p_i_valid', 'p_o_ready',
'n_data_o[7:0]',
- 'n_valid_o', 'n_ready_i',
+ 'n_o_valid', 'n_i_ready',
('formal', {'module': 'top'}, [
'write_cnt[3:0]', 'read_cnt[3:0]', 'cov[7:0]'
])
--- /dev/null
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet under EU Grant and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Formal proof of soc.experiment.compalu_multi.MultiCompUnit
+
+In short, MultiCompUnit:
+
+1) stores an opcode from Issue, when not "busy", and "issue" is pulsed
+2) signals "busy" high
+3) fetches its operand(s), if any (which are not masked or zero) from the
+Scoreboard (REL/GO protocol)
+4) starts the ALU (ready/valid protocol), as soon as all inputs are available
+5) captures result from ALU (again ready/valid)
+5) sends the result(s) back to the Scoreboard (again REL/GO)
+6) drops "busy"
+
+Note that, if the conditions are right, many of the above can occur together,
+on a single cycle.
+
+The formal proof involves ensuring that:
+1) the ALU gets the right opcode from Issue
+2) the ALU gets the right operands from the Scoreboard
+3) the Scoreboard receives the right result from the ALU
+4) no transactions are dropped or repeated
+
+This can be checked using holding registers and transaction counters.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=879 and
+https://bugs.libre-soc.org/show_bug.cgi?id=197
+"""
+
+import unittest
+
+from nmigen import Signal, Module
+from nmigen.hdl.ast import Cover, Const, Assume, Assert
+from nmutil.formaltest import FHDLTestCase
+from nmutil.singlepipe import ControlBase
+
+from soc.experiment.compalu_multi import MultiCompUnit
+from soc.fu.alu.alu_input_record import CompALUOpSubset
+
+
+# Formal model of a simple ALU, whose inputs and outputs are randomly
+# generated by the formal engine
+
+class ALUCtx:
+ def __init__(self):
+ self.op = CompALUOpSubset(name="op")
+
+
+class ALUInput:
+ def __init__(self):
+ self.a = Signal(16)
+ self.b = Signal(16)
+ self.ctx = ALUCtx()
+
+ def eq(self, i):
+ return [self.a.eq(i.a), self.b.eq(i.b)]
+
+
+class ALUOutput:
+ def __init__(self):
+ self.o1 = Signal(16)
+ self.o2 = Signal(16)
+
+ def eq(self, i):
+ return [self.o1.eq(i.o1), self.o2.eq(i.o2)]
+
+
+class ALU(ControlBase):
+ def __init__(self):
+ super().__init__(stage=self)
+ self.p.i_data, self.n.o_data = self.new_specs(None)
+ self.i, self.o = self.p.i_data, self.n.o_data
+
+ def setup(self, m, i):
+ pass
+
+ def ispec(self, name=None):
+ return ALUInput()
+
+ def ospec(self, name=None):
+ return ALUOutput()
+
+ def elaborate(self, platform):
+ m = super().elaborate(platform)
+ return m
+
+
+class CompALUMultiTestCase(FHDLTestCase):
+ def test_formal(self):
+ inspec = [('INT', 'a', '0:15'),
+ ('INT', 'b', '0:15')]
+ outspec = [('INT', 'o1', '0:15'),
+ ('INT', 'o2', '0:15')]
+ regspec = (inspec, outspec)
+ m = Module()
+ # Instantiate "random" ALU
+ alu = ALU()
+ m.submodules.dut = dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
+ # TODO Test shadow / die
+ m.d.comb += [dut.shadown_i.eq(1), dut.go_die_i.eq(0)]
+ # Don't issue while busy
+ issue = Signal()
+ m.d.comb += dut.issue_i.eq(issue & ~dut.busy_o)
+ # Avoid toggling go_i when rel_o is low (rel / go protocol)
+ rd_go = Signal(dut.n_src)
+ m.d.comb += dut.cu.rd.go_i.eq(rd_go & dut.cu.rd.rel_o)
+ wr_go = Signal(dut.n_dst)
+ m.d.comb += dut.cu.wr.go_i.eq(wr_go & dut.cu.wr.rel_o)
+ # Transaction counters
+ do_issue = Signal()
+ m.d.comb += do_issue.eq(dut.issue_i & ~dut.busy_o)
+ cnt_issue = Signal(4)
+ m.d.sync += cnt_issue.eq(cnt_issue + do_issue)
+ do_read = Signal(dut.n_src)
+ m.d.comb += do_read.eq(dut.cu.rd.rel_o & dut.cu.rd.go_i)
+ cnt_read = []
+ for i in range(dut.n_src):
+ cnt = Signal(4, name="cnt_read_%d" % i)
+ m.d.sync += cnt.eq(cnt + do_read[i])
+ cnt_read.append(cnt)
+ do_write = Signal(dut.n_dst)
+ m.d.comb += do_write.eq(dut.cu.wr.rel_o & dut.cu.wr.go_i)
+ cnt_write = []
+ for i in range(dut.n_dst):
+ cnt = Signal(4, name="cnt_write_%d" % i)
+ m.d.sync += cnt.eq(cnt + do_write[i])
+ cnt_write.append(cnt)
+ do_alu_write = Signal()
+ m.d.comb += do_alu_write.eq(alu.p.i_valid & alu.p.o_ready)
+ cnt_alu_write = Signal(4)
+ m.d.sync += cnt_alu_write.eq(cnt_alu_write + do_alu_write)
+ do_alu_read = Signal()
+ m.d.comb += do_alu_read.eq(alu.n.o_valid & alu.n.i_ready)
+ cnt_alu_read = Signal(4)
+ m.d.sync += cnt_alu_read.eq(cnt_alu_read + do_alu_read)
+ cnt_masked_read = []
+ do_masked_read = Signal(dut.n_src)
+ for i in range(dut.n_src):
+ cnt = Signal(4, name="cnt_masked_read_%d" % i)
+ if i == 0:
+ extra = dut.oper_i.zero_a
+ elif i == 1:
+ extra = dut.oper_i.imm_data.ok
+ else:
+ extra = Const(0, 1)
+ m.d.comb += do_masked_read[i].eq(do_issue &
+ (dut.rdmaskn[i] | extra))
+ m.d.sync += cnt.eq(cnt + do_masked_read[i])
+ cnt_masked_read.append(cnt)
+ # If the ALU is idle, do not assert valid
+ with m.If((cnt_alu_read == cnt_alu_write) & ~do_alu_write):
+ m.d.comb += Assume(~alu.n.o_valid)
+ # Keep ALU valid high, until read
+ last_alu_valid = Signal()
+ m.d.sync += last_alu_valid.eq(alu.n.o_valid & ~alu.n.i_ready)
+ with m.If(last_alu_valid):
+ m.d.comb += Assume(alu.n.o_valid)
+
+ # Invariant checks
+
+ # For every instruction issued, at any point in time,
+ # each operand was either:
+ # 1) Already read
+ # 2) Not read yet, but the read is pending (rel_o high)
+ # 3) Masked
+ for i in range(dut.n_src):
+ sum_read = Signal(4)
+ m.d.comb += sum_read.eq(
+ cnt_read[i] + cnt_masked_read[i] + dut.cu.rd.rel_o[i])
+ m.d.comb += Assert(sum_read == cnt_issue)
+
+ # For every instruction, either:
+ # 1) The ALU is executing the instruction
+ # 2) Otherwise, execution is pending (alu.p.i_valid is high)
+ # 3) Otherwise, it is waiting for operands
+ # (some dut.cu.rd.rel_o are still high)
+ # 4) ... unless all operands are masked, in which case there is a one
+ # cycle delay
+ all_masked = Signal()
+ m.d.sync += all_masked.eq(do_masked_read.all())
+ sum_alu_write = Signal(4)
+ m.d.comb += sum_alu_write.eq(
+ cnt_alu_write +
+ (dut.cu.rd.rel_o.any() | all_masked | alu.p.i_valid))
+ m.d.comb += Assert(sum_alu_write == cnt_issue)
+
+ # Ask the formal engine to give an example
+ m.d.comb += Cover((cnt_issue == 2)
+ & (cnt_read[0] == 1)
+ & (cnt_read[1] == 0)
+ & (cnt_write[0] == 1)
+ & (cnt_write[1] == 1)
+ & (cnt_alu_write == 1)
+ & (cnt_alu_read == 1)
+ & (cnt_masked_read[0] == 1)
+ & (cnt_masked_read[1] == 1))
+ with self.subTest("cover"):
+ self.assertFormal(m, mode="cover", depth=10)
+
+ # Check assertions
+ with self.subTest("bmc"):
+ self.assertFormal(m, mode="bmc", depth=10)
+
+
+if __name__ == "__main__":
+ unittest.main()
write TAG_BITS width which may not match full ram blocks and might
cause muxes to be inferred for "partial writes".
* Check if making the read size of PLRU a ROM helps utilization
+
+Links:
+
+* https://bugs.libre-soc.org/show_bug.cgi?id=485
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+ (discussion about brams for ECP5)
+
"""
from enum import (Enum, unique)
-from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
+from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
+ Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
+from nmigen.lib.coding import Decoder
from nmutil.util import Display
+from nmutil.latch import SRLatch
#from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
from soc.experiment.mem_types import (Fetch1ToICacheType,
ICacheToDecode1Type,
from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
WB_SEL_BITS, WBAddrType, WBDataType,
WBSelType, WBMasterOut, WBSlaveOut,
- WBMasterOutVector, WBSlaveOutVector,
- WBIOMasterOut, WBIOSlaveOut)
+ )
+
+from nmigen_soc.wishbone.bus import Interface
+from soc.minerva.units.fetch import FetchUnitInterface
+
# for test
from soc.bus.sram import SRAM
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle
+# from microwatt/utils.vhdl
+def ispow2(n):
+ return n != 0 and (n & (n - 1)) == 0
SIM = 0
-LINE_SIZE = 64
-# BRAM organisation: We never access more than wishbone_data_bits
-# at a time so to save resources we make the array only that wide,
-# and use consecutive indices for to make a cache "line"
-#
-# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
-ROW_SIZE = WB_DATA_BITS // 8
-# Number of lines in a set
-NUM_LINES = 16
-# Number of ways
-NUM_WAYS = 4
-# L1 ITLB number of entries (direct mapped)
-TLB_SIZE = 64
-# L1 ITLB log_2(page_size)
-TLB_LG_PGSZ = 12
-# Number of real address bits that we store
-REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0
-ROW_SIZE_BITS = ROW_SIZE * 8
-# ROW_PER_LINE is the number of row (wishbone) transactions in a line
-ROW_PER_LINE = LINE_SIZE // ROW_SIZE
-# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
-BRAM_ROWS = NUM_LINES * ROW_PER_LINE
-# INSN_PER_ROW is the number of 32bit instructions per BRAM row
-INSN_PER_ROW = ROW_SIZE_BITS // 32
-
-# Bit fields counts in the address
-#
-# INSN_BITS is the number of bits to select an instruction in a row
-INSN_BITS = log2_int(INSN_PER_ROW)
-# ROW_BITS is the number of bits to select a row
-ROW_BITS = log2_int(BRAM_ROWS)
-# ROW_LINE_BITS is the number of bits to select a row within a line
-ROW_LINE_BITS = log2_int(ROW_PER_LINE)
-# LINE_OFF_BITS is the number of bits for the offset in a cache line
-LINE_OFF_BITS = log2_int(LINE_SIZE)
-# ROW_OFF_BITS is the number of bits for the offset in a row
-ROW_OFF_BITS = log2_int(ROW_SIZE)
-# INDEX_BITS is the number of bits to select a cache line
-INDEX_BITS = log2_int(NUM_LINES)
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
-# TAG_BITS is the number of bits of the tag part of the address
-TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS = log2_int(NUM_WAYS)
-TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
-
-# L1 ITLB
-TLB_BITS = log2_int(TLB_SIZE)
-TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
-TLB_PTE_BITS = 64
-
-print("BRAM_ROWS =", BRAM_ROWS)
-print("INDEX_BITS =", INDEX_BITS)
-print("INSN_BITS =", INSN_BITS)
-print("INSN_PER_ROW =", INSN_PER_ROW)
-print("LINE_SIZE =", LINE_SIZE)
-print("LINE_OFF_BITS =", LINE_OFF_BITS)
-print("LOG_LENGTH =", LOG_LENGTH)
-print("NUM_LINES =", NUM_LINES)
-print("NUM_WAYS =", NUM_WAYS)
-print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
-print("ROW_BITS =", ROW_BITS)
-print("ROW_OFF_BITS =", ROW_OFF_BITS)
-print("ROW_LINE_BITS =", ROW_LINE_BITS)
-print("ROW_PER_LINE =", ROW_PER_LINE)
-print("ROW_SIZE =", ROW_SIZE)
-print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
-print("SET_SIZE_BITS =", SET_SIZE_BITS)
-print("SIM =", SIM)
-print("TAG_BITS =", TAG_BITS)
-print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
-print("TAG_BITS =", TAG_BITS)
-print("TLB_BITS =", TLB_BITS)
-print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
-print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
-print("TLB_PTE_BITS =", TLB_PTE_BITS)
-print("TLB_SIZE =", TLB_SIZE)
-print("WAY_BITS =", WAY_BITS)
-
-# from microwatt/utils.vhdl
-def ispow2(n):
- return n != 0 and (n & (n - 1)) == 0
-
-assert LINE_SIZE % ROW_SIZE == 0
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
-assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
- "geometry bits don't add up"
-assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
- "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
- "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
- "geometry bits don't add up"
-
-# Example of layout for 32 lines of 64 bytes:
-#
-# .. tag |index| line |
-# .. | row | |
-# .. | | | |00| zero (2)
-# .. | | |-| | INSN_BITS (1)
-# .. | |---| | ROW_LINE_BITS (3)
-# .. | |--- - --| LINE_OFF_BITS (6)
-# .. | |- --| ROW_OFF_BITS (3)
-# .. |----- ---| | ROW_BITS (8)
-# .. |-----| | INDEX_BITS (5)
-# .. --------| | TAG_BITS (53)
-
-# The cache data BRAM organized as described above for each way
-#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
-#
-# The cache tags LUTRAM has a row per set. Vivado is a pain and will
-# not handle a clean (commented) definition of the cache tags as a 3d
-# memory. For now, work around it by putting all the tags
-def CacheTagArray():
- return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
- for x in range(NUM_LINES))
-
-# The cache valid bits
-def CacheValidBitsArray():
- return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
- for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
- return Array(Signal(name="rows_valid_%d" %x) \
- for x in range(ROW_PER_LINE))
-
-
-# TODO to be passed to nigmen as ram attributes
-# attribute ram_style : string;
-# attribute ram_style of cache_tags : signal is "distributed";
-
-
-def TLBValidBitsArray():
- return Array(Signal(name="tlbvalid_%d" %x) \
- for x in range(TLB_SIZE))
-
-def TLBTagArray():
- return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
- for x in range(TLB_SIZE))
-
-def TLBPtesArray():
- return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
- for x in range(TLB_SIZE))
-
-# Cache RAM interface
-def CacheRamOut():
- return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
- for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
- return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
- for x in range(NUM_LINES))
-
-# Return the cache line index (tag index) for an address
-def get_index(addr):
- return addr[LINE_OFF_BITS:SET_SIZE_BITS]
-
-# Return the cache row index (data memory) for an address
-def get_row(addr):
- return addr[ROW_OFF_BITS:SET_SIZE_BITS]
-
-# Return the index of a row within a line
-def get_row_of_line(row):
- return row[:ROW_LINE_BITS]
-
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
- return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
-
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
- return get_row_of_line(row) == last
-
-# Return the next row in the current cache line. We use a dedicated
-# function in order to limit the size of the generated adder to be
-# only the bits within a cache line (3 bits with default settings)
-def next_row(row):
- row_v = row[0:ROW_LINE_BITS] + 1
- return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
-
-# Read the instruction word for the given address
-# in the current cache row
-def read_insn_word(addr, data):
- word = addr[2:INSN_BITS+2]
- return data.word_select(word, 32)
-
-# Get the tag value from the address
-def get_tag(addr):
- return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
-
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
- return tagset.word_select(way, TAG_BITS)
-
-# Write a tag to tag memory row
-def write_tag(way, tagset, tag):
- return read_tag(way, tagset).eq(tag)
-
-# Simple hash for direct-mapped TLB index
-def hash_ea(addr):
- hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
- TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
- ] ^ addr[
- TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
- ]
- return hsh
+class ICacheConfig:
+ def __init__(self, XLEN = 64,
+ LINE_SIZE = 64,
+ NUM_LINES = 64, # Number of lines in a set
+ NUM_WAYS = 2, # Number of ways
+ TLB_SIZE = 64, # L1 ITLB number of entries
+ TLB_LG_PGSZ = 12): # L1 ITLB log_2(page_size)
+ self.XLEN = XLEN
+ self.LINE_SIZE = LINE_SIZE
+ self.NUM_LINES = NUM_LINES
+ self.NUM_WAYS = NUM_WAYS
+ self.TLB_SIZE = TLB_SIZE
+ self.TLB_LG_PGSZ = TLB_LG_PGSZ
+
+ # BRAM organisation: We never access more than wishbone_data_bits
+ # at a time so to save resources we make the array only that wide,
+ # and use consecutive indices for to make a cache "line"
+ #
+ # self.ROW_SIZE is the width in bytes of the BRAM
+ # (based on WB, so 64-bits)
+ self.ROW_SIZE = WB_DATA_BITS // 8
+ # Number of real address bits that we store
+ self.REAL_ADDR_BITS = XLEN-8 # 56 for XLEN=64
+
+ self.ROW_SIZE_BITS = self.ROW_SIZE * 8
+ # ROW_PER_LINE is the number of row (wishbone) transactions in a line
+ self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
+ # BRAM_ROWS is the number of rows in BRAM
+ # needed to represent the full icache
+ self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
+ # INSN_PER_ROW is the number of 32bit instructions per BRAM row
+ self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32
+
+ # Bit fields counts in the address
+ #
+ # INSN_BITS is the number of bits to select an instruction in a row
+ self.INSN_BITS = log2_int(self.INSN_PER_ROW)
+ # ROW_BITS is the number of bits to select a row
+ self.ROW_BITS = log2_int(self.BRAM_ROWS)
+ # ROW_LINE_BITS is the number of bits to select a row within a line
+ self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
+ # LINE_OFF_BITS is the number of bits for the offset in a cache line
+ self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
+ # ROW_OFF_BITS is the number of bits for the offset in a row
+ self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
+ # INDEX_BITS is the number of bits to select a cache line
+ self.INDEX_BITS = log2_int(self.NUM_LINES)
+ # SET_SIZE_BITS is the log base 2 of the set size
+ self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
+ # TAG_BITS is the number of bits of the tag part of the address
+ self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+ # TAG_WIDTH is the width in bits of each way of the tag RAM
+ self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+ # WAY_BITS is the number of bits to select a way
+ self.WAY_BITS = log2_int(self.NUM_WAYS)
+ self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS
+
+ # L1 ITLB
+ self.TL_BITS = log2_int(self.TLB_SIZE)
+ self.TLB_EA_TAG_BITS = XLEN - (self.TLB_LG_PGSZ + self.TL_BITS)
+ self.TLB_PTE_BITS = XLEN
+
+ print("self.XLEN =", self.XLEN)
+ print("self.BRAM_ROWS =", self.BRAM_ROWS)
+ print("self.INDEX_BITS =", self.INDEX_BITS)
+ print("self.INSN_BITS =", self.INSN_BITS)
+ print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
+ print("self.LINE_SIZE =", self.LINE_SIZE)
+ print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
+ print("LOG_LENGTH =", LOG_LENGTH)
+ print("self.NUM_LINES =", self.NUM_LINES)
+ print("self.NUM_WAYS =", self.NUM_WAYS)
+ print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
+ print("self.ROW_BITS =", self.ROW_BITS)
+ print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
+ print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
+ print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
+ print("self.ROW_SIZE =", self.ROW_SIZE)
+ print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
+ print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
+ print("SIM =", SIM)
+ print("self.TAG_BITS =", self.TAG_BITS)
+ print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
+ print("self.TAG_BITS =", self.TAG_BITS)
+ print("self.TL_BITS =", self.TL_BITS)
+ print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
+ print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
+ print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
+ print("self.TLB_SIZE =", self.TLB_SIZE)
+ print("self.WAY_BITS =", self.WAY_BITS)
+ print()
+
+ assert self.LINE_SIZE % self.ROW_SIZE == 0
+ assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
+ assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
+ assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
+ assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
+ assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
+ "geometry bits don't add up"
+ assert (self.LINE_OFF_BITS ==
+ (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
+ "geometry bits don't add up"
+ assert (self.REAL_ADDR_BITS ==
+ (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
+ "geometry bits don't add up"
+ assert (self.REAL_ADDR_BITS ==
+ (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
+ "geometry bits don't add up"
+
+ # Example of layout for 32 lines of 64 bytes:
+ #
+ # .. tag |index| line |
+ # .. | row | |
+ # .. | | | |00| zero (2)
+ # .. | | |-| | self.INSN_BITS (1)
+ # .. | |---| | self.ROW_LINE_BITS (3)
+ # .. | |--- - --| self.LINE_OFF_BITS (6)
+ # .. | |- --| self.ROW_OFF_BITS (3)
+ # .. |----- ---| | self.ROW_BITS (8)
+ # .. |-----| | self.INDEX_BITS (5)
+ # .. --------| | self.TAG_BITS (53)
+
+ # The cache data BRAM organized as described above for each way
+ #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
+ #
+ def RowPerLineValidArray(self):
+ return Array(Signal(name="rows_valid_%d" %x) \
+ for x in range(self.ROW_PER_LINE))
+
+
+ # TODO to be passed to nigmen as ram attributes
+ # attribute ram_style : string;
+ # attribute ram_style of cache_tags : signal is "distributed";
+
+ def TLBRecord(self, name):
+ tlb_layout = [ ('tag', self.TLB_EA_TAG_BITS),
+ ('pte', self.TLB_PTE_BITS)
+ ]
+ return Record(tlb_layout, name=name)
+
+ def TLBArray(self):
+ return Array(self.TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))
+
+ # PLRU output interface
+ def PLRUOut(self):
+ return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
+ for x in range(self.NUM_LINES))
+
+ # Return the cache line index (tag index) for an address
+ def get_index(self, addr):
+ return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+ # Return the cache row index (data memory) for an address
+ def get_row(self, addr):
+ return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
+
+ # Return the index of a row within a line
+ def get_row_of_line(self, row):
+ return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
+
+ # Returns whether this is the last row of a line
+ def is_last_row_addr(self, addr, last):
+ return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
+
+ # Returns whether this is the last row of a line
+ def is_last_row(self, row, last):
+ return self.get_row_of_line(row) == last
+
+ # Return the next row in the current cache line. We use a dedicated
+ # function in order to limit the size of the generated adder to be
+ # only the bits within a cache line (3 bits with default settings)
+ def next_row(self, row):
+ row_v = row[0:self.ROW_LINE_BITS] + 1
+ return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
+
+ # Read the instruction word for the given address
+ # in the current cache row
+ def read_insn_word(self, addr, data):
+ word = addr[2:self.INSN_BITS+2]
+ return data.word_select(word, 32)
+
+ # Get the tag value from the address
+ def get_tag(self, addr):
+ return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
+
+ # Read a tag from a tag memory row
+ def read_tag(self, way, tagset):
+ return tagset.word_select(way, self.TAG_BITS)
+
+ # Write a tag to tag memory row
+ def write_tag(self, way, tagset, tag):
+ return self.read_tag(way, tagset).eq(tag)
+
+ # Simple hash for direct-mapped TLB index
+ def hash_ea(self, addr):
+ hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
+ addr[self.TLB_LG_PGSZ + self.TL_BITS:
+ self.TLB_LG_PGSZ + 2 * self.TL_BITS ] ^
+ addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
+ self.TLB_LG_PGSZ + 3 * self.TL_BITS])
+ return hsh
# Cache reload state machine
class RegInternal(RecordObject):
- def __init__(self):
+ def __init__(self, cfg):
super().__init__()
# Cache hit state (Latches for 1 cycle BRAM access)
- self.hit_way = Signal(NUM_WAYS)
+ self.hit_way = Signal(cfg.WAY_BITS)
self.hit_nia = Signal(64)
self.hit_smark = Signal()
self.hit_valid = Signal()
self.state = Signal(State, reset=State.IDLE)
self.wb = WBMasterOut("wb")
self.req_adr = Signal(64)
- self.store_way = Signal(NUM_WAYS)
- self.store_index = Signal(NUM_LINES)
- self.store_row = Signal(BRAM_ROWS)
- self.store_tag = Signal(TAG_BITS)
+ self.store_way = Signal(cfg.WAY_BITS)
+ self.store_index = Signal(cfg.INDEX_BITS)
+ self.store_row = Signal(cfg.ROW_BITS)
+ self.store_tag = Signal(cfg.TAG_BITS)
self.store_valid = Signal()
- self.end_row_ix = Signal(ROW_LINE_BITS)
- self.rows_valid = RowPerLineValidArray()
+ self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
+ self.rows_valid = cfg.RowPerLineValidArray()
# TLB miss state
self.fetch_failed = Signal()
-class ICache(Elaboratable):
+class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
"""64 bit direct mapped icache. All instructions are 4B aligned."""
- def __init__(self):
+ def __init__(self, pspec):
+ FetchUnitInterface.__init__(self, pspec)
self.i_in = Fetch1ToICacheType(name="i_in")
self.i_out = ICacheToDecode1Type(name="i_out")
self.flush_in = Signal()
self.inval_in = Signal()
- self.wb_out = WBMasterOut(name="wb_out")
- self.wb_in = WBSlaveOut(name="wb_in")
+ # standard naming (wired to non-standard for compatibility)
+ self.bus = Interface(addr_width=32,
+ data_width=64,
+ granularity=8,
+ features={'stall'},
+ #alignment=0,
+ name="icache_wb")
self.log_out = Signal(54)
+ # use FetchUnitInterface, helps keep some unit tests running
+ self.use_fetch_iface = False
+
+ # test if small cache to be enabled
+ self.small_cache = (hasattr(pspec, "small_cache") and
+ (pspec.small_cache == True))
+ # test if microwatt compatibility to be enabled
+ self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+ (pspec.microwatt_compat == True))
+ # test if fabric compatibility is to be enabled
+ self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+ (pspec.fabric_compat == True))
+
+ XLEN = pspec.XLEN
+ LINE_SIZE = 64
+ TLB_SIZE = 8
+ NUM_LINES = 8
+ NUM_WAYS = 2
+ if self.small_cache:
+ # reduce way sizes and num lines to ridiculously small
+ NUM_LINES = 2
+ NUM_WAYS = 1
+ TLB_SIZE = 2
+ if self.microwatt_compat or self.fabric_compat:
+ # reduce way sizes
+ NUM_WAYS = 1
+
+ ICacheConfig.__init__(self, LINE_SIZE=LINE_SIZE,
+ XLEN=XLEN,
+ NUM_LINES = NUM_LINES,
+ NUM_WAYS = NUM_WAYS,
+ TLB_SIZE=TLB_SIZE
+ )
+
+ def use_fetch_interface(self):
+ self.use_fetch_iface = True
# Generate a cache RAM for each way
def rams(self, m, r, cache_out_row, use_previous,
comb = m.d.comb
sync = m.d.sync
- wb_in, stall_in = self.wb_in, self.stall_in
+ bus, stall_in = self.bus, self.stall_in
+
+ # read condition (for every cache ram)
+ do_read = Signal()
+ comb += do_read.eq(~(stall_in | use_previous))
+
+ rd_addr = Signal(self.ROW_BITS)
+ wr_addr = Signal(self.ROW_BITS)
+ comb += rd_addr.eq(req_row)
+ comb += wr_addr.eq(r.store_row)
- for i in range(NUM_WAYS):
- do_read = Signal(name="do_rd_%d" % i)
+ # binary-to-unary converters: replace-way enabled by bus.ack,
+ # hit-way left permanently enabled
+ m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
+ m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
+ comb += re.i.eq(replace_way)
+ comb += re.n.eq(~bus.ack)
+ comb += he.i.eq(r.hit_way)
+
+ for i in range(self.NUM_WAYS):
do_write = Signal(name="do_wr_%d" % i)
- rd_addr = Signal(ROW_BITS)
- wr_addr = Signal(ROW_BITS)
- d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
- wr_sel = Signal(ROW_SIZE)
+ d_out = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
+ wr_sel = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)
- way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
- setattr(m.submodules, "cacheram_%d" % i, way)
+ way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
+ TRACE=True, ram_num=i)
+ m.submodules["cacheram_%d" % i] = way
comb += way.rd_en.eq(do_read)
comb += way.rd_addr.eq(rd_addr)
comb += d_out.eq(way.rd_data_o)
comb += way.wr_sel.eq(wr_sel)
comb += way.wr_addr.eq(wr_addr)
- comb += way.wr_data.eq(wb_in.dat)
+ comb += way.wr_data.eq(bus.dat_r)
- comb += do_read.eq(~(stall_in | use_previous))
- comb += do_write.eq(wb_in.ack & (replace_way == i))
+ comb += do_write.eq(re.o[i])
with m.If(do_write):
sync += Display("cache write adr: %x data: %lx",
wr_addr, way.wr_data)
- with m.If(r.hit_way == i):
+ with m.If(he.o[i]):
comb += cache_out_row.eq(d_out)
with m.If(do_read):
sync += Display("cache read adr: %x data: %x",
req_row, d_out)
- comb += rd_addr.eq(req_row)
- comb += wr_addr.eq(r.store_row)
- comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
+ comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))
# Generate PLRUs
def maybe_plrus(self, m, r, plru_victim):
comb = m.d.comb
- with m.If(NUM_WAYS > 1):
- for i in range(NUM_LINES):
- plru_acc_i = Signal(WAY_BITS)
- plru_acc_en = Signal()
- plru = PLRU(WAY_BITS)
- setattr(m.submodules, "plru_%d" % i, plru)
-
- comb += plru.acc_i.eq(plru_acc_i)
- comb += plru.acc_en.eq(plru_acc_en)
+ if self.NUM_WAYS == 0:
+ return
- # PLRU interface
- with m.If(get_index(r.hit_nia) == i):
- comb += plru.acc_en.eq(r.hit_valid)
- comb += plru.acc_i.eq(r.hit_way)
- comb += plru_victim[i].eq(plru.lru_o)
+ m.submodules.plrus = plru = PLRUs("itag", self.NUM_LINES,
+ self.WAY_BITS)
+ comb += plru.way.eq(r.hit_way)
+ comb += plru.valid.eq(r.hit_valid)
+ comb += plru.index.eq(self.get_index(r.hit_nia))
+ comb += plru.isel.eq(r.store_index) # select victim
+ comb += plru_victim.eq(plru.o_index) # selected victim
# TLB hit detection and real address generation
- def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
- real_addr, itlb_valid_bits, ra_valid, eaa_priv,
+ def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
+ real_addr, ra_valid, eaa_priv,
priv_fault, access_ok):
comb = m.d.comb
i_in = self.i_in
- pte = Signal(TLB_PTE_BITS)
- ttag = Signal(TLB_EA_TAG_BITS)
+ # use an *asynchronous* Memory read port here (combinatorial)
+ m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
+ tlb = self.TLBRecord("tlb_rdport")
+ pte, ttag = tlb.pte, tlb.tag
- comb += tlb_req_index.eq(hash_ea(i_in.nia))
- comb += pte.eq(itlb_ptes[tlb_req_index])
- comb += ttag.eq(itlb_tags[tlb_req_index])
+ comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
+ comb += rd_tlb.addr.eq(tlb_req_index)
+ comb += tlb.eq(rd_tlb.data)
with m.If(i_in.virt_mode):
- comb += real_addr.eq(Cat(
- i_in.nia[:TLB_LG_PGSZ],
- pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
- ))
+ comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
+ pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
- with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
- comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
+ with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
+ comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))
comb += eaa_priv.eq(pte[3])
with m.Else():
- comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
+ comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
comb += ra_valid.eq(1)
comb += eaa_priv.eq(1)
comb += access_ok.eq(ra_valid & ~priv_fault)
# iTLB update
- def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
+ def itlb_update(self, m, itlb, itlb_valid):
comb = m.d.comb
sync = m.d.sync
m_in = self.m_in
- wr_index = Signal(TLB_SIZE)
- comb += wr_index.eq(hash_ea(m_in.addr))
+ wr_index = Signal(self.TL_BITS)
+ wr_unary = Signal(self.TLB_SIZE)
+ comb += wr_index.eq(self.hash_ea(m_in.addr))
+ comb += wr_unary.eq(1<<wr_index)
+
+ m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
+ sync += itlb_valid.s.eq(0)
+ sync += itlb_valid.r.eq(0)
with m.If(m_in.tlbie & m_in.doall):
# Clear all valid bits
- for i in range(TLB_SIZE):
- sync += itlb_valid_bits[i].eq(0)
+ sync += itlb_valid.r.eq(-1)
with m.Elif(m_in.tlbie):
# Clear entry regardless of hit or miss
- sync += itlb_valid_bits[wr_index].eq(0)
+ sync += itlb_valid.r.eq(wr_unary)
with m.Elif(m_in.tlbld):
- sync += itlb_tags[wr_index].eq(
- m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
- )
- sync += itlb_ptes[wr_index].eq(m_in.pte)
- sync += itlb_valid_bits[wr_index].eq(1)
+ tlb = self.TLBRecord("tlb_wrport")
+ comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
+ comb += tlb.pte.eq(m_in.pte)
+ comb += wr_tlb.en.eq(1)
+ comb += wr_tlb.addr.eq(wr_index)
+ comb += wr_tlb.data.eq(tlb)
+ sync += itlb_valid.s.eq(wr_unary)
# Cache hit detection, output to fetch2 and other misc logic
def icache_comb(self, m, use_previous, r, req_index, req_row,
req_hit_way, req_tag, real_addr, req_laddr,
- cache_valid_bits, cache_tags, access_ok,
+ cache_valids, access_ok,
req_is_hit, req_is_miss, replace_way,
plru_victim, cache_out_row):
comb = m.d.comb
+ m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")
- i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
+ i_in, i_out, bus = self.i_in, self.i_out, self.bus
flush_in, stall_out = self.flush_in, self.stall_out
is_hit = Signal()
- hit_way = Signal(NUM_WAYS)
+ hit_way = Signal(self.WAY_BITS)
# i_in.sequential means that i_in.nia this cycle is 4 more than
# last cycle. If we read more than 32 bits at a time, had a
# cache hit last cycle, and we don't want the first 32-bit chunk
# then we can keep the data we read last cycle and just use that.
- with m.If(i_in.nia[2:INSN_BITS+2] != 0):
+ with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
comb += use_previous.eq(i_in.sequential & r.hit_valid)
# Extract line, row and tag from request
- comb += req_index.eq(get_index(i_in.nia))
- comb += req_row.eq(get_row(i_in.nia))
- comb += req_tag.eq(get_tag(real_addr))
+ comb += req_index.eq(self.get_index(i_in.nia))
+ comb += req_row.eq(self.get_row(i_in.nia))
+ comb += req_tag.eq(self.get_tag(real_addr))
# Calculate address of beginning of cache row, will be
# used for cache miss processing if needed
comb += req_laddr.eq(Cat(
- Const(0, ROW_OFF_BITS),
- real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
+ Const(0, self.ROW_OFF_BITS),
+ real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
))
# Test if pending request is a hit on any way
hitcond = Signal()
- comb += hitcond.eq((r.state == State.WAIT_ACK)
- & (req_index == r.store_index)
- & r.rows_valid[req_row % ROW_PER_LINE]
+ rowvalid = Signal()
+ comb += rowvalid.eq(r.rows_valid[req_row % self.ROW_PER_LINE])
+ comb += hitcond.eq((r.state == State.WAIT_ACK) &
+ (req_index == r.store_index) &
+ rowvalid
)
- with m.If(i_in.req):
- cvb = Signal(NUM_WAYS)
- ctag = Signal(TAG_RAM_WIDTH)
- comb += ctag.eq(cache_tags[req_index])
- comb += cvb.eq(cache_valid_bits[req_index])
- for i in range(NUM_WAYS):
- tagi = Signal(TAG_BITS, name="tag_i%d" % i)
- comb += tagi.eq(read_tag(i, ctag))
- hit_test = Signal(name="hit_test%d" % i)
- comb += hit_test.eq(i == r.store_way)
- with m.If((cvb[i] | (hitcond & hit_test))
- & (tagi == req_tag)):
- comb += hit_way.eq(i)
- comb += is_hit.eq(1)
+ # i_in.req asserts Decoder active
+ cvb = Signal(self.NUM_WAYS)
+ ctag = Signal(self.TAG_RAM_WIDTH)
+ comb += rd_tag.addr.eq(req_index)
+ comb += ctag.eq(rd_tag.data)
+ comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
+ m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
+ comb += se.i.eq(r.store_way)
+ comb += se.n.eq(~i_in.req)
+ for i in range(self.NUM_WAYS):
+ tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
+ hit_test = Signal(name="hit_test%d" % i)
+ is_tag_hit = Signal(name="is_tag_hit_%d" % i)
+ comb += tagi.eq(self.read_tag(i, ctag))
+ comb += hit_test.eq(se.o[i])
+ comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
+ (tagi == req_tag))
+ with m.If(is_tag_hit):
+ comb += hit_way.eq(i)
+ comb += is_hit.eq(1)
# Generate the "hit" and "miss" signals
# for the synchronous blocks
comb += req_is_hit.eq(is_hit)
comb += req_is_miss.eq(~is_hit)
- with m.Else():
- comb += req_is_hit.eq(0)
- comb += req_is_miss.eq(0)
-
comb += req_hit_way.eq(hit_way)
# The way to replace on a miss
with m.If(r.state == State.CLR_TAG):
- comb += replace_way.eq(plru_victim[r.store_index])
+ comb += replace_way.eq(plru_victim)
with m.Else():
comb += replace_way.eq(r.store_way)
# be output an entire row which I prefer not to do just yet
# as it would force fetch2 to know about some of the cache
# geometry information.
- comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
+ comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
comb += i_out.valid.eq(r.hit_valid)
comb += i_out.nia.eq(r.hit_nia)
comb += i_out.stop_mark.eq(r.hit_smark)
comb += stall_out.eq(~(is_hit & access_ok))
# Wishbone requests output (from the cache miss reload machine)
- comb += wb_out.eq(r.wb)
+ comb += bus.we.eq(r.wb.we)
+ comb += bus.adr.eq(r.wb.adr)
+ comb += bus.sel.eq(r.wb.sel)
+ comb += bus.stb.eq(r.wb.stb)
+ comb += bus.dat_w.eq(r.wb.dat)
+ comb += bus.cyc.eq(r.wb.cyc)
# Cache hit synchronous machine
def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
with m.If(req_is_hit):
sync += r.hit_way.eq(req_hit_way)
- sync += Display(
- "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " \
- "way:%x RA:%x", i_in.nia, i_in.virt_mode, \
- i_in.stop_mark, req_index, req_tag, \
- req_hit_way, real_addr
- )
-
-
+ sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
+ "way:%x RA:%x", i_in.nia, i_in.virt_mode,
+ i_in.stop_mark, req_index, req_tag,
+ req_hit_way, real_addr)
with m.If(~stall_in):
# Send stop marks and NIA down regardless of validity
i_in = self.i_in
# Reset per-row valid flags, only used in WAIT_ACK
- for i in range(ROW_PER_LINE):
+ for i in range(self.ROW_PER_LINE):
sync += r.rows_valid[i].eq(0)
# We need to read a cache line
"cache miss nia:%x IR:%x SM:%x idx:%x "
" way:%x tag:%x RA:%x", i_in.nia,
i_in.virt_mode, i_in.stop_mark, req_index,
- replace_way, req_tag, real_addr
- )
+ replace_way, req_tag, real_addr)
# Keep track of our index and way for subsequent stores
- st_row = Signal(BRAM_ROWS)
- comb += st_row.eq(get_row(req_laddr))
+ st_row = Signal(self.ROW_BITS)
+ comb += st_row.eq(self.get_row(req_laddr))
sync += r.store_index.eq(req_index)
sync += r.store_row.eq(st_row)
sync += r.store_tag.eq(req_tag)
sync += r.store_valid.eq(1)
- sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
+ sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)
# Prep for first wishbone read. We calculate the address
# of the start of the cache line and start the WB cycle.
sync += r.state.eq(State.CLR_TAG)
def icache_miss_clr_tag(self, m, r, replace_way,
- cache_valid_bits, req_index,
- tagset, cache_tags):
-
+ req_index,
+ cache_valids):
comb = m.d.comb
sync = m.d.sync
+ m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+ granularity=self.TAG_BITS)
# Get victim way from plru
sync += r.store_way.eq(replace_way)
+
# Force misses on that way while reloading that line
- cv = Signal(INDEX_BITS)
- comb += cv.eq(cache_valid_bits[req_index])
- comb += cv.bit_select(replace_way, 1).eq(0)
- sync += cache_valid_bits[req_index].eq(cv)
+ idx = req_index*self.NUM_WAYS + replace_way # 2D index, 1st dim: self.NUM_WAYS
+ comb += cache_valids.r.eq(1<<idx)
- for i in range(NUM_WAYS):
- with m.If(i == replace_way):
- comb += tagset.eq(cache_tags[r.store_index])
- comb += write_tag(i, tagset, r.store_tag)
- sync += cache_tags[r.store_index].eq(tagset)
+ # use write-port "granularity" to select the tag to write to
+ # TODO: the Memory should be multipled-up (by NUM_TAGS)
+ tagset = Signal(self.TAG_RAM_WIDTH)
+ comb += tagset.eq(r.store_tag << (replace_way*self.TAG_BITS))
+ comb += wr_tag.en.eq(1<<replace_way)
+ comb += wr_tag.addr.eq(r.store_index)
+ comb += wr_tag.data.eq(tagset)
sync += r.state.eq(State.WAIT_ACK)
def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
- stbs_done, cache_valid_bits):
+ cache_valids):
comb = m.d.comb
sync = m.d.sync
- wb_in = self.wb_in
-
- # Requests are all sent if stb is 0
- stbs_zero = Signal()
- comb += stbs_zero.eq(r.wb.stb == 0)
- comb += stbs_done.eq(stbs_zero)
+ bus = self.bus
# If we are still sending requests, was one accepted?
- with m.If(~wb_in.stall & ~stbs_zero):
- # That was the last word? We are done sending.
- # Clear stb and set stbs_done so we can handle
- # an eventual last ack on the same cycle.
- with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
- sync += Display(
- "IS_LAST_ROW_ADDR r.wb.addr:%x " \
- "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x " \
- "stbs_done:%x", r.wb.adr, r.end_row_ix,
- r.wb.stb, stbs_zero, stbs_done
- )
+ with m.If(~bus.stall & r.wb.stb):
+ # That was the last word? We are done sending. Clear stb
+ with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
+ sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
+ "r.end_row_ix:%x r.wb.stb:%x",
+ r.wb.adr, r.end_row_ix, r.wb.stb)
sync += r.wb.stb.eq(0)
- comb += stbs_done.eq(1)
# Calculate the next row address
- rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
- comb += rarange.eq(
- r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
- )
- sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
- rarange
- )
+ rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
+ comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
+ self.LINE_OFF_BITS] + 1)
+ sync += r.req_adr[self.ROW_OFF_BITS:self.LINE_OFF_BITS].eq(rarange)
sync += Display("RARANGE r.req_adr:%x rarange:%x "
- "stbs_zero:%x stbs_done:%x",
- r.req_adr, rarange, stbs_zero, stbs_done)
+ "r.wb.stb:%x",
+ r.req_adr, rarange, r.wb.stb)
# Incoming acks processing
- with m.If(wb_in.ack):
- sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
- "stbs_done:%x",
- wb_in.dat, stbs_zero, stbs_done)
+ with m.If(bus.ack):
+ sync += Display("WB_IN_ACK data:%x", bus.dat_r)
- sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
+ sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)
# Check for completion
- with m.If(stbs_done &
- is_last_row(r.store_row, r.end_row_ix)):
+ with m.If(self.is_last_row(r.store_row, r.end_row_ix)):
# Complete wishbone cycle
sync += r.wb.cyc.eq(0)
# be nice, clear addr
sync += r.req_adr.eq(0)
# Cache line is now valid
- cv = Signal(INDEX_BITS)
- comb += cv.eq(cache_valid_bits[r.store_index])
- comb += cv.bit_select(replace_way, 1).eq(
- r.store_valid & ~inval_in
- )
- sync += cache_valid_bits[r.store_index].eq(cv)
-
+ idx = r.store_index*self.NUM_WAYS + replace_way # 2D index again
+ valid = r.store_valid & ~inval_in
+ comb += cache_valids.s.eq(1<<idx)
sync += r.state.eq(State.IDLE)
- # not completed, move on to next request in row
- with m.Else():
- # Increment store row counter
- sync += r.store_row.eq(next_row(r.store_row))
-
+ # move on to next request in row
+ # Increment store row counter
+ sync += r.store_row.eq(self.next_row(r.store_row))
# Cache miss/reload synchronous machine
- def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
+ def icache_miss(self, m, r, req_is_miss,
req_index, req_laddr, req_tag, replace_way,
- cache_tags, access_ok, real_addr):
+ cache_valids, access_ok, real_addr):
comb = m.d.comb
sync = m.d.sync
- i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
+ i_in, bus, m_in = self.i_in, self.bus, self.m_in
stall_in, flush_in = self.stall_in, self.flush_in
inval_in = self.inval_in
- tagset = Signal(TAG_RAM_WIDTH)
- stbs_done = Signal()
-
comb += r.wb.sel.eq(-1)
comb += r.wb.adr.eq(r.req_adr[3:])
# Process cache invalidations
with m.If(inval_in):
- for i in range(NUM_LINES):
- sync += cache_valid_bits[i].eq(0)
+ comb += cache_valids.r.eq(-1)
sync += r.store_valid.eq(0)
# Main state machine
with m.Switch(r.state):
with m.Case(State.IDLE):
- self.icache_miss_idle(
- m, r, req_is_miss, req_laddr,
- req_index, req_tag, replace_way,
- real_addr
- )
+ self.icache_miss_idle(m, r, req_is_miss, req_laddr,
+ req_index, req_tag, replace_way,
+ real_addr)
with m.Case(State.CLR_TAG, State.WAIT_ACK):
with m.If(r.state == State.CLR_TAG):
- self.icache_miss_clr_tag(
- m, r, replace_way,
- cache_valid_bits, req_index,
- tagset, cache_tags
- )
-
- self.icache_miss_wait_ack(
- m, r, replace_way, inval_in,
- stbs_done, cache_valid_bits
- )
+ self.icache_miss_clr_tag(m, r, replace_way,
+ req_index,
+ cache_valids)
+
+ self.icache_miss_wait_ack(m, r, replace_way, inval_in,
+ cache_valids)
# TLB miss and protection fault processing
with m.If(flush_in | m_in.tlbld):
comb = m.d.comb
sync = m.d.sync
- wb_in, i_out = self.wb_in, self.i_out
+ bus, i_out = self.bus, self.i_out
log_out, stall_out = self.log_out, self.stall_out
# Output data to logger
for i in range(LOG_LENGTH):
log_data = Signal(54)
- lway = Signal(NUM_WAYS)
+ lway = Signal(self.WAY_BITS)
wstate = Signal()
sync += lway.eq(req_hit_way)
sync += log_data.eq(Cat(
ra_valid, access_ok, req_is_miss, req_is_hit,
lway, wstate, r.hit_nia[2:6], r.fetch_failed,
- stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
- r.real_addr[3:6], wb_in.ack, i_out.insn, i_out.valid
+ stall_out, bus.stall, r.wb.cyc, r.wb.stb,
+ r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
))
comb += log_out.eq(log_data)
m = Module()
comb = m.d.comb
- # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
- cache_tags = CacheTagArray()
- cache_valid_bits = CacheValidBitsArray()
+ # Cache-Ways "valid" indicators. this is a 2D Signal, by the
+ # number of ways and the number of lines.
+ vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINES,
+ name="cachevalids")
+ m.submodules.cache_valids = cache_valids = vec
+
+ # TLB Array
+ itlb = self.TLBArray()
+ vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
+ m.submodules.itlb_valids = itlb_valid = vec
- itlb_valid_bits = TLBValidBitsArray()
- itlb_tags = TLBTagArray()
- itlb_ptes = TLBPtesArray()
# TODO to be passed to nmigen as ram attributes
# attribute ram_style of itlb_tags : signal is "distributed";
# attribute ram_style of itlb_ptes : signal is "distributed";
# Privilege bit from PTE EAA field
eaa_priv = Signal()
- r = RegInternal()
+ r = RegInternal(self)
# Async signal on incoming request
- req_index = Signal(NUM_LINES)
- req_row = Signal(BRAM_ROWS)
- req_hit_way = Signal(NUM_WAYS)
- req_tag = Signal(TAG_BITS)
+ req_index = Signal(self.INDEX_BITS)
+ req_row = Signal(self.ROW_BITS)
+ req_hit_way = Signal(self.WAY_BITS)
+ req_tag = Signal(self.TAG_BITS)
req_is_hit = Signal()
req_is_miss = Signal()
req_laddr = Signal(64)
- tlb_req_index = Signal(TLB_SIZE)
- real_addr = Signal(REAL_ADDR_BITS)
+ tlb_req_index = Signal(self.TL_BITS)
+ real_addr = Signal(self.REAL_ADDR_BITS)
ra_valid = Signal()
priv_fault = Signal()
access_ok = Signal()
use_previous = Signal()
- cache_out_row = Signal(ROW_SIZE_BITS)
+ cache_out_row = Signal(self.ROW_SIZE_BITS)
+
+ plru_victim = Signal(self.WAY_BITS)
+ replace_way = Signal(self.WAY_BITS)
- plru_victim = PLRUOut()
- replace_way = Signal(NUM_WAYS)
+ self.tlbmem = Memory(depth=self.TLB_SIZE,
+ width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS,
+ #attrs={'syn_ramstyle': "block_ram"}
+ )
+ self.tagmem = Memory(depth=self.NUM_LINES,
+ width=self.TAG_RAM_WIDTH,
+ #attrs={'syn_ramstyle': "block_ram"}
+ )
# call sub-functions putting everything together,
# using shared signals established above
self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
self.maybe_plrus(m, r, plru_victim)
- self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
- itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
+ self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
+ ra_valid, eaa_priv, priv_fault,
access_ok)
- self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
+ self.itlb_update(m, itlb, itlb_valid)
self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
- req_tag, real_addr, req_laddr, cache_valid_bits,
- cache_tags, access_ok, req_is_hit, req_is_miss,
+ req_tag, real_addr, req_laddr,
+ cache_valids,
+ access_ok, req_is_hit, req_is_miss,
replace_way, plru_victim, cache_out_row)
self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
req_index, req_tag, real_addr)
- self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
- req_laddr, req_tag, replace_way, cache_tags,
+ self.icache_miss(m, r, req_is_miss, req_index,
+ req_laddr, req_tag, replace_way,
+ cache_valids,
access_ok, real_addr)
#self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
# req_is_miss, req_is_hit, lway, wstate, r)
+ # don't connect up to FetchUnitInterface so that some unit tests
+ # can continue to operate
+ if not self.use_fetch_iface:
+ return m
+
+ # connect to FetchUnitInterface. FetchUnitInterface is undocumented
+ # so needs checking and iterative revising
+ i_in, bus, i_out = self.i_in, self.bus, self.i_out
+ comb += i_in.req.eq(self.a_i_valid)
+ comb += i_in.nia.eq(self.a_pc_i)
+ comb += self.stall_in.eq(self.a_stall_i)
+ comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
+ comb += self.f_badaddr_o.eq(i_out.nia)
+ comb += self.f_instr_o.eq(i_out.insn)
+ comb += self.f_busy_o.eq(~i_out.valid) # probably
+
+ # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
+ ibus = self.ibus
+ comb += ibus.adr.eq(self.bus.adr)
+ comb += ibus.dat_w.eq(self.bus.dat_w)
+ comb += ibus.sel.eq(self.bus.sel)
+ comb += ibus.cyc.eq(self.bus.cyc)
+ comb += ibus.stb.eq(self.bus.stb)
+ comb += ibus.we.eq(self.bus.we)
+
+ comb += self.bus.dat_r.eq(ibus.dat_r)
+ comb += self.bus.ack.eq(ibus.ack)
+ if hasattr(ibus, "stall"):
+ comb += self.bus.stall.eq(ibus.stall)
+ else:
+ # fake-up the wishbone stall signal to comply with pipeline mode
+ # same thing is done in dcache.py
+ comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
return m
def icache_sim(dut):
- i_out = dut.i_in
- i_in = dut.i_out
+ i_in = dut.i_in
+ i_out = dut.i_out
m_out = dut.m_in
- yield i_in.valid.eq(0)
- yield i_out.priv_mode.eq(1)
- yield i_out.req.eq(0)
- yield i_out.nia.eq(0)
- yield i_out.stop_mark.eq(0)
+ yield i_in.priv_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(0)
+ yield i_in.stop_mark.eq(0)
yield m_out.tlbld.eq(0)
yield m_out.tlbie.eq(0)
yield m_out.addr.eq(0)
yield
yield
yield
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000004, 64))
- for i in range(30):
- yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000004, 64))
yield
- valid = yield i_in.valid
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ insn = yield i_out.insn
nia = yield i_out.nia
- insn = yield i_in.insn
- print(f"valid? {valid}")
- assert valid
assert insn == 0x00000001, \
"insn @%x=%x expected 00000001" % (nia, insn)
- yield i_out.req.eq(0)
+ yield i_in.req.eq(0)
yield
# hit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000008, 64))
yield
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
yield
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000008, 64))
- yield
- yield
- valid = yield i_in.valid
- nia = yield i_in.nia
- insn = yield i_in.insn
- assert valid
assert insn == 0x00000002, \
"insn @%x=%x expected 00000002" % (nia, insn)
- yield
# another miss
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000040, 64))
- for i in range(30):
- yield
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000040, 64))
yield
- valid = yield i_in.valid
- nia = yield i_out.nia
- insn = yield i_in.insn
- assert valid
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_in.nia
+ insn = yield i_out.insn
assert insn == 0x00000010, \
"insn @%x=%x expected 00000010" % (nia, insn)
- # test something that aliases
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000100, 64))
+ # test something that aliases (this only works because
+ # the unit test SRAM is a depth of 512)
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000100, 64))
yield
yield
- valid = yield i_in.valid
+ valid = yield i_out.valid
assert ~valid
for i in range(30):
yield
yield
- insn = yield i_in.insn
- valid = yield i_in.valid
- insn = yield i_in.insn
+ insn = yield i_out.insn
+ valid = yield i_out.valid
+ insn = yield i_out.insn
assert valid
assert insn == 0x00000040, \
"insn @%x=%x expected 00000040" % (nia, insn)
- yield i_out.req.eq(0)
-
+ yield i_in.req.eq(0)
def test_icache(mem):
- dut = ICache()
-
- memory = Memory(width=64, depth=512, init=mem)
- sram = SRAM(memory=memory, granularity=8)
-
- m = Module()
-
- m.submodules.icache = dut
- m.submodules.sram = sram
-
- m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
- m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
- m.d.comb += sram.bus.we.eq(dut.wb_out.we)
- m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
- m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
- m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
- m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
- m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
-
- # nmigen Simulation
- sim = Simulator(m)
- sim.add_clock(1e-6)
-
- sim.add_sync_process(wrap(icache_sim(dut)))
- with sim.write_vcd('test_icache.vcd'):
+ from soc.config.test.test_loadstore import TestMemPspec
+ pspec = TestMemPspec(addr_wid=32,
+ mask_wid=8,
+ reg_wid=64,
+ XLEN=32,
+ )
+ dut = ICache(pspec)
+
+ memory = Memory(width=64, depth=512, init=mem)
+ sram = SRAM(memory=memory, granularity=8)
+
+ m = Module()
+
+ m.submodules.icache = dut
+ m.submodules.sram = sram
+
+ m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+ m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+ m.d.comb += sram.bus.we.eq(dut.bus.we)
+ m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+ m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+ m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
+
+ m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+ m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(icache_sim(dut)))
+ with sim.write_vcd('test_icache.vcd'):
sim.run()
+
if __name__ == '__main__':
- dut = ICache()
+ from soc.config.test.test_loadstore import TestMemPspec
+ pspec = TestMemPspec(addr_wid=64,
+ mask_wid=8,
+ XLEN=32,
+ reg_wid=64,
+ )
+ dut = ICache(pspec)
vl = rtlil.convert(dut, ports=[])
with open("test_icache.il", "w") as f:
f.write(vl)
+ # set up memory every 32-bits with incrementing values 0 1 2 ...
mem = []
for i in range(512):
mem.append((i*2) | ((i*2+1)<<32))
test_icache(mem)
-
m.submodules.mem = mem = self.mem
do_fetch = Signal() # set when fetch while valid and not stalled
- m.d.comb += do_fetch.eq(self.a_valid_i & ~self.a_stall_i)
+ m.d.comb += do_fetch.eq(self.a_i_valid & ~self.a_stall_i)
# bit of a messy FSM that progresses from idle to in progress
# to done.
with m.If(~do_fetch): # done
m.d.sync += op_in_progress.eq(0)
- m.d.comb += self.a_busy_o.eq(op_actioned & self.a_valid_i)
+ m.d.comb += self.a_busy_o.eq(op_actioned & self.a_i_valid)
# fetch
m.d.comb += mem.rdport.addr.eq(self.a_pc_i[adr_lsb:])
m.d.comb += self.f_instr_o.eq(mem.rdport.data)
class L0CacheBuffer2(Elaboratable):
"""L0CacheBuffer2"""
- def __init__(self, n_units=8, regwid=64, addrwid=48):
+ def __init__(self, n_units=8, regwid=64, addrwid=64):
self.n_units = n_units
self.regwid = regwid
self.addrwid = addrwid
# connect the ports as modules
for i in range(self.n_units):
- d = LDSTSplitter(64, 48, 4, self.dports[i])
+ d = LDSTSplitter(64, 64, 4, self.dports[i])
setattr(m.submodules, "ldst_splitter%d" % i, d)
# state-machine latches TODO
:addr_array_i: an NxN Array of Signals with bits set indicating address
match. bits across the diagonal (addr_array_i[x][x])
will always be set, to indicate "active".
- :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
- :data_o: an Output Record of same type
+ :i_data: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
+ :o_data: an Output Record of same type
{data: 128 bit, byte_enable: 16 bit}
"""
self.array_size = array_size
ul = []
for i in range(array_size):
ul.append(DataMergerRecord())
- self.data_i = Array(ul)
- self.data_o = DataMergerRecord()
+ self.i_data = Array(ul)
+ self.o_data = DataMergerRecord()
def elaborate(self, platform):
m = Module()
select = self.addr_array_i[idx][j]
r = DataMergerRecord()
with m.If(select):
- comb += r.eq(self.data_i[j])
+ comb += r.eq(self.i_data[j])
l.append(r)
- comb += self.data_o.data.eq(ortreereduce(l, "data"))
- comb += self.data_o.en.eq(ortreereduce(l, "en"))
+ comb += self.o_data.data.eq(ortreereduce(l, "data"))
+ comb += self.o_data.en.eq(ortreereduce(l, "en"))
return m
for j in range(self.n_units):
inp = self.input_array[j]
- m.d.comb += dm_even.data_i[j].en.eq(inp.bytemask_even)
- m.d.comb += dm_odd.data_i[j].en.eq(inp.bytemask_odd)
- m.d.comb += dm_even.data_i[j].data.eq(inp.data_even)
- m.d.comb += dm_odd.data_i[j].data.eq(inp.data_odd)
+ m.d.comb += dm_even.i_data[j].en.eq(inp.bytemask_even)
+ m.d.comb += dm_odd.i_data[j].en.eq(inp.bytemask_odd)
+ m.d.comb += dm_even.i_data[j].data.eq(inp.data_even)
+ m.d.comb += dm_odd.i_data[j].data.eq(inp.data_odd)
m.d.comb += dm_even.addr_array_i[j].eq(self.addr_match(j,addr_even))
m.d.comb += dm_odd.addr_array_i[j].eq(self.addr_match(j,addr_odd))
- m.d.comb += self.data_odd.eq(dm_odd.data_o.data)
- m.d.comb += self.data_even.eq(dm_even.data_o.data)
+ m.d.comb += self.data_odd.eq(dm_odd.o_data.data)
+ m.d.comb += self.data_even.eq(dm_even.o_data.data)
return m
by this class. That task is taken care of by LDSTCompUnit.
"""
- def __init__(self, n_units, pimem, regwid=64, addrwid=48):
+ def __init__(self, n_units, pimem, regwid=64, addrwid=64):
self.n_units = n_units
self.pimem = pimem
self.regwid = regwid
def data_merger_merge(dut):
# starting with all inputs zero
yield Settle()
- en = yield dut.data_o.en
- data = yield dut.data_o.data
+ en = yield dut.o_data.en
+ data = yield dut.o_data.data
assert en == 0, "en must be zero"
assert data == 0, "data must be zero"
yield
yield dut.addr_array_i[0].eq(0xFF)
for j in range(dut.array_size):
- yield dut.data_i[j].en.eq(1 << j)
- yield dut.data_i[j].data.eq(0xFF << (16*j))
+ yield dut.i_data[j].en.eq(1 << j)
+ yield dut.i_data[j].data.eq(0xFF << (16*j))
yield Settle()
- en = yield dut.data_o.en
- data = yield dut.data_o.data
+ en = yield dut.o_data.en
+ data = yield dut.o_data.data
assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
assert en == 0xff
yield
def test_l0_cache_test_bare_wb(self):
pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
dut = TstL0CacheBuffer(pspec)
def test_l0_cache_testpi(self):
pspec = TestMemPspec(ldst_ifacetype='testpi',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
dut = TstL0CacheBuffer(pspec)
do_store = Signal() # set when store while valid and not stalled
m.d.comb += [
- do_load.eq(self.x_ld_i & (self.x_valid_i & ~self.x_stall_i)),
- do_store.eq(self.x_st_i & (self.x_valid_i & ~self.x_stall_i)),
+ do_load.eq(self.x_ld_i & (self.x_i_valid & ~self.x_stall_i)),
+ do_store.eq(self.x_st_i & (self.x_i_valid & ~self.x_stall_i)),
]
# bit of a messy FSM that progresses from idle to in progress
# to done.
with m.If(~(do_load | do_store)): # done
m.d.sync += op_in_progress.eq(0)
- m.d.comb += self.x_busy_o.eq(op_actioned & self.x_valid_i)
+ m.d.comb += self.x_busy_o.eq(op_actioned & self.x_i_valid)
m.d.comb += [
# load
DCacheToMMUType,
MMUToICacheType)
+# Radix Tree Page Directory Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1015-1016 section 6.7.10.1
+class RTPDE(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.nls = Signal(5) # Nextded Access Auth bits 59:63 LSB0 0:4
+ self.rs1 = Signal(3) # Reserved bits 56:58 LSB0 5:7
+ self.nlb = Signal(52) # Next Level Base bit 4:55 LSB0 8:59
+ self.rs2 = Signal(2) # Reserved bit 2:3 LSB0 60:61
+ self.leaf = Signal(1) # leaf bit 1 LSB0 62
+ self.valid = Signal(1) # valid bit 0 LSB0 63
+
+
+# Radix Tree Page Table Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1016 section 6.7.10.2
+class RTPTE(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.eaa = Signal(4) # Encoded Access Auth bits 60:63 LSB0 0:3
+ self.att = Signal(2) # Attributes bits 58:59 LSB0 4:5
+ self.rs1 = Signal(1) # Reserved bit 57 LSB0 6
+ self.c = Signal(1) # Change bit 56 LSB0 7
+ self.r = Signal(1) # Reference bit 55 LSB0 8
+ self.sw = Signal(3) # SW bits 1:3 bits 52:54 LSB0 9:11
+ self.rpn = Signal(45) # Real Page Number bits 7:51 LSB0 12:56
+ self.rs2 = Signal(4) # Reserved bit 3:6 LSB0 57-60
+ self.sw0 = Signal(1) # SW bit 0 bit 2 LSB0 61
+ self.leaf = Signal(1) # leaf bit 1 LSB0 62
+ self.valid = Signal(1) # valid bit 0 LSB0 63
+
+# and these... which of course are turned round to LSB0 order.
+# TODO: sigh. use botchify and put them in openpower.consts
+EAA_PRIV = 3 # bit 0 (in MSB0) set ==> problem-state banned (priv=1 only)
+EAA_RD = 2 # bit 1 (in MSB0) set ==> loads are permitted
+EAA_WR = 1 # bit 2 (in MSB0) set ==> load and stores permitted
+EAA_EXE = 0 # bit 3 (in MSB0) set ==> execute permitted
+
+# for debugging
+display_invalid = True
@unique
class State(Enum):
RADIX_FINISH = 9
+# Process Table Record - near-identical to Page Table Record (same format)
+# v3.0C Book III Section 6.7.6.2 p1004
+class PRTBL(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.rpds = Signal(5) # Root Page Directory Size 59:63 LSB0 0:4
+ self.rts2 = Signal(3) # Radix Tree Size part 2 56:58 LSB0 5:7
+ self.rpdb = Signal(52) # Root Page Directory Base 4:55 LSB0 8:59
+ self.rsv2 = Signal(1) # reserved 3 LSB0 60
+ self.rts1 = Signal(2) # Radix Tree Size part 1 1:2 LSB0 61:62
+ self.rsv1 = Signal(1) # reserved 0 LSB0 63
+
+
class RegStage(RecordObject):
def __init__(self, name=None):
super().__init__(name=name)
self.priv = Signal()
self.addr = Signal(64)
self.inval_all = Signal()
+
# config SPRs
self.prtbl = Signal(64)
self.pid = Signal(32)
+
# internal state
self.state = Signal(State) # resets to IDLE
self.done = Signal()
self.err = Signal()
+
+ # there are 4 quadrants (0-3): here we only support 2 (pt0 and pt3)
+ # these are bits 62-63 of any given address.
+ # except in segment_check, bit 62 is ignored
+ # Quadrant Select can be seen in v3.0C 6.7.10 p1015 book III figure 36
+ # and is further described in 6.7.11.3 p1019
self.pgtbl0 = Signal(64)
self.pt0_valid = Signal()
self.pgtbl3 = Signal(64)
self.pt3_valid = Signal()
+
self.shift = Signal(6)
self.mask_size = Signal(5)
self.pgbase = Signal(56)
self.rc_error = Signal()
+# Page Table Record - note that HR bit is treated as part of rts below
+# (near-identical to Process Table Record - same format)
+# v3.0C Book III Section 6.7.6.1 p1003
+class PGTBL(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.rpds = Signal(5) # Root Page Directory Size 59:63 LSB0 0:4
+ self.rts2 = Signal(3) # Radix Tree Size part 2 56:58 LSB0 5:7
+ self.rpdb = Signal(52) # Root Page Directory Base 4:55 LSB0 8:59
+ self.s = Signal(1) # Host Secure 3 LSB0 60
+ self.rts1 = Signal(2) # Radix Tree Size part 1 1:2 LSB0 61:62
+ self.hr = Signal(1) # Host Radix 0 LSB0 63
+
+
class MMU(Elaboratable):
"""Radix MMU
(i.e. there is no gRA -> hRA translation).
"""
def __init__(self):
- self.l_in = LoadStore1ToMMUType()
- self.l_out = MMUToLoadStore1Type()
- self.d_out = MMUToDCacheType()
- self.d_in = DCacheToMMUType()
- self.i_out = MMUToICacheType()
+ self.l_in = LoadStore1ToMMUType("l_in")
+ self.l_out = MMUToLoadStore1Type("l_out")
+ self.d_out = MMUToDCacheType("d_out")
+ self.d_in = DCacheToMMUType("d_in")
+ self.i_out = MMUToICacheType("i_out")
def radix_tree_idle(self, m, l_in, r, v):
+ """radix_tree_idle - the main decision-point. valid actions include:
+ * LDST incoming TLBIE request (invalidate TLB entry)
+ * LDST incoming RADIX walk request
+ * set either PRTBL or PID SPRs (which then fires a TLB invalidate)
+ """
comb = m.d.comb
sync = m.d.sync
pt_valid = Signal()
- pgtbl = Signal(64)
+ pgtbl = PGTBL("pgtbl")
rts = Signal(6)
- mbits = Signal(6)
+ mbits = Signal(6, name="mbits_idle")
- with m.If(~l_in.addr[63]):
- comb += pgtbl.eq(r.pgtbl0)
- comb += pt_valid.eq(r.pt0_valid)
- with m.Else():
+ with m.If(l_in.addr[63]): # quadrant 3
comb += pgtbl.eq(r.pgtbl3)
comb += pt_valid.eq(r.pt3_valid)
+ with m.Else():
+ comb += pgtbl.eq(r.pgtbl0)
+ comb += pt_valid.eq(r.pt0_valid)
# rts == radix tree size, number of address bits
- # being translated
- comb += rts.eq(Cat(pgtbl[5:8], pgtbl[61:63]))
+ # being translated. takes bits 5:7 and 61:62
+ comb += rts.eq(Cat(pgtbl.rts2, pgtbl.rts1, C(0)))
# mbits == number of address bits to index top
- # level of tree
- comb += mbits.eq(pgtbl[0:5])
+ # level of tree. takes bits 0:4
+ comb += mbits.eq(pgtbl.rpds)
# set v.shift to rts so that we can use finalmask
- # for the segment check
+ # for the segment check.
+ # note: rpdb (52 bits long) is truncated to 48 bits
comb += v.shift.eq(rts)
comb += v.mask_size.eq(mbits[0:5])
- comb += v.pgbase.eq(Cat(C(0, 8), pgtbl[8:56]))
+
+ # create the page base from root page directory base (48 bits with 8 0s)
+ comb += v.pgbase.eq(Cat(C(0, 8), pgtbl.rpdb[:48])) # bits 8:55
+
+ # request either TLB invalidate
+ # or start a RADIX walk
with m.If(l_in.valid):
comb += v.addr.eq(l_in.addr)
comb += v.store.eq(~(l_in.load | l_in.iside))
comb += v.priv.eq(l_in.priv)
- comb += Display("state %d l_in.valid addr %x iside %d store %d "
- "rts %x mbits %x pt_valid %d",
+ sync += Display("state %d l_in.valid addr %x iside %d store %d "
+ "rpdb %x rts %d mbits %d pt_valid %d",
v.state, v.addr, v.iside, v.store,
- rts, mbits, pt_valid)
+ pgtbl.rpdb, rts, mbits, pt_valid)
with m.If(l_in.tlbie):
# Invalidate all iTLB/dTLB entries for
# set v.shift so we can use finalmask
# for generating the process table
# entry address
- comb += v.shift.eq(r.prtbl[0:5])
+ prtbl = PRTBL("prtbl")
+ comb += prtbl.eq(r.prtbl)
+ comb += v.shift.eq(prtbl.rpds)
comb += v.state.eq(State.PROC_TBL_READ)
with m.Elif(mbits == 0):
# Use RPDS = 0 to disable radix tree walks
comb += v.state.eq(State.RADIX_FINISH)
comb += v.invalid.eq(1)
+ if(display_invalid):
+ sync += Display("MMUBUG: Use RPDS = 0 to disable"
+ " radix tree walks")
with m.Else():
comb += v.state.eq(State.SEGMENT_CHECK)
+ # set either PID or PRTBL SPRs
+ # (then invalidate TLBs)
+
with m.If(l_in.mtspr):
# Move to PID needs to invalidate L1 TLBs
- # and cached pgtbl0 value. Move to PRTBL
- # does that plus invalidating the cached
+ # and cached pgtbl0 value.
+ # Move to PRTBL does that plus invalidating the cached
# pgtbl3 value as well.
with m.If(~l_in.sprn[9]):
comb += v.pid.eq(l_in.rs[0:32])
def proc_tbl_wait(self, m, v, r, data):
comb = m.d.comb
- with m.If(r.addr[63]):
- comb += v.pgtbl3.eq(data)
+ sync = m.d.sync
+ rts = Signal(6)
+ mbits = Signal(6, name="mbits_tbl_wait")
+ prtbl = PRTBL("prtblw")
+ comb += prtbl.eq(data)
+
+ with m.If(r.addr[63]): # top bit of quadrant selects pt3
+ comb += v.pgtbl3.eq(prtbl)
comb += v.pt3_valid.eq(1)
with m.Else():
- comb += v.pgtbl0.eq(data)
+ comb += v.pgtbl0.eq(prtbl)
comb += v.pt0_valid.eq(1)
- rts = Signal(6)
- mbits = Signal(6)
-
# rts == radix tree size, # address bits being translated
- comb += rts.eq(Cat(data[5:8], data[61:63]))
+ comb += rts.eq(Cat(prtbl.rts2, prtbl.rts1, C(0)))
# mbits == # address bits to index top level of tree
- comb += mbits.eq(data[0:5])
+ comb += mbits.eq(prtbl.rpds[0:5])
# set v.shift to rts so that we can use finalmask for the segment check
comb += v.shift.eq(rts)
comb += v.mask_size.eq(mbits[0:5])
- comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+
+ # create the page base from root page directory base (48 bits with 8 0s)
+ comb += v.pgbase.eq(Cat(C(0, 8), prtbl.rpdb[:48])) # bits 8:55
with m.If(mbits):
comb += v.state.eq(State.SEGMENT_CHECK)
+ sync += Display("PROC TBL %d data %x rts1 %x rts2 %x rts %d "
+ "rpdb %x mbits %d pgbase %x "
+ " pt0_valid %d, pt3_valid %d",
+ v.state, data, prtbl.rts1, prtbl.rts2, rts,
+ prtbl.rpdb, mbits, v.pgbase,
+ v.pt0_valid, v.pt3_valid)
with m.Else():
comb += v.state.eq(State.RADIX_FINISH)
comb += v.invalid.eq(1)
+ if (display_invalid): m.d.sync += Display("MMU: mbits is invalid")
def radix_read_wait(self, m, v, r, d_in, data):
comb = m.d.comb
sync = m.d.sync
+ rpte = RTPTE(name="radix_rpte") # page-table (leaf) entry
+ rpde = RTPDE(name="radix_rpde") # page-directory (non-leaf) entry
+
perm_ok = Signal()
rc_ok = Signal()
- mbits = Signal(6)
- valid = Signal()
- leaf = Signal()
+ mbits = Signal(6, name="mbits_read_wait")
+ valid = rpte.valid
+ eaa = rpte.eaa
+ leaf = rpte.leaf
badtree = Signal()
- comb += Display("RDW %016x done %d "
+ sync += Display("RDW %016x done %d "
"perm %d rc %d mbits %d shf %d "
"valid %d leaf %d bad %d",
data, d_in.done, perm_ok, rc_ok,
mbits, r.shift, valid, leaf, badtree)
- # set pde
+ # set pde and interpret as Radix Tree Page Table Entry (leaf=1 case)
comb += v.pde.eq(data)
+ comb += rpte.eq(data)
+ comb += rpde.eq(data)
- # test valid bit
- comb += valid.eq(data[63]) # valid=data[63]
- comb += leaf.eq(data[62]) # valid=data[63]
-
- comb += v.pde.eq(data)
- # valid & leaf
with m.If(valid):
+ # valid & leaf: RADIX Page-Table Entry
with m.If(leaf):
# check permissions and RC bits
- with m.If(r.priv | ~data[3]):
- with m.If(~r.iside):
- comb += perm_ok.eq(data[1] | (data[2] & ~r.store))
- with m.Else():
+ with m.If(r.priv | ~eaa[EAA_PRIV]):
+ with m.If(r.iside): # instruction-side request
# no IAMR, so no KUEP support for now
# deny execute permission if cache inhibited
- comb += perm_ok.eq(data[0] & ~data[5])
+ comb += perm_ok.eq(eaa[EAA_EXE] & ~rpte.att[1])
+ with m.Else():
+ # Load/Store (read/write)
+ comb += perm_ok.eq(eaa[EAA_WR] |
+ (eaa[EAA_RD] & ~r.store))
+ comb += rc_ok.eq(rpte.r & (rpte.c | ~r.store))
- comb += rc_ok.eq(data[8] & (data[7] | ~r.store))
+ # permissions / rc ok, load TLB, otherwise report error
with m.If(perm_ok & rc_ok):
comb += v.state.eq(State.RADIX_LOAD_TLB)
+ sync += Display("RADIX LEAF data %x att %x eaa %x "
+ "R %d C %d "
+ "shift %d pgbase %x ",
+ data, rpte.att, eaa,
+ rpte.r, rpte.c,
+ v.shift, v.pgbase
+ )
with m.Else():
comb += v.state.eq(State.RADIX_FINISH)
comb += v.perm_err.eq(~perm_ok)
# permission error takes precedence over RC error
comb += v.rc_error.eq(perm_ok)
- # valid & !leaf
+ # valid & !leaf: RADIX Page-Directory Entry
with m.Else():
- comb += mbits.eq(data[0:5])
+ comb += mbits.eq(rpde.nls) # 5 bits NLS into 6-bit-long mbits
comb += badtree.eq((mbits < 5) |
(mbits > 16) |
(mbits > r.shift))
comb += v.badtree.eq(1)
with m.Else():
comb += v.shift.eq(r.shift - mbits)
- comb += v.mask_size.eq(mbits[0:5])
- comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+ comb += v.mask_size.eq(mbits)
+ # pagebase is first 48 bits of NLB, shifted up 1 byte
+ comb += v.pgbase.eq(Cat(C(0, 8), rpde.nlb[:48]))
comb += v.state.eq(State.RADIX_LOOKUP)
with m.Else():
# non-present PTE, generate a DSI
comb += v.state.eq(State.RADIX_FINISH)
comb += v.invalid.eq(1)
+ if (display_invalid):
+ sync += Display("MMU: non-present PTE, generate a DSI")
def segment_check(self, m, v, r, data, finalmask):
+ """segment_check: checks validity of the request before doing a
+ RADIX lookup. reports either segment error or bad tree if not ok
+ """
comb = m.d.comb
- mbits = Signal(6)
+ mbits = Signal(6, name="mbits_check")
nonzero = Signal()
comb += mbits.eq(r.mask_size)
comb += v.shift.eq(r.shift + (31 - 12) - mbits)
comb += nonzero.eq((r.addr[31:62] & ~finalmask[0:31]).bool())
- with m.If((r.addr[63] ^ r.addr[62]) | nonzero):
+ with m.If((r.addr[63] != r.addr[62]) # pt3 == 0b11 and pt1 == 0b00
+ | nonzero):
comb += v.state.eq(State.RADIX_FINISH)
comb += v.segerror.eq(1)
with m.Elif((mbits < 5) | (mbits > 16) |
sync += Display("MMU completing op without error")
with m.If(l_out.err):
- sync += Display("MMU completing op with err invalid"
+ sync += Display("MMU completing op with err invalid="
"%d badtree=%d", l_out.invalid, l_out.badtree)
with m.If(rin.state == State.RADIX_LOOKUP):
- sync += Display ("radix lookup shift=%d msize=%d",
- rin.shift, rin.mask_size)
+ sync += Display ("radix lookup shift=%x msize=%x",
+ rin.shift, mask)
with m.If(r.state == State.RADIX_LOOKUP):
- sync += Display(f"send load addr=%x addrsh=%d mask=%x",
+ sync += Display(f"send load addr=%x addrsh=%x mask=%x",
d_out.addr, addrsh, mask)
+
+ # update the internal register
sync += r.eq(rin)
def elaborate(self, platform):
self.rin = rin = RegStage("r_in")
r = RegStage("r")
+ # get access to prtbl and pid for debug / testing purposes ONLY
+ # (actually, not needed, because setup_regs() triggers mmu direct)
+ # self._prtbl = r.prtbl
+ # self._pid = r.pid
+
l_in = self.l_in
l_out = self.l_out
d_out = self.d_out
self.mmu_0(m, r, rin, l_in, l_out, d_out, addrsh, mask)
- v = RegStage()
+ v = RegStage("v")
dcreq = Signal()
tlb_load = Signal()
itlb_load = Signal()
comb += v.eq(r)
comb += v.valid.eq(0)
- comb += dcreq.eq(0)
comb += v.done.eq(0)
comb += v.err.eq(0)
comb += v.invalid.eq(0)
comb += v.segerror.eq(0)
comb += v.perm_err.eq(0)
comb += v.rc_error.eq(0)
- comb += tlb_load.eq(0)
- comb += itlb_load.eq(0)
- comb += tlbie_req.eq(0)
comb += v.inval_all.eq(0)
- comb += prtbl_rd.eq(0)
# Radix tree data structures in memory are
# big-endian, so we need to byte-swap them
# generate mask for extracting address fields for PTE addr generation
m.submodules.pte_mask = pte_mask = Mask(16-5)
+ pte_mask.mask.name = "pte_mask"
comb += pte_mask.shift.eq(r.mask_size - 5)
comb += mask.eq(Cat(C(0x1f, 5), pte_mask.mask))
# generate mask for extracting address bits to go in
# TLB entry in order to support pages > 4kB
m.submodules.tlb_mask = tlb_mask = Mask(44)
+ tlb_mask.mask.name = "tlb_mask"
comb += tlb_mask.shift.eq(r.shift)
comb += finalmask.eq(tlb_mask.mask)
+ # Shift address bits 61--12 right by 0--47 bits and
+ # supply the least significant 16 bits of the result.
+ comb += addrsh.eq(r.addr[12:62] >> r.shift)
+
with m.If(r.state != State.IDLE):
sync += Display("MMU state %d %016x", r.state, data)
+ sync += Display("addrsh %x r.shift %d r.addr[12:62] %x",
+ addrsh, r.shift, r.addr[12:62])
+
+ ##########
+ # Main FSM
+ ##########
with m.Switch(r.state):
with m.Case(State.IDLE):
sync += Display(" RADIX_FINISH")
comb += v.state.eq(State.IDLE)
+ # check and report either error or done.
with m.If((v.state == State.RADIX_FINISH) |
((v.state == State.RADIX_LOAD_TLB) & r.iside)):
comb += v.err.eq(v.invalid | v.badtree | v.segerror
| v.perm_err | v.rc_error)
comb += v.done.eq(~v.err)
- with m.If(~r.addr[63]):
+ # PID is only valid if MSB of address is zero, top 2 bits are Quadrant
+ with m.If(~r.addr[63]): # quadrant 0 (pt0)
comb += effpid.eq(r.pid)
+ # calculate Process Table Address
pr24 = Signal(24, reset_less=True)
- comb += pr24.eq(masked(r.prtbl[12:36], effpid[8:32], finalmask))
- comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, r.prtbl[36:56]))
+ prtbla = PRTBL("prtbla")
+ comb += prtbla.eq(r.prtbl)
+ rpdb = prtbla.rpdb
+ comb += pr24.eq(masked(rpdb[4:28], effpid[8:32], finalmask))
+ comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, rpdb[28:48]))
+ # calculate Page Table Address
pg16 = Signal(16, reset_less=True)
comb += pg16.eq(masked(r.pgbase[3:19], addrsh, mask))
comb += pgtb_adr.eq(Cat(C(0, 3), pg16, r.pgbase[19:56]))
+ # calculate Page Table Entry from Real Page Number (leaf=1, RTPTE)
+ rpte = RTPTE(name="rpte")
+ comb += rpte.eq(r.pde)
pd44 = Signal(44, reset_less=True)
- comb += pd44.eq(masked(r.pde[12:56], r.addr[12:56], finalmask))
+ comb += pd44.eq(masked(rpte.rpn, r.addr[12:56], finalmask))
comb += pte.eq(Cat(r.pde[0:12], pd44))
# update registers
comb += addr.eq(prtb_adr)
with m.Else():
comb += addr.eq(pgtb_adr)
+ sync += Display(f"pagetable pg16=%x addrsh %x mask %x pgbase=%x "
+ "pgbase[19:56]=%x",
+ pg16, addrsh, mask, r.pgbase, r.pgbase[19:56])
+ # connect to other interfaces: LDST, D-Cache, I-Cache
comb += l_out.done.eq(r.done)
comb += l_out.err.eq(r.err)
comb += l_out.invalid.eq(r.invalid)
mem = {0x0: 0x000000, # to get mtspr prtbl working
0x10000: # PARTITION_TABLE_2
- # PATB_GR=1 PRTB=0x1000 PRTS=0xb
- b(0x800000000100000b),
+ # HR=1 RTS1=0x2 PRTB=0x300 RTS2=0x5 PRTS=0xb
+ b(0xc0000000000030ad),
0x30000: # RADIX_ROOT_PTE
# V = 1 L = 0 NLB = 0x400 NLS = 9
# R = 1 C = 1 ATT = 0 EAA 0x7
b(0xc000000000000187),
- 0x1000000: # PROCESS_TABLE_3
+#
+# slightly different from radix_walk_example.txt: address in microwatt
+# has the top bit set to indicate hypervisor. here, Quadrant 3's
+# process table entry is put instead into Quadrant 0. the entry
+# PROCESS_TABLE_3 should, strictly speaking, be at 0x1000010
+
+# 0x1000000: # PROCESS_TABLE_3 (pt0_valid)
+# # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 12
+# b(0x40000000000300ac),
+
+ 0x1000000: # PROCESS_TABLE_3 (pt3_valid)
# RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
b(0x40000000000300ad),
}
+ # microwatt mmu.bin first part of test 2.
+ # PRTBL must be set to 0x12000, PID to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13920: 0x86810000000000c0, # leaf, supposed to be at 0x13920
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ 0x124000: 0x0000000badc0ffee, # memory to be looked up
+ }
+
+ # microwatt mmu.bin first part of test 4.
+ # PRTBL must be set to 0x12000, PID to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13858: 0x86a10000000000c0, # leaf node
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ }
+
+ # microwatt mmu.bin test 5.
+ # PRTBL must be set to 0x12000, PID to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13cf8: 0x86b10000000000c0, # leaf node
+ 0x13d00: 0x0000000000000000, # invalid leaf node
+ 0x10008: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ }
+
+ # microwatt mmu.bin test 12, instruction-side
+ # PRTBL must be set to 0x12000, PID to 1, iside to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13920: 0x01110000000000c0, # leaf node
+ 0x10008: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ }
+
while not stop:
while True: # wait for dc_valid
if stop:
return
dc_valid = yield (dut.d_out.valid)
+ tlbld = yield (dut.d_out.tlbld)
if dc_valid:
break
yield
addr = yield dut.d_out.addr
+ if tlbld:
+ pte = yield dut.d_out.pte
+ print (" DCACHE PTE %x -> %x" % (pte, addr))
+ yield dut.d_in.done.eq(1)
+ yield
+ yield dut.d_in.done.eq(0)
+ continue
+
if addr not in mem:
print (" DCACHE LOOKUP FAIL %x" % (addr))
stop = True
yield
yield dut.d_in.done.eq(0)
+
def mmu_wait(dut):
global stop
while not stop: # wait for dc_valid / err
+ d_valid = yield (dut.d_out.valid)
+ if d_valid:
+ tlbld = yield (dut.d_out.tlbld)
+ addr = yield (dut.d_out.addr)
+ print ("addr %x tlbld %d" % (addr, tlbld))
l_done = yield (dut.l_out.done)
l_err = yield (dut.l_out.err)
l_badtree = yield (dut.l_out.badtree)
yield dut.l_in.mtspr.eq(0) # captured by RegStage(s)
yield dut.l_in.load.eq(0) # can reset everything safely
+
def mmu_sim(dut):
global stop
+ # microwatt PRTBL = 0x12000, other test is 0x1000000
+ #prtbl = 0x100000
+ #pidr = 0x0
+ prtbl = 0x12000
+ pidr = 0x1
+
# MMU MTSPR set prtbl
yield dut.l_in.mtspr.eq(1)
yield dut.l_in.sprn[9].eq(1) # totally fake way to set SPR=prtbl
- yield dut.l_in.rs.eq(0x1000000) # set process table
+ yield dut.l_in.rs.eq(prtbl) # set process table
yield dut.l_in.valid.eq(1)
yield from mmu_wait(dut)
yield
prtbl = yield (dut.rin.prtbl)
print ("prtbl after MTSPR %x" % prtbl)
- assert prtbl == 0x1000000
+ assert prtbl == prtbl
+
+ if True: # microwatt test set PIDR
+ # MMU MTSPR set PIDR = 1
+ yield dut.l_in.mtspr.eq(1)
+ yield dut.l_in.sprn[9].eq(0) # totally fake way to set SPR=pidr
+ yield dut.l_in.rs.eq(pidr) # set process table
+ yield dut.l_in.valid.eq(1)
+ yield from mmu_wait(dut)
+ yield
+ yield dut.l_in.sprn.eq(0)
+ yield dut.l_in.rs.eq(0)
+ yield
#yield dut.rin.prtbl.eq(0x1000000) # manually set process table
#yield
+ #addr = 0x10000 # original test
+ #addr = 0x124108 # microwatt mmu.bin test 2
+ #addr = 0x10b0d8 # microwatt mmu.bin test 4
+ # these are a misalignment test. one load results in two actual
+ # lookups, one of which has a valid page table entry, the other
+ # does not. we currently do not support misaligned in Loadstore1
+ # therefore these tests fail with an align_intr (0x600) at 0x39fffd
+ addr = 0x39fffd # microwatt mmu.bin test 5
+ addr = 0x3a0000 # microwatt mmu.bin test 5
+
+ # microwatt mmu.bin test 12 is instruction-side
+ addr = 0x324000 # microwatt mmu.bin test 12
+ iside = 1
# MMU PTE request
- yield dut.l_in.load.eq(1)
+ yield dut.l_in.iside.eq(iside)
+ yield dut.l_in.load.eq(0)
yield dut.l_in.priv.eq(1)
- yield dut.l_in.addr.eq(0x10000)
+ yield dut.l_in.addr.eq(addr)
yield dut.l_in.valid.eq(1)
yield from mmu_wait(dut)
addr = yield dut.d_out.addr
pte = yield dut.d_out.pte
+ tlb_ld = yield dut.d_out.tlbld
l_done = yield (dut.l_out.done)
l_err = yield (dut.l_out.err)
l_badtree = yield (dut.l_out.badtree)
- print ("translated done %d err %d badtree %d addr %x pte %x" % \
- (l_done, l_err, l_badtree, addr, pte))
+ print ("translated done %d err %d badtree %d "
+ "addr %x pte %x tlb_ld %d" % \
+ (l_done, l_err, l_badtree, addr, pte, tlb_ld))
+
yield
yield dut.l_in.priv.eq(0)
yield dut.l_in.addr.eq(0)
busy_o/1 most likely to be x_busy_o
go_die_i/1 rst?
- addr.data/48 x_addr_i (x_addr_i[:4] goes into LenExpand)
- addr.ok/1 probably x_valid_i & ~x_stall_i
+ addr.data/64 x_addr_i (x_addr_i[:4] goes into LenExpand)
+ addr.ok/1 probably x_i_valid & ~x_stall_i
addr_ok_o/1 no equivalent. *might* work using x_stall_i
exc_o/6(?) m_load_err_o and m_store_err_o
class Pi2LSUI(PortInterfaceBase):
def __init__(self, name, lsui=None,
- data_wid=64, mask_wid=8, addr_wid=48):
+ data_wid=64, mask_wid=8, addr_wid=64):
print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
super().__init__(data_wid, addr_wid)
if lsui is None:
self.lsui_busy = Signal()
self.valid_l = SRLatch(False, name="valid")
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
+ print("pi2lsui TODO, implement is_dcbz")
m.d.comb += self.valid_l.s.eq(1)
m.d.comb += self.lsui.x_mask_i.eq(mask)
m.d.comb += self.lsui.x_addr_i.eq(addr)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.valid_l.s.eq(1)
m.d.comb += self.lsui.x_mask_i.eq(mask)
m.d.comb += self.lsui.x_addr_i.eq(addr)
m.next = "IDLE"
# indicate valid at both ends. OR with lsui_busy (stops comb loop)
- m.d.comb += self.lsui.m_valid_i.eq(self.valid_l.q )
- m.d.comb += self.lsui.x_valid_i.eq(self.valid_l.q )
+ m.d.comb += self.lsui.m_i_valid.eq(self.valid_l.q )
+ m.d.comb += self.lsui.x_i_valid.eq(self.valid_l.q )
# reset the valid latch when not busy. sync to stop loop
lsui_active = Signal()
class Pi2LSUI1(Elaboratable):
def __init__(self, name, pi=None, lsui=None,
- data_wid=64, mask_wid=8, addr_wid=48):
+ data_wid=64, mask_wid=8, addr_wid=64):
print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
self.addrbits = mask_wid
if pi is None:
# expand the LSBs of address plus LD/ST len into 16-bit mask
m.d.comb += lsui.x_mask_i.eq(lenexp.lexp_o)
# pass through the address, indicate "valid"
- m.d.comb += lsui.x_valid_i.eq(1)
+ m.d.comb += lsui.x_i_valid.eq(1)
# indicate "OK" - XXX should be checking address valid
m.d.comb += pi.addr_ok_o.eq(1)
from nmutil.latch import SRLatch, latchregister
from nmutil.util import rising_edge
from openpower.decoder.power_decoder2 import Data
+from openpower.decoder.power_enums import MSRSpec
from soc.scoreboard.addr_match import LenExpand
from soc.experiment.mem_types import LDSTException
# for testing purposes
from soc.experiment.testmem import TestMemory
#from soc.scoreboard.addr_split import LDSTSplitter
+from nmutil.util import Display
import unittest
busy_o is deasserted on the cycle AFTER st.ok is asserted.
"""
- def __init__(self, name=None, regwid=64, addrwid=48):
+ def __init__(self, name=None, regwid=64, addrwid=64):
self._regwid = regwid
self._addrwid = addrwid
RecordObject.__init__(self, name=name)
- # distinguish op type (ld/st)
- self.is_ld_i = Signal(reset_less=True)
- self.is_st_i = Signal(reset_less=True)
+ # distinguish op type (ld/st/dcbz/nc)
+ self.is_ld_i = Signal(reset_less=True)
+ self.is_st_i = Signal(reset_less=True)
+ self.is_dcbz_i = Signal(reset_less=True) # cache-line zeroing
+ self.is_nc = Signal() # no cacheing
# LD/ST data length (TODO: other things may be needed)
self.data_len = Signal(4, reset_less=True)
+ # atomic reservation (LR/SC - ldarx / stdcx etc.)
+ self.reserve = Signal(reset_less=True)
+
# common signals
self.busy_o = Signal(reset_less=True) # do not use if busy
self.go_die_i = Signal(reset_less=True) # back to reset
# LD/ST
self.ld = Data(regwid, "ld_data_o") # ok to be set by L0 Cache/Buf
self.st = Data(regwid, "st_data_i") # ok to be set by CompUnit
+ self.store_done = Data(1, "store_done_o") # store has been actioned
- # additional "modes"
- self.is_dcbz = Signal() # data cache block zero request
- self.is_nc = Signal() # no cacheing
- self.msr_pr = Signal() # 1==virtual, 0==privileged
+ #only priv_mode = not msr_pr is used currently
+ # TODO: connect signals
+ self.virt_mode = Signal() # ctrl.msr(MSR_DR);
+ self.priv_mode = Signal() # not ctrl.msr(MSR_PR);
+ self.mode_32bit = Signal() # not ctrl.msr(MSR_SF);
- # mmu
- self.mmu_done = Signal() # keep for now
-
# dcache
self.ldst_error = Signal()
## Signalling ld/st error - NC cache hit, TLB miss, prot/RC failure
return [self.is_ld_i.eq(inport.is_ld_i),
self.is_st_i.eq(inport.is_st_i),
self.is_nc.eq(inport.is_nc),
- self.is_dcbz.eq(inport.is_dcbz),
+ self.is_dcbz_i.eq(inport.is_dcbz_i),
self.data_len.eq(inport.data_len),
+ self.reserve.eq(inport.reserve),
self.go_die_i.eq(inport.go_die_i),
self.addr.data.eq(inport.addr.data),
self.addr.ok.eq(inport.addr.ok),
self.st.eq(inport.st),
- self.msr_pr.eq(inport.msr_pr),
+ self.virt_mode.eq(inport.virt_mode),
+ self.priv_mode.eq(inport.priv_mode),
+ self.mode_32bit.eq(inport.mode_32bit),
inport.ld.eq(self.ld),
inport.busy_o.eq(self.busy_o),
inport.addr_ok_o.eq(self.addr_ok_o),
inport.exc_o.eq(self.exc_o),
- inport.mmu_done.eq(self.mmu_done),
+ inport.store_done.eq(self.store_done),
inport.ldst_error.eq(self.ldst_error),
inport.cache_paradox.eq(self.cache_paradox)
]
def connect_port(self, inport):
return self.pi.connect_port(inport)
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr): pass
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr): pass
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc): pass
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc): pass
def set_wr_data(self, m, data, wen): pass
def get_rd_data(self, m): pass
pi = self.pi
comb += lds.eq(pi.is_ld_i) # ld-req signals
comb += sts.eq(pi.is_st_i) # st-req signals
- pr = pi.msr_pr # MSR problem state: PR=1 ==> virt, PR==0 ==> priv
+
+ # TODO: construct an MSRspec here and pass it over in
+ # self.set_rd_addr and set_wr_addr below rather than just pr
+ pr = ~pi.priv_mode
+ dr = pi.virt_mode
+ sf = ~pi.mode_32bit
+ msr = MSRSpec(pr=pr, dr=dr, sf=sf)
# detect busy "edge"
busy_delay = Signal()
misalign = Signal()
comb += misalign.eq(lenexp.lexp_o[8:].bool())
-
# activate mode: only on "edge"
comb += ld_active.s.eq(rising_edge(m, lds)) # activate LD mode
comb += st_active.s.eq(rising_edge(m, sts)) # activate ST mode
# LD/ST requested activates "busy" (only if not already busy)
with m.If(self.pi.is_ld_i | self.pi.is_st_i):
comb += busy_l.s.eq(~busy_delay)
+ with m.If(self.pi.exc_o.happened):
+ sync += Display("fast exception")
# if now in "LD" mode: wait for addr_ok, then send the address out
# to memory, acknowledge address, and send out LD data
comb += lenexp.len_i.eq(pi.data_len)
comb += lenexp.addr_i.eq(lsbaddr)
with m.If(pi.addr.ok & adrok_l.qn):
- self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr)
+ self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign,
+ msr, pi.is_nc)
comb += pi.addr_ok_o.eq(1) # acknowledge addr ok
sync += adrok_l.s.eq(1) # and pull "ack" latch
comb += lenexp.len_i.eq(pi.data_len)
comb += lenexp.addr_i.eq(lsbaddr)
with m.If(pi.addr.ok):
- self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr)
- with m.If(adrok_l.qn):
+ self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, msr,
+ pi.is_dcbz_i, pi.is_nc)
+ with m.If(adrok_l.qn & self.pi.exc_o.happened==0):
comb += pi.addr_ok_o.eq(1) # acknowledge addr ok
sync += adrok_l.s.eq(1) # and pull "ack" latch
comb += reset_l.s.eq(ldok) # reset mode after 1 cycle
# for ST mode, when addr has been "ok'd", wait for incoming "ST ok"
+ sync += st_done.s.eq(0) # store done trigger
with m.If(st_active.q & pi.st.ok):
# shift data up before storing. lenexp *bit* version of mask is
# passed straight through as byte-level "write-enable" lines.
- stdata = Signal(self.regwid, reset_less=True)
+ stdata = Signal(self.regwid*2, reset_less=True)
comb += stdata.eq(pi.st.data << (lenexp.addr_i*8))
# TODO: replace with link to LoadStoreUnitInterface.x_store_data
# and also handle the ready/stall/busy protocol
stok = self.set_wr_data(m, stdata, lenexp.lexp_o)
- sync += st_done.s.eq(1) # store done trigger
+ sync += st_done.s.eq(~self.pi.exc_o.happened) # store done trigger
with m.If(st_done.q):
comb += reset_l.s.eq(stok) # reset mode after 1 cycle
# after waiting one cycle (reset_l is "sync" mode), reset the port
with m.If(reset_l.q):
- comb += ld_active.r.eq(1) # leave the ST active for 1 cycle
+ comb += ld_active.r.eq(1) # leave the LD active for 1 cycle
comb += st_active.r.eq(1) # leave the ST active for 1 cycle
comb += reset_l.r.eq(1) # clear reset
comb += adrok_l.r.eq(1) # address reset
# monitor for an exception, clear busy immediately
with m.If(self.pi.exc_o.happened):
comb += busy_l.r.eq(1)
+ comb += reset_l.s.eq(1) # also reset whole unit
# however ST needs one cycle before busy is reset
#with m.If(self.pi.st.ok | self.pi.ld.ok):
comb += busy_l.r.eq(1)
# busy latch outputs to interface
- comb += pi.busy_o.eq(busy_l.q)
+ if hasattr(self, "external_busy"):
+ # when there is an extra (external) busy, include that here.
+ # this is used e.g. in LoadStore1 when an instruction fault
+ # is being processed (instr_fault) and stops Load/Store requests
+ # from being made until it's done
+ comb += pi.busy_o.eq(busy_l.q | self.external_busy(m))
+ else:
+ comb += pi.busy_o.eq(busy_l.q)
return m
# hard-code memory addressing width to 6 bits
self.mem = TestMemory(regwid, 5, granularity=regwid//8, init=False)
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
lsbaddr, msbaddr = self.splitaddr(addr)
m.d.comb += self.mem.wrport.addr.eq(msbaddr)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
lsbaddr, msbaddr = self.splitaddr(addr)
m.d.comb += self.mem.rdport.addr.eq(msbaddr)
# based on microwatt plru.vhdl
-from nmigen import Elaboratable, Signal, Array, Module, Mux, Const
+from nmigen import Elaboratable, Signal, Array, Module, Mux, Const, Cat
from nmigen.cli import rtlil
+from nmigen.lib.coding import Decoder
class PLRU(Elaboratable):
def ports(self):
return [self.acc_en, self.lru_o, self.acc_i]
+
+class PLRUs(Elaboratable):
+ def __init__(self, cachetype, n_plrus, n_bits):
+ self.cachetype = cachetype
+ self.n_plrus = n_plrus
+ self.n_bits = n_bits
+ self.valid = Signal()
+ self.way = Signal(n_bits)
+ self.index = Signal(n_plrus.bit_length())
+ self.isel = Signal(n_plrus.bit_length())
+ self.o_index = Signal(n_bits)
+
+ def elaborate(self, platform):
+ """Generate TLB PLRUs
+ """
+ m = Module()
+ comb = m.d.comb
+
+ if self.n_plrus == 0:
+ return m
+
+ # Binary-to-Unary one-hot, enabled by valid
+ m.submodules.te = te = Decoder(self.n_plrus)
+ comb += te.n.eq(~self.valid)
+ comb += te.i.eq(self.index)
+
+ out = Array(Signal(self.n_bits, name="plru_out%d" % x) \
+ for x in range(self.n_plrus))
+
+ for i in range(self.n_plrus):
+ # PLRU interface
+ name = "%s_plru_%d" % (self.cachetype, i)
+ m.submodules[name] = plru = PLRU(self.n_bits)
+
+ comb += plru.acc_en.eq(te.o[i])
+ comb += plru.acc_i.eq(self.way)
+ comb += out[i].eq(plru.lru_o)
+
+ # select output based on index
+ comb += self.o_index.eq(out[self.isel])
+
+ return m
+
+ def ports(self):
+ return [self.valid, self.way, self.index, self.isel, self.o_index]
+
+
if __name__ == '__main__':
dut = PLRU(2)
vl = rtlil.convert(dut, ports=dut.ports())
f.write(vl)
+ dut = PLRUs("testing", 4, 2)
+ vl = rtlil.convert(dut, ports=dut.ports())
+ with open("test_plrus.il", "w") as f:
+ f.write(vl)
+
+
RTS2 = 0x5
RPDS = 12
- PROCESS_TABLE_3 | PROCESS_TABLE_3 //Hypervisor Userspace
+0x1000010 : PROCESS_TABLE_3 | PROCESS_TABLE_3 //Hypervisor Userspace
0x40000000000300ad | 0x0
RTS1 = 0x2
RPDB = 0x300
self.addr_o = Signal(rwid, reset_less=True)
# in/out register data (note: not register#, actual data)
- self.data_o = Signal(rwid, reset_less=True)
+ self.o_data = Signal(rwid, reset_less=True)
self.src1_i = Signal(rwid, reset_less=True)
self.src2_i = Signal(rwid, reset_less=True)
# input operand
# merge (OR) all integer FU / ALU outputs to a single value
if self.units:
- data_o = treereduce(self.units, "data_o")
- comb += self.data_o.eq(data_o)
+ o_data = treereduce(self.units, "o_data")
+ comb += self.o_data.eq(o_data)
if self.ldstmode:
addr_o = treereduce(self.units, "addr_o")
comb += self.addr_o.eq(addr_o)
# branch is active (TODO: a better signal: this is over-using the
# go_write signal - actually the branch should not be "writing")
with m.If(br1.go_wr_i):
- sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+ sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
sync += bspec.active_i.eq(0)
comb += bspec.br_i.eq(1)
# branch occurs if data == 1, failed if data == 0
- comb += bspec.br_ok_i.eq(br1.data_o == 1)
+ comb += bspec.br_ok_i.eq(br1.o_data == 1)
for i in range(n_intfus):
# *expected* direction of the branch matched against *actual*
comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
comb += int_src2.ren.eq(intfus.src2_rsel_o)
# connect ALUs to regfile
- comb += int_dest.data_i.eq(cu.data_o)
- comb += cu.src1_i.eq(int_src1.data_o)
- comb += cu.src2_i.eq(int_src2.data_o)
+ comb += int_dest.i_data.eq(cu.o_data)
+ comb += cu.src1_i.eq(int_src1.o_data)
+ comb += cu.src2_i.eq(int_src2.o_data)
# connect ALU Computation Units
comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
self.n_regs = n_regs
mqbits = unsigned(int(log(qlen) / log(2))+2)
- self.p_add_i = Signal(mqbits) # instructions to add (from data_i)
- self.p_ready_o = Signal() # instructions were added
- self.data_i = Instruction._nq(n_in, "data_i")
+ self.p_add_i = Signal(mqbits) # instructions to add (from i_data)
+ self.p_o_ready = Signal() # instructions were added
+ self.i_data = Instruction._nq(n_in, "i_data")
self.busy_o = Signal(reset_less=True) # at least one CU is busy
self.qlen_o = Signal(mqbits, reset_less=True)
# link up instruction queue
comb += iq.p_add_i.eq(self.p_add_i)
- comb += self.p_ready_o.eq(iq.p_ready_o)
+ comb += self.p_o_ready.eq(iq.p_o_ready)
for i in range(self.n_in):
- comb += eq(iq.data_i[i], self.data_i[i])
+ comb += eq(iq.i_data[i], self.i_data[i])
# take instruction and process it. note that it's possible to
# "inspect" the queue contents *without* actually removing the
# "resetting" done above (insn_i=0) could be re-ASSERTed.
with m.If(iq.qlen_o != 0):
# get the operands and operation
- instr = iq.data_o[0]
+ instr = iq.o_data[0]
imm = instr.imm_data.data
dest = instr.write_reg.data
src1 = instr.read_reg1.data
return m
def __iter__(self):
- yield self.p_ready_o
- for o in self.data_i:
+ yield self.p_o_ready
+ for o in self.i_data:
yield from list(o)
yield self.p_add_i
sendlen = 1
for idx, instr in enumerate(instrs):
- yield dut.data_i[idx].eq(instr)
+ yield dut.i_data[idx].eq(instr)
insn_type = yield instr.insn_type
fn_unit = yield instr.fn_unit
print("senddata ", idx, insn_type, fn_unit, instr)
yield dut.p_add_i.eq(sendlen)
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
while not o_p_ready:
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
yield dut.p_add_i.eq(0)
dest = instr['write_reg']
insn_type = instr['insn_type']
fn_unit = instr['fn_unit']
- yield dut.data_i[idx].insn_type.eq(insn_type)
- yield dut.data_i[idx].fn_unit.eq(fn_unit)
- yield dut.data_i[idx].read_reg1.data.eq(reg1)
- yield dut.data_i[idx].read_reg1.ok.eq(1) # XXX TODO
- yield dut.data_i[idx].read_reg2.data.eq(reg2)
- yield dut.data_i[idx].read_reg2.ok.eq(1) # XXX TODO
- yield dut.data_i[idx].write_reg.data.eq(dest)
- yield dut.data_i[idx].write_reg.ok.eq(1) # XXX TODO
- yield dut.data_i[idx].imm_data.data.eq(imm)
- yield dut.data_i[idx].imm_data.ok.eq(op_imm)
- di = yield dut.data_i[idx]
+ yield dut.i_data[idx].insn_type.eq(insn_type)
+ yield dut.i_data[idx].fn_unit.eq(fn_unit)
+ yield dut.i_data[idx].read_reg1.data.eq(reg1)
+ yield dut.i_data[idx].read_reg1.ok.eq(1) # XXX TODO
+ yield dut.i_data[idx].read_reg2.data.eq(reg2)
+ yield dut.i_data[idx].read_reg2.ok.eq(1) # XXX TODO
+ yield dut.i_data[idx].write_reg.data.eq(dest)
+ yield dut.i_data[idx].write_reg.ok.eq(1) # XXX TODO
+ yield dut.i_data[idx].imm_data.data.eq(imm)
+ yield dut.i_data[idx].imm_data.ok.eq(op_imm)
+ di = yield dut.i_data[idx]
print("senddata %d %x" % (idx, di))
yield dut.p_add_i.eq(sendlen)
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
while not o_p_ready:
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
yield dut.p_add_i.eq(0)
from soc.experiment.compldst_multi import CompLDSTOpSubset
from soc.experiment.l0_cache import TstL0CacheBuffer
-from soc.experiment.alu_hier import ALU, BranchALU
+# for testing purposes
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.experiment.alu_hier import ALUFunctionUnit, BranchALU
from soc.fu.alu.alu_input_record import CompALUOpSubset
from openpower.decoder.power_enums import MicrOp, Function
self.issue_i = Signal(n_units, reset_less=True)
self.rd0 = go_record(n_units, "rd0")
self.rd1 = go_record(n_units, "rd1")
- self.go_rd_i = [self.rd0.go, self.rd1.go] # XXX HACK!
+ self.go_rd_i = [self.rd0.go_i, self.rd1.go_i] # XXX HACK!
self.wr0 = go_record(n_units, "wr0")
- self.go_wr_i = [self.wr0.go]
+ self.go_wr_i = [self.wr0.go_i]
self.shadown_i = Signal(n_units, reset_less=True)
self.go_die_i = Signal(n_units, reset_less=True)
if ldstmode:
# outputs
self.busy_o = Signal(n_units, reset_less=True)
- self.rd_rel_o = [self.rd0.rel, self.rd1.rel] # HACK!
- self.req_rel_o = self.wr0.rel
+ self.rd_rel_o = [self.rd0.rel_o, self.rd1.rel_o] # HACK!
+ self.req_rel_o = self.wr0.rel_o
self.done_o = Signal(n_units, reset_less=True)
if ldstmode:
self.ld_o = Signal(n_units, reset_less=True) # op is LD
self.addr_o = Signal(rwid, reset_less=True)
# in/out register data (note: not register#, actual data)
- self.data_o = Signal(rwid, reset_less=True)
+ self.o_data = Signal(rwid, reset_less=True)
self.src1_i = Signal(rwid, reset_less=True)
self.src2_i = Signal(rwid, reset_less=True)
# input operand
go_rd_l1.append(alu.go_rd_i[1])
issue_l.append(alu.issue_i)
busy_l.append(alu.busy_o)
- comb += self.rd0.rel.eq(Cat(*rd_rel0_l))
- comb += self.rd1.rel.eq(Cat(*rd_rel1_l))
+ comb += self.rd0.rel_o.eq(Cat(*rd_rel0_l))
+ comb += self.rd1.rel_o.eq(Cat(*rd_rel1_l))
comb += self.req_rel_o.eq(Cat(*req_rel_l))
comb += self.done_o.eq(Cat(*done_l))
comb += self.busy_o.eq(Cat(*busy_l))
comb += Cat(*godie_l).eq(self.go_die_i)
comb += Cat(*shadow_l).eq(self.shadown_i)
- comb += Cat(*go_wr_l).eq(self.wr0.go) # XXX TODO
- comb += Cat(*go_rd_l0).eq(self.rd0.go)
- comb += Cat(*go_rd_l1).eq(self.rd1.go)
+ comb += Cat(*go_wr_l).eq(self.wr0.go_i) # XXX TODO
+ comb += Cat(*go_rd_l0).eq(self.rd0.go_i)
+ comb += Cat(*go_rd_l1).eq(self.rd1.go_i)
comb += Cat(*issue_l).eq(self.issue_i)
# connect data register input/output
# protected by a single go_wr. multi-issue requires a bus
# to be inserted here.
if self.units:
- data_o = ortreereduce(self.units, "data_o")
- comb += self.data_o.eq(data_o)
+ o_data = ortreereduce(self.units, "o_data")
+ comb += self.o_data.eq(o_data)
if self.ldstmode:
addr_o = ortreereduce(self.units, "addr_o")
comb += self.addr_o.eq(addr_o)
for i, alu in enumerate(self.units):
comb += alu.src1_i.eq(self.src1_i)
comb += alu.src2_i.eq(self.src2_i)
+ # temporary: set read mask to 0b111111111
+ if hasattr(alu, "rdmaskn"):
+ with m.If(alu.busy_o):
+ comb += alu.rdmaskn.eq(-1)
if not self.ldstmode:
return m
# LD/ST Units
units = []
for i in range(n_ldsts):
- pi = l0.l0.dports[i].pi
+ pi = l0.l0.dports[i]
units.append(LDSTCompUnit(pi, rwid, awid=48))
CompUnitsBase.__init__(self, rwid, units, ldstmode=True)
# Int ALUs
alus = []
- for i in range(n_alus):
- alus.append(ALU(rwid))
units = []
- for alu in alus:
- aluopwid = 3 # extra bit for immediate mode
- units.append(MultiCompUnit(rwid, alu, CompALUOpSubset))
+ for i in range(n_alus):
+ fu = ALUFunctionUnit(i)
+ units.append(fu)
+ alus.append(fu.alu)
CompUnitsBase.__init__(self, rwid, units)
wpnd.append(Signal(nf, name="wr_dst%d_pend_o" %
j, reset_less=True))
- self.dest_i = Array(dst) # Dest in (top)
- self.src_i = Array(src) # oper in (top)
+ self.dest_i = dst # Dest in (top)
+ self.src_i = src # oper in (top)
# for Register File Select Lines (horizontal), per-reg
- self.dst_rsel_o = Array(dsel) # dest reg (bot)
- self.src_rsel_o = Array(rsel) # src reg (bot)
+ self.dst_rsel_o = dsel # dest reg (bot)
+ self.src_rsel_o = rsel # src reg (bot)
- self.go_rd_i = Array(rd)
- self.go_wr_i = Array(wr)
+ self.go_rd_i = rd
+ self.go_wr_i = wr
self.go_die_i = Signal(n_int_alus, reset_less=True)
self.fn_issue_i = Signal(n_int_alus, reset_less=True)
self.fpregs = RegFileArray(rwid, n_regs)
# Memory (test for now)
- self.l0 = TstL0CacheBuffer()
+ pspec = TestMemPspec(ldst_ifacetype='testpi',
+ addr_wid=48,
+ mask_wid=8,
+ reg_wid=64)
+ dut = TstL0CacheBuffer(pspec)
+ self.l0 = TstL0CacheBuffer(pspec)
# issue q needs to get at these
self.aluissue = IssueUnitGroup(2)
]
# take these to outside (issue needs them)
- comb += cua.op.eq_from_execute1(self.instr)
+ comb += cua.op.eq_from_execute1(self.instr.do)
comb += cub.oper_i.eq(self.br_oper_i)
comb += cub.imm_i.eq(self.br_imm_i)
- comb += cul.op.eq_from_execute1(self.instr)
+ comb += cul.op.eq_from_execute1(self.instr.do)
# TODO: issueunit.f (FP)
# Group Picker... done manually for now.
go_rd_o = ipick1.go_rd_o
+ delay_pick_l = []
go_wr_o = ipick1.go_wr_o
go_rd_i = intfus.go_rd_i
go_wr_i = intfus.go_wr_i
rrel_o = cu.rd_rel_o
rqrl_o = cu.req_rel_o
for i in range(fu_n_src):
- comb += ipick1.rd_rel_i[i][0:n_intfus].eq(rrel_o[i][0:n_intfus])
+ # connect with a delay so that src data arrives at the right time
+ pick = Signal(n_intfus, name="pick_%d" % i)
+ delay_pick = Signal(n_intfus, name="dp_%d" % i)
+ rp = Signal(n_intfus, name="rp_%d" % i)
+ comb += pick[0:n_intfus].eq(rrel_o[i][0:n_intfus] & ~delay_pick)
+ comb += ipick1.rd_rel_i[i][0:n_intfus].eq(pick[0:n_intfus])
comb += ipick1.readable_i[i][0:n_intfus].eq(int_rd_o[0:n_intfus])
+ sync += delay_pick.eq(rp)
+ comb += rp.eq(go_rd_o[i])
+ delay_pick_l.append(delay_pick)
int_wr_o = intfus.writable_o
for i in range(fu_n_dst):
# XXX FIXME: rqrl_o[i] here
# branch is active (TODO: a better signal: this is over-using the
# go_write signal - actually the branch should not be "writing")
with m.If(br1.go_wr_i):
- sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+ sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
sync += bspec.active_i.eq(0)
comb += bspec.br_i.eq(1)
# branch occurs if data == 1, failed if data == 0
- comb += bspec.br_ok_i.eq(br1.data_o == 1)
+ comb += bspec.br_ok_i.eq(br1.o_data == 1)
for i in range(n_intfus):
# *expected* direction of the branch matched against *actual*
comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
comb += int_src2.ren.eq(intfus.src_rsel_o[1])
# connect ALUs to regfile
- comb += int_dest.data_i.eq(cu.data_o)
- comb += cu.src1_i.eq(int_src1.data_o)
- comb += cu.src2_i.eq(int_src2.data_o)
+ comb += int_dest.i_data.eq(cu.o_data)
+ comb += cu.src1_i.eq(int_src1.o_data)
+ comb += cu.src2_i.eq(int_src2.o_data)
# connect ALU Computation Units
for i in range(fu_n_src):
- comb += cu.go_rd_i[i][0:n_intfus].eq(go_rd_o[i][0:n_intfus])
+ comb += cu.go_rd_i[i][0:n_intfus].eq(delay_pick_l[i][0:n_intfus])
for i in range(fu_n_dst):
comb += cu.go_wr_i[i][0:n_intfus].eq(go_wr_o[i][0:n_intfus])
comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
self.n_regs = n_regs
mqbits = unsigned(int(log(qlen) / log(2))+2)
- self.p_add_i = Signal(mqbits) # instructions to add (from data_i)
- self.p_ready_o = Signal() # instructions were added
- self.data_i = Instruction._nq(n_in, "data_i")
+ self.p_add_i = Signal(mqbits) # instructions to add (from i_data)
+ self.p_o_ready = Signal() # instructions were added
+ self.i_data = Instruction._nq(n_in, "i_data")
self.busy_o = Signal(reset_less=True) # at least one CU is busy
self.qlen_o = Signal(mqbits, reset_less=True)
# link up instruction queue
comb += iq.p_add_i.eq(self.p_add_i)
- comb += self.p_ready_o.eq(iq.p_ready_o)
+ comb += self.p_o_ready.eq(iq.p_o_ready)
for i in range(self.n_in):
- comb += eq(iq.data_i[i], self.data_i[i])
+ comb += eq(iq.i_data[i], self.i_data[i])
# take instruction and process it. note that it's possible to
# "inspect" the queue contents *without* actually removing the
# "resetting" done above (insn_i=0) could be re-ASSERTed.
with m.If(iq.qlen_o != 0):
# get the operands and operation
- instr = iq.data_o[0]
- imm = instr.imm_data.data
+ instr = iq.o_data[0]
+ imm = instr.do.imm_data.data
dest = instr.write_reg.data
src1 = instr.read_reg1.data
src2 = instr.read_reg2.data
- op = instr.insn_type
- fu = instr.fn_unit
- opi = instr.imm_data.ok # immediate set
+ op = instr.do.insn_type
+ fu = instr.do.fn_unit
+ opi = instr.do.imm_data.ok # immediate set
# set the src/dest regs
comb += sc.int_dest_i.eq(dest)
return m
def __iter__(self):
- yield self.p_ready_o
- for o in self.data_i:
+ yield self.p_o_ready
+ for o in self.i_data:
yield from list(o)
yield self.p_add_i
sendlen = 1
for idx, instr in enumerate(instrs):
- yield dut.data_i[idx].eq(instr)
- insn_type = yield instr.insn_type
- fn_unit = yield instr.fn_unit
+ yield dut.i_data[idx].eq(instr)
+ insn_type = yield instr.do.insn_type
+ fn_unit = yield instr.do.fn_unit
print("senddata ", idx, insn_type, fn_unit, instr)
yield dut.p_add_i.eq(sendlen)
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
while not o_p_ready:
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
yield dut.p_add_i.eq(0)
dest = instr['write_reg']
insn_type = instr['insn_type']
fn_unit = instr['fn_unit']
- yield dut.data_i[idx].insn_type.eq(insn_type)
- yield dut.data_i[idx].fn_unit.eq(fn_unit)
- yield dut.data_i[idx].read_reg1.data.eq(reg1)
- yield dut.data_i[idx].read_reg1.ok.eq(1) # XXX TODO
- yield dut.data_i[idx].read_reg2.data.eq(reg2)
- yield dut.data_i[idx].read_reg2.ok.eq(1) # XXX TODO
- yield dut.data_i[idx].write_reg.data.eq(dest)
- yield dut.data_i[idx].write_reg.ok.eq(1) # XXX TODO
- yield dut.data_i[idx].imm_data.data.eq(imm)
- yield dut.data_i[idx].imm_data.ok.eq(op_imm)
- di = yield dut.data_i[idx]
- print("senddata %d %x" % (idx, di))
+ yield dut.i_data[idx].do.insn_type.eq(insn_type)
+ yield dut.i_data[idx].do.fn_unit.eq(fn_unit)
+ yield dut.i_data[idx].read_reg1.data.eq(reg1)
+ yield dut.i_data[idx].read_reg1.ok.eq(1) # XXX TODO
+ yield dut.i_data[idx].read_reg2.data.eq(reg2)
+ yield dut.i_data[idx].read_reg2.ok.eq(1) # XXX TODO
+ yield dut.i_data[idx].write_reg.data.eq(dest)
+ yield dut.i_data[idx].write_reg.ok.eq(1) # XXX TODO
+ yield dut.i_data[idx].do.imm_data.data.eq(imm)
+ yield dut.i_data[idx].do.imm_data.ok.eq(op_imm)
+ #di = yield dut.i_data[idx]
+ #print("senddata %d %x" % (idx, di))
yield dut.p_add_i.eq(sendlen)
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
while not o_p_ready:
yield
- o_p_ready = yield dut.p_ready_o
+ o_p_ready = yield dut.p_o_ready
yield dut.p_add_i.eq(0)
]
- with Program(lst) as program:
+ with Program(lst, bigendian=False) as program:
gen = program.generate_instructions()
# issue instruction(s), wait for issue to be free before proceeding
0, 0, (0, 0)))
instrs.append((5, 3, 3, MicrOp.OP_ADD, Function.ALU,
0, 0, (0, 0)))
- if False:
+ if True:
instrs.append((3, 5, 5, MicrOp.OP_MUL_L64, Function.ALU,
1, 7, (0, 0)))
if False:
instrs.append((6, 7, 7, 0, 0, (0, 0)))
# issue instruction(s), wait for issue to be free before proceeding
+ print("instructions", instrs)
for i, instr in enumerate(instrs):
- print(i, instr)
+ print("issue instruction", i, instr)
src1, src2, dest, op, fn_unit, opi, imm, (br_ok, br_fail) = instr
print("instr %d: (%d, %d, %d, %s, %s, %d, %d)" %
with open("test_scoreboard6600.il", "w") as f:
f.write(vl)
- run_simulation(m, power_sim(m, dut, pdecode2, instruction, alusim),
- vcd_name='test_powerboard6600.vcd')
+ #run_simulation(m, power_sim(m, dut, pdecode2, instruction, alusim),
+ # vcd_name='test_powerboard6600.vcd')
- # run_simulation(dut, scoreboard_sim(dut, alusim),
- # vcd_name='test_scoreboard6600.vcd')
+ run_simulation(dut, scoreboard_sim(dut, alusim),
+ vcd_name='test_scoreboard6600.vcd')
# run_simulation(dut, scoreboard_branch_sim(dut, alusim),
# vcd_name='test_scoreboard6600.vcd')
src2 = self.regs[src2] & maxbits
if op == MicrOp.OP_ADD:
val = src1 + src2
+ print(" add src1, src2", src1, src2, val)
elif op == MicrOp.OP_MUL_L64:
val = src1 * src2
- print("mul src1, src2", src1, src2, val)
+ print(" mul src1, src2", src1, src2, val)
elif op == ISUB:
val = src1 - src2
+ print(" sub src1, src2", src1, src2, val)
elif op == ISHF:
val = src1 >> (src2 & maxbits)
elif op == IBGT:
--- /dev/null
+def b(x): # byte-reverse function
+ return int.from_bytes(x.to_bytes(8, byteorder='little'),
+ byteorder='big', signed=False)
+
+test1 = {
+ 0x10000: # PARTITION_TABLE_2
+ # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+ b(0x800000000100000b),
+
+ 0x30000: # RADIX_ROOT_PTE
+ # V = 1 L = 0 NLB = 0x400 NLS = 9
+ b(0x8000000000040009),
+
+ 0x40000: # RADIX_SECOND_LEVEL
+ # V = 1 L = 1 SW = 0 RPN = 0
+ # R = 1 C = 1 ATT = 0 EAA 0x3
+ b(0xc000000000000183),
+
+ 0x1000000: # PROCESS_TABLE_3
+ # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+ b(0x40000000000300ad),
+
+ #0x10004: 0
+
+}
+
+
+# executable permission is barred here (EAA=0x2)
+test2 = {
+ 0x10000: # PARTITION_TABLE_2
+ # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+ b(0x800000000100000b),
+
+ 0x30000: # RADIX_ROOT_PTE
+ # V = 1 L = 0 NLB = 0x400 NLS = 9
+ b(0x8000000000040009),
+
+ 0x40000: # RADIX_SECOND_LEVEL
+ # V = 1 L = 1 SW = 0 RPN = 0
+ # R = 1 C = 1 ATT = 0 EAA 0x2
+ b(0xc000000000000182),
+
+ 0x1000000: # PROCESS_TABLE_3
+ # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+ b(0x40000000000300ad),
+
+ #0x10004: 0
+
+}
+
+
+# microwatt mmu.bin first part of test 2. PRTBL must be set to 0x12000, PID to 1
+microwatt_test2 = {
+ 0x13920: 0x86810000000000c0, # leaf node
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ 0x8108: 0x0000000badc0ffee, # memory to be looked up
+ }
+
+microwatt_test4 = {
+ 0x13858: 0x86a10000000000c0, # leaf node
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+}
+
+# microwatt mmu.bin test 5: a misaligned read which crosses over to a TLB that
+# is not valid. must attempt a 64-bit read at address 0x39fffd to trigger
+
+microwatt_test5 = {
+ 0x13cf8: 0x86b10000000000c0, # leaf, covers up to 0x39ffff
+ 0x10008: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ 0x39fff8: 0x0123456badc0ffee, # to be looked up (should fail)
+ 0x400000: 0x0123456badc0ffee, # not page-mapped
+}
+
+# linux kernel 5.7 first MMU enable
+"""
+ rd @ 000bf803 di b000000000001033 sel ff 3.......
+ rd @ 000bf804 di 0 sel ff ........
+ rd @ 000bf805 di 0 sel ff ........
+ rd @ 000bf806 di 10000 sel ff ........
+ rd @ 000bf807 di c0000000005fc380 sel ff ........
+ rd @ 000bf800 di 80000000 sel ff ........
+ rd @ 000bf801 di c00000000059d400 sel ff ..Y.....
+ rd @ 000bf802 di c000000000000000 sel ff ........
+pc a588 insn 7c7a03a6 msr a000000000000003
+pc a58c insn 7c9b03a6 msr a000000000000003
+pc a590 insn 4c000024 msr a000000000000003
+pc a598 insn f82d0190 msr b000000000000033
+ rd @ 01c00000 di ad005c0000000040 sel ff ........
+ rd @ 01c00001 di 0 sel ff ........
+ rd @ 01c00002 di 0 sel ff ........
+ rd @ 01c00003 di 0 sel ff ........
+ rd @ 01c00004 di 0 sel ff ........
+ rd @ 01c00005 di 0 sel ff ........
+ rd @ 01c00006 di 0 sel ff ........
+ rd @ 01c00007 di 0 sel ff ........
+ rd @ 000b8000 di 9e0ff0f00000080 sel ff ........
+ rd @ 000b8001 di 0 sel ff ........
+ rd @ 000b8002 di 0 sel ff ........
+ rd @ 000b8003 di 0 sel ff ........
+ rd @ 000b8004 di 0 sel ff ........
+ rd @ 000b8005 di 0 sel ff ........
+ rd @ 000b8006 di 0 sel ff ........
+ rd @ 000b8007 di 0 sel ff ........
+ rd @ 01fffc00 di 9d0ff0f00000080 sel ff ........
+ rd @ 01fffc01 di 0 sel ff ........
+ rd @ 01fffc02 di 0 sel ff ........
+ rd @ 01fffc03 di 0 sel ff ........
+ rd @ 01fffc04 di 0 sel ff ........
+ rd @ 01fffc05 di 0 sel ff ........
+ rd @ 01fffc06 di 0 sel ff ........
+ rd @ 01fffc07 di 0 sel ff ........
+ rd @ 01fffa00 di 8f010000000000c0 sel ff ........
+ rd @ 01fffa01 di 8f012000000000c0 sel ff ........
+ rd @ 01fffa02 di 8f014000000000c0 sel ff ........
+ rd @ 01fffa03 di 8e016000000000c0 sel ff ........
+ rd @ 01fffa04 di 8e018000000000c0 sel ff ........
+ rd @ 01fffa05 di 8e01a000000000c0 sel ff ........
+ rd @ 01fffa06 di 8e01c000000000c0 sel ff ........
+ rd @ 01fffa07 di 8e01e000000000c0 sel ff ........
+"""
+
+microwatt_linux_5_7_boot = {
+ 0x000bf803<<3: 0xb000000000001033,
+ 0x000bf804<<3: 0x0,
+ 0x000bf805<<3: 0x0,
+ 0x000bf806<<3: 0x10000,
+ 0x000bf807<<3: 0xc0000000005fc380,
+ 0x000bf800<<3: 0x80000000,
+ 0x000bf801<<3: 0xc00000000059d400,
+ 0x000bf802<<3: 0xc000000000000000,
+ 0x01c00000<<3: 0xad005c0000000040,
+ 0x01c00001<<3: 0x0,
+ 0x01c00002<<3: 0x0,
+ 0x01c00003<<3: 0x0,
+ 0x01c00004<<3: 0x0,
+ 0x01c00005<<3: 0x0,
+ 0x01c00006<<3: 0x0,
+ 0x01c00007<<3: 0x0,
+ 0x000b8000<<3: 0x09e0ff0f00000080,
+ 0x000b8001<<3: 0x0,
+ 0x000b8002<<3: 0x0,
+ 0x000b8003<<3: 0x0,
+ 0x000b8004<<3: 0x0,
+ 0x000b8005<<3: 0x0,
+ 0x000b8006<<3: 0x0,
+ 0x000b8007<<3: 0x0,
+ 0x01fffc00<<3: 0x09d0ff0f00000080,
+ 0x01fffc01<<3: 0x0,
+ 0x01fffc02<<3: 0x0,
+ 0x01fffc03<<3: 0x0,
+ 0x01fffc04<<3: 0x0,
+ 0x01fffc05<<3: 0x0,
+ 0x01fffc06<<3: 0x0,
+ 0x01fffc07<<3: 0x0,
+ 0x01fffa00<<3: 0x8f010000000000c0,
+ 0x01fffa01<<3: 0x8f012000000000c0,
+ 0x01fffa02<<3: 0x8f014000000000c0,
+ 0x01fffa03<<3: 0x8e016000000000c0,
+ 0x01fffa04<<3: 0x8e018000000000c0,
+ 0x01fffa05<<3: 0x8e01a000000000c0,
+ 0x01fffa06<<3: 0x8e01c000000000c0,
+ 0x01fffa07<<3: 0x8e01e000000000c0,
+}
# transaction parameters, passed via signals
self.delay = Signal(8)
self.data = Signal.like(self.port)
+ self.data_valid = False
# add ourselves to the simulation process list
sim.add_sync_process(self._process)
yield
yield Settle()
# read the transaction parameters
+ assert self.data_valid, "an unexpected operand was consumed"
delay = (yield self.delay)
data = (yield self.data)
# wait for `delay` cycles
yield self.port.eq(data)
yield self.count.eq(self.count + 1)
yield
+ self.data_valid = False
yield self.go_i.eq(0)
yield self.port.eq(0)
"""
yield self.data.eq(data)
yield self.delay.eq(delay)
+ self.data_valid = True
class ResultConsumer:
# transaction parameters, passed via signals
self.delay = Signal(8)
self.expected = Signal.like(self.port)
+ self.expecting = False
# add ourselves to the simulation process list
sim.add_sync_process(self._process)
yield
yield Settle()
# read the transaction parameters
+ assert self.expecting, "an unexpected result was produced"
delay = (yield self.delay)
expected = (yield self.expected)
# wait for `delay` cycles
"""
yield self.expected.eq(expected)
yield self.delay.eq(delay)
+ self.expecting = True
def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0, zero_a=0):
wrmask=[0, 1],
src_delays=[2, 0], dest_delays=[1, 0])
- # test combinatorial zero-delay operation
- # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
- # is zero-delay, and do a subtraction.
- # 5 - 2 = 3
- yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
- wrmask=[0, 1],
- src_delays=[0, 1], dest_delays=[2, 0])
# test all combinations of masked input ports
# NOP does not make any request nor response
yield from op.issue([5, 2], MicrOp.OP_NOP, [0, 0],
yield from op.issue([2, 0x80], MicrOp.OP_EXTSWSLI, [0xFF80, 0],
rdmaskn=[1, 0], wrmask=[0, 1],
src_delays=[1, 2], dest_delays=[1, 0])
+
+ # test combinatorial zero-delay operation
+ # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
+ # is zero-delay, and do a subtraction.
+ # 5 - 2 = 3
+ yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
+ wrmask=[0, 1],
+ src_delays=[0, 1], dest_delays=[2, 0])
+
# test with rc=1, so expect results on the CR output port
# 5 + 2 = 7
# 7 > 0 => CR = 0b100
('prev port', 'in', [
'op__sdir', 'p_data_i[7:0]', 'p_shift_i[7:0]',
({'submodule': 'p'},
- ['p_valid_i', 'p_ready_o'])]),
+ ['p_i_valid', 'p_o_ready'])]),
('next port', 'out', [
'n_data_o[7:0]',
({'submodule': 'n'},
- ['n_valid_o', 'n_ready_i'])])]),
- ('debug', {'module': 'top'},
+ ['n_o_valid', 'n_i_ready'])])]),
+ ('debug', {'module': 'bench'},
['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
write_gtkw(
"test_compunit_fsm1.gtkw",
"test_compunit_fsm1.vcd",
traces, style,
- module='top.cu'
+ module='bench.top.cu'
)
m = Module()
alu = Shifter(8)
('alu', {'submodule': 'alu'}, [
('prev port', 'in', [
'oper_i_None__insn_type', 'i1[15:0]',
- 'valid_i', 'ready_o']),
+ 'i_valid', 'o_ready']),
('next port', 'out', [
- 'alu_o[15:0]', 'valid_o', 'ready_i'])])]
+ 'alu_o[15:0]', 'o_valid', 'i_ready'])])]
write_gtkw("test_compunit_regspec3.gtkw",
"test_compunit_regspec3.vcd",
traces, style,
clk_period=1e-6,
- module='top.cu')
+ module='bench.top.cu')
inspec = [('INT', 'a', '0:15'),
('INT', 'b', '0:15'),
('alu', {'submodule': 'alu'}, [
('prev port', 'in', [
'op__insn_type', 'op__invert_in', 'a[15:0]', 'b[15:0]',
- 'valid_i', 'ready_o']),
+ 'i_valid', 'o_ready']),
('next port', 'out', [
- 'alu_o[15:0]', 'valid_o', 'ready_i',
+ 'alu_o[15:0]', 'o_valid', 'i_ready',
'alu_o_ok', 'alu_cr_ok'])]),
- ('debug', {'module': 'top'},
+ ('debug', {'module': 'bench'},
['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
write_gtkw("test_compunit_regspec1.gtkw",
"test_compunit_regspec1.vcd",
traces, style,
clk_period=1e-6,
- module='top.cu')
+ module='bench.top.cu')
inspec = [('INT', 'a', '0:15'),
('INT', 'b', '0:15')]
import unittest
from nmigen import Module
from nmigen.sim import Simulator
+from nmutil.gtkw import write_gtkw
+
+from openpower.consts import MSR
+from openpower.decoder.power_enums import MicrOp, LDSTMode
+
from soc.experiment.compldst_multi import LDSTCompUnit
from soc.experiment.pimem import PortInterface
+from soc.experiment.test.test_compalu_multi import OperandProducer
from soc.fu.ldst.pipe_data import LDSTPipeSpec
+class OpSim:
+ def __init__(self, dut, sim):
+ self.dut = dut
+ # create one operand producer for each input port
+ self.producers = list()
+ for i in range(len(dut.src_i)):
+ self.producers.append(OperandProducer(sim, dut, i))
+
+ def issue(self, op, ra=None, rb=None, rc=None,
+ zero_a=False, imm=None, update=False,
+ byterev=True, signext=False,
+ data_len=2, msr_pr=0,
+ delays=None):
+ assert zero_a == (ra is None), \
+ "ra and zero_a are mutually exclusive"
+ assert (rb is None) != (imm is None), \
+ "rb and imm are mutually exclusive"
+ if op == MicrOp.OP_STORE:
+ assert rc, "need source operand for store"
+ dut = self.dut
+ pi = dut.pi
+ producers = self.producers
+ if ra:
+ yield from producers[0].send(ra, delays['ra'])
+ if rb:
+ yield from producers[1].send(rb, delays['rb'])
+ if rc:
+ yield from producers[2].send(rc, delays['rc'])
+ yield dut.oper_i.insn_type.eq(op)
+ yield dut.oper_i.data_len.eq(data_len)
+ yield dut.oper_i.zero_a.eq(zero_a)
+ yield dut.oper_i.byte_reverse.eq(byterev)
+ yield dut.oper_i.sign_extend.eq(signext)
+ if imm is not None:
+ yield dut.oper_i.imm_data.data.eq(imm)
+ yield dut.oper_i.imm_data.ok.eq(1)
+ if update:
+ yield dut.oper_i.ldst_mode.eq(LDSTMode.update)
+ yield dut.oper_i.msr[MSR.PR].eq(msr_pr)
+ yield dut.issue_i.eq(1)
+ yield
+ yield dut.issue_i.eq(0)
+ # deactivate decoder inputs along with issue_i, so we can be sure they
+ # were latched at the correct cycle
+ yield dut.oper_i.insn_type.eq(0)
+ yield dut.oper_i.data_len.eq(0)
+ yield dut.oper_i.zero_a.eq(0)
+ yield dut.oper_i.byte_reverse.eq(0)
+ yield dut.oper_i.sign_extend.eq(0)
+ yield dut.oper_i.imm_data.data.eq(0)
+ yield dut.oper_i.imm_data.ok.eq(0)
+ yield dut.oper_i.ldst_mode.eq(LDSTMode.NONE)
+ yield dut.oper_i.msr[MSR.PR].eq(0)
+ while not (yield pi.addr.ok):
+ yield
+
+
+# FIXME: AttributeError: type object 'LDSTPipeSpec' has no attribute 'regspec'
+@unittest.skip('broken')
class TestLDSTCompUnit(unittest.TestCase):
def test_ldst_compunit(self):
m = Module()
pi = PortInterface(name="pi")
regspec = LDSTPipeSpec.regspec
- dut = LDSTCompUnit(pi, regspec)
+ dut = LDSTCompUnit(pi, regspec, name="ldst")
m.submodules.dut = dut
sim = Simulator(m)
sim.add_clock(1e-6)
+ op = OpSim(dut, sim)
+ self.write_gtkw()
def process():
- yield
+ yield from op.issue(MicrOp.OP_STORE, ra=1, rb=2, rc=3,
+ delays={'ra': 1, 'rb': 2, 'rc': 5})
sim.add_sync_process(process)
sim_writer = sim.write_vcd("test_ldst_compunit.vcd")
with sim_writer:
sim.run()
+ @classmethod
+ def write_gtkw(cls):
+ style = {'dec': {'base': 'dec'}}
+ traces = [
+ 'clk',
+ ('state latches', [
+ 'q_opc',
+ ('q_src[2:0]', {'bit': 2}),
+ ('q_src[2:0]', {'bit': 1}),
+ ('q_src[2:0]', {'bit': 0}),
+ 'q_alu', 'q_adr', 'qn_lod', 'q_sto',
+ 'q_wri', 'q_upd', 'q_rst', 'q_lsd'
+ ]),
+ ('operation', [
+ ('oper_i_ldst__insn_type', {'display': 'insn_type'}),
+ ('oper_i_ldst__ldst_mode', {'display': 'ldst_mode'}),
+ ('oper_i_ldst__zero_a', {'display': 'zero_a'}),
+ ('oper_i_ldst__imm_data__ok', {'display': 'imm_data_ok'}),
+ ('oper_i_ldst__imm_data__data[63:0]', 'dec',
+ {'display': 'imm_data_data'})
+ ]),
+ 'cu_issue_i', 'cu_busy_o',
+ ('address ALU', [
+ ('cu_rd__rel_o[2:0]', {'bit': 2}),
+ ('cu_rd__go_i[2:0]', {'bit': 2}),
+ ('src1_i[63:0]', 'dec'),
+ ('cu_rd__rel_o[2:0]', {'bit': 1}),
+ ('cu_rd__go_i[2:0]', {'bit': 1}),
+ ('src2_i[63:0]', 'dec'),
+ 'alu_valid', 'alu_ok', ('alu_o[63:0]', 'dec'),
+ 'cu_ad__rel_o', 'cu_ad__go_i',
+ 'pi_addr_i_ok', ('pi_addr_i[47:0]', 'dec'),
+ ]),
+ ('store operand', [
+ ('cu_rd__rel_o[2:0]', {'bit': 0}),
+ ('cu_rd__go_i[2:0]', {'bit': 0}),
+ ('src3_i[63:0]', 'dec'),
+ 'rd_done',
+ ]),
+ 'cu_st__rel_o', 'cu_st__go_i'
+ ]
+ write_gtkw("test_ldst_compunit.gtkw",
+ "test_ldst_compunit.vcd",
+ traces, style, module="top.dut")
+
if __name__ == '__main__':
unittest.main()
--- /dev/null
+# test case for LOAD / STORE Computation Unit using MMU
+
+from nmigen.sim import Simulator, Delay, Settle, Tick
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen.hdl.rec import Record, Layout
+
+from nmutil.latch import SRLatch, latchregister
+from nmutil.byterev import byte_reverse
+from nmutil.extend import exts
+from nmutil.util import wrap
+from soc.fu.regspec import RegSpecAPI
+
+from openpower.decoder.power_enums import MicrOp, Function, LDSTMode
+from soc.fu.ldst.ldst_input_record import CompLDSTOpSubset
+from openpower.decoder.power_decoder2 import Data
+from openpower.consts import MSR
+
+from soc.experiment.compalu_multi import go_record, CompUnitRecord
+from soc.experiment.l0_cache import PortInterface
+from soc.experiment.pimem import LDSTException
+from soc.experiment.compldst_multi import LDSTCompUnit, load, store
+from soc.config.test.test_loadstore import TestMemPspec
+
+from soc.experiment.mmu import MMU
+from nmutil.util import Display
+
+from soc.config.loadstore import ConfigMemoryPortInterface
+from soc.experiment.test import pagetables
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+
+
+########################################
+
+def wait_for_debug(sig, reason, wait=True, test1st=False):
+ v = (yield sig)
+ cnt = 0
+ print("wait for", reason, sig, v, wait, test1st)
+ if test1st and bool(v) == wait:
+ return
+ while True:
+ cnt = cnt + 1
+ if cnt > 15:
+ raise(Exception(reason))
+ break
+ yield
+ v = (yield sig)
+ #print("...wait for", sig, v)
+ if bool(v) == wait:
+ break
+
+def store_debug(dut, src1, src2, src3, imm, imm_ok=True, update=False,
+ byterev=True,dcbz=False):
+ print("cut here ======================================")
+ print("ST", src1, src2, src3, imm, imm_ok, update)
+ if dcbz:
+ yield dut.oper_i.insn_type.eq(MicrOp.OP_DCBZ)
+ else:
+ yield dut.oper_i.insn_type.eq(MicrOp.OP_STORE)
+ yield dut.oper_i.data_len.eq(2) # half-word
+ yield dut.oper_i.byte_reverse.eq(byterev)
+ yield dut.src1_i.eq(src1)
+ yield dut.src2_i.eq(src2)
+ yield dut.src3_i.eq(src3)
+ yield dut.oper_i.imm_data.data.eq(imm)
+ yield dut.oper_i.imm_data.ok.eq(imm_ok)
+ #guess: this one was removed -- yield dut.oper_i.update.eq(update)
+ yield dut.issue_i.eq(1)
+ yield
+ yield dut.issue_i.eq(0)
+
+ if imm_ok:
+ active_rel = 0b101
+ else:
+ active_rel = 0b111
+ if dcbz:
+ active_rel = 0b001 # may be wrong, verify
+
+ # wait for all active rel signals to come up
+ cnt = 0
+ while True:
+ rel = yield dut.rd.rel_o # guess: wrong in dcbz case
+ cnt = cnt + 1
+ print("waitActiveRel",cnt)
+ if cnt > 10:
+ raise(Exception("Error1"))
+ print("rel EQ active_rel ?",rel,active_rel)
+ if rel == active_rel:
+ break
+ yield
+ yield dut.rd.go_i.eq(active_rel)
+ yield
+ yield dut.rd.go_i.eq(0)
+
+ yield from wait_for_debug(dut.adr_rel_o, "addr valid",False, test1st=True)
+ # yield from wait_for(dut.adr_rel_o)
+ # yield dut.ad.go.eq(1)
+ # yield
+ # yield dut.ad.go.eq(0)
+
+ if update:
+ yield from wait_for_debug(dut.wr.rel_o[1],"update")
+ yield dut.wr.go.eq(0b10)
+ yield
+ addr = yield dut.addr_o
+ print("addr", addr)
+ yield dut.wr.go.eq(0)
+ else:
+ addr = None
+ print("not update ===============")
+
+ yield from wait_for_debug(dut.sto_rel_o,"sto_rel_o")
+ yield dut.go_st_i.eq(1)
+ yield
+ yield dut.go_st_i.eq(0)
+ yield from wait_for_debug(dut.busy_o,"not_busy" ,False)
+ ###wait_for(dut.stwd_mem_o)
+ yield
+ return addr
+
+# same thing as soc/src/soc/experiment/test/test_dcbz_pi.py
+def ldst_sim(dut):
+ yield dut.mmu.rin.prtbl.eq(0x1000000) # set process table
+ addr = 0x100e0
+ data = 0xFF #just a single byte for this test
+ #data = 0xf553b658ba7e1f51
+
+ yield from store(dut, addr, 0, data, 0)
+ yield
+ ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+ print(data,data_ok,ld_addr)
+ assert(ld_data==data)
+ yield
+
+ data = 0
+
+ print("doing dcbz/store with data 0 .....")
+ yield from store_debug(dut, addr, 0, data, 0, dcbz=True) #hangs
+
+ ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+ print(data,data_ok,ld_addr)
+ print("ld_data is")
+ print(ld_data)
+ assert(ld_data==data)
+ print("dzbz test passed")
+
+ wbget.stop = True # stop simulation
+
+########################################
+class TestLDSTCompUnitMMU(LDSTCompUnit):
+
+ def __init__(self, rwid, pspec):
+ # use a LoadStore1 here
+ cmpi = ConfigMemoryPortInterface(pspec)
+ self.cmpi = cmpi
+ ldst = cmpi.pi
+ self.l0 = ldst
+
+ self.mmu = MMU()
+ LDSTCompUnit.__init__(self, ldst.pi, rwid, 4)
+
+ def elaborate(self, platform):
+ m = LDSTCompUnit.elaborate(self, platform)
+ m.submodules.l0 = self.l0
+ m.submodules.mmu = self.mmu
+ # link addr-go direct to rel
+ m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+
+ # link mmu and dcache together
+ dcache = self.l0.dcache
+ mmu = self.mmu
+ m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+ m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+ return m
+
+
+def test_scoreboard_mmu():
+
+ m = Module()
+
+ units = {}
+ pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=48,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+
+ dut = TestLDSTCompUnitMMU(16,pspec)
+
+ m.submodules.dut = dut
+
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ dut.mem = pagetables.test1
+ wbget.stop = False
+
+ sim.add_sync_process(wrap(ldst_sim(dut)))
+ sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
+ with sim.write_vcd('test_scoreboard_mmu.vcd'):
+ sim.run()
+
+########################################
+class TestLDSTCompUnitRegSpecMMU(LDSTCompUnit):
+
+ def __init__(self, pspec):
+ from soc.fu.ldst.pipe_data import LDSTPipeSpec
+ regspec = LDSTPipeSpec.regspec
+
+ # use a LoadStore1 here
+ cmpi = ConfigMemoryPortInterface(pspec)
+ self.cmpi = cmpi
+ ldst = cmpi.pi
+ self.l0 = ldst
+
+ self.mmu = MMU()
+ LDSTCompUnit.__init__(self, ldst.pi, regspec, 4)
+
+ def elaborate(self, platform):
+ m = LDSTCompUnit.elaborate(self, platform)
+ m.submodules.l0 = self.l0
+ m.submodules.mmu = self.mmu
+ # link addr-go direct to rel
+ m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+
+ # link mmu and dcache together
+ dcache = self.l0.dcache
+ mmu = self.mmu
+ m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+ m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+ return m
+
+def test_scoreboard_regspec_mmu():
+
+ m = Module()
+
+ units = {}
+ pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=48,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+
+ dut = TestLDSTCompUnitRegSpecMMU(pspec)
+
+ m.submodules.dut = dut
+
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ dut.mem = pagetables.test1
+ wbget.stop = False
+
+ sim.add_sync_process(wrap(ldst_sim(dut)))
+ sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
+ with sim.write_vcd('test_scoreboard_regspec_mmu.vcd'):
+ sim.run()
+
+if __name__ == '__main__':
+ test_scoreboard_regspec_mmu()
+ test_scoreboard_mmu()
--- /dev/null
+# test case for LOAD / STORE Computation Unit using MMU
+
+from nmigen.back.pysim import Simulator, Delay, Settle, Tick
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen.hdl.rec import Record, Layout
+
+from nmutil.latch import SRLatch, latchregister
+from nmutil.byterev import byte_reverse
+from nmutil.extend import exts
+from nmutil.util import wrap
+from soc.fu.regspec import RegSpecAPI
+
+from openpower.decoder.power_enums import MicrOp, Function, LDSTMode
+from soc.fu.ldst.ldst_input_record import CompLDSTOpSubset
+from openpower.decoder.power_decoder2 import Data
+from openpower.consts import MSR
+
+from soc.experiment.compalu_multi import go_record, CompUnitRecord
+from soc.experiment.l0_cache import PortInterface
+from soc.experiment.pimem import LDSTException
+from soc.experiment.compldst_multi import LDSTCompUnit, load, store
+from soc.config.test.test_loadstore import TestMemPspec
+
+from soc.experiment.mmu import MMU
+from nmutil.util import Display
+
+from soc.config.loadstore import ConfigMemoryPortInterface
+from soc.experiment.test import pagetables
+from soc.experiment.test.test_wishbone import wb_get
+
+# new unit added to this test case
+from soc.fu.mmu.pipe_data import MMUPipeSpec
+from soc.fu.mmu.fsm import FSMMMUStage
+
+# for sending instructions to the FSM
+from openpower.consts import MSR
+from openpower.decoder.power_fields import DecodeFields
+from openpower.decoder.power_fieldsn import SignalBitRange
+from openpower.decoder.power_decoder2 import decode_spr_num
+from openpower.decoder.power_enums import MicrOp
+
+
+def test_TLBIE(dut):
+ yield dut.fsm.p.i_data.ctx.op.eq(MicrOp.OP_TLBIE)
+ yield dut.fsm.p.valid_i.eq(1)
+ yield
+ yield dut.fsm.p.valid_i.eq(0)
+ yield
+ yield
+ yield
+ yield
+ yield Display("OP_TLBIE test done")
+
+
+def ldst_sim(dut):
+ yield dut.mmu.rin.prtbl.eq(0x1000000) # set process table
+ addr = 0x100e0
+ data = 0xFF # just a single byte for this test
+ #data = 0xf553b658ba7e1f51
+
+ yield from store(dut, addr, 0, data, 0)
+ yield
+ ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+ print(data, data_ok, ld_addr)
+ assert(ld_data == data)
+ yield
+ yield from test_TLBIE(dut)
+
+ """
+ -- not testing dzbz here --
+ data = 0
+
+ print("doing dcbz/store with data 0 .....")
+ yield from store_debug(dut, addr, 0, data, 0, dcbz=True) #hangs
+
+ ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+ print(data,data_ok,ld_addr)
+ print("ld_data is")
+ print(ld_data)
+ assert(ld_data==data)
+ print("dzbz test passed")
+ """
+
+ dut.stop = True # stop simulation
+
+########################################
+
+
+class TestLDSTCompUnitMMUFSM(LDSTCompUnit):
+
+ def __init__(self, rwid, pspec):
+ from soc.experiment.l0_cache import TstL0CacheBuffer
+ self.l0 = l0 = TstL0CacheBuffer(pspec)
+ pi = l0.l0.dports[0]
+ LDSTCompUnit.__init__(self, pi, rwid, 4)
+
+ def elaborate(self, platform):
+ m = LDSTCompUnit.elaborate(self, platform)
+ m.submodules.l0 = self.l0
+ # link addr-go direct to rel
+ m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+ return m
+
+
+def test_scoreboard_mmu():
+
+ units = {}
+ pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=48,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+
+ dut = TestLDSTCompUnit(16, pspec)
+ vl = rtlil.convertMMUFSM(dut, ports=dut.ports())
+ with open("test_ldst_comp_mmu1.il", "w") as f:
+ f.write(vl)
+
+ run_simulation(dut, ldst_sim(dut), vcd_name='test_ldst_comp.vcd')
+
+########################################
+
+
+class TestLDSTCompUnitRegSpecMMUFSM(LDSTCompUnit):
+
+ def __init__(self, pspec):
+ from soc.experiment.l0_cache import TstL0CacheBuffer
+ from soc.fu.ldst.pipe_data import LDSTPipeSpec
+ regspec = LDSTPipeSpec.regspec
+
+ # use a LoadStore1 here
+
+ cmpi = ConfigMemoryPortInterface(pspec)
+ self.cmpi = cmpi
+ ldst = cmpi.pi
+ self.l0 = ldst
+
+ self.mmu = MMU()
+
+ pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
+ self.fsm = FSMMMUStage(pipe_spec)
+
+ self.fsm.set_ldst_interface(ldst)
+
+ LDSTCompUnit.__init__(self, ldst.pi, regspec, 4)
+
+ def elaborate(self, platform):
+ m = LDSTCompUnit.elaborate(self, platform)
+ m.submodules.l0 = self.l0
+ m.submodules.mmu = self.mmu
+ m.submodules.fsm = self.fsm
+ # link addr-go direct to rel
+ m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+
+ # link mmu and dcache together
+ dcache = self.l0.dcache
+ mmu = self.mmu
+ m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+ m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+ return m
+
+
+def test_scoreboard_regspec_mmufsm():
+
+ m = Module()
+
+ units = {}
+ pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=48,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+
+ dut = TestLDSTCompUnitRegSpecMMUFSM(pspec)
+
+ m.submodules.dut = dut
+
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ dut.mem = pagetables.test1
+ dut.stop = False
+
+ sim.add_sync_process(wrap(ldst_sim(dut))) # rename ?
+ sim.add_sync_process(wrap(wb_get(dut)))
+ with sim.write_vcd('test_scoreboard_regspec_mmufsm.vcd'):
+ sim.run()
+
+
+if __name__ == '__main__':
+ test_scoreboard_regspec_mmufsm()
+ # only one test for now -- test_scoreboard_mmu()
yield
-def test_dcache(mem, test_fn, test_name):
+def tst_dcache(mem, test_fn, test_name):
dut = DCache()
memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
m.submodules.dcache = dut
m.submodules.sram = sram
- m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
- m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
- m.d.comb += sram.bus.we.eq(dut.wb_out.we)
- m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
- m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
- m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+ m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+ m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+ m.d.comb += sram.bus.we.eq(dut.bus.we)
+ m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+ m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+ m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
- m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
- m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+ m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+ m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
dcache_write_gtkw(test_name)
('d_out', [
'd_out_valid', 'd_out_data[63:0]'
]),
+ # XXX TODO, update to standard wishbone Signals (single "bus" Interface)
('wb_out', [
'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
for i in range(memsize):
mem.append(i)
- test_dcache(mem, dcache_regression_sim, "simpleregression")
+ tst_dcache(mem, dcache_regression_sim, "simpleregression")
mem = []
memsize = 256
for i in range(memsize):
mem.append(i)
- test_dcache(mem, dcache_random_sim, "random")
+ tst_dcache(mem, dcache_random_sim, "random")
mem = []
for i in range(1024):
mem.append((i*2)| ((i*2+1)<<32))
- test_dcache(mem, dcache_sim, "")
+ tst_dcache(mem, dcache_sim, "")
yield
-def test_dcache(mem, test_fn, test_name):
+def tst_dcache(mem, test_fn, test_name):
dut = DCache()
memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
m.submodules.dcache = dut
m.submodules.sram = sram
- m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
- m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
- m.d.comb += sram.bus.we.eq(dut.wb_out.we)
- m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
- m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
- m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+ m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+ m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+ m.d.comb += sram.bus.we.eq(dut.bus.we)
+ m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+ m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+ m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
- m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
- m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+ m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+ m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
dcache_write_gtkw(test_name)
for i in range(memsize):
mem.append(i)
- test_dcache(mem, dcache_regression_sim, "simpleregression")
+ tst_dcache(mem, dcache_regression_sim, "simpleregression")
mem = []
memsize = 256
for i in range(memsize):
mem.append(i)
- test_dcache(mem, dcache_random_sim, "random")
+ tst_dcache(mem, dcache_random_sim, "random")
mem = []
for i in range(1024):
mem.append((i*2)| ((i*2+1)<<32))
- test_dcache(mem, dcache_sim, "")
+ tst_dcache(mem, dcache_sim, "")
--- /dev/null
+"""DCache PortInterface Test
+ starting as a copy to test_ldst_pi.py
+"""
+
+from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal)
+from nmigen.cli import main
+from nmigen.cli import rtlil
+from nmutil.mask import Mask, masked
+from nmutil.util import Display
+from random import randint, seed
+from nmigen.sim import Simulator, Delay, Settle
+from nmutil.util import wrap
+
+from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.config.loadstore import ConfigMemoryPortInterface
+
+from soc.fu.ldst.loadstore import LoadStore1
+from soc.experiment.mmu import MMU
+from soc.experiment.test import pagetables
+
+from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
+
+wbget.stop = False
+
+
+def setup_mmu():
+
+ wbget.stop = False
+
+ pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+ imem_ifacetype='',
+ addr_wid=48,
+ #disable_cache=True, # hmmm...
+ mask_wid=8,
+ reg_wid=64)
+
+ m = Module()
+ comb = m.d.comb
+ cmpi = ConfigMemoryPortInterface(pspec)
+ m.submodules.ldst = ldst = cmpi.pi
+ m.submodules.mmu = mmu = MMU()
+ dcache = ldst.dcache
+
+ l_in, l_out = mmu.l_in, mmu.l_out
+ d_in, d_out = dcache.d_in, dcache.d_out
+
+ # link mmu and dcache together
+ m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+ m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+ # link ldst and MMU together
+ comb += l_in.eq(ldst.m_out)
+ comb += ldst.m_in.eq(l_out)
+
+ return m, cmpi
+
+### test case for dcbz
+
+def _test_dcbz_addr_100e0(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ addr = 0x100e0
+ data = 0xf553b658ba7e1f51
+
+ msr = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+ yield from pi_st(pi, addr, data, 8, msr)
+ yield
+
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
+ assert ld_data == 0xf553b658ba7e1f51
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
+ assert ld_data == 0xf553b658ba7e1f51
+
+ print("do_dcbz ===============")
+ yield from pi_st(pi, addr, data, 8, msr, is_dcbz=1)
+ print("done_dcbz ===============")
+ yield
+
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
+ print("ld_data after dcbz")
+ print(ld_data)
+ assert ld_data == 0
+
+ yield
+ wbget.stop = True
+
+def test_dcbz_addr_100e0():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.test1
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_dcbz_addr_100e0(m, mem)))
+ sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_dcbz_addr_zero.vcd'):
+ sim.run()
+
+if __name__ == '__main__':
+ test_dcbz_addr_100e0()
super().__init__(regwid, addrwid)
self.ldst = LDSTSplitter(32, 48, 4)
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
m.d.comb += self.ldst.addr_i.eq(addr)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.ldst.addr_i.eq(addr)
def set_wr_data(self, m, data, wen):
# TODO: memory ports
-def test_cache_single_run(dut):
+def tst_cache_single_run(dut):
#test single byte
addr = 0
data = 0xfeedface
dut = TestCachedMemoryPortInterface()
#LDSTSplitter(8, 48, 4) #data leng in bytes, address bits, select bits
- run_simulation(dut, test_cache_single_run(dut),
+ run_simulation(dut, tst_cache_single_run(dut),
vcd_name='test_cache_single.vcd')
from nmutil.mask import Mask, masked
from nmutil.util import Display
from random import randint, seed
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
if True:
from nmigen.back.pysim import Simulator, Delay, Settle
from soc.experiment.mmu import MMU
from nmigen.compat.sim import run_simulation
+from openpower.decoder.power_enums import MSRSpec
-stop = False
+msr_default = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+
+wbget.stop = False
def b(x): # byte-reverse function
return int.from_bytes(x.to_bytes(8, byteorder='little'),
byteorder='big', signed=False)
-def wb_get(wb, mem):
- """simulator process for getting memory load requests
- """
-
- global stop
- assert(stop==False)
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (wb.cyc)
- stb = yield (wb.stb)
- if cyc and stb:
- break
- yield
- addr = (yield wb.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- # read or write?
- we = (yield wb.we)
- if we:
- store = (yield wb.dat_w)
- sel = (yield wb.sel)
- data = mem.get(addr, 0)
- # note we assume 8-bit sel, here
- res = 0
- for i in range(8):
- mask = 0xff << (i*8)
- if sel & (1<<i):
- res |= store & mask
- else:
- res |= data & mask
- mem[addr] = res
- print (" DCACHE set %x mask %x data %x" % (addr, sel, res))
- else:
- data = mem.get(addr, 0)
- yield wb.dat_r.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
-
- yield wb.ack.eq(1)
- yield
- yield wb.ack.eq(0)
- yield
+#def dumpmem(mem,fn):
+# f = open(fn,"w")
+# for cell in mem:
+# f.write(str(hex(cell))+"="+str(hex(mem[cell]))+"\n")
def mmu_lookup(dut, addr):
mmu = dut.submodules.mmu
- global stop
print("pi_ld", hex(addr))
- data = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr_pr=1)
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr=msr_default)
print("pi_ld done, data", hex(data))
"""
# original test code kept for reference
- while not stop: # wait for dc_valid / err
+ while not wbget.stop: # wait for dc_valid / err
print("waiting for mmu")
l_done = yield (mmu.l_out.done)
l_err = yield (mmu.l_out.err)
def ldst_sim(dut):
mmu = dut.submodules.mmu
- global stop
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
data = yield from mmu_lookup(dut, addr+8)
assert data == 0xf001a5a5
- yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr_pr=1)
+ yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr=msr_default)
data = yield from mmu_lookup(dut, addr+4)
assert data == 0x10015a5a
yield
yield
- stop = True
+ wbget.stop = True
def setup_mmu():
- global stop
- stop = False
+ wbget.stop = False
pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
imem_ifacetype='',
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
# link mmu and dcache together
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
def ldst_sim_misalign(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
- data = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_pr=1)
- print ("misalign ld data", hex(data))
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_default)
+ print ("misalign ld data", data)
yield
- stop = True
+ wbget.stop = True
def test_misalign_mmu():
def ldst_sim_radixmiss(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(1<<40) # set process table
yield
- data = yield from pi_ld(dut.submodules.ldst.pi, 0x10000000, 8, msr_pr=1)
- print ("radixmiss ld data", hex(data))
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi,
+ 0x10000000, 8, msr=msr_default)
+ print ("radixmiss ld data", data)
yield
- stop = True
+ wbget.stop = True
def ldst_sim_dcache_regression(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
addr = 0x10000
- data = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr_pr=1)
- print ("=== dcache_regression ld data", hex(data))
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr=msr_default)
+ print ("=== dcache_regression ld data", data)
assert(data == 0xdeadbeef01234567)
yield
- stop = True
-
-def ldst_sim_dcache_random2(dut):
- mmu = dut.submodules.mmu
- pi = dut.submodules.ldst.pi
- global stop
- stop = False
-
- yield mmu.rin.prtbl.eq(0x1000000) # set process table
- yield
-
- memsize = 64
-
- refs = [
- ## random values from a failed test
- [0x100e0,0xf553b658ba7e1f51],
- [0x10150,0x12c95a730df1cee7],
- [0x10080,0x5a921ae06674cd81],
- [0x100f8,0x4fea5eab80090fa5],
- [0x10080,0xd481432d17a340be],
- [0x10060,0x8553fcf29526fb32],
- # [0x101d0,0x327c967c8be30ded],
- [0x101e0,0x8f15d8d05d25b151]
- ]
-
- for i in refs:
- addr = i[0]
- data = i[1]
-
- yield from pi_st(pi, addr, data, 8, msr_pr=1)
- yield
-
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
-
- print ("dcache_random values", hex(addr), hex(data), hex(ld_data), data==ld_data)
- assert(data==ld_data) ## investigate why this fails -- really seldom
-
- yield
- stop = True
+ wbget.stop = True
def ldst_sim_dcache_random(dut):
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
addr *= 8
addr += 0x10000
- yield from pi_st(pi, addr, data, 8, msr_pr=1)
+ yield from pi_st(pi, addr, data, 8, msr=msr_default)
yield
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
- print ("dcache_random values", hex(addr), hex(data), hex(ld_data), data==ld_data)
+ eq = (data==ld_data)
+ print ("dcache_random values", hex(addr), hex(data), hex(ld_data), eq)
assert(data==ld_data) ## investigate why this fails -- really seldom
yield
- stop = True
+ wbget.stop = True
def ldst_sim_dcache_first(dut): # this test is likely to fail
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
data = 0x8c5a3e460d71f0b4
# known to fail without bugfix in src/soc/fu/ldst/loadstore.py
- yield from pi_st(pi, addr, data, 8, msr_pr=1)
+ yield from pi_st(pi, addr, data, 8, msr=msr_default)
yield
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
print ("addr",addr)
print ("dcache_first ld data", hex(data), hex(ld_data))
assert(data==ld_data)
yield
- stop = True
+ wbget.stop = True
def test_radixmiss_mmu():
with sim.write_vcd('test_ldst_pi_random.vcd'):
sim.run()
+def ldst_sim_dcache_random2(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ memsize = 64
+
+ refs = [
+ ## random values from a failed test
+ #[0x100e0,0xf553b658ba7e1f51,0,0], ## 1
+ #[0x10150,0x12c95a730df1cee7,0,0], ## 2
+ #[0x10080,0x5a921ae06674cd81,0,0], ## 3
+ #[0x100f8,0x4fea5eab80090fa5,0,0], ## 4
+ #[0x10080,0xd481432d17a340be,0,0], ## 5
+ #[0x10060,0x8553fcf29526fb32,0,0], ## 6
+ [0x101d0,0x327c967c8be30ded,0,0], ## 7
+ [0x101e0,0x8f15d8d05d25b151,1,0] ## 8
+ #uncommenting line 7 will cause the original test not to fail
+
+ ]
+
+ c = 0
+ for i in refs:
+ addr = i[0]
+ data = i[1]
+ c1 = i[2]
+ c2 = i[3]
+
+ print("== write: wb_get")
+
+ for i in range(0,c1):
+ print("before_pi_st")
+ yield
+
+ yield from pi_st(pi, addr, data, 8, msr=msr_default)
+ yield
+
+ for i in range(0,c2):
+ print("before_pi_ld")
+ yield
+
+ print("== read: wb_get")
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
+
+ #dumpmem(mem,"/tmp/dumpmem"+str(c)+".txt")
+ #c += 1
+
+ eq = (data==ld_data)
+ print ("dcache_random values", hex(addr), hex(data), hex(ld_data), eq)
+ assert(data==ld_data) ## investigate why this fails -- really seldom
+
+ yield
+ wbget.stop = True
+
def test_dcache_random2():
m, cmpi = setup_mmu()
0x1000000: # PROCESS_TABLE_3
# RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
b(0x40000000000300ad),
+
+ ###0x101e0:0x8f15d8d05d25b152 ## flush cache -- then check again
}
# nmigen Simulation
sim = Simulator(m)
sim.add_clock(1e-6)
- sim.add_sync_process(wrap(ldst_sim_dcache_random2(m)))
+ sim.add_sync_process(wrap(ldst_sim_dcache_random2(m, mem)))
sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
with sim.write_vcd('test_ldst_pi_random2.vcd'):
sim.run()
from soc.experiment.mmu import MMU
from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
+msr_default = MSRSpec(pr=0, dr=0, sf=1) # 64 bit by default
-stop = False
+
+wbget.stop = False
def b(x): # byte-reverse function
return int.from_bytes(x.to_bytes(8, byteorder='little'),
byteorder='big', signed=False)
-def wb_get(wb, mem):
- """simulator process for getting memory load requests
- """
-
- global stop
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (wb.cyc)
- stb = yield (wb.stb)
- if cyc and stb:
- break
- yield
- addr = (yield wb.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- # read or write?
- we = (yield wb.we)
- if we:
- store = (yield wb.dat_w)
- sel = (yield wb.sel)
- data = mem.get(addr, 0)
- # note we assume 8-bit sel, here
- res = 0
- for i in range(8):
- mask = 0xff << (i*8)
- if sel & (1<<i):
- res |= store & mask
- else:
- res |= data & mask
- mem[addr] = res
- print (" DCACHE set %x mask %x data %x" % (addr, sel, res))
- else:
- data = mem.get(addr, 0)
- yield wb.dat_r.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
-
- yield wb.ack.eq(1)
- yield
- yield wb.ack.eq(0)
- yield
-
def setup_mmu():
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
# link mmu and dcache together
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
def ldst_sim_misalign(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
- data = yield from pi_ld(dut.submodules.ldst.pi, 0x1000, 4, msr_pr=1)
+ # load 8 bytes at aligned address
+ align_addr = 0x1000
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ align_addr, 8, msr=msr_default)
+ print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+ assert data == 0xdeadbeef01234567
+
+ # load 4 bytes at aligned address
+ align_addr = 0x1004
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ align_addr, 4, msr=msr_default)
+ print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+ assert data == 0xdeadbeef
+
+ # load 8 bytes at *mis*-aligned address which is still within
+ # the page
+ misalign_addr = 0x1004
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ misalign_addr, 8, msr=msr_default)
+
+ print ("ldst_sim_misalign", hex(data), exctype, exc)
+ assert data == 0xf001a5a5deadbeef
+
+ # load 8 bytes at *mis*-aligned address which is still within
+ # the page
+ misalign_addr = 0x1006
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ misalign_addr, 8, msr=msr_default)
+
+ print ("ldst_sim_misalign", hex(data), exctype, exc)
+ assert data == 0xf00ff001a5a5dead
+ wbget.stop = True
+ return
+
+ # load 8 bytes at *mis*-aligned address which is NOT within
+ # the page - TODO - work this out
+ misalign_addr = 0x10000004
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ misalign_addr, 8, msr=msr_default)
+ print ("ldst_sim_misalign", data, exctype, exc)
+ yield
+ dar = yield dut.submodules.ldst.dar
+ print ("DAR", hex(dar))
+ assert dar == misalign_addr
+ # check exception bits
+ assert exc.happened
+ assert exc.alignment
+ assert not exc.segment_fault
+ assert not exc.instr_fault
+ assert not exc.invalid
+ assert not exc.perm_error
+ assert not exc.rc_error
+ assert not exc.badtree
+
+ wbget.stop = True
def test_misalign_mmu():
--- /dev/null
+from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal,
+ Const)
+from nmigen.cli import main
+from nmigen.cli import rtlil
+from nmutil.mask import Mask, masked
+from nmutil.util import Display
+from random import randint, seed
+from nmigen.sim import Simulator, Delay, Settle
+from nmutil.util import wrap
+
+from soc.config.test.test_pi2ls import (pi_ld, pi_st, pi_ldst, wait_busy,
+ get_exception_info)
+#from soc.config.test.test_pi2ls import pi_st_debug
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.config.loadstore import ConfigMemoryPortInterface
+
+from soc.fu.ldst.loadstore import LoadStore1
+from soc.experiment.mmu import MMU
+from soc.experiment.test import pagetables
+
+from nmigen.compat.sim import run_simulation
+from random import random
+from openpower.test.wb_get import wb_get_classic
+from openpower.test import wb_get as wbget
+from openpower.exceptions import LDSTExceptionTuple
+
+from soc.config.test.test_fetch import read_from_addr
+from openpower.decoder.power_enums import MSRSpec
+
+
+def setup_mmu():
+
+ wbget.stop = False
+
+ pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+ imem_ifacetype='',
+ addr_wid=48,
+ #disable_cache=True, # hmmm...
+ mask_wid=8,
+ reg_wid=64)
+
+ m = Module()
+ comb = m.d.comb
+ cmpi = ConfigMemoryPortInterface(pspec)
+ m.submodules.ldst = ldst = cmpi.pi
+ m.submodules.mmu = mmu = MMU()
+ dcache = ldst.dcache
+ icache = ldst.icache
+
+ l_in, l_out = mmu.l_in, mmu.l_out
+ d_in, d_out = dcache.d_in, dcache.d_out
+ i_in, i_out = icache.i_in, icache.i_out # FetchToICache, ICacheToDecode
+
+ # link mmu, dcache and icache together
+ m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+ m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
+ m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+ # link ldst and MMU together
+ comb += l_in.eq(ldst.m_out)
+ comb += ldst.m_in.eq(l_out)
+
+ # add a debug status Signal: use "msg.str = "blah"
+ # then toggle with yield msg.eq(0); yield msg.eq(1)
+ debug_status = Signal(8, decoder=lambda _ : debug_status.str)
+ m.debug_status = debug_status
+ debug_status.str = ''
+
+ return m, cmpi
+
+
+def icache_read(dut,addr,priv,virt):
+
+ icache = dut.submodules.ldst.icache
+ i_in = icache.i_in
+ i_out = icache.i_out
+
+ yield i_in.priv_mode.eq(priv)
+ yield i_in.virt_mode.eq(virt)
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield i_in.stop_mark.eq(0)
+
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+ yield
+ yield
+
+ return nia, insn, valid, failed
+
+
+test_exceptions = True
+test_dcbz = True
+test_random = True
+
+
+def debug(dut, msg):
+ print ("set debug message", msg)
+ dut.debug_status.str = msg # set the message
+ yield dut.debug_status.eq(0) # trigger an update
+ yield dut.debug_status.eq(1)
+
+
+def _test_loadstore1_ifetch_iface(dut, mem):
+ """test_loadstore1_ifetch_iface
+
+ read in priv mode, non-virtual. tests the FetchUnitInterface
+
+ """
+
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ wbget.stop = False
+
+ print("=== test loadstore instruction (real) ===")
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ yield from debug(dut, "real mem instruction")
+ # set address to 0x8, update mem[0x8] to 01234 | 0x5678<<32
+ # (have to do 64-bit writes into the dictionary-memory-emulated-thing)
+ addr = 8
+ addr2 = 12
+ expected_insn2 = 0x5678
+ expected_insn = 0x1234
+ mem[addr] = expected_insn | expected_insn2<<32
+
+ yield i_in.priv_mode.eq(1)
+ insn = yield from read_from_addr(icache, addr, stall=False)
+
+ nia = yield i_out.nia # NO, must use FetchUnitInterface
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ print("=== test loadstore instruction (2nd, real) ===")
+ yield from debug(dut, "real mem 2nd (addr 0xc)")
+
+ insn2 = yield from read_from_addr(icache, addr2, stall=False)
+
+ nia = yield i_out.nia # NO, must use FetchUnitInterface
+ print ("fetched %x from addr2 %x" % (insn2, nia))
+ assert insn2 == expected_insn2
+
+ print("=== test loadstore instruction (done) ===")
+
+ yield from debug(dut, "test done")
+ yield
+ yield
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ wbget.stop = True
+
+
+def write_mem2(mem, addr, i1, i2):
+ mem[addr] = i1 | i2<<32
+
+
+#TODO: use fetch interface here
+def lookup_virt(dut,addr):
+ icache = dut.submodules.ldst.icache
+ i_in = icache.i_in
+ i_out = icache.i_out
+ yield i_in.priv_mode.eq(0)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.stop_mark.eq(0)
+
+ yield icache.a_i_valid.eq(1)
+ yield icache.a_pc_i.eq(addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield icache.a_i_valid.eq(0)
+
+ return valid,failed
+
+
+def mmu_lookup(dut,addr):
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ yield from debug(dut, "instr fault "+hex(addr))
+ yield ldst.priv_mode.eq(0)
+ yield ldst.instr_fault.eq(1)
+ yield ldst.maddr.eq(addr)
+ yield
+ yield ldst.instr_fault.eq(0)
+ while True:
+ done = yield (ldst.done)
+ exc_info = yield from get_exception_info(pi.exc_o)
+ if done or exc_info.happened:
+ break
+ yield
+ yield
+ assert exc_info.happened == 0 # assert just before doing the fault set zero
+ yield ldst.instr_fault.eq(0)
+ yield from debug(dut, "instr fault done "+hex(addr))
+ yield
+ yield
+ yield
+
+
+def _test_loadstore1_ifetch_multi(dut, mem):
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ assert wbget.stop == False
+
+ print ("set process table")
+ yield from debug(dut, "set prtble")
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ # fetch instructions from multiple addresses
+ # should cope with some addresses being invalid
+ real_addrs = [0,4,8,0,8,4,0,0,12]
+ write_mem2(mem,0,0xF0,0xF4)
+ write_mem2(mem,8,0xF8,0xFC)
+
+ yield i_in.priv_mode.eq(1)
+ for addr in real_addrs:
+ yield from debug(dut, "real_addr "+hex(addr))
+ insn = yield from read_from_addr(icache, addr, stall=False)
+ nia = yield i_out.nia # NO, must use FetchUnitInterface
+ print ("TEST_MULTI: fetched %x from addr %x == %x" % (insn, nia,addr))
+ assert insn==0xF0+addr
+
+ # now with virtual memory enabled
+ yield i_in.virt_mode.eq(1)
+
+ virt_addrs = [0x10200,0x10204,0x10208,0x10200,
+ 0x102008,0x10204,0x10200,0x10200,0x10200C]
+
+ write_mem2(mem,0x10200,0xF8,0xFC)
+
+ for addr in virt_addrs:
+ yield from debug(dut, "virt_addr "+hex(addr))
+
+ valid, failed = yield from lookup_virt(dut,addr)
+ yield
+ print("TEST_MULTI: failed=",failed) # this is reported wrong
+ if failed==1: # test one first
+ yield from mmu_lookup(dut,addr)
+ valid, failed = yield from lookup_virt(dut,addr)
+ assert(valid==1)
+
+ wbget.stop = True
+
+
+def _test_loadstore1_ifetch(dut, mem):
+ """test_loadstore1_ifetch
+
+ this is quite a complex multi-step test.
+
+ * first (just because, as a demo) read in priv mode, non-virtual.
+ just like in experiment/icache.py itself.
+
+ * second, using the (usual) PTE for these things (which came originally
+ from gem5-experimental experiment/radix_walk_example.txt) do a
+ virtual-memory read through the *instruction* cache.
+ this is expected to FAIL
+
+ * third: mess about with the MMU, setting "iside" (instruction-side),
+ requesting an MMU RADIX LOOKUP. this triggers an itlb_load
+ (instruction-cache TLB entry-insertion)
+
+ * fourth and finally: retry the read of the instruction through i-cache.
+ this is now expected to SUCCEED
+
+ a lot going on.
+ """
+
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ wbget.stop = False
+
+ print("=== test loadstore instruction (real) ===")
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ # first virtual memory test
+
+ print ("set process table")
+ yield from debug(dut, "set prtble")
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ yield from debug(dut, "real mem instruction")
+ # set address to zero, update mem[0] to 01234
+ addr = 8
+ expected_insn = 0x1234
+ mem[addr] = expected_insn
+
+ yield i_in.priv_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit -- this one is different here
+ ##nia, insn, valid, failed = yield from icache_read(dut,addr,0,0)
+ ##assert(valid==0)
+ ##assert(failed==1)
+
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+ yield
+ yield
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ print("=== test loadstore instruction (virtual) ===")
+
+ # look up i-cache expecting it to fail
+
+ yield from debug(dut, "virtual instr req")
+ # set address to 0x10200, update mem[] to 5678
+ virt_addr = 0x10200
+ real_addr = virt_addr
+ expected_insn = 0x5678
+ mem[real_addr] = expected_insn
+
+ yield i_in.priv_mode.eq(0)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(virt_addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(virt_addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+
+ print ("failed?", "yes" if failed else "no")
+ assert failed == 1
+ yield
+ yield
+
+ print("=== test loadstore instruction (instruction fault) ===")
+
+ yield from debug(dut, "instr fault")
+
+ virt_addr = 0x10200
+
+ yield ldst.priv_mode.eq(0)
+ yield ldst.instr_fault.eq(1)
+ yield ldst.maddr.eq(virt_addr)
+ # still broken -- investigate
+ # msr = MSRSpec(pr=?, dr=?, sf=0)
+ # ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+ yield
+ yield ldst.instr_fault.eq(0)
+ while True:
+ done = yield (ldst.done)
+ exc_info = yield from get_exception_info(pi.exc_o)
+ if done or exc_info.happened:
+ break
+ yield
+ assert exc_info.happened == 0 # assert just before doing the fault set zero
+ yield ldst.instr_fault.eq(0)
+ yield
+ yield
+ yield
+
+ print("=== test loadstore instruction (try instruction again) ===")
+ yield from debug(dut, "instr virt retry")
+ # set address to 0x10200, update mem[] to 5678
+ virt_addr = 0x10200
+ real_addr = virt_addr
+ expected_insn = 0x5678
+
+ yield i_in.priv_mode.eq(0)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(virt_addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ """
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(virt_addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+ """
+
+ ## part 4
+ nia, insn, valid, failed = yield from icache_read(dut,virt_addr,0,1)
+
+ yield from debug(dut, "test done")
+ yield
+ yield
+
+ print ("failed?", "yes" if failed else "no")
+ assert failed == 0
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ wbget.stop = True
+
+
+def _test_loadstore1_invalid(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ wbget.stop = False
+
+ print("=== test invalid ===")
+
+ addr = 0
+ msr = MSRSpec(pr=1, dr=0, sf=0) # set problem-state
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data", ld_data, exctype, exc)
+ assert (exctype == "slow")
+ invalid = exc.invalid
+ assert (invalid == 1)
+
+ print("=== test invalid done ===")
+
+ wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test2(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x12000) # set process table
+ yield mmu.rin.pid.eq(0x1) # set PID=1
+ yield
+
+ addr = 0x124108
+ msr = MSRSpec(pr=1, dr=1, sf=1)
+
+ print("=== alignment error (ld) ===")
+
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data after mmu.bin test2")
+ print(ld_data)
+ assert ld_data == 0x0000000badc0ffee
+ assert exctype is None
+
+ wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test5(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x12000) # set process table
+ yield mmu.rin.pid.eq(0x1) # set PID=1
+ yield
+
+ addr = 0x39fffd
+ msr = MSRSpec(pr=1, dr=1, sf=1)
+
+ print("=== page-fault alignment error (ld) ===")
+
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data after mmu.bin test5")
+ print(ld_data)
+ print (exctype, exc)
+
+ wbget.stop = True
+
+
+def test_pi_ld_misalign(pi, addr, data_len, msr):
+ for i in range(0,data_len):
+ ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+ yield
+ assert exc is None # use "is None" not "== None"
+ print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+
+
+def test_pi_st_ld_misalign(pi, addr, data_len, msr):
+ data = 0x0102030405060708
+ for i in range(0, data_len):
+ exctype, exc = yield from pi_st(pi, addr+i, data, data_len, msr=msr)
+ print (exctype, exc)
+ assert exc is None # use "is None" not "== None"
+ ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+ yield
+ assert exc is None # use "is None" not "== None"
+ print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+ assert ld_data == data
+
+
+def _test_loadstore1_misalign(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x12000) # set process table
+ yield mmu.rin.pid.eq(0x1) # set PID=1
+ #yield
+
+ addr = 1
+ msr = MSRSpec(pr=0, dr=0, sf=1)
+
+ yield from test_pi_ld_misalign(pi,0,8,msr)
+
+ yield from test_pi_st_ld_misalign(pi,0,8,msr)
+
+ wbget.stop = True
+
+
+def _test_loadstore1(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ addr = 0x100e0
+ data = 0xf553b658ba7e1f51
+ msr = MSRSpec(pr=0, dr=0, sf=0)
+
+ if test_dcbz:
+ yield from pi_st(pi, addr, data, 8, msr=msr)
+ yield
+
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ assert ld_data == 0xf553b658ba7e1f51
+ assert exctype is None
+
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ assert ld_data == 0xf553b658ba7e1f51
+ assert exctype is None
+
+ print("do_dcbz ===============")
+ yield from pi_st(pi, addr, data, 8, msr=msr, is_dcbz=1)
+ print("done_dcbz ===============")
+ yield
+
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data after dcbz")
+ print(ld_data)
+ assert ld_data == 0
+ assert exctype is None
+
+ if test_exceptions:
+ print("=== alignment error (ld) ===")
+ addr = 0xFF100e0FF
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ if exc:
+ alignment = exc.alignment
+ happened = exc.happened
+ yield # wait for dsr to update
+ dar = yield ldst.dar
+ else:
+ alignment = 0
+ happened = 0
+ dar = 0
+ assert (happened == 1)
+ assert (alignment == 1)
+ assert (dar == addr)
+ assert (exctype == "fast")
+ yield from wait_busy(pi, debug="pi_ld_E_alignment_error")
+ # wait is only needed in case of in exception here
+ print("=== alignment error test passed (ld) ===")
+
+ # take some cycles in between so that gtkwave separates out
+ # signals
+ yield
+ yield
+ yield
+ yield
+
+ print("=== alignment error (st) ===")
+ addr = 0xFF100e0FF
+ exctype, exc = yield from pi_st(pi, addr,0, 8, msr=msr)
+ if exc:
+ alignment = exc.alignment
+ happened = exc.happened
+ else:
+ alignment = 0
+ happened = 0
+ assert (happened == 1)
+ assert (alignment==1)
+ assert (dar==addr)
+ assert (exctype == "fast")
+ #???? yield from wait_busy(pi, debug="pi_st_E_alignment_error")
+ # wait is only needed in case of in exception here
+ print("=== alignment error test passed (st) ===")
+ yield #FIXME hangs
+
+ if True:
+ print("=== no alignment error (ld) ===")
+ addr = 0x100e0
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data", ld_data, exctype, exc)
+ if exc:
+ alignment = exc.alignment
+ happened = exc.happened
+ else:
+ alignment = 0
+ happened = 0
+ assert (happened == 0)
+ assert (alignment == 0)
+ print("=== no alignment error done (ld) ===")
+
+ if test_random:
+ addrs = [0x456920,0xa7a180,0x299420,0x1d9d60]
+
+ for addr in addrs:
+ print("== RANDOM addr ==",hex(addr))
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data[RANDOM]",ld_data,exc,addr)
+ assert (exctype == None)
+
+ for addr in addrs:
+ print("== RANDOM addr ==",hex(addr))
+ exc = yield from pi_st(pi, addr,0xFF*addr, 8, msr=msr)
+ assert (exctype == None)
+
+ # readback written data and compare
+ for addr in addrs:
+ print("== RANDOM addr ==",hex(addr))
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data[RANDOM_READBACK]",ld_data,exc,addr)
+ assert (exctype == None)
+ assert (ld_data == 0xFF*addr)
+
+ print("== RANDOM addr done ==")
+
+ wbget.stop = True
+
+
+def _test_loadstore1_ifetch_invalid(dut, mem):
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ wbget.stop = False
+
+ print("=== test loadstore instruction (invalid) ===")
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ # first virtual memory test
+
+ print ("set process table")
+ yield from debug(dut, "set prtbl")
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ yield from debug(dut, "real mem instruction")
+ # set address to zero, update mem[0] to 01234
+ addr = 8
+ expected_insn = 0x1234
+ mem[addr] = expected_insn
+
+ yield i_in.priv_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield
+ valid = yield i_out.valid
+ nia = yield i_out.nia
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+
+ yield
+ yield
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ print("=== test loadstore instruction (virtual) ===")
+ yield from debug(dut, "virtual instr req")
+
+ # look up i-cache expecting it to fail
+
+ # set address to 0x10200, update mem[] to 5678
+ virt_addr = 0x10200
+ real_addr = virt_addr
+ expected_insn = 0x5678
+ mem[real_addr] = expected_insn
+
+ yield i_in.priv_mode.eq(1)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(virt_addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(virt_addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+
+ print ("failed?", "yes" if failed else "no")
+ assert failed == 1
+ yield
+ yield
+
+ print("=== test invalid loadstore instruction (instruction fault) ===")
+
+ yield from debug(dut, "instr fault (perm err expected)")
+ virt_addr = 0x10200
+
+ yield ldst.priv_mode.eq(0)
+ yield ldst.instr_fault.eq(1)
+ yield ldst.maddr.eq(virt_addr)
+ #ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+ yield
+ yield ldst.instr_fault.eq(0)
+ while True:
+ done = yield (ldst.done)
+ exc_info = yield from get_exception_info(pi.exc_o)
+ if done or exc_info.happened:
+ break
+ yield
+ assert exc_info.happened == 1 # different here as expected
+
+ # TODO: work out what kind of exception occurred and check it's
+ # the right one. we *expect* it to be a permissions error because
+ # the RPTE leaf node in pagetables.test2 is marked as "non-executable"
+ # but we also expect instr_fault to be set because it is an instruction
+ # (iside) lookup
+ print (" MMU lookup exception type?")
+ for fname in LDSTExceptionTuple._fields:
+ print (" fname %20s %d" % (fname, getattr(exc_info, fname)))
+
+ # ok now printed them out and visually inspected: check them with asserts
+ assert exc_info.instr_fault == 1 # instruction fault (yes!)
+ assert exc_info.perm_error == 1 # permissions (yes!)
+ assert exc_info.rc_error == 0
+ assert exc_info.alignment == 0
+ assert exc_info.invalid == 0
+ assert exc_info.segment_fault == 0
+ assert exc_info.rc_error == 0
+
+ yield from debug(dut, "test done")
+ yield ldst.instr_fault.eq(0)
+ yield
+ yield
+ yield
+
+ wbget.stop = True
+
+
+def test_loadstore1_ifetch_unit_iface():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.test1
+
+ # set this up before passing to Simulator (which calls elaborate)
+ icache = m.submodules.ldst.icache
+ icache.use_fetch_interface() # this is the function which converts
+ # to FetchUnitInterface. *including*
+ # rewiring the Wishbone Bus to ibus
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch_iface(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+ with sim.write_vcd('test_loadstore1_ifetch_iface.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
+
+def test_loadstore1_ifetch():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.test1
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ icache = m.submodules.ldst.icache
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+ with sim.write_vcd('test_loadstore1_ifetch.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
+
+def test_loadstore1():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.test1
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_loadstore1.vcd'):
+ sim.run()
+
+
+def test_loadstore1_microwatt_mmu_bin_test2():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.microwatt_test2
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test2(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_microwatt_mmu_test2.vcd'):
+ sim.run()
+
+
+def test_loadstore1_microwatt_mmu_bin_test5():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.microwatt_test5
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test5(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_microwatt_mmu_test5.vcd'):
+ sim.run()
+
+
+def test_loadstore1_misalign():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.microwatt_test2
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ ###########1122334455667788
+ mem[0] = 0x0102030405060708
+ mem[8] = 0xffffffffffffffff
+
+ sim.add_sync_process(wrap(_test_loadstore1_misalign(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_loadstore1_misalign.vcd'):
+ sim.run()
+ print ("mem", mem)
+
+
+def test_loadstore1_invalid():
+
+ m, cmpi = setup_mmu()
+
+ mem = {}
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_invalid(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_loadstore1_invalid.vcd'):
+ sim.run()
+
+
+def test_loadstore1_ifetch_invalid():
+ m, cmpi = setup_mmu()
+
+ # this is a specially-arranged page table which has the permissions
+ # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+ mem = pagetables.test2
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ icache = m.submodules.ldst.icache
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch_invalid(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+ with sim.write_vcd('test_loadstore1_ifetch_invalid.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
+
+def test_loadstore1_ifetch_multi():
+ m, cmpi = setup_mmu()
+ wbget.stop = False
+
+ # this is a specially-arranged page table which has the permissions
+ # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+ mem = pagetables.test1
+
+ # set this up before passing to Simulator (which calls elaborate)
+ icache = m.submodules.ldst.icache
+ icache.use_fetch_interface() # this is the function which converts
+ # to FetchUnitInterface. *including*
+ # rewiring the Wishbone Bus to ibus
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch_multi(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+ with sim.write_vcd('test_loadstore1_ifetch_multi.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
+if __name__ == '__main__':
+ #test_loadstore1()
+ #test_loadstore1_microwatt_mmu_bin_test2()
+ #test_loadstore1_microwatt_mmu_bin_test5()
+ #test_loadstore1_invalid()
+ #test_loadstore1_ifetch() #FIXME
+ #test_loadstore1_ifetch_invalid()
+ #test_loadstore1_ifetch_unit_iface() # guess: should be working
+ #test_loadstore1_ifetch_multi()
+ test_loadstore1_misalign()
from soc.experiment.mmu import MMU
from soc.experiment.dcache import DCache
from soc.experiment.icache import ICache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
import random
-stop = False
-
-def set_stop(newval):
- global stop
- stop = newval
-
+wbget.stop = False
def b(x):
return int.from_bytes(x.to_bytes(8, byteorder='little'),
}
-def wb_get(c, mem, name):
- """simulator process for getting memory load requests
- """
-
- logfile = open("/tmp/wb_get.log","w")
-
- def log(msg):
- logfile.write(msg+"\n")
- print(msg)
-
- global stop
- while not stop:
- while True: # wait for dc_valid
- if stop:
- log("stop")
- return
- cyc = yield (c.wb_out.cyc)
- stb = yield (c.wb_out.stb)
- if cyc and stb:
- break
- yield
- addr = (yield c.wb_out.adr) << 3
- if addr not in mem:
- log("%s LOOKUP FAIL %x" % (name, addr))
- stop = True
- return
-
- yield
- data = mem[addr]
- yield c.wb_in.dat.eq(data)
- log("%s get %x data %x" % (name, addr, data))
- yield c.wb_in.ack.eq(1)
- yield
- yield c.wb_in.ack.eq(0)
- yield
-
-
def icache_sim(dut, mem):
i_out = dut.i_in
i_in = dut.i_out
m_out = dut.m_in
+ wbget.stop = False
+
for k,v in mem.items():
yield i_in.valid.eq(0)
yield i_out.priv_mode.eq(1)
yield i_out.req.eq(0)
yield
+ wbget.stop = True
def test_icache_il():
dut = ICache()
# read from "memory" process and corresponding wishbone "read" process
sim.add_sync_process(wrap(icache_sim(icache, mem)))
- sim.add_sync_process(wrap(wb_get(icache, mem, "ICACHE")))
+ sim.add_sync_process(wrap(wb_get(icache.bus, mem, "ICACHE")))
with sim.write_vcd('test_icache.vcd'):
sim.run()
def mmu_lookup(mmu, addr):
- global stop
yield mmu.l_in.load.eq(1)
yield mmu.l_in.priv.eq(1)
yield mmu.l_in.addr.eq(addr)
yield mmu.l_in.valid.eq(1)
- while not stop: # wait for dc_valid / err
+
+ print ("mmu lookup %x stopped" % addr, wbget.stop)
+ while not wbget.stop: # wait for dc_valid / err
+ print ("stopped", wbget.stop)
l_done = yield (mmu.l_out.done)
l_err = yield (mmu.l_out.err)
l_badtree = yield (mmu.l_out.badtree)
def mmu_sim(mmu):
- global stop
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
phys_addr = yield from mmu_lookup(mmu, 0x10000)
assert phys_addr == 0x40000
+ yield
- stop = True
+ wbget.stop = True
def test_mmu():
sim.add_clock(1e-6)
sim.add_sync_process(wrap(mmu_sim(mmu)))
- sim.add_sync_process(wrap(wb_get(dcache, default_mem, "DCACHE")))
+ sim.add_sync_process(wrap(wb_get(dcache.bus,
+ default_mem, "DCACHE")))
with sim.write_vcd('test_mmu.vcd'):
sim.run()
from soc.experiment.mmu import MMU
from soc.experiment.dcache import DCache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
#more imports
# will take at least one week (10.10.2020)
# many unconnected signals
+def b(x):
+ return int.from_bytes(x.to_bytes(8, byteorder='little'),
+ byteorder='big', signed=False)
+
+mem = {0x10000: # PARTITION_TABLE_2
+ # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+ b(0x800000000100000b),
+
+ 0x30000: # RADIX_ROOT_PTE
+ # V = 1 L = 0 NLB = 0x400 NLS = 9
+ b(0x8000000000040009),
+
+ 0x40000: # RADIX_SECOND_LEVEL
+ # V = 1 L = 1 SW = 0 RPN = 0
+ # R = 1 C = 1 ATT = 0 EAA 0x7
+ b(0xc000000000000187),
+
+ 0x1000000: # PROCESS_TABLE_3
+ # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+ b(0x40000000000300ad),
+ }
+
class TestMicrowattMemoryPortInterface(PortInterfaceBase):
"""TestMicrowattMemoryPortInterface
self.mmu = mmu
self.dcache = dcache
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
m.d.comb += self.dcache.d_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.load.eq(0)
- m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+ m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
m.d.comb += self.mmu.l_in.valid.eq(1)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.dcache.d_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.load.eq(1)
- m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+ m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
m.d.comb += self.mmu.l_in.valid.eq(1)
def set_wr_data(self, m, data, wen):
yield from super().ports()
# TODO: memory ports
-stop = False
-
-
-def wb_get(dc):
- """simulator process for getting memory load requests
- """
-
- global stop
-
- def b(x):
- return int.from_bytes(x.to_bytes(8, byteorder='little'),
- byteorder='big', signed=False)
-
- mem = {0x10000: # PARTITION_TABLE_2
- # PATB_GR=1 PRTB=0x1000 PRTS=0xb
- b(0x800000000100000b),
-
- 0x30000: # RADIX_ROOT_PTE
- # V = 1 L = 0 NLB = 0x400 NLS = 9
- b(0x8000000000040009),
-
- 0x40000: # RADIX_SECOND_LEVEL
- # V = 1 L = 1 SW = 0 RPN = 0
- # R = 1 C = 1 ATT = 0 EAA 0x7
- b(0xc000000000000187),
-
- 0x1000000: # PROCESS_TABLE_3
- # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
- b(0x40000000000300ad),
- }
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (dc.wb_out.cyc)
- stb = yield (dc.wb_out.stb)
- if cyc and stb:
- break
- yield
- addr = (yield dc.wb_out.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- data = mem.get(addr, 0)
- yield dc.wb_in.dat.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
- yield dc.wb_in.ack.eq(1)
- yield
- yield dc.wb_in.ack.eq(0)
- yield
+wbget.stop = False
def mmu_lookup(dut, addr):
mmu = dut.mmu
- global stop
print("pi_ld")
yield from pi_ld(dut.pi, addr, 1)
def mmu_sim(dut):
mmu = dut.mmu
- global stop
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
phys_addr = yield from mmu_lookup(dut, 0x10000)
assert phys_addr == 0x40000
- stop = True
+ wbget.stop = True
def test_mmu():
sim.add_clock(1e-6)
sim.add_sync_process(wrap(mmu_sim(dut)))
- sim.add_sync_process(wrap(wb_get(dcache)))
+ sim.add_sync_process(wrap(wb_get(dcache.bus, mem)))
with sim.write_vcd('test_mmu_pi.vcd'):
sim.run()
--- /dev/null
+from openpower.test.wb_get import wb_get
+
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+ pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
m.submodules.dut = dut = ALUInputStage(pspec)
a = Signal(64)
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
width = p.width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = ALUMainStage(pspec)
# convenience variables
a = dut.i.a
b = dut.i.b
ca_in = dut.i.xer_ca[0] # CA carry in
- ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+ ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
so_in = dut.i.xer_so # SO sticky overflow
ca_o = dut.o.xer_ca.data[0] # CA carry out
- ca32_o = dut.o.xer_ca.data[1] # CA32 carry out32
+ ca32_o = dut.o.xer_ca.data[1] # CA32 carry out32
ov_o = dut.o.xer_ov.data[0] # OV overflow
- ov32_o = dut.o.xer_ov.data[1] # OV32 overflow32
+ ov32_o = dut.o.xer_ov.data[1] # OV32 overflow32
o = dut.o.o.data
# setup random inputs
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = ALUOutputStage(pspec)
o = Signal(64)
return m
+
class GTCombinerTestCase(FHDLTestCase):
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
from nmutil.pipemodbase import PipeModBase
from nmutil.extend import exts, extz
from soc.fu.alu.pipe_data import ALUInputData, ALUOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
from openpower.decoder.power_fields import DecodeFields
return ALUOutputData(self.pspec) # defines pipeline stage output format
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
comb += b_i.eq(b) # into trap pipeline
with m.Elif(is_32bit):
with m.If(op.is_signed):
- comb += a_i.eq(exts(a, 32, 64))
- comb += b_i.eq(exts(b, 32, 64))
+ comb += a_i.eq(exts(a, 32, XLEN))
+ comb += b_i.eq(exts(b, 32, XLEN))
with m.Else():
- comb += a_i.eq(extz(a, 32, 64))
- comb += b_i.eq(extz(b, 32, 64))
+ comb += a_i.eq(extz(a, 32, XLEN))
+ comb += b_i.eq(extz(b, 32, XLEN))
with m.Else():
comb += a_i.eq(a)
comb += b_i.eq(b)
#### CMP, CMPL v3.0B p85-86
with m.Case(MicrOp.OP_CMP):
- a_n = Signal(64) # temporary - inverted a
+ a_n = Signal(XLEN) # temporary - inverted a
tval = Signal(5)
a_lt = Signal()
carry_32 = Signal()
# this is supposed to be inverted (b-a, not a-b)
comb += a_n.eq(~a) # sigh a gets inverted
- comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
- comb += carry_64.eq(add_o[65])
+ if XLEN == 64:
+ comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
+ else:
+ comb += carry_32.eq(add_o[XLEN+1])
+ comb += carry_64.eq(add_o[XLEN+1])
comb += zerolo.eq(~((a_n[0:32] ^ b[0:32]).bool()))
- comb += zerohi.eq(~((a_n[32:64] ^ b[32:64]).bool()))
+ comb += zerohi.eq(~((a_n[32:XLEN] ^ b[32:XLEN]).bool()))
with m.If(zerolo & (is_32bit | zerohi)):
# values are equal
comb += tval[2].eq(1)
with m.Else():
- comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[63]))
- comb += msb_b.eq(Mux(is_32bit, b[31], b[63]))
+ comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[XLEN-1]))
+ comb += msb_b.eq(Mux(is_32bit, b[31], b[XLEN-1]))
C0 = Const(0, 1)
with m.If(msb_a != msb_b):
# Subtraction might overflow, but
# https://bugs.libre-soc.org/show_bug.cgi?id=319#c5
ca = Signal(2, reset_less=True)
comb += ca[0].eq(add_o[-1]) # XER.CA
- comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+ if XLEN == 64:
+ comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+ else:
+ comb += ca[1].eq(add_o[-1]) # XER.CA32
comb += cry_o.data.eq(ca)
comb += cry_o.ok.eq(1)
# 32-bit (ov[1]) and 64-bit (ov[0]) overflow
ov = Signal(2, reset_less=True)
comb += ov[0].eq(calc_ov(a_i[-1], b_i[-1], ca[0], add_o[-2]))
- comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1], add_o[32]))
+ if XLEN == 64:
+ comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1],
+ add_o[32]))
+ else:
+ comb += ov[1].eq(calc_ov(a_i[-1], b_i[-1], ca[0],
+ add_o[-2]))
comb += ov_o.data.eq(ov)
comb += ov_o.ok.eq(1)
with m.Case(MicrOp.OP_EXTS):
with m.If(op.data_len == 1):
- comb += o.data.eq(exts(a, 8, 64))
+ comb += o.data.eq(exts(a, 8, XLEN))
with m.If(op.data_len == 2):
- comb += o.data.eq(exts(a, 16, 64))
+ comb += o.data.eq(exts(a, 16, XLEN))
with m.If(op.data_len == 4):
- comb += o.data.eq(exts(a, 32, 64))
+ comb += o.data.eq(exts(a, 32, XLEN))
comb += o.ok.eq(1) # output register
###################
from nmigen import (Module, Signal, Cat, Repl)
from soc.fu.alu.pipe_data import ALUInputData, ALUOutputData
from soc.fu.common_output_stage import CommonOutputStage
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
class ALUInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB/immediate
- ('XER', 'xer_so', '32'), # XER bit 32: SO
- ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b = self.ra, self.rb
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), # XER bit 32: SO
+ ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+
+
class ALUOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'),
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_ca', '34,45'), # bit0: ca, bit1: ca32
- ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
- ('XER', 'xer_so', '32')]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_ca', '34,45'), # bit0: ca, bit1: ca32
+ ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
+ ('XER', 'xer_so', '32')]
+
+
class ALUPipeSpec(CommonPipeSpec):
- regspec = (ALUInputData.regspec, ALUOutputData.regspec)
opsubsetkls = CompALUOpSubset
+ regspecklses = (ALUInputData, ALUOutputData)
from soc.fu.alu.main_stage import ALUMainStage
from soc.fu.alu.output_stage import ALUOutputStage
+
class ALUStages(PipeModBaseChain):
def get_chain(self):
inp = ALUInputStage(self.pspec)
main = ALUMainStage(self.pspec)
- return [inp, main]
+ out = ALUOutputStage(self.pspec)
+ return [inp, main, out]
+
+
+class ALUBasePipe(ControlBase):
+ def __init__(self, pspec):
+ ControlBase.__init__(self)
+ self.pspec = pspec
+ self.pipe1 = ALUStages(pspec)
+ self._eqs = self.connect([self.pipe1])
+
+ def elaborate(self, platform):
+ m = ControlBase.elaborate(self, platform)
+ m.submodules.pipe1 = self.pipe1
+ m.d.comb += self._eqs
+ return m
+
+class ALUStages1(PipeModBaseChain):
+ def get_chain(self):
+ inp = ALUInputStage(self.pspec)
+ return [inp]
+
+class ALUStages2(PipeModBaseChain):
+ def get_chain(self):
+ main = ALUMainStage(self.pspec)
+ return [main]
-class ALUStageEnd(PipeModBaseChain):
+class ALUStages3(PipeModBaseChain):
def get_chain(self):
out = ALUOutputStage(self.pspec)
return [out]
def __init__(self, pspec):
ControlBase.__init__(self)
self.pspec = pspec
- self.pipe1 = ALUStages(pspec)
- self.pipe2 = ALUStageEnd(pspec)
- self._eqs = self.connect([self.pipe1, self.pipe2])
+ self.pipe1 = ALUStages1(pspec)
+ self.pipe2 = ALUStages2(pspec)
+ self.pipe3 = ALUStages3(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
- m.submodules.pipe1 = self.pipe1
- m.submodules.pipe2 = self.pipe2
+ m.submodules.logical_pipe1 = self.pipe1
+ m.submodules.logical_pipe2 = self.pipe2
+ m.submodules.logical_pipe3 = self.pipe3
m.d.comb += self._eqs
return m
+
def set_alu_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
yield from ALUHelpers.set_int_ra(alu, dec2, inp)
class ALUIAllCases(ALUTestCase):
def case_ilang(self):
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
alu = ALUBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("alu_pipeline.il", "w") as f:
class TestRunner(unittest.TestCase):
- def execute(self, alu,instruction, pdecode2, test):
+ def execute(self, alu, instruction, pdecode2, test):
program = test.program
sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
test.mem, test.msr,
fn_unit = yield pdecode2.e.do.fn_unit
asmcode = yield pdecode2.e.asmcode
dec_asmcode = yield pdecode2.dec.op.asmcode
- print ("asmcode", asmcode, dec_asmcode)
+ print("asmcode", asmcode, dec_asmcode)
self.assertEqual(fn_unit, Function.ALU.value)
yield from set_alu_inputs(alu, pdecode2, sim)
# set valid for one cycle, propagate through pipeline...
- yield alu.p.valid_i.eq(1)
+ yield alu.p.i_valid.eq(1)
yield
- yield alu.p.valid_i.eq(0)
+ yield alu.p.i_valid.eq(0)
opname = code.split(' ')[0]
yield from sim.call(opname)
index = sim.pc.CIA.value//4
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
while not vld:
yield
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
yield
yield from self.check_alu_outputs(alu, pdecode2, sim, code)
yield Settle()
def test_it(self):
- test_data = ALUTestCase().test_data
+ test_data = ALUTestCase({'soc'}).test_data
m = Module()
comb = m.d.comb
instruction = Signal(32)
opkls = ALUPipeSpec.opsubsetkls
pdecode = create_pdecode()
- m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode, opkls, fn_name)
+ m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+ pdecode, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = ALUPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = ALUBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
oe_ok = yield dec2.e.do.oe.ok
if not oe or not oe_ok:
# if OE not enabled, XER SO and OV must correspondingly be false
- so_ok = yield alu.n.data_o.xer_so.ok
- ov_ok = yield alu.n.data_o.xer_ov.ok
+ so_ok = yield alu.n.o_data.xer_so.ok
+ ov_ok = yield alu.n.o_data.xer_ov.ok
self.assertEqual(so_ok, False, code)
self.assertEqual(ov_ok, False, code)
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+ pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
m.submodules.dut = dut = ALUInputStage(pspec)
a = Signal(64)
return m
+
class GTCombinerTestCase(FHDLTestCase):
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = BranchPipeSpec(id_wid=2)
+ pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = BranchMainStage(pspec)
# convenience aliases
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
class BranchPipeSpec(CommonPipeSpec):
- regspec = (BranchInputData.regspec, BranchOutputData.regspec)
+ regspecklses = (BranchInputData, BranchOutputData)
opsubsetkls = CompBROpSubset
from nmutil.singlepipe import ControlBase
from nmutil.pipemodbase import PipeModBaseChain
from soc.fu.branch.main_stage import BranchMainStage
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.branch.pipe_data import BranchInputData
+from nmigen import Module
+
+# gives a 1-clock delay to stop combinatorial link between in and out
+class DummyBranchStage(PipeModBase):
+ def __init__(self, pspec): super().__init__(pspec, "dummy")
+ def ispec(self): return BranchInputData(self.pspec)
+ def ospec(self): return BranchInputData(self.pspec)
+
+ def elaborate(self, platform):
+ m = Module()
+ m.d.comb += self.o.eq(self.i) # pass-through output
+ return m
+
+class BranchDummyStages(PipeModBaseChain):
+ def get_chain(self):
+ dummy = DummyBranchStage(self.pspec)
+ return [dummy]
+
class BranchStages(PipeModBaseChain):
def get_chain(self):
def __init__(self, pspec):
ControlBase.__init__(self)
self.pspec = pspec
- self.pipe1 = BranchStages(pspec)
- self._eqs = self.connect([self.pipe1])
+ self.pipe1 = BranchDummyStages(pspec)
+ self.pipe2 = BranchStages(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
- m.submodules.pipe = self.pipe1
+ m.submodules.pipe1 = self.pipe1
+ m.submodules.pipe2 = self.pipe2
m.d.comb += self._eqs
return m
class BranchAllCases(BranchTestCase):
def case_ilang(self):
- pspec = BranchPipeSpec(id_wid=2)
+ pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
alu = BranchBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("branch_pipeline.il", "w") as f:
class TestRunner(unittest.TestCase):
def test_it(self):
- test_data = BranchAllCases().test_data
+ test_data = BranchTestCase().test_data
+ print ("test data", test_data)
m = Module()
comb = m.d.comb
instruction = Signal(32)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = BranchPipeSpec(id_wid=2)
+ pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.branch = branch = BranchBasePipe(pspec)
- comb += branch.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += branch.p.valid_i.eq(1)
- comb += branch.n.ready_i.eq(1)
+ comb += branch.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += branch.p.i_valid.eq(1)
+ comb += branch.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
print(index)
ins, code = instructions[index]
- print("0x{:X}".format(ins & 0xffffffff))
+ print("insn 0x{:X}".format(ins & 0xffffffff))
print(code)
# ask the decoder to decode this binary data (endian'd)
sim.run()
def assert_outputs(self, branch, dec2, sim, prev_nia, code):
- branch_taken = yield branch.n.data_o.nia.ok
+ branch_taken = yield branch.n.o_data.nia.ok
sim_branch_taken = prev_nia != sim.pc.CIA
self.assertEqual(branch_taken, sim_branch_taken, code)
if branch_taken:
- branch_addr = yield branch.n.data_o.nia.data
+ branch_addr = yield branch.n.o_data.nia.data
print(f"real: {branch_addr:x}, sim: {sim.pc.CIA.value:x}")
self.assertEqual(branch_addr, sim.pc.CIA.value, code)
# TODO: this should be checking write_fast2
lk = yield dec2.e.do.lk
- branch_lk = yield branch.n.data_o.lr.ok
+ branch_lk = yield branch.n.o_data.lr.ok
self.assertEqual(lk, branch_lk, code)
if lk:
- branch_lr = yield branch.n.data_o.lr.data
+ branch_lr = yield branch.n.o_data.lr.data
self.assertEqual(sim.spr['LR'], branch_lr, code)
def set_inputs(self, branch, dec2, sim):
# and updating the condition register
from nmigen import (Module, Signal, Cat, Const)
from nmutil.pipemodbase import PipeModBase
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
super().__init__(pspec, "output")
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
op = self.i.ctx.op
# XXX ah. right. this needs to be done only if the *mode* is 32-bit
# (an MSR bit)
# see https://bugs.libre-soc.org/show_bug.cgi?id=424
- target = Signal(64, reset_less=True)
+ target = Signal(XLEN, reset_less=True)
#with m.If(op.is_32bit):
# comb += target.eq(o[:32])
#with m.Else():
from soc.experiment.compalu_multi import MultiCompUnit
from openpower.decoder.power_enums import Function
from soc.config.test.test_loadstore import TestMemPspec
+from nmutil.concurrentunit import ReservationStations2
# pipeline / spec imports
note that the rdflags function obtains (dynamically, from instruction
decoding) which read-register ports are to be requested. this is not
ideal (it could be a lot neater) but works for now.
+
+ also note: additional members, fu.rd_latches and fu.wr_latches
+ are replaced, here, by core.py. those contain the latched
+ read/write register information which the FU needs in order
+ to actually read (and write) the correct register number
"""
- def __init__(self, speckls, pipekls, idx):
+ def __init__(self, speckls, pipekls, idx, parent_pspec):
alu_name = "alu_%s%d" % (self.fnunit.name.lower(), idx)
- pspec = speckls(id_wid=2) # spec (NNNPipeSpec instance)
+ # spec (NNNPipeSpec instance)
+ pspec = speckls(id_wid=2, parent_pspec=parent_pspec)
opsubset = pspec.opsubsetkls # get the operand subset class
- regspec = pspec.regspec # get the regspec
+ rsk = pspec.regspecklses # get the regspec classes
+ regspec = []
+ for kls in rsk:
+ regspec.append(kls(pspec).regspec)
+ print ("regspecs", regspec)
alu = pipekls(pspec) # create actual NNNBasePipe
self.pspec = pspec
super().__init__(regspec, alu, opsubset, name=alu_name) # MultiCompUnit
+ # these are set to None for now: core get_byregfiles fills them in
+ # (for now)
+ self.fu_rdlatches = None
+ self.fu_wrlatches = None
##############################################################
# TODO: ReservationStations-based (FunctionUnitBaseConcurrent)
-class FunctionUnitBaseMulti:
- pass
+class FunctionUnitBaseMulti(ReservationStations2):
+ """FunctionUnitBaseMulti
+
+ similar to FunctionUnitBaseSingle except it creates a list
+ of MultiCompUnit instances all using the same ALU instance.
+
+ * :speckls: - the specification. contains regspec and op subset info,
+ and contains common "stuff" like the pipeline ctx,
+ what type of nmutil pipeline base is to be used (etc)
+ * :pipekls: - the type of pipeline. actually connects things together
+
+ * :num_rows: - number of ReservationStations wrapped around the FU
+
+ note that it is through MultiCompUnit.get_in/out that we *actually*
+ connect up the association between regspec variable names (defined
+ in the pipe_data).
+
+ note that the rdflags function obtains (dynamically, from instruction
+ decoding) which read-register ports are to be requested. this is not
+ ideal (it could be a lot neater) but works for now.
+ """
+
+ def __init__(self, speckls, pipekls, num_rows, parent_pspec):
+ id_wid = num_rows.bit_length()
+ # spec (NNNPipeSpec instance)
+ pspec = speckls(id_wid=id_wid, parent_pspec=parent_pspec)
+ self.pspec = pspec
+ opsubset = pspec.opsubsetkls # get the operand subset class
+ rsk = pspec.regspecklses # get the regspec classes
+ regspec = []
+ for kls in rsk:
+ regspec.append(kls(pspec).regspec)
+ print ("regspecs", regspec)
+ alu = pipekls(pspec) # create actual NNNBasePipe
+ alu_name = self.fnunit.name.lower()
+ super().__init__(alu, num_rows, alu_name) # initialise fan-in/fan-out
+ self.cu = []
+ for idx in range(num_rows):
+ alu_name = "alu_%s%d" % (alu_name, idx)
+ palu = self.pseudoalus[idx]
+ cu = MultiCompUnit(regspec, palu, opsubset, name=alu_name,
+ sync_rw=False)
+ cu.fnunit = self.fnunit
+ cu.fu_muxidx = idx
+ self.cu.append(cu)
+
+ def elaborate(self, platform):
+ m = super().elaborate(platform)
+ # set the muxids so that ReservationStations2 can direct data
+ # without this the incoming data gets routed to the wrong place!
+ # NOTE: for Mask Cancellation this has to be done slightly differently
+ for i, p in enumerate(self.p):
+ m.d.comb += p.i_data.muxid.eq(i)
+ return m
######################################################################
###### actual Function Units: these are "single" stage pipelines #####
-class ALUFunctionUnit(FunctionUnitBaseSingle):
+# class ALUFunctionUnit(FunctionUnitBaseSingle):
+
+
+class ALUFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.ALU
- def __init__(self, idx):
- super().__init__(ALUPipeSpec, ALUBasePipe, idx)
+ def __init__(self, num_rses, parent_pspec):
+ super().__init__(ALUPipeSpec, ALUBasePipe, num_rses, parent_pspec)
-class LogicalFunctionUnit(FunctionUnitBaseSingle):
+# class LogicalFunctionUnit(FunctionUnitBaseSingle):
+class LogicalFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.LOGICAL
- def __init__(self, idx):
- super().__init__(LogicalPipeSpec, LogicalBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(LogicalPipeSpec, LogicalBasePipe, idx, parent_pspec)
-class CRFunctionUnit(FunctionUnitBaseSingle):
+# class CRFunctionUnit(FunctionUnitBaseSingle):
+class CRFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.CR
- def __init__(self, idx):
- super().__init__(CRPipeSpec, CRBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(CRPipeSpec, CRBasePipe, idx, parent_pspec)
-class BranchFunctionUnit(FunctionUnitBaseSingle):
+# class BranchFunctionUnit(FunctionUnitBaseSingle):
+class BranchFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.BRANCH
- def __init__(self, idx):
- super().__init__(BranchPipeSpec, BranchBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(BranchPipeSpec, BranchBasePipe, idx, parent_pspec)
-class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
+# class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
+class ShiftRotFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.SHIFT_ROT
- def __init__(self, idx):
- super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx, parent_pspec)
class DivFSMFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.DIV
- def __init__(self, idx):
- super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx, parent_pspec)
class MMUFSMFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.MMU
- def __init__(self, idx):
- super().__init__(MMUPipeSpec, FSMMMUStage, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(MMUPipeSpec, FSMMMUStage, idx, parent_pspec)
+ self.exc_o = self.alu.exc_o # get at MMU exception
class DivPipeFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.DIV
- def __init__(self, idx):
- super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx, parent_pspec)
-class MulFunctionUnit(FunctionUnitBaseSingle):
+# class MulFunctionUnit(FunctionUnitBaseSingle):
+class MulFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.MUL
- def __init__(self, idx):
- super().__init__(MulPipeSpec, MulBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(MulPipeSpec, MulBasePipe, idx, parent_pspec)
class TrapFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.TRAP
- def __init__(self, idx):
- super().__init__(TrapPipeSpec, TrapBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(TrapPipeSpec, TrapBasePipe, idx, parent_pspec)
class SPRFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.SPR
- def __init__(self, idx):
- super().__init__(SPRPipeSpec, SPRBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(SPRPipeSpec, SPRBasePipe, idx, parent_pspec)
# special-case: LD/ST conforms to the CompUnit API but is not a pipeline
class LDSTFunctionUnit(LDSTCompUnit):
fnunit = Function.LDST
- def __init__(self, pi, awid, idx):
+ def __init__(self, pi, awid, idx, parent_pspec):
alu_name = "ldst_%s%d" % (self.fnunit.name.lower(), idx)
- pspec = LDSTPipeSpec(id_wid=2) # spec (NNNPipeSpec instance)
+ # spec (NNNPipeSpec instance)
+ pspec = LDSTPipeSpec(id_wid=2, parent_pspec=parent_pspec)
opsubset = pspec.opsubsetkls # get the operand subset class
- regspec = pspec.regspec # get the regspec
+ rsk = pspec.regspecklses # get the regspec classes
+ regspec = []
+ for kls in rsk:
+ regspec.append(kls(pspec).regspec)
+ print ("regspecs", regspec)
self.opsubsetkls = opsubset
super().__init__(pi, regspec, awid, opsubset, name=alu_name)
# create dictionary of Function Units
self.fus = {}
+ self.actual_alus = {}
for name, qty in units.items():
kls = alus[name]
- for i in range(qty):
- self.fus["%s%d" % (name, i)] = kls(i)
+ if issubclass(kls, FunctionUnitBaseMulti):
+ # create just the one ALU but many "fronts"
+ fu = kls(qty, parent_pspec=pspec)
+ self.actual_alus[name] = fu # to be made a module of AllFUs
+ for i in range(qty):
+ self.fus["%s%d" % (name, i)] = fu.cu[i]
+ else:
+ for i in range(qty):
+ self.fus["%s%d" % (name, i)] = kls(i, parent_pspec=pspec)
# debug print for MMU ALU
if microwatt_mmu:
# if any PortInterfaces, we want LDST Units.
if pilist is None:
return
- print ("pilist", pilist)
+ print("pilist", pilist)
for i, pi in enumerate(pilist):
- self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i)
+ self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i, pspec)
# extract exceptions from any FunctionUnits for easy access
self.excs = {}
for name, alu in self.fus.items():
if hasattr(alu, "exc_o"):
- print ("FU exceptions", name, type(alu.exc_o), alu.exc_o)
+ print("FU exceptions", name, type(alu.exc_o), alu.exc_o)
self.excs[name] = alu.exc_o
def get_exc(self, name):
- return self.excs.get(name, default=None)
+ return self.excs.get(name)
def get_fu(self, name):
return self.fus.get(name)
def elaborate(self, platform):
m = Module()
+ # add MultiCompUnit modules (Single CompUnits add their own ALU)
for (name, fu) in self.fus.items():
- setattr(m.submodules, name, fu)
+ m.submodules[name] = fu
+ # if any ReservationStations, there is only one ALU per RS so add that
+ for (name, alu) in self.actual_alus.items():
+ m.submodules[name] = alu
return m
def __iter__(self):
def tst_all_fus():
pspec = TestMemPspec(ldst_ifacetype='testpi',
imem_ifacetype='',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
dut = AllFunctionUnits(pspec)
alu_temp = Signal(16)
write_req_valid = Signal(reset=0)
with m.If(~Past(go_die) & Past(busy)):
- with m.If(Rose(dut.alu.n.valid_o)):
+ with m.If(Rose(dut.alu.n.o_valid)):
sync += alu_temp.eq(dut.alu.o)
sync += write_req_valid.eq(1)
# write_req_valid should only be high once the alu finishes
- with m.If(~write_req_valid & ~dut.alu.n.valid_o):
+ with m.If(~write_req_valid & ~dut.alu.n.o_valid):
comb += Assert(wr_rel == 0)
# Property 6: Write request release is held up if shadow_n
# then the alu data should be output
with m.If(Past(wr_rel) & Past(go_wr)):
# the alu data is output
- comb += Assert((dut.data_o == alu_temp)
- | (dut.data_o == dut.alu.o))
+ comb += Assert((dut.o_data == alu_temp)
+ | (dut.o_data == dut.alu.o))
# wr_rel is dropped
comb += Assert(wr_rel == 0)
# busy is dropped.
('alu', {'submodule': 'alu'}, [
('prev port', 'in', [
'oper_i_None__insn_type', 'i1[15:0]',
- 'valid_i', 'ready_o']),
+ 'i_valid', 'o_ready']),
('next port', 'out', [
- 'alu_o[15:0]', 'valid_o', 'ready_i'])])]
+ 'alu_o[15:0]', 'o_valid', 'i_ready'])])]
write_gtkw('test_fu_formal_bmc.gtkw',
os.path.dirname(__file__) +
yield
while True:
yield
- rd_rel = yield dut.rd.rel
+ rd_rel = yield dut.rd.rel_o
if rd_rel != 0:
break
- yield dut.rd.go.eq(0xfff)
+ yield dut.rd.go_i.eq(0xfff)
yield
- yield dut.rd.go.eq(0)
+ yield dut.rd.go_i.eq(0)
for i in range(10):
yield
from openpower.decoder.power_decoder2 import PowerDecode2, get_rdflags
from openpower.decoder.power_enums import Function
from openpower.decoder.isa.all import ISA
+from openpower.decoder.isa.mem import Mem
from soc.experiment.compalu_multi import find_ok # hack
from soc.config.test.test_loadstore import TestMemPspec
# pipelines (or FSMs) the write mask is only valid at that time.
if hasattr(cu, "alu"): # ALU CompUnits
while True:
- valid_o = yield cu.alu.n.valid_o
- if valid_o:
+ o_valid = yield cu.alu.n.o_valid
+ if o_valid:
break
yield
else: # LDST CompUnit
return mem.mem
-def setup_test_memory(l0, sim):
+def setup_tst_memory(l0, test_mem):
+ # create independent Sim Mem from test values
+ sim_mem = Mem(initial_mem=test_mem)
mem = get_l0_mem(l0)
print("before, init mem", mem.depth, mem.width, mem)
for i in range(mem.depth):
- data = sim.mem.ld(i*8, 8, False)
+ data = sim_mem.ld(i*8, 8, False)
print("init ", i, hex(data))
yield mem._array[i].eq(data)
yield Settle()
- for k, v in sim.mem.mem.items():
+ for k, v in sim_mem.mem.items():
print(" %6x %016x" % (k, v))
print("before, nmigen mem dump")
for i in range(mem.depth):
self.funit = funit
self.bigendian = bigendian
- def execute(self, cu, l0, instruction, pdecode2, simdec2, test):
+ def execute(self, m, cu, l0, instruction, pdecode2, simdec2, test):
program = test.program
print("test", test.name, test.mem)
# initialise memory
if self.funit == Function.LDST:
- yield from setup_test_memory(l0, sim)
+ yield from setup_tst_memory(l0, test.mem)
pc = sim.pc.CIA.value
index = pc//4
fast_out2 = yield pdecode2.e.write_fast2.data
fast_out2_ok = yield pdecode2.e.write_fast2.ok
print("lk:", lk, fast_out2, fast_out2_ok)
- op_lk = yield cu.alu.pipe1.p.data_i.ctx.op.lk
+ op_lk = yield cu.alu.pipe1.p.i_data.ctx.op.lk
print("op_lk:", op_lk)
- print(dir(cu.alu.pipe1.n.data_o))
+ print(dir(cu.alu.pipe1.n.o_data))
fn_unit = yield pdecode2.e.do.fn_unit
fuval = self.funit.value
self.assertEqual(fn_unit & fuval, fuval)
# set operand and get inputs
yield from set_operand(cu, pdecode2, sim)
# reset read-operand mask
- rdmask = get_rdflags(pdecode2.e, cu)
+ rdmask = get_rdflags(m, pdecode2.e, cu)
#print ("hardcoded rdmask", cu.rdflags(pdecode2.e))
#print ("decoder rdmask", rdmask)
yield cu.rdmaskn.eq(~rdmask)
# debugging issue with branch
if self.funit == Function.BRANCH:
- lr = yield cu.alu.pipe1.n.data_o.lr.data
- lr_ok = yield cu.alu.pipe1.n.data_o.lr.ok
+ lr = yield cu.alu.pipe1.n.o_data.lr.data
+ lr_ok = yield cu.alu.pipe1.n.o_data.lr.ok
print("lr:", hex(lr), lr_ok)
if self.funit == Function.LDST:
m.d.comb += cu.ad.go_i.eq(cu.ad.rel_o) # link addr direct to rel
m.d.comb += cu.st.go_i.eq(cu.st.rel_o) # link store direct to rel
else:
- m.submodules.cu = cu = self.fukls(0)
+ m.submodules.cu = cu = self.fukls(0, parent_pspec=None)
l0 = None
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
for test in self.test_data:
print(test.name)
with self.subTest(test.name):
- yield from self.execute(cu, l0, instruction,
+ yield from self.execute(m, cu, l0, instruction,
pdecode2, simdec2,
test)
import unittest
from openpower.decoder.power_enums import (XER_bits, Function)
-from soc.fu.div.test.test_pipe_caller import get_cu_inputs
+from soc.fu.div.test.helper import get_cu_inputs
from soc.fu.div.test.test_pipe_caller import DivTestCases # creates the tests
from openpower.test.common import ALUHelpers
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = CRMainStage(pspec)
full_cr_in = Signal(32)
# into cr_a
comb += dut.i.cr_a.eq(cr_input_arr[bc])
-
# For OP_CROP, we need to input the corresponding CR
# registers for BA, BB, and BT
with m.Case(MicrOp.OP_CROP):
comb += Assert(o[4*i:4*i+4] == cr[4*i:4*i+4])
with m.Else():
comb += Assert(o[4*i:4*i+4] == 0)
- with m.Else(): # mfcrf
+ with m.Else(): # mfcrf
comb += Assert(o == cr)
comb += o_ok.eq(1)
with m.Case(MicrOp.OP_SETB):
with m.If(cr_arr[4*bfa]):
- comb += Assert(o == ((1<<64)-1))
+ comb += Assert(o == ((1 << 64)-1))
with m.Elif(cr_arr[4*bfa+1]):
comb += Assert(o == 1)
with m.Else():
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
class CRPipeSpec(CommonPipeSpec):
- regspec = (CRInputData.regspec, CROutputData.regspec)
+ regspecklses = (CRInputData, CROutputData)
opsubsetkls = CompCROpSubset
class CRIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = CRPipeSpec(id_wid=2)
+ pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
alu = CRBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("cr_pipeline.il", "w") as f:
cr_en = yield dec2.e.write_cr.ok
if whole_reg_ok:
- full_cr = yield alu.n.data_o.full_cr.data & full_cr_mask
+ full_cr = yield alu.n.o_data.full_cr.data & full_cr_mask
expected_cr = simulator.cr.value
- print("CR whole: expected %x, actual: %x mask: %x" % \
- (expected_cr, full_cr, full_cr_mask))
+ print("CR whole: expected %x, actual: %x mask: %x" %
+ (expected_cr, full_cr, full_cr_mask))
# HACK: only look at the bits that we expected to change
self.assertEqual(expected_cr & full_cr_mask, full_cr, code)
elif cr_en:
expected_cr = simulator.cr.value
print(f"CR whole: {expected_cr:x}, sel {cr_sel}")
expected_cr = simulator.crl[cr_sel].get_range().value
- real_cr = yield alu.n.data_o.cr.data
+ real_cr = yield alu.n.o_data.cr.data
print(f"CR part: expected {expected_cr:x}, actual: {real_cr:x}")
self.assertEqual(expected_cr, real_cr, code)
- alu_out = yield alu.n.data_o.o.data
+ alu_out = yield alu.n.o_data.o.data
out_reg_valid = yield dec2.e.write_reg.ok
if out_reg_valid:
write_reg_idx = yield dec2.e.write_reg.data
yield instruction.eq(ins) # raw binary instr.
yield Settle()
yield from self.set_inputs(alu, pdecode2, sim)
- yield alu.p.valid_i.eq(1)
+ yield alu.p.i_valid.eq(1)
fn_unit = yield pdecode2.e.do.fn_unit
self.assertEqual(fn_unit, Function.CR.value, code)
yield
yield from sim.call(opname)
index = sim.pc.CIA.value//4
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
while not vld:
yield
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
yield
yield from self.assert_outputs(alu, pdecode2, sim, code)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = CRPipeSpec(id_wid=2)
+ pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.alu = alu = CRBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
from nmutil.pipemodbase import PipeModBase
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
from openpower.decoder.power_fields import DecodeFields
--- /dev/null
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from collections import defaultdict
+import logging
+import math
+import enum
+from fractions import Fraction
+from types import FunctionType
+from functools import lru_cache
+from nmigen.hdl.ast import Signal, unsigned, signed, Const
+from nmigen.hdl.dsl import Module, Elaboratable
+from nmigen.hdl.mem import Memory
+from nmutil.clz import CLZ
+from nmutil.plain_data import plain_data, fields, replace
+
+try:
+ from functools import cached_property
+except ImportError:
+ from cached_property import cached_property
+
+# fix broken IDE type detection for cached_property
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+ from functools import cached_property
+
+
+_NOT_FOUND = object()
+
+
+def cache_on_self(func):
+ """like `functools.cached_property`, except for methods. unlike
+ `lru_cache` the cache is per-class instance rather than a global cache
+ per-method."""
+
+ assert isinstance(func, FunctionType), \
+ "non-plain methods are not supported"
+
+ cache_name = func.__name__ + "__cache"
+
+ def wrapper(self, *args, **kwargs):
+ # specifically access through `__dict__` to bypass frozen=True
+ cache = self.__dict__.get(cache_name, _NOT_FOUND)
+ if cache is _NOT_FOUND:
+ self.__dict__[cache_name] = cache = {}
+ key = (args, *kwargs.items())
+ retval = cache.get(key, _NOT_FOUND)
+ if retval is _NOT_FOUND:
+ retval = func(self, *args, **kwargs)
+ cache[key] = retval
+ return retval
+
+ wrapper.__doc__ = func.__doc__
+ return wrapper
+
+
+@enum.unique
+class RoundDir(enum.Enum):
+ DOWN = enum.auto()
+ UP = enum.auto()
+ NEAREST_TIES_UP = enum.auto()
+ ERROR_IF_INEXACT = enum.auto()
+
+
+@plain_data(frozen=True, eq=False, repr=False)
+class FixedPoint:
+ __slots__ = "bits", "frac_wid"
+
+ def __init__(self, bits, frac_wid):
+ self.bits = bits
+ self.frac_wid = frac_wid
+ assert isinstance(self.bits, int)
+ assert isinstance(self.frac_wid, int) and self.frac_wid >= 0
+
+ @staticmethod
+ def cast(value):
+ """convert `value` to a fixed-point number with enough fractional
+ bits to preserve its value."""
+ if isinstance(value, FixedPoint):
+ return value
+ if isinstance(value, int):
+ return FixedPoint(value, 0)
+ if isinstance(value, str):
+ value = value.strip()
+ neg = value.startswith("-")
+ if neg or value.startswith("+"):
+ value = value[1:]
+ if value.startswith(("0x", "0X")) and "." in value:
+ value = value[2:]
+ got_dot = False
+ bits = 0
+ frac_wid = 0
+ for digit in value:
+ if digit == "_":
+ continue
+ if got_dot:
+ if digit == ".":
+ raise ValueError("too many `.` in string")
+ frac_wid += 4
+ if digit == ".":
+ got_dot = True
+ continue
+ if not digit.isalnum():
+ raise ValueError("invalid hexadecimal digit")
+ bits <<= 4
+ bits |= int("0x" + digit, base=16)
+ else:
+ bits = int(value, base=0)
+ frac_wid = 0
+ if neg:
+ bits = -bits
+ return FixedPoint(bits, frac_wid)
+
+ if isinstance(value, float):
+ n, d = value.as_integer_ratio()
+ log2_d = d.bit_length() - 1
+ assert d == 1 << log2_d, ("d isn't a power of 2 -- won't ever "
+ "fail with float being IEEE 754")
+ return FixedPoint(n, log2_d)
+ raise TypeError("can't convert type to FixedPoint")
+
+ @staticmethod
+ def with_frac_wid(value, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+ """convert `value` to the nearest fixed-point number with `frac_wid`
+ fractional bits, rounding according to `round_dir`."""
+ assert isinstance(frac_wid, int) and frac_wid >= 0
+ assert isinstance(round_dir, RoundDir)
+ if isinstance(value, Fraction):
+ numerator = value.numerator
+ denominator = value.denominator
+ else:
+ value = FixedPoint.cast(value)
+ numerator = value.bits
+ denominator = 1 << value.frac_wid
+ if denominator < 0:
+ numerator = -numerator
+ denominator = -denominator
+ bits, remainder = divmod(numerator << frac_wid, denominator)
+ if round_dir == RoundDir.DOWN:
+ pass
+ elif round_dir == RoundDir.UP:
+ if remainder != 0:
+ bits += 1
+ elif round_dir == RoundDir.NEAREST_TIES_UP:
+ if remainder * 2 >= denominator:
+ bits += 1
+ elif round_dir == RoundDir.ERROR_IF_INEXACT:
+ if remainder != 0:
+ raise ValueError("inexact conversion")
+ else:
+ assert False, "unimplemented round_dir"
+ return FixedPoint(bits, frac_wid)
+
+ def to_frac_wid(self, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+ """convert to the nearest fixed-point number with `frac_wid`
+ fractional bits, rounding according to `round_dir`."""
+ return FixedPoint.with_frac_wid(self, frac_wid, round_dir)
+
+ def __float__(self):
+ # use truediv to get correct result even when bits
+ # and frac_wid are huge
+ return float(self.bits / (1 << self.frac_wid))
+
+ def as_fraction(self):
+ return Fraction(self.bits, 1 << self.frac_wid)
+
+ def cmp(self, rhs):
+ """compare self with rhs, returning a positive integer if self is
+ greater than rhs, zero if self is equal to rhs, and a negative integer
+ if self is less than rhs."""
+ rhs = FixedPoint.cast(rhs)
+ common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+ lhs = self.to_frac_wid(common_frac_wid)
+ rhs = rhs.to_frac_wid(common_frac_wid)
+ return lhs.bits - rhs.bits
+
+ def __eq__(self, rhs):
+ return self.cmp(rhs) == 0
+
+ def __ne__(self, rhs):
+ return self.cmp(rhs) != 0
+
+ def __gt__(self, rhs):
+ return self.cmp(rhs) > 0
+
+ def __lt__(self, rhs):
+ return self.cmp(rhs) < 0
+
+ def __ge__(self, rhs):
+ return self.cmp(rhs) >= 0
+
+ def __le__(self, rhs):
+ return self.cmp(rhs) <= 0
+
+ def fract(self):
+ """return the fractional part of `self`.
+ that is `self - math.floor(self)`.
+ """
+ fract_mask = (1 << self.frac_wid) - 1
+ return FixedPoint(self.bits & fract_mask, self.frac_wid)
+
+ def __str__(self):
+ if self < 0:
+ return "-" + str(-self)
+ digit_bits = 4
+ frac_digit_count = (self.frac_wid + digit_bits - 1) // digit_bits
+ fract = self.fract().to_frac_wid(frac_digit_count * digit_bits)
+ frac_str = hex(fract.bits)[2:].zfill(frac_digit_count)
+ return hex(math.floor(self)) + "." + frac_str
+
+ def __repr__(self):
+ return f"FixedPoint.with_frac_wid({str(self)!r}, {self.frac_wid})"
+
+ def __add__(self, rhs):
+ rhs = FixedPoint.cast(rhs)
+ common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+ lhs = self.to_frac_wid(common_frac_wid)
+ rhs = rhs.to_frac_wid(common_frac_wid)
+ return FixedPoint(lhs.bits + rhs.bits, common_frac_wid)
+
+ def __radd__(self, lhs):
+ # symmetric
+ return self.__add__(lhs)
+
+ def __neg__(self):
+ return FixedPoint(-self.bits, self.frac_wid)
+
+ def __sub__(self, rhs):
+ rhs = FixedPoint.cast(rhs)
+ common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+ lhs = self.to_frac_wid(common_frac_wid)
+ rhs = rhs.to_frac_wid(common_frac_wid)
+ return FixedPoint(lhs.bits - rhs.bits, common_frac_wid)
+
+ def __rsub__(self, lhs):
+ # a - b == -(b - a)
+ return -self.__sub__(lhs)
+
+ def __mul__(self, rhs):
+ rhs = FixedPoint.cast(rhs)
+ return FixedPoint(self.bits * rhs.bits, self.frac_wid + rhs.frac_wid)
+
+ def __rmul__(self, lhs):
+ # symmetric
+ return self.__mul__(lhs)
+
+ def __floor__(self):
+ return self.bits >> self.frac_wid
+
+ def div(self, rhs, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+ assert isinstance(frac_wid, int) and frac_wid >= 0
+ assert isinstance(round_dir, RoundDir)
+ rhs = FixedPoint.cast(rhs)
+ return FixedPoint.with_frac_wid(self.as_fraction()
+ / rhs.as_fraction(),
+ frac_wid, round_dir)
+
+ def sqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+ assert isinstance(round_dir, RoundDir)
+ if self < 0:
+ raise ValueError("can't compute sqrt of negative number")
+ if self == 0:
+ return self
+ retval = FixedPoint(0, self.frac_wid)
+ int_part_wid = self.bits.bit_length() - self.frac_wid
+ first_bit_index = -(-int_part_wid // 2) # division rounds up
+ last_bit_index = -self.frac_wid
+ for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+ trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+ self.frac_wid)
+ if trial * trial <= self:
+ retval = trial
+ if round_dir == RoundDir.DOWN:
+ pass
+ elif round_dir == RoundDir.UP:
+ if retval * retval < self:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.NEAREST_TIES_UP:
+ half_way = retval + FixedPoint(1, self.frac_wid + 1)
+ if half_way * half_way <= self:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.ERROR_IF_INEXACT:
+ if retval * retval != self:
+ raise ValueError("inexact sqrt")
+ else:
+ assert False, "unimplemented round_dir"
+ return retval
+
+ def rsqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+ """compute the reciprocal-sqrt of `self`"""
+ assert isinstance(round_dir, RoundDir)
+ if self < 0:
+ raise ValueError("can't compute rsqrt of negative number")
+ if self == 0:
+ raise ZeroDivisionError("can't compute rsqrt of zero")
+ retval = FixedPoint(0, self.frac_wid)
+ first_bit_index = -(-self.frac_wid // 2) # division rounds up
+ last_bit_index = -self.frac_wid
+ for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+ trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+ self.frac_wid)
+ if trial * trial * self <= 1:
+ retval = trial
+ if round_dir == RoundDir.DOWN:
+ pass
+ elif round_dir == RoundDir.UP:
+ if retval * retval * self < 1:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.NEAREST_TIES_UP:
+ half_way = retval + FixedPoint(1, self.frac_wid + 1)
+ if half_way * half_way * self <= 1:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.ERROR_IF_INEXACT:
+ if retval * retval * self != 1:
+ raise ValueError("inexact rsqrt")
+ else:
+ assert False, "unimplemented round_dir"
+ return retval
+
+
+class ParamsNotAccurateEnough(Exception):
+ """raised when the parameters aren't accurate enough to have goldschmidt
+ division work."""
+
+
+def _assert_accuracy(condition, msg="not accurate enough"):
+ if condition:
+ return
+ raise ParamsNotAccurateEnough(msg)
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParamsBase:
+ """parameters for a Goldschmidt division algorithm, excluding derived
+ parameters.
+ """
+
+ __slots__ = ("io_width", "extra_precision", "table_addr_bits",
+ "table_data_bits", "iter_count")
+
+ def __init__(self, io_width, extra_precision, table_addr_bits,
+ table_data_bits, iter_count):
+ assert isinstance(io_width, int)
+ assert isinstance(extra_precision, int)
+ assert isinstance(table_addr_bits, int)
+ assert isinstance(table_data_bits, int)
+ assert isinstance(iter_count, int)
+ self.io_width = io_width
+ """bit-width of the input divisor and the result.
+ the input numerator is `2 * io_width`-bits wide.
+ """
+
+ self.extra_precision = extra_precision
+ """number of bits of additional precision used inside the algorithm."""
+
+ self.table_addr_bits = table_addr_bits
+ """the number of address bits used in the lookup-table."""
+
+ self.table_data_bits = table_data_bits
+ """the number of data bits used in the lookup-table."""
+
+ self.iter_count = iter_count
+ """the total number of iterations of the division algorithm's loop"""
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParams(GoldschmidtDivParamsBase):
+ """parameters for a Goldschmidt division algorithm.
+ Use `GoldschmidtDivParams.get` to find a efficient set of parameters.
+ """
+
+ __slots__ = "table", "ops"
+
+ def _shrink_bound(self, bound, round_dir):
+ """prevent fractions from having huge numerators/denominators by
+ rounding to a `FixedPoint` and converting back to a `Fraction`.
+
+ This is intended only for values used to compute bounds, and not for
+ values that end up in the hardware.
+ """
+ assert isinstance(bound, (Fraction, int))
+ assert round_dir is RoundDir.DOWN or round_dir is RoundDir.UP, \
+ "you shouldn't use that round_dir on bounds"
+ frac_wid = self.io_width * 4 + 100 # should be enough precision
+ fixed = FixedPoint.with_frac_wid(bound, frac_wid, round_dir)
+ return fixed.as_fraction()
+
+ def _shrink_min(self, min_bound):
+ """prevent fractions used as minimum bounds from having huge
+ numerators/denominators by rounding down to a `FixedPoint` and
+ converting back to a `Fraction`.
+
+ This is intended only for values used to compute bounds, and not for
+ values that end up in the hardware.
+ """
+ return self._shrink_bound(min_bound, RoundDir.DOWN)
+
+ def _shrink_max(self, max_bound):
+ """prevent fractions used as maximum bounds from having huge
+ numerators/denominators by rounding up to a `FixedPoint` and
+ converting back to a `Fraction`.
+
+ This is intended only for values used to compute bounds, and not for
+ values that end up in the hardware.
+ """
+ return self._shrink_bound(max_bound, RoundDir.UP)
+
+ @property
+ def table_addr_count(self):
+ """number of distinct addresses in the lookup-table."""
+ # used while computing self.table, so can't just do len(self.table)
+ return 1 << self.table_addr_bits
+
+ def table_input_exact_range(self, addr):
+ """return the range of inputs as `Fraction`s used for the table entry
+ with address `addr`."""
+ assert isinstance(addr, int)
+ assert 0 <= addr < self.table_addr_count
+ _assert_accuracy(self.io_width >= self.table_addr_bits)
+ addr_shift = self.io_width - self.table_addr_bits
+ min_numerator = (1 << self.io_width) + (addr << addr_shift)
+ denominator = 1 << self.io_width
+ values_per_table_entry = 1 << addr_shift
+ max_numerator = min_numerator + values_per_table_entry - 1
+ min_input = Fraction(min_numerator, denominator)
+ max_input = Fraction(max_numerator, denominator)
+ min_input = self._shrink_min(min_input)
+ max_input = self._shrink_max(max_input)
+ assert 1 <= min_input <= max_input < 2
+ return min_input, max_input
+
+ def table_value_exact_range(self, addr):
+ """return the range of values as `Fraction`s used for the table entry
+ with address `addr`."""
+ min_input, max_input = self.table_input_exact_range(addr)
+ # division swaps min/max
+ min_value = 1 / max_input
+ max_value = 1 / min_input
+ min_value = self._shrink_min(min_value)
+ max_value = self._shrink_max(max_value)
+ assert 0.5 < min_value <= max_value <= 1
+ return min_value, max_value
+
+ def table_exact_value(self, index):
+ min_value, max_value = self.table_value_exact_range(index)
+ # we round down
+ return min_value
+
+ def __init__(self, io_width, extra_precision, table_addr_bits,
+ table_data_bits, iter_count):
+ super().__init__(io_width=io_width,
+ extra_precision=extra_precision,
+ table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits,
+ iter_count=iter_count)
+ _assert_accuracy(self.io_width >= 1, "io_width out of range")
+ _assert_accuracy(self.extra_precision >= 0,
+ "extra_precision out of range")
+ _assert_accuracy(self.table_addr_bits >= 1,
+ "table_addr_bits out of range")
+ _assert_accuracy(self.table_data_bits >= 1,
+ "table_data_bits out of range")
+ _assert_accuracy(self.iter_count >= 1, "iter_count out of range")
+ table = []
+ for addr in range(1 << self.table_addr_bits):
+ table.append(FixedPoint.with_frac_wid(self.table_exact_value(addr),
+ self.table_data_bits,
+ RoundDir.DOWN))
+
+ self.table = tuple(table)
+ """ the lookup-table.
+ type: tuple[FixedPoint, ...]
+ """
+
+ self.ops = tuple(self.__make_ops())
+ "the operations needed to perform the goldschmidt division algorithm."
+
+ @property
+ def expanded_width(self):
+ """the total number of bits of precision used inside the algorithm."""
+ return self.io_width + self.extra_precision
+
+ @property
+ def n_d_f_int_wid(self):
+ """the number of bits in the integer part of `state.n`, `state.d`, and
+ `state.f` during the main iteration loop.
+ """
+ return 2
+
+ @property
+ def n_d_f_total_wid(self):
+ """the total number of bits (both integer and fraction bits) in
+ `state.n`, `state.d`, and `state.f` during the main iteration loop.
+ """
+ return self.n_d_f_int_wid + self.expanded_width
+
+ @cache_on_self
+ def max_neps(self, i):
+ """maximum value of `neps[i]`.
+ `neps[i]` is defined to be `n[i] * N_prime[i - 1] * F_prime[i - 1]`.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ return Fraction(1, 1 << self.expanded_width)
+
+ @cache_on_self
+ def max_deps(self, i):
+ """maximum value of `deps[i]`.
+ `deps[i]` is defined to be `d[i] * D_prime[i - 1] * F_prime[i - 1]`.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ return Fraction(1, 1 << self.expanded_width)
+
+ @cache_on_self
+ def max_feps(self, i):
+ """maximum value of `feps[i]`.
+ `feps[i]` is defined to be `f[i] * (2 - D_prime[i - 1])`.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ # zero, because the computation of `F_prime[i]` in
+ # `GoldschmidtDivOp.MulDByF.run(...)` is exact.
+ return Fraction(0)
+
+ @cached_property
+ def e0_range(self):
+ """minimum and maximum values of `e[0]`
+ (the relative error in `F_prime[-1]`)
+ """
+ min_e0 = Fraction(0)
+ max_e0 = Fraction(0)
+ for addr in range(self.table_addr_count):
+ # `F_prime[-1] = (1 - e[0]) / B`
+ # => `e[0] = 1 - B * F_prime[-1]`
+ min_b, max_b = self.table_input_exact_range(addr)
+ f_prime_m1 = self.table[addr].as_fraction()
+ assert min_b >= 0 and f_prime_m1 >= 0, \
+ "only positive quadrant of interval multiplication implemented"
+ min_product = min_b * f_prime_m1
+ max_product = max_b * f_prime_m1
+ # negation swaps min/max
+ cur_min_e0 = 1 - max_product
+ cur_max_e0 = 1 - min_product
+ min_e0 = min(min_e0, cur_min_e0)
+ max_e0 = max(max_e0, cur_max_e0)
+ min_e0 = self._shrink_min(min_e0)
+ max_e0 = self._shrink_max(max_e0)
+ return min_e0, max_e0
+
+ @cached_property
+ def min_e0(self):
+ """minimum value of `e[0]` (the relative error in `F_prime[-1]`)
+ """
+ min_e0, max_e0 = self.e0_range
+ return min_e0
+
+ @cached_property
+ def max_e0(self):
+ """maximum value of `e[0]` (the relative error in `F_prime[-1]`)
+ """
+ min_e0, max_e0 = self.e0_range
+ return max_e0
+
+ @cached_property
+ def max_abs_e0(self):
+ """maximum value of `abs(e[0])`."""
+ return max(abs(self.min_e0), abs(self.max_e0))
+
+ @cached_property
+ def min_abs_e0(self):
+ """minimum value of `abs(e[0])`."""
+ return Fraction(0)
+
+ @cache_on_self
+ def max_n(self, i):
+ """maximum value of `n[i]` (the relative error in `N_prime[i]`
+ relative to the previous iteration)
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # from Claim 10
+ # `n[0] = neps[0] / ((1 - e[0]) * (A / B))`
+ # `n[0] <= 2 * neps[0] / (1 - e[0])`
+
+ assert self.max_e0 < 1 and self.max_neps(0) >= 0, \
+ "only one quadrant of interval division implemented"
+ retval = 2 * self.max_neps(0) / (1 - self.max_e0)
+ elif i == 1:
+ # from Claim 10
+ # `n[1] <= neps[1] / ((1 - f[0]) * (1 - pi[0] - delta[0]))`
+ min_mpd = 1 - self.max_pi(0) - self.max_delta(0)
+ assert self.max_f(0) <= 1 and min_mpd >= 0, \
+ "only one quadrant of interval multiplication implemented"
+ prod = (1 - self.max_f(0)) * min_mpd
+ assert self.max_neps(1) >= 0 and prod > 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_neps(1) / prod
+ else:
+ # from Claim 6
+ # `0 <= n[i] <= 2 * max_neps[i] / (1 - pi[i - 1] - delta[i - 1])`
+ min_mpd = 1 - self.max_pi(i - 1) - self.max_delta(i - 1)
+ assert self.max_neps(i) >= 0 and min_mpd > 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_neps(i) / min_mpd
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_d(self, i):
+ """maximum value of `d[i]` (the relative error in `D_prime[i]`
+ relative to the previous iteration)
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # from Claim 10
+ # `d[0] = deps[0] / (1 - e[0])`
+
+ assert self.max_e0 < 1 and self.max_deps(0) >= 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_deps(0) / (1 - self.max_e0)
+ elif i == 1:
+ # from Claim 10
+ # `d[1] <= deps[1] / ((1 - f[0]) * (1 - delta[0] ** 2))`
+ assert self.max_f(0) <= 1 and self.max_delta(0) <= 1, \
+ "only one quadrant of interval multiplication implemented"
+ divisor = (1 - self.max_f(0)) * (1 - self.max_delta(0) ** 2)
+ assert self.max_deps(1) >= 0 and divisor > 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_deps(1) / divisor
+ else:
+ # from Claim 6
+ # `0 <= d[i] <= max_deps[i] / (1 - delta[i - 1])`
+ assert self.max_deps(i) >= 0 and self.max_delta(i - 1) < 1, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_deps(i) / (1 - self.max_delta(i - 1))
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_f(self, i):
+ """maximum value of `f[i]` (the relative error in `F_prime[i]`
+ relative to the previous iteration)
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # from Claim 10
+ # `f[0] = feps[0] / (1 - delta[0])`
+
+ assert self.max_delta(0) < 1 and self.max_feps(0) >= 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_feps(0) / (1 - self.max_delta(0))
+ elif i == 1:
+ # from Claim 10
+ # `f[1] = feps[1]`
+ retval = self.max_feps(1)
+ else:
+ # from Claim 6
+ # `f[i] <= max_feps[i]`
+ retval = self.max_feps(i)
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_delta(self, i):
+ """ maximum value of `delta[i]`.
+ `delta[i]` is defined in Definition 4 of paper.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # `delta[0] = abs(e[0]) + 3 * d[0] / 2`
+ retval = self.max_abs_e0 + Fraction(3, 2) * self.max_d(0)
+ else:
+ # `delta[i] = delta[i - 1] ** 2 + f[i - 1]`
+ prev_max_delta = self.max_delta(i - 1)
+ assert prev_max_delta >= 0
+ retval = prev_max_delta ** 2 + self.max_f(i - 1)
+
+ # `delta[i]` has to be smaller than one otherwise errors would go off
+ # to infinity
+ _assert_accuracy(retval < 1)
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_pi(self, i):
+ """ maximum value of `pi[i]`.
+ `pi[i]` is defined right below Theorem 5 of paper.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ # `pi[i] = 1 - (1 - n[i]) * prod`
+ # where `prod` is the product of,
+ # for `j` in `0 <= j < i`, `(1 - n[j]) / (1 + d[j])`
+ min_prod = Fraction(1)
+ for j in range(i):
+ max_n_j = self.max_n(j)
+ max_d_j = self.max_d(j)
+ assert max_n_j <= 1 and max_d_j > -1, \
+ "only one quadrant of interval division implemented"
+ min_prod *= (1 - max_n_j) / (1 + max_d_j)
+ max_n_i = self.max_n(i)
+ assert max_n_i <= 1 and min_prod >= 0, \
+ "only one quadrant of interval multiplication implemented"
+ retval = 1 - (1 - max_n_i) * min_prod
+ return self._shrink_max(retval)
+
+ @cached_property
+ def max_n_shift(self):
+ """ maximum value of `state.n_shift`.
+ """
+ # numerator must be less than `denominator << self.io_width`, so
+ # `n_shift` is at most `self.io_width`
+ return self.io_width
+
+ @cached_property
+ def n_hat(self):
+ """ maximum value of, for all `i`, `max_n(i)` and `max_d(i)`
+ """
+ n_hat = Fraction(0)
+ for i in range(self.iter_count):
+ n_hat = max(n_hat, self.max_n(i), self.max_d(i))
+ return self._shrink_max(n_hat)
+
+ def __make_ops(self):
+ """ Goldschmidt division algorithm.
+
+ based on:
+ Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+ A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+ https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+ yields: GoldschmidtDivOp
+ the operations needed to perform the division.
+ """
+ # establish assumptions of the paper's error analysis (section 3.1):
+
+ # 1. normalize so A (numerator) and B (denominator) are in [1, 2)
+ yield GoldschmidtDivOp.Normalize
+
+ # 2. ensure all relative errors from directed rounding are <= 1 / 4.
+ # the assumption is met by multipliers with > 4-bits precision
+ _assert_accuracy(self.expanded_width > 4)
+
+ # 3. require `abs(e[0]) + 3 * d[0] / 2 + f[0] < 1 / 2`.
+ _assert_accuracy(self.max_abs_e0 + 3 * self.max_d(0) / 2
+ + self.max_f(0) < Fraction(1, 2))
+
+ # 4. the initial approximation F'[-1] of 1/B is in [1/2, 1].
+ # (B is the denominator)
+
+ for addr in range(self.table_addr_count):
+ f_prime_m1 = self.table[addr]
+ _assert_accuracy(0.5 <= f_prime_m1 <= 1)
+
+ yield GoldschmidtDivOp.FEqTableLookup
+
+ # we use Setting I (section 4.1 of the paper):
+ # Require `n[i] <= n_hat` and `d[i] <= n_hat` and `f[i] = 0`:
+ # the conditions on n_hat are satisfied by construction.
+ for i in range(self.iter_count):
+ _assert_accuracy(self.max_f(i) == 0)
+ yield GoldschmidtDivOp.MulNByF
+ if i != self.iter_count - 1:
+ yield GoldschmidtDivOp.MulDByF
+ yield GoldschmidtDivOp.FEq2MinusD
+
+ # relative approximation error `p(N_prime[i])`:
+ # `p(N_prime[i]) = (A / B - N_prime[i]) / (A / B)`
+ # `0 <= p(N_prime[i])`
+ # `p(N_prime[i]) <= (2 * i) * n_hat \`
+ # ` + (abs(e[0]) + 3 * n_hat / 2) ** (2 ** i)`
+ i = self.iter_count - 1 # last used `i`
+ # compute power manually to prevent huge intermediate values
+ power = self._shrink_max(self.max_abs_e0 + 3 * self.n_hat / 2)
+ for _ in range(i):
+ power = self._shrink_max(power * power)
+
+ max_rel_error = (2 * i) * self.n_hat + power
+
+ min_a_over_b = Fraction(1, 2)
+ min_abs_error_for_correctness = min_a_over_b / (1 << self.max_n_shift)
+ min_rel_error_for_correctness = (min_abs_error_for_correctness
+ / min_a_over_b)
+
+ _assert_accuracy(
+ max_rel_error < min_rel_error_for_correctness,
+ f"not accurate enough: max_rel_error={max_rel_error}"
+ f" min_rel_error_for_correctness={min_rel_error_for_correctness}")
+
+ yield GoldschmidtDivOp.CalcResult
+
+ @cache_on_self
+ def default_cost_fn(self):
+ """ calculate the estimated cost on an arbitrary scale of implementing
+ goldschmidt division with the specified parameters. larger cost
+ values mean worse parameters.
+
+ This is the default cost function for `GoldschmidtDivParams.get`.
+
+ returns: float
+ """
+ rom_cells = self.table_data_bits << self.table_addr_bits
+ cost = float(rom_cells)
+ for op in self.ops:
+ if op == GoldschmidtDivOp.MulNByF \
+ or op == GoldschmidtDivOp.MulDByF:
+ mul_cost = self.expanded_width ** 2
+ mul_cost *= self.expanded_width.bit_length()
+ cost += mul_cost
+ cost += 5e7 * self.iter_count
+ return cost
+
+ @staticmethod
+ @lru_cache(maxsize=1 << 16)
+ def __cached_new(base_params):
+ assert isinstance(base_params, GoldschmidtDivParamsBase)
+ kwargs = {}
+ for field in fields(GoldschmidtDivParamsBase):
+ kwargs[field] = getattr(base_params, field)
+ try:
+ return GoldschmidtDivParams(**kwargs), None
+ except ParamsNotAccurateEnough as e:
+ return None, e
+
+ @staticmethod
+ def __raise(e): # type: (ParamsNotAccurateEnough) -> Any
+ raise e
+
+ @staticmethod
+ def cached_new(base_params, handle_error=__raise):
+ assert isinstance(base_params, GoldschmidtDivParamsBase)
+ params, error = GoldschmidtDivParams.__cached_new(base_params)
+ if error is None:
+ return params
+ else:
+ return handle_error(error)
+
+ @staticmethod
+ def get(io_width, cost_fn=default_cost_fn, max_table_addr_bits=12):
+ """ find efficient parameters for a goldschmidt division algorithm
+ with `params.io_width == io_width`.
+
+ arguments:
+ io_width: int
+ bit-width of the input divisor and the result.
+ the input numerator is `2 * io_width`-bits wide.
+ cost_fn: Callable[[GoldschmidtDivParams], float]
+ return the estimated cost on an arbitrary scale of implementing
+ goldschmidt division with the specified parameters. larger cost
+ values mean worse parameters.
+ max_table_addr_bits: int
+ maximum allowable value of `table_addr_bits`
+ """
+ assert isinstance(io_width, int) and io_width >= 1
+ assert callable(cost_fn)
+
+ last_error = None
+ last_error_params = None
+
+ def cached_new(base_params):
+ def handle_error(e):
+ nonlocal last_error, last_error_params
+ last_error = e
+ last_error_params = base_params
+ return None
+
+ retval = GoldschmidtDivParams.cached_new(base_params, handle_error)
+ if retval is None:
+ logging.debug(f"GoldschmidtDivParams.get: err: {base_params}")
+ else:
+ logging.debug(f"GoldschmidtDivParams.get: ok: {base_params}")
+ return retval
+
+ @lru_cache(maxsize=None)
+ def get_cost(base_params):
+ params = cached_new(base_params)
+ if params is None:
+ return math.inf
+ retval = cost_fn(params)
+ logging.debug(f"GoldschmidtDivParams.get: cost={retval}: {params}")
+ return retval
+
+ # start with parameters big enough to always work.
+ initial_extra_precision = io_width * 2 + 4
+ initial_params = GoldschmidtDivParamsBase(
+ io_width=io_width,
+ extra_precision=initial_extra_precision,
+ table_addr_bits=min(max_table_addr_bits, io_width),
+ table_data_bits=io_width + initial_extra_precision,
+ iter_count=1 + io_width.bit_length())
+
+ if cached_new(initial_params) is None:
+ raise ValueError(f"initial goldschmidt division algorithm "
+ f"parameters are invalid: {initial_params}"
+ ) from last_error
+
+ # find good initial `iter_count`
+ params = initial_params
+ for iter_count in range(1, initial_params.iter_count):
+ trial_params = replace(params, iter_count=iter_count)
+ if cached_new(trial_params) is not None:
+ params = trial_params
+ break
+
+ # now find `table_addr_bits`
+ cost = get_cost(params)
+ for table_addr_bits in range(1, max_table_addr_bits):
+ trial_params = replace(params, table_addr_bits=table_addr_bits)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ break
+
+ # check one higher `iter_count` to see if it has lower cost
+ for table_addr_bits in range(1, max_table_addr_bits + 1):
+ trial_params = replace(params,
+ table_addr_bits=table_addr_bits,
+ iter_count=params.iter_count + 1)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ break
+
+ # now shrink `table_data_bits`
+ while True:
+ trial_params = replace(params,
+ table_data_bits=params.table_data_bits - 1)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ else:
+ break
+
+ # and shrink `extra_precision`
+ while True:
+ trial_params = replace(params,
+ extra_precision=params.extra_precision - 1)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ else:
+ break
+
+ retval = cached_new(params)
+ assert isinstance(retval, GoldschmidtDivParams)
+ return retval
+
+
+def clz(v, wid):
+ """count leading zeros -- handy for debugging."""
+ assert isinstance(wid, int)
+ assert isinstance(v, int) and 0 <= v < (1 << wid)
+ return (1 << wid).bit_length() - v.bit_length()
+
+
+@enum.unique
+class GoldschmidtDivOp(enum.Enum):
+ Normalize = "n, d, n_shift = normalize(n, d)"
+ FEqTableLookup = "f = table_lookup(d)"
+ MulNByF = "n *= f"
+ MulDByF = "d *= f"
+ FEq2MinusD = "f = 2 - d"
+ CalcResult = "result = unnormalize_and_round(n)"
+
+ def run(self, params, state):
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(state, GoldschmidtDivState)
+ expanded_width = params.expanded_width
+ table_addr_bits = params.table_addr_bits
+ if self == GoldschmidtDivOp.Normalize:
+ # normalize so 1 <= d < 2
+ # can easily be done with count-leading-zeros and left shift
+ while state.d < 1:
+ state.n = (state.n * 2).to_frac_wid(expanded_width)
+ state.d = (state.d * 2).to_frac_wid(expanded_width)
+
+ state.n_shift = 0
+ # normalize so 1 <= n < 2
+ while state.n >= 2:
+ state.n = (state.n * 0.5).to_frac_wid(expanded_width,
+ round_dir=RoundDir.DOWN)
+ state.n_shift += 1
+ elif self == GoldschmidtDivOp.FEqTableLookup:
+ # compute initial f by table lookup
+ d_m_1 = state.d - 1
+ d_m_1 = d_m_1.to_frac_wid(table_addr_bits, RoundDir.DOWN)
+ assert 0 <= d_m_1.bits < (1 << params.table_addr_bits)
+ state.f = params.table[d_m_1.bits]
+ state.f = state.f.to_frac_wid(expanded_width,
+ round_dir=RoundDir.DOWN)
+ elif self == GoldschmidtDivOp.MulNByF:
+ assert state.f is not None
+ n = state.n * state.f
+ state.n = n.to_frac_wid(expanded_width, round_dir=RoundDir.DOWN)
+ elif self == GoldschmidtDivOp.MulDByF:
+ assert state.f is not None
+ d = state.d * state.f
+ state.d = d.to_frac_wid(expanded_width, round_dir=RoundDir.UP)
+ elif self == GoldschmidtDivOp.FEq2MinusD:
+ state.f = (2 - state.d).to_frac_wid(expanded_width)
+ elif self == GoldschmidtDivOp.CalcResult:
+ assert state.n_shift is not None
+ # scale to correct value
+ n = state.n * (1 << state.n_shift)
+
+ state.quotient = math.floor(n)
+ state.remainder = state.orig_n - state.quotient * state.orig_d
+ if state.remainder >= state.orig_d:
+ state.quotient += 1
+ state.remainder -= state.orig_d
+ else:
+ assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+ def gen_hdl(self, params, state, sync_rom):
+ """generate the hdl for this operation.
+
+ arguments:
+ params: GoldschmidtDivParams
+ the goldschmidt division parameters.
+ state: GoldschmidtDivHDLState
+ the input/output state
+ sync_rom: bool
+ true if the rom should be read synchronously rather than
+ combinatorially, incurring an extra clock cycle of latency.
+ """
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(state, GoldschmidtDivHDLState)
+ m = state.m
+ if self == GoldschmidtDivOp.Normalize:
+ # normalize so 1 <= d < 2
+ assert state.d.width == params.io_width
+ assert state.n.width == 2 * params.io_width
+ d_leading_zeros = CLZ(params.io_width)
+ m.submodules.d_leading_zeros = d_leading_zeros
+ m.d.comb += d_leading_zeros.sig_in.eq(state.d)
+ d_shift_out = Signal.like(state.d)
+ m.d.comb += d_shift_out.eq(state.d << d_leading_zeros.lz)
+ d = Signal(params.n_d_f_total_wid)
+ m.d.comb += d.eq((d_shift_out << (1 + params.expanded_width))
+ >> state.d.width)
+
+ # normalize so 1 <= n < 2
+ n_leading_zeros = CLZ(2 * params.io_width)
+ m.submodules.n_leading_zeros = n_leading_zeros
+ m.d.comb += n_leading_zeros.sig_in.eq(state.n)
+ signed_zero = Const(0, signed(1)) # force subtraction to be signed
+ n_shift_s_v = (params.io_width + signed_zero + d_leading_zeros.lz
+ - n_leading_zeros.lz)
+ n_shift_s = Signal.like(n_shift_s_v)
+ n_shift_n_lz_out = Signal.like(state.n)
+ n_shift_d_lz_out = Signal.like(state.n << d_leading_zeros.lz)
+ m.d.comb += [
+ n_shift_s.eq(n_shift_s_v),
+ n_shift_d_lz_out.eq(state.n << d_leading_zeros.lz),
+ n_shift_n_lz_out.eq(state.n << n_leading_zeros.lz),
+ ]
+ state.n_shift = Signal(d_leading_zeros.lz.width)
+ n = Signal(params.n_d_f_total_wid)
+ with m.If(n_shift_s < 0):
+ m.d.comb += [
+ state.n_shift.eq(0),
+ n.eq((n_shift_d_lz_out << (1 + params.expanded_width))
+ >> state.d.width),
+ ]
+ with m.Else():
+ m.d.comb += [
+ state.n_shift.eq(n_shift_s),
+ n.eq((n_shift_n_lz_out << (1 + params.expanded_width))
+ >> state.n.width),
+ ]
+ state.n = n
+ state.d = d
+ elif self == GoldschmidtDivOp.FEqTableLookup:
+ assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+ # compute initial f by table lookup
+
+ # extra bit for table entries == 1.0
+ table_width = 1 + params.table_data_bits
+ table = Memory(width=table_width, depth=len(params.table),
+ init=[i.bits for i in params.table])
+ addr = state.d[:-params.n_d_f_int_wid][-params.table_addr_bits:]
+ if sync_rom:
+ table_read = table.read_port()
+ m.d.comb += table_read.addr.eq(addr)
+ state.insert_pipeline_register()
+ else:
+ table_read = table.read_port(domain="comb")
+ m.d.comb += table_read.addr.eq(addr)
+ m.submodules.table_read = table_read
+ state.f = Signal(params.n_d_f_int_wid + params.expanded_width)
+ data_shift = params.expanded_width - params.table_data_bits
+ m.d.comb += state.f.eq(table_read.data << data_shift)
+ elif self == GoldschmidtDivOp.MulNByF:
+ assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+ assert state.f is not None
+ assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+ n = Signal.like(state.n)
+ m.d.comb += n.eq((state.n * state.f) >> params.expanded_width)
+ state.n = n
+ elif self == GoldschmidtDivOp.MulDByF:
+ assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+ assert state.f is not None
+ assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+ d = Signal.like(state.d)
+ d_times_f = Signal.like(state.d * state.f)
+ m.d.comb += [
+ d_times_f.eq(state.d * state.f),
+ # round the multiplication up
+ d.eq((d_times_f >> params.expanded_width)
+ + (d_times_f[:params.expanded_width] != 0)),
+ ]
+ state.d = d
+ elif self == GoldschmidtDivOp.FEq2MinusD:
+ assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+ f = Signal.like(state.d)
+ m.d.comb += f.eq((2 << params.expanded_width) - state.d)
+ state.f = f
+ elif self == GoldschmidtDivOp.CalcResult:
+ assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+ assert state.n_shift is not None
+ # scale to correct value
+ n = state.n * (1 << state.n_shift)
+ q_approx = Signal(params.io_width)
+ # extra bit for if it's bigger than orig_d
+ r_approx = Signal(params.io_width + 1)
+ adjusted_r = Signal(signed(1 + params.io_width))
+ m.d.comb += [
+ q_approx.eq((state.n << state.n_shift)
+ >> params.expanded_width),
+ r_approx.eq(state.orig_n - q_approx * state.orig_d),
+ adjusted_r.eq(r_approx - state.orig_d),
+ ]
+ state.quotient = Signal(params.io_width)
+ state.remainder = Signal(params.io_width)
+
+ with m.If(adjusted_r >= 0):
+ m.d.comb += [
+ state.quotient.eq(q_approx + 1),
+ state.remainder.eq(adjusted_r),
+ ]
+ with m.Else():
+ m.d.comb += [
+ state.quotient.eq(q_approx),
+ state.remainder.eq(r_approx),
+ ]
+ else:
+ assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+
+@plain_data(repr=False)
+class GoldschmidtDivState:
+ __slots__ = ("orig_n", "orig_d", "n", "d",
+ "f", "quotient", "remainder", "n_shift")
+
+ def __init__(self, orig_n, orig_d, n, d,
+ f=None, quotient=None, remainder=None, n_shift=None):
+ assert isinstance(orig_n, int)
+ assert isinstance(orig_d, int)
+ assert isinstance(n, FixedPoint)
+ assert isinstance(d, FixedPoint)
+ assert f is None or isinstance(f, FixedPoint)
+ assert quotient is None or isinstance(quotient, int)
+ assert remainder is None or isinstance(remainder, int)
+ assert n_shift is None or isinstance(n_shift, int)
+ self.orig_n = orig_n
+ """original numerator"""
+
+ self.orig_d = orig_d
+ """original denominator"""
+
+ self.n = n
+ """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+ self.d = d
+ """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+ self.f = f
+ """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+ self.quotient = quotient
+ """final quotient"""
+
+ self.remainder = remainder
+ """final remainder"""
+
+ self.n_shift = n_shift
+ """amount the numerator needs to be left-shifted at the end of the
+ algorithm.
+ """
+
+ def __repr__(self):
+ fields_str = []
+ for field in fields(GoldschmidtDivState):
+ value = getattr(self, field)
+ if value is None:
+ continue
+ if isinstance(value, int) and field != "n_shift":
+ fields_str.append(f"{field}={hex(value)}")
+ else:
+ fields_str.append(f"{field}={value!r}")
+ return f"GoldschmidtDivState({', '.join(fields_str)})"
+
+
+def goldschmidt_div(n, d, params, trace=lambda state: None):
+ """ Goldschmidt division algorithm.
+
+ based on:
+ Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+ A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+ https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+ arguments:
+ n: int
+ numerator. a `2*width`-bit unsigned integer.
+ must be less than `d << width`, otherwise the quotient wouldn't
+ fit in `width` bits.
+ d: int
+ denominator. a `width`-bit unsigned integer. must not be zero.
+ width: int
+ the bit-width of the inputs/outputs. must be a positive integer.
+ trace: Function[[GoldschmidtDivState], None]
+ called with the initial state and the state after executing each
+ operation in `params.ops`.
+
+ returns: tuple[int, int]
+ the quotient and remainder. a tuple of two `width`-bit unsigned
+ integers.
+ """
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(d, int) and 0 < d < (1 << params.io_width)
+ assert isinstance(n, int) and 0 <= n < (d << params.io_width)
+
+ # this whole algorithm is done with fixed-point arithmetic where values
+ # have `width` fractional bits
+
+ state = GoldschmidtDivState(
+ orig_n=n,
+ orig_d=d,
+ n=FixedPoint(n, params.io_width),
+ d=FixedPoint(d, params.io_width),
+ )
+
+ trace(state)
+ for op in params.ops:
+ op.run(params, state)
+ trace(state)
+
+ assert state.quotient is not None
+ assert state.remainder is not None
+
+ return state.quotient, state.remainder
+
+
+@plain_data(eq=False)
+class GoldschmidtDivHDLState:
+ __slots__ = ("m", "orig_n", "orig_d", "n", "d",
+ "f", "quotient", "remainder", "n_shift")
+
+ __signal_name_prefix = "state_"
+
+ def __init__(self, m, orig_n, orig_d, n, d,
+ f=None, quotient=None, remainder=None, n_shift=None):
+ assert isinstance(m, Module)
+ assert isinstance(orig_n, Signal)
+ assert isinstance(orig_d, Signal)
+ assert isinstance(n, Signal)
+ assert isinstance(d, Signal)
+ assert f is None or isinstance(f, Signal)
+ assert quotient is None or isinstance(quotient, Signal)
+ assert remainder is None or isinstance(remainder, Signal)
+ assert n_shift is None or isinstance(n_shift, Signal)
+
+ self.m = m
+ """The HDL Module"""
+
+ self.orig_n = orig_n
+ """original numerator"""
+
+ self.orig_d = orig_d
+ """original denominator"""
+
+ self.n = n
+ """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+ self.d = d
+ """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+ self.f = f
+ """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+ self.quotient = quotient
+ """final quotient"""
+
+ self.remainder = remainder
+ """final remainder"""
+
+ self.n_shift = n_shift
+ """amount the numerator needs to be left-shifted at the end of the
+ algorithm.
+ """
+
+ # old_signals must be set last
+ self.old_signals = defaultdict(list)
+
+ def __setattr__(self, name, value):
+ assert isinstance(name, str)
+ if name.startswith("_"):
+ return super().__setattr__(name, value)
+ try:
+ old_signals = self.old_signals[name]
+ except AttributeError:
+ # haven't yet finished __post_init__
+ return super().__setattr__(name, value)
+ assert name != "m" and name != "old_signals", f"can't write to {name}"
+ assert isinstance(value, Signal)
+ value.name = f"{self.__signal_name_prefix}{name}_{len(old_signals)}"
+ old_signal = getattr(self, name, None)
+ if old_signal is not None:
+ assert isinstance(old_signal, Signal)
+ old_signals.append(old_signal)
+ return super().__setattr__(name, value)
+
+ def insert_pipeline_register(self):
+ old_prefix = self.__signal_name_prefix
+ try:
+ for field in fields(GoldschmidtDivHDLState):
+ if field.startswith("_") or field == "m":
+ continue
+ old_sig = getattr(self, field, None)
+ if old_sig is None:
+ continue
+ assert isinstance(old_sig, Signal)
+ new_sig = Signal.like(old_sig)
+ setattr(self, field, new_sig)
+ self.m.d.sync += new_sig.eq(old_sig)
+ finally:
+ self.__signal_name_prefix = old_prefix
+
+
+class GoldschmidtDivHDL(Elaboratable):
+ """ Goldschmidt division algorithm.
+
+ based on:
+ Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+ A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+ https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+ attributes:
+ params: GoldschmidtDivParams
+ the goldschmidt division algorithm parameters.
+ pipe_reg_indexes: list[int]
+ the operation indexes where pipeline registers should be inserted.
+ duplicate values mean multiple registers should be inserted for
+ that operation index -- this is useful to allow yosys to spread a
+ multiplication across those multiple pipeline stages.
+ sync_rom: bool
+ true if the rom should be read synchronously rather than
+ combinatorially, incurring an extra clock cycle of latency.
+ n: Signal(unsigned(2 * params.io_width))
+ input numerator. a `2 * params.io_width`-bit unsigned integer.
+ must be less than `d << params.io_width`, otherwise the quotient
+ wouldn't fit in `params.io_width` bits.
+ d: Signal(unsigned(params.io_width))
+ input denominator. a `params.io_width`-bit unsigned integer.
+ must not be zero.
+ q: Signal(unsigned(params.io_width))
+ output quotient. only valid when `n < (d << params.io_width)`.
+ r: Signal(unsigned(params.io_width))
+ output remainder. only valid when `n < (d << params.io_width)`.
+ trace: list[GoldschmidtDivHDLState]
+ list of the initial state and the state after executing each
+ operation in `params.ops`.
+ """
+
+ @property
+ def total_pipeline_registers(self):
+ """the total number of pipeline registers"""
+ return len(self.pipe_reg_indexes) + self.sync_rom
+
+ def __init__(self, params, pipe_reg_indexes=(), sync_rom=False):
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(sync_rom, bool)
+ self.params = params
+ self.pipe_reg_indexes = sorted(int(i) for i in pipe_reg_indexes)
+ self.sync_rom = sync_rom
+ self.n = Signal(unsigned(2 * params.io_width))
+ self.d = Signal(unsigned(params.io_width))
+ self.q = Signal(unsigned(params.io_width))
+ self.r = Signal(unsigned(params.io_width))
+
+ # in constructor so we get trace without needing to call elaborate
+ state = GoldschmidtDivHDLState(
+ m=Module(),
+ orig_n=self.n,
+ orig_d=self.d,
+ n=self.n,
+ d=self.d)
+
+ self.trace = [replace(state)]
+
+ # copy and reverse
+ pipe_reg_indexes = list(reversed(self.pipe_reg_indexes))
+
+ for op_index, op in enumerate(self.params.ops):
+ while len(pipe_reg_indexes) > 0 \
+ and pipe_reg_indexes[-1] <= op_index:
+ pipe_reg_indexes.pop()
+ state.insert_pipeline_register()
+ op.gen_hdl(self.params, state, self.sync_rom)
+ self.trace.append(replace(state))
+
+ while len(pipe_reg_indexes) > 0:
+ pipe_reg_indexes.pop()
+ state.insert_pipeline_register()
+
+ state.m.d.comb += [
+ self.q.eq(state.quotient),
+ self.r.eq(state.remainder),
+ ]
+
+ def elaborate(self, platform):
+ return self.trace[0].m
+
+
+GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID = 2
+
+
+@lru_cache()
+def goldschmidt_sqrt_rsqrt_table(table_addr_bits, table_data_bits):
+ """Generate the look-up table needed for Goldschmidt's square-root and
+ reciprocal-square-root algorithm.
+
+ arguments:
+ table_addr_bits: int
+ the number of address bits for the look-up table.
+ table_data_bits: int
+ the number of data bits for the look-up table.
+ """
+ assert isinstance(table_addr_bits, int) and \
+ table_addr_bits >= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+ assert isinstance(table_data_bits, int) and table_data_bits >= 1
+ table = []
+ table_len = 1 << table_addr_bits
+ for addr in range(table_len):
+ if addr == 0:
+ value = FixedPoint(0, table_data_bits)
+ elif (addr << 2) < table_len:
+ value = None # table entries should be unused
+ else:
+ table_addr_frac_wid = table_addr_bits
+ table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+ max_input_value = FixedPoint(addr + 1, table_addr_bits - 2)
+ max_frac_wid = max(max_input_value.frac_wid, table_data_bits)
+ value = max_input_value.to_frac_wid(max_frac_wid)
+ value = value.rsqrt(RoundDir.DOWN)
+ value = value.to_frac_wid(table_data_bits, RoundDir.DOWN)
+ table.append(value)
+
+ # tuple for immutability
+ return tuple(table)
+
+# FIXME: add code to calculate error bounds and check that the algorithm will
+# actually work (like in the goldschmidt division algorithm).
+# FIXME: add code to calculate a good set of parameters based on the error
+# bounds checking.
+
+
+def goldschmidt_sqrt_rsqrt(radicand, io_width, frac_wid, extra_precision,
+ table_addr_bits, table_data_bits, iter_count):
+ """Goldschmidt's square-root and reciprocal-square-root algorithm.
+
+ uses algorithm based on second method at:
+ https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Goldschmidt%E2%80%99s_algorithm
+
+ arguments:
+ radicand: FixedPoint(frac_wid=frac_wid)
+ the input value to take the square-root and reciprocal-square-root of.
+ io_width: int
+ the number of bits in the input (`radicand`) and output values.
+ frac_wid: int
+ the number of fraction bits in the input (`radicand`) and output
+ values.
+ extra_precision: int
+ the number of bits of internal extra precision.
+ table_addr_bits: int
+ the number of address bits for the look-up table.
+ table_data_bits: int
+ the number of data bits for the look-up table.
+
+ returns: tuple[FixedPoint, FixedPoint]
+ the square-root and reciprocal-square-root, rounded down to the
+ nearest representable value. If `radicand == 0`, then the
+ reciprocal-square-root value returned is zero.
+ """
+ assert (isinstance(radicand, FixedPoint)
+ and radicand.frac_wid == frac_wid
+ and 0 <= radicand.bits < (1 << io_width))
+ assert isinstance(io_width, int) and io_width >= 1
+ assert isinstance(frac_wid, int) and 0 <= frac_wid < io_width
+ assert isinstance(extra_precision, int) and extra_precision >= io_width
+ assert isinstance(table_addr_bits, int) and table_addr_bits >= 1
+ assert isinstance(table_data_bits, int) and table_data_bits >= 1
+ assert isinstance(iter_count, int) and iter_count >= 0
+ expanded_frac_wid = frac_wid + extra_precision
+ s = radicand.to_frac_wid(expanded_frac_wid)
+ sqrt_rshift = extra_precision
+ rsqrt_rshift = extra_precision
+ while s != 0 and s < 1:
+ s = (s * 4).to_frac_wid(expanded_frac_wid)
+ sqrt_rshift += 1
+ rsqrt_rshift -= 1
+ while s >= 4:
+ s = s.div(4, expanded_frac_wid)
+ sqrt_rshift -= 1
+ rsqrt_rshift += 1
+ table = goldschmidt_sqrt_rsqrt_table(table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits)
+ # core goldschmidt sqrt/rsqrt algorithm:
+ # initial setup:
+ table_addr_frac_wid = table_addr_bits
+ table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+ addr = s.to_frac_wid(table_addr_frac_wid, RoundDir.DOWN)
+ assert 0 <= addr.bits < (1 << table_addr_bits), "table addr out of range"
+ f = table[addr.bits]
+ assert f is not None, "accessed invalid table entry"
+ # use with_frac_wid to fix IDE type deduction
+ f = FixedPoint.with_frac_wid(f, expanded_frac_wid, RoundDir.DOWN)
+ x = (s * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ h = (f * 0.5).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ for _ in range(iter_count):
+ # iteration step:
+ f = (1.5 - x * h).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ x = (x * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ h = (h * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ r = 2 * h
+ # now `x` is approximately `sqrt(s)` and `r` is approximately `rsqrt(s)`
+
+ sqrt = FixedPoint(x.bits >> sqrt_rshift, frac_wid)
+ rsqrt = FixedPoint(r.bits >> rsqrt_rshift, frac_wid)
+
+ next_sqrt = FixedPoint(sqrt.bits + 1, frac_wid)
+ if next_sqrt * next_sqrt <= radicand:
+ sqrt = next_sqrt
+
+ next_rsqrt = FixedPoint(rsqrt.bits + 1, frac_wid)
+ if next_rsqrt * next_rsqrt * radicand <= 1 and radicand != 0:
+ rsqrt = next_rsqrt
+ return sqrt, rsqrt
--- /dev/null
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from nmutil.plain_data import fields, replace
+import math
+import unittest
+from nmutil.formaltest import FHDLTestCase
+from nmutil.sim_util import do_sim, hash_256
+from nmigen.sim import Tick, Delay
+from nmigen.hdl.ast import Signal
+from nmigen.hdl.dsl import Module
+from soc.fu.div.experiment.goldschmidt_div_sqrt import (
+ GoldschmidtDivHDL, GoldschmidtDivHDLState, GoldschmidtDivParams,
+ GoldschmidtDivState, ParamsNotAccurateEnough, goldschmidt_div,
+ FixedPoint, RoundDir, goldschmidt_sqrt_rsqrt)
+
+
+class TestFixedPoint(FHDLTestCase):
+ def test_str_roundtrip(self):
+ for frac_wid in range(8):
+ for bits in range(-1 << 9, 1 << 9):
+ with self.subTest(bits=hex(bits), frac_wid=frac_wid):
+ value = FixedPoint(bits, frac_wid)
+ round_trip_value = FixedPoint.cast(str(value))
+ self.assertEqual(value, round_trip_value)
+
+ @staticmethod
+ def trap(f):
+ try:
+ return f(), None
+ except (ValueError, ZeroDivisionError) as e:
+ return None, e.__class__.__name__
+
+ def test_sqrt(self):
+ for frac_wid in range(8):
+ for bits in range(1 << 9):
+ for round_dir in RoundDir:
+ radicand = FixedPoint(bits, frac_wid)
+ expected_f = math.sqrt(float(radicand))
+ expected = self.trap(lambda: FixedPoint.with_frac_wid(
+ expected_f, frac_wid, round_dir))
+ with self.subTest(radicand=repr(radicand),
+ round_dir=str(round_dir),
+ expected=repr(expected)):
+ result = self.trap(lambda: radicand.sqrt(round_dir))
+ self.assertEqual(result, expected)
+
+ def test_rsqrt(self):
+ for frac_wid in range(8):
+ for bits in range(1, 1 << 9):
+ for round_dir in RoundDir:
+ radicand = FixedPoint(bits, frac_wid)
+ expected_f = 1 / math.sqrt(float(radicand))
+ expected = self.trap(lambda: FixedPoint.with_frac_wid(
+ expected_f, frac_wid, round_dir))
+ with self.subTest(radicand=repr(radicand),
+ round_dir=str(round_dir),
+ expected=repr(expected)):
+ result = self.trap(lambda: radicand.rsqrt(round_dir))
+ self.assertEqual(result, expected)
+
+
+class TestGoldschmidtDiv(FHDLTestCase):
+ def test_case1(self):
+ with self.assertRaises(ParamsNotAccurateEnough):
+ GoldschmidtDivParams(io_width=3, extra_precision=2,
+ table_addr_bits=3, table_data_bits=5,
+ iter_count=2)
+
+ def test_case2(self):
+ with self.assertRaises(ParamsNotAccurateEnough):
+ GoldschmidtDivParams(io_width=4, extra_precision=1,
+ table_addr_bits=1, table_data_bits=5,
+ iter_count=1)
+
+ @staticmethod
+ def cases(io_width, cases=None):
+ assert isinstance(io_width, int) and io_width >= 1
+ if cases is not None:
+ for n, d in cases:
+ assert isinstance(d, int) \
+ and 0 < d < (1 << io_width), "invalid case"
+ assert isinstance(n, int) \
+ and 0 <= n < (d << io_width), "invalid case"
+ yield (n, d)
+ elif io_width > 6:
+ assert io_width * 2 <= 256, \
+ "can't generate big enough numbers for test cases"
+ for i in range(10000):
+ d = hash_256(f'd {i}') % (1 << io_width)
+ if d == 0:
+ d = 1
+ n = hash_256(f'n {i}') % (d << io_width)
+ yield (n, d)
+ else:
+ for d in range(1, 1 << io_width):
+ for n in range(d << io_width):
+ yield (n, d)
+
+ def tst(self, io_width, cases=None):
+ assert isinstance(io_width, int)
+ params = GoldschmidtDivParams.get(io_width)
+ with self.subTest(params=str(params)):
+ for n, d in self.cases(io_width, cases):
+ expected_q, expected_r = divmod(n, d)
+ with self.subTest(n=hex(n), d=hex(d),
+ expected_q=hex(expected_q),
+ expected_r=hex(expected_r)):
+ trace = []
+
+ def trace_fn(state):
+ assert isinstance(state, GoldschmidtDivState)
+ trace.append((replace(state)))
+ q, r = goldschmidt_div(n, d, params, trace=trace_fn)
+ with self.subTest(q=hex(q), r=hex(r), trace=repr(trace)):
+ self.assertEqual((q, r), (expected_q, expected_r))
+
+ def tst_sim(self, io_width, cases=None, pipe_reg_indexes=(),
+ sync_rom=False):
+ assert isinstance(io_width, int)
+ params = GoldschmidtDivParams.get(io_width)
+ m = Module()
+ dut = GoldschmidtDivHDL(params, pipe_reg_indexes=pipe_reg_indexes,
+ sync_rom=sync_rom)
+ m.submodules.dut = dut
+ # make sync domain get added
+ m.d.sync += Signal().eq(0)
+
+ def inputs_proc():
+ yield Tick()
+ for n, d in self.cases(io_width, cases):
+ yield dut.n.eq(n)
+ yield dut.d.eq(d)
+ yield Tick()
+
+ def check_interals(n, d):
+ # check internals only if dut is completely combinatorial
+ # so we don't have to figure out how to read values in
+ # previous clock cycles
+ if dut.total_pipeline_registers != 0:
+ return
+ ref_trace = []
+
+ def ref_trace_fn(state):
+ assert isinstance(state, GoldschmidtDivState)
+ ref_trace.append((replace(state)))
+ goldschmidt_div(n=n, d=d, params=params, trace=ref_trace_fn)
+ self.assertEqual(len(dut.trace), len(ref_trace))
+ for index, state in enumerate(dut.trace):
+ ref_state = ref_trace[index]
+ last_op = None if index == 0 else params.ops[index - 1]
+ with self.subTest(index=index, state=repr(state),
+ ref_state=repr(ref_state),
+ last_op=str(last_op)):
+ for field in fields(GoldschmidtDivHDLState):
+ sig = getattr(state, field)
+ if not isinstance(sig, Signal):
+ continue
+ ref_value = getattr(ref_state, field)
+ ref_value_str = repr(ref_value)
+ if isinstance(ref_value, int):
+ ref_value_str = hex(ref_value)
+ value = yield sig
+ with self.subTest(field_name=field,
+ sig=repr(sig),
+ sig_shape=repr(sig.shape()),
+ value=hex(value),
+ ref_value=ref_value_str):
+ if isinstance(ref_value, int):
+ self.assertEqual(value, ref_value)
+ else:
+ assert isinstance(ref_value, FixedPoint)
+ self.assertEqual(value, ref_value.bits)
+
+ def check_outputs():
+ yield Tick()
+ for _ in range(dut.total_pipeline_registers):
+ yield Tick()
+ for n, d in self.cases(io_width, cases):
+ yield Delay(0.1e-6)
+ expected_q, expected_r = divmod(n, d)
+ with self.subTest(n=hex(n), d=hex(d),
+ expected_q=hex(expected_q),
+ expected_r=hex(expected_r)):
+ q = yield dut.q
+ r = yield dut.r
+ with self.subTest(q=hex(q), r=hex(r)):
+ self.assertEqual((q, r), (expected_q, expected_r))
+ yield from check_interals(n, d)
+
+ yield Tick()
+
+ with self.subTest(params=str(params)):
+ with do_sim(self, m, (dut.n, dut.d, dut.q, dut.r)) as sim:
+ sim.add_clock(1e-6)
+ sim.add_process(inputs_proc)
+ sim.add_process(check_outputs)
+ sim.run()
+
+ def test_1_through_4(self):
+ for io_width in range(1, 4 + 1):
+ with self.subTest(io_width=io_width):
+ self.tst(io_width)
+
+ def test_5(self):
+ self.tst(5)
+
+ def test_6(self):
+ self.tst(6)
+
+ def test_8(self):
+ self.tst(8)
+
+ def test_16(self):
+ self.tst(16)
+
+ def test_32(self):
+ self.tst(32)
+
+ def test_64(self):
+ self.tst(64)
+
+ def test_sim_5(self):
+ self.tst_sim(5)
+
+ def test_sim_8(self):
+ self.tst_sim(8)
+
+ def test_sim_16(self):
+ self.tst_sim(16)
+
+ def test_sim_32(self):
+ self.tst_sim(32)
+
+ def test_sim_64(self):
+ self.tst_sim(64)
+
+ def tst_params(self, io_width):
+ assert isinstance(io_width, int)
+ params = GoldschmidtDivParams.get(io_width)
+ print()
+ print(params)
+
+ def test_params_1(self):
+ self.tst_params(1)
+
+ def test_params_2(self):
+ self.tst_params(2)
+
+ def test_params_3(self):
+ self.tst_params(3)
+
+ def test_params_4(self):
+ self.tst_params(4)
+
+ def test_params_5(self):
+ self.tst_params(5)
+
+ def test_params_6(self):
+ self.tst_params(6)
+
+ def test_params_7(self):
+ self.tst_params(7)
+
+ def test_params_8(self):
+ self.tst_params(8)
+
+ def test_params_9(self):
+ self.tst_params(9)
+
+ def test_params_10(self):
+ self.tst_params(10)
+
+ def test_params_11(self):
+ self.tst_params(11)
+
+ def test_params_12(self):
+ self.tst_params(12)
+
+ def test_params_13(self):
+ self.tst_params(13)
+
+ def test_params_14(self):
+ self.tst_params(14)
+
+ def test_params_15(self):
+ self.tst_params(15)
+
+ def test_params_16(self):
+ self.tst_params(16)
+
+ def test_params_17(self):
+ self.tst_params(17)
+
+ def test_params_18(self):
+ self.tst_params(18)
+
+ def test_params_19(self):
+ self.tst_params(19)
+
+ def test_params_20(self):
+ self.tst_params(20)
+
+ def test_params_21(self):
+ self.tst_params(21)
+
+ def test_params_22(self):
+ self.tst_params(22)
+
+ def test_params_23(self):
+ self.tst_params(23)
+
+ def test_params_24(self):
+ self.tst_params(24)
+
+ def test_params_25(self):
+ self.tst_params(25)
+
+ def test_params_26(self):
+ self.tst_params(26)
+
+ def test_params_27(self):
+ self.tst_params(27)
+
+ def test_params_28(self):
+ self.tst_params(28)
+
+ def test_params_29(self):
+ self.tst_params(29)
+
+ def test_params_30(self):
+ self.tst_params(30)
+
+ def test_params_31(self):
+ self.tst_params(31)
+
+ def test_params_32(self):
+ self.tst_params(32)
+
+ def test_params_33(self):
+ self.tst_params(33)
+
+ def test_params_34(self):
+ self.tst_params(34)
+
+ def test_params_35(self):
+ self.tst_params(35)
+
+ def test_params_36(self):
+ self.tst_params(36)
+
+ def test_params_37(self):
+ self.tst_params(37)
+
+ def test_params_38(self):
+ self.tst_params(38)
+
+ def test_params_39(self):
+ self.tst_params(39)
+
+ def test_params_40(self):
+ self.tst_params(40)
+
+ def test_params_41(self):
+ self.tst_params(41)
+
+ def test_params_42(self):
+ self.tst_params(42)
+
+ def test_params_43(self):
+ self.tst_params(43)
+
+ def test_params_44(self):
+ self.tst_params(44)
+
+ def test_params_45(self):
+ self.tst_params(45)
+
+ def test_params_46(self):
+ self.tst_params(46)
+
+ def test_params_47(self):
+ self.tst_params(47)
+
+ def test_params_48(self):
+ self.tst_params(48)
+
+ def test_params_49(self):
+ self.tst_params(49)
+
+ def test_params_50(self):
+ self.tst_params(50)
+
+ def test_params_51(self):
+ self.tst_params(51)
+
+ def test_params_52(self):
+ self.tst_params(52)
+
+ def test_params_53(self):
+ self.tst_params(53)
+
+ def test_params_54(self):
+ self.tst_params(54)
+
+ def test_params_55(self):
+ self.tst_params(55)
+
+ def test_params_56(self):
+ self.tst_params(56)
+
+ def test_params_57(self):
+ self.tst_params(57)
+
+ def test_params_58(self):
+ self.tst_params(58)
+
+ def test_params_59(self):
+ self.tst_params(59)
+
+ def test_params_60(self):
+ self.tst_params(60)
+
+ def test_params_61(self):
+ self.tst_params(61)
+
+ def test_params_62(self):
+ self.tst_params(62)
+
+ def test_params_63(self):
+ self.tst_params(63)
+
+ def test_params_64(self):
+ self.tst_params(64)
+
+
+class TestGoldschmidtSqrtRSqrt(FHDLTestCase):
+ def tst(self, io_width, frac_wid, extra_precision,
+ table_addr_bits, table_data_bits, iter_count):
+ assert isinstance(io_width, int)
+ assert isinstance(frac_wid, int)
+ assert isinstance(extra_precision, int)
+ assert isinstance(table_addr_bits, int)
+ assert isinstance(table_data_bits, int)
+ assert isinstance(iter_count, int)
+ with self.subTest(io_width=io_width, frac_wid=frac_wid,
+ extra_precision=extra_precision,
+ table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits,
+ iter_count=iter_count):
+ for bits in range(1 << io_width):
+ radicand = FixedPoint(bits, frac_wid)
+ expected_sqrt = radicand.sqrt(RoundDir.DOWN)
+ expected_rsqrt = FixedPoint(0, frac_wid)
+ if radicand > 0:
+ expected_rsqrt = radicand.rsqrt(RoundDir.DOWN)
+ with self.subTest(radicand=repr(radicand),
+ expected_sqrt=repr(expected_sqrt),
+ expected_rsqrt=repr(expected_rsqrt)):
+ sqrt, rsqrt = goldschmidt_sqrt_rsqrt(
+ radicand=radicand, io_width=io_width,
+ frac_wid=frac_wid,
+ extra_precision=extra_precision,
+ table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits,
+ iter_count=iter_count)
+ with self.subTest(sqrt=repr(sqrt), rsqrt=repr(rsqrt)):
+ self.assertEqual((sqrt, rsqrt),
+ (expected_sqrt, expected_rsqrt))
+
+ def test1(self):
+ self.tst(io_width=16, frac_wid=8, extra_precision=20,
+ table_addr_bits=4, table_data_bits=28, iter_count=4)
+
+
+if __name__ == "__main__":
+ unittest.main()
import enum
from nmigen import Elaboratable, Module, Signal, Shape, unsigned, Cat, Mux
from soc.fu.div.pipe_data import CoreInputData, CoreOutputData, DivPipeSpec
-from nmutil.iocontrol import PrevControl, NextControl
from nmutil.singlepipe import ControlBase
from ieee754.div_rem_sqrt_rsqrt.core import DivPipeCoreOperation
class FSMDivCoreStage(ControlBase):
def __init__(self, pspec):
- super().__init__()
- self.pspec = pspec
- self.p.data_i = CoreInputData(pspec)
- self.n.data_o = CoreOutputData(pspec)
- self.saved_input_data = CoreInputData(pspec)
+ self.pspec = pspec # store now: used in ispec and ospec
+ super().__init__(stage=self)
+ self.saved_input_data = self.ispec()
self.empty = Signal(reset=1)
self.saved_state = DivState(64, name="saved_state")
self.div_state_next = DivStateNext(64)
self.div_state_init = DivStateInit(64)
self.divisor = Signal(unsigned(64))
+ def ispec(self):
+ return CoreInputData(self.pspec)
+
+ def ospec(self):
+ return CoreOutputData(self.pspec)
+
+ # an extremely rare (and catastrophic) coredump in the binary executable
+ # known as "python 3.7" requires the addition of this function.
+ # no, that's not a "crash which most n00bs call an exception", being
+ # thrown: that's an *actual* coredump created by /usr/bin/python3.7 which
+ # actually segfaults if this function is not added. no idea why.
+ def setup(self, m, i):
+ pass
+
def elaborate(self, platform):
m = super().elaborate(platform)
m.submodules.div_state_next = self.div_state_next
m.submodules.div_state_init = self.div_state_init
- data_i = self.p.data_i
- data_o = self.n.data_o
- core_i = data_i.core
- core_o = data_o.core
+ i_data = self.p.i_data
+ o_data = self.n.o_data
+ core_i = i_data.core
+ core_o = o_data.core
core_saved_i = self.saved_input_data.core
m.d.comb += self.div_state_init.dividend.eq(core_i.dividend)
- m.d.comb += data_o.eq_without_core(self.saved_input_data)
+ m.d.comb += o_data.eq_without_core(self.saved_input_data)
m.d.comb += core_o.quotient_root.eq(self.div_state_next.o.quotient)
# fract width of `DivPipeCoreOutputData.remainder`
remainder_fract_width = 64 * 3
rem_start = remainder_fract_width - dividend_fract_width
m.d.comb += core_o.remainder.eq(self.div_state_next.o.remainder
<< rem_start)
- m.d.comb += self.n.valid_o.eq(
+ m.d.comb += self.n.o_valid.eq(
~self.empty & self.saved_state.will_be_done_after(1))
- m.d.comb += self.p.ready_o.eq(self.empty)
+ m.d.comb += self.p.o_ready.eq(self.empty)
m.d.sync += self.saved_state.eq(self.div_state_next.o)
with m.If(self.empty):
m.d.comb += self.div_state_next.i.eq(self.div_state_init.o)
m.d.comb += self.div_state_next.divisor.eq(core_i.divisor_radicand)
- with m.If(self.p.valid_i):
+ with m.If(self.p.i_valid):
m.d.sync += self.empty.eq(0)
- m.d.sync += self.saved_input_data.eq(data_i)
+ m.d.sync += self.saved_input_data.eq(i_data)
with m.Else():
m.d.comb += [
self.div_state_next.i.eq(self.saved_state),
self.div_state_next.divisor.eq(core_saved_i.divisor_radicand)]
- with m.If(self.n.ready_i & self.n.valid_o):
+ with m.If(self.n.i_ready & self.n.o_valid):
m.d.sync += self.empty.eq(1)
return m
from nmutil.pipemodbase import PipeModBase
from soc.fu.logical.pipe_data import LogicalInputData
from soc.fu.div.pipe_data import DivMulOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
from openpower.decoder.power_fields import DecodeFields
class DivInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB/immediate
- ('XER', 'xer_so', '32'), ] # XER bit 32: SO
-
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b = self.ra, self.rb
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), ] # XER bit 32: SO
+
# output stage shared between div and mul: like ALUOutputData but no CA/32
class DivMulOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'),
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
- ('XER', 'xer_so', '32')]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
+ ('XER', 'xer_so', '32')]
+
class DivPipeKindConfigBase:
def __init__(self,
class DivPipeSpec(CommonPipeSpec):
- def __init__(self, id_wid, div_pipe_kind):
- super().__init__(id_wid=id_wid)
+ def __init__(self, id_wid, parent_pspec, div_pipe_kind):
+ super().__init__(id_wid=id_wid, parent_pspec=parent_pspec)
self.div_pipe_kind = div_pipe_kind
self.core_config = div_pipe_kind.config.core_config
- regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+ regspecklses = (DivInputData, DivMulOutputData)
opsubsetkls = CompLogicalOpSubset
class DivPipeSpecDivPipeCore(DivPipeSpec):
- def __init__(self, id_wid):
- super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.DivPipeCore)
+ def __init__(self, id_wid, parent_pspec):
+ super().__init__(id_wid=id_wid,
+ parent_pspec=parent_pspec,
+ div_pipe_kind=DivPipeKind.DivPipeCore)
class DivPipeSpecFSMDivCore(DivPipeSpec):
- def __init__(self, id_wid):
- super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.FSMDivCore)
+ def __init__(self, id_wid, parent_pspec):
+ super().__init__(id_wid=id_wid,
+ parent_pspec=parent_pspec,
+ div_pipe_kind=DivPipeKind.FSMDivCore)
class DivPipeSpecSimOnly(DivPipeSpec):
- def __init__(self, id_wid):
- super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.SimOnly)
+ def __init__(self, id_wid, parent_pspec):
+ super().__init__(id_wid=id_wid,
+ parent_pspec=parent_pspec,
+ div_pipe_kind=DivPipeKind.SimOnly)
class CoreBaseData(DivInputData):
class DivStagesStart(PipeModBaseChain):
def get_chain(self):
alu_input = DivMulInputStage(self.pspec)
+ return [alu_input]
+
+
+class DivStagesSetup(PipeModBaseChain):
+ def get_chain(self):
div_setup = DivSetupStage(self.pspec)
if isinstance(self.pspec.div_pipe_kind.config,
DivPipeKindConfigCombPipe):
core_setup = [DivCoreSetupStage(self.pspec)]
else:
core_setup = ()
- return [alu_input, div_setup, *core_setup]
+ return [div_setup, *core_setup]
class DivStagesMiddle(PipeModBaseChain):
else:
core_final = ()
div_out = DivOutputStage(self.pspec)
- alu_out = DivMulOutputStage(self.pspec)
self.div_out = div_out # debugging - bug #425
- return [*core_final, div_out, alu_out]
+ return [*core_final, div_out]
+
+
+class DivStagesFinalise(PipeModBaseChain):
+ def get_chain(self):
+ alu_out = DivMulOutputStage(self.pspec)
+ return [alu_out]
class DivBasePipe(ControlBase):
ControlBase.__init__(self)
self.pspec = pspec
self.pipe_start = DivStagesStart(pspec)
+ self.pipe_setup = DivStagesSetup(pspec)
self.pipe_middles = []
if isinstance(self.pspec.div_pipe_kind.config,
DivPipeKindConfigCombPipe):
self.pipe_middles.append(
self.pspec.div_pipe_kind.config.core_stage_class(pspec))
self.pipe_end = DivStagesEnd(pspec)
+ self.pipe_final = DivStagesFinalise(pspec)
self._eqs = self.connect([self.pipe_start,
+ self.pipe_setup,
*self.pipe_middles,
- self.pipe_end])
+ self.pipe_end,
+ self.pipe_final])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
m.submodules.pipe_start = self.pipe_start
+ m.submodules.pipe_setup = self.pipe_setup
for i in range(len(self.pipe_middles)):
name = f"pipe_middle_{i}"
setattr(m.submodules, name, self.pipe_middles[i])
m.submodules.pipe_end = self.pipe_end
+ m.submodules.pipe_final = self.pipe_final
m.d.comb += self._eqs
return m
from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
from nmutil.pipemodbase import PipeModBase
from soc.fu.div.pipe_data import DivInputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
from openpower.decoder.power_fields import DecodeFields
return CoreInputData(self.pspec)
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
# convenience variables
# work out if a/b are negative (check 32-bit / signed)
comb += dividend_neg_o.eq(Mux(op.is_32bit,
- a[31], a[63]) & op.is_signed)
- comb += divisor_neg_o.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+ a[31], a[XLEN-1]) & op.is_signed)
+ comb += divisor_neg_o.eq(Mux(op.is_32bit,
+ b[31], b[XLEN-1]) & op.is_signed)
# negation of a 64-bit value produces the same lower 32-bit
# result as negation of just the lower 32-bits, so we don't
# need to do anything special before negating
- abs_dor = Signal(64, reset_less=True) # absolute of divisor
- abs_dend = Signal(64, reset_less=True) # absolute of dividend
+ abs_dor = Signal(XLEN, reset_less=True) # absolute of divisor
+ abs_dend = Signal(XLEN, reset_less=True) # absolute of dividend
comb += abs_dor.eq(Mux(divisor_neg_o, -b, b))
comb += abs_dend.eq(Mux(dividend_neg_o, -a, a))
with m.If(op.is_32bit):
comb += dividend_o.eq(abs_dend[0:32] << 32)
with m.Else():
- comb += dividend_o.eq(abs_dend[0:64] << 64)
+ comb += dividend_o.eq(abs_dend[0:XLEN] << XLEN)
###### sticky overflow and context, both pass-through #####
def set_alu_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
yield from ALUHelpers.set_int_ra(alu, dec2, inp)
# note that it is critically important to do this
# for DIV otherwise it starts trying to produce
# multiple results.
- yield alu.p.valid_i.eq(1)
+ yield alu.p.i_valid.eq(1)
yield
- yield alu.p.valid_i.eq(0)
+ yield alu.p.i_valid.eq(0)
opname = code.split(' ')[0]
fnname = opname.replace(".", "_")
yield from isa_sim.call(opname)
index = isa_sim.pc.CIA.value//4
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
while not vld:
yield
yield Delay(0.1e-6)
print(f"time: {sim._engine.now * 1e6}us")
except AttributeError:
pass
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
# bug #425 investigation
do = alu.pipe_end.div_out
ctx_op = do.i.ctx.op
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = DivPipeSpec(
+ id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
m.submodules.alu = alu = DivBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
print("oe, oe_ok", oe, oe_ok)
if not oe or not oe_ok:
# if OE not enabled, XER SO and OV must not be activated
- so_ok = yield alu.n.data_o.xer_so.ok
- ov_ok = yield alu.n.data_o.xer_ov.ok
+ so_ok = yield alu.n.o_data.xer_so.ok
+ ov_ok = yield alu.n.o_data.xer_ov.ok
print("so, ov", so_ok, ov_ok)
self.assertEqual(ov_ok, False, code)
self.assertEqual(so_ok, False, code)
class TestPipeIlang(unittest.TestCase):
def write_ilang(self, div_pipe_kind):
- pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = DivPipeSpec(
+ id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
alu = DivBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open(f"div_pipeline_{div_pipe_kind.name}.il", "w") as f:
('is_signed', 1),
('data_len', 4),
('byte_reverse', 1),
+ ('reserve', 1), # atomic update
('sign_extend', 1),
('ldst_mode', LDSTMode),
('insn', 32),
from nmigen import (Elaboratable, Module, Signal, Shape, unsigned, Cat, Mux,
Record, Memory,
- Const)
+ Const, C)
from nmutil.iocontrol import RecordObject
-from nmutil.util import rising_edge
+from nmutil.util import rising_edge, Display
from enum import Enum, unique
from soc.experiment.dcache import DCache
+from soc.experiment.icache import ICache
from soc.experiment.pimem import PortInterfaceBase
from soc.experiment.mem_types import LoadStore1ToMMUType
from soc.experiment.mem_types import MMUToLoadStore1Type
IDLE = 0 # ready for instruction
ACK_WAIT = 1 # waiting for ack from dcache
MMU_LOOKUP = 2 # waiting for MMU to look up translation
- TLBIE_WAIT = 3 # waiting for MMU to finish doing a tlbie
+ #SECOND_REQ = 3 # second request for unaligned transfer
+
+@unique
+class Misalign(Enum):
+ ONEWORD = 0 # only one word needed, all good
+ NEED2WORDS = 1 # need to send/receive two words
+ WAITFIRST = 2 # waiting for the first word
+ WAITSECOND = 3 # waiting for the second word
# captures the LDSTRequest from the PortInterface, which "blips" most
self.load = Signal()
self.dcbz = Signal()
- self.addr = Signal(64)
+ self.raddr = Signal(64)
# self.store_data = Signal(64) # this is already sync (on a delay)
- self.byte_sel = Signal(8)
+ self.byte_sel = Signal(16)
self.nc = Signal() # non-cacheable access
self.virt_mode = Signal()
self.priv_mode = Signal()
+ self.mode_32bit = Signal() # XXX UNUSED AT PRESENT
+ self.alignstate = Signal(Misalign) # progress of alignment request
self.align_intr = Signal()
+ # atomic (LR/SC reservation)
+ self.reserve = Signal()
+ self.atomic = Signal()
+ self.atomic_last = Signal()
+
# glue logic for microwatt mmu and dcache
class LoadStore1(PortInterfaceBase):
addrwid = pspec.addr_wid
super().__init__(regwid, addrwid)
- self.dcache = DCache()
+ self.dcache = DCache(pspec)
+ self.icache = ICache(pspec)
# these names are from the perspective of here (LoadStore1)
self.d_out = self.dcache.d_in # in to dcache is out for LoadStore
self.d_in = self.dcache.d_out # out from dcache is in for LoadStore
- self.m_out = LoadStore1ToMMUType() # out *to* MMU
- self.m_in = MMUToLoadStore1Type() # in *from* MMU
+ self.i_out = self.icache.i_in # in to icache is out for LoadStore
+ self.i_in = self.icache.i_out # out from icache is in for LoadStore
+ self.m_out = LoadStore1ToMMUType("m_out") # out *to* MMU
+ self.m_in = MMUToLoadStore1Type("m_in") # in *from* MMU
self.req = LDSTRequest(name="ldst_req")
# TODO, convert dcache wb_in/wb_out to "standard" nmigen Wishbone bus
self.dbus = Record(make_wb_layout(pspec))
+ self.ibus = Record(make_wb_layout(pspec))
# for creating a single clock blip to DCache
self.d_valid = Signal()
self.d_w_valid = Signal()
self.d_validblip = Signal()
- # DSISR and DAR cached values. note that the MMU FSM is where
- # these are accessed by OP_MTSPR/OP_MFSPR, on behalf of LoadStore1.
- # by contrast microwatt has the spr set/get done *in* loadstore1.vhdl
- self.dsisr = Signal(64)
- self.dar = Signal(64)
-
# state info for LD/ST
self.done = Signal()
+ self.done_delay = Signal()
# latch most of the input request
self.load = Signal()
self.tlbie = Signal()
self.dcbz = Signal()
- self.addr = Signal(64)
- self.store_data = Signal(64)
- self.load_data = Signal(64)
- self.byte_sel = Signal(8)
+ self.raddr = Signal(64)
+ self.maddr = Signal(64)
+ self.store_data = Signal(64) # first half (aligned)
+ self.store_data2 = Signal(64) # second half (misaligned)
+ self.load_data = Signal(128) # 128 to cope with misalignment
+ self.load_data_delay = Signal(128) # perform 2 LD/STs
+ self.byte_sel = Signal(16) # also for misaligned, 16-bit
+ self.alignstate = Signal(Misalign) # progress of alignment request
+ self.next_addr = Signal(64) # 2nd (aligned) read/write addr
#self.xerc : xer_common_t;
- #self.reserve = Signal()
- #self.atomic = Signal()
- #self.atomic_last = Signal()
#self.rc = Signal()
self.nc = Signal() # non-cacheable access
- self.virt_mode = Signal()
- self.priv_mode = Signal()
- self.state = Signal(State)
- self.instr_fault = Signal()
+ self.mode_32bit = Signal() # XXX UNUSED AT PRESENT
+ self.state = Signal(State)
+ self.instr_fault = Signal() # indicator to request i-cache MMU lookup
+ self.r_instr_fault = Signal() # accessed in external_busy
+ self.priv_mode = Signal() # only for instruction fetch (not LDST)
self.align_intr = Signal()
self.busy = Signal()
self.wait_dcache = Signal()
self.wait_mmu = Signal()
- #self.mode_32bit = Signal()
+ self.lrsc_misalign = Signal()
#self.intr_vec : integer range 0 to 16#fff#;
#self.nia = Signal(64)
#self.srr1 = Signal(16)
-
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+ # use these to set the dsisr or dar respectively
+ self.mmu_set_spr = Signal()
+ self.mmu_set_dsisr = Signal()
+ self.mmu_set_dar = Signal()
+ self.sprval_in = Signal(64)
+
+ # ONLY access these read-only, do NOT attempt to change
+ self.dsisr = Signal(32)
+ self.dar = Signal(64)
+
+ # when external_busy set, do not allow PortInterface to proceed
+ def external_busy(self, m):
+ return self.instr_fault | self.r_instr_fault
+
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
+ m.d.comb += self.req.nc.eq(is_nc)
m.d.comb += self.req.load.eq(0) # store operation
m.d.comb += self.req.byte_sel.eq(mask)
- m.d.comb += self.req.addr.eq(addr)
- m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem ==> priv
- m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
- m.d.comb += self.req.align_intr.eq(misalign)
+ m.d.comb += self.req.raddr.eq(addr)
+ m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem ==> priv
+ m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+ m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
+ m.d.comb += self.req.dcbz.eq(is_dcbz)
+ with m.If(misalign):
+ m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+ m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
+
+ # m.d.comb += Display("set_wr_addr %i dcbz %i",addr,is_dcbz)
+
# option to disable the cache entirely for write
if self.disable_cache:
m.d.comb += self.req.nc.eq(1)
+
+ # dcbz cannot do no-cache
+ with m.If(is_dcbz & self.req.nc):
+ m.d.comb += self.req.align_intr.eq(1)
+
+ # hmm, rather than add yet another argument to set_wr_addr
+ # read direct from PortInterface
+ m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+ m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+ m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
return None
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.d_valid.eq(1)
m.d.comb += self.req.load.eq(1) # load operation
m.d.comb += self.req.byte_sel.eq(mask)
- m.d.comb += self.req.align_intr.eq(misalign)
- m.d.comb += self.req.addr.eq(addr)
- m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem ==> priv
- m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
+ m.d.comb += self.req.raddr.eq(addr)
+ m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem ==> priv
+ m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+ m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
+ m.d.comb += self.req.nc.eq(is_nc)
# BAD HACK! disable cacheing on LD when address is 0xCxxx_xxxx
# this is for peripherals. same thing done in Microwatt loadstore1.vhdl
with m.If(addr[28:] == Const(0xc, 4)):
# option to disable the cache entirely for read
if self.disable_cache:
m.d.comb += self.req.nc.eq(1)
+ with m.If(misalign):
+ # need two reads: prepare next address in advance
+ m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+ m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
+
+ # hmm, rather than add yet another argument to set_rd_addr
+ # read direct from PortInterface
+ m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+ m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+ m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
return None #FIXME return value
def set_wr_data(self, m, data, wen):
# put data into comb which is picked up in main elaborate()
m.d.comb += self.d_w_valid.eq(1)
m.d.comb += self.store_data.eq(data)
- #m.d.sync += self.d_out.byte_sel.eq(wen) # this might not be needed
+ m.d.sync += self.store_data2.eq(data[64:128])
st_ok = self.done # TODO indicates write data is valid
+ m.d.comb += self.pi.store_done.data.eq(self.d_in.store_done)
+ m.d.comb += self.pi.store_done.ok.eq(1)
return st_ok
def get_rd_data(self, m):
- ld_ok = self.done # indicates read data is valid
- data = self.load_data # actual read data
+ ld_ok = self.done_delay # indicates read data is valid
+ data = self.load_data_delay # actual read data
return data, ld_ok
def elaborate(self, platform):
m = super().elaborate(platform)
comb, sync = m.d.comb, m.d.sync
- # create dcache module
+ # microwatt takes one more cycle before next operation can be issued
+ sync += self.done_delay.eq(self.done)
+ #sync += self.load_data_delay[0:64].eq(self.load_data[0:64])
+
+ # create dcache and icache module
m.submodules.dcache = dcache = self.dcache
+ m.submodules.icache = icache = self.icache
# temp vars
d_out, d_in, dbus = self.d_out, self.d_in, self.dbus
+ i_out, i_in, ibus = self.i_out, self.i_in, self.ibus
m_out, m_in = self.m_out, self.m_in
exc = self.pi.exc_o
exception = exc.happened
mmureq = Signal()
- # copy of address, but gets over-ridden for OP_FETCH_FAILED
+ # copy of address, but gets over-ridden for instr_fault
maddr = Signal(64)
- m.d.comb += maddr.eq(self.addr)
+ m.d.comb += maddr.eq(self.raddr)
+
+ # check for LR/SC misalignment, used in set_rd/wr_addr above
+ comb += self.lrsc_misalign.eq(((self.pi.data_len[0:3]-1) &
+ self.req.raddr[0:3]).bool())
+ with m.If(self.lrsc_misalign & self.req.reserve):
+ m.d.comb += self.req.align_intr.eq(1)
# create a blip (single pulse) on valid read/write request
# this can be over-ridden in the FSM to get dcache to re-run
# a request when MMU_LOOKUP completes.
m.d.comb += self.d_validblip.eq(rising_edge(m, self.d_valid))
ldst_r = LDSTRequest("ldst_r")
+ sync += Display("MMUTEST: LoadStore1 d_in.error=%i",d_in.error)
# fsm skeleton
with m.Switch(self.state):
with m.Case(State.IDLE):
- with m.If(self.d_validblip & ~exc.happened):
+ sync += self.load_data_delay.eq(0) # clear out
+ with m.If((self.d_validblip | self.instr_fault) &
+ ~exc.happened):
comb += self.busy.eq(1)
sync += self.state.eq(State.ACK_WAIT)
sync += ldst_r.eq(self.req) # copy of LDSTRequest on "blip"
+ # sync += Display("validblip self.req.virt_mode=%i",
+ # self.req.virt_mode)
+ with m.If(self.instr_fault):
+ comb += mmureq.eq(1)
+ sync += self.r_instr_fault.eq(1)
+ comb += maddr.eq(self.maddr)
+ sync += self.state.eq(State.MMU_LOOKUP)
+ with m.Else():
+ sync += self.r_instr_fault.eq(0)
+ # if the LD/ST requires two dwords, move to waiting
+ # for first word
+ with m.If(self.req.alignstate == Misalign.NEED2WORDS):
+ sync += ldst_r.alignstate.eq(Misalign.WAITFIRST)
with m.Else():
sync += ldst_r.eq(0)
# waiting for completion
with m.Case(State.ACK_WAIT):
+ sync += Display("MMUTEST: ACK_WAIT")
comb += self.busy.eq(~exc.happened)
with m.If(d_in.error):
comb += exception.eq(1)
sync += self.state.eq(State.IDLE)
sync += ldst_r.eq(0)
- sync += self.dsisr[63 - 38].eq(~self.load)
+ sync += Display("cache error -> update dsisr")
+ sync += self.dsisr[63 - 38].eq(~ldst_r.load)
# XXX there is no architected bit for this
# (probably should be a machine check in fact)
sync += self.dsisr[63 - 35].eq(d_in.cache_paradox)
+ sync += self.r_instr_fault.eq(0)
with m.Else():
# Look up the translation for TLB miss
comb += mmureq.eq(1)
sync += self.state.eq(State.MMU_LOOKUP)
with m.If(d_in.valid):
- m.d.comb += self.done.eq(~mmureq) # done if not doing MMU
with m.If(self.done):
- sync += Display("ACK_WAIT, done %x", self.addr)
- sync += self.state.eq(State.IDLE)
- sync += ldst_r.eq(0)
- with m.If(self.load):
- m.d.comb += self.load_data.eq(d_in.data)
+ sync += Display("ACK_WAIT, done %x", self.raddr)
+ with m.If(ldst_r.alignstate == Misalign.ONEWORD):
+ # done if there is only one dcache operation
+ sync += self.state.eq(State.IDLE)
+ sync += ldst_r.eq(0)
+ with m.If(ldst_r.load):
+ m.d.comb += self.load_data.eq(d_in.data)
+ sync += self.load_data_delay[0:64].eq(d_in.data)
+ m.d.comb += self.done.eq(~mmureq) # done if not MMU
+ with m.Elif(ldst_r.alignstate == Misalign.WAITFIRST):
+ # first LD done: load data, initiate 2nd request.
+ # leave in ACK_WAIT state
+ with m.If(ldst_r.load):
+ m.d.comb += self.load_data[0:63].eq(d_in.data)
+ sync += self.load_data_delay[0:64].eq(d_in.data)
+ with m.Else():
+ m.d.sync += d_out.data.eq(self.store_data2)
+ # mmm kinda cheating, make a 2nd blip.
+ # use an aligned version of the address
+ m.d.comb += self.d_validblip.eq(1)
+ comb += self.req.eq(ldst_r) # from copy of request
+ comb += self.req.raddr.eq(self.next_addr)
+ comb += self.req.byte_sel.eq(ldst_r.byte_sel[8:])
+ comb += self.req.alignstate.eq(Misalign.WAITSECOND)
+ sync += ldst_r.raddr.eq(self.next_addr)
+ sync += ldst_r.byte_sel.eq(ldst_r.byte_sel[8:])
+ sync += ldst_r.alignstate.eq(Misalign.WAITSECOND)
+ sync += Display(" second req %x", self.req.raddr)
+ with m.Elif(ldst_r.alignstate == Misalign.WAITSECOND):
+ sync += Display(" done second %x", d_in.data)
+ # done second load
+ sync += self.state.eq(State.IDLE)
+ sync += ldst_r.eq(0)
+ with m.If(ldst_r.load):
+ m.d.comb += self.load_data[64:128].eq(d_in.data)
+ sync += self.load_data_delay[64:128].eq(d_in.data)
+ m.d.comb += self.done.eq(~mmureq) # done if not MMU
# waiting here for the MMU TLB lookup to complete.
# either re-try the dcache lookup or throw MMU exception
with m.Case(State.MMU_LOOKUP):
- comb += self.busy.eq(1)
+ comb += self.busy.eq(~exception)
with m.If(m_in.done):
- with m.If(~self.instr_fault):
+ with m.If(~self.r_instr_fault):
sync += Display("MMU_LOOKUP, done %x -> %x",
- self.addr, d_out.addr)
+ self.raddr, d_out.addr)
# retry the request now that the MMU has
# installed a TLB entry, if not exception raised
m.d.comb += self.d_out.valid.eq(~exception)
sync += self.state.eq(State.ACK_WAIT)
- sync += ldst_r.eq(0)
with m.Else():
- sync += Display("MMU_LOOKUP, exception %x", self.addr)
- # instruction lookup fault: store address in DAR
- comb += exc.happened.eq(1)
- sync += self.dar.eq(self.addr)
+ sync += self.state.eq(State.IDLE)
+ sync += self.r_instr_fault.eq(0)
+ comb += self.done.eq(1)
with m.If(m_in.err):
- # MMU RADIX exception thrown
+ # MMU RADIX exception thrown. XXX
+ # TODO: critical that the write here has to
+ # notify the MMU FSM of the change to dsisr
comb += exception.eq(1)
+ comb += self.done.eq(1)
+ sync += Display("MMU RADIX exception thrown")
sync += self.dsisr[63 - 33].eq(m_in.invalid)
- sync += self.dsisr[63 - 36].eq(m_in.perm_error)
- sync += self.dsisr[63 - 38].eq(self.load)
+ sync += self.dsisr[63 - 36].eq(m_in.perm_error) # noexec
+ sync += self.dsisr[63 - 38].eq(~ldst_r.load)
sync += self.dsisr[63 - 44].eq(m_in.badtree)
sync += self.dsisr[63 - 45].eq(m_in.rc_error)
+ sync += self.state.eq(State.IDLE)
+ # exception thrown, clear out instruction fault state
+ sync += self.r_instr_fault.eq(0)
- with m.Case(State.TLBIE_WAIT):
- pass
+ # MMU FSM communicating a request to update DSISR or DAR (OP_MTSPR)
+ with m.If(self.mmu_set_spr):
+ with m.If(self.mmu_set_dsisr):
+ sync += self.dsisr.eq(self.sprval_in)
+ with m.If(self.mmu_set_dar):
+ sync += self.dar.eq(self.sprval_in)
- # alignment error: store address in DAR
+ # hmmm, alignment occurs in set_rd_addr/set_wr_addr, note exception
with m.If(self.align_intr):
comb += exc.happened.eq(1)
- sync += self.dar.eq(self.addr)
+ # check for updating DAR
+ with m.If(exception):
+ sync += Display("exception %x", self.raddr)
+ # alignment error: store address in DAR
+ with m.If(self.align_intr):
+ sync += Display("alignment error: addr in DAR %x", self.raddr)
+ sync += self.dar.eq(self.raddr)
+ with m.Elif(~self.r_instr_fault):
+ sync += Display("not instr fault, addr in DAR %x", self.raddr)
+ sync += self.dar.eq(self.raddr)
+
+ # when done or exception, return to idle state
+ with m.If(self.done | exception):
+ sync += self.state.eq(State.IDLE)
+ comb += self.busy.eq(0)
# happened, alignment, instr_fault, invalid.
# note that all of these flow through - eventually to the TRAP
# pipeline, via PowerDecoder2.
+ comb += self.align_intr.eq(self.req.align_intr)
comb += exc.invalid.eq(m_in.invalid)
comb += exc.alignment.eq(self.align_intr)
- comb += exc.instr_fault.eq(self.instr_fault)
+ comb += exc.instr_fault.eq(self.r_instr_fault)
# badtree, perm_error, rc_error, segment_fault
comb += exc.badtree.eq(m_in.badtree)
comb += exc.perm_error.eq(m_in.perm_error)
comb += exc.rc_error.eq(m_in.rc_error)
comb += exc.segment_fault.eq(m_in.segerr)
+ # conditions for 0x400 trap need these in SRR1
+ with m.If(exception & ~exc.alignment & exc.instr_fault):
+ comb += exc.srr1[14].eq(exc.invalid) # 47-33
+ comb += exc.srr1[12].eq(exc.perm_error) # 47-35
+ comb += exc.srr1[3].eq(exc.badtree) # 47-44
+ comb += exc.srr1[2].eq(exc.rc_error) # 47-45
# TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
- comb += dbus.adr.eq(dcache.wb_out.adr)
- comb += dbus.dat_w.eq(dcache.wb_out.dat)
- comb += dbus.sel.eq(dcache.wb_out.sel)
- comb += dbus.cyc.eq(dcache.wb_out.cyc)
- comb += dbus.stb.eq(dcache.wb_out.stb)
- comb += dbus.we.eq(dcache.wb_out.we)
-
- comb += dcache.wb_in.dat.eq(dbus.dat_r)
- comb += dcache.wb_in.ack.eq(dbus.ack)
+ comb += dbus.adr.eq(dcache.bus.adr)
+ comb += dbus.dat_w.eq(dcache.bus.dat_w)
+ comb += dbus.sel.eq(dcache.bus.sel)
+ comb += dbus.cyc.eq(dcache.bus.cyc)
+ comb += dbus.stb.eq(dcache.bus.stb)
+ comb += dbus.we.eq(dcache.bus.we)
+
+ comb += dcache.bus.dat_r.eq(dbus.dat_r)
+ comb += dcache.bus.ack.eq(dbus.ack)
if hasattr(dbus, "stall"):
- comb += dcache.wb_in.stall.eq(dbus.stall)
+ comb += dcache.bus.stall.eq(dbus.stall)
- # update out d data when flag set
+ # update out d data when flag set, for first half (second done in FSM)
with m.If(self.d_w_valid):
m.d.sync += d_out.data.eq(self.store_data)
#with m.Else():
m.d.comb += self.d_out.valid.eq(~exc.happened)
m.d.comb += d_out.load.eq(self.req.load)
m.d.comb += d_out.byte_sel.eq(self.req.byte_sel)
- m.d.comb += self.addr.eq(self.req.addr)
+ m.d.comb += self.raddr.eq(self.req.raddr)
m.d.comb += d_out.nc.eq(self.req.nc)
m.d.comb += d_out.priv_mode.eq(self.req.priv_mode)
m.d.comb += d_out.virt_mode.eq(self.req.virt_mode)
- m.d.comb += self.align_intr.eq(self.req.align_intr)
+ m.d.comb += d_out.reserve.eq(self.req.reserve)
+ m.d.comb += d_out.atomic.eq(self.req.atomic)
+ m.d.comb += d_out.atomic_last.eq(self.req.atomic_last)
+ #m.d.comb += Display("validblip dcbz=%i addr=%x",
+ #self.req.dcbz,self.req.addr)
+ m.d.comb += d_out.dcbz.eq(self.req.dcbz)
with m.Else():
m.d.comb += d_out.load.eq(ldst_r.load)
m.d.comb += d_out.byte_sel.eq(ldst_r.byte_sel)
- m.d.comb += self.addr.eq(ldst_r.addr)
+ m.d.comb += self.raddr.eq(ldst_r.raddr)
m.d.comb += d_out.nc.eq(ldst_r.nc)
m.d.comb += d_out.priv_mode.eq(ldst_r.priv_mode)
m.d.comb += d_out.virt_mode.eq(ldst_r.virt_mode)
- m.d.comb += self.align_intr.eq(ldst_r.align_intr)
-
- # XXX these should be possible to remove but for some reason
- # cannot be... yet. TODO, investigate
- m.d.comb += self.load_data.eq(d_in.data)
- m.d.comb += d_out.addr.eq(self.addr)
+ m.d.comb += d_out.reserve.eq(ldst_r.reserve)
+ m.d.comb += d_out.atomic.eq(ldst_r.atomic)
+ m.d.comb += d_out.atomic_last.eq(ldst_r.atomic_last)
+ #m.d.comb += Display("no_validblip dcbz=%i addr=%x",
+ #ldst_r.dcbz,ldst_r.addr)
+ m.d.comb += d_out.dcbz.eq(ldst_r.dcbz)
+ m.d.comb += d_out.addr.eq(self.raddr)
# Update outputs to MMU
m.d.comb += m_out.valid.eq(mmureq)
m.d.comb += m_out.iside.eq(self.instr_fault)
m.d.comb += m_out.load.eq(ldst_r.load)
- # m_out.priv <= r.priv_mode; TODO
+ with m.If(self.instr_fault):
+ m.d.comb += m_out.priv.eq(self.priv_mode)
+ with m.Else():
+ m.d.comb += m_out.priv.eq(ldst_r.priv_mode)
m.d.comb += m_out.tlbie.eq(self.tlbie)
# m_out.mtspr <= mmu_mtspr; # TODO
# m_out.sprn <= sprn; # TODO
# LDSTCompUnit is unusual in that it's non-standard to RegSpecAPI
regspec = [('INT', 'o', '0:63'), # RT
('INT', 'o1', '0:63'), # RA (effective address, update mode)
- # TODO, later ('CR', 'cr_a', '0:3'),
+ ('CR', 'cr_a', '0:3'),
# TODO, later ('XER', 'xer_so', '32')
]
def __init__(self, pspec):
class LDSTPipeSpec(CommonPipeSpec):
- regspec = (LDSTInputData.regspec, LDSTOutputData.regspec)
+ regspecklses = (LDSTInputData, LDSTOutputData)
opsubsetkls = CompLDSTOpSubset
def elaborate(self, platform):
m = Module()
perm = Signal(self.width, reset_less=True)
- rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}") for i in range(64)]
- for i in range(64):
- m.d.comb += rb64[i].eq(self.rb[63-i])
+ rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}")
+ for i in range(self.width)]
+ for i in range(self.width):
+ m.d.comb += rb64[i].eq(self.rb[self.width-1-i])
rb64 = Array(rb64)
- for i in range(8):
+ for i in range(self.width//8):
index = self.rs[8*i:8*i+8]
idx = Signal(8, name=f"idx_{i}", reset_less=True)
m.d.comb += idx.eq(index)
- with m.If(idx < 64):
+ with m.If(idx < self.width):
m.d.comb += perm[i].eq(rb64[idx])
m.d.comb += self.ra[0:8].eq(perm)
return m
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = ALUInputStage(pspec)
a = Signal(64)
dut.i.b.eq(b),
a.eq(AnyConst(64)),
b.eq(AnyConst(64))]
-
+
comb += dut.i.ctx.op.eq(rec)
# Assert that op gets copied from the input to output
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
width = p.width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = LogicalMainStage(pspec)
# convenience variables
# setup random inputs
comb += [a.eq(AnyConst(64)),
b.eq(AnyConst(64)),
- #carry_in.eq(AnyConst(0b11)),
+ # carry_in.eq(AnyConst(0b11)),
]
comb += dut.i.ctx.op.eq(rec)
comb += a_signed_32.eq(a[0:32])
o_ok = Signal()
- comb += o_ok.eq(1) # will be set to zero if no op takes place
+ comb += o_ok.eq(1) # will be set to zero if no op takes place
# main assertion of arithmetic operations
with m.Switch(rec.insn_type):
comb += peo.eq(32)
with m.Else():
comb += peo.eq(pe32.o)
- with m.If(XO[-1]): # cnttzw
+ with m.If(XO[-1]): # cnttzw
comb += pe32.i.eq(a[0:32])
comb += Assert(o == peo)
- with m.Else(): # cntlzw
+ with m.Else(): # cntlzw
comb += pe32.i.eq(a[0:32][::-1])
comb += Assert(o == peo)
with m.Else():
comb += peo64.eq(64)
with m.Else():
comb += peo64.eq(pe64.o)
- with m.If(XO[-1]): # cnttzd
+ with m.If(XO[-1]): # cnttzd
comb += pe64.i.eq(a[0:64])
comb += Assert(o == peo64)
- with m.Else(): # cntlzd
+ with m.Else(): # cntlzd
comb += pe64.i.eq(a[0:64][::-1])
comb += Assert(o == peo64)
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
# to the output stage
# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+# Copyright (C) 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+
from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
from nmutil.pipemodbase import PipeModBase
from nmutil.clz import CLZ
from soc.fu.logical.bpermd import Bpermd
from soc.fu.logical.popcount import Popcount
from soc.fu.logical.pipe_data import LogicalOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
from openpower.decoder.power_fields import DecodeFields
return LogicalOutputData(self.pspec)
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
op, a, b, o = self.i.ctx.op, self.i.a, self.i.b, self.o.o
comb += o.ok.eq(1) # overridden if no op activates
- m.submodules.bpermd = bpermd = Bpermd(64)
- m.submodules.popcount = popcount = Popcount()
+ m.submodules.bpermd = bpermd = Bpermd(XLEN)
+ m.submodules.popcount = popcount = Popcount(XLEN)
##########################
# main switch for logic ops AND, OR and XOR, cmpb, parity, and popcount
par0 = Signal(reset_less=True)
par1 = Signal(reset_less=True)
comb += par0.eq(Cat(a[0], a[8], a[16], a[24]).xor())
- comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
+ if XLEN == 64:
+ comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
with m.If(op.data_len[3] == 1):
comb += o.data.eq(par0 ^ par1)
with m.Else():
comb += o[0].eq(par0)
- comb += o[32].eq(par1)
+ if XLEN == 64:
+ comb += o[32].eq(par1)
###################
###### cntlz v3.0B p99
count_right = Signal(reset_less=True)
comb += count_right.eq(XO[-1])
- cntz_i = Signal(64, reset_less=True)
+ cntz_i = Signal(XLEN, reset_less=True)
a32 = Signal(32, reset_less=True)
comb += a32.eq(a[0:32])
with m.Else():
comb += cntz_i.eq(Mux(count_right, a[::-1], a))
- m.submodules.clz = clz = CLZ(64)
+ m.submodules.clz = clz = CLZ(XLEN)
comb += clz.sig_in.eq(cntz_i)
comb += o.data.eq(Mux(op.is_32bit, clz.lz-32, clz.lz))
from soc.fu.common_output_stage import CommonOutputStage
from soc.fu.logical.pipe_data import (LogicalInputData, LogicalOutputData,
LogicalOutputDataFinal)
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
# input (and output) for logical initial stage (common input)
class LogicalInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB/immediate
- ('XER', 'xer_so', '32'), # bit0: so
- ]
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b = self.ra, self.rb
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), # bit0: so
+ ]
# input to logical final stage (common output)
class LogicalOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_so', '32'), # bit0: so
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_so', '32'), # bit0: so
+ ]
+
# output from logical final stage (common output) - note that XER.so
# is *not* included (the only reason it's in the input is because of CR0)
class LogicalOutputDataFinal(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ]
class LogicalPipeSpec(CommonPipeSpec):
- regspec = (LogicalInputData.regspec, LogicalOutputDataFinal.regspec)
+ regspecklses = (LogicalInputData, LogicalOutputDataFinal)
opsubsetkls = CompLogicalOpSubset
class LogicalStages1(PipeModBaseChain):
def get_chain(self):
inp = LogicalInputStage(self.pspec)
+ return [inp]
+
+class LogicalStages2(PipeModBaseChain):
+ def get_chain(self):
main = LogicalMainStage(self.pspec)
- return [inp, main]
+ return [main]
-class LogicalStages2(PipeModBaseChain):
+class LogicalStages3(PipeModBaseChain):
def get_chain(self):
out = LogicalOutputStage(self.pspec)
return [out]
self.pspec = pspec
self.pipe1 = LogicalStages1(pspec)
self.pipe2 = LogicalStages2(pspec)
- self._eqs = self.connect([self.pipe1, self.pipe2])
+ self.pipe3 = LogicalStages3(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
m.submodules.logical_pipe1 = self.pipe1
m.submodules.logical_pipe2 = self.pipe2
+ m.submodules.logical_pipe3 = self.pipe3
m.d.comb += self._eqs
return m
class Popcount(Elaboratable):
- def __init__(self):
- self.a = Signal(64, reset_less=True)
- self.b = Signal(64, reset_less=True)
+ def __init__(self, width=64):
+ self.width = width
+ self.a = Signal(width, reset_less=True)
+ self.b = Signal(width, reset_less=True)
self.data_len = Signal(4, reset_less=True) # data len up to... err.. 8?
- self.o = Signal(64, reset_less=True)
+ self.o = Signal(width, reset_less=True)
+ assert width in [32, 64], "only 32 or 64 bit supported for now"
def elaborate(self, platform):
m = Module()
# creating arrays big enough to store the sum, each time
pc = [a]
# QTY32 2-bit (to take 2x 1-bit sums) etc.
- work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+ work = [(16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+ if self.width == 64:
+ work = [(32, 2)] + work
for l, bw in work: # l=number of add-reductions, bw=bitwidth
pc.append(array_of(l, bw))
- pc8 = pc[3] # array of 8 8-bit counts (popcntb)
- pc32 = pc[5] # array of 2 32-bit counts (popcntw)
+ pc8 = pc[-4] # array of 8 8-bit counts (popcntb)
+ pc32 = pc[-2] # array of 2 32-bit counts (popcntw)
popcnt = pc[-1] # array of 1 64-bit count (popcntd)
# cascade-tree of adds
for idx, (l, bw) in enumerate(work):
# decode operation length (1-hot)
with m.If(data_len == 1):
# popcntb - pack 8x 4-bit answers into 8x 8-bit output fields
- for i in range(8):
+ for i in range(self.width//8):
comb += o[i*8:(i+1)*8].eq(pc8[i])
with m.Elif(data_len == 4):
- # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
- for i in range(2):
- comb += o[i*32:(i+1)*32].eq(pc32[i])
+ if self.width == 64:
+ # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
+ for i in range(2):
+ comb += o[i*32:(i+1)*32].eq(pc32[i])
+ else:
+ comb += o.eq(popcnt[0])
with m.Else():
# popcntd - put 1x 6-bit answer into 64-bit output
comb += o.eq(popcnt[0])
def set_alu_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
- print ("set alu inputs", inp)
+ print("set alu inputs", inp)
yield from ALUHelpers.set_int_ra(alu, dec2, inp)
yield from ALUHelpers.set_int_rb(alu, dec2, inp)
yield from ALUHelpers.set_xer_so(alu, dec2, inp)
class LogicalIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = LogicalPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
alu = LogicalBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("logical_pipeline.il", "w") as f:
f.write(vl)
-class TestRunner(FHDLTestCase):
- def __init__(self, test_data):
- super().__init__("run_all")
- self.test_data = test_data
+class TestRunner(unittest.TestCase):
- def execute(self, alu,instruction, pdecode2, test):
+ def execute(self, alu, instruction, pdecode2, test):
print(test.name)
program = test.program
self.subTest(test.name)
yield from set_alu_inputs(alu, pdecode2, simulator)
# set valid for one cycle, propagate through pipeline...
- yield alu.p.valid_i.eq(1)
+ yield alu.p.i_valid.eq(1)
yield
- yield alu.p.valid_i.eq(0)
+ yield alu.p.i_valid.eq(0)
opname = code.split(' ')[0]
yield from simulator.call(opname)
index = simulator.pc.CIA.value//4
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
while not vld:
yield
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
yield
yield from self.check_alu_outputs(alu, pdecode2,
simulator, code)
yield Settle()
- def run_all(self):
+ def test_it(self):
+ test_data = LogicalIlangCase().test_data + \
+ LogicalTestCase({'soc'}).test_data
m = Module()
comb = m.d.comb
instruction = Signal(32)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pspec = LogicalPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = LogicalBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
sim.add_clock(1e-6)
def process():
- for test in self.test_data:
+ for test in test_data:
print(test.name)
program = test.program
with self.subTest(test.name):
if __name__ == "__main__":
- unittest.main(exit=False)
- suite = unittest.TestSuite()
- suite.addTest(TestRunner(LogicalIlangCase().test_data))
- suite.addTest(TestRunner(LogicalTestCase().test_data))
-
- runner = unittest.TextTestRunner()
- runner.run(suite)
+ unittest.main()
from soc.experiment.mem_types import MMUToLoadStore1Type
from soc.fu.ldst.loadstore import LoadStore1, TestSRAMLoadStore1
+from nmutil.util import Display
class FSMMMUStage(ControlBase):
self.pspec = pspec
# set up p/n data
- self.p.data_i = MMUInputData(pspec)
- self.n.data_o = MMUOutputData(pspec)
+ self.p.i_data = MMUInputData(pspec)
+ self.n.o_data = MMUOutputData(pspec)
+ self.exc_o = self.n.o_data.exception # AllFunctionUnits needs this
self.mmu = MMU()
self.illegal = Signal()
# for SPR field number access
- i = self.p.data_i
+ i = self.p.i_data
self.fields = DecodeFields(SignalBitRange, [i.ctx.op.insn])
self.fields.create_specs()
# incoming PortInterface
self.ldst = ldst
self.dcache = self.ldst.dcache
+ self.icache = self.ldst.icache
self.pi = self.ldst.pi
def elaborate(self, platform):
assert hasattr(self, "dcache"), "remember to call set_ldst_interface"
m = super().elaborate(platform)
comb, sync = m.d.comb, m.d.sync
- dcache = self.dcache
+ dcache, icache = self.dcache, self.icache
+ ldst = self.ldst # managed externally: do not add here
- # link mmu and dcache together
+ # link mmu, dcache and icache together
m.submodules.mmu = mmu = self.mmu
- ldst = self.ldst # managed externally: do not add here
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+ m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
# link ldst and MMU together
comb += l_in.eq(ldst.m_out)
comb += ldst.m_in.eq(l_out)
- data_i, data_o = self.p.data_i, self.n.data_o
- a_i, b_i, o, spr1_o = data_i.ra, data_i.rb, data_o.o, data_o.spr1
- op = data_i.ctx.op
+ i_data, o_data = self.p.i_data, self.n.o_data
+ op = i_data.ctx.op
+ cia_i = op.cia
msr_i = op.msr
- spr1_i = data_i.spr1
-
- # these are set / got here *ON BEHALF* of LoadStore1
- dsisr, dar = ldst.dsisr, ldst.dar
+ a_i, b_i, spr1_i = i_data.ra, i_data.rb, i_data.spr1
+ o, exc_o, spr1_o = o_data.o, o_data.exception, o_data.spr1
# busy/done signals
- busy = Signal()
- done = Signal()
- m.d.comb += self.n.valid_o.eq(busy & done)
- m.d.comb += self.p.ready_o.eq(~busy)
+ busy = Signal(name="mmu_fsm_busy")
+ done = Signal(name="mmu_fsm_done")
+ m.d.comb += self.n.o_valid.eq(busy & done)
+ m.d.comb += self.p.o_ready.eq(~busy)
# take copy of X-Form SPR field
x_fields = self.fields.FormXFX
spr = Signal(len(x_fields.SPR))
comb += spr.eq(decode_spr_num(x_fields.SPR))
- # based on MSR bits, set priv and virt mode. TODO: 32-bit mode
- comb += d_in.priv_mode.eq(~msr_i[MSR.PR])
- comb += d_in.virt_mode.eq(msr_i[MSR.DR])
- #comb += d_in.mode_32bit.eq(msr_i[MSR.SF]) # ?? err
-
# ok so we have to "pulse" the MMU (or dcache) rather than
# hold the valid hi permanently. guess what this does...
valid = Signal()
m.d.comb += blip.eq(rising_edge(m, valid))
with m.If(~busy):
- with m.If(self.p.valid_i):
+ with m.If(self.p.i_valid):
sync += busy.eq(1)
with m.Else():
# enabled ("valid") and we twiddle our thumbs until it
# responds ("done").
- # FIXME: properly implement MicrOp.OP_MTSPR and MicrOp.OP_MFSPR
+ # WIP: properly implement MicrOp.OP_MTSPR and MicrOp.OP_MFSPR
with m.Switch(op.insn_type):
+
+ ##########
+ # OP_MTSPR
+ ##########
+
with m.Case(MicrOp.OP_MTSPR):
+ comb += Display("MMUTEST: OP_MTSPR: spr=%i", spr)
# despite redirection this FU **MUST** behave exactly
# like the SPR FU. this **INCLUDES** updating the SPR
# regfile because the CSV file entry for OP_MTSPR
with m.If(~spr[9] & ~spr[5]):
comb += self.debug0.eq(3)
#if matched update local cached value
+ #commented out because there is a driver conflict
+ comb += ldst.sprval_in.eq(a_i)
+ comb += ldst.mmu_set_spr.eq(1)
with m.If(spr[0]):
- sync += dsisr.eq(a_i[:32])
+ comb += ldst.mmu_set_dar.eq(1)
with m.Else():
- sync += dar.eq(a_i)
+ comb += ldst.mmu_set_dsisr.eq(1)
comb += done.eq(1)
# pass it over to the MMU instead
with m.Else():
+ # PGTBL and PID
comb += self.debug0.eq(4)
# blip the MMU and wait for it to complete
comb += valid.eq(1) # start "pulse"
comb += l_in.rs.eq(a_i) # incoming operand (RS)
comb += done.eq(1) # FIXME l_out.done
+ ##########
+ # OP_MFSPR
+ ##########
+
with m.Case(MicrOp.OP_MFSPR):
- # subset SPR: first check a few bits
- #with m.If(~spr[9] & ~spr[5]):
- # comb += self.debug0.eq(5)
- #with m.If(spr[0]):
- # comb += o.data.eq(dsisr)
- #with m.Else():
- # comb += o.data.eq(dar)
- #do NOT return cached values
- comb += o.data.eq(spr1_i)
+ comb += Display("MMUTEST: OP_MFSPR: spr=%i returns=%i",
+ spr, spr1_i)
+ # partial SPR number decoding perfectly fine
+ with m.If(spr[9] | spr[5]):
+ # identified as an MMU OP_MFSPR, contact the MMU.
+ # interestingly, the read is combinatorial: no need
+ # to set "valid", just set the SPR number
+ comb += l_in.sprn.eq(spr) # which SPR
+ comb += o.data.eq(l_out.sprval)
+ with m.Else():
+ # identified as DSISR or DAR. again: read the SPR
+ # directly, combinatorial access
+ with m.If(spr[0]):
+ comb += o.data.eq(ldst.dar)
+ with m.Else():
+ comb += o.data.eq(ldst.dsisr)
+
comb += o.ok.eq(1)
comb += done.eq(1)
- # pass it over to the MMU instead
- #with m.Else():
- # comb += self.debug0.eq(6)
- # # blip the MMU and wait for it to complete
- # comb += valid.eq(1) # start "pulse"
- # comb += l_in.valid.eq(blip) # start
- # comb += l_in.mtspr.eq(0) # mfspr!=mtspr
- # comb += l_in.sprn.eq(spr) # which SPR
- # comb += l_in.rs.eq(a_i) # incoming operand (RS)
- # comb += o.data.eq(l_out.sprval) # SPR from MMU
- # comb += o.ok.eq(l_out.done) # only when l_out valid
- # comb += done.eq(1) # FIXME l_out.done
-
- # XXX this one is going to have to go through LDSTCompUnit
- # because it's LDST that has control over dcache
- # (through PortInterface). or, another means is devised
- # so as not to have double-drivers of d_in.valid and addr
- #
- #with m.Case(MicrOp.OP_DCBZ):
- # # activate dcbz mode (spec: v3.0B p850)
- # comb += valid.eq(1) # start "pulse"
- # comb += d_in.valid.eq(blip) # start
- # comb += d_in.dcbz.eq(1) # dcbz mode
- # comb += d_in.addr.eq(a_i + b_i) # addr is (RA|0) + RB
- # comb += done.eq(d_out.store_done) # TODO
- # comb += self.debug0.eq(1)
+
+ ##########
+ # OP_TLBIE
+ ##########
with m.Case(MicrOp.OP_TLBIE):
+ comb += Display("MMUTEST: OP_TLBIE: insn_bits=%i", spr)
# pass TLBIE request to MMU (spec: v3.0B p1034)
# note that the spr is *not* an actual spr number, it's
# just that those bits happen to match with field bits
# RIC, PRS, R
+ comb += Display("TLBIE: %i %i", spr, l_out.done)
comb += valid.eq(1) # start "pulse"
comb += l_in.valid.eq(blip) # start
comb += l_in.tlbie.eq(1) # mtspr mode
comb += done.eq(l_out.done) # zzzz
comb += self.debug0.eq(2)
+ ##########
+ # OP_FETCH_FAILED
+ ##########
+
+ with m.Case(MicrOp.OP_FETCH_FAILED):
+ comb += Display("MMUTEST: OP_FETCH_FAILED: @%x", cia_i)
+ # trigger an instruction fetch failed MMU event.
+ # PowerDecoder2 drops svstate.pc into NIA for us
+ # really, this should be direct communication with the
+ # MMU, rather than going through LoadStore1. but, doing
+ # so allows for the opportunity to prevent LoadStore1
+ # from accepting any other LD/ST requests.
+ comb += valid.eq(1) # start "pulse"
+ comb += ldst.instr_fault.eq(blip)
+ comb += ldst.priv_mode.eq(~msr_i[MSR.PR])
+ comb += ldst.maddr.eq(cia_i)
+ # XXX should not access this!
+ comb += done.eq(ldst.done)
+ comb += self.debug0.eq(3)
+ # LDST unit contains exception data, which (messily)
+ # is copied over, here. not ideal but it will do for now
+ comb += exc_o.eq(ldst.pi.exc_o)
+
+ ############
+ # OP_ILLEGAL
+ ############
+
with m.Case(MicrOp.OP_ILLEGAL):
comb += self.illegal.eq(1)
- with m.If(self.n.ready_i & self.n.valid_o):
+ with m.If(self.n.i_ready & self.n.o_valid):
sync += busy.eq(0)
return m
layout = (('insn_type', MicrOp),
('fn_unit', Function),
('insn', 32),
- ('msr', 64), # TODO: a lot less bits. only need PR, DR, SF
+ ('cia', 64), # for instruction fault (MMU PTE lookup)
+ ('msr', 64), # ditto, to set priv_mode etc.
('zero_a', 1),
)
super().__init__(layout, name=name)
from soc.fu.pipe_data import FUBaseData
from soc.fu.mmu.mmu_input_record import CompMMUOpSubset
from soc.fu.alu.pipe_data import CommonPipeSpec
+from openpower.exceptions import LDSTException
class MMUInputData(FUBaseData):
('SPR', 'spr1', '0:63'), # MMU (slow)
]
def __init__(self, pspec):
- super().__init__(pspec, True)
+ super().__init__(pspec, True, LDSTException)
class MMUPipeSpec(CommonPipeSpec):
- regspec = (MMUInputData.regspec, MMUOutputData.regspec)
+ regspecklses = (MMUInputData, MMUOutputData)
opsubsetkls = CompMMUOpSubset
# libre-soc has own SPR unit
# other instructions here -> must be load/store
- def case_mmu_ldst(self):
+ def cse_dcbz(self):
lst = [
"dcbz 1,2",
+ ]
+
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x2
+ initial_regs[2] = 0x2020
+
+ self.add_case(Program(lst, bigendian),
+ initial_regs, initial_mem={})
+
+ def case_mmu_dar(self):
+ lst = [
+ "mfspr 1, 720", # DAR to reg 1
+ "mtspr 19, 3", # reg 3 to DAR
+ ]
+
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x2
+ initial_regs[3] = 0x5
+
+ initial_sprs = {'DAR': 0x87654321,
+ }
+ self.add_case(Program(lst, bigendian),
+ initial_regs, initial_sprs, initial_mem={})
+
+ def case_mmu_ldst(self):
+ lst = [
+ "dcbz 1,0",
"tlbie 0,0,0,0,0", # RB,RS,RIC,PRS,R
"mtspr 18, 1", # reg 1 to DSISR
"mtspr 19, 2", # reg 2 to DAR
- "mfspr 1, 18", # DSISR to reg 1
- "mfspr 2, 19", # DAR to reg 2
+ "mfspr 5, 18", # DSISR to reg 5
+ "mfspr 6, 19", # DAR to reg 6
"mtspr 48, 3", # set MMU PID
"mtspr 720, 4", # set MMU PRTBL
- "lhz 3, 0(1)" # load some data
+ "lhz 3, 0(1)", # load some data
+ "addi 7, 0, 1"
]
initial_regs = [0] * 32
- initial_regs[3] = 1
+ initial_regs[1] = 0x2
+ initial_regs[2] = 0x2020
+ initial_regs[3] = 5
initial_regs[4] = 0xDEADBEEF
- #initial_regs[1] = 0xDEADBEEF
- #FIXME initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
- initial_sprs = {}
+ initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321,
+ 'PIDR': 0xabcd, 'PRTBL': 0x0def}
self.add_case(Program(lst, bigendian),
- initial_regs, initial_sprs)
+ initial_regs, initial_sprs, initial_mem={})
if __name__ == "__main__":
+ mem = {}
unittest.main(exit=False)
suite = unittest.TestSuite()
- suite.addTest(TestRunner(MMUTestCase().test_data,microwatt_mmu=True))
+ suite.addTest(TestRunner(MMUTestCase().test_data,
+ microwatt_mmu=True,
+ svp64=False,
+ rom=mem))
runner = unittest.TextTestRunner()
runner.run(suite)
debughang = 2
+
class MMUTestCase(TestAccumulatorBase):
# MMU handles MTSPR, MFSPR, DCBZ and TLBIE.
# other instructions here -> must be load/store
def case_mfspr_after_invalid_load(self):
- lst = [ # TODO -- set SPR on both sinulator and port interface
- "mfspr 1, 18", # DSISR to reg 1
- "mfspr 2, 19", # DAR to reg 2
- # TODO -- verify returned sprvals
- ]
+ lst = [ # TODO -- set SPR on both sinulator and port interface
+ "mfspr 1, 18", # DSISR to reg 1
+ "mfspr 2, 19", # DAR to reg 2
+ # TODO -- verify returned sprvals
+ ]
initial_regs = [0] * 32
- #THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
+ # THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
initial_sprs = {}
self.add_case(Program(lst, bigendian),
initial_regs, initial_sprs)
- #def case_ilang(self):
- # pspec = SPRPipeSpec(id_wid=2)
+ # def case_ilang(self):
+ # pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
# alu = SPRBasePipe(pspec)
# vl = rtlil.convert(alu, ports=alu.ports())
# with open("trap_pipeline.il", "w") as f:
fsm = core.fus.fus["mmu0"].alu
- vld = yield fsm.n.valid_o
+ vld = yield fsm.n.o_valid
while not vld:
yield
- if debughang: print("not valid -- hang")
- vld = yield fsm.n.valid_o
- if debughang==2: vld=1
+ if debughang:
+ print("not valid -- hang")
+ vld = yield fsm.n.o_valid
+ if debughang == 2:
+ vld = 1
yield
def run_all(self):
reg_wid=64)
m.submodules.core = core = NonProductionCore(pspec
- # XXX NO absolutely do not do this.
- # all options must go into the pspec
- #, microwatt_mmu=True
- )
+ # XXX NO absolutely do not do this.
+ # all options must go into the pspec
+ # , microwatt_mmu=True
+ )
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
traces=[]):
sim.run()
+
if __name__ == "__main__":
unittest.main(exit=False)
suite = unittest.TestSuite()
debughang = 1
+
def set_fsm_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
print("Error here")
inp = yield from get_cu_inputs(dec2, sim)
# yield from ALUHelpers.set_spr_spr1(alu, dec2, inp)
overflow = None
- a=None
- b=None
+ a = None
+ b = None
# TODO
if 'xer_so' in inp:
print("xer_so::::::::::::::::::::::::::::::::::::::::::::::::")
so = inp['xer_so']
print(so)
overflow = pia.OverflowFlags(so=bool(so),
- ov=False,
- ov32=False)
+ ov=False,
+ ov32=False)
if 'ra' in inp:
a = inp['ra']
if 'rb' in inp:
def check_fsm_outputs(fsm, pdecode2, sim, code):
# check that MMUOutputData is correct
- return None #TODO
+ return None # TODO
+
+# incomplete test - connect fsm inputs first
+
-#incomplete test - connect fsm inputs first
class MMUIlangCase(TestAccumulatorBase):
- #def case_ilang(self):
- # pspec = SPRPipeSpec(id_wid=2)
+ # def case_ilang(self):
+ # pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
# alu = SPRBasePipe(pspec)
# vl = rtlil.convert(alu, ports=alu.ports())
# with open("trap_pipeline.il", "w") as f:
def __init__(self, test_data):
super().__init__("run_all")
self.test_data = test_data
+ # hack here -- all unit tests are affected
+ self.run_all()
def check_fsm_outputs(self, alu, dec2, sim, code, pia_res):
sim_o = {}
res = {}
- #MMUOutputData does not have xer
+ # MMUOutputData does not have xer
yield from ALUHelpers.get_cr_a(res, alu, dec2)
- #yield from ALUHelpers.get_xer_ov(res, alu, dec2)
+ # yield from ALUHelpers.get_xer_ov(res, alu, dec2)
yield from ALUHelpers.get_int_o(res, alu, dec2)
- #yield from ALUHelpers.get_xer_so(res, alu, dec2)
-
+ # yield from ALUHelpers.get_xer_so(res, alu, dec2)
print("res output", res)
yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
- #yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
- #yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
+ # yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
+ # yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
print("sim output", sim_o)
print("power-instruction-analyzer result:")
print(pia_res)
- #if pia_res is not None:
+ # if pia_res is not None:
# with self.subTest(check="pia", sim_o=sim_o, pia_res=str(pia_res)):
# pia_o = pia_res_to_output(pia_res)
# ALUHelpers.check_int_o(self, res, pia_o, code)
# #ALUHelpers.check_xer_so(self, res, pia_o, code)
with self.subTest(check="sim", sim_o=sim_o, pia_res=str(pia_res)):
- #ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
+ # ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
ALUHelpers.check_cr_a(self, res, sim_o, code)
#ALUHelpers.check_xer_ov(self, res, sim_o, code)
#ALUHelpers.check_xer_so(self, res, sim_o, code)
- #oe = yield dec2.e.do.oe.oe
- #oe_ok = yield dec2.e.do.oe.ok
+ # oe = yield dec2.e.do.oe.oe
+ # oe_ok = yield dec2.e.do.oe.ok
#print("oe, oe_ok", oe, oe_ok)
- #if not oe or not oe_ok:
+ # if not oe or not oe_ok:
# # if OE not enabled, XER SO and OV must not be activated
- # so_ok = yield alu.n.data_o.xer_so.ok
- # ov_ok = yield alu.n.data_o.xer_ov.ok
+ # so_ok = yield alu.n.o_data.xer_so.ok
+ # ov_ok = yield alu.n.o_data.xer_ov.ok
# print("so, ov", so_ok, ov_ok)
# self.assertEqual(ov_ok, False, code)
# self.assertEqual(so_ok, False, code)
print("dec2 spr/fast in", fast_out, spr_out)
fn_unit = yield pdecode2.e.do.fn_unit
- #FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
+ # FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
pia_res = yield from set_fsm_inputs(fsm, pdecode2, sim)
yield
opname = code.split(' ')[0]
index = pc//4
print("pc after %08x" % (pc))
- vld = yield fsm.n.valid_o #fsm
+ vld = yield fsm.n.o_valid # fsm
while not vld:
yield
if debughang:
print("not valid -- hang")
return
- vld = yield fsm.n.valid_o
- if debughang==2: vld=1
+ vld = yield fsm.n.o_valid
+ if debughang == 2:
+ vld = 1
yield
yield from self.check_fsm_outputs(fsm, pdecode2, sim, code, pia_res)
comb = m.d.comb
instruction = Signal(32)
- pspec = TestMemPspec(addr_wid=48,
+ pspec = TestMemPspec(addr_wid=64,
mask_wid=8,
reg_wid=64,
)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pipe_spec = MMUPipeSpec(id_wid=2)
+ pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
ldst = LoadStore1(pspec)
fsm = FSMMMUStage(pipe_spec)
fsm.set_ldst_interface(ldst)
m.submodules.fsm = fsm
m.submodules.ldst = ldst
- #FIXME connect fsm inputs
+ # FIXME connect fsm inputs
- comb += fsm.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += fsm.p.valid_i.eq(1)
- comb += fsm.n.ready_i.eq(1)
+ comb += fsm.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += fsm.p.i_valid.eq(1)
+ comb += fsm.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
traces=[]):
sim.run()
+
if __name__ == "__main__":
unittest.main(exit=False)
suite = unittest.TestSuite()
# set up the mul stages. do not add them to m.submodules, this
# is handled by StageChain.setup().
- pspec = MulPipeSpec(id_wid=2)
+ pspec = MulPipeSpec(id_wid=2, parent_pspec=None)
pipe1 = MulMainStage1(pspec)
pipe2 = MulMainStage2(pspec)
pipe3 = MulMainStage3(pspec)
- class Dummy: pass
- dut = Dummy() # make a class into which dut.i and dut.o can be dropped
+ class Dummy:
+ pass
+ dut = Dummy() # make a class into which dut.i and dut.o can be dropped
dut.i = pipe1.ispec()
- chain = [pipe1, pipe2, pipe3] # chain of 3 mul stages
+ chain = [pipe1, pipe2, pipe3] # chain of 3 mul stages
- StageChain(chain).setup(m, dut.i) # input linked here, through chain
- dut.o = chain[-1].o # output is the last thing in the chain...
+ StageChain(chain).setup(m, dut.i) # input linked here, through chain
+ dut.o = chain[-1].o # output is the last thing in the chain...
# convenience variables
a = dut.i.ra
# setup random inputs
comb += [a.eq(AnyConst(64)),
b.eq(AnyConst(64)),
- ]
+ ]
comb += dut.i.ctx.op.eq(rec)
###### HI-32 #####
with m.Case(MicrOp.OP_MUL_H32):
- comb += Assume(rec.is_32bit) # OP_MUL_H32 is a 32-bit op
+ comb += Assume(rec.is_32bit) # OP_MUL_H32 is a 32-bit op
exp_prod = Signal(64)
expected_o = Signal.like(exp_prod)
# differ, we negate the product. This implies that
# the product is calculated from the absolute values
# of the inputs.
- prod = Signal.like(exp_prod) # intermediate product
+ prod = Signal.like(exp_prod) # intermediate product
comb += prod.eq(abs32_a * abs32_b)
comb += exp_prod.eq(Mux(ab32_sne, -prod, prod))
comb += expected_o.eq(Repl(exp_prod[32:64], 2))
# differ, we negate the product. This implies that
# the product is calculated from the absolute values
# of the inputs.
- prod = Signal.like(exp_prod) # intermediate product
+ prod = Signal.like(exp_prod) # intermediate product
comb += prod.eq(abs64_a * abs64_b)
comb += exp_prod.eq(Mux(ab64_sne, -prod, prod))
comb += Assert(o[0:64] == exp_prod[64:128])
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
from nmigen import Module
from nmutil.pipemodbase import PipeModBase
from soc.fu.mul.pipe_data import MulIntermediateData, MulOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
class MulMainStage2(PipeModBase):
class MulOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:128'),
- ('XER', 'xer_so', '32')] # XER bit 32: SO
def __init__(self, pspec):
super().__init__(pspec, False) # still input style
self.data.append(self.neg_res)
self.data.append(self.neg_res32)
+ @property
+ def regspec(self):
+ return [('INT', 'o', "0:%d" % (self.pspec.XLEN*2)), # 2xXLEN
+ ('XER', 'xer_so', '32')] # XER bit 32: SO
+
class MulPipeSpec(CommonPipeSpec):
- regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+ regspecklses = (DivInputData, DivMulOutputData)
opsubsetkls = CompMULOpSubset
from nmutil.pipemodbase import PipeModBase
from soc.fu.div.pipe_data import DivMulOutputData
from soc.fu.mul.pipe_data import MulOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from openpower.decoder.power_enums import MicrOp
from nmutil.pipemodbase import PipeModBase
from soc.fu.div.pipe_data import DivInputData
from soc.fu.mul.pipe_data import MulIntermediateData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
from nmutil.util import eq32
class MulMainStage1(PipeModBase):
return MulIntermediateData(self.pspec) # pipeline stage output format
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
comb += is_32bit.eq(op.is_32bit)
# work out if a/b are negative (check 32-bit / signed)
- comb += sign_a.eq(Mux(op.is_32bit, a[31], a[63]) & op.is_signed)
- comb += sign_b.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+ comb += sign_a.eq(Mux(op.is_32bit, a[31], a[XLEN-1]) & op.is_signed)
+ comb += sign_b.eq(Mux(op.is_32bit, b[31], b[XLEN-1]) & op.is_signed)
comb += sign32_a.eq(a[31] & op.is_signed)
comb += sign32_b.eq(b[31] & op.is_signed)
# negation of a 64-bit value produces the same lower 32-bit
# result as negation of just the lower 32-bits, so we don't
# need to do anything special before negating
- abs_a = Signal(64, reset_less=True)
- abs_b = Signal(64, reset_less=True)
+ abs_a = Signal(XLEN, reset_less=True)
+ abs_b = Signal(XLEN, reset_less=True)
comb += abs_a.eq(Mux(sign_a, -a, a))
comb += abs_b.eq(Mux(sign_b, -b, b))
def set_alu_inputs(alu, dec2, sim, has_third_input):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
print("set alu inputs", inp)
overflow = pia.OverflowFlags(so=bool(so),
ov=False,
ov32=False)
+ immediate_ok = yield dec2.e.do.imm_data.ok
+ if immediate_ok:
+ immediate = yield dec2.e.do.imm_data.data
+ else:
+ immediate = None
rc = inp["rc"] if has_third_input else None
return pia.InstructionInput(ra=inp.get("ra"), rb=inp.get("rb"),
+ immediate=immediate,
rc=rc, overflow=overflow)
has_third_input)
# set valid for one cycle, propagate through pipeline...
- yield alu.p.valid_i.eq(1)
+ yield alu.p.i_valid.eq(1)
yield
- yield alu.p.valid_i.eq(0)
+ yield alu.p.i_valid.eq(0)
opname = code.split(' ')[0]
fnname = opname.replace(".", "_")
print(f"{fnname}({pia_inputs})")
- pia_res = None
- try:
- pia_res = getattr(pia, fnname)(pia_inputs)
- except AttributeError:
- EXPECTED_FAILURES = ["mulli"]
- if fnname not in EXPECTED_FAILURES:
- raise
- else:
- print("not implemented, as expected.")
+ pia_res = getattr(pia, fnname)(pia_inputs)
print(f"-> {pia_res}")
yield from isa_sim.call(opname)
index = isa_sim.pc.CIA.value//4
# ...wait for valid to pop out the end
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
while not vld:
yield
yield Delay(0.1e-6)
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
yield Delay(0.1e-6)
# XXX sim._engine is an internal variable
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = MulPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = MulBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
oe_ok = yield dec2.e.do.oe.ok
if not oe or not oe_ok:
# if OE not enabled, XER SO and OV must correspondingly be false
- so_ok = yield alu.n.data_o.xer_so.ok
- ov_ok = yield alu.n.data_o.xer_ov.ok
+ so_ok = yield alu.n.o_data.xer_so.ok
+ ov_ok = yield alu.n.o_data.xer_ov.ok
self.assertEqual(so_ok, False, code)
self.assertEqual(ov_ok, False, code)
import unittest
from soc.fu.mul.test.helper import MulTestHelper
from openpower.test.mul.long_mul_cases import (MulTestCases2Arg,
- MulTestCases3Arg)
+ MulTestCases3Arg,
+ MUL_3_ARG_TEST_VALUES)
class TestPipeLong(MulTestHelper):
def test_mul_pipe_2_arg(self):
- self.run_all(MulTestCases2Arg().test_data, "mul_pipe_caller_long_2_arg",
- has_third_input=False)
+ self.run_all(MulTestCases2Arg({'soc'}).test_data,
+ "mul_pipe_caller_long_2_arg", has_third_input=False)
def helper_3_arg(self, subtest_index):
- self.run_all(MulTestCases3Arg(subtest_index).test_data,
+ self.run_all(MulTestCases3Arg(subtest_index, {'soc'}).test_data,
f"mul_pipe_caller_long_3_arg_{subtest_index}",
has_third_input=True)
class TestPipeIlang(unittest.TestCase):
def write_ilang(self):
- pspec = MulPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
alu = MulBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("mul_pipeline.il", "w") as f:
"""
def __init__(self, pspec, output, exc_kls=None):
- self.ctx = PipeContext(pspec) # context for ReservationStation usage
+ self.pspec = pspec
+ self.ctx = PipeContext(pspec) # context for ReservationStation usage
self.muxid = self.ctx.muxid
self.data = []
self.is_output = output
# take regspec and create data attributes (in or out)
# TODO: use widspec to create reduced bit mapping.
+ print (self.regspec)
for i, (regfile, regname, widspec) in enumerate(self.regspec):
wid = get_regspec_bitwidth([self.regspec], 0, i)
if output:
if hasattr(self, "exception"):
yield from self.exception.ports()
+ # convenience function to return 0:63 if XLEN=64, 0:31 if XLEN=32 etc.
+ @property
+ def intrange(self):
+ return "0:%d" % (self.pspec.XLEN-1)
+
def eq(self, i):
eqs = [self.ctx.eq(i.ctx)]
assert len(self.data) == len(i.data), \
- "length of %s mismatch against %s: %s %s" % \
- (repr(self), repr(i), repr(self.data), repr(i.data))
+ "length of %s mismatch against %s: %s %s" % \
+ (repr(self), repr(i), repr(self.data), repr(i.data))
for j in range(len(self.data)):
assert type(self.data[j]) == type(i.data[j]), \
- "type mismatch in FUBaseData %s %s" % \
- (repr(self.data[j]), repr(i.data[j]))
+ "type mismatch in FUBaseData %s %s" % \
+ (repr(self.data[j]), repr(i.data[j]))
eqs.append(self.data[j].eq(i.data[j]))
if hasattr(self, "exception"):
eqs.append(self.exception.eq(i.exception))
return eqs
def ports(self):
- return self.ctx.ports() # TODO: include self.data
+ return self.ctx.ports() # TODO: include self.data
# hmmm there has to be a better way than this
"""CommonPipeSpec: base class for all pipeline specifications
see README.md for explanation of members.
"""
- def __init__(self, id_wid):
+
+ def __init__(self, id_wid, parent_pspec):
self.pipekls = SimpleHandshakeRedir
self.id_wid = id_wid
self.opkls = lambda _: self.opsubsetkls()
- self.op_wid = get_rec_width(self.opkls(None)) # hmm..
+ self.op_wid = get_rec_width(self.opkls(None)) # hmm..
self.stage = None
+ self.parent_pspec = parent_pspec
+
+ # forward attributes from parent_pspec
+ def __getattr__(self, name):
+ return getattr(self.parent_pspec, name)
+
+
+def get_pspec_draft_bitmanip(pspec):
+ """ True if the draft bitmanip instructions are enabled in the provided
+ pspec. The instructions enabled by this are draft instructions -- they are
+ not official OpenPower instructions, they are intended to be eventually
+ submitted to the OpenPower ISA WG.
+
+ https://libre-soc.org/openpower/sv/bitmanip/
+ """
+ # use `is True` to account for Mock absurdities
+ return getattr(pspec, "draft_bitmanip", False) is True
class RegSpec:
def __init__(self, rwid, n_src=None, n_dst=None, name=None):
self._rwid = rwid
+ print ("RegSpec", rwid)
if isinstance(rwid, int):
# rwid: integer (covers all registers)
self._n_src, self._n_dst = n_src, n_dst
"""
self.rwid = rwid
+ def get_io_spec(self, direction, i):
+ if direction: # input (read specs)
+ return self.get_in_spec(i)
+ return self.get_out_spec(i)
+
def get_in_spec(self, i):
return self.rwid[0][i]
if isinstance(self.rwid, int): # old - testing - API (rwid is int)
return self.alu.out[i]
# regspec-based API: look up variable through regspec thru row number
- return getattr(self.alu.n.data_o, self.get_out_name(i))
+ return getattr(self.alu.n.o_data, self.get_out_name(i))
def get_in(self, i):
if isinstance(self.rwid, int): # old - testing - API (rwid is int)
return self.alu.i[i]
# regspec-based API: look up variable through regspec thru row number
- return getattr(self.alu.p.data_i, self.get_in_name(i))
+ return getattr(self.alu.p.i_data, self.get_in_name(i))
def get_op(self):
if isinstance(self.rwid, int): # old - testing - API (rwid is int)
return self.alu.op
- return self.alu.p.data_i.ctx.op
+ return self.alu.p.i_data.ctx.op
-# Proof of correctness for partitioned equal signal combiner
+# Proof of correctness for shift/rotate FU
# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
"""
Links:
* https://bugs.libre-soc.org/show_bug.cgi?id=340
+
+run tests with:
+pip install pytest
+pip install pytest-xdist
+pytest -n auto src/soc/fu/shift_rot/formal/proof_main_stage.py
+because that tells pytest to run the tests in parallel, it will take a few
+minutes instead of an hour.
"""
+import unittest
+import enum
from nmigen import (Module, Signal, Elaboratable, Mux, Cat, Repl,
- signed)
-from nmigen.asserts import Assert, AnyConst, Assume, Cover
+ signed, Const, unsigned)
+from nmigen.asserts import Assert, AnyConst, Assume
from nmutil.formaltest import FHDLTestCase
-from nmigen.cli import rtlil
+from nmutil.sim_util import do_sim
+from nmigen.sim import Delay
from soc.fu.shift_rot.main_stage import ShiftRotMainStage
-from soc.fu.shift_rot.rotator import right_mask, left_mask
from soc.fu.shift_rot.pipe_data import ShiftRotPipeSpec
-from soc.fu.shift_rot.sr_input_record import CompSROpSubset
from openpower.decoder.power_enums import MicrOp
-from openpower.consts import field
-import unittest
-from nmutil.extend import exts
+
+@enum.unique
+class TstOp(enum.Enum):
+ """ops we're testing, the idea is if we run a separate formal proof for
+ each instruction, we end up covering them all and each runs much faster,
+ also the formal proofs can be run in parallel."""
+ SHL = MicrOp.OP_SHL
+ SHR = MicrOp.OP_SHR
+ RLC32 = MicrOp.OP_RLC, 32
+ RLC64 = MicrOp.OP_RLC, 64
+ RLCL = MicrOp.OP_RLCL
+ RLCR = MicrOp.OP_RLCR
+ EXTSWSLI = MicrOp.OP_EXTSWSLI
+ TERNLOG = MicrOp.OP_TERNLOG
+ # grev removed -- leaving code for later use in grevlut
+ # GREV32 = MicrOp.OP_GREV, 32
+ # GREV64 = MicrOp.OP_GREV, 64
+
+ @property
+ def op(self):
+ if isinstance(self.value, tuple):
+ return self.value[0]
+ return self.value
+
+
+def eq_any_const(sig: Signal):
+ return sig.eq(AnyConst(sig.shape(), src_loc_at=1))
+
+
+class Mask(Elaboratable):
+ # copied from qemu's mask fn:
+ # https://gitlab.com/qemu-project/qemu/-/blob/477c3b934a47adf7de285863f59d6e4503dd1a6d/target/ppc/internal.h#L21
+ def __init__(self):
+ self.start = Signal(6)
+ self.end = Signal(6)
+ self.out = Signal(64)
+
+ def elaborate(self, platform):
+ m = Module()
+ max_val = Const(~0, unsigned(64))
+ max_bit = 63
+ with m.If(self.start == 0):
+ m.d.comb += self.out.eq(max_val << (max_bit - self.end))
+ with m.Elif(self.end == max_bit):
+ m.d.comb += self.out.eq(max_val >> self.start)
+ with m.Else():
+ ret = (max_val >> self.start) ^ ((max_val >> self.end) >> 1)
+ m.d.comb += self.out.eq(Mux(self.start > self.end, ~ret, ret))
+ return m
+
+
+class TstMask(unittest.TestCase):
+ def test_mask(self):
+ dut = Mask()
+
+ def case(start, end, expected):
+ with self.subTest(start=start, end=end):
+ yield dut.start.eq(start)
+ yield dut.end.eq(end)
+ yield Delay(1e-6)
+ out = yield dut.out
+ with self.subTest(out=hex(out), expected=hex(expected)):
+ self.assertEqual(expected, out)
+
+ def process():
+ for start in range(64):
+ for end in range(64):
+ expected = 0
+ if start > end:
+ for i in range(start, 64):
+ expected |= 1 << (63 - i)
+ for i in range(0, end + 1):
+ expected |= 1 << (63 - i)
+ else:
+ for i in range(start, end + 1):
+ expected |= 1 << (63 - i)
+ yield from case(start, end, expected)
+ with do_sim(self, dut, [dut.start, dut.end, dut.out]) as sim:
+ sim.add_process(process)
+ sim.run()
+
+
+def rotl64(v, amt):
+ v |= Const(0, 64) # convert to value at least 64-bits wide
+ amt |= Const(0, 6) # convert to value at least 6-bits wide
+ return (Cat(v[:64], v[:64]) >> (64 - amt[:6]))[:64]
+
+
+def rotl32(v, amt):
+ v |= Const(0, 32) # convert to value at least 32-bits wide
+ return rotl64(Cat(v[:32], v[:32]), amt)
# This defines a module to drive the device under test and assert
# properties about its outputs
class Driver(Elaboratable):
- def __init__(self):
- # inputs and outputs
- pass
+ def __init__(self, which):
+ assert isinstance(which, TstOp) or which is None
+ self.which = which
def elaborate(self, platform):
m = Module()
comb = m.d.comb
- rec = CompSROpSubset()
- # Setup random inputs for dut.op. do them explicitly so that
- # we can see which ones cause failures in the debug report
- #for p in rec.ports():
- # comb += p.eq(AnyConst(p.width))
- comb += rec.insn_type.eq(AnyConst(rec.insn_type.width))
- comb += rec.fn_unit.eq(AnyConst(rec.fn_unit.width))
- comb += rec.imm_data.imm.eq(AnyConst(rec.imm_data.imm.width))
- comb += rec.imm_data.imm_ok.eq(AnyConst(rec.imm_data.imm_ok.width))
- comb += rec.rc.rc.eq(AnyConst(rec.rc.rc.width))
- comb += rec.rc.rc_ok.eq(AnyConst(rec.rc.rc_ok.width))
- comb += rec.oe.oe.eq(AnyConst(rec.oe.oe.width))
- comb += rec.oe.oe_ok.eq(AnyConst(rec.oe.oe_ok.width))
- comb += rec.write_cr0.eq(AnyConst(rec.write_cr0.width))
- comb += rec.input_carry.eq(AnyConst(rec.input_carry.width))
- comb += rec.output_carry.eq(AnyConst(rec.output_carry.width))
- comb += rec.input_cr.eq(AnyConst(rec.input_cr.width))
- comb += rec.is_32bit.eq(AnyConst(rec.is_32bit.width))
- comb += rec.is_signed.eq(AnyConst(rec.is_signed.width))
- comb += rec.insn.eq(AnyConst(rec.insn.width))
-
-
- pspec = ShiftRotPipeSpec(id_wid=2)
+ pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=None)
+ pspec.draft_bitmanip = True
m.submodules.dut = dut = ShiftRotMainStage(pspec)
- # convenience variables
- rs = dut.i.rs # register to shift
- b = dut.i.rb # register containing amount to shift by
- ra = dut.i.a # source register if masking is to be done
- carry_in = dut.i.xer_ca[0]
- carry_in32 = dut.i.xer_ca[1]
- carry_out = dut.o.xer_ca
- o = dut.o.o.data
- print ("fields", rec.fields)
- itype = rec.insn_type
-
- # instruction fields
- m_fields = dut.fields.FormM
- md_fields = dut.fields.FormMD
-
- # setup random inputs
- comb += rs.eq(AnyConst(64))
- comb += ra.eq(AnyConst(64))
- comb += b.eq(AnyConst(64))
- comb += carry_in.eq(AnyConst(1))
- comb += carry_in32.eq(AnyConst(1))
-
- # copy operation
- comb += dut.i.ctx.op.eq(rec)
+ # Set inputs to formal variables
+ comb += [
+ eq_any_const(dut.i.ctx.op.insn_type),
+ eq_any_const(dut.i.ctx.op.fn_unit),
+ eq_any_const(dut.i.ctx.op.imm_data.data),
+ eq_any_const(dut.i.ctx.op.imm_data.ok),
+ eq_any_const(dut.i.ctx.op.rc.rc),
+ eq_any_const(dut.i.ctx.op.rc.ok),
+ eq_any_const(dut.i.ctx.op.oe.oe),
+ eq_any_const(dut.i.ctx.op.oe.ok),
+ eq_any_const(dut.i.ctx.op.write_cr0),
+ eq_any_const(dut.i.ctx.op.input_carry),
+ eq_any_const(dut.i.ctx.op.output_carry),
+ eq_any_const(dut.i.ctx.op.input_cr),
+ eq_any_const(dut.i.ctx.op.is_32bit),
+ eq_any_const(dut.i.ctx.op.is_signed),
+ eq_any_const(dut.i.ctx.op.insn),
+ eq_any_const(dut.i.xer_ca),
+ eq_any_const(dut.i.ra),
+ eq_any_const(dut.i.rb),
+ eq_any_const(dut.i.rc),
+ ]
# check that the operation (op) is passed through (and muxid)
comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
- # signed and signed/32 versions of input rs
- a_signed = Signal(signed(64))
- a_signed_32 = Signal(signed(32))
- comb += a_signed.eq(rs)
- comb += a_signed_32.eq(rs[0:32])
-
- # masks: start-left
- mb = Signal(7, reset_less=True)
- ml = Signal(64, reset_less=True)
-
- # clear left?
- with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCL)):
- with m.If(rec.is_32bit):
- comb += mb.eq(m_fields.MB)
- with m.Else():
- comb += mb.eq(md_fields.mb)
- with m.Else():
- with m.If(rec.is_32bit):
- comb += mb.eq(b[0:6])
- with m.Else():
- comb += mb.eq(b+32)
- comb += ml.eq(left_mask(m, mb))
-
- # masks: end-right
- me = Signal(7, reset_less=True)
- mr = Signal(64, reset_less=True)
-
- # clear right?
- with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCR)):
- with m.If(rec.is_32bit):
- comb += me.eq(m_fields.ME)
- with m.Else():
- comb += me.eq(md_fields.me)
- with m.Else():
- with m.If(rec.is_32bit):
- comb += me.eq(b[0:6])
- with m.Else():
- comb += me.eq(63-b)
- comb += mr.eq(right_mask(m, me))
-
- # must check Data.ok
- o_ok = Signal()
- comb += o_ok.eq(1)
-
- # main assertion of arithmetic operations
- with m.Switch(itype):
-
- # left-shift: 64/32-bit
- with m.Case(MicrOp.OP_SHL):
- comb += Assume(ra == 0)
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
- comb += Assert(o[32:64] == 0)
- with m.Else():
- comb += Assert(o == ((rs << b[0:7]) & ((1 << 64)-1)))
-
- # right-shift: 64/32-bit / signed
- with m.Case(MicrOp.OP_SHR):
- comb += Assume(ra == 0)
- with m.If(~rec.is_signed):
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == (rs[0:32] >> b[0:6]))
- comb += Assert(o[32:64] == 0)
- with m.Else():
- comb += Assert(o == (rs >> b[0:7]))
- with m.Else():
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == (a_signed_32 >> b[0:6]))
- comb += Assert(o[32:64] == Repl(rs[31], 32))
- with m.Else():
- comb += Assert(o == (a_signed >> b[0:7]))
-
- # extswsli: 32/64-bit moded
- with m.Case(MicrOp.OP_EXTSWSLI):
- comb += Assume(ra == 0)
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
- comb += Assert(o[32:64] == 0)
- with m.Else():
- # sign-extend to 64 bit
- a_s = Signal(64, reset_less=True)
- comb += a_s.eq(exts(rs, 32, 64))
- comb += Assert(o == ((a_s << b[0:7]) & ((1 << 64)-1)))
-
- # rlwinm, rlwnm, rlwimi
- # *CAN* these even be 64-bit capable? I don't think they are.
- with m.Case(MicrOp.OP_RLC):
- comb += Assume(ra == 0)
- comb += Assume(rec.is_32bit)
-
- # Duplicate some signals so that they're much easier to find
- # in gtkwave.
- # Pro-tip: when debugging, factor out expressions into
- # explicitly named
- # signals, and search using a unique grep-tag (RLC in my case).
- # After
- # debugging, resubstitute values to comply with surrounding
- # code norms.
-
- mrl = Signal(64, reset_less=True, name='MASK_FOR_RLC')
- with m.If(mb > me):
- comb += mrl.eq(ml | mr)
- with m.Else():
- comb += mrl.eq(ml & mr)
-
- ainp = Signal(64, reset_less=True, name='A_INP_FOR_RLC')
- comb += ainp.eq(field(rs, 32, 63))
-
- sh = Signal(6, reset_less=True, name='SH_FOR_RLC')
- comb += sh.eq(b[0:6])
-
- exp_shl = Signal(64, reset_less=True,
- name='A_SHIFTED_LEFT_BY_SH_FOR_RLC')
- comb += exp_shl.eq((ainp << sh) & 0xFFFFFFFF)
-
- exp_shr = Signal(64, reset_less=True,
- name='A_SHIFTED_RIGHT_FOR_RLC')
- comb += exp_shr.eq((ainp >> (32 - sh)) & 0xFFFFFFFF)
-
- exp_rot = Signal(64, reset_less=True,
- name='A_ROTATED_LEFT_FOR_RLC')
- comb += exp_rot.eq(exp_shl | exp_shr)
-
- exp_ol = Signal(32, reset_less=True, name='EXPECTED_OL_FOR_RLC')
- comb += exp_ol.eq(field((exp_rot & mrl) | (ainp & ~mrl),
- 32, 63))
-
- act_ol = Signal(32, reset_less=True, name='ACTUAL_OL_FOR_RLC')
- comb += act_ol.eq(field(o, 32, 63))
-
- # If I uncomment the following lines, I can confirm that all
- # 32-bit rotations work. If I uncomment only one of the
- # following lines, I can confirm that all 32-bit rotations
- # work. When I remove/recomment BOTH lines, however, the
- # assertion fails. Why??
-
-# comb += Assume(mr == 0xFFFFFFFF)
-# comb += Assume(ml == 0xFFFFFFFF)
- #with m.If(rec.is_32bit):
- # comb += Assert(act_ol == exp_ol)
- # comb += Assert(field(o, 0, 31) == 0)
-
- #TODO
- with m.Case(MicrOp.OP_RLCR):
- pass
- with m.Case(MicrOp.OP_RLCL):
- pass
- with m.Default():
- comb += o_ok.eq(0)
-
- # check that data ok was only enabled when op actioned
- comb += Assert(dut.o.o.ok == o_ok)
+ if self.which is None:
+ for i in TstOp:
+ comb += Assume(dut.i.ctx.op.insn_type != i.op)
+ comb += Assert(~dut.o.o.ok)
+ else:
+ # we're only checking a particular operation:
+ comb += Assume(dut.i.ctx.op.insn_type == self.which.op)
+ comb += Assert(dut.o.o.ok)
+
+ # dispatch to check fn for each op
+ getattr(self, f"_check_{self.which.name.lower()}")(m, dut)
return m
+ def _check_shl(self, m, dut):
+ m.d.comb += Assume(dut.i.ra == 0)
+ expected = Signal(64)
+ with m.If(dut.i.ctx.op.is_32bit):
+ m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:6])[:32])
+ with m.Else():
+ m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:7])[:64])
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_shr(self, m, dut):
+ m.d.comb += Assume(dut.i.ra == 0)
+ expected = Signal(64)
+ carry = Signal()
+ shift_in_s = Signal(signed(128))
+ shift_roundtrip = Signal(signed(128))
+ shift_in_u = Signal(128)
+ shift_amt = Signal(7)
+ with m.If(dut.i.ctx.op.is_32bit):
+ m.d.comb += [
+ shift_amt.eq(dut.i.rb[:6]),
+ shift_in_s.eq(dut.i.rs[:32].as_signed()),
+ shift_in_u.eq(dut.i.rs[:32]),
+ ]
+ with m.Else():
+ m.d.comb += [
+ shift_amt.eq(dut.i.rb[:7]),
+ shift_in_s.eq(dut.i.rs.as_signed()),
+ shift_in_u.eq(dut.i.rs),
+ ]
+
+ with m.If(dut.i.ctx.op.is_signed):
+ m.d.comb += [
+ expected.eq(shift_in_s >> shift_amt),
+ shift_roundtrip.eq((shift_in_s >> shift_amt) << shift_amt),
+ carry.eq((shift_in_s < 0) & (shift_roundtrip != shift_in_s)),
+ ]
+ with m.Else():
+ m.d.comb += [
+ expected.eq(shift_in_u >> shift_amt),
+ carry.eq(0),
+ ]
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == Repl(carry, 2))
+
+ def _check_rlc32(self, m, dut):
+ m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+ # rlwimi, rlwinm, and rlwnm
+
+ m.submodules.mask = mask = Mask()
+ expected = Signal(64)
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl32(dut.i.rs[:32], dut.i.rb[:5]))
+ m.d.comb += mask.start.eq(dut.fields.FormM.MB[:] + 32)
+ m.d.comb += mask.end.eq(dut.fields.FormM.ME[:] + 32)
+
+ # for rlwinm and rlwnm, ra is guaranteed to be 0, so that part of
+ # the expression turns into a no-op
+ m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_rlc64(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ # rldic and rldimi
+
+ # `rb` is always a 6-bit immediate
+ m.d.comb += Assume(dut.i.rb[6:] == 0)
+
+ m.submodules.mask = mask = Mask()
+ expected = Signal(64)
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+ mb = dut.fields.FormMD.mb[:]
+ m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+ m.d.comb += mask.end.eq(63 - dut.i.rb[:6])
+
+ # for rldic, ra is guaranteed to be 0, so that part of
+ # the expression turns into a no-op
+ m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_rlcl(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ # rldicl and rldcl
+
+ m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+ m.d.comb += Assume(dut.i.ra == 0)
+
+ m.submodules.mask = mask = Mask()
+ m.d.comb += mask.end.eq(63)
+ mb = dut.fields.FormMD.mb[:]
+ m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+ expected = Signal(64)
+ m.d.comb += expected.eq(rot & mask.out)
+
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_rlcr(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ # rldicr and rldcr
+
+ m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+ m.d.comb += Assume(dut.i.ra == 0)
+
+ m.submodules.mask = mask = Mask()
+ m.d.comb += mask.start.eq(0)
+ me = dut.fields.FormMD.me[:]
+ m.d.comb += mask.end.eq(Cat(me[1:6], me[0]))
+
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+ expected = Signal(64)
+ m.d.comb += expected.eq(rot & mask.out)
+
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_extswsli(self, m, dut):
+ m.d.comb += Assume(dut.i.ra == 0)
+ m.d.comb += Assume(dut.i.rb[6:] == 0)
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit) # all instrs. are 64-bit
+ expected = Signal(64)
+ m.d.comb += expected.eq((dut.i.rs[0:32].as_signed() << dut.i.rb[:6]))
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_ternlog(self, m, dut):
+ lut = dut.fields.FormTLI.TLI[:]
+ for i in range(64):
+ idx = Cat(dut.i.rb[i], dut.i.ra[i], dut.i.rc[i])
+ for j in range(8):
+ with m.If(j == idx):
+ m.d.comb += Assert(dut.o.o.data[i] == lut[j])
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ # grev removed -- leaving code for later use in grevlut
+ def _check_grev32(self, m, dut):
+ m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+ # assert zero-extended
+ m.d.comb += Assert(dut.o.o.data[32:] == 0)
+ i = Signal(5)
+ m.d.comb += eq_any_const(i)
+ idx = dut.i.rb[0: 5] ^ i
+ m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ # grev removed -- leaving code for later use in grevlut
+ def _check_grev64(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ i = Signal(6)
+ m.d.comb += eq_any_const(i)
+ idx = dut.i.rb[0: 6] ^ i
+ m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
class ALUTestCase(FHDLTestCase):
- def test_formal(self):
- module = Driver()
+ def run_it(self, which):
+ module = Driver(which)
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
- def test_ilang(self):
- dut = Driver()
- vl = rtlil.convert(dut, ports=[])
- with open("main_stage.il", "w") as f:
- f.write(vl)
+
+ def test_none(self):
+ self.run_it(None)
+
+ def test_shl(self):
+ self.run_it(TstOp.SHL)
+
+ def test_shr(self):
+ self.run_it(TstOp.SHR)
+
+ def test_rlc32(self):
+ self.run_it(TstOp.RLC32)
+
+ def test_rlc64(self):
+ self.run_it(TstOp.RLC64)
+
+ def test_rlcl(self):
+ self.run_it(TstOp.RLCL)
+
+ def test_rlcr(self):
+ self.run_it(TstOp.RLCR)
+
+ def test_extswsli(self):
+ self.run_it(TstOp.EXTSWSLI)
+
+ def test_ternlog(self):
+ self.run_it(TstOp.TERNLOG)
+
+ @unittest.skip("grev removed -- leaving code for later use in grevlut")
+ def test_grev32(self):
+ self.run_it(TstOp.GREV32)
+
+ @unittest.skip("grev removed -- leaving code for later use in grevlut")
+ def test_grev64(self):
+ self.run_it(TstOp.GREV64)
+
+
+# check that all test cases are covered
+for i in TstOp:
+ assert callable(getattr(ALUTestCase, f"test_{i.name.lower()}"))
if __name__ == '__main__':
# output stage
from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
from nmutil.pipemodbase import PipeModBase
+from soc.fu.pipe_data import get_pspec_draft_bitmanip
from soc.fu.shift_rot.pipe_data import (ShiftRotOutputData,
- ShiftRotInputData)
-from ieee754.part.partsig import PartitionedSignal
+ ShiftRotInputData)
+from nmutil.lut import BitwiseLut
from openpower.decoder.power_enums import MicrOp
from soc.fu.shift_rot.rotator import Rotator
class ShiftRotMainStage(PipeModBase):
def __init__(self, pspec):
super().__init__(pspec, "main")
+ self.draft_bitmanip = get_pspec_draft_bitmanip(pspec)
self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
self.fields.create_specs()
return ShiftRotOutputData(self.pspec)
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
op = self.i.ctx.op
o = self.o.o
+ bitwise_lut = None
+ if self.draft_bitmanip:
+ bitwise_lut = BitwiseLut(input_count=3, width=XLEN)
+ m.submodules.bitwise_lut = bitwise_lut
+ comb += bitwise_lut.inputs[0].eq(self.i.rb)
+ comb += bitwise_lut.inputs[1].eq(self.i.ra)
+ comb += bitwise_lut.inputs[2].eq(self.i.rc)
+
# NOTE: the sh field immediate is read in by PowerDecode2
# (actually DecodeRB), whereupon by way of rb "immediate" mode
# it ends up in self.i.rb.
comb += mb_extra.eq(md_fields['mb'][0:-1][0])
# set up microwatt rotator module
- m.submodules.rotator = rotator = Rotator()
+ m.submodules.rotator = rotator = Rotator(XLEN)
comb += [
rotator.me.eq(me),
rotator.mb.eq(mb),
rotator.mb_extra.eq(mb_extra),
rotator.rs.eq(self.i.rs),
rotator.ra.eq(self.i.a),
- rotator.shift.eq(self.i.rb), # can also be sh (in immediate mode)
+ rotator.shift.eq(self.i.rb), # can also be sh (in immediate mode)
rotator.is_32bit.eq(op.is_32bit),
rotator.arith.eq(op.is_signed),
]
- comb += o.ok.eq(1) # defaults to enabled
+ comb += o.ok.eq(1) # defaults to enabled
# instruction rotate type
mode = Signal(4, reset_less=True)
- with m.Switch(op.insn_type):
- with m.Case(MicrOp.OP_SHL): comb += mode.eq(0b0000) # L-shift
- with m.Case(MicrOp.OP_SHR): comb += mode.eq(0b0001) # R-shift
- with m.Case(MicrOp.OP_RLC): comb += mode.eq(0b0110) # clear LR
- with m.Case(MicrOp.OP_RLCL): comb += mode.eq(0b0010) # clear L
- with m.Case(MicrOp.OP_RLCR): comb += mode.eq(0b0100) # clear R
- with m.Case(MicrOp.OP_EXTSWSLI): comb += mode.eq(0b1000) # L-ext
- with m.Default():
- comb += o.ok.eq(0) # otherwise disable
-
comb += Cat(rotator.right_shift,
rotator.clear_left,
rotator.clear_right,
comb += [o.data.eq(rotator.result_o),
self.o.xer_ca.data.eq(Repl(rotator.carry_out_o, 2))]
+ with m.Switch(op.insn_type):
+ with m.Case(MicrOp.OP_SHL):
+ comb += mode.eq(0b0000) # L-shift
+ with m.Case(MicrOp.OP_SHR):
+ comb += mode.eq(0b0001) # R-shift
+ with m.Case(MicrOp.OP_RLC):
+ comb += mode.eq(0b0110) # clear LR
+ with m.Case(MicrOp.OP_RLCL):
+ comb += mode.eq(0b0010) # clear L
+ with m.Case(MicrOp.OP_RLCR):
+ comb += mode.eq(0b0100) # clear R
+ with m.Case(MicrOp.OP_EXTSWSLI):
+ comb += mode.eq(0b1000) # L-ext
+ if self.draft_bitmanip:
+ with m.Case(MicrOp.OP_TERNLOG):
+ # TODO: this only works for ternlogi, change to get lut
+ # value from register when we implement other variants
+ comb += bitwise_lut.lut.eq(self.fields.FormTLI.TLI[:])
+ comb += o.data.eq(bitwise_lut.output)
+ comb += self.o.xer_ca.data.eq(0)
+ with m.Default():
+ comb += o.ok.eq(0) # otherwise disable
+
###### sticky overflow and context, both pass-through #####
comb += self.o.xer_so.data.eq(self.i.xer_so)
class ShiftRotInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB
- ('INT', 'rc', '0:63'), # RS
- ('XER', 'xer_so', '32'), # XER bit 32: SO
- ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b, self.rs = self.ra, self.rb, self.rc
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('INT', 'rc', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), # XER bit 32: SO
+ ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+
# input to shiftrot final stage (common output)
class ShiftRotOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_so', '32'), # bit0: so
- ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_so', '32'), # bit0: so
+ ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+ ]
+
# output from shiftrot final stage (common output) - note that XER.so
# is *not* included (the only reason it's in the input is because of CR0)
class ShiftRotOutputDataFinal(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+ ]
+
class ShiftRotPipeSpec(CommonPipeSpec):
- regspec = (ShiftRotInputData.regspec, ShiftRotOutputDataFinal.regspec)
+ regspecklses = (ShiftRotInputData, ShiftRotOutputDataFinal)
opsubsetkls = CompSROpSubset
from soc.fu.shift_rot.main_stage import ShiftRotMainStage
from soc.fu.shift_rot.output_stage import ShiftRotOutputStage
-class ShiftRotStages(PipeModBaseChain):
+class ShiftRotStart(PipeModBaseChain):
def get_chain(self):
inp = ShiftRotInputStage(self.pspec)
+ return [inp]
+
+class ShiftRotStage(PipeModBaseChain):
+ def get_chain(self):
main = ShiftRotMainStage(self.pspec)
- return [inp, main]
+ return [main]
class ShiftRotStageEnd(PipeModBaseChain):
def __init__(self, pspec):
ControlBase.__init__(self)
self.pspec = pspec
- self.pipe1 = ShiftRotStages(pspec)
- self.pipe2 = ShiftRotStageEnd(pspec)
- self._eqs = self.connect([self.pipe1, self.pipe2])
+ self.pipe1 = ShiftRotStart(pspec)
+ self.pipe2 = ShiftRotStage(pspec)
+ self.pipe3 = ShiftRotStageEnd(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
m.submodules.pipe1 = self.pipe1
m.submodules.pipe2 = self.pipe2
+ m.submodules.pipe3 = self.pipe3
m.d.comb += self._eqs
return m
# note BE bit numbering
-def right_mask(m, mask_begin):
- ret = Signal(64, name="right_mask", reset_less=True)
- with m.If(mask_begin <= 64):
- m.d.comb += ret.eq((1 << (64-mask_begin)) - 1)
+def right_mask(m, mask_begin, width):
+ ret = Signal(width, name="right_mask", reset_less=True)
+ with m.If(mask_begin <= width):
+ m.d.comb += ret.eq((1 << (width-mask_begin)) - 1)
with m.Else():
m.d.comb += ret.eq(0)
return ret
-def left_mask(m, mask_end):
- ret = Signal(64, name="left_mask", reset_less=True)
- m.d.comb += ret.eq(~((1 << (63-mask_end)) - 1))
+def left_mask(m, mask_end, width):
+ ret = Signal(width, name="left_mask", reset_less=True)
+ m.d.comb += ret.eq(~((1 << (width-1-mask_end)) - 1))
return ret
* clear_right = 1 when insn_type is OP_RLC or OP_RLCR
"""
- def __init__(self):
+ def __init__(self, width):
+ self.width = width
# input
self.me = Signal(5, reset_less=True) # ME field
self.mb = Signal(5, reset_less=True) # MB field
# extra bit of mb in MD-form
self.mb_extra = Signal(1, reset_less=True)
- self.ra = Signal(64, reset_less=True) # RA
- self.rs = Signal(64, reset_less=True) # RS
+ self.ra = Signal(width, reset_less=True) # RA
+ self.rs = Signal(width, reset_less=True) # RS
self.shift = Signal(7, reset_less=True) # RB[0:7]
self.is_32bit = Signal(reset_less=True)
self.right_shift = Signal(reset_less=True)
self.clear_right = Signal(reset_less=True)
self.sign_ext_rs = Signal(reset_less=True)
# output
- self.result_o = Signal(64, reset_less=True)
+ self.result_o = Signal(width, reset_less=True)
self.carry_out_o = Signal(reset_less=True)
def elaborate(self, platform):
+ width = self.width
m = Module()
comb = m.d.comb
ra, rs = self.ra, self.rs
sh = Signal(7, reset_less=True)
mb = Signal(7, reset_less=True)
me = Signal(7, reset_less=True)
- mr = Signal(64, reset_less=True)
- ml = Signal(64, reset_less=True)
+ mr = Signal(width, reset_less=True)
+ ml = Signal(width, reset_less=True)
output_mode = Signal(2, reset_less=True)
hi32 = Signal(32, reset_less=True)
- repl32 = Signal(64, reset_less=True)
+ repl32 = Signal(width, reset_less=True)
# First replicate bottom 32 bits to both halves if 32-bit
with m.If(self.is_32bit):
# sign-extend bottom 32 bits
comb += hi32.eq(Repl(rs[31], 32))
with m.Else():
- comb += hi32.eq(rs[32:64])
+ if width == 64:
+ comb += hi32.eq(rs[32:64])
comb += repl32.eq(Cat(rs[0:32], hi32))
shift_signed = Signal(signed(6))
comb += rot_count.eq(self.shift[0:6])
# ROTL submodule
- m.submodules.rotl = rotl = ROTL(64)
+ m.submodules.rotl = rotl = ROTL(width)
comb += rotl.a.eq(repl32)
comb += rotl.b.eq(rot_count)
comb += rot.eq(rotl.o)
comb += me.eq(Cat(~sh[0:6], sh[6]))
# Calculate left and right masks
- m.submodules.right_mask = right_mask = Mask(64)
- with m.If(mb <= 64):
- comb += right_mask.shift.eq(64-mb)
+ m.submodules.right_mask = right_mask = Mask(width)
+ with m.If(mb <= width):
+ comb += right_mask.shift.eq(width-mb)
comb += mr.eq(right_mask.mask)
with m.Else():
comb += mr.eq(0)
#comb += mr.eq(right_mask(m, mb))
- m.submodules.left_mask = left_mask = Mask(64)
- comb += left_mask.shift.eq(63-me)
+ m.submodules.left_mask = left_mask = Mask(width)
+ comb += left_mask.shift.eq(width-1-me)
comb += ml.eq(~left_mask.mask)
#comb += ml.eq(left_mask(m, me))
# 10 for rldicl, sr[wd]
# 1z for sra[wd][i], z = 1 if rs is negative
with m.If((self.clear_left & ~self.clear_right) | self.right_shift):
- comb += output_mode.eq(Cat(self.arith & repl32[63], Const(1, 1)))
+ comb += output_mode.eq(Cat(self.arith &
+ repl32[width-1], Const(1, 1)))
with m.Else():
mbgt = self.clear_right & (mb[0:6] > me[0:6])
comb += output_mode.eq(Cat(mbgt, Const(0, 1)))
comb = m.d.comb
mr = Signal(64)
mb = Signal(6)
- comb += mr.eq(left_mask(m, mb))
+ comb += mr.eq(left_mask(m, mb, 64))
def loop():
for i in range(64):
from nmutil.formaltest import FHDLTestCase
from nmigen.cli import rtlil
from soc.fu.shift_rot.maskgen import MaskGen
-from openpower.decoder.helpers import MASK
+from openpower.decoder.helpers import ISACallerHelper
import random
import unittest
class MaskGenTestCase(FHDLTestCase):
def test_maskgen(self):
+ MASK = ISACallerHelper(64, FPSCR=None).MASK
m = Module()
comb = m.d.comb
m.submodules.dut = dut = MaskGen(64)
from nmutil.sim_tmp_alternative import Simulator, Settle
from openpower.test.shift_rot.shift_rot_cases import ShiftRotTestCase
+from openpower.test.bitmanip.bitmanip_cases import BitManipTestCase
def get_cu_inputs(dec2, sim):
def set_alu_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
yield from ALUHelpers.set_int_ra(alu, dec2, inp)
class ShiftRotIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = ShiftRotPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+ pspec.draft_bitmanip = True
alu = ShiftRotBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("shift_rot_pipeline.il", "w") as f:
yield from set_alu_inputs(alu, pdecode2, simulator)
# set valid for one cycle, propagate through pipeline...
- yield alu.p.valid_i.eq(1)
+ yield alu.p.i_valid.eq(1)
yield
- yield alu.p.valid_i.eq(0)
+ yield alu.p.i_valid.eq(0)
opname = code.split(' ')[0]
yield from simulator.call(opname)
index = simulator.pc.CIA.value//4
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
while not vld:
yield
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
yield
- alu_out = yield alu.n.data_o.o.data
+ alu_out = yield alu.n.o_data.o.data
yield from self.check_alu_outputs(alu, pdecode2,
simulator, code)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = ShiftRotPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+ pspec.draft_bitmanip = True
m.submodules.alu = alu = ShiftRotBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
yield from ALUHelpers.get_xer_ca(res, alu, dec2)
yield from ALUHelpers.get_int_o(res, alu, dec2)
- print ("hw outputs", res)
+ print("hw outputs", res)
yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
yield from ALUHelpers.get_wr_sim_xer_ca(sim_o, sim, dec2)
- print ("sim outputs", sim_o)
+ print("sim outputs", sim_o)
ALUHelpers.check_cr_a(self, res, sim_o, "CR%d %s" % (cridx, code))
ALUHelpers.check_xer_ca(self, res, sim_o, code)
unittest.main(exit=False)
suite = unittest.TestSuite()
suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+ suite.addTest(TestRunner(BitManipTestCase().test_data))
suite.addTest(TestRunner(ShiftRotIlangCase().test_data))
runner = unittest.TextTestRunner()
from openpower.decoder.power_fieldsn import SignalBitRange
# use POWER numbering. sigh.
+
+
def xer_bit(name):
return 63-XER_bits[name]
width = p.width
comb += p.eq(AnyConst(width))
- pspec = SPRPipeSpec(id_wid=2)
+ pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = SPRMainStage(pspec)
# frequently used aliases
a = dut.i.a
ca_in = dut.i.xer_ca[0] # CA carry in
- ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+ ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
so_in = dut.i.xer_so # SO sticky overflow
ov_in = dut.i.xer_ov[0] # XER OV in
- ov32_in = dut.i.xer_ov[1] # XER OV32 in
+ ov32_in = dut.i.xer_ov[1] # XER OV32 in
o = dut.o.o
# setup random inputs
comb += dut.i.ctx.op.eq(rec)
# check that the operation (op) is passed through (and muxid)
- comb += Assert(dut.o.ctx.op == dut.i.ctx.op )
- comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid )
+ comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
+ comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
# MTSPR
fields = DecodeFields(SignalBitRange, [dut.i.ctx.op.insn])
super().__init__(pspec, "spr_main")
# test if regfiles are reduced
self.regreduce_en = (hasattr(pspec, "regreduce") and
- (pspec.regreduce == True))
+ (pspec.regreduce == True))
self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
self.fields.create_specs()
so_i, ov_i, ca_i = self.i.xer_so, self.i.xer_ov, self.i.xer_ca
so_o, ov_o, ca_o = self.o.xer_so, self.o.xer_ov, self.o.xer_ca
o, spr1_o, fast1_o = self.o.o, self.o.spr1, self.o.fast1
+ state1_i, state1_o = self.i.state1, self.o.state1
# take copy of D-Form TO field
x_fields = self.fields.FormXFX
#### MTSPR ####
with m.Case(MicrOp.OP_MTSPR):
with m.Switch(spr):
- # fast SPRs first
+ # State SPRs first, note that this triggers a regfile write
+ # which is monitored right the way down in TestIssuerBase.
+ with m.Case(SPR.DEC, SPR.TB):
+ comb += state1_o.data.eq(a_i)
+ comb += state1_o.ok.eq(1)
+
+ # Fast SPRs second: anything in FAST regs
with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
- SPR.SRR1, SPR.XER, SPR.DEC):
+ SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+ SPR.SPRG0_priv, SPR.SPRG1_priv,
+ SPR.SPRG2_priv, SPR.SPRG3,
+ SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
comb += fast1_o.data.eq(a_i)
comb += fast1_o.ok.eq(1)
# XER is constructed
with m.Case(MicrOp.OP_MFSPR):
comb += o.ok.eq(1)
with m.Switch(spr):
- # fast SPRs first
- with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0, SPR.SRR1,
- SPR.XER, SPR.DEC, SPR.TB):
+ # state SPRs first
+ with m.Case(SPR.DEC, SPR.TB):
+ comb += o.data.eq(state1_i)
+ # TBU is upper 32-bits of State Reg
+ with m.Case(SPR.TBU):
+ comb += o.data[0:32].eq(state1_i[32:64])
+
+ # fast SPRs second
+ with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
+ SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+ SPR.SPRG0_priv, SPR.SPRG1_priv,
+ SPR.SPRG2_priv, SPR.SPRG3,
+ SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
comb += o.data.eq(fast1_i)
with m.If(spr == SPR.XER):
# bits 0:31 and 35:43 are treated as reserved
# and return 0s when read using mfxer
comb += o[32:64].eq(0) # MBS0 bits 0-31
- comb += o[63-43:64-35].eq(0) # MSB0 bits 35-43
+ comb += o[63-43:64-35].eq(0) # MSB0 bits 35-43
# sticky
comb += o[63-XER_bits['SO']].eq(so_i)
# overflow
# carry
comb += o[63-XER_bits['CA']].eq(ca_i[0])
comb += o[63-XER_bits['CA32']].eq(ca_i[1])
- with m.Case(SPR.TBU):
- comb += o.data[0:32].eq(fast1_i[32:64])
-
# slow SPRs TODO
with m.Default():
comb += o.data.eq(spr1_i)
regspec = [('INT', 'ra', '0:63'), # RA
('SPR', 'spr1', '0:63'), # SPR (slow)
('FAST', 'fast1', '0:63'), # SPR (fast: LR, CTR etc)
+ ('STATE', 'state1', '0:63'), # SPR (DEC/TB)
('XER', 'xer_so', '32'), # XER bit 32: SO
('XER', 'xer_ov', '33,44'), # XER bit 34/45: CA/CA32
('XER', 'xer_ca', '34,45')] # bit0: ov, bit1: ov32
# convenience
self.a = self.ra
+# note that state1 gets a corresponding "state1" write port created
+# by core.py which is "monitored" by TestIssuerBase (hack-job, sigh).
+# when writes are spotted then the DEC/TB FSM resets and re-reads
+# DEC/TB.
class SPROutputData(FUBaseData):
regspec = [('INT', 'o', '0:63'), # RT
('SPR', 'spr1', '0:63'), # SPR (slow)
('FAST', 'fast1', '0:63'), # SPR (fast: LR, CTR etc)
+ ('STATE', 'state1', '0:63'), # SPR (DEC/TB)
('XER', 'xer_so', '32'), # XER bit 32: SO
('XER', 'xer_ov', '33,44'), # XER bit 34/45: CA/CA32
('XER', 'xer_ca', '34,45')] # bit0: ov, bit1: ov32
class SPRPipeSpec(CommonPipeSpec):
- regspec = (SPRInputData.regspec, SPROutputData.regspec)
+ regspecklses = (SPRInputData, SPROutputData)
opsubsetkls = CompSPROpSubset
def set_alu_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
yield from ALUHelpers.set_int_ra(alu, dec2, inp)
class SPRIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = SPRPipeSpec(id_wid=2)
+ pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
alu = SPRBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("trap_pipeline.il", "w") as f:
index = pc//4
print("pc after %08x" % (pc))
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
while not vld:
yield
- vld = yield alu.n.valid_o
+ vld = yield alu.n.o_valid
yield
yield from self.check_alu_outputs(alu, pdecode2, sim, code)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pspec = SPRPipeSpec(id_wid=2)
+ pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.alu = alu = SPRBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.p.valid_i.eq(1)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.p.i_valid.eq(1)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
comb = m.d.comb
rec = CompTrapOpSubset()
- pspec = TrapPipeSpec(id_wid=2)
+ pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = TrapMainStage(pspec)
###################
with m.Case(MicrOp.OP_MTMSRD):
- msr_od = msr_o.data # another "shortener"
+ msr_od = msr_o.data # another "shortener"
with m.If(L == 0):
# if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
# MSR[48] <- (RS)[48] | (RS)[49]
# MSR[58] <- (RS)[58] | (RS)[49]
# MSR[59] <- (RS)[59] | (RS)[49]
- PR = field(rs, 49) # alias/copy of SRR1 PR field
+ PR = field(rs, 49) # alias/copy of SRR1 PR field
comb += [
Assert(field(msr_od, 48) == field(rs, 48) | PR),
Assert(field(msr_od, 58) == field(rs, 58) | PR),
# RFID. v3.0B p955
###################
with m.Case(MicrOp.OP_RFID):
- msr_od = msr_o.data # another "shortener"
+ msr_od = msr_o.data # another "shortener"
comb += [
Assert(msr_o.ok),
Assert(nia_o.ok),
# if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
# MSR[29:31] <- SRR1[29:31]
- with m.If((field(msr_i , 29, 31) != 0b010) |
+ with m.If((field(msr_i, 29, 31) != 0b010) |
(field(srr1_i, 29, 31) != 0b000)):
comb += Assert(F(msr_od, 29, 31) == F(srr1_i, 29, 31))
with m.Else():
# MSR[48] <- (RS)[48] | (RS)[49]
# MSR[58] <- (RS)[58] | (RS)[49]
# MSR[59] <- (RS)[59] | (RS)[49]
- PR = field(srr1_i, 49) # alias/copy of SRR1 PR field
+ PR = field(srr1_i, 49) # alias/copy of SRR1 PR field
comb += [
Assert(field(msr_od, 48) == field(srr1_i, 48) | PR),
Assert(field(msr_od, 58) == field(srr1_i, 58) | PR),
if __name__ == '__main__':
unittest.main()
-
def msr_copy(msr_o, msr_i, zero_me=True):
- """msr_copy
+ """msr_copy (also used to copy relevant bits into SRR1)
+
ISA says this:
Defined MSR bits are classified as either full func tion or partial
function. Full function MSR bits are saved in SRR1 or HSRR1 when
return l
-def msr_check_pr(m, msr):
+def msr_check_pr(m, d_in, msr):
"""msr_check_pr: checks "problem state"
"""
comb = m.d.comb
- with m.If(msr[MSR.PR]):
+ with m.If(d_in[MSR.PR]):
comb += msr[MSR.EE].eq(1) # set external interrupt bit
comb += msr[MSR.IR].eq(1) # set instruction relocation bit
comb += msr[MSR.DR].eq(1) # set data relocation bit
super().__init__(pspec, "main")
self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
self.fields.create_specs()
+ self.kaivb = Signal(64) # KAIVB SPR
+ self.state_reset = Signal() # raise high to reset KAIVB cache
def trap(self, m, trap_addr, return_addr):
"""trap. sets new PC, stores MSR and old PC in SRR1 and SRR0
op = self.i.ctx.op
msr_i = op.msr
svstate_i = op.svstate
+
+ exc = LDSTException("trapexc")
+ comb += exc.eq(op.ldst_exc)
+ srr1_i = exc.srr1 # new SRR1 bits come from exception
nia_o = self.o.nia
svsrr0_o, srr0_o, srr1_o = self.o.svsrr0, self.o.srr0, self.o.srr1
- # trap address
+ # trap address, including KAIVB override
comb += nia_o.data.eq(trap_addr)
+ comb += nia_o.data[13:].eq(self.kaivb[13:])
comb += nia_o.ok.eq(1)
# addr to begin from on return
comb += srr0_o.data.eq(return_addr)
comb += srr0_o.ok.eq(1)
- # take a copy of the current MSR into SRR1
- comb += msr_copy(srr1_o.data, msr_i) # old MSR
+ # take a copy of the current MSR into SRR1, but first copy old SRR1
+ # this preserves the bits of SRR1 that are not supposed to change:
+ # MSR.IR,DR,PMM,RI,LE (0-5) and MR,FP,ME,FE0 (11-14)
+ # i would suggest reading v3.0C p1063 Book III section 7.2.1 for
+ # advice but it's so obscure and indirect, that it's just easier
+ # to copy microwatt behaviour. see writeback.vhdl
+ # IMPORTANT: PowerDecoder2 needed to actually read SRR1 for
+ # it to have the contents *of* SRR1 to copy over!
+ comb += msr_copy(srr1_o.data, msr_i, False) # old MSR
+ comb += srr1_o.data[16:22].eq(srr1_i[0:6]) # IR,DR,PMM,RI,LE
+ comb += srr1_o.data[27:31].eq(srr1_i[11:15]) # MR,FP,ME,FE0
comb += srr1_o.ok.eq(1)
# take a copy of the current SVSTATE into SVSRR0
def elaborate(self, platform):
m = Module()
- comb = m.d.comb
+ comb, sync = m.d.comb, m.d.sync
op = self.i.ctx.op
# convenience variables
srr0_o, srr1_o, svsrr0_o = self.o.srr0, self.o.srr1, self.o.svsrr0
traptype, trapaddr = op.traptype, op.trapaddr
+ # hard reset of KAIVB
+ with m.If(self.state_reset):
+ sync += self.kaivb.eq(0)
+
# take copy of D-Form TO field
i_fields = self.fields.FormD
to = Signal(i_fields.TO[0:-1].shape())
# TODO: some #defines for the bits n stuff.
with m.Switch(op.insn_type):
+ ##############
+ # KAIVB https://bugs.libre-soc.org/show_bug.cgi?id=859
+
+ with m.Case(MicrOp.OP_MTSPR):
+ sync += self.kaivb.eq(a_i)
+
+ with m.Case(MicrOp.OP_MFSPR):
+ comb += o.data.eq(self.kaivb)
+ comb += o.ok.eq(1)
+
###############
# TDI/TWI/TD/TW. v3.0B p90-91
comb += srr1_o.data[PI.FP].eq(1)
with m.If(traptype & TT.ADDR):
comb += srr1_o.data[PI.ADR].eq(1)
- with m.If(traptype & TT.MEMEXC):
+ with m.If((traptype & TT.MEMEXC).bool() &
+ (trapaddr == 0x400)):
+ # Instruction Storage Interrupt (ISI - 0x400)
+ # v3.0C Book III Chap 7.5.5 p1085
# decode exception bits, store in SRR1
exc = LDSTException("trapexc")
comb += exc.eq(op.ldst_exc)
# MTMSR/D. v3.0B p TODO - move to MSR
with m.Case(MicrOp.OP_MTMSRD, MicrOp.OP_MTMSR):
- L = self.fields.FormX.L[0:-1] # X-Form field L
+ # L => bit 16 in LSB0, bit 15 in MSB0 order
+ L = self.fields.FormX.L1[0:1] # X-Form field L1
# start with copy of msr
- comb += msr_o.eq(msr_i)
+ comb += msr_o.data.eq(msr_i)
with m.If(L):
# just update RI..EE
comb += msr_o.data[MSR.RI].eq(a_i[MSR.RI])
# mtmsr - 32-bit, only room for bottom 32 LSB flags
for stt, end in [(1,12), (13, 32)]:
comb += msr_o.data[stt:end].eq(a_i[stt:end])
- msr_check_pr(m, msr_o.data)
+ # check problem state: if set, not permitted to set EE,IR,DR
+ msr_check_pr(m, a_i, msr_o.data)
# Per https://bugs.libre-soc.org/show_bug.cgi?id=325#c123,
# this actually *is* in the microwatt code now.
# hypervisor stuff. here: bits 3 (HV) and 51 (ME) were
# copied over by msr_copy but if HV was not set we need
# the *original* (msr_i) bits
- with m.If(~msr_i[MSR.HV]):
- comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
- comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
+ # XXX taking this out to see what happens when running
+ # linux-5.7 microwatt buildroot. microwatt does not
+ # implement HV, so this is unlikely to work. 0x900
+ # linux kernel exception handling tends to support this
+ # with m.If(~msr_i[MSR.HV]):
+ # comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
+ # comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
comb += msr_o.ok.eq(1)
# MSR was in srr1: copy it over, however *caveats below*
comb += msr_copy(msr_o.data, srr1_i, zero_me=False) # don't zero
- with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
- with m.If(field(msr_i, 3)): # HV
- comb += field(msr_o, 51).eq(field(srr1_i, 51)) # ME
- with m.Else():
- comb += field(msr_o, 51).eq(field(msr_i, 51)) # ME
-
- # check problem state
- msr_check_pr(m, msr_o.data)
+ if False: # XXX no - not doing hypervisor yet
+ with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
+ with m.If(field(msr_i, 3)): # HV
+ comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+ with m.Else():
+ comb += field(msr_o.data, 51).eq(field(msr_i, 51)) # ME
+ else:
+ # same as microwatt: treat MSR.ME rfid same as hrfid
+ comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+
+ # check problem state: if set, not permitted to set EE,IR,DR
+ msr_check_pr(m, srr1_i, msr_o.data)
# don't understand but it's in the spec. again: bits 32-34
# are copied from srr1_i and need *restoring* to msr_i
# ... however we *do* need to *write* MSR, NIA, SVSTATE (RFID)
('STATE', 'nia', '0:63'), # NIA (Next PC)
('STATE', 'msr', '0:63'), # MSR
- ('STATE', 'svstate', '0:31')] # SVSTATE
+ ('STATE', 'svstate', '0:63')] # SVSTATE
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
class TrapPipeSpec(CommonPipeSpec):
- regspec = (TrapInputData.regspec, TrapOutputData.regspec)
+ regspecklses = (TrapInputData, TrapOutputData)
opsubsetkls = CompTrapOpSubset
def set_alu_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
- # and place it into data_i.b
+ # and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
yield from ALUHelpers.set_int_ra(alu, dec2, inp)
class TrapIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = TrapPipeSpec(id_wid=2)
+ pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
alu = TrapBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("trap_pipeline.il", "w") as f:
class TestRunner(unittest.TestCase):
- def __init__(self, test_data):
- super().__init__("run_all")
- self.test_data = test_data
- def run_all(self):
+ def execute(self, alu, instruction, pdecode2, test):
+ program = test.program
+ sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
+ test.mem, test.msr,
+ bigendian=bigendian)
+ gen = program.generate_instructions()
+ instructions = list(zip(gen, program.assembly.splitlines()))
+
+ msr = sim.msr.value
+ pc = sim.pc.CIA.value
+ print("starting msr, pc %08x, %08x" % (msr, pc))
+ index = pc//4
+ while index < len(instructions):
+ ins, code = instructions[index]
+
+ print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
+ print(code)
+ if 'XER' in sim.spr:
+ so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
+ ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
+ ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
+ print("before: so/ov/32", so, ov, ov32)
+
+ # ask the decoder to decode this binary data (endian'd)
+ yield pdecode2.dec.bigendian.eq(bigendian) # l/big?
+ yield pdecode2.state.msr.eq(msr) # set MSR in pdecode2
+ yield pdecode2.state.pc.eq(pc) # set CIA in pdecode2
+ yield instruction.eq(ins) # raw binary instr.
+ yield Settle()
+ fn_unit = yield pdecode2.e.do.fn_unit
+ asmcode = yield pdecode2.e.asmcode
+ dec_asmcode = yield pdecode2.dec.op.asmcode
+ print("asmcode", asmcode, dec_asmcode)
+ self.assertEqual(fn_unit, Function.TRAP.value)
+ alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
+
+ # set valid for one cycle, propagate through pipeline...
+ yield alu.p.i_valid.eq(1)
+ yield
+ yield alu.p.i_valid.eq(0)
+
+ opname = code.split(' ')[0]
+ yield from sim.call(opname)
+ pc = sim.pc.CIA.value
+ index = pc//4
+ print("pc after %08x" % (pc))
+ msr = sim.msr.value
+ print("msr after %08x" % (msr))
+
+ vld = yield alu.n.o_valid
+ while not vld:
+ yield
+ vld = yield alu.n.o_valid
+ yield
+
+ yield from self.check_alu_outputs(alu, pdecode2, sim, code)
+ yield Settle()
+
+ def test_it(self):
+ test_data = TrapTestCase().test_data
m = Module()
comb = m.d.comb
instruction = Signal(32)
- pdecode = create_pdecode()
-
- m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
+ fn_name = "TRAP"
+ opkls = TrapPipeSpec.opsubsetkls
- pspec = TrapPipeSpec(id_wid=2)
+ pdecode = create_pdecode()
+ m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+ pdecode, opkls, fn_name)
+ pdecode = pdecode2.dec
+
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = TrapPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = TrapBasePipe(pspec)
- comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.p.valid_i.eq(1)
- comb += alu.n.ready_i.eq(1)
+ comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+ comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
sim.add_clock(1e-6)
def process():
- for test in self.test_data:
+ for test in test_data:
print(test.name)
program = test.program
with self.subTest(test.name):
- sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
- test.mem, test.msr,
- bigendian=bigendian)
- gen = program.generate_instructions()
- instructions = list(zip(gen, program.assembly.splitlines()))
-
- msr = sim.msr.value
- pc = sim.pc.CIA.value
- print("starting msr, pc %08x, %08x" % (msr, pc))
- index = pc//4
- while index < len(instructions):
- ins, code = instructions[index]
-
- print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
- print(code)
- if 'XER' in sim.spr:
- so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
- ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
- ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
- print("before: so/ov/32", so, ov, ov32)
-
- # ask the decoder to decode this binary data (endian'd)
- yield pdecode2.dec.bigendian.eq(bigendian) # l/big?
- yield pdecode2.state.msr.eq(msr) # set MSR in pdecode2
- yield pdecode2.state.pc.eq(pc) # set CIA in pdecode2
- yield instruction.eq(ins) # raw binary instr.
- yield Settle()
- fn_unit = yield pdecode2.e.do.fn_unit
- self.assertEqual(fn_unit, Function.TRAP.value)
- alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
- yield
- opname = code.split(' ')[0]
- yield from sim.call(opname)
- pc = sim.pc.CIA.value
- index = pc//4
- print("pc after %08x" % (pc))
- msr = sim.msr.value
- print("msr after %08x" % (msr))
-
- vld = yield alu.n.valid_o
- while not vld:
- yield
- vld = yield alu.n.valid_o
- yield
-
- yield from self.check_alu_outputs(alu, pdecode2,
- sim, code)
+ yield from self.execute(alu, instruction, pdecode2, test)
sim.add_sync_process(process)
with sim.write_vcd("alu_simulator.vcd", "simulator.gtkw",
def check_alu_outputs(self, alu, dec2, sim, code):
- rc = yield dec2.e.do.rc.data
- cridx_ok = yield dec2.e.write_cr.ok
- cridx = yield dec2.e.write_cr.data
-
- print("check extra output", repr(code), cridx_ok, cridx)
- if rc:
- self.assertEqual(cridx, 0, code)
-
sim_o = {}
res = {}
if __name__ == "__main__":
- unittest.main(exit=False)
- suite = unittest.TestSuite()
- suite.addTest(TestRunner(TrapTestCase().test_data))
- suite.addTest(TestRunner(TrapIlangCase().test_data))
-
- runner = unittest.TextTestRunner()
- runner.run(suite)
+ unittest.main()
('insn', 32),
('msr', 64), # from core.state
('cia', 64), # likewise
- ('svstate', 32), # likewise
+ ('svstate', 64), # likewise
('is_32bit', 1),
('traptype', TT.size), # see trap main_stage.py, PowerDecoder2
('trapaddr', 13),
- ('ldst_exc', len(LDSTException._exc_types)),
+ ('ldst_exc', LDSTException.length), # blech
]
super().__init__(layout, name=name)
# highest priority interrupt currently presented (which is allowed
# via XICS)
#
+# Bugreports:
+#
+# * https://bugs.libre-soc.org/show_bug.cgi?id=407
"""
from nmigen import Elaboratable, Module, Signal, Cat, Const, Record, Array, Mux
from nmutil.iocontrol import RecordObject
class XICS_ICP(Elaboratable):
- def __init__(self):
- class Spec: pass
- spec = Spec()
+ def __init__(self, spec=None):
+ if spec is None:
+ class Spec: pass
+ spec = Spec()
spec.addr_wid = 30
spec.mask_wid = 4
spec.reg_wid = 32
class XICS_ICS(Elaboratable):
- def __init__(self, SRC_NUM=16, PRIO_BITS=8):
+ def __init__(self, spec=None, SRC_NUM=16, PRIO_BITS=8):
self.SRC_NUM = SRC_NUM
self.PRIO_BITS = PRIO_BITS
self.pri_masked = (1<<self.PRIO_BITS)-1
- class Spec: pass
- spec = Spec()
+ if spec is None:
+ class Spec: pass
+ spec = Spec()
spec.addr_wid = 30
spec.mask_wid = 4
spec.reg_wid = 32
-Subproject commit 6efd2e59703f6f0747435f97030e8a463233457f
+Subproject commit 0f03df1546c8cf6ab91ef63b04713dca768a84c4
# inputs: address to fetch PC, and valid/stall signalling
self.a_pc_i = Signal(self.addr_wid)
self.a_stall_i = Signal()
- self.a_valid_i = Signal()
+ self.a_i_valid = Signal()
self.f_stall_i = Signal()
- self.f_valid_i = Signal()
+ self.f_i_valid = Signal()
# outputs: instruction (or error), and busy indicators
self.a_busy_o = Signal()
def __iter__(self):
yield self.a_pc_i
yield self.a_stall_i
- yield self.a_valid_i
+ yield self.a_i_valid
yield self.f_stall_i
- yield self.f_valid_i
+ yield self.f_i_valid
yield self.a_busy_o
yield self.f_busy_o
yield self.f_instr_o
ibus_rdata = Signal.like(self.ibus.dat_r)
with m.If(self.ibus.cyc):
- with m.If(self.ibus.ack | self.ibus.err | ~self.f_valid_i):
+ with m.If(self.ibus.ack | self.ibus.err | ~self.f_i_valid):
m.d.sync += [
self.ibus.cyc.eq(0),
self.ibus.stb.eq(0),
self.ibus.sel.eq(0),
ibus_rdata.eq(self.ibus.dat_r)
]
- with m.Elif(self.a_valid_i & ~self.a_stall_i):
+ with m.Elif(self.a_i_valid & ~self.a_stall_i):
m.d.sync += [
self.ibus.adr.eq(self.a_pc_i[self.adr_lsbs:]),
self.ibus.cyc.eq(1),
icache.s1_addr.eq(self.a_pc_i[self.adr_lsbs:]),
icache.s1_flush.eq(self.a_flush),
icache.s1_stall.eq(self.a_stall_i),
- icache.s1_valid.eq(self.a_valid_i & a_icache_select),
+ icache.s1_valid.eq(self.a_i_valid & a_icache_select),
icache.s2_addr.eq(self.f_pc[self.adr_lsbs:]),
icache.s2_re.eq(Const(1)),
icache.s2_evict.eq(Const(0)),
- icache.s2_valid.eq(self.f_valid_i & f_icache_select)
+ icache.s2_valid.eq(self.f_i_valid & f_icache_select)
]
iba = WishboneArbiter(self.pspec)
bare_port = iba.port(priority=1)
bare_rdata = Signal.like(bare_port.dat_r)
with m.If(bare_port.cyc):
- with m.If(bare_port.ack | bare_port.err | ~self.f_valid_i):
+ with m.If(bare_port.ack | bare_port.err | ~self.f_i_valid):
m.d.sync += [
bare_port.cyc.eq(0),
bare_port.stb.eq(0),
bare_port.sel.eq(0),
bare_rdata.eq(bare_port.dat_r)
]
- with m.Elif(~a_icache_select & self.a_valid_i & ~self.a_stall_i):
+ with m.Elif(~a_icache_select & self.a_i_valid & ~self.a_stall_i):
m.d.sync += [
bare_port.cyc.eq(1),
bare_port.stb.eq(1),
self.x_st_data_i = Signal(data_wid) # The data to write when storing
self.x_stall_i = Signal() # do nothing until low
- self.x_valid_i = Signal() # Whether x pipeline stage is
+ self.x_i_valid = Signal() # Whether x pipeline stage is
# currently enabled (I
# think?). Set to 1 for #now
self.m_stall_i = Signal() # do nothing until low
- self.m_valid_i = Signal() # Whether m pipeline stage is
+ self.m_i_valid = Signal() # Whether m pipeline stage is
# currently enabled. Set
# to 1 for now
self.m_busy_o = Signal() # set when the memory is busy
self.m_ld_data_o = Signal(data_wid) # Data returned from memory read
- # Data validity is NOT indicated by m_valid_i or x_valid_i as
+ # Data validity is NOT indicated by m_i_valid or x_i_valid as
# those are inputs. I believe it is valid on the next cycle
# after raising m_load where busy is low
yield self.x_st_data_i
yield self.x_stall_i
- yield self.x_valid_i
+ yield self.x_i_valid
yield self.m_stall_i
- yield self.m_valid_i
+ yield self.m_i_valid
yield self.x_busy_o
yield self.m_busy_o
yield self.m_ld_data_o
with m.If(self.jtag_en): # for safety, JTAG can completely disable WB
with m.If(self.dbus.cyc):
- with m.If(self.dbus.ack | self.dbus.err | ~self.m_valid_i):
+ with m.If(self.dbus.ack | self.dbus.err | ~self.m_i_valid):
m.d.sync += [
self.dbus.cyc.eq(0),
self.dbus.stb.eq(0),
self.m_ld_data_o.eq(self.dbus.dat_r)
]
with m.Elif((self.x_ld_i | self.x_st_i) &
- self.x_valid_i & ~self.x_stall_i):
+ self.x_i_valid & ~self.x_stall_i):
m.d.sync += [
self.dbus.cyc.eq(1),
self.dbus.stb.eq(1),
dcache.s1_addr.eq(self.x_addr_i[self.adr_lsbs:]),
dcache.s1_flush.eq(self.x_flush),
dcache.s1_stall.eq(self.x_stall_i),
- dcache.s1_valid.eq(self.x_valid_i & x_dcache_select),
+ dcache.s1_valid.eq(self.x_i_valid & x_dcache_select),
dcache.s2_addr.eq(m_addr[self.adr_lsbs:]),
dcache.s2_re.eq(self.m_load),
dcache.s2_evict.eq(self.m_store),
- dcache.s2_valid.eq(self.m_valid_i & m_dcache_select)
+ dcache.s2_valid.eq(self.m_i_valid & m_dcache_select)
]
wrbuf_w_data = Record([("addr", self.addr_wid-self.adr_lsbs),
wrbuf_w_data.addr.eq(self.x_addr_i[self.adr_lsbs:]),
wrbuf_w_data.mask.eq(self.x_mask_i),
wrbuf_w_data.data.eq(self.x_st_data_i),
- wrbuf.w_en.eq(self.x_st_i & self.x_valid_i &
+ wrbuf.w_en.eq(self.x_st_i & self.x_i_valid &
x_dcache_select & ~self.x_stall_i),
wrbuf_r_data.eq(wrbuf.r_data),
]
bare_port = dba.port(priority=2)
bare_rdata = Signal.like(bare_port.dat_r)
with m.If(bare_port.cyc):
- with m.If(bare_port.ack | bare_port.err | ~self.m_valid_i):
+ with m.If(bare_port.ack | bare_port.err | ~self.m_i_valid):
m.d.sync += [
bare_port.cyc.eq(0),
bare_port.stb.eq(0),
bare_rdata.eq(bare_port.dat_r)
]
with m.Elif((self.x_ld_i | self.x_st_i) &
- ~x_dcache_select & self.x_valid_i & ~self.x_stall_i):
+ ~x_dcache_select & self.x_i_valid & ~self.x_stall_i):
m.d.sync += [
bare_port.cyc.eq(1),
bare_port.stb.eq(1),
addr_wid, mask_wid, data_wid = spec.addr_wid, spec.mask_wid, spec.reg_wid
adr_lsbs = log2_int(mask_wid) # LSBs of addr covered by mask
badwid = spec.addr_wid-adr_lsbs # MSBs (not covered by mask)
+ # test if microwatt compatibility is to be enabled
+ microwatt_compat = (hasattr(spec, "microwatt_compat") and
+ (spec.microwatt_compat == True))
+ # test if fabric compatibility is to be enabled
+ fabric_compat = (hasattr(spec, "fabric_compat") and
+ (spec.fabric_compat == True))
res = [
("adr", badwid , DIR_FANOUT),
("we", 1, DIR_FANOUT),
("err", 1, DIR_FANIN)
]
+ # microwatt needs a stall signal (operates in pipeline mode)
+ if microwatt_compat or fabric_compat:
+ res.append(("stall", 1, DIR_FANIN))
if not cti:
return res
return res + [
def read_port(self, name=None):
port = RecordObject([("ren", 1),
- ("data_o", self.width)],
+ ("o_data", self.width)],
name=name)
self._rdports.append(port)
return port
def write_port(self, name=None):
port = RecordObject([("wen", 1),
- ("data_i", self.width)],
+ ("i_data", self.width)],
name=name)
self._wrports.append(port)
return port
def elaborate(self, platform):
m = Module()
- self.reg = reg = Signal(self.width, name="reg", reset=self.reset)
+ self.reg = reg = Signal(self.width, name="reg", reset=self.reset,
+ attrs={'syn_ramstyle': "block_ram"})
if self.synced:
domain = m.d.sync
# read ports. has write-through detection (returns data written)
for rp in self._rdports:
- domain += rp.data_o.eq(0)
+ domain += rp.o_data.eq(0)
with m.If(rp.ren):
if self.writethru:
wr_detect = Signal(reset_less=False)
m.d.comb += wr_detect.eq(0)
for wp in self._wrports:
with m.If(wp.wen):
- domain += rp.data_o.eq(wp.data_i)
+ domain += rp.o_data.eq(wp.i_data)
m.d.comb += wr_detect.eq(1)
with m.If(~wr_detect):
- domain += rp.data_o.eq(reg)
+ domain += rp.o_data.eq(reg)
else:
- domain += rp.data_o.eq(reg)
+ domain += rp.o_data.eq(reg)
# write ports, delayed by 1 cycle
for wp in self._wrports:
with m.If(wp.wen):
- m.d.sync += reg.eq(wp.data_i)
+ m.d.sync += reg.eq(wp.i_data)
return m
res = list(self)
-def ortreereduce(tree, attr="data_o"):
+def ortreereduce(tree, attr="o_data"):
return treereduce(tree, operator.or_, lambda x: getattr(x, attr))
and read-en signals (per port).
"""
- def __init__(self, width, depth, synced=True, fwd_bus_mode=True):
+ def __init__(self, width, depth, synced=True, fwd_bus_mode=True,
+ resets=None):
+ if resets is None:
+ resets = [0] * depth
self.synced = synced
self.width = width
self.depth = depth
self.regs = Array(Register(width, synced=synced,
- writethru=fwd_bus_mode) \
- for _ in range(self.depth))
+ writethru=fwd_bus_mode,
+ resetval=rst) \
+ for rst in resets)
self._rdports = []
self._wrports = []
regs = self.read_reg_port(name)
regs = Array(regs)
port = RecordObject([("ren", self.depth),
- ("data_o", self.width)], name)
+ ("o_data", self.width)], name)
self._rdports.append((regs, port))
return port
regs = self.write_reg_port(name)
regs = Array(regs)
port = RecordObject([("wen", self.depth),
- ("data_i", self.width)])
+ ("i_data", self.width)])
self._wrports.append((regs, port))
return port
ren_delay = Signal.like(p.ren)
m.d.sync += ren_delay.eq(p.ren)
with m.If(ren_delay):
- m.d.comb += p.data_o.eq(ror)
+ m.d.comb += p.o_data.eq(ror)
else:
- m.d.comb += p.data_o.eq(ror)
+ m.d.comb += p.o_data.eq(ror)
for (regs, p) in self._wrports:
m.d.comb += self._get_en_sig(regs, 'wen').eq(p.wen)
for r in regs:
- m.d.comb += r.data_i.eq(p.data_i)
+ m.d.comb += r.i_data.eq(p.i_data)
return m
self.fwd_bus_mode = fwd_bus_mode
self.synced = synced
self.width, self.depth = width, depth
- self.memory = Memory(width=width, depth=depth)
+ self.memory = Memory(width=width, depth=depth,
+ attrs={'syn_ramstyle': "block_ram"})
self._rdports = {}
self._wrports = {}
bsz = log2_int(self.depth, False)
port = RecordObject([("addr", bsz),
("ren", 1),
- ("data_o", self.width)], name=name)
+ ("o_data", self.width)], name=name)
if self.synced:
domain = "sync"
else:
bsz = log2_int(self.depth, False)
port = RecordObject([("addr", bsz),
("wen", 1),
- ("data_i", self.width)], name=name)
+ ("i_data", self.width)], name=name)
self._wrports[name] = (port, self.memory.write_port())
return port
addrmatch = Signal(reset_less=False)
m.d.comb += addrmatch.eq(wp.addr == rp.addr)
with m.If(wp.wen & addrmatch):
- m.d.comb += rp.data_o.eq(wp.data_i)
+ m.d.comb += rp.o_data.eq(wp.i_data)
m.d.comb += wr_detect.eq(1)
with m.If(~wr_detect):
- m.d.comb += rp.data_o.eq(rport.data)
+ m.d.comb += rp.o_data.eq(rport.data)
else:
if self.synced:
ren_delay = Signal.like(rp.ren)
m.d.sync += ren_delay.eq(rp.ren)
with m.If(ren_delay):
- m.d.comb += rp.data_o.eq(rport.data)
+ m.d.comb += rp.o_data.eq(rport.data)
else:
- m.d.comb += rp.data_o.eq(rport.data)
+ m.d.comb += rp.o_data.eq(rport.data)
# write ports, delayed by one cycle (in the memory itself)
for name, (port, wp) in self._wrports.items():
setattr(m.submodules, "wp_"+name, wp)
comb += wp.addr.eq(port.addr)
comb += wp.en.eq(port.wen)
- comb += wp.data.eq(port.data_i)
+ comb += wp.data.eq(port.i_data)
return m
bsz = int(log(self.width) / log(2))
port = RecordObject([("addr", bsz),
("ren", 1),
- ("data_o", self.width)], name=name)
+ ("o_data", self.width)], name=name)
self._rdports.append(port)
return port
bsz = int(log(self.width) / log(2))
port = RecordObject([("addr", bsz),
("wen", 1),
- ("data_i", self.width)], name=name)
+ ("i_data", self.width)], name=name)
self._wrports.append(port)
return port
def elaborate(self, platform):
m = Module()
bsz = int(log(self.width) / log(2))
- regs = Array(Signal(self.width, name="reg") for _ in range(self.depth))
+ regs = Array(Signal(self.width, name="reg",
+ attrs={'syn_ramstyle': "block_ram"}) \
+ for _ in range(self.depth))
# read ports. has write-through detection (returns data written)
for rp in self._rdports:
addrmatch = Signal(reset_less=False)
m.d.comb += addrmatch.eq(wp.addr == rp.addr)
with m.If(wp.wen & addrmatch):
- m.d.comb += rp.data_o.eq(wp.data_i)
+ m.d.comb += rp.o_data.eq(wp.i_data)
m.d.comb += wr_detect.eq(1)
with m.If(~wr_detect):
- m.d.comb += rp.data_o.eq(regs[rp.addr])
+ m.d.comb += rp.o_data.eq(regs[rp.addr])
# write ports, delayed by one cycle
for wp in self._wrports:
with m.If(wp.wen):
- m.d.sync += regs[wp.addr].eq(wp.data_i)
+ m.d.sync += regs[wp.addr].eq(wp.i_data)
return m
def regfile_sim(dut, rp, wp):
yield wp.addr.eq(1)
- yield wp.data_i.eq(2)
+ yield wp.i_data.eq(2)
yield wp.wen.eq(1)
yield
yield wp.wen.eq(0)
yield rp.ren.eq(1)
yield rp.addr.eq(1)
yield Settle()
- data = yield rp.data_o
+ data = yield rp.o_data
print(data)
yield
- data = yield rp.data_o
+ data = yield rp.o_data
print(data)
yield
- data2 = yield rp.data_o
+ data2 = yield rp.o_data
print(data2)
assert data == 2
yield
yield rp.addr.eq(5)
yield rp.ren.eq(1)
yield wp.wen.eq(1)
- yield wp.data_i.eq(6)
+ yield wp.i_data.eq(6)
yield
- data = yield rp.data_o
+ data = yield rp.o_data
print(data)
assert data == 6
yield
yield wp.wen.eq(0)
yield rp.ren.eq(0)
yield
- data = yield rp.data_o
+ data = yield rp.o_data
print(data)
assert data == 0
yield
- data = yield rp.data_o
+ data = yield rp.o_data
print(data)
def regfile_array_sim(dut, rp1, rp2, wp, wp2):
print("regfile_array_sim")
- yield wp.data_i.eq(2)
+ yield wp.i_data.eq(2)
yield wp.wen.eq(1 << 1)
yield
yield wp.wen.eq(0)
yield rp1.ren.eq(1 << 1)
yield Settle()
- data = yield rp1.data_o
+ data = yield rp1.o_data
print(data)
assert data == 2
yield
yield rp1.ren.eq(1 << 5)
yield rp2.ren.eq(1 << 1)
yield wp.wen.eq(1 << 5)
- yield wp.data_i.eq(6)
+ yield wp.i_data.eq(6)
yield Settle()
- data = yield rp1.data_o
+ data = yield rp1.o_data
assert data == 6
print(data)
yield
yield rp1.ren.eq(0)
yield rp2.ren.eq(0)
yield Settle()
- data1 = yield rp1.data_o
+ data1 = yield rp1.o_data
print(data1)
assert data1 == 0
- data2 = yield rp2.data_o
+ data2 = yield rp2.o_data
print(data2)
assert data2 == 0
yield
- data = yield rp1.data_o
+ data = yield rp1.o_data
print(data)
assert data == 0
# XXX MAKE DAMN SURE TO KEEP THESE UP-TO-DATE if changing/adding regs
from openpower.consts import StateRegsEnum, XERRegsEnum, FastRegsEnum
+from nmigen import Module
+from nmigen.cli import rtlil
+from nmutil.latch import SRLatch
+
+
+def create_ports(rf, wr_spec, rd_spec):
+ """create_ports: creates register file ports based on requested specs
+ """
+ rf.r_ports, rf.w_ports = {}, {}
+ # create read ports based on read specs
+ for key, name in rd_spec.items():
+ if hasattr(rf, name): # some regfiles already have a port
+ rf.r_ports[key] = getattr(rf, name)
+ else:
+ rf.r_ports[key] = rf.read_port(name)
+ # create write ports based on write specs
+ for key, name in wr_spec.items():
+ if hasattr(rf, name): # some regfiles already have a port
+ rf.w_ports[key] = getattr(rf, name)
+ else:
+ rf.w_ports[key] = rf.write_port(name)
+
# "State" Regfile
class StateRegs(RegFileArray, StateRegsEnum):
(d_rd2)
"""
- def __init__(self, svp64_en=False, regreduce_en=False):
- super().__init__(64, StateRegsEnum.N_REGS)
- self.w_ports = {'nia': self.write_port("nia"),
- 'msr': self.write_port("msr"),
- 'svstate': self.write_port("svstate"),
- 'sv': self.write_port("sv"), # writing SVSTATE (issuer)
- 'd_wr1': self.write_port("d_wr1")} # writing PC (issuer)
- self.r_ports = {'cia': self.read_port("cia"), # reading PC (issuer)
- 'msr': self.read_port("msr"), # reading MSR (issuer)
- 'sv': self.read_port("sv"), # reading SV (issuer)
+ def __init__(self, svp64_en=False, regreduce_en=False, resets=None):
+ super().__init__(64, StateRegsEnum.N_REGS, resets=resets)
+ wr_spec, rd_spec = self.get_port_specs()
+ create_ports(self, wr_spec, rd_spec)
+
+ def get_port_specs(self):
+ w_port_spec = { # these 3 allow writing state by Function Units
+ # strictly speaking this should not be allowed,
+ # the information should be passed back to Issuer
+ # to work out what to do
+ 'nia': "nia",
+ 'msr': "msr",
+ 'svstate': "svstate",
+ 'issue': "issue", # writing DEC/TB
+ 'state1': "state1", # SPR pipeline
+ # these 3 allow writing state by Issuer
+ 'sv': "sv", # writing SVSTATE
+ 'd_wr1': "d_wr1", # writing PC
+ 'd_wr2': "d_wr2"} # writing MSR
+ r_port_spec = { # these are for reading state by Issuer but
+ # the FUs do not read them: they are passed in
+ # because of multi-issue / pipelining / etc.
+ # the state could be totally different and is
+ # only known *at* issue time, *by* the issuer
+ 'cia': "cia", # reading PC (issuer)
+ 'msr': "msr", # reading MSR (issuer)
+ 'sv': "sv", # reading SV (issuer)
+ # SPR and DEC/TB FSM
+ 'issue': "issue", # reading DEC/TB
+ 'state1': "state1", # SPR pipeline
}
+ return w_port_spec, r_port_spec
# Integer Regfile
* Array-based unary-indexed (not binary-indexed)
* write-through capability (read on same cycle as write)
"""
- def __init__(self, svp64_en=False, regreduce_en=False):
- super().__init__(64, 32, fwd_bus_mode=not regreduce_en)
- self.w_ports = {'o': self.write_port("dest1"),
+ def __init__(self, svp64_en=False, regreduce_en=False, reg_wid=64):
+ super().__init__(reg_wid, 32, fwd_bus_mode=False)
+ self.svp64_en = svp64_en
+ self.regreduce_en = regreduce_en
+ wr_spec, rd_spec = self.get_port_specs()
+ create_ports(self, wr_spec, rd_spec)
+
+ def get_port_specs(self):
+ w_port_spec = {'o': "dest1",
}
- self.r_ports = {
- 'dmi': self.read_port("dmi")} # needed for Debug (DMI)
- if svp64_en:
- self.r_ports['pred'] = self.read_port("pred") # for predicate mask
- if not regreduce_en:
- self.w_ports['o1'] = self.write_port("dest2") # (LD/ST update)
- self.r_ports['ra'] = self.read_port("src1")
- self.r_ports['rb'] = self.read_port("src2")
- self.r_ports['rc'] = self.read_port("src3")
+ r_port_spec = { 'dmi': "dmi" # needed for Debug (DMI)
+ }
+ if self.svp64_en:
+ r_port_spec['pred'] = "pred" # for predicate mask
+ if not self.regreduce_en:
+ w_port_spec['o1'] = "dest2" # (LD/ST update)
+ r_port_spec['ra'] = "src1"
+ r_port_spec['rb'] = "src2"
+ r_port_spec['rc'] = "src3"
else:
- self.r_ports['rabc'] = self.read_port("src1")
+ r_port_spec['rabc'] = "src1"
+ return w_port_spec, r_port_spec
# Fast SPRs Regfile
class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
"""FastRegs
- FAST regfile - CTR, LR, TAR, SRR1, SRR2, XER, TB, DEC, SVSRR0
+ FAST regfile - CTR, LR, TAR, SRR1, SRR2, XER, SVSRR0
* QTY 6of 64-bit registers
* 3R2W
Note: r/w issue are used by issuer to increment/decrement TB/DEC.
"""
def __init__(self, svp64_en=False, regreduce_en=False):
- super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=not regreduce_en)
- self.w_ports = {'fast1': self.write_port("dest1"),
- 'issue': self.write_port("issue"), # writing DEC/TB
+ super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=False)
+ self.svp64_en = svp64_en
+ self.regreduce_en = regreduce_en
+ wr_spec, rd_spec = self.get_port_specs()
+ create_ports(self, wr_spec, rd_spec)
+
+ def get_port_specs(self):
+ w_port_spec = {'fast1': "dest1",
}
- self.r_ports = {'fast1': self.read_port("src1"),
- 'issue': self.read_port("issue"), # reading DEC/TB
+ r_port_spec = {'fast1': "src1",
+ 'dmi': "dmi" # needed for Debug (DMI)
}
- if not regreduce_en:
- self.r_ports['fast2'] = self.read_port("src2")
+ if not self.regreduce_en:
+ r_port_spec['fast2'] = "src2"
+ r_port_spec['fast3'] = "src3"
+ w_port_spec['fast2'] = "dest2"
+ w_port_spec['fast3'] = "dest3"
+
+ return w_port_spec, r_port_spec
# CR Regfile
"""
def __init__(self, svp64_en=False, regreduce_en=False):
super().__init__(32, 8, rd2=True)
- self.w_ports = {'full_cr': self.full_wr, # 32-bit (masked, 8-en lines)
- 'cr_a': self.write_port("dest1"), # 4-bit, unary-indexed
- 'cr_b': self.write_port("dest2")} # 4-bit, unary-indexed
- self.r_ports = {'full_cr': self.full_rd, # 32-bit (masked, 8-en lines)
- 'full_cr_dbg': self.full_rd2, # for DMI
- 'cr_a': self.read_port("src1"),
- 'cr_b': self.read_port("src2"),
- 'cr_c': self.read_port("src3")}
- if svp64_en:
- self.r_ports['cr_pred'] = self.read_port("cr_pred") # for predicate
+ self.svp64_en = svp64_en
+ self.regreduce_en = regreduce_en
+ wr_spec, rd_spec = self.get_port_specs()
+ create_ports(self, wr_spec, rd_spec)
+
+ def get_port_specs(self):
+ w_port_spec = {'full_cr': "full_wr", # 32-bit (masked, 8-en lines)
+ 'cr_a': "dest1", # 4-bit, unary-indexed
+ 'cr_b': "dest2"} # 4-bit, unary-indexed
+ r_port_spec = {'full_cr': "full_rd", # 32-bit (masked, 8-en lines)
+ 'full_cr_dbg': "full_rd2", # for DMI
+ 'cr_a': "src1",
+ 'cr_b': "src2",
+ 'cr_c': "src3"}
+ if self.svp64_en:
+ r_port_spec['cr_pred'] = "cr_pred" # for predicate
+
+ return w_port_spec, r_port_spec
# XER Regfile
OV=2 # OV and OV32
def __init__(self, svp64_en=False, regreduce_en=False):
super().__init__(6, XERRegsEnum.N_REGS)
- self.w_ports = {'full_xer': self.full_wr, # 6-bit (masked, 3-en lines)
- 'xer_so': self.write_port("dest1"),
- 'xer_ca': self.write_port("dest2"),
- 'xer_ov': self.write_port("dest3")}
- self.r_ports = {'full_xer': self.full_rd, # 6-bit (masked, 3-en lines)
- 'xer_so': self.read_port("src1"),
- 'xer_ca': self.read_port("src2"),
- 'xer_ov': self.read_port("src3")}
+ self.svp64_en = svp64_en
+ self.regreduce_en = regreduce_en
+ wr_spec, rd_spec = self.get_port_specs()
+ create_ports(self, wr_spec, rd_spec)
+
+ def get_port_specs(self):
+ w_port_spec = {'full_xer': "full_wr", # 6-bit (masked, 3-en lines)
+ 'xer_so': "dest1",
+ 'xer_ca': "dest2",
+ 'xer_ov': "dest3"}
+ r_port_spec = {'full_xer': "full_rd", # 6-bit (masked, 3-en lines)
+ 'xer_so': "src1",
+ 'xer_ca': "src2",
+ 'xer_ov': "src3"}
+ return w_port_spec, r_port_spec
# SPR Regfile
else:
n_sprs = len(SPRfull)
super().__init__(width=64, depth=n_sprs,
- fwd_bus_mode=not regreduce_en)
- self.w_ports = {'spr1': self.write_port("spr1")}
- self.r_ports = {'spr1': self.read_port("spr1")}
+ fwd_bus_mode=False)
+ self.svp64_en = svp64_en
+ self.regreduce_en = regreduce_en
+ wr_spec, rd_spec = self.get_port_specs()
+ create_ports(self, wr_spec, rd_spec)
+
+ def get_port_specs(self):
+ w_port_spec = {'spr1': "spr1"}
+ r_port_spec = {'spr1': "spr1"}
+ return w_port_spec, r_port_spec
# class containing all regfiles: int, cr, xer, fast, spr
class RegFiles:
- def __init__(self, pspec):
+ # Factory style classes
+ regkls = [('int', IntRegs),
+ ('cr', CRRegs),
+ ('xer', XERRegs),
+ ('fast', FastRegs),
+ ('state', StateRegs),
+ ('spr', SPRRegs),]
+ def __init__(self, pspec, make_hazard_vecs=False,
+ state_resets=None): # state file reset values
# test is SVP64 is to be enabled
svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
regreduce_en = hasattr(pspec, "regreduce") and \
(pspec.regreduce == True)
- self.rf = {}
+ # get Integer File register width
+ reg_wid = 64
+ if isinstance(pspec.XLEN, int):
+ reg_wid = pspec.XLEN
+
+ self.rf = {} # register file dict
# create regfiles here, Factory style
- for (name, kls) in [('int', IntRegs),
- ('cr', CRRegs),
- ('xer', XERRegs),
- ('fast', FastRegs),
- ('state', StateRegs),
- ('spr', SPRRegs),]:
- rf = self.rf[name] = kls(svp64_en, regreduce_en)
+ for (name, kls) in RegFiles.regkls:
+ kwargs = {'svp64_en': svp64_en, 'regreduce_en': regreduce_en}
+ if name == 'state':
+ kwargs['resets'] = state_resets
+ if name == 'int':
+ kwargs['reg_wid'] = reg_wid
+ rf = self.rf[name] = kls(**kwargs)
# also add these as instances, self.state, self.fast, self.cr etc.
setattr(self, name, rf)
+ self.rv, self.wv = {}, {}
+ if make_hazard_vecs:
+ # create a read-hazard and write-hazard vectors for this regfile
+ self.wv = self.make_vecs("wr") # global write vectors
+ self.rv = self.make_vecs("rd") # global read vectors
+
+ def make_vecs(self, name):
+ vec = {}
+ # create regfiles here, Factory style
+ for (name, kls) in RegFiles.regkls:
+ rf = self.rf[name]
+ vec[name] = self.make_hazard_vec(rf, name)
+ return vec
+
+ def make_hazard_vec(self, rf, name):
+ if isinstance(rf, VirtualRegPort):
+ vec = SRLatch(sync=False, llen=rf.nregs, name=name)
+ else:
+ vec = SRLatch(sync=False, llen=rf.depth, name=name)
+ return vec
+
def elaborate_into(self, m, platform):
for (name, rf) in self.rf.items():
setattr(m.submodules, name, rf)
+ for (name, rv) in self.rv.items():
+ setattr(m.submodules, "rv_"+name, rv)
+ for (name, wv) in self.wv.items():
+ setattr(m.submodules, "wv_"+name, wv)
return m
+if __name__ == '__main__':
+ m = Module()
+ from soc.config.test.test_loadstore import TestMemPspec
+ pspec = TestMemPspec(regreduce_en=True,
+ XLEN=32) # integer reg width = 32
+ rf = RegFiles(pspec, make_hazard_vecs=True)
+ rf.elaborate_into(m, None)
+ vl = rtlil.convert(m)
+ with open("test_regfiles.il", "w") as f:
+ f.write(vl)
+
--- /dev/null
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Wrapper around a single port (1R or 1W) SRAM, to make a multi-port regfile.
+
+This SRAM primitive has one cycle delay for reads, and, after a write,
+it reads the value just written. The goal is to use it to make at least an
+1W2R regfile.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=781 and
+https://bugs.libre-soc.org/show_bug.cgi?id=502
+"""
+
+import unittest
+
+from nmigen import Elaboratable, Module, Memory, Signal, Repl, Mux
+from nmigen.back import rtlil
+from nmigen.sim import Simulator
+from nmigen.asserts import Assert, Assume, Past, AnyConst
+
+from nmutil.formaltest import FHDLTestCase
+from nmutil.gtkw import write_gtkw
+
+
+class SinglePortSRAM(Elaboratable):
+ """
+ Model of a single port SRAM, which can be simulated, verified and/or
+ synthesized to an FPGA.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+
+ .. note:: The debug read port is meant only to assist in formal proofs!
+ """
+ def __init__(self, addr_width, data_width, we_width):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ # interface signals
+ self.d = Signal(data_width); """ write data"""
+ self.q = Signal(data_width); """read data"""
+ self.a = Signal(addr_width); """ read/write address"""
+ self.we = Signal(we_width); """write enable"""
+ # debug signals, only used in formal proofs
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # backing memory
+ depth = 1 << self.addr_width
+ gran = self.data_width // self.we_width
+ mem = Memory(width=self.data_width, depth=depth)
+ # create read and write ports
+ # By connecting the same address to both ports, they behave, in fact,
+ # as a single, "half-duplex" port.
+ # The transparent attribute means that, on a write, we read the new
+ # value, on the next cycle
+ # Note that nmigen memories have a one cycle delay, for reads,
+ # by default
+ m.submodules.rdport = rdport = mem.read_port(transparent=True)
+ m.submodules.wrport = wrport = mem.write_port(granularity=gran)
+ # duplicate the address to both ports
+ m.d.comb += wrport.addr.eq(self.a)
+ m.d.comb += rdport.addr.eq(self.a)
+ # write enable
+ m.d.comb += wrport.en.eq(self.we)
+ # read and write data
+ m.d.comb += wrport.data.eq(self.d)
+ m.d.comb += self.q.eq(rdport.data)
+
+ # the following is needed for induction, where an unreachable state
+ # (memory and holding register differ) is turned into an illegal one
+ if platform == "formal":
+ # the debug port is an asynchronous read port, allowing direct
+ # access to a given memory location by the formal engine
+ m.submodules.dbgport = dbgport = mem.read_port(domain="comb")
+ # first, get the value stored in our memory location,
+ # using its debug port
+ stored = Signal(self.data_width)
+ m.d.comb += dbgport.addr.eq(self.dbg_addr)
+ m.d.comb += stored.eq(dbgport.data)
+ # now, ensure that the value stored in memory is always in sync
+ # with the holding register
+ with m.If(self.dbg_wrote):
+ m.d.sync += Assert(self.dbg_data ==
+ stored.word_select(self.dbg_lane, gran))
+
+ return m
+
+ def ports(self):
+ return [
+ self.d,
+ self.a,
+ self.we,
+ self.q
+ ]
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+
+class SinglePortSRAMTestCase(FHDLTestCase):
+ @staticmethod
+ def test_simple_rtlil():
+ """
+ Generate a simple SRAM. Try ``read_rtlil mem_simple.il; proc; show``
+ from a yosys prompt, to see the memory primitives, and
+ ``read_rtlil mem_simple.il; synth; show`` to see it implemented as
+ flip-flop RAM
+ """
+ dut = SinglePortSRAM(2, 4, 2)
+ create_ilang(dut, dut.ports(), "mem_simple")
+
+ @staticmethod
+ def test_blkram_rtlil():
+ """
+ Generates a bigger SRAM.
+ Try ``read_rtlil mem_blkram.il; synth_ecp5; show`` from a yosys
+ prompt, to see it implemented as block RAM
+ """
+ dut = SinglePortSRAM(10, 16, 2)
+ create_ilang(dut, dut.ports(), "mem_blkram")
+
+ def test_sram_model(self):
+ """
+ Simulate some read/write/modify operations on the SRAM model
+ """
+ dut = SinglePortSRAM(7, 32, 4)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ def process():
+ # 1) write 0x12_34_56_78 to address 0
+ yield dut.a.eq(0)
+ yield dut.d.eq(0x12_34_56_78)
+ yield dut.we.eq(0b1111)
+ yield
+ # 2) write 0x9A_BC_DE_F0 to address 1
+ yield dut.a.eq(1)
+ yield dut.d.eq(0x9A_BC_DE_F0)
+ yield dut.we.eq(0b1111)
+ yield
+ # ... and read value just written to address 0
+ self.assertEqual((yield dut.q), 0x12_34_56_78)
+ # 3) prepare to read from address 0
+ yield dut.d.eq(0)
+ yield dut.we.eq(0b0000)
+ yield dut.a.eq(0)
+ yield
+ # ... and read value just written to address 1
+ self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+ # 4) prepare to read from address 1
+ yield dut.a.eq(1)
+ yield
+ # ... and read value from address 0
+ self.assertEqual((yield dut.q), 0x12_34_56_78)
+ # 5) write 0x9A and 0xDE to bytes 1 and 3, leaving
+ # bytes 0 and 2 unchanged
+ yield dut.a.eq(0)
+ yield dut.d.eq(0x9A_FF_DE_FF)
+ yield dut.we.eq(0b1010)
+ yield
+ # ... and read value from address 1
+ self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+ # 6) nothing more to do
+ yield dut.d.eq(0)
+ yield dut.we.eq(0)
+ yield
+ # ... other than confirm that bytes 1 and 3 were modified
+ # correctly
+ self.assertEqual((yield dut.q), 0x9A_34_DE_78)
+
+ sim.add_sync_process(process)
+ traces = ['rdport.clk', 'a[6:0]', 'we[3:0]', 'd[31:0]', 'q[31:0]']
+ write_gtkw('test_sram_model.gtkw', 'test_sram_model.vcd',
+ traces, module='top')
+ sim_writer = sim.write_vcd('test_sram_model.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_model_sram_proof(self):
+ """
+ Formal proof of the single port SRAM model
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ m.submodules.dut = dut = SinglePortSRAM(7, 32, 4)
+ gran = len(dut.d) // len(dut.we) # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.a.shape())
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # holding data register
+ d_reg = Signal(gran)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # if our memory location and byte lane is being written
+ # ... capture the data in our holding register
+ with m.If((dut.a == a_const) & dut.we.bit_select(lane, 1)):
+ m.d.sync += d_reg.eq(dut.d.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ # if our memory location is being read
+ # ... and the holding register has valid data
+ # ... then its value must match the memory output, on the given lane
+ with m.If((Past(dut.a) == a_const) & wrote):
+ m.d.sync += Assert(d_reg == dut.q.word_select(lane, gran))
+
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=2)
+
+
+class PhasedDualPortRegfile(Elaboratable):
+ """
+ Builds, from a pair of 1RW blocks, a pseudo 1W/1R RAM, where the
+ read port works every cycle, but the write port is only available on
+ either even (1eW/1R) or odd (1oW/1R) cycles.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param write_phase: indicates on which phase the write port will
+ accept data
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False)
+
+ .. note:: The debug read port is meant only to assist in formal proofs!
+ """
+
+ def __init__(self, addr_width, data_width, we_width, write_phase,
+ transparent=False):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.write_phase = write_phase
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """write port address"""
+ self.wr_data_i = Signal(data_width); """write port data"""
+ self.wr_we_i = Signal(we_width); """write port enable"""
+ self.rd_addr_i = Signal(addr_width); """read port address"""
+ self.rd_data_o = Signal(data_width); """read port data"""
+ self.phase = Signal(); """even/odd cycle indicator"""
+ # debug signals, only used in formal proofs
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # granularity
+ # instantiate the two 1RW memory blocks
+ mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ m.submodules.mem1 = mem1
+ m.submodules.mem2 = mem2
+ # wire write port to first memory, and its output to the second
+ m.d.comb += mem1.d.eq(self.wr_data_i)
+ m.d.comb += mem2.d.eq(mem1.q)
+ # holding registers for the write port of the second memory
+ last_wr_addr = Signal(self.addr_width)
+ last_wr_we = Signal(self.we_width)
+ # do the read and write address coincide?
+ same_read_write = Signal()
+ with m.If(self.phase == self.write_phase):
+ # write phase, start a write on the first memory
+ m.d.comb += mem1.a.eq(self.wr_addr_i)
+ m.d.comb += mem1.we.eq(self.wr_we_i)
+ # save write address and write select for repeating the write
+ # on the second memory, later
+ m.d.sync += last_wr_we.eq(self.wr_we_i)
+ m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+ # start a read on the second memory
+ m.d.comb += mem2.a.eq(self.rd_addr_i)
+ # output previously read data from the first memory
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ if self.transparent:
+ # remember whether we are reading from the same location we are
+ # writing
+ m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+ with m.Else():
+ # read phase, write last written data on second memory
+ m.d.comb += mem2.a.eq(last_wr_addr)
+ m.d.comb += mem2.we.eq(last_wr_we)
+ # start a read on the first memory
+ m.d.comb += mem1.a.eq(self.rd_addr_i)
+ if self.transparent:
+ with m.If(same_read_write):
+ # when transparent, and read and write addresses coincide,
+ # output the data just written
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ with m.Else():
+ # otherwise, output previously read data
+ # from the second memory
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+ else:
+ # always output the read data from the second memory,
+ # if not transparent
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+
+ if platform == "formal":
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # pass the address and write lane under test to both memories
+ mem1.dbg_addr.eq(self.dbg_addr),
+ mem2.dbg_addr.eq(self.dbg_addr),
+ mem1.dbg_lane.eq(self.dbg_lane),
+ mem2.dbg_lane.eq(self.dbg_lane),
+ # the second memory copies its state from the first memory,
+ # after a cycle, so it has a one cycle delay
+ mem1.dbg_data.eq(self.dbg_data),
+ mem2.dbg_data.eq(Past(self.dbg_data)),
+ mem1.dbg_wrote.eq(self.dbg_wrote),
+ mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+ ]
+
+ return m
+
+ def ports(self):
+ return [
+ self.wr_addr_i,
+ self.wr_data_i,
+ self.wr_we_i,
+ self.rd_addr_i,
+ self.rd_data_o,
+ self.phase
+ ]
+
+
+class PhasedDualPortRegfileTestCase(FHDLTestCase):
+
+ def do_test_phased_dual_port_regfile(self, write_phase, transparent):
+ """
+ Simulate some read/write/modify operations on the phased write memory
+ """
+ dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, expected=None):
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+
+ # start a write, and set write phase
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+ yield dut.phase.eq(write_phase)
+
+ # disable writes, and start read phase
+ def skip_write():
+ yield dut.wr_addr_i.eq(0)
+ yield dut.wr_we_i.eq(0)
+ yield dut.wr_data_i.eq(0)
+ yield dut.phase.eq(~write_phase)
+
+ # writes a few values on the write port, and read them back
+ # ... reads can happen every cycle
+ # ... writes, only every two cycles.
+ # since reads have a one cycle delay, the expected value on
+ # each read refers to the last read performed, not the
+ # current one, which is in progress.
+ def process():
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42)
+ yield from skip_write()
+ yield
+ yield from read(0x42)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x12345678)
+ yield from skip_write()
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from write(0x43, 0b1001, 0xF0FFFF9A)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from skip_write()
+ yield
+ yield from read(0x43, 0x12345678)
+ yield from write(0x42, 0b0110, 0xFF5634FF)
+ yield
+ yield from read(0x42, 0xF0BCDE9A)
+ yield from skip_write()
+ yield
+ yield from read(0, 0xF0BCDE9A)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0, 0x12563478)
+ yield from skip_write()
+ yield
+ # try reading and writing to the same location, simultaneously
+ yield from read(0x42)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # ... and read again
+ yield from read(0x42)
+ yield from skip_write()
+ yield
+ if transparent:
+ # returns the value just written
+ yield from read(0, 0x12AA3466)
+ else:
+ # returns the old value
+ yield from read(0, 0x12563478)
+ yield from write(0, 0, 0)
+ yield
+ # after a cycle, always returns the new value
+ yield from read(0, 0x12AA3466)
+ yield from skip_write()
+
+ sim.add_sync_process(process)
+ debug_file = f'test_phased_dual_port_{write_phase}'
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]']
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_phased_dual_port_regfile(self):
+ """test both types (odd and even write ports) of phased write memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_phased_dual_port_regfile(0, False)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_phased_dual_port_regfile(1, False)
+ """test again, with a transparent read port"""
+ with self.subTest("writes happen on phase 0 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile(0, True)
+ with self.subTest("writes happen on phase 1 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile(1, True)
+
+ def do_test_phased_dual_port_regfile_proof(self, write_phase, transparent):
+ """
+ Formal proof of the pseudo 1W/1R regfile
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+ m.submodules.dut = dut
+ gran = dut.data_width // dut.we_width # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.addr_width)
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # drive alternating phases
+ m.d.comb += Assume(dut.phase != Past(dut.phase))
+ # holding data register
+ d_reg = Signal(gran)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # if our memory location and byte lane is being written,
+ # capture the data in our holding register
+ with m.If((dut.wr_addr_i == a_const)
+ & dut.wr_we_i.bit_select(lane, 1)
+ & (dut.phase == dut.write_phase)):
+ m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ # if our memory location is being read,
+ # and the holding register has valid data,
+ # then its value must match the memory output, on the given lane
+ with m.If(Past(dut.rd_addr_i) == a_const):
+ if transparent:
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ else:
+ # with a non-transparent read port, the read value depends
+ # on whether there is a simultaneous write, or not
+ with m.If((Past(dut.wr_addr_i) == a_const)
+ & Past(dut.phase) == dut.write_phase):
+ # simultaneous write -> check against last written value
+ with m.If(Past(wrote)):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(Past(d_reg) == rd_lane)
+ with m.Else():
+ # otherwise, check against current written value
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # address and mask under test
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ # state of our holding register
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=3)
+
+ def test_phased_dual_port_regfile_proof(self):
+ """test both types (odd and even write ports) of phased write memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_phased_dual_port_regfile_proof(0, False)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_phased_dual_port_regfile_proof(1, False)
+ # test again, with transparent read ports
+ with self.subTest("writes happen on phase 0 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile_proof(0, True)
+ with self.subTest("writes happen on phase 1 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile_proof(1, True)
+
+
+class DualPortRegfile(Elaboratable):
+ """
+ Builds, from a pair of phased 1W/1R blocks, a true 1W/1R RAM, where both
+ read and write ports work every cycle.
+ It employs a Last Value Table, that tracks to which memory each address was
+ last written.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False)
+ """
+
+ def __init__(self, addr_width, data_width, we_width, transparent=True):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """write port address"""
+ self.wr_data_i = Signal(data_width); """write port data"""
+ self.wr_we_i = Signal(we_width); """write port enable"""
+ self.rd_addr_i = Signal(addr_width); """read port address"""
+ self.rd_data_o = Signal(data_width); """read port data"""
+ # debug signals, only used in formal proofs
+ # address and write lane under test
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ # upstream state, to keep in sync with ours
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+ self.dbg_wrote_phase = Signal(); """debug: the phase data was written"""
+ self.dbg_phase = Signal(); """debug: current phase"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # depth and granularity
+ depth = 1 << self.addr_width
+ gran = self.data_width // self.we_width
+ # instantiate the two phased 1R/1W memory blocks
+ mem0 = PhasedDualPortRegfile(
+ self.addr_width, self.data_width, self.we_width, 0,
+ self.transparent)
+ mem1 = PhasedDualPortRegfile(
+ self.addr_width, self.data_width, self.we_width, 1,
+ self.transparent)
+ m.submodules.mem0 = mem0
+ m.submodules.mem1 = mem1
+ # instantiate the backing memory (FFRAM or LUTRAM)
+ # for the Last Value Table
+ # it should have the same number and port types of the desired
+ # memory, but just one bit per write lane
+ lvt_mem = Memory(width=self.we_width, depth=depth)
+ lvt_wr = lvt_mem.write_port(granularity=1)
+ lvt_rd = lvt_mem.read_port(transparent=self.transparent)
+ if not self.transparent:
+ # for some reason, formal proofs don't recognize the default
+ # reset value for this signal
+ m.d.comb += lvt_rd.en.eq(1)
+ m.submodules.lvt_wr = lvt_wr
+ m.submodules.lvt_rd = lvt_rd
+ # generate and wire the phases for the phased memories
+ phase = Signal()
+ m.d.sync += phase.eq(~phase)
+ m.d.comb += [
+ mem0.phase.eq(phase),
+ mem1.phase.eq(phase),
+ ]
+ m.d.comb += [
+ # wire the write ports, directly
+ mem0.wr_addr_i.eq(self.wr_addr_i),
+ mem1.wr_addr_i.eq(self.wr_addr_i),
+ mem0.wr_we_i.eq(self.wr_we_i),
+ mem1.wr_we_i.eq(self.wr_we_i),
+ mem0.wr_data_i.eq(self.wr_data_i),
+ mem1.wr_data_i.eq(self.wr_data_i),
+ # also wire the read addresses
+ mem0.rd_addr_i.eq(self.rd_addr_i),
+ mem1.rd_addr_i.eq(self.rd_addr_i),
+ # wire read and write ports to the LVT
+ lvt_wr.addr.eq(self.wr_addr_i),
+ lvt_wr.en.eq(self.wr_we_i),
+ lvt_rd.addr.eq(self.rd_addr_i),
+ # the data for the LVT is the phase on which the value was
+ # written
+ lvt_wr.data.eq(Repl(phase, self.we_width)),
+ ]
+ for i in range(self.we_width):
+ # select the right memory to assign to the output read port,
+ # in this byte lane, according to the LVT contents
+ m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+ Mux(
+ lvt_rd.data[i],
+ mem1.rd_data_o.word_select(i, gran),
+ mem0.rd_data_o.word_select(i, gran)))
+
+ if platform == "formal":
+ # pass upstream state to the memories, so they can ensure that
+ # their state are in sync with upstream, for induction
+ m.d.comb += [
+ # address and write lane under test
+ mem0.dbg_addr.eq(self.dbg_addr),
+ mem1.dbg_addr.eq(self.dbg_addr),
+ mem0.dbg_lane.eq(self.dbg_lane),
+ mem1.dbg_lane.eq(self.dbg_lane),
+ # upstream state
+ mem0.dbg_data.eq(self.dbg_data),
+ mem1.dbg_data.eq(self.dbg_data),
+ # the memory, on which the write ends up, depends on which
+ # phase it was written
+ mem0.dbg_wrote.eq(self.dbg_wrote & ~self.dbg_wrote_phase),
+ mem1.dbg_wrote.eq(self.dbg_wrote & self.dbg_wrote_phase),
+ ]
+ # sync phase to upstream
+ m.d.comb += Assert(self.dbg_phase == phase)
+ # this debug port for the LVT is an asynchronous read port,
+ # allowing direct access to a given memory location
+ # by the formal engine
+ m.submodules.dbgport = dbgport = lvt_mem.read_port(domain='comb')
+ # first, get the value stored in our memory location,
+ stored = Signal(self.we_width)
+ m.d.comb += dbgport.addr.eq(self.dbg_addr)
+ m.d.comb += stored.eq(dbgport.data)
+ # now, ensure that the value stored in memory is always in sync
+ # with the expected value (which memory the value was written to)
+ with m.If(self.dbg_wrote):
+ m.d.comb += Assert(stored.bit_select(self.dbg_lane, 1)
+ == self.dbg_wrote_phase)
+ return m
+
+ def ports(self):
+ return [
+ self.wr_addr_i,
+ self.wr_data_i,
+ self.wr_we_i,
+ self.rd_addr_i,
+ self.rd_data_o
+ ]
+
+
+class DualPortRegfileTestCase(FHDLTestCase):
+
+ def do_test_dual_port_regfile(self, transparent):
+ """
+ Simulate some read/write/modify operations on the dual port register
+ file
+ """
+ dut = DualPortRegfile(7, 32, 4, transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ expected = None
+ last_expected = None
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, next_expected=None):
+ nonlocal expected, last_expected
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+ # account for the read latency
+ expected = last_expected
+ last_expected = next_expected
+
+ # start a write
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+
+ def process():
+ # write a pair of values, one for each memory
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x87654321)
+ yield
+ yield from read(0x42, 0x87654321)
+ yield from write(0x43, 0b1111, 0x0FEDCBA9)
+ yield
+ # skip a beat
+ yield from read(0x43, 0x0FEDCBA9)
+ yield from write(0, 0, 0)
+ yield
+ # write again, but now they switch memories
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from write(0, 0, 0)
+ yield
+ # test partial writes
+ yield from read(0)
+ yield from write(0x42, 0b1001, 0x78FFFF12)
+ yield
+ yield from read(0)
+ yield from write(0x43, 0b0110, 0xFFDEABFF)
+ yield
+ yield from read(0x42, 0x78345612)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0x43, 0x9ADEABF0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ if transparent:
+ # returns the value just written
+ yield from read(0x42, 0x78AA5666)
+ else:
+ # returns the old value
+ yield from read(0x42, 0x78345612)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # after a cycle, always returns the new value
+ yield from read(0x42, 0x78AA5666)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+
+ sim.add_sync_process(process)
+ debug_file = 'test_dual_port_regfile'
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ {'comment': 'write port'},
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ {'comment': 'read port'},
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+ {'comment': 'LVT write port'},
+ 'phase', 'lvt_mem_w_addr[6:0]', 'lvt_mem_w_en[3:0]',
+ 'lvt_mem_w_data[3:0]',
+ {'comment': 'LVT read port'},
+ 'lvt_mem_r_addr[6:0]', 'lvt_mem_r_data[3:0]',
+ {'comment': 'backing memory'},
+ 'mem0.rd_data_o[31:0]',
+ 'mem1.rd_data_o[31:0]',
+ ]
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_dual_port_regfile(self):
+ with self.subTest("non-transparent reads"):
+ self.do_test_dual_port_regfile(False)
+ with self.subTest("transparent reads"):
+ self.do_test_dual_port_regfile(True)
+
+ def do_test_dual_port_regfile_proof(self, transparent=True):
+ """
+ Formal proof of the 1W/1R regfile
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ dut = DualPortRegfile(7, 32, 4, transparent)
+ m.submodules.dut = dut
+ gran = dut.data_width // dut.we_width # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.addr_width)
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # holding data register
+ d_reg = Signal(gran)
+ # keep track of the phase, so we can remember which memory
+ # we wrote to
+ phase = Signal()
+ m.d.sync += phase.eq(~phase)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # ... and on which phase it was written
+ wrote_phase = Signal()
+ # if our memory location and byte lane is being written,
+ # capture the data in our holding register
+ with m.If((dut.wr_addr_i == a_const)
+ & dut.wr_we_i.bit_select(lane, 1)):
+ m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ m.d.sync += wrote_phase.eq(phase)
+ # if our memory location is being read,
+ # and the holding register has valid data,
+ # then its value must match the memory output, on the given lane
+ with m.If(Past(dut.rd_addr_i) == a_const):
+ if transparent:
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ else:
+ # with a non-transparent read port, the read value depends
+ # on whether there is a simultaneous write, or not
+ with m.If(Past(dut.wr_addr_i) == a_const):
+ # simultaneous write -> check against last written value
+ with m.If(wrote & Past(wrote)):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(Past(d_reg) == rd_lane)
+ with m.Else():
+ # otherwise, check against current written value
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+
+ m.d.comb += [
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ dut.dbg_wrote_phase.eq(wrote_phase),
+ dut.dbg_phase.eq(phase),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=3)
+
+ def test_dual_port_regfile_proof(self):
+ """
+ Formal check of 1W/1R regfile (transparent and not)
+ """
+ with self.subTest("transparent reads"):
+ self.do_test_dual_port_regfile_proof(True)
+ with self.subTest("non-transparent reads"):
+ self.do_test_dual_port_regfile_proof(False)
+
+
+class PhasedReadPhasedWriteFullReadSRAM(Elaboratable):
+ """
+ Builds, from three 1RW blocks, a pseudo 1W/2R SRAM, with:
+
+ * one full read port, which works every cycle,
+ * one write port, which is only available on either even or odd cycles,
+ * an extra transparent read port, available only on the same cycles as the
+ write port
+
+ This type of SRAM is useful for a XOR-based 6x1RW implementation of
+ a 1R/1W register file.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param write_phase: indicates on which phase the write port will
+ accept data
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False) on the full
+ read port
+
+ .. note:: The debug read port is meant only to assist in formal proofs!
+ """
+
+ def __init__(self, addr_width, data_width, we_width, write_phase,
+ transparent=True):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.write_phase = write_phase
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """phased write port address"""
+ self.wr_data_i = Signal(data_width); """phased write port data"""
+ self.wr_we_i = Signal(we_width); """phased write port enable"""
+ self.rd_addr_i = Signal(addr_width); """full read port address"""
+ self.rd_data_o = Signal(data_width); """full read port data"""
+ self.rdp_addr_i = Signal(addr_width); """phased read port address"""
+ self.rdp_data_o = Signal(data_width); """phased read port data"""
+ self.phase = Signal(); """even/odd cycle indicator"""
+ # debug signals, only used in formal proofs
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # instantiate the 1RW memory blocks
+ mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ mem3 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ m.submodules.mem1 = mem1
+ m.submodules.mem2 = mem2
+ m.submodules.mem3 = mem3
+ # wire input write data to first memory, and its output to the others
+ m.d.comb += [
+ mem1.d.eq(self.wr_data_i),
+ mem2.d.eq(mem1.q),
+ mem3.d.eq(mem1.q)
+ ]
+ # holding registers for the write port of the other memories
+ last_wr_addr = Signal(self.addr_width)
+ last_wr_we = Signal(self.we_width)
+ # do read and write addresses coincide?
+ same_read_write = Signal()
+ same_phased_read_write = Signal()
+ with m.If(self.phase == self.write_phase):
+ # write phase, start a write on the first memory
+ m.d.comb += mem1.a.eq(self.wr_addr_i)
+ m.d.comb += mem1.we.eq(self.wr_we_i)
+ # save write address and write select for repeating the write
+ # on the other memories, one cycle later
+ m.d.sync += last_wr_we.eq(self.wr_we_i)
+ m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+ # start a read on the other memories
+ m.d.comb += mem2.a.eq(self.rd_addr_i)
+ m.d.comb += mem3.a.eq(self.rdp_addr_i)
+ # output previously read data from the first memory
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ # remember whether we are reading from the same location as we
+ # are writing
+ m.d.sync += same_phased_read_write.eq(
+ self.rdp_addr_i == self.wr_addr_i)
+ if self.transparent:
+ m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+ with m.Else():
+ # read phase, write last written data on the other memories
+ m.d.comb += [
+ mem2.a.eq(last_wr_addr),
+ mem2.we.eq(last_wr_we),
+ mem3.a.eq(last_wr_addr),
+ mem3.we.eq(last_wr_we),
+ ]
+ # start a read on the first memory
+ m.d.comb += mem1.a.eq(self.rd_addr_i)
+ # output the read data from the second memory
+ if self.transparent:
+ with m.If(same_read_write):
+ # when transparent, and read and write addresses coincide,
+ # output the data just written
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ with m.Else():
+ # otherwise, output previously read data
+ # from the second memory
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+ else:
+ # always output the read data from the second memory,
+ # if not transparent
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+ with m.If(same_phased_read_write):
+ # if read and write addresses coincide,
+ # output the data just written
+ m.d.comb += self.rdp_data_o.eq(mem1.q)
+ with m.Else():
+ # otherwise, output previously read data
+ # from the third memory
+ m.d.comb += self.rdp_data_o.eq(mem3.q)
+
+ if platform == "formal":
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # pass the address and write lane under test to both memories
+ mem1.dbg_addr.eq(self.dbg_addr),
+ mem2.dbg_addr.eq(self.dbg_addr),
+ mem3.dbg_addr.eq(self.dbg_addr),
+ mem1.dbg_lane.eq(self.dbg_lane),
+ mem2.dbg_lane.eq(self.dbg_lane),
+ mem3.dbg_lane.eq(self.dbg_lane),
+ # the other memories copy their state from the first memory,
+ # after a cycle, so they have a one cycle delay
+ mem1.dbg_data.eq(self.dbg_data),
+ mem2.dbg_data.eq(Past(self.dbg_data)),
+ mem3.dbg_data.eq(Past(self.dbg_data)),
+ mem1.dbg_wrote.eq(self.dbg_wrote),
+ mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+ mem3.dbg_wrote.eq(Past(self.dbg_wrote)),
+ ]
+
+ return m
+
+
+class PhasedReadPhasedWriteFullReadSRAMTestCase(FHDLTestCase):
+
+ def do_test_case(self, write_phase, transparent):
+ """
+ Simulate some read/write/modify operations
+ """
+ dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+ transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ expected = None
+ last_expected = None
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, next_expected=None):
+ nonlocal expected, last_expected
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+ # account for the read latency
+ expected = last_expected
+ last_expected = next_expected
+
+ expected2 = None
+
+ # same as above, but for the phased read port
+ def phased_read(rdp_addr_i, next_expected2=None):
+ nonlocal expected2
+ if expected2 is not None:
+ self.assertEqual((yield dut.rdp_data_o), expected2)
+ yield dut.rdp_addr_i.eq(rdp_addr_i)
+ # account for the read latency
+ expected2 = next_expected2
+
+ # start a write
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+ yield dut.phase.eq(write_phase)
+
+ # disable writes, and start read phase
+ def skip_write():
+ yield dut.wr_addr_i.eq(0)
+ yield dut.wr_we_i.eq(0)
+ yield dut.wr_data_i.eq(0)
+ yield dut.phase.eq(~write_phase)
+ # also skip reading from the phased read port
+ yield dut.rdp_addr_i.eq(0)
+
+ # writes a few values on the write port, and read them back
+ def process():
+ yield from read(0)
+ yield from phased_read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from skip_write()
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from phased_read(0x42, 0x12345678)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from skip_write()
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from phased_read(0x42, 0x12345678)
+ yield from write(0x43, 0b1001, 0xF0FFFF9A)
+ yield
+ yield from read(0x43, 0xF0BCDE9A)
+ yield from skip_write()
+ yield
+ yield from read(0x43, 0xF0BCDE9A)
+ yield from phased_read(0x43, 0xF0BCDE9A)
+ yield from write(0x42, 0b0110, 0xFF5634FF)
+ yield
+ yield from read(0x42, 0x12563478)
+ yield from skip_write()
+ yield
+ yield from read(0)
+ yield from phased_read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from skip_write()
+ yield
+ # try reading and writing at the same time
+ if transparent:
+ # transparent port, return the value just written
+ yield from read(0x42, 0x12AA3466)
+ else:
+ # ... otherwise, return the old value
+ yield from read(0x42, 0x12563478)
+ # transparent port, always return the value just written
+ yield from phased_read(0x42, 0x12AA3466)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # after a cycle, always returns the new value
+ yield from read(0x42, 0x12AA3466)
+ yield from skip_write()
+ yield
+ yield from read(0)
+ yield from phased_read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from skip_write()
+
+ sim.add_sync_process(process)
+ debug_file = 'test_phased_read_write_sram_' + str(write_phase)
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ {'comment': 'phased write port'},
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ {'comment': 'full read port'},
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+ {'comment': 'phased read port'},
+ 'rdp_addr_i[6:0]', 'rdp_data_o[31:0]']
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_case(self):
+ """test both types (odd and even write ports) of phased memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_case(0, True)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_case(1, True)
+ with self.subTest("writes happen on phase 0 (non-transparent reads)"):
+ self.do_test_case(0, False)
+ with self.subTest("writes happen on phase 1 (non-transparent reads)"):
+ self.do_test_case(1, False)
+
+ def do_test_formal(self, write_phase, transparent):
+ """
+ Formal proof of the pseudo 1W/2R regfile
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+ transparent)
+ m.submodules.dut = dut
+ gran = dut.data_width // dut.we_width # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.addr_width)
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # drive alternating phases
+ m.d.comb += Assume(dut.phase != Past(dut.phase))
+ # holding data register
+ d_reg = Signal(gran)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # if our memory location and byte lane is being written,
+ # capture the data in our holding register
+ with m.If((dut.wr_addr_i == a_const)
+ & dut.wr_we_i.bit_select(lane, 1)
+ & (dut.phase == dut.write_phase)):
+ m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ # if our memory location is being read,
+ # and the holding register has valid data,
+ # then its value must match the memory output, on the given lane
+ with m.If(Past(dut.rd_addr_i) == a_const):
+ if transparent:
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ else:
+ # with a non-transparent read port, the read value depends
+ # on whether there is a simultaneous write, or not
+ with m.If((Past(dut.wr_addr_i) == a_const)
+ & Past(dut.phase) == dut.write_phase):
+ # simultaneous write -> check against last written value
+ with m.If(Past(wrote)):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(Past(d_reg) == rd_lane)
+ with m.Else():
+ # otherwise, check against current written value
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ # same for the phased read port, except it's always transparent
+ # and the port works only on the write phase
+ with m.If((Past(dut.rdp_addr_i) == a_const) & wrote
+ & (Past(dut.phase) == dut.write_phase)):
+ rdp_lane = dut.rdp_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rdp_lane)
+
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # address and mask under test
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ # state of our holding register
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=3)
+
+ def test_formal(self):
+ """test both types (odd and even write ports) of phased write memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_formal(0, False)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_formal(1, False)
+ # test again, with transparent read ports
+ with self.subTest("writes happen on phase 0 (transparent reads)"):
+ self.do_test_formal(0, True)
+ with self.subTest("writes happen on phase 1 (transparent reads)"):
+ self.do_test_formal(1, True)
+
+
+class DualPortXorRegfile(Elaboratable):
+ """
+ Builds, from a pair of phased 1W/2R blocks, a true 1W/1R RAM, where both
+ write and (non-transparent) read ports work every cycle.
+
+ It employs a XOR trick, as follows:
+
+ 1) Like before, there are two memories, each reading on every cycle, and
+ writing on alternate cycles
+ 2) Instead of a MUX, the read port is a direct XOR of the two memories.
+ 3) Writes happens in two cycles:
+
+ First, read the current value of the *other* memory, at the write
+ location.
+
+ Then, on *this* memory, write that read value, XORed with the desired
+ value.
+
+ This recovers the desired value when read:
+ (other XOR desired) XOR other = desired
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False) on the full
+ read port
+ """
+
+ def __init__(self, addr_width, data_width, we_width, transparent):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """write port address"""
+ self.wr_data_i = Signal(data_width); """write port data"""
+ self.wr_we_i = Signal(we_width); """write port enable"""
+ self.rd_addr_i = Signal(addr_width); """read port address"""
+ self.rd_data_o = Signal(data_width); """read port data"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # instantiate the two phased 1W/2R memory blocks
+ mem0 = PhasedReadPhasedWriteFullReadSRAM(
+ self.addr_width, self.data_width, self.we_width, 0, True)
+ mem1 = PhasedReadPhasedWriteFullReadSRAM(
+ self.addr_width, self.data_width, self.we_width, 1, True)
+ m.submodules.mem0 = mem0
+ m.submodules.mem1 = mem1
+ # generate and wire the phases for the phased memories
+ phase = Signal()
+ m.d.sync += phase.eq(~phase)
+ m.d.comb += [
+ mem0.phase.eq(phase),
+ mem1.phase.eq(phase),
+ ]
+ # store the write information for the next cycle
+ last_addr = Signal(self.addr_width)
+ last_we = Signal(self.we_width)
+ last_data = Signal(self.data_width)
+ m.d.sync += [
+ last_addr.eq(self.wr_addr_i),
+ last_we.eq(self.wr_we_i),
+ last_data.eq(self.wr_data_i),
+ ]
+ # read path
+ # wire read address to memories, and XOR their output
+ xor_data = Signal(self.data_width)
+ m.d.comb += [
+ mem0.rd_addr_i.eq(self.rd_addr_i),
+ mem1.rd_addr_i.eq(self.rd_addr_i),
+ xor_data.eq(mem0.rd_data_o ^ mem1.rd_data_o),
+ ]
+ if self.transparent:
+ # do the read and write addresses coincide?
+ same_read_write = Signal()
+ m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+ gran = self.data_width // self.we_width
+ for i in range(self.we_width):
+ # when simultaneously reading and writing to the same location
+ # and write lane, bypass the memory, and output the write
+ # holding register instead
+ with m.If(same_read_write & last_we[i]):
+ m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+ last_data.word_select(i, gran))
+ # otherwise, output the xor data
+ with m.Else():
+ m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+ xor_data.word_select(i, gran))
+ # when not transparent, just output the memory contents (xor data)
+ else:
+ m.d.comb += self.rd_data_o.eq(xor_data)
+ # write path
+ # 1) read the memory location which is about to be written
+ m.d.comb += [
+ mem0.rdp_addr_i.eq(self.wr_addr_i),
+ mem1.rdp_addr_i.eq(self.wr_addr_i),
+ ]
+ # 2) write the XOR of the other memory data, and the desired value
+ m.d.comb += [
+ mem0.wr_addr_i.eq(last_addr),
+ mem1.wr_addr_i.eq(last_addr),
+ mem0.wr_we_i.eq(last_we),
+ mem1.wr_we_i.eq(last_we),
+ mem0.wr_data_i.eq(last_data ^ mem1.rdp_data_o),
+ mem1.wr_data_i.eq(last_data ^ mem0.rdp_data_o),
+ ]
+ return m
+
+
+class DualPortXorRegfileTestCase(FHDLTestCase):
+
+ def do_test_case(self, transparent):
+ """
+ Simulate some read/write/modify operations on the dual port register
+ file
+ """
+ dut = DualPortXorRegfile(7, 32, 4, transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ expected = None
+ last_expected = None
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, next_expected=None):
+ nonlocal expected, last_expected
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+ # account for the read latency
+ expected = last_expected
+ last_expected = next_expected
+
+ # start a write
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+
+ def process():
+ # write a pair of values, one for each memory
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x87654321)
+ yield
+ yield from read(0x42, 0x87654321)
+ yield from write(0x43, 0b1111, 0x0FEDCBA9)
+ yield
+ # skip a beat
+ yield from read(0x43, 0x0FEDCBA9)
+ yield from write(0, 0, 0)
+ yield
+ # write again, but now they switch memories
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from write(0, 0, 0)
+ yield
+ # test partial writes
+ yield from read(0)
+ yield from write(0x42, 0b1001, 0x78FFFF12)
+ yield
+ yield from read(0)
+ yield from write(0x43, 0b0110, 0xFFDEABFF)
+ yield
+ yield from read(0x42, 0x78345612)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0x43, 0x9ADEABF0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ # test simultaneous read and write
+ if transparent:
+ # transparent reads, returns the new value
+ yield from read(0x42, 0x78AA5666)
+ else:
+ # non-transparent read: returns the old value
+ yield from read(0x42, 0x78345612)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # after a cycle, returns the new value
+ yield from read(0x42, 0x78AA5666)
+ yield from write(0, 0, 0)
+ yield
+ # settle down
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+
+ sim.add_sync_process(process)
+ debug_file = 'test_dual_port_xor_regfile'
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ {'comment': 'write port'},
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ {'comment': 'read port'},
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+ ]
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_case(self):
+ with self.subTest("non-transparent reads"):
+ self.do_test_case(False)
+ with self.subTest("transparent reads"):
+ self.do_test_case(True)
+
+
+if __name__ == "__main__":
+ unittest.main()
class VirtualRegPort(RegFileArray):
- def __init__(self, bitwidth, n_regs, rd2=False):
+ def __init__(self, bitwidth, n_regs, rd2=False, wr2=False, synced=True):
self.bitwidth = bitwidth
self.nregs = n_regs
self.rd2 = rd2 # eurgh hack
+ self.wr2 = wr2 # eurgh hack
self.regwidth = regwidth = bitwidth // n_regs
- super().__init__(self.regwidth, n_regs)
+ super().__init__(self.regwidth, n_regs, synced=synced)
# "full" depth variant of the "external" port
self.full_wr = RecordObject([("wen", n_regs),
- ("data_i", bitwidth)], # *full* wid
+ ("i_data", bitwidth)], # *full* wid
name="full_wr")
self.full_rd = RecordObject([("ren", n_regs),
- ("data_o", bitwidth)], # *full* wid
+ ("o_data", bitwidth)], # *full* wid
name="full_rd")
- if not rd2:
- return
- self.full_rd2 = RecordObject([("ren", n_regs),
- ("data_o", bitwidth)], # *full* wid
+ if wr2:
+ self.full_wr2 = RecordObject([("wen", n_regs),
+ ("i_data", bitwidth)], # *full* wid
+ name="full_wr2")
+ if rd2:
+ self.full_rd2 = RecordObject([("ren", n_regs),
+ ("o_data", bitwidth)], # *full* wid
name="full_rd2")
+ def connect_full_wr(self, m, wfull, name):
+ comb = m.d.comb
+ wr_regs = self.write_reg_port(name)
+
+ # wire up the enable signals from the large (full) port
+ l = map(lambda port: port.i_data, wr_regs)
+ le = map(lambda port: port.wen, wr_regs) # get port wen(s)
+
+ # get list of all i_data (and wens) and assign to them via Cat
+ comb += Cat(*l).eq(wfull.i_data)
+ comb += Cat(*le).eq(wfull.wen)
+
def connect_full_rd(self, m, rfull, name):
comb = m.d.comb
rd_regs = self.read_reg_port(name)
# wire up the enable signals and chain-accumulate the data
- l = map(lambda port: port.data_o, rd_regs) # get port data(s)
+ l = map(lambda port: port.o_data, rd_regs) # get port data(s)
le = map(lambda port: port.ren, rd_regs) # get port ren(s)
- comb += rfull.data_o.eq(Cat(*l)) # we like Cat on lists
+ comb += rfull.o_data.eq(Cat(*l)) # we like Cat on lists
comb += Cat(*le).eq(rfull.ren)
def elaborate(self, platform):
m = super().elaborate(platform)
comb = m.d.comb
- # for internal use only.
- wr_regs = self.write_reg_port(f"w")
+ # connect up full write port
+ self.connect_full_wr(m, self.full_wr, "w")
+ if self.wr2:
+ self.connect_full_wr(m, self.full_wr2, "w2")
# connect up full read port
self.connect_full_rd(m, self.full_rd, "r")
if self.rd2: # hack!
self.connect_full_rd(m, self.full_rd2, "r2")
- # connect up full write port
- wfull = self.full_wr
-
- # wire up the enable signals from the large (full) port
- l = map(lambda port: port.data_i, wr_regs)
- le = map(lambda port: port.wen, wr_regs) # get port wen(s)
-
- # get list of all data_i (and wens) and assign to them via Cat
- comb += Cat(*l).eq(wfull.data_i)
- comb += Cat(*le).eq(wfull.wen)
-
return m
def __iter__(self):
def regfile_array_sim(dut, rp1, rp2, rp3, wp):
# part-port write
- yield wp.data_i.eq(2)
+ yield wp.i_data.eq(2)
yield wp.wen.eq(1 << 1)
yield
yield wp.wen.eq(0)
# part-port read
yield rp1.ren.eq(1 << 1)
yield
- data = yield rp1.data_o
+ data = yield rp1.o_data
print(data)
assert data == 2
yield rp1.ren.eq(1 << 5)
yield rp2.ren.eq(1 << 1)
yield wp.wen.eq(1 << 5)
- yield wp.data_i.eq(6)
+ yield wp.i_data.eq(6)
yield
yield wp.wen.eq(0)
yield rp1.ren.eq(0)
yield rp2.ren.eq(0)
- data1 = yield rp1.data_o
+ data1 = yield rp1.o_data
print(data1)
assert data1 == 6, data1
- data2 = yield rp2.data_o
+ data2 = yield rp2.o_data
print(data2)
assert data2 == 2, data2
yield
- data = yield rp1.data_o
+ data = yield rp1.o_data
print(data)
# full port read (whole reg)
yield dut.full_rd.ren.eq(0xff)
yield
yield dut.full_rd.ren.eq(0)
- data = yield dut.full_rd.data_o
+ data = yield dut.full_rd.o_data
print(hex(data))
# full port read (part reg)
yield dut.full_rd.ren.eq(0x1 << 5)
yield
yield dut.full_rd.ren.eq(0)
- data = yield dut.full_rd.data_o
+ data = yield dut.full_rd.o_data
print(hex(data))
# full port part-write (part masked reg)
yield dut.full_wr.wen.eq(0x1 << 1)
- yield dut.full_wr.data_i.eq(0xe0)
+ yield dut.full_wr.i_data.eq(0xe0)
yield
yield dut.full_wr.wen.eq(0x0)
yield dut.full_rd.ren.eq(0xff)
yield
yield dut.full_rd.ren.eq(0)
- data = yield dut.full_rd.data_o
+ data = yield dut.full_rd.o_data
print(hex(data))
# full port write
yield dut.full_wr.wen.eq(0xff)
- yield dut.full_wr.data_i.eq(0xcafeface)
+ yield dut.full_wr.i_data.eq(0xcafeface)
yield
yield dut.full_wr.wen.eq(0x0)
yield dut.full_rd.ren.eq(0xff)
yield
yield dut.full_rd.ren.eq(0)
- data = yield dut.full_rd.data_o
+ data = yield dut.full_rd.o_data
print(hex(data))
# part write
- yield wp.data_i.eq(2)
+ yield wp.i_data.eq(2)
yield wp.wen.eq(1 << 1)
yield
yield wp.wen.eq(0)
yield rp1.ren.eq(1 << 1)
yield
- data = yield rp1.data_o
+ data = yield rp1.o_data
print(hex(data))
assert data == 2
yield dut.full_rd.ren.eq(0xff)
yield
yield dut.full_rd.ren.eq(0)
- data = yield dut.full_rd.data_o
+ data = yield dut.full_rd.o_data
print(hex(data))
# simultaneous read/write: full-write, part-write, 3x part-read
yield rp2.ren.eq(1 << 1)
yield rp3.ren.eq(1 << 3)
yield wp.wen.eq(1 << 3)
- yield wp.data_i.eq(6)
+ yield wp.i_data.eq(6)
yield dut.full_wr.wen.eq((1 << 1) | (1 << 5))
- yield dut.full_wr.data_i.eq((0xa << (1*4)) | (0x3 << (5*4)))
+ yield dut.full_wr.i_data.eq((0xa << (1*4)) | (0x3 << (5*4)))
yield
yield dut.full_wr.wen.eq(0)
yield wp.wen.eq(0)
yield rp1.ren.eq(0)
yield rp2.ren.eq(0)
yield rp3.ren.eq(0)
- data1 = yield rp1.data_o
+ data1 = yield rp1.o_data
print(hex(data1))
assert data1 == 0x3
- data2 = yield rp2.data_o
+ data2 = yield rp2.o_data
print(hex(data2))
assert data2 == 0xa
- data3 = yield rp3.data_o
+ data3 = yield rp3.o_data
print(hex(data3))
assert data3 == 0x6
from nmigen.compat.sim import run_simulation, Settle
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Const, Array, Cat, Elaboratable, Repl
+from nmigen import Module, Signal, Const, Cat, Elaboratable, Repl
from nmigen.lib.coding import Decoder
from nmigen.utils import log2_int
self.n_adr = n_adr
self.bitwid = bitwid
# inputs
- self.addrs_i = Array(Signal(bitwid, name="addr") for i in range(n_adr))
+ self.addrs_i = tuple(Signal(bitwid, name="addr") for i in range(n_adr))
# self.addr_we_i = Signal(n_adr, reset_less=True) # write-enable
self.addr_en_i = Signal(n_adr, reset_less=True) # address latched in
self.addr_rs_i = Signal(n_adr, reset_less=True) # address deactivated
# output: a nomatch for each address plus individual nomatch signals
self.addr_nomatch_o = Signal(n_adr, name="nomatch_o", reset_less=True)
- self.addr_nomatch_a_o = Array(Signal(n_adr, reset_less=True,
+ self.addr_nomatch_a_o = tuple(Signal(n_adr, reset_less=True,
name="nomatch_array_o")
for i in range(n_adr))
# array of address-latches
m.submodules.l = self.l = l = SRLatch(llen=self.n_adr, sync=False)
- self.adrs_r = adrs_r = Array(Signal(self.bitwid, reset_less=True,
+ self.adrs_r = adrs_r = tuple(Signal(self.bitwid, reset_less=True,
name="a_r")
for i in range(self.n_adr))
# input: length of the LOAD/STORE
expwid = 1+self.lsbwid # XXX assume LD/ST no greater than 8
- self.lexp_i = Array(Signal(1 << expwid, reset_less=True,
+ self.lexp_i = tuple(Signal(1 << expwid, reset_less=True,
name="len") for i in range(n_adr))
# input: full address
- self.faddrs_i = Array(Signal(bitlen, reset_less=True,
+ self.faddrs_i = tuple(Signal(bitlen, reset_less=True,
name="fadr") for i in range(n_adr))
# registers for expanded len
- self.len_r = Array(Signal(expwid, reset_less=True, name="l_r")
+ self.len_r = tuple(Signal(expwid, reset_less=True, name="l_r")
for i in range(self.n_adr))
def elaborate(self, platform):
PartialAddrMatch.__init__(self, n_adr, self.midlen)
# input: length of the LOAD/STORE
- self.len_i = Array(Signal(lsbwid, reset_less=True,
+ self.len_i = tuple(Signal(lsbwid, reset_less=True,
name="len") for i in range(n_adr))
# input: full address
- self.faddrs_i = Array(Signal(bitlen, reset_less=True,
+ self.faddrs_i = tuple(Signal(bitlen, reset_less=True,
name="fadr") for i in range(n_adr))
# intermediary: address + 1
- self.addr1s = Array(Signal(self.midlen, reset_less=True,
+ self.addr1s = tuple(Signal(self.midlen, reset_less=True,
name="adr1")
for i in range(n_adr))
# expanded lengths, needed in match
expwid = 1+self.lsbwid # XXX assume LD/ST no greater than 8
- self.lexp = Array(Signal(1 << expwid, reset_less=True,
+ self.lexp = tuple(Signal(1 << expwid, reset_less=True,
name="a_l")
for i in range(self.n_adr))
# intermediaries
adrs_r, l = self.adrs_r, self.l
- len_r = Array(Signal(self.lsbwid, reset_less=True,
+ len_r = tuple(Signal(self.lsbwid, reset_less=True,
name="l_r")
for i in range(self.n_adr))
#from soc.experiment.pimem import PortInterface
-from nmigen import Elaboratable, Module, Signal, Record, Array, Const, Cat
+from nmigen import Elaboratable, Module, Signal, Record, Const, Cat
from nmutil.latch import SRLatch, latchregister
from nmigen.back.pysim import Simulator, Delay
from nmigen.cli import verilog, rtlil
def __init__(self, dwidth, awidth, mlen):
self.addr_i = Signal(awidth, reset_less=True)
self.mask_i = Signal(mlen, reset_less=True)
- self.valid_i = Signal(reset_less=True)
+ self.i_valid = Signal(reset_less=True)
self.ld_i = LDData(dwidth, "ld_i")
self.ld_o = LDData(dwidth, "ld_o")
- self.valid_o = Signal(reset_less=True)
+ self.o_valid = Signal(reset_less=True)
def elaborate(self, platform):
m = Module()
comb = m.d.comb
m.submodules.in_l = in_l = SRLatch(sync=False, name="in_l")
- comb += in_l.s.eq(self.valid_i)
- comb += self.valid_o.eq(in_l.q & self.valid_i)
- latchregister(m, self.ld_i, self.ld_o, in_l.q & self.valid_o, "ld_i_r")
+ comb += in_l.s.eq(self.i_valid)
+ comb += self.o_valid.eq(in_l.q & self.i_valid)
+ latchregister(m, self.ld_i, self.ld_o, in_l.q & self.o_valid, "ld_i_r")
return m
yield self.ld_i.data
yield self.ld_o.err
yield self.ld_o.data
- yield self.valid_i
- yield self.valid_o
+ yield self.i_valid
+ yield self.o_valid
def ports(self):
return list(self)
self.addr_i = Signal(awidth, reset_less=True)
# no match in PortInterface
self.len_i = Signal(dlen, reset_less=True)
- self.valid_i = Signal(reset_less=True)
- self.valid_o = Signal(reset_less=True)
+ self.i_valid = Signal(reset_less=True)
+ self.o_valid = Signal(reset_less=True)
self.is_ld_i = Signal(reset_less=True)
self.is_st_i = Signal(reset_less=True)
self.exc = Signal(reset_less=True) # pi.exc TODO
# TODO : create/connect two outgoing port interfaces
- self.sld_valid_o = Signal(2, reset_less=True)
- self.sld_valid_i = Signal(2, reset_less=True)
- self.sld_data_i = Array((LDData(cline_wid, "ld_data_i1"),
+ self.sld_o_valid = Signal(2, reset_less=True)
+ self.sld_i_valid = Signal(2, reset_less=True)
+ self.sld_data_i = tuple((LDData(cline_wid, "ld_data_i1"),
LDData(cline_wid, "ld_data_i2")))
- self.sst_valid_o = Signal(2, reset_less=True)
- self.sst_valid_i = Signal(2, reset_less=True)
- self.sst_data_o = Array((LDData(cline_wid, "st_data_i1"),
+ self.sst_o_valid = Signal(2, reset_less=True)
+ self.sst_i_valid = Signal(2, reset_less=True)
+ self.sst_data_o = tuple((LDData(cline_wid, "st_data_i1"),
LDData(cline_wid, "st_data_i2")))
def elaborate(self, platform):
# set up connections to LD-split. note: not active if mask is zero
for i, (ld, mask) in enumerate(((ld1, mask1),
(ld2, mask2))):
- ld_valid = Signal(name="ldvalid_i%d" % i, reset_less=True)
- comb += ld_valid.eq(self.valid_i & self.sld_valid_i[i])
- comb += ld.valid_i.eq(ld_valid & (mask != mzero))
+ ld_valid = Signal(name="ldi_valid%d" % i, reset_less=True)
+ comb += ld_valid.eq(self.i_valid & self.sld_i_valid[i])
+ comb += ld.i_valid.eq(ld_valid & (mask != mzero))
comb += ld.ld_i.eq(self.sld_data_i[i])
- comb += self.sld_valid_o[i].eq(ld.valid_o)
+ comb += self.sld_o_valid[i].eq(ld.o_valid)
# sort out valid: mask2 zero we ignore 2nd LD
with m.If(mask2 == mzero):
- comb += self.valid_o.eq(self.sld_valid_o[0])
+ comb += self.o_valid.eq(self.sld_o_valid[0])
with m.Else():
- comb += self.valid_o.eq(self.sld_valid_o.all())
+ comb += self.o_valid.eq(self.sld_o_valid.all())
## debug output -- output mask2 and mzero
## guess second port is invalid
# all bits valid (including when data error occurs!) decode ld1/ld2
- with m.If(self.valid_o):
+ with m.If(self.o_valid):
# errors cause error condition
comb += self.ld_data_o.err.eq(ld1.ld_o.err | ld2.ld_o.err)
# set busy flag -- required for unit test
for i, (ld, mask) in enumerate(((ld1, mask1),
(ld2, mask2))):
- valid = Signal(name="stvalid_i%d" % i, reset_less=True)
- comb += valid.eq(self.valid_i & self.sst_valid_i[i])
- comb += ld.valid_i.eq(valid & (mask != mzero))
- comb += self.sld_valid_o[i].eq(ld.valid_o)
+ valid = Signal(name="sti_valid%d" % i, reset_less=True)
+ comb += valid.eq(self.i_valid & self.sst_i_valid[i])
+ comb += ld.i_valid.eq(valid & (mask != mzero))
+ comb += self.sld_o_valid[i].eq(ld.o_valid)
comb += self.sst_data_o[i].data.eq(ld.ld_o.data)
comb += ld1.ld_i.eq((self.st_data_i << (ashift1*8)) & mask1)
# sort out valid: mask2 zero we ignore 2nd LD
with m.If(mask2 == mzero):
- comb += self.valid_o.eq(self.sst_valid_o[0])
+ comb += self.o_valid.eq(self.sst_o_valid[0])
with m.Else():
- comb += self.valid_o.eq(self.sst_valid_o.all())
+ comb += self.o_valid.eq(self.sst_o_valid.all())
# all bits valid (including when data error occurs!) decode ld1/ld2
- with m.If(self.valid_o):
+ with m.If(self.o_valid):
# errors cause error condition
comb += self.st_data_i.err.eq(ld1.ld_o.err | ld2.ld_o.err)
yield self.is_ld_i
yield self.ld_data_o.err
yield self.ld_data_o.data
- yield self.valid_i
- yield self.valid_o
- yield self.sld_valid_i
+ yield self.i_valid
+ yield self.o_valid
+ yield self.sld_i_valid
for i in range(2):
yield self.sld_data_i[i].err
yield self.sld_data_i[i].data
yield dut.is_ld_i.eq(1)
yield dut.len_i.eq(ld_len)
yield dut.addr_i.eq(addr)
- yield dut.valid_i.eq(1)
+ yield dut.i_valid.eq(1)
print("waiting")
while True:
- valid_o = yield dut.valid_o
- if valid_o:
+ o_valid = yield dut.o_valid
+ if o_valid:
break
yield
exc = yield dut.exc
def lds():
print("lds")
while True:
- valid_i = yield dut.valid_i
- if valid_i:
+ i_valid = yield dut.i_valid
+ if i_valid:
break
yield
data2 = (shfdata >> 128) & dmask1
print("ld data2", 1 << dlen, hex(data >> (1 << dlen)), hex(data2))
yield dut.sld_data_i[0].data.eq(data1)
- yield dut.sld_valid_i[0].eq(1)
+ yield dut.sld_i_valid[0].eq(1)
yield
yield dut.sld_data_i[1].data.eq(data2)
- yield dut.sld_valid_i[1].eq(1)
+ yield dut.sld_i_valid[1].eq(1)
yield
sim.add_sync_process(lds)
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
+
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
+from nmigen import Module, Signal, Elaboratable, Cat, Repl
from nmutil.latch import SRLatch
-from functools import reduce
-from operator import or_
class DependencyRow(Elaboratable):
asynchronous) would be reset at the exact moment that GO was requested,
and the RSEL would be garbage.
"""
- def __init__(self, n_reg, n_src, cancel_mode=False):
+ def __init__(self, n_reg, n_src, n_dst, cancel_mode=False):
self.cancel_mode = cancel_mode
self.n_reg = n_reg
self.n_src = n_src
+ self.n_dst = n_dst
# arrays
src = []
rsel = []
src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
fwd.append(Signal(n_reg, name="src%d_fwd_o" % j, reset_less=True))
+ dst = []
+ dsel = []
+ dfwd = []
+ for i in range(n_dst):
+ j = i + 1 # name numbering to match src1/src2
+ dst.append(Signal(n_reg, name="dst%d" % j, reset_less=True))
+ dsel.append(Signal(n_reg, name="dst%d_rsel_o" % j, reset_less=True))
+ dfwd.append(Signal(n_reg, name="dst%d_fwd_o" % j, reset_less=True))
# inputs
- self.dest_i = Signal(n_reg, reset_less=True) # Dest in (top)
- self.src_i = Array(src) # operands in (top)
- self.issue_i = Signal(reset_less=True) # Issue in (top)
+ self.dst_i = tuple(dst) # Dest in (top)
+ self.src_i = tuple(src) # operands in (top)
+ self.issue_i = Signal(reset_less=True) # Issue in (top)
self.rd_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top)
self.wr_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top)
self.go_die_i = Signal(reset_less=True) # Go Die in (left)
# for Register File Select Lines (vertical)
- self.dest_rsel_o = Signal(n_reg, reset_less=True) # dest reg sel (bot)
- self.src_rsel_o = Array(rsel) # src reg sel (bot)
+ self.dst_rsel_o = tuple(dsel) # dest reg sel (bot)
+ self.src_rsel_o = tuple(rsel) # src reg sel (bot)
# for Function Unit "forward progress" (horizontal)
- self.dest_fwd_o = Signal(n_reg, reset_less=True) # dest FU fw (right)
- self.src_fwd_o = Array(fwd) # src FU fw (right)
+ self.dst_fwd_o = tuple(dfwd) # dest FU fw (right)
+ self.src_fwd_o = tuple(fwd) # src FU fw (right)
+
+ # for temporary (transitional) compatibility with old API
+ # number of dests used to be 1 (one) - increasing to n_dst
+ self.dest_i = self.dst_i[0]
+ self.dest_rsel_o = self.dst_rsel_o[0]
+ self.dest_fwd_o = self.dst_fwd_o[0]
def elaborate(self, platform):
m = Module()
- m.submodules.dest_c = dest_c = SRLatch(sync=False, llen=self.n_reg)
+ # create source and dest SRLatches
+ dst_c = []
+ for i in range(self.n_dst):
+ dst_l = SRLatch(sync=False, llen=self.n_reg)
+ m.submodules["dst%d_c" % (i+1)] = dst_l
+ dst_c.append(dst_l)
+
src_c = []
for i in range(self.n_src):
src_l = SRLatch(sync=False, llen=self.n_reg)
- setattr(m.submodules, "src%d_c" % (i+1), src_l)
+ m.submodules["src%d_c" % (i+1)] = src_l
src_c.append(src_l)
# connect go_rd / go_wr (dest->wr, src->rd)
go_die = Repl(self.go_die_i, self.n_reg)
m.d.comb += wr_die.eq(Repl(self.go_wr_i, self.n_reg) | go_die)
m.d.comb += rd_die.eq(Repl(self.go_rd_i, self.n_reg) | go_die)
- m.d.comb += dest_c.r.eq(wr_die)
+ for i in range(self.n_dst):
+ m.d.comb += dst_c[i].r.eq(wr_die)
for i in range(self.n_src):
m.d.comb += src_c[i].r.eq(rd_die)
# connect input reg bit (unary)
i_ext = Repl(self.issue_i, self.n_reg)
- m.d.comb += dest_c.s.eq(i_ext & self.dest_i)
+ for i in range(self.n_dst):
+ m.d.comb += dst_c[i].s.eq(i_ext & self.dst_i[i])
for i in range(self.n_src):
m.d.comb += src_c[i].s.eq(i_ext & self.src_i[i])
# connect up hazard checks: read-after-write and write-after-read
- m.d.comb += self.dest_fwd_o.eq(dest_c.q & self.rd_pend_i)
+ for i in range(self.n_dst):
+ m.d.comb += self.dst_fwd_o[i].eq(dst_c[i].q & self.rd_pend_i)
for i in range(self.n_src):
m.d.comb += self.src_fwd_o[i].eq(src_c[i].q & self.wr_pend_i)
# connect reg-sel outputs
rd_ext = Repl(self.go_rd_i, self.n_reg)
wr_ext = Repl(self.go_wr_i, self.n_reg)
- m.d.comb += self.dest_rsel_o.eq(dest_c.qlq & wr_ext)
+ for i in range(self.n_dst):
+ m.d.comb += self.dst_rsel_o[i].eq(dst_c[i].qlq & wr_ext)
for i in range(self.n_src):
m.d.comb += self.src_rsel_o[i].eq(src_c[i].qlq & rd_ext)
src_q = []
for i in range(self.n_src):
src_q.append(src_c[i].qlq)
- m.d.comb += self.v_rd_rsel_o.eq(reduce(or_, src_q))
- m.d.comb += self.v_wr_rsel_o.eq(dest_c.qlq)
+ m.d.comb += self.v_rd_rsel_o.eq(Cat(*src_q).bool())
+ dst_q = []
+ for i in range(self.n_dst):
+ dst_q.append(dst_c[i].qlq)
+ m.d.comb += self.v_wr_rsel_o.eq(Cat(*dst_q).bool())
return m
def __iter__(self):
- yield self.dest_i
+ yield from self.dst_i
yield from self.src_i
yield self.rd_pend_i
yield self.wr_pend_i
yield self.go_wr_i
yield self.go_rd_i
yield self.go_die_i
- yield self.dest_rsel_o
+ yield from self.dst_rsel_o
yield from self.src_rsel_o
- yield self.dest_fwd_o
+ yield from self.dst_fwd_o
yield from self.src_fwd_o
def ports(self):
return list(self)
+# XXX not up-to-date but hey
def dcell_sim(dut):
yield dut.dest_i.eq(1)
yield dut.issue_i.eq(1)
yield
yield dut.issue_i.eq(0)
yield
- yield dut.src1_i.eq(1)
+ yield dut.src_i[0].eq(1)
yield dut.issue_i.eq(1)
yield
yield
yield
def test_dcell():
- dut = DependencyRow(4, 2, True)
+ dut = DependencyRow(4, 2, 2, True)
vl = rtlil.convert(dut, ports=dut.ports())
with open("test_drow.il", "w") as f:
f.write(vl)
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Elaboratable
+from nmigen import Module, Signal, Cat, Const, Elaboratable
from nmigen.lib.coding import Decoder
from nmutil.latch import SRLatch, latchregister
* dest_i / src1_i / src2_i are in *binary*, whereas...
* ...g_rd_pend_i / g_wr_pend_i and rd_pend_o / wr_pend_o are UNARY
* req_rel_i (request release) is the direct equivalent of pipeline
- "output valid" (valid_o)
+ "output valid" (o_valid)
* recover is a local python variable (actually go_die_o)
* when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing)
* wr_pend is set False for the majority of uses: however for
if n_dests > 1:
self.rfile_sel_i = Signal(range(n_dests), reset_less=True)
else:
- self.rfile_sel_i = Const(0) # no selection. gets Array[0]
+ self.rfile_sel_i = Const(0) # no selection. gets 0
self.dest_i = Signal(range(wid), reset_less=True) # Dest R# in (top)
self.src1_i = Signal(range(wid), reset_less=True) # oper1 R# in (top)
self.src2_i = Signal(range(wid), reset_less=True) # oper2 R# in (top)
self.go_rd_i = Signal(reset_less=True) # Go Read in (left)
self.req_rel_i = Signal(reset_less=True) # request release (left)
- self.g_xx_pend_i = Array(Signal(wid, reset_less=True, name="g_pend_i")
+ self.g_xx_pend_i = tuple(Signal(wid, reset_less=True, name="g_pend_i")
for i in range(n_dests)) # global rd (right)
self.g_wr_pend_i = Signal(wid, reset_less=True) # global wr (right)
# outputs
self.readable_o = Signal(reset_less=True) # Readable out (right)
- self.writable_o = Array(Signal(reset_less=True, name="writable_o")
+ self.writable_o = tuple(Signal(reset_less=True, name="writable_o")
for i in range(n_dests)) # writable out (right)
self.busy_o = Signal(reset_less=True) # busy out (left)
self.src1_pend_o = Signal(wid, reset_less=True) # src1 pending
self.src2_pend_o = Signal(wid, reset_less=True) # src1 pending
self.rd_pend_o = Signal(wid, reset_less=True) # rd pending (right)
- self.xx_pend_o = Array(Signal(wid, reset_less=True, name="pend_o")
+ self.xx_pend_o = tuple(Signal(wid, reset_less=True, name="pend_o")
for i in range(n_dests)) # wr pending (right)
def elaborate(self, platform):
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+from nmigen import Module, Signal, Elaboratable, Cat, Const
from .fu_dep_cell import FUDependenceCell
from .fu_picker_vec import FU_Pick_Vec
# ---
# matrix of dependency cells
# ---
- dm = Array(FUDependenceCell(f, self.n_fu_col) \
+ dm = tuple(FUDependenceCell(f, self.n_fu_col) \
for f in range(self.n_fu_row))
for y in range(self.n_fu_row):
setattr(m.submodules, "dm%d" % y, dm[y])
# ---
# array of Function Unit Readable/Writable: row-length, horizontal
# ---
- fur = Array(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
+ fur = tuple(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
for x in range(self.n_fu_col):
setattr(m.submodules, "fur_x%d" % (x), fur[x])
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+from nmigen import Module, Signal, Elaboratable, Cat, Const
from soc.scoreboard.fumem_dep_cell import FUMemDependenceCell
from soc.scoreboard.fu_mem_picker_vec import FUMem_Pick_Vec
# ---
# matrix of dependency cells
# ---
- dm = Array(FUMemDependenceCell(f, self.n_fu_col) \
+ dm = tuple(FUMemDependenceCell(f, self.n_fu_col) \
for f in range(self.n_fu_row))
for y in range(self.n_fu_row):
setattr(m.submodules, "dm%d" % y, dm[y])
# ---
# array of Function Unit Readable/Writable: row-length, horizontal
# ---
- fur = Array(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
+ fur = tuple(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
for x in range(self.n_fu_col):
setattr(m.submodules, "fur_x%d" % (x), fur[x])
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
-
-from soc.scoreboard.dependence_cell import DependencyRow
-from soc.scoreboard.fu_wr_pending import FU_RW_Pend
-from soc.scoreboard.reg_select import Reg_Rsv
-from soc.scoreboard.global_pending import GlobalPending
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
-"""
+"""Mitch Alsup 6600 Dependency Matrices: Function Units to Registers (FU-REGs)
6600 Dependency Table Matrix inputs / outputs
---------------------------------------------
d s1 s2 d s1 s2 d s1 s2 d s1 s2
reg sel reg sel reg sel reg sel
+Sub-module allocation:
+
+ <----------- DependenceRow dr_fu0 -------> FU_RW_Pend fu_fu_0
+ <----------- DependenceRow dr_fu1 -------> FU_RW_Pend fu_fu_1
+ <----------- DependenceRow dr_fu2 -------> FU_RW_Pend fu_fu_2
+ | | | | | | | | | | | |
+ v v v v v v v v v v v v
+ Reg_Rsv Reg_Rsv Reg_Rsv Reg_Rsv
+ rr_r0 rr_r1 rr_r2 rr_r3
+ | | | | | | | |
+ <---------- GlobalPending rd_v --------->
+ <---------- GlobalPending wr_v --------->
"""
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Cat, Repl
+
+from soc.scoreboard.dependence_cell import DependencyRow
+from soc.scoreboard.fu_wr_pending import FU_RW_Pend
+from soc.scoreboard.reg_select import Reg_Rsv
+from soc.scoreboard.global_pending import GlobalPending
+
+
class FURegDepMatrix(Elaboratable):
""" implements 11.4.7 mitch alsup FU-to-Reg Dependency Matrix, p26
"""
- def __init__(self, n_fu_row, n_reg_col, n_src, cancel=None):
+ def __init__(self, n_fu_row, n_reg_col, n_src, n_dst, cancel=None):
self.n_src = n_src
+ self.n_dst = n_dst
self.n_fu_row = nf = n_fu_row # Y (FUs) ^v
self.n_reg_col = n_reg = n_reg_col # X (Regs) <>
# arrays
src = []
rsel = []
+ pend = []
for i in range(n_src):
j = i + 1 # name numbering to match src1/src2
src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
- pend = []
- for i in range(nf):
- j = i + 1 # name numbering to match src1/src2
pend.append(Signal(nf, name="rd_src%d_pend_o" % j, reset_less=True))
-
- self.dest_i = Signal(n_reg_col, reset_less=True) # Dest in (top)
- self.src_i = Array(src) # oper in (top)
+ dst = []
+ dsel = []
+ dpnd = []
+ for i in range(n_dst):
+ j = i + 1 # name numbering to match dst1/dst2
+ dst.append(Signal(n_reg, name="dst%d" % j, reset_less=True))
+ dsel.append(Signal(n_reg, name="dst%d_rsel_o" % j, reset_less=True))
+ dpnd.append(Signal(nf, name="wr_dst%d_pend_o" % j, reset_less=True))
+
+ self.dst_i = tuple(dst) # Dest in (top)
+ self.src_i = tuple(src) # oper in (top)
+ self.dest_i = self.dst_i[0] # old API
# cancellation array (from Address Matching), ties in with go_die_i
self.cancel = cancel
self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
# for Register File Select Lines (horizontal), per-reg
- self.dest_rsel_o = Signal(n_reg_col, reset_less=True) # dest reg (bot)
- self.src_rsel_o = Array(rsel) # src reg (bot)
+ self.dst_rsel_o = tuple(dsel) # dest reg (bot)
+ self.src_rsel_o = tuple(rsel) # src reg (bot)
+ self.dest_rsel_o = self.dst_rsel_o[0] # old API
# for Function Unit "forward progress" (vertical), per-FU
self.wr_pend_o = Signal(n_fu_row, reset_less=True) # wr pending (right)
self.rd_pend_o = Signal(n_fu_row, reset_less=True) # rd pending (right)
- self.rd_src_pend_o = Array(pend) # src1 pending
+ self.rd_src_pend_o = tuple(pend) # src pending
+ self.wr_dst_pend_o = tuple(dpnd) # dest pending
def elaborate(self, platform):
m = Module()
def _elaborate(self, m, platform):
# ---
- # matrix of dependency cells
+ # matrix of dependency cells. horizontal object, allocated vertically
# ---
cancel_mode = self.cancel is not None
- dm = Array(DependencyRow(self.n_reg_col, self.n_src, cancel_mode) \
+ dm = tuple(DependencyRow(self.n_reg_col, self.n_src, self.n_dst,
+ cancel_mode=cancel_mode) \
for r in range(self.n_fu_row))
- for fu in range(self.n_fu_row):
- setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
+ for fu, dc in enumerate(dm):
+ m.submodules["dr_fu%d" % fu] = dc
# ---
- # array of Function Unit Pending vectors
+ # array of Function Unit Pending vecs. allocated vertically (per FU)
# ---
- fupend = Array(FU_RW_Pend(self.n_reg_col, self.n_src) \
+ fupend = tuple(FU_RW_Pend(self.n_reg_col, self.n_src, self.n_dst) \
for f in range(self.n_fu_row))
- for fu in range(self.n_fu_row):
- setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
+ for fu, fup in enumerate(fupend):
+ m.submodules["fu_fu%d" % (fu)] = fup
# ---
- # array of Register Reservation vectors
+ # array of Register Reservation vecs. allocated horizontally (per reg)
# ---
- regrsv = Array(Reg_Rsv(self.n_fu_row, self.n_src) \
+ regrsv = tuple(Reg_Rsv(self.n_fu_row, self.n_src, self.n_dst) \
for r in range(self.n_reg_col))
for rn in range(self.n_reg_col):
- setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
+ m.submodules["rr_r%d" % (rn)] = regrsv[rn]
# ---
# connect Function Unit vector
# ---
wr_pend = []
rd_pend = []
- for fu in range(self.n_fu_row):
- dc = dm[fu]
- fup = fupend[fu]
- dest_fwd_o = []
- for rn in range(self.n_reg_col):
- # accumulate cell fwd outputs for dest/src1/src2
- dest_fwd_o.append(dc.dest_fwd_o[rn])
- # connect cell fwd outputs to FU Vector in [Cat is gooood]
- m.d.comb += [fup.dest_fwd_i.eq(Cat(*dest_fwd_o)),
- ]
+ for fup in fupend:
# accumulate FU Vector outputs
wr_pend.append(fup.reg_wr_pend_o)
rd_pend.append(fup.reg_rd_pend_o)
m.d.comb += self.wr_pend_o.eq(Cat(*wr_pend))
m.d.comb += self.rd_pend_o.eq(Cat(*rd_pend))
+ # connect dst fwd vectors
+ for i in range(self.n_dst):
+ wr_dst_pend = []
+ for dc, fup in zip(dm, fupend):
+ dst_fwd_o = []
+ for rn in range(self.n_reg_col):
+ # accumulate cell fwd outputs for dest
+ dst_fwd_o.append(dc.dst_fwd_o[i][rn])
+ # connect cell fwd outputs to FU Vector in [Cat is gooood]
+ m.d.comb += fup.dst_fwd_i[i].eq(Cat(*dst_fwd_o))
+ # accumulate FU Vector outputs
+ wr_dst_pend.append(fup.reg_wr_dst_pend_o[i])
+ # ... and output them from this module (vertical, width=FUs)
+ m.d.comb += self.wr_dst_pend_o[i].eq(Cat(*wr_dst_pend))
+
# same for src
for i in range(self.n_src):
rd_src_pend = []
- for fu in range(self.n_fu_row):
- dc = dm[fu]
- fup = fupend[fu]
+ for dc, fup in zip(dm, fupend):
src_fwd_o = []
for rn in range(self.n_reg_col):
# accumulate cell fwd outputs for dest/src1/src2
src_fwd_o.append(dc.src_fwd_o[i][rn])
# connect cell fwd outputs to FU Vector in [Cat is gooood]
- m.d.comb += [fup.src_fwd_i[i].eq(Cat(*src_fwd_o)),
- ]
+ m.d.comb += fup.src_fwd_i[i].eq(Cat(*src_fwd_o))
# accumulate FU Vector outputs
rd_src_pend.append(fup.reg_rd_src_pend_o[i])
# ... and output them from this module (vertical, width=FUs)
# ---
# connect Reg Selection vector
# ---
- dest_rsel = []
- for rn in range(self.n_reg_col):
- rsv = regrsv[rn]
- dest_rsel_o = []
- for fu in range(self.n_fu_row):
- dc = dm[fu]
- # accumulate cell reg-select outputs dest/src1/src2
- dest_rsel_o.append(dc.dest_rsel_o[rn])
- # connect cell reg-select outputs to Reg Vector In
- m.d.comb += rsv.dest_rsel_i.eq(Cat(*dest_rsel_o)),
-
- # accumulate Reg-Sel Vector outputs
- dest_rsel.append(rsv.dest_rsel_o)
-
- # ... and output them from this module (horizontal, width=REGs)
- m.d.comb += self.dest_rsel_o.eq(Cat(*dest_rsel))
+ for i in range(self.n_dst):
+ dest_rsel = []
+ for rn, rsv in enumerate(regrsv):
+ dst_rsel_o = []
+ # accumulate cell reg-select outputs dest1/2/...
+ for dc in dm:
+ dst_rsel_o.append(dc.dst_rsel_o[i][rn])
+ # connect cell reg-select outputs to Reg Vector In
+ m.d.comb += rsv.dst_rsel_i[i].eq(Cat(*dst_rsel_o)),
+ # accumulate Reg-Sel Vector outputs
+ dest_rsel.append(rsv.dst_rsel_o[i])
+ # ... and output them from this module (horizontal, width=REGs)
+ m.d.comb += self.dst_rsel_o[i].eq(Cat(*dest_rsel))
# same for src
for i in range(self.n_src):
src_rsel = []
- for rn in range(self.n_reg_col):
- rsv = regrsv[rn]
+ for rn, rsv in enumerate(regrsv):
src_rsel_o = []
- for fu in range(self.n_fu_row):
- dc = dm[fu]
- # accumulate cell reg-select outputs dest/src1/src2
+ # accumulate cell reg-select outputs src1/src2
+ for dc in dm:
src_rsel_o.append(dc.src_rsel_o[i][rn])
# connect cell reg-select outputs to Reg Vector In
m.d.comb += rsv.src_rsel_i[i].eq(Cat(*src_rsel_o)),
# accumulate Reg-Sel Vector outputs
src_rsel.append(rsv.src_rsel_o[i])
-
# ... and output them from this module (horizontal, width=REGs)
m.d.comb += self.src_rsel_o[i].eq(Cat(*src_rsel))
# ---
# connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
# ---
- for fu in range(self.n_fu_row):
- dc = dm[fu]
+ for dc in dm:
# wire up inputs from module to row cell inputs (Cat is gooood)
- m.d.comb += [dc.dest_i.eq(self.dest_i),
- dc.rd_pend_i.eq(self.rd_pend_i),
+ m.d.comb += [dc.rd_pend_i.eq(self.rd_pend_i),
dc.wr_pend_i.eq(self.wr_pend_i),
]
- # same for src
- for i in range(self.n_src):
- for fu in range(self.n_fu_row):
- dc = dm[fu]
- # wire up inputs from module to row cell inputs (Cat is gooood)
+ # for dest: wire up output from module to row cell outputs
+ for i in range(self.n_dst):
+ m.d.comb += dc.dst_i[i].eq(self.dst_i[i])
+ # for src: wire up inputs from module to row cell inputs
+ for i in range(self.n_src):
m.d.comb += dc.src_i[i].eq(self.src_i[i])
# accumulate rsel bits into read/write pending vectors.
rd_pend_v = []
wr_pend_v = []
- for fu in range(self.n_fu_row):
- dc = dm[fu]
+ for dc in dm:
rd_pend_v.append(dc.v_rd_rsel_o)
wr_pend_v.append(dc.v_wr_rsel_o)
rd_v = GlobalPending(self.n_reg_col, rd_pend_v)
go_rd_i = []
go_wr_i = []
issue_i = []
- for fu in range(self.n_fu_row):
- dc = dm[fu]
+ for dc in dm:
# accumulate cell fwd outputs for dest/src1/src2
go_rd_i.append(dc.go_rd_i)
go_wr_i.append(dc.go_wr_i)
# connect Dep go_die_i
# ---
if cancel_mode:
- for fu in range(self.n_fu_row):
- dc = dm[fu]
+ for fu, dc in enumerate(dm):
go_die = Repl(self.go_die_i[fu], self.n_fu_row)
go_die = go_die | self.cancel[fu]
m.d.comb += dc.go_die_i.eq(go_die)
else:
go_die_i = []
- for fu in range(self.n_fu_row):
- dc = dm[fu]
+ for dc in dm:
# accumulate cell fwd outputs for dest/src1/src2
go_die_i.append(dc.go_die_i)
# wire up inputs from module to row cell inputs (Cat is gooood)
return m
def __iter__(self):
+ if self.cancel is not None:
+ yield self.cancel
yield self.dest_i
yield from self.src_i
yield self.issue_i
yield self.go_wr_i
yield self.go_rd_i
yield self.go_die_i
- yield self.dest_rsel_o
+ yield from self.dst_rsel_o
yield from self.src_rsel_o
yield self.wr_pend_o
yield self.rd_pend_o
yield self.v_wr_rsel_o
yield self.v_rd_rsel_o
yield from self.rd_src_pend_o
+ yield from self.wr_dst_pend_o
def ports(self):
return list(self)
yield
yield dut.issue_i.eq(0)
yield
- yield dut.src1_i.eq(1)
+ yield dut.src_i[0].eq(1)
yield dut.issue_i.eq(1)
yield
yield dut.issue_i.eq(0)
yield
def test_d_matrix():
- dut = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2)
+ cancel = Signal(3)
+ dut = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2, n_dst=2,
+ cancel=cancel)
vl = rtlil.convert(dut, ports=dut.ports())
with open("test_fu_reg_matrix.il", "w") as f:
f.write(vl)
-from nmigen import Elaboratable, Module, Signal, Array
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
+
+from nmigen import Elaboratable, Module, Signal
+from nmigen.cli import verilog, rtlil
class FU_RW_Pend(Elaboratable):
""" these are allocated per-FU (horizontally),
and are of length reg_count
"""
- def __init__(self, reg_count, n_src):
+ def __init__(self, reg_count, n_src, n_dst):
self.n_src = n_src
+ self.n_dst = n_dst
self.reg_count = reg_count
- self.dest_fwd_i = Signal(reg_count, reset_less=True)
+ # create dest forwarding array
+ dst = []
+ for i in range(n_dst):
+ j = i + 1 # name numbering to match dst1/dst2
+ dst.append(Signal(reg_count, name="dst%d" % j, reset_less=True))
+ self.dst_fwd_i = tuple(dst)
+ self.dest_fwd_i = self.dst_fwd_i[0] # old API
+ # create src forwarding array
src = []
for i in range(n_src):
j = i + 1 # name numbering to match src1/src2
src.append(Signal(reg_count, name="src%d" % j, reset_less=True))
- self.src_fwd_i = Array(src)
+ self.src_fwd_i = tuple(src)
self.reg_wr_pend_o = Signal(reset_less=True)
self.reg_rd_pend_o = Signal(reset_less=True)
self.reg_rd_src_pend_o = Signal(n_src, reset_less=True)
+ self.reg_wr_dst_pend_o = Signal(n_dst, reset_less=True)
def elaborate(self, platform):
m = Module()
- m.d.comb += self.reg_wr_pend_o.eq(self.dest_fwd_i.bool())
+ for i in range(self.n_dst):
+ m.d.comb += self.reg_wr_dst_pend_o[i].eq(self.dst_fwd_i[i].bool())
+ m.d.comb += self.reg_wr_pend_o.eq(self.reg_wr_dst_pend_o.bool())
for i in range(self.n_src):
m.d.comb += self.reg_rd_src_pend_o[i].eq(self.src_fwd_i[i].bool())
m.d.comb += self.reg_rd_pend_o.eq(self.reg_rd_src_pend_o.bool())
return m
+ def __iter__(self):
+ yield self.reg_wr_pend_o
+ yield self.reg_rd_pend_o
+ yield self.reg_rd_src_pend_o
+ yield self.reg_wr_dst_pend_o
+ yield from self.dst_fwd_i
+ yield from self.src_fwd_i
+
+ def ports(self):
+ return list(self)
+
+def test_fu_rw_pend():
+ dut = FU_RW_Pend(4, 2, 2)
+ vl = rtlil.convert(dut, ports=dut.ports())
+ with open("test_fu_rw_pend.il", "w") as f:
+ f.write(vl)
+
+if __name__ == '__main__':
+ test_fu_rw_pend()
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array
+from nmigen import Module, Signal, Elaboratable
#from nmutil.picker import MultiPriorityPicker as MPP
from nmutil.picker import PriorityPicker
wi.append(Signal(wid, name="writable%d_i" % i, reset_less=True))
# inputs
- self.rd_rel_i = Array(rdr) # go read in (top)
- self.req_rel_i = Array(wrr) # release request in (top)
- self.readable_i = Array(ri) # readable in (top)
- self.writable_i = Array(wi) # writable in (top)
+ self.rd_rel_i = tuple(rdr) # go read in (top)
+ self.req_rel_i = tuple(wrr) # release request in (top)
+ self.readable_i = tuple(ri) # readable in (top)
+ self.writable_i = tuple(wi) # writable in (top)
# outputs
- self.go_rd_o = Array(rd) # go read (bottom)
- self.go_wr_o = Array(wr) # go write (bottom)
+ self.go_rd_o = tuple(rd) # go read (bottom)
+ self.go_wr_o = tuple(wr) # go write (bottom)
def elaborate(self, platform):
m = Module()
self.n_out = n_out
mqbits = (int(log(iqlen) / log(2))+2, False)
- self.p_add_i = Signal(mqbits) # instructions to add (from data_i)
- self.p_ready_o = Signal() # instructions were added
- self.data_i = Instruction._nq(n_in, "data_i")
+ self.p_add_i = Signal(mqbits) # instructions to add (from i_data)
+ self.p_o_ready = Signal() # instructions were added
+ self.i_data = Instruction._nq(n_in, "i_data")
- self.data_o = Instruction._nq(n_out, "data_o")
+ self.o_data = Instruction._nq(n_out, "o_data")
self.n_sub_i = Signal(mqbits) # number of instructions to remove
self.n_sub_o = Signal(mqbits) # number of instructions removed
- self.qsz = shape(self.data_o[0])[0]
+ self.qsz = shape(self.o_data[0])[0]
q = []
for i in range(iqlen):
q.append(Signal(self.qsz, name="q%d" % i))
comb += left.eq(self.qlen_o) # - self.n_sub_o)
comb += spare.eq(mqlen - self.p_add_i)
comb += qmaxed.eq(left <= spare)
- comb += self.p_ready_o.eq(qmaxed & (self.p_add_i != 0))
+ comb += self.p_o_ready.eq(qmaxed & (self.p_add_i != 0))
# put q (flattened) into output
for i in range(self.n_out):
opos = Signal(mqbits)
comb += opos.eq(end_q + i)
- comb += cat(self.data_o[i]).eq(self.q[opos])
+ comb += cat(self.o_data[i]).eq(self.q[opos])
with m.If(self.n_sub_o):
# ok now the end's moved
sync += end_q.eq(end_q + self.n_sub_o)
- with m.If(self.p_ready_o):
+ with m.If(self.p_o_ready):
# copy in the input... insanely gate-costly... *sigh*...
for i in range(self.n_in):
with m.If(self.p_add_i > Const(i, len(self.p_add_i))):
ipos = Signal(mqbits)
comb += ipos.eq(start_q + i) # should roll round
- sync += self.q[ipos].eq(cat(self.data_i[i]))
+ sync += self.q[ipos].eq(cat(self.i_data[i]))
sync += start_q.eq(start_q + self.p_add_i)
- with m.If(self.p_ready_o):
+ with m.If(self.p_o_ready):
# update the queue length
add2 = Signal(mqbits+1)
comb += add2.eq(self.qlen_o + self.p_add_i)
def __iter__(self):
yield from self.q
- yield self.p_ready_o
- for o in self.data_i:
+ yield self.p_o_ready
+ for o in self.i_data:
yield from list(o)
yield self.p_add_i
- for o in self.data_o:
+ for o in self.o_data:
yield from list(o)
yield self.n_sub_i
yield self.n_sub_o
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+from nmigen import Module, Signal, Elaboratable, Cat, Const
from .ldst_dep_cell import LDSTDepCell
# ---
# matrix of dependency cells. actually, LDSTDepCell is a row, now
# ---
- dm = Array(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst))
+ dm = tuple(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst))
for fu in range(self.n_ldst):
setattr(m.submodules, "dm_fu%d" % (fu), dm[fu])
"""
def __init__(self, n_fu, addrbitwid):
PartialAddrMatch.__init__(self, n_fu, addrbitwid)
- FURegDepMatrix.__init__(self, n_fu, n_fu, 1, self.addr_nomatch_o)
+ FURegDepMatrix.__init__(self, n_fu, n_fu, 1, 1, self.addr_nomatch_o)
def elaborate(self, platform):
m = Module()
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
+from nmigen import Module, Signal, Elaboratable, Cat, Repl
from nmutil.latch import SRLatch
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat
+from nmigen import Module, Signal, Elaboratable, Cat
from soc.scoreboard.mem_dependence_cell import MemDepRow
from soc.scoreboard.mem_fu_pending import MemFU_Pend
# ---
# matrix of dependency cells
# ---
- dm = Array(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row))
+ dm = tuple(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row))
for fu in range(self.n_fu_row):
setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
# ---
# array of Function Unit Pending vectors
# ---
- fupend = Array(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row))
+ fupend = tuple(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row))
for fu in range(self.n_fu_row):
setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
# ---
# array of Register Reservation vectors
# ---
- regrsv = Array(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col))
+ regrsv = tuple(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col))
for rn in range(self.n_reg_col):
setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Array, Elaboratable
+from nmigen import Module, Signal, Elaboratable
from soc.scoreboard.fu_fu_matrix import FUFUDepMatrix
from soc.scoreboard.mdm import FUMemMatchMatrix
self.fn_issue_i = Signal(n_ldsts, reset_less=True)
# address matching
- self.addrs_i = Array(Signal(self.bitwid, name="addrs_i%d" % i) \
+ self.addrs_i = tuple(Signal(self.bitwid, name="addrs_i%d" % i) \
for i in range(n_ldsts))
#self.addr_we_i = Signal(n_ldsts) # write-enable for incoming address
self.addr_en_i = Signal(n_ldsts) # address latched in
-from nmigen import Elaboratable, Module, Signal, Array
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
+
+from nmigen.cli import verilog, rtlil
+from nmigen import Elaboratable, Module, Signal
class Reg_Rsv(Elaboratable):
""" these are allocated per-Register (vertically),
and are each of length fu_count
"""
- def __init__(self, fu_count, n_src):
+ def __init__(self, fu_count, n_src, n_dst):
self.n_src = n_src
+ self.n_dst = n_dst
self.fu_count = fu_count
- self.dest_rsel_i = Signal(fu_count, reset_less=True)
- self.src_rsel_i = Array(Signal(fu_count, name="src_rsel_i",
+ self.dst_rsel_i = tuple(Signal(fu_count, name="dst%i_rsel_i" % (i+1),
+ reset_less=True) \
+ for i in range(n_dst))
+ self.src_rsel_i = tuple(Signal(fu_count, name="src%i_rsel_i" % (i+1),
reset_less=True) \
for i in range(n_src))
- self.dest_rsel_o = Signal(reset_less=True)
+ self.dst_rsel_o = Signal(n_dst, reset_less=True)
self.src_rsel_o = Signal(n_src, reset_less=True)
def elaborate(self, platform):
m = Module()
- m.d.comb += self.dest_rsel_o.eq(self.dest_rsel_i.bool())
+ for i in range(self.n_dst):
+ m.d.comb += self.dst_rsel_o[i].eq(self.dst_rsel_i[i].bool())
for i in range(self.n_src):
m.d.comb += self.src_rsel_o[i].eq(self.src_rsel_i[i].bool())
return m
+ def __iter__(self):
+ yield from self.dst_rsel_i
+ yield from self.src_rsel_i
+ yield self.dst_rsel_o
+ yield self.src_rsel_o
+
+ def ports(self):
+ return list(self)
+
+
+def test_reg_rsv():
+ dut = Reg_Rsv(4, 2, 2)
+ vl = rtlil.convert(dut, ports=dut.ports())
+ with open("test_reg_rsv.il", "w") as f:
+ f.write(vl)
+
+
+if __name__ == '__main__':
+ test_reg_rsv()
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Elaboratable, Repl
+from nmigen import Module, Signal, Cat, Const, Elaboratable, Repl
from nmigen.lib.coding import Decoder
from soc.scoreboard.shadow_fn import ShadowFn
# inputs
self.issue_i = Signal(n_fus, reset_less=True)
self.reset_i = Signal(n_fus, reset_less=True)
- self.shadow_i = Array(Signal(shadow_wid, name="sh_i", reset_less=True) \
+ self.shadow_i = tuple(Signal(shadow_wid, name="sh_i", reset_less=True) \
for f in range(n_fus))
- self.s_fail_i = Array(Signal(shadow_wid, name="fl_i", reset_less=True) \
+ self.s_fail_i = tuple(Signal(shadow_wid, name="fl_i", reset_less=True) \
for f in range(n_fus))
- self.s_good_i = Array(Signal(shadow_wid, name="gd_i", reset_less=True) \
+ self.s_good_i = tuple(Signal(shadow_wid, name="gd_i", reset_less=True) \
for f in range(n_fus))
# outputs
self.go_die_o = Signal(n_fus, reset_less=True)
self.shadow_i = Signal(shadow_wid, reset_less=True)
self.fu_i = Signal(n_fus, reset_less=True)
- self.waw_o = Array(Signal(shadow_wid, name="waw_o", reset_less=True) \
+ self.waw_o = tuple(Signal(shadow_wid, name="waw_o", reset_less=True) \
for f in range(n_fus))
def elaborate(self, platform):
print("sendlen", len(self.iq)-i, sendlen)
for idx in range(sendlen):
instr = self.iq[i+idx]
- yield from eq(self.dut.data_i[idx], instr)
- di = yield self.dut.data_i[idx] # .src1_i
+ yield from eq(self.dut.i_data[idx], instr)
+ di = yield self.dut.i_data[idx] # .src1_i
print("senddata %d %x" % ((i+idx), di))
self.oq.append(di)
yield self.dut.p_add_i.eq(sendlen)
yield
- o_p_ready = yield self.dut.p_ready_o
+ o_p_ready = yield self.dut.p_o_ready
while not o_p_ready:
yield
- o_p_ready = yield self.dut.p_ready_o
+ o_p_ready = yield self.dut.p_o_ready
yield self.dut.p_add_i.eq(0)
n_sub_o = yield self.dut.n_sub_o
print("recv", n_sub_o)
for j in range(n_sub_o):
- r = yield self.dut.data_o[j] # .src1_i
+ r = yield self.dut.o_data[j] # .src1_i
print("recvdata %x %s" % (r, repr(self.iq[i+j])))
assert r == self.oq[i+j]
yield
# branch is active (TODO: a better signal: this is over-using the
# go_write signal - actually the branch should not be "writing")
with m.If(br1.go_wr_i):
- sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+ sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
sync += bspec.active_i.eq(0)
comb += bspec.br_i.eq(1)
# branch occurs if data == 1, failed if data == 0
- comb += bspec.br_ok_i.eq(br1.data_o == 1)
+ comb += bspec.br_ok_i.eq(br1.o_data == 1)
for i in range(n_intfus):
# *expected* direction of the branch matched against *actual*
comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
comb += int_src2.ren.eq(intfus.src2_rsel_o)
# connect ALUs to regfule
- comb += int_dest.data_i.eq(cu.data_o)
- comb += cu.src1_i.eq(int_src1.data_o)
- comb += cu.src2_i.eq(int_src2.data_o)
+ comb += int_dest.i_data.eq(cu.o_data)
+ comb += cu.src1_i.eq(int_src1.o_data)
+ comb += cu.src2_i.eq(int_src2.o_data)
# connect ALU Computation Units
comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
# branch is active (TODO: a better signal: this is over-using the
# go_write signal - actually the branch should not be "writing")
with m.If(br1.go_wr_i):
- sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+ sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
sync += bspec.active_i.eq(0)
comb += bspec.br_i.eq(1)
# branch occurs if data == 1, failed if data == 0
- comb += bspec.br_ok_i.eq(br1.data_o == 1)
+ comb += bspec.br_ok_i.eq(br1.o_data == 1)
for i in range(n_intfus):
# *expected* direction of the branch matched against *actual*
comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
comb += int_src2.ren.eq(intfus.src2_rsel_o)
# connect ALUs to regfule
- comb += int_dest.data_i.eq(cu.data_o)
- comb += cu.src1_i.eq(int_src1.data_o)
- comb += cu.src2_i.eq(int_src2.data_o)
+ comb += int_dest.i_data.eq(cu.o_data)
+ comb += cu.src1_i.eq(int_src1.o_data)
+ comb += cu.src2_i.eq(int_src2.o_data)
# connect ALU Computation Units
comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
conflict of access, regfile read/write hazards are *not* analysed,
and consequently it is safer to wait for the Function Unit to complete
before allowing a new instruction to proceed.
+(update: actually this is being added now:
+https://bugs.libre-soc.org/show_bug.cgi?id=737)
"""
-from nmigen import Elaboratable, Module, Signal, ResetSignal, Cat, Mux
+from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
+ Const)
from nmigen.cli import rtlil
from openpower.decoder.power_decoder2 import PowerDecodeSubset
-from openpower.decoder.power_regspec_map import regspec_decode_read
-from openpower.decoder.power_regspec_map import regspec_decode_write
+from openpower.decoder.power_regspec_map import regspec_decode
from openpower.sv.svp64 import SVP64Rec
from nmutil.picker import PriorityPicker
from nmutil.util import treereduce
+from nmutil.singlepipe import ControlBase
-from soc.fu.compunits.compunits import AllFunctionUnits
+from soc.fu.compunits.compunits import AllFunctionUnits, LDSTFunctionUnit
from soc.regfile.regfiles import RegFiles
-from openpower.decoder.decode2execute1 import Decode2ToExecute1Type
-from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
from openpower.decoder.power_decoder2 import get_rdflags
-from openpower.decoder.decode2execute1 import Data
from soc.experiment.l0_cache import TstL0CacheBuffer # test only
from soc.config.test.test_loadstore import TestMemPspec
-from openpower.decoder.power_enums import MicrOp
-from soc.config.state import CoreState
+from openpower.decoder.power_enums import MicrOp, Function
+from soc.simple.core_data import CoreInput, CoreOutput
+from collections import defaultdict, namedtuple
import operator
from nmutil.util import rising_edge
+FUSpec = namedtuple("FUSpec", ["funame", "fu", "idx"])
+ByRegSpec = namedtuple("ByRegSpec", ["okflag", "regport", "wid", "specs"])
# helper function for reducing a list of signals down to a parallel
# ORed single signal.
-def ortreereduce(tree, attr="data_o"):
+def ortreereduce(tree, attr="o_data"):
return treereduce(tree, operator.or_, lambda x: getattr(x, attr))
return res # enumerate(res)
-class NonProductionCore(Elaboratable):
+# a hazard bitvector "remap" function which returns an AST expression
+# that remaps read/write hazard regfile port numbers to either a full
+# bitvector or a reduced subset one. SPR for example is reduced to a
+# single bit.
+# CRITICALLY-IMPORTANT NOTE: these bitvectors *have* to match up per
+# regfile! therefore the remapping is per regfile, *NOT* per regfile
+# port and certainly not based on whether it is a read port or write port.
+# note that any reductions here will result in degraded performance due
+# to conflicts, but at least it keeps the hazard matrix sizes down to "sane"
+def bitvector_remap(regfile, rfile, port):
+ # 8-bits (at the moment, no SVP64), CR is unary: no remap
+ if regfile == 'CR':
+ return port
+ # 3 bits, unary alrady: return the port
+ if regfile == 'XER':
+ return port
+ # 3 bits, unary: return the port
+ if regfile == 'XER':
+ return port
+ # 5 bits, unary: return the port
+ if regfile == 'STATE':
+ return port
+ # 9 bits (9 entries), might be unary already
+ if regfile == 'FAST':
+ if rfile.unary: # FAST might be unary already
+ return port
+ else:
+ return 1 << port
+ # 10 bits (!!) - reduce to one
+ if regfile == 'SPR':
+ if rfile.unary: # FAST might be unary already
+ return port
+ else:
+ return 1 << port
+ if regfile == 'INT':
+ if rfile.unary: # INT, check if unary/binary
+ return port
+ else:
+ return 1 << port
+
+
+# derive from ControlBase rather than have a separate Stage instance,
+# this is simpler to do
+class NonProductionCore(ControlBase):
def __init__(self, pspec):
self.pspec = pspec
self.regreduce_en = (hasattr(pspec, "regreduce") and
(pspec.regreduce == True))
+ # test to see if overlapping of instructions is allowed
+ # (not normally enabled for TestIssuer FSM but useful for checking
+ # the bitvector hazard detection, before doing In-Order)
+ self.allow_overlap = (hasattr(pspec, "allow_overlap") and
+ (pspec.allow_overlap == True))
+
+ # test core type
+ self.make_hazard_vecs = self.allow_overlap
+ self.core_type = "fsm"
+ if hasattr(pspec, "core_type"):
+ self.core_type = pspec.core_type
+
+ super().__init__(stage=self)
+
# single LD/ST funnel for memory access
self.l0 = l0 = TstL0CacheBuffer(pspec, n_units=1)
pi = l0.l0.dports[0]
# link LoadStore1 into MMU
mmu = self.fus.get_fu('mmu0')
+ ldst0 = self.fus.get_fu('ldst0')
print ("core pspec", pspec.ldst_ifacetype)
print ("core mmu", mmu)
- print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
if mmu is not None:
- mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+ lsi = l0.cmpi.lsmem.lsi # a LoadStore1 Interface object
+ print ("core lsmem.lsi", lsi)
+ mmu.alu.set_ldst_interface(lsi)
+ # urr store I-Cache in core so it is easier to get at
+ self.icache = lsi.icache
+
+ # alternative reset values for STATE regs. these probably shouldn't
+ # be set, here, instead have them done by Issuer. which they are.
+ # as well. because core.state overrides them. sigh.
+ self.msr_at_reset = 0x0
+ self.pc_at_reset = 0x0
+ if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int):
+ self.msr_at_reset = pspec.msr_reset
+ if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int):
+ self.pc_at_reset = pspec.pc_reset
+ state_resets = [self.pc_at_reset, # PC at reset
+ self.msr_at_reset, # MSR at reset
+ 0x0, # SVSTATE at reset
+ 0x0, # DEC at reset
+ 0x0] # TB at reset
# register files (yes plural)
- self.regs = RegFiles(pspec)
-
- # instruction decoder - needs a Trap-capable Record (captures EINT etc.)
- self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand,
- regreduce_en=self.regreduce_en)
-
- # SVP64 RA_OR_ZERO needs to know if the relevant EXTRA2/3 field is zero
- self.sv_a_nz = Signal()
-
- # state and raw instruction (and SVP64 ReMap fields)
- self.state = CoreState("core")
- self.raw_insn_i = Signal(32) # raw instruction
- self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
- if self.svp64_en:
- self.sv_rm = SVP64Rec(name="core_svp64_rm") # SVP64 RM field
- self.is_svp64_mode = Signal() # set if SVP64 mode is enabled
- self.use_svp64_ldst_dec = Signal() # use alternative LDST decoder
- self.sv_pred_sm = Signal() # TODO: SIMD width
- self.sv_pred_dm = Signal() # TODO: SIMD width
-
- # issue/valid/busy signalling
- self.ivalid_i = Signal(reset_less=True) # instruction is valid
- self.issue_i = Signal(reset_less=True)
- self.busy_o = Signal(name="corebusy_o", reset_less=True)
-
- # start/stop and terminated signalling
- self.core_terminate_o = Signal(reset=0) # indicates stopped
-
- # create per-FU instruction decoders (subsetted)
+ self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs,
+ state_resets=state_resets)
+
+ # set up input and output: unusual requirement to set data directly
+ # (due to the way that the core is set up in a different domain,
+ # see TestIssuer.setup_peripherals
+ self.p.i_data, self.n.o_data = self.new_specs(None)
+ self.i, self.o = self.p.i_data, self.n.o_data
+
+ # actual internal input data used (captured)
+ self.ireg = self.ispec()
+
+ # create per-FU instruction decoders (subsetted). these "satellite"
+ # decoders reduce wire fan-out from the one (main) PowerDecoder2
+ # (used directly by the trap unit) to the *twelve* (or more)
+ # Function Units. we can either have 32 wires (the instruction)
+ # to each, or we can have well over a 200 wire fan-out (to 12
+ # ALUs). it's an easy choice to make.
self.decoders = {}
self.des = {}
+ # eep, these should be *per FU* i.e. for FunctionUnitBaseMulti
+ # they should be shared (put into the ALU *once*).
+
for funame, fu in self.fus.fus.items():
f_name = fu.fnunit.name
fnunit = fu.fnunit.value
# TRAP decoder is the *main* decoder
self.trapunit = funame
continue
+ assert funame not in self.decoders
self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
final=True,
- state=self.state,
+ state=self.ireg.state,
svp64_en=self.svp64_en,
regreduce_en=self.regreduce_en)
self.des[funame] = self.decoders[funame].do
+ print ("create decoder subset", funame, opkls, self.des[funame])
+ # create per-Function Unit write-after-write hazard signals
+ # yes, really, this should have been added in ReservationStations
+ # but hey.
+ for funame, fu in self.fus.fus.items():
+ fu._waw_hazard = Signal(name="waw_%s" % funame)
+
+ # share the SPR decoder with the MMU if it exists
if "mmu0" in self.decoders:
self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
+ # allow pausing of the DEC/TB FSM back in Issuer, by spotting
+ # if there is an MTSPR instruction
+ self.pause_dec_tb = Signal()
+
+ # next 3 functions are Stage API Compliance
+ def setup(self, m, i):
+ pass
+
+ def ispec(self):
+ return CoreInput(self.pspec, self.svp64_en, self.regreduce_en)
+
+ def ospec(self):
+ return CoreOutput()
+
+ # elaborate function to create HDL
def elaborate(self, platform):
- m = Module()
+ m = super().elaborate(platform)
+
# for testing purposes, to cut down on build time in coriolis2
if hasattr(self.pspec, "nocore") and self.pspec.nocore == True:
x = Signal() # dummy signal
regs = self.regs
fus = self.fus.fus
+ # amalgamate write-hazards into a single top-level Signal
+ self.waw_hazard = Signal()
+ whaz = []
+ for funame, fu in self.fus.fus.items():
+ whaz.append(fu._waw_hazard)
+ comb += self.waw_hazard.eq(Cat(*whaz).bool())
+
# connect decoders
- for k, v in self.decoders.items():
- setattr(m.submodules, "dec_%s" % v.fn_name, v)
- comb += v.dec.raw_opcode_in.eq(self.raw_insn_i)
- comb += v.dec.bigendian.eq(self.bigendian_i)
- # sigh due to SVP64 RA_OR_ZERO detection connect these too
- comb += v.sv_a_nz.eq(self.sv_a_nz)
- if self.svp64_en:
- comb += v.pred_sm.eq(self.sv_pred_sm)
- comb += v.pred_dm.eq(self.sv_pred_dm)
- if k != self.trapunit:
- comb += v.sv_rm.eq(self.sv_rm) # pass through SVP64 ReMap
- comb += v.is_svp64_mode.eq(self.is_svp64_mode)
- # only the LDST PowerDecodeSubset *actually* needs to
- # know to use the alternative decoder. this is all
- # a terrible hack
- if k.lower().startswith("ldst"):
- comb += v.use_svp64_ldst_dec.eq(self.use_svp64_ldst_dec)
+ self.connect_satellite_decoders(m)
# ssh, cheat: trap uses the main decoder because of the rewriting
- self.des[self.trapunit] = self.e.do
-
- # connect up Function Units, then read/write ports
- fu_bitdict = self.connect_instruction(m)
- self.connect_rdports(m, fu_bitdict)
- self.connect_wrports(m, fu_bitdict)
+ self.des[self.trapunit] = self.ireg.e.do
+
+ # connect up Function Units, then read/write ports, and hazard conflict
+ self.issue_conflict = Signal()
+ fu_bitdict, fu_selected = self.connect_instruction(m)
+ raw_hazard = self.connect_rdports(m, fu_bitdict, fu_selected)
+ self.connect_wrports(m, fu_bitdict, fu_selected)
+ if self.allow_overlap:
+ comb += self.issue_conflict.eq(raw_hazard)
+
+ # note if an exception happened. in a pipelined or OoO design
+ # this needs to be accompanied by "shadowing" (or stalling)
+ el = []
+ for exc in self.fus.excs.values():
+ el.append(exc.happened)
+ if len(el) > 0: # at least one exception
+ comb += self.o.exc_happened.eq(Cat(*el).bool())
return m
+ def connect_satellite_decoders(self, m):
+ comb = m.d.comb
+ for k, v in self.decoders.items():
+ # connect each satellite decoder and give it the instruction.
+ # as subset decoders this massively reduces wire fanout given
+ # the large number of ALUs
+ m.submodules["dec_%s" % k] = v
+ comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
+ comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
+ # sigh due to SVP64 RA_OR_ZERO detection connect these too
+ comb += v.sv_a_nz.eq(self.ireg.sv_a_nz)
+ if not self.svp64_en:
+ continue
+ comb += v.pred_sm.eq(self.ireg.sv_pred_sm)
+ comb += v.pred_dm.eq(self.ireg.sv_pred_dm)
+ if k == self.trapunit:
+ continue
+ comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM
+ comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode)
+ # only the LDST PowerDecodeSubset *actually* needs to
+ # know to use the alternative decoder. this is all
+ # a terrible hack
+ if not k.lower().startswith("ldst"):
+ continue
+ comb += v.use_svp64_ldst_dec.eq( self.ireg.use_svp64_ldst_dec)
+
def connect_instruction(self, m):
"""connect_instruction
comb, sync = m.d.comb, m.d.sync
fus = self.fus.fus
- # enable-signals for each FU, get one bit for each FU (by name)
+ # indicate if core is busy
+ busy_o = self.o.busy_o
+ any_busy_o = self.o.any_busy_o
+
+ # connect up temporary copy of incoming instruction. the FSM will
+ # either blat the incoming instruction (if valid) into self.ireg
+ # or if the instruction could not be delivered, keep dropping the
+ # latched copy into ireg
+ ilatch = self.ispec()
+ self.instr_active = Signal()
+
+ # enable/busy-signals for each FU, get one bit for each FU (by name)
fu_enable = Signal(len(fus), reset_less=True)
+ fu_busy = Signal(len(fus), reset_less=True)
fu_bitdict = {}
+ fu_selected = {}
for i, funame in enumerate(fus.keys()):
fu_bitdict[funame] = fu_enable[i]
-
- # enable the required Function Unit based on the opcode decode
- # note: this *only* works correctly for simple core when one and
- # *only* one FU is allocated per instruction
+ fu_selected[funame] = fu_busy[i]
+
+ # identify function units and create a list by fnunit so that
+ # PriorityPickers can be created for selecting one of them that
+ # isn't busy at the time the incoming instruction needs passing on
+ by_fnunit = defaultdict(list)
+ for fname, member in Function.__members__.items():
+ for funame, fu in fus.items():
+ fnunit = fu.fnunit.value
+ if member.value & fnunit: # this FU handles this type of op
+ by_fnunit[fname].append((funame, fu)) # add by Function
+
+ # ok now just print out the list of FUs by Function, because we can
+ for fname, fu_list in by_fnunit.items():
+ print ("FUs by type", fname, fu_list)
+
+ # now create a PriorityPicker per FU-type such that only one
+ # non-busy FU will be picked
+ issue_pps = {}
+ fu_found = Signal() # take a note if no Function Unit was available
+ for fname, fu_list in by_fnunit.items():
+ i_pp = PriorityPicker(len(fu_list))
+ m.submodules['i_pp_%s' % fname] = i_pp
+ i_l = []
+ for i, (funame, fu) in enumerate(fu_list):
+ # match the decoded instruction (e.do.fn_unit) against the
+ # "capability" of this FU, gate that by whether that FU is
+ # busy, and drop that into the PriorityPicker.
+ # this will give us an output of the first available *non-busy*
+ # Function Unit (Reservation Statio) capable of handling this
+ # instruction.
+ fnunit = fu.fnunit.value
+ en_req = Signal(name="issue_en_%s" % funame, reset_less=True)
+ fnmatch = (self.ireg.e.do.fn_unit & fnunit).bool()
+ comb += en_req.eq(fnmatch & ~fu.busy_o &
+ self.instr_active)
+ i_l.append(en_req) # store in list for doing the Cat-trick
+ # picker output, gated by enable: store in fu_bitdict
+ po = Signal(name="o_issue_pick_"+funame) # picker output
+ comb += po.eq(i_pp.o[i] & i_pp.en_o)
+ comb += fu_bitdict[funame].eq(po)
+ comb += fu_selected[funame].eq(fu.busy_o | po)
+ # if we don't do this, then when there are no FUs available,
+ # the "p.o_ready" signal will go back "ok we accepted this
+ # instruction" which of course isn't true.
+ with m.If(i_pp.en_o):
+ comb += fu_found.eq(1)
+ # for each input, Cat them together and drop them into the picker
+ comb += i_pp.i.eq(Cat(*i_l))
+
+ # rdmask, which is for registers needs to come from the *main* decoder
for funame, fu in fus.items():
- fnunit = fu.fnunit.value
- enable = Signal(name="en_%s" % funame, reset_less=True)
- comb += enable.eq((self.e.do.fn_unit & fnunit).bool())
- comb += fu_bitdict[funame].eq(enable)
+ rdmask = get_rdflags(m, self.ireg.e, fu)
+ comb += fu.rdmaskn.eq(~rdmask)
# sigh - need a NOP counter
counter = Signal(2)
with m.If(counter != 0):
sync += counter.eq(counter - 1)
- comb += self.busy_o.eq(1)
-
- with m.If(self.ivalid_i): # run only when valid
- with m.Switch(self.e.do.insn_type):
- # check for ATTN: halt if true
- with m.Case(MicrOp.OP_ATTN):
- m.d.sync += self.core_terminate_o.eq(1)
-
- with m.Case(MicrOp.OP_NOP):
- sync += counter.eq(2)
- comb += self.busy_o.eq(1)
-
- with m.Default():
- # connect up instructions. only one enabled at a time
+ comb += busy_o.eq(1)
+
+ # default to reading from incoming instruction: may be overridden
+ # by copy from latch when "waiting"
+ comb += self.ireg.eq(self.i)
+ # always say "ready" except if overridden
+ comb += self.p.o_ready.eq(1)
+
+ with m.FSM():
+ with m.State("READY"):
+ with m.If(self.p.i_valid): # run only when valid
+ with m.Switch(self.ireg.e.do.insn_type):
+ # check for ATTN: halt if true
+ with m.Case(MicrOp.OP_ATTN):
+ m.d.sync += self.o.core_terminate_o.eq(1)
+
+ # fake NOP - this isn't really used (Issuer detects NOP)
+ with m.Case(MicrOp.OP_NOP):
+ sync += counter.eq(2)
+ comb += busy_o.eq(1)
+
+ with m.Default():
+ comb += self.instr_active.eq(1)
+ comb += self.p.o_ready.eq(0)
+ # connect instructions. only one enabled at a time
+ for funame, fu in fus.items():
+ do = self.des[funame]
+ enable = fu_bitdict[funame]
+
+ # run this FunctionUnit if enabled route op,
+ # issue, busy, read flags and mask to FU
+ with m.If(enable):
+ # operand comes from the *local* decoder
+ # do not actually issue, though, if there
+ # is a waw hazard. decoder has to still
+ # be asserted in order to detect that, tho
+ comb += fu.oper_i.eq_from(do)
+ if funame == 'mmu0':
+ # URRR this is truly dreadful.
+ # OP_FETCH_FAILED is a "fake" op.
+ # no instruction creates it. OP_TRAP
+ # uses the *main* decoder: this is
+ # a *Satellite* decoder that reacts
+ # on *insn_in*... not fake ops. gaah.
+ main_op = self.ireg.e.do
+ with m.If(main_op.insn_type ==
+ MicrOp.OP_FETCH_FAILED):
+ comb += fu.oper_i.insn_type.eq(
+ MicrOp.OP_FETCH_FAILED)
+ comb += fu.oper_i.fn_unit.eq(
+ Function.MMU)
+ # issue when valid (and no write-hazard)
+ comb += fu.issue_i.eq(~self.waw_hazard)
+ # instruction ok, indicate ready
+ comb += self.p.o_ready.eq(1)
+
+ if self.allow_overlap:
+ with m.If(~fu_found | self.waw_hazard):
+ # latch copy of instruction
+ sync += ilatch.eq(self.i)
+ comb += self.p.o_ready.eq(1) # accept
+ comb += busy_o.eq(1)
+ m.next = "WAITING"
+
+ with m.State("WAITING"):
+ comb += self.instr_active.eq(1)
+ comb += self.p.o_ready.eq(0)
+ comb += busy_o.eq(1)
+ # using copy of instruction, keep waiting until an FU is free
+ comb += self.ireg.eq(ilatch)
+ with m.If(fu_found): # wait for conflict to clear
+ # connect instructions. only one enabled at a time
for funame, fu in fus.items():
do = self.des[funame]
enable = fu_bitdict[funame]
- # run this FunctionUnit if enabled
- # route op, issue, busy, read flags and mask to FU
+ # run this FunctionUnit if enabled route op,
+ # issue, busy, read flags and mask to FU
with m.If(enable):
- # operand comes from the *local* decoder
+ # operand comes from the *local* decoder,
+ # which is asserted even if not issued,
+ # so that WaW-detection can check for hazards.
+ # only if the waw hazard is clear does the
+ # instruction actually get issued
comb += fu.oper_i.eq_from(do)
- #comb += fu.oper_i.eq_from_execute1(e)
- comb += fu.issue_i.eq(self.issue_i)
- comb += self.busy_o.eq(fu.busy_o)
- # rdmask, which is for registers, needs to come
- # from the *main* decoder
- rdmask = get_rdflags(self.e, fu)
- comb += fu.rdmaskn.eq(~rdmask)
-
- return fu_bitdict
-
- def connect_rdport(self, m, fu_bitdict, rdpickers, regfile, regname, fspec):
+ # issue when valid
+ comb += fu.issue_i.eq(~self.waw_hazard)
+ with m.If(~self.waw_hazard):
+ comb += self.p.o_ready.eq(1)
+ comb += busy_o.eq(0)
+ m.next = "READY"
+
+ print ("core: overlap allowed", self.allow_overlap)
+ # true when any FU is busy (including the cycle where it is perhaps
+ # to be issued - because that's what fu_busy is)
+ comb += any_busy_o.eq(fu_busy.bool())
+ if not self.allow_overlap:
+ # for simple non-overlap, if any instruction is busy, set
+ # busy output for core.
+ comb += busy_o.eq(any_busy_o)
+ else:
+ # sigh deal with a fun situation that needs to be investigated
+ # and resolved
+ with m.If(self.issue_conflict):
+ comb += busy_o.eq(1)
+ # make sure that LDST, SPR, MMU, Branch and Trap all say "busy"
+ # and do not allow overlap. these are all the ones that
+ # are non-forward-progressing: exceptions etc. that otherwise
+ # change CoreState for some reason (MSR, PC, SVSTATE)
+ for funame, fu in fus.items():
+ if (funame.lower().startswith('ldst') or
+ funame.lower().startswith('branch') or
+ funame.lower().startswith('mmu') or
+ funame.lower().startswith('spr') or
+ funame.lower().startswith('trap')):
+ with m.If(fu.busy_o):
+ comb += busy_o.eq(1)
+ # for SPR pipeline pause dec/tb FSM to avoid race condition
+ # TODO: really this should be much more sophisticated,
+ # spot MTSPR, spot that DEC/TB is what is to be updated.
+ # a job for PowerDecoder2, there
+ if funame.lower().startswith('spr'):
+ with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR
+ ):
+ comb += self.pause_dec_tb.eq(1)
+
+ # return both the function unit "enable" dict as well as the "busy".
+ # the "busy-or-issued" can be passed in to the Read/Write port
+ # connecters to give them permission to request access to regfiles
+ return fu_bitdict, fu_selected
+
+ def connect_rdport(self, m, fu_bitdict, fu_selected,
+ rdpickers, regfile, regname, fspec):
comb, sync = m.d.comb, m.d.sync
fus = self.fus.fus
regs = self.regs
print("read regfile", rpidx, regfile, regs.rf.keys(),
rfile, rfile.unary)
+ # for checking if the read port has an outstanding write
+ if self.make_hazard_vecs:
+ wv = regs.wv[regfile.lower()]
+ wvchk = wv.q_int # write-vec bit-level hazard check
+
+ # if a hazard is detected on this read port, simply blithely block
+ # every FU from reading on it. this is complete overkill but very
+ # simple for now.
+ hazard_detected = Signal(name="raw_%s_%s" % (regfile, rpidx))
+
fspecs = fspec
if not isinstance(fspecs, list):
fspecs = [fspecs]
rdflags = []
pplen = 0
- reads = []
ppoffs = []
for i, fspec in enumerate(fspecs):
# get the regfile specs for this regfile port
- (rf, read, write, wid, fuspec) = fspec
- print ("fpsec", i, fspec, len(fuspec))
+ print ("fpsec", i, fspec, len(fspec.specs))
+ name = "%s_%s_%d" % (regfile, regname, i)
ppoffs.append(pplen) # record offset for picker
- pplen += len(fuspec)
- name = "rdflag_%s_%s_%d" % (regfile, regname, i)
- rdflag = Signal(name=name, reset_less=True)
- comb += rdflag.eq(rf)
+ pplen += len(fspec.specs)
+ rdflag = Signal(name="rdflag_"+name, reset_less=True)
+ comb += rdflag.eq(fspec.okflag)
rdflags.append(rdflag)
- reads.append(read)
print ("pplen", pplen)
# create a priority picker to manage this port
rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
- setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick)
+ m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick
rens = []
addrs = []
+ wvens = []
+
for i, fspec in enumerate(fspecs):
- (rf, read, write, wid, fuspec) = fspec
+ (rf, _read, wid, fuspecs) = \
+ (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
# connect up the FU req/go signals, and the reg-read to the FU
# and create a Read Broadcast Bus
- for pi, (funame, fu, idx) in enumerate(fuspec):
+ for pi, fuspec in enumerate(fspec.specs):
+ (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
pi += ppoffs[i]
+ name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
+ fu_active = fu_selected[funame]
+ fu_issued = fu_bitdict[funame]
+
+ # get (or set up) a latched copy of read register number
+ # and (sigh) also the read-ok flag
+ # TODO: use nmutil latchregister
+ rhname = "%s_%s_%d" % (regfile, regname, i)
+ rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname),
+ reset_less=True)
+ if rhname not in fu.rf_latches:
+ rfl = Signal(name="rdflag_latch_%s_%s" % (funame, rhname))
+ fu.rf_latches[rhname] = rfl
+ with m.If(fu.issue_i):
+ sync += rfl.eq(rdflags[i])
+ else:
+ rfl = fu.rf_latches[rhname]
+
+ # now the register port
+ rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
+ read = Signal.like(_read, name="read_"+rname)
+ if rname not in fu.rd_latches:
+ rdl = Signal.like(_read, name="rdlatch_"+rname)
+ fu.rd_latches[rname] = rdl
+ with m.If(fu.issue_i):
+ sync += rdl.eq(_read)
+ else:
+ rdl = fu.rd_latches[rname]
+
+ # make the read immediately available on issue cycle
+ # after the read cycle, otherwies use the latched copy.
+ # this captures the regport and okflag on issue
+ with m.If(fu.issue_i):
+ comb += read.eq(_read)
+ comb += rdflag.eq(rdflags[i])
+ with m.Else():
+ comb += read.eq(rdl)
+ comb += rdflag.eq(rfl)
# connect request-read to picker input, and output to go-rd
- fu_active = fu_bitdict[funame]
- name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
- addr_en = Signal.like(reads[i], name="addr_en_"+name)
+ addr_en = Signal.like(read, name="addr_en_"+name)
pick = Signal(name="pick_"+name) # picker input
rp = Signal(name="rp_"+name) # picker output
delay_pick = Signal(name="dp_"+name) # read-enable "underway"
+ rhazard = Signal(name="rhaz_"+name)
# exclude any currently-enabled read-request (mask out active)
- comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
- ~delay_pick)
+ # entirely block anything hazarded from being picked
+ comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag &
+ ~delay_pick & ~rhazard)
comb += rdpick.i[pi].eq(pick)
comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
# if picked, select read-port "reg select" number to port
comb += rp.eq(rdpick.o[pi] & rdpick.en_o)
sync += delay_pick.eq(rp) # delayed "pick"
- comb += addr_en.eq(Mux(rp, reads[i], 0))
+ comb += addr_en.eq(Mux(rp, read, 0))
# the read-enable happens combinatorially (see mux-bus below)
# but it results in the data coming out on a one-cycle delay.
src = fu.src_i[idx]
print("reg connect widths",
regfile, regname, pi, funame,
- src.shape(), rport.data_o.shape())
+ src.shape(), rport.o_data.shape())
# all FUs connect to same port
- comb += src.eq(rport.data_o)
+ comb += src.eq(rport.o_data)
+
+ if not self.make_hazard_vecs:
+ continue
+
+ # read the write-hazard bitvector (wv) for any bit that is
+ wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name)
+ issue_active = Signal(name="rd_iactive_"+name)
+ # XXX combinatorial loop here
+ comb += issue_active.eq(fu_active & rdflag)
+ with m.If(issue_active):
+ if rfile.unary:
+ comb += wvchk_en.eq(read)
+ else:
+ comb += wvchk_en.eq(1<<read)
+ # if FU is busy (which doesn't get set at the same time as
+ # issue) and no hazard was detected, clear wvchk_en (i.e.
+ # stop checking for hazards). there is a loop here, but it's
+ # via a DFF, so is ok. some linters may complain, but hey.
+ with m.If(fu.busy_o & ~rhazard):
+ comb += wvchk_en.eq(0)
+
+ # read-hazard is ANDed with (filtered by) what is actually
+ # being requested.
+ comb += rhazard.eq((wvchk & wvchk_en).bool())
+
+ wvens.append(wvchk_en)
# or-reduce the muxed read signals
if rfile.unary:
comb += rport.ren.eq(Cat(*rens).bool())
print ("binary", regfile, rpidx, rport, rport.ren, rens, addrs)
- def connect_rdports(self, m, fu_bitdict):
+ if not self.make_hazard_vecs:
+ return Const(0) # declare "no hazards"
+
+ # enable the read bitvectors for this issued instruction
+ # and return whether any write-hazard bit is set
+ wvchk_and = Signal(len(wvchk), name="wv_chk_"+name)
+ comb += wvchk_and.eq(wvchk & ortreereduce_sig(wvens))
+ comb += hazard_detected.eq(wvchk_and.bool())
+ return hazard_detected
+
+ def connect_rdports(self, m, fu_bitdict, fu_selected):
"""connect read ports
orders the read regspecs into a dict-of-dicts, by regfile, by
comb, sync = m.d.comb, m.d.sync
fus = self.fus.fus
regs = self.regs
+ rd_hazard = []
# dictionary of lists of regfile read ports
- byregfiles_rd, byregfiles_rdspec = self.get_byregfiles(True)
+ byregfiles_rdspec = self.get_byregfiles(m, True)
# okaay, now we need a PriorityPicker per regfile per regfile port
# loootta pickers... peter piper picked a pack of pickled peppers...
rdpickers = {}
- for regfile, spec in byregfiles_rd.items():
- fuspecs = byregfiles_rdspec[regfile]
+ for regfile, fuspecs in byregfiles_rdspec.items():
rdpickers[regfile] = {}
# argh. an experiment to merge RA and RB in the INT regfile
fuspecs['fast1'].append(fuspecs.pop('fast3'))
# for each named regfile port, connect up all FUs to that port
+ # also return (and collate) hazard detection)
for (regname, fspec) in sort_fuspecs(fuspecs):
print("connect rd", regname, fspec)
- self.connect_rdport(m, fu_bitdict, rdpickers, regfile,
+ rh = self.connect_rdport(m, fu_bitdict, fu_selected,
+ rdpickers, regfile,
regname, fspec)
+ rd_hazard.append(rh)
+
+ return Cat(*rd_hazard).bool()
+
+ def make_hazards(self, m, regfile, rfile, wvclr, wvset,
+ funame, regname, idx,
+ addr_en, wp, fu, fu_active, wrflag, write,
+ fu_wrok):
+ """make_hazards: a setter and a clearer for the regfile write ports
- def connect_wrport(self, m, fu_bitdict, wrpickers, regfile, regname, fspec):
+ setter is at issue time (using PowerDecoder2 regfile write numbers)
+ clearer is at regfile write time (when FU has said what to write to)
+
+ there is *one* unusual case here which has to be dealt with:
+ when the Function Unit does *NOT* request a write to the regfile
+ (has its data.ok bit CLEARED). this is perfectly legitimate.
+ and a royal pain.
+ """
+ comb, sync = m.d.comb, m.d.sync
+ name = "%s_%s_%d" % (funame, regname, idx)
+
+ # connect up the bitvector write hazard. unlike the
+ # regfile writeports, a ONE must be written to the corresponding
+ # bit of the hazard bitvector (to indicate the existence of
+ # the hazard)
+
+ # the detection of what shall be written to is based
+ # on *issue*. it is delayed by 1 cycle so that instructions
+ # "addi 5,5,0x2" do not cause combinatorial loops due to
+ # fake-dependency on *themselves*. this will totally fail
+ # spectacularly when doing multi-issue
+ print ("write vector (for regread)", regfile, wvset)
+ wviaddr_en = Signal(len(wvset), name="wv_issue_addr_en_"+name)
+ issue_active = Signal(name="iactive_"+name)
+ sync += issue_active.eq(fu.issue_i & fu_active & wrflag)
+ with m.If(issue_active):
+ if rfile.unary:
+ comb += wviaddr_en.eq(write)
+ else:
+ comb += wviaddr_en.eq(1<<write)
+
+ # deal with write vector clear: this kicks in when the regfile
+ # is written to, and clears the corresponding bitvector entry
+ print ("write vector", regfile, wvclr)
+ wvaddr_en = Signal(len(wvclr), name="wvaddr_en_"+name)
+ if rfile.unary:
+ comb += wvaddr_en.eq(addr_en)
+ else:
+ with m.If(wp):
+ comb += wvaddr_en.eq(1<<addr_en)
+
+ # XXX ASSUME that LDSTFunctionUnit always sets the data it intends to
+ # this may NOT be the case when an exception occurs
+ if isinstance(fu, LDSTFunctionUnit):
+ return wvaddr_en, wviaddr_en
+
+ # okaaay, this is preparation for the awkward case.
+ # * latch a copy of wrflag when issue goes high.
+ # * when the fu_wrok (data.ok) flag is NOT set,
+ # but the FU is done, the FU is NEVER going to write
+ # so the bitvector has to be cleared.
+ latch_wrflag = Signal(name="latch_wrflag_"+name)
+ with m.If(~fu.busy_o):
+ sync += latch_wrflag.eq(0)
+ with m.If(fu.issue_i & fu_active):
+ sync += latch_wrflag.eq(wrflag)
+ with m.If(fu.alu_done_o & latch_wrflag & ~fu_wrok):
+ if rfile.unary:
+ comb += wvaddr_en.eq(write) # addr_en gated with wp, don't use
+ else:
+ comb += wvaddr_en.eq(1<<addr_en) # binary addr_en not gated
+
+ return wvaddr_en, wviaddr_en
+
+ def connect_wrport(self, m, fu_bitdict, fu_selected,
+ wrpickers, regfile, regname, fspec):
comb, sync = m.d.comb, m.d.sync
fus = self.fus.fus
regs = self.regs
- print("connect wr", regname, fspec)
rpidx = regname
# select the required write port. these are pre-defined sizes
- print(regfile, regs.rf.keys())
rfile = regs.rf[regfile.lower()]
wport = rfile.w_ports[rpidx]
+ print("connect wr", regname, "unary", rfile.unary, fspec)
+ print(regfile, regs.rf.keys())
+
+ # select the write-protection hazard vector. note that this still
+ # requires to WRITE to the hazard bitvector! read-requests need
+ # to RAISE the bitvector (set it to 1), which, duh, requires a WRITE
+ if self.make_hazard_vecs:
+ wv = regs.wv[regfile.lower()]
+ wvset = wv.s # write-vec bit-level hazard ctrl
+ wvclr = wv.r # write-vec bit-level hazard ctrl
+ wvchk = wv.q # write-after-write hazard check
+
fspecs = fspec
if not isinstance(fspecs, list):
fspecs = [fspecs]
pplen = 0
writes = []
ppoffs = []
+ wrflags = []
for i, fspec in enumerate(fspecs):
# get the regfile specs for this regfile port
- (rf, read, write, wid, fuspec) = fspec
- print ("fpsec", i, fspec, len(fuspec))
+ (wf, _write, wid, fuspecs) = \
+ (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+ print ("fpsec", i, "wrflag", wf, fspec, len(fuspecs))
ppoffs.append(pplen) # record offset for picker
- pplen += len(fuspec)
+ pplen += len(fuspecs)
+
+ name = "%s_%s_%d" % (regfile, regname, i)
+ wrflag = Signal(name="wr_flag_"+name)
+ if wf is not None:
+ comb += wrflag.eq(wf)
+ else:
+ comb += wrflag.eq(0)
+ wrflags.append(wrflag)
# create a priority picker to manage this port
wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
- setattr(m.submodules, "wrpick_%s_%s" % (regfile, rpidx), wrpick)
+ m.submodules["wrpick_%s_%s" % (regfile, rpidx)] = wrpick
wsigs = []
wens = []
+ wvsets = []
+ wvseten = []
+ wvclren = []
+ #wvens = [] - not needed: reading of writevec is permanently held hi
addrs = []
for i, fspec in enumerate(fspecs):
# connect up the FU req/go signals and the reg-read to the FU
# these are arbitrated by Data.ok signals
- (rf, read, write, wid, fuspec) = fspec
- for pi, (funame, fu, idx) in enumerate(fuspec):
+ (wf, _write, wid, fuspecs) = \
+ (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+ for pi, fuspec in enumerate(fspec.specs):
+ (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
+ fu_requested = fu_bitdict[funame]
pi += ppoffs[i]
+ name = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
+ # get (or set up) a write-latched copy of write register number
+ write = Signal.like(_write, name="write_"+name)
+ rname = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
+ if rname not in fu.wr_latches:
+ wrl = Signal.like(_write, name="wrlatch_"+rname)
+ fu.wr_latches[rname] = write
+ # do not depend on fu.issue_i here, it creates a
+ # combinatorial loop on waw checking. using the FU
+ # "enable" bitdict entry for this FU is sufficient,
+ # because the PowerDecoder2 read/write nums are
+ # valid continuously when the instruction is valid
+ with m.If(fu_requested):
+ sync += wrl.eq(_write)
+ comb += write.eq(_write)
+ with m.Else():
+ comb += write.eq(wrl)
+ else:
+ write = fu.wr_latches[rname]
# write-request comes from dest.ok
dest = fu.get_out(idx)
fu_dest_latch = fu.get_fu_out(idx) # latched output
- name = "wrflag_%s_%s_%d" % (funame, regname, idx)
- wrflag = Signal(name=name, reset_less=True)
- comb += wrflag.eq(dest.ok & fu.busy_o)
+ name = "%s_%s_%d" % (funame, regname, idx)
+ fu_wrok = Signal(name="fu_wrok_"+name, reset_less=True)
+ comb += fu_wrok.eq(dest.ok & fu.busy_o)
# connect request-write to picker input, and output to go-wr
- fu_active = fu_bitdict[funame]
- pick = fu.wr.rel_o[idx] & fu_active # & wrflag
+ fu_active = fu_selected[funame]
+ pick = fu.wr.rel_o[idx] & fu_active
comb += wrpick.i[pi].eq(pick)
# create a single-pulse go write from the picker output
- wr_pick = Signal()
+ wr_pick = Signal(name="wpick_%s_%s_%d" % (funame, regname, idx))
comb += wr_pick.eq(wrpick.o[pi] & wrpick.en_o)
comb += fu.go_wr_i[idx].eq(rising_edge(m, wr_pick))
# connect the regspec write "reg select" number to this port
# only if one FU actually requests (and is granted) the port
# will the write-enable be activated
- addr_en = Signal.like(write)
+ wname = "waddr_en_%s_%s_%d" % (funame, regname, idx)
+ addr_en = Signal.like(write, name=wname)
wp = Signal()
comb += wp.eq(wr_pick & wrpick.en_o)
comb += addr_en.eq(Mux(wp, write, 0))
# connect regfile port to input
print("reg connect widths",
regfile, regname, pi, funame,
- dest.shape(), wport.data_i.shape())
+ dest.shape(), wport.i_data.shape())
wsigs.append(fu_dest_latch)
+ # now connect up the bitvector write hazard
+ if not self.make_hazard_vecs:
+ continue
+ res = self.make_hazards(m, regfile, rfile, wvclr, wvset,
+ funame, regname, idx,
+ addr_en, wp, fu, fu_active,
+ wrflags[i], write, fu_wrok)
+ wvaddr_en, wv_issue_en = res
+ wvclren.append(wvaddr_en) # set only: no data => clear bit
+ wvseten.append(wv_issue_en) # set data same as enable
+
+ # read the write-hazard bitvector (wv) for any bit that is
+ fu_requested = fu_bitdict[funame]
+ wvchk_en = Signal(len(wvchk), name="waw_chk_addr_en_"+name)
+ issue_active = Signal(name="waw_iactive_"+name)
+ whazard = Signal(name="whaz_"+name)
+ if wf is None:
+ # XXX EEK! STATE regfile (branch) does not have an
+ # write-active indicator in regspec_decode_write()
+ print ("XXX FIXME waw_iactive", issue_active,
+ fu_requested, wf)
+ else:
+ # check bits from the incoming instruction. note (back
+ # in connect_instruction) that the decoder is held for
+ # us to be able to do this, here... *without* issue being
+ # held HI. we MUST NOT gate this with fu.issue_i or
+ # with fu_bitdict "enable": it would create a loop
+ comb += issue_active.eq(wf)
+ with m.If(issue_active):
+ if rfile.unary:
+ comb += wvchk_en.eq(write)
+ else:
+ comb += wvchk_en.eq(1<<write)
+ # if FU is busy (which doesn't get set at the same time as
+ # issue) and no hazard was detected, clear wvchk_en (i.e.
+ # stop checking for hazards). there is a loop here, but it's
+ # via a DFF, so is ok. some linters may complain, but hey.
+ with m.If(fu.busy_o & ~whazard):
+ comb += wvchk_en.eq(0)
+
+ # write-hazard is ANDed with (filtered by) what is actually
+ # being requested. the wvchk data is on a one-clock delay,
+ # and wvchk_en comes directly from the main decoder
+ comb += whazard.eq((wvchk & wvchk_en).bool())
+ with m.If(whazard):
+ comb += fu._waw_hazard.eq(1)
+
+ #wvens.append(wvchk_en)
+
# here is where we create the Write Broadcast Bus. simple, eh?
- comb += wport.data_i.eq(ortreereduce_sig(wsigs))
+ comb += wport.i_data.eq(ortreereduce_sig(wsigs))
if rfile.unary:
# for unary-addressed
comb += wport.wen.eq(ortreereduce_sig(wens))
comb += wport.addr.eq(ortreereduce_sig(addrs))
comb += wport.wen.eq(ortreereduce_sig(wens))
- def connect_wrports(self, m, fu_bitdict):
+ if not self.make_hazard_vecs:
+ return [], []
+
+ # return these here rather than set wvclr/wvset directly,
+ # because there may be more than one write-port to a given
+ # regfile. example: XER has a write-port for SO, CA, and OV
+ # and the *last one added* of those would overwrite the other
+ # two. solution: have connect_wrports collate all the
+ # or-tree-reduced bitvector set/clear requests and drop them
+ # in as a single "thing". this can only be done because the
+ # set/get is an unary bitvector.
+ print ("make write-vecs", regfile, regname, wvset, wvclr)
+ return (wvclren, # clear (regfile write)
+ wvseten) # set (issue time)
+
+ def connect_wrports(self, m, fu_bitdict, fu_selected):
"""connect write ports
orders the write regspecs into a dict-of-dicts, by regfile,
fus = self.fus.fus
regs = self.regs
# dictionary of lists of regfile write ports
- byregfiles_wr, byregfiles_wrspec = self.get_byregfiles(False)
+ byregfiles_wrspec = self.get_byregfiles(m, False)
# same for write ports.
# BLECH! complex code-duplication! BLECH!
wrpickers = {}
- for regfile, spec in byregfiles_wr.items():
- fuspecs = byregfiles_wrspec[regfile]
+ wvclrers = defaultdict(list)
+ wvseters = defaultdict(list)
+ for regfile, fuspecs in byregfiles_wrspec.items():
wrpickers[regfile] = {}
if self.regreduce_en:
if 'fast3' in fuspecs:
fuspecs['fast1'].append(fuspecs.pop('fast3'))
+ # collate these and record them by regfile because there
+ # are sometimes more write-ports per regfile
for (regname, fspec) in sort_fuspecs(fuspecs):
- self.connect_wrport(m, fu_bitdict, wrpickers,
+ wvclren, wvseten = self.connect_wrport(m,
+ fu_bitdict, fu_selected,
+ wrpickers,
regfile, regname, fspec)
-
- def get_byregfiles(self, readmode):
+ wvclrers[regfile.lower()] += wvclren
+ wvseters[regfile.lower()] += wvseten
+
+ if not self.make_hazard_vecs:
+ return
+
+ # for write-vectors: reduce the clr-ers and set-ers down to
+ # a single set of bits. otherwise if there are two write
+ # ports (on some regfiles), the last one doing comb += on
+ # the reg.wv[regfile] instance "wins" (and all others are ignored,
+ # whoops). if there was only one write-port per wv regfile this would
+ # not be an issue.
+ for regfile in wvclrers.keys():
+ wv = regs.wv[regfile]
+ wvset = wv.s # write-vec bit-level hazard ctrl
+ wvclr = wv.r # write-vec bit-level hazard ctrl
+ wvclren = wvclrers[regfile]
+ wvseten = wvseters[regfile]
+ comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+ comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
+
+ def get_byregfiles(self, m, readmode):
mode = "read" if readmode else "write"
regs = self.regs
fus = self.fus.fus
- e = self.e # decoded instruction to execute
+ e = self.ireg.e # decoded instruction to execute
+
+ # dictionary of dictionaries of lists/tuples of regfile ports.
+ # first key: regfile. second key: regfile port name
+ byregfiles_spec = defaultdict(dict)
- # dictionary of lists of regfile ports
- byregfiles = {}
- byregfiles_spec = {}
for (funame, fu) in fus.items():
+ # create in each FU a receptacle for the read/write register
+ # hazard numbers (and okflags for read). to be latched in
+ # connect_rd/write_ports
+ if readmode:
+ fu.rd_latches = {} # read reg number latches
+ fu.rf_latches = {} # read flag latches
+ else:
+ fu.wr_latches = {}
+
+ # construct regfile specs: read uses inspec, write outspec
print("%s ports for %s" % (mode, funame))
for idx in range(fu.n_src if readmode else fu.n_dst):
- if readmode:
- (regfile, regname, wid) = fu.get_in_spec(idx)
- else:
- (regfile, regname, wid) = fu.get_out_spec(idx)
+ (regfile, regname, wid) = fu.get_io_spec(readmode, idx)
print(" %d %s %s %s" % (idx, regfile, regname, str(wid)))
- if readmode:
- rdflag, read = regspec_decode_read(e, regfile, regname)
- write = None
- else:
- rdflag, read = None, None
- wrport, write = regspec_decode_write(e, regfile, regname)
- if regfile not in byregfiles:
- byregfiles[regfile] = {}
- byregfiles_spec[regfile] = {}
+
+ # the PowerDecoder2 (main one, not the satellites) contains
+ # the decoded regfile numbers. obtain these now
+ decinfo = regspec_decode(m, readmode, e, regfile, regname)
+ okflag, regport = decinfo.okflag, decinfo.regport
+
+ # construct the dictionary of regspec information by regfile
if regname not in byregfiles_spec[regfile]:
byregfiles_spec[regfile][regname] = \
- (rdflag, read, write, wid, [])
- # here we start to create "lanes"
- if idx not in byregfiles[regfile]:
- byregfiles[regfile][idx] = []
- fuspec = (funame, fu, idx)
- byregfiles[regfile][idx].append(fuspec)
- byregfiles_spec[regfile][regname][4].append(fuspec)
-
- # ok just print that out, for convenience
- for regfile, spec in byregfiles.items():
+ ByRegSpec(okflag, regport, wid, [])
+
+ # here we start to create "lanes" where each Function Unit
+ # requiring access to a given [single-contended resource]
+ # regfile port is appended to a list, so that PriorityPickers
+ # can be created to give uncontested access to it
+ fuspec = FUSpec(funame, fu, idx)
+ byregfiles_spec[regfile][regname].specs.append(fuspec)
+
+ # ok just print that all out, for convenience
+ for regfile, fuspecs in byregfiles_spec.items():
print("regfile %s ports:" % mode, regfile)
- fuspecs = byregfiles_spec[regfile]
for regname, fspec in fuspecs.items():
- [rdflag, read, write, wid, fuspec] = fspec
+ [okflag, regport, wid, fuspecs] = fspec
print(" rf %s port %s lane: %s" % (mode, regfile, regname))
- print(" %s" % regname, wid, read, write, rdflag)
- for (funame, fu, idx) in fuspec:
+ print(" %s" % regname, wid, okflag, regport)
+ for (funame, fu, idx) in fuspecs:
fusig = fu.src_i[idx] if readmode else fu.dest[idx]
- print(" ", funame, fu, idx, fusig)
+ print(" ", funame, fu.__class__.__name__, idx, fusig)
print()
- return byregfiles, byregfiles_spec
+ return byregfiles_spec
def __iter__(self):
yield from self.fus.ports()
- yield from self.e.ports()
+ yield from self.i.e.ports()
yield from self.l0.ports()
# TODO: regs
if __name__ == '__main__':
pspec = TestMemPspec(ldst_ifacetype='testpi',
imem_ifacetype='',
- addr_wid=48,
+ addr_wid=64,
+ allow_overlap=True,
mask_wid=8,
reg_wid=64)
dut = NonProductionCore(pspec)
--- /dev/null
+"""simple core input data
+
+"""
+
+from nmigen import Signal
+
+from openpower.sv.svp64 import SVP64Rec
+
+from openpower.decoder.decode2execute1 import Decode2ToExecute1Type
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from soc.config.state import CoreState
+
+
+class FetchInput:
+ """FetchInput: the input to the Fetch Unit
+
+ * pc - the current Program Counter
+
+ pretty much it for now!
+
+ """
+ def __init__(self):
+
+ self.pc = Signal(64)
+ self.msr = Signal(64)
+
+ def eq(self, i):
+ return [self.pc.eq(i.pc), self.msr.eq(i.msr),
+ ]
+
+
+class FetchOutput:
+ """FetchOutput: the output from the fetch unit: one single instruction
+
+ * state. this contains PC, MSR, and SVSTATE. this is crucial information.
+ (TODO: bigendian_i should really be read from the relevant MSR bit)
+
+ * the raw instruction. no decoding has been done - at all.
+
+ (TODO: provide a *pair* of raw instructions so that packet
+ inspection can be done, and SVP64 decoding and future 64-bit
+ prefix analysis carried out. however right now that is *not*
+ the focus)
+ """
+ def __init__(self): #, svp64_en):
+ #self.svp64_en = svp64_en
+
+ # state and raw instruction (and SVP64 ReMap fields)
+ self.state = CoreState("core_fetched")
+ self.raw_insn_i = Signal(32) # one raw instruction
+ self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
+
+ def eq(self, i):
+ return [self.state.eq(i.state),
+ self.raw_insn_i.eq(i.raw_insn_i),
+ self.bigendian_i.eq(i.bigendian_i),
+ ]
+
+
+class CoreInput:
+ """CoreInput: this is the input specification for Signals coming into core.
+
+ * state. this contains PC, MSR, and SVSTATE. this is crucial information.
+ (TODO: bigendian_i should really be read from the relevant MSR bit)
+
+ * the previously-decoded instruction goes into the Decode2Execute1Type
+ data structure. no need for Core to re-decode that. however note
+ that *satellite* decoders *are* part of Core.
+
+ * the raw instruction. this is used by satellite decoders internal to
+ Core, to provide Function-Unit-specific information. really, they
+ should be part of the actual ALU itself (in order to reduce wires),
+ but hey.
+
+ * other stuff is related to SVP64. the 24-bit SV REMAP field containing
+ Vector context, etc.
+ """
+ def __init__(self, pspec, svp64_en, regreduce_en):
+ self.pspec = pspec
+ self.svp64_en = svp64_en
+ self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand,
+ regreduce_en=regreduce_en)
+
+ # SVP64 RA_OR_ZERO needs to know if the relevant EXTRA2/3 field is zero
+ self.sv_a_nz = Signal()
+
+ # state and raw instruction (and SVP64 ReMap fields)
+ self.state = CoreState("core")
+ self.raw_insn_i = Signal(32) # raw instruction
+ self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
+ if svp64_en:
+ self.sv_rm = SVP64Rec(name="core_svp64_rm") # SVP64 RM field
+ self.is_svp64_mode = Signal() # set if SVP64 mode is enabled
+ self.use_svp64_ldst_dec = Signal() # use alternative LDST decoder
+ self.sv_pred_sm = Signal() # TODO: SIMD width
+ self.sv_pred_dm = Signal() # TODO: SIMD width
+
+ def eq(self, i):
+ res = [self.e.eq(i.e),
+ self.sv_a_nz.eq(i.sv_a_nz),
+ self.state.eq(i.state),
+ self.raw_insn_i.eq(i.raw_insn_i),
+ self.bigendian_i.eq(i.bigendian_i),
+ ]
+ if not self.svp64_en:
+ return res
+ res += [ self.sv_rm.eq(i.sv_rm),
+ self.is_svp64_mode.eq(i.is_svp64_mode),
+ self.use_svp64_ldst_dec.eq(i.use_svp64_ldst_dec),
+ self.sv_pred_sm.eq(i.sv_pred_sm),
+ self.sv_pred_dm.eq(i.sv_pred_dm),
+ ]
+ return res
+
+
+class CoreOutput:
+ def __init__(self):
+ # start/stop and terminated signalling
+ self.core_terminate_o = Signal() # indicates stopped
+ self.busy_o = Signal(name="corebusy_o") # ALU is busy, no input
+ self.any_busy_o = Signal(name="any_busy_o") # at least one ALU busy
+ self.exc_happened = Signal() # exception happened
+
+ def eq(self, i):
+ return [self.core_terminate_o.eq(i.core_terminate_o),
+ self.busy_o.eq(i.busy_o),
+ self.any_busy_o.eq(i.any_busy_o),
+ self.exc_happened.eq(i.exc_happened),
+ ]
+
+
--- /dev/null
+"""simple core issuer
+
+not in any way intended for production use. this runs a FSM that:
+
+* reads the Program Counter from StateRegs
+* reads an instruction from a fixed-size Test Memory
+* issues it to the Simple Core
+* waits for it to complete
+* increments the PC
+* does it all over again
+
+the purpose of this module is to verify the functional correctness
+of the Function Units in the absolute simplest and clearest possible
+way, and to at provide something that can be further incrementally
+improved.
+"""
+
+from nmigen import (Elaboratable, Module, Signal,
+ Mux, Const, Repl, Cat)
+from nmigen.cli import rtlil
+from nmigen.cli import main
+import sys
+
+from nmutil.singlepipe import ControlBase
+from soc.simple.core_data import FetchOutput, FetchInput
+
+from openpower.consts import MSR
+from openpower.decoder.power_enums import MicrOp
+from openpower.state import CoreState
+from soc.regfile.regfiles import StateRegs
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.experiment.icache import ICache
+
+from nmutil.util import rising_edge
+
+from soc.simple.issuer import TestIssuerBase
+
+def get_insn(f_instr_o, pc):
+ if f_instr_o.width == 32:
+ return f_instr_o
+ else:
+ # 64-bit: bit 2 of pc decides which word to select
+ return f_instr_o.word_select(pc[2], 32)
+
+
+# Fetch Finite State Machine.
+# WARNING: there are currently DriverConflicts but it's actually working.
+# TODO, here: everything that is global in nature, information from the
+# main TestIssuerInternal, needs to move to either ispec() or ospec().
+# not only that: TestIssuerInternal.imem can entirely move into here
+# because imem is only ever accessed inside the FetchFSM.
+class FetchFSM(ControlBase):
+ def __init__(self, allow_overlap, imem, core_rst,
+ pdecode2, cur_state,
+ dbg, core, svstate, nia):
+ self.allow_overlap = allow_overlap
+ self.imem = imem
+ self.core_rst = core_rst
+ self.pdecode2 = pdecode2
+ self.cur_state = cur_state
+ self.dbg = dbg
+ self.core = core
+ self.svstate = svstate
+ self.nia = nia
+
+ # set up pipeline ControlBase and allocate i/o specs
+ # (unusual: normally done by the Pipeline API)
+ super().__init__(stage=self)
+ self.p.i_data, self.n.o_data = self.new_specs(None)
+ self.i, self.o = self.p.i_data, self.n.o_data
+
+ # next 3 functions are Stage API Compliance
+ def setup(self, m, i):
+ pass
+
+ def ispec(self):
+ return FetchInput()
+
+ def ospec(self):
+ return FetchOutput()
+
+ def elaborate(self, platform):
+ """fetch FSM
+
+ this FSM performs fetch of raw instruction data, partial-decodes
+ it 32-bit at a time to detect SVP64 prefixes, and will optionally
+ read a 2nd 32-bit quantity if that occurs.
+ """
+ m = super().elaborate(platform)
+
+ dbg = self.dbg
+ core = self.core
+ pc = self.i.pc
+ msr = self.i.msr
+ svstate = self.svstate
+ nia = self.nia
+ fetch_pc_o_ready = self.p.o_ready
+ fetch_pc_i_valid = self.p.i_valid
+ fetch_insn_o_valid = self.n.o_valid
+ fetch_insn_i_ready = self.n.i_ready
+
+ comb = m.d.comb
+ sync = m.d.sync
+ pdecode2 = self.pdecode2
+ cur_state = self.cur_state
+ dec_opcode_o = pdecode2.dec.raw_opcode_in # raw opcode
+
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ # set priv / virt mode on I-Cache, sigh
+ if isinstance(self.imem, ICache):
+ comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+ comb += self.imem.i_in.virt_mode.eq(msr[MSR.DR])
+
+ with m.FSM(name='fetch_fsm'):
+
+ # allow fetch to not run at startup due to I-Cache reset not
+ # having time to settle. power-on-reset holds dbg.core_stopped_i
+ with m.State("PRE_IDLE"):
+ with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o):
+ m.next = "IDLE"
+
+ # waiting (zzz)
+ with m.State("IDLE"):
+ with m.If(~dbg.stopping_o & ~fetch_failed):
+ comb += fetch_pc_o_ready.eq(1)
+ with m.If(fetch_pc_i_valid & ~fetch_failed):
+ # instruction allowed to go: start by reading the PC
+ # capture the PC and also drop it into Insn Memory
+ # we have joined a pair of combinatorial memory
+ # lookups together. this is Generally Bad.
+ comb += self.imem.a_pc_i.eq(pc)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ sync += cur_state.pc.eq(pc)
+ sync += cur_state.svstate.eq(svstate) # and svstate
+ sync += cur_state.msr.eq(msr) # and msr
+
+ m.next = "INSN_READ" # move to "wait for bus" phase
+
+ # dummy pause to find out why simulation is not keeping up
+ with m.State("INSN_READ"):
+ if self.allow_overlap:
+ stopping = dbg.stopping_o
+ else:
+ stopping = Const(0)
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "IDLE"
+ with m.Else():
+ with m.If(self.imem.f_busy_o & ~fetch_failed): # zzz...
+ # busy but not fetch failed: stay in wait-read
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ with m.Else():
+ # not busy (or fetch failed!): instruction fetched
+ # when fetch failed, the instruction gets ignored
+ # by the decoder
+ insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+ # not SVP64 - 32-bit only
+ sync += nia.eq(cur_state.pc + 4)
+ sync += dec_opcode_o.eq(insn)
+ m.next = "INSN_READY"
+
+ with m.State("INSN_READY"):
+ # hand over the instruction, to be decoded
+ comb += fetch_insn_o_valid.eq(1)
+ with m.If(fetch_insn_i_ready):
+ m.next = "IDLE"
+
+ # whatever was done above, over-ride it if core reset is held
+ with m.If(self.core_rst):
+ sync += nia.eq(0)
+
+ return m
+
+
+class TestIssuerInternalInOrder(TestIssuerBase):
+ """TestIssuer - reads instructions from TestMemory and issues them
+
+ efficiency and speed is not the main goal here: functional correctness
+ and code clarity is. optimisations (which almost 100% interfere with
+ easy understanding) come later.
+ """
+
+ def issue_fsm(self, m, core, nia,
+ dbg, core_rst,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready):
+ """issue FSM
+
+ decode / issue FSM. this interacts with the "fetch" FSM
+ through fetch_insn_ready/valid (incoming) and fetch_pc_ready/valid
+ (outgoing). also interacts with the "execute" FSM
+ through exec_insn_ready/valid (outgoing) and exec_pc_ready/valid
+ (incoming).
+ SVP64 RM prefixes have already been set up by the
+ "fetch" phase, so execute is fairly straightforward.
+ """
+
+ comb = m.d.comb
+ sync = m.d.sync
+ pdecode2 = self.pdecode2
+ cur_state = self.cur_state
+
+ # temporaries
+ dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+
+ # note if an exception happened. in a pipelined or OoO design
+ # this needs to be accompanied by "shadowing" (or stalling)
+ exc_happened = self.core.o.exc_happened
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ # set to fault in decoder
+ # update (highest priority) instruction fault
+ rising_fetch_failed = rising_edge(m, fetch_failed)
+ with m.If(rising_fetch_failed):
+ sync += pdecode2.instr_fault.eq(1)
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ with m.FSM(name="issue_fsm"):
+
+ # sync with the "fetch" phase which is reading the instruction
+ # at this point, there is no instruction running, that
+ # could inadvertently update the PC.
+ with m.State("ISSUE_START"):
+ # reset instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ # wait on "core stop" release, before next fetch
+ # need to do this here, in case we are in a VL==0 loop
+ with m.If(~dbg.core_stop_o & ~core_rst):
+ comb += fetch_pc_i_valid.eq(1) # tell fetch to start
+ with m.If(fetch_pc_o_ready): # fetch acknowledged us
+ m.next = "INSN_WAIT"
+ with m.Else():
+ # tell core it's stopped, and acknowledge debug handshake
+ comb += dbg.core_stopped_i.eq(1)
+
+ # wait for an instruction to arrive from Fetch
+ with m.State("INSN_WAIT"):
+ if self.allow_overlap:
+ stopping = dbg.stopping_o
+ else:
+ stopping = Const(0)
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "ISSUE_START"
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ with m.Else():
+ comb += fetch_insn_i_ready.eq(1)
+ with m.If(fetch_insn_o_valid):
+ # loop into ISSUE_START if it's a SVP64 instruction
+ # and VL == 0. this because VL==0 is a for-loop
+ # from 0 to 0 i.e. always, always a NOP.
+ m.next = "DECODE_SV" # skip predication
+
+ # after src/dst step have been updated, we are ready
+ # to decode the instruction
+ with m.State("DECODE_SV"):
+ # decode the instruction
+ with m.If(~fetch_failed):
+ sync += pdecode2.instr_fault.eq(0)
+ sync += core.i.e.eq(pdecode2.e)
+ sync += core.i.state.eq(cur_state)
+ sync += core.i.raw_insn_i.eq(dec_opcode_i)
+ sync += core.i.bigendian_i.eq(self.core_bigendian_i)
+ # after decoding, reset any previous exception condition,
+ # allowing it to be set again during the next execution
+ sync += pdecode2.ldst_exc.eq(0)
+
+ m.next = "INSN_EXECUTE" # move to "execute"
+
+ # handshake with execution FSM, move to "wait" once acknowledged
+ with m.State("INSN_EXECUTE"):
+ comb += exec_insn_i_valid.eq(1) # trigger execute
+ with m.If(exec_insn_o_ready): # execute acknowledged us
+ m.next = "EXECUTE_WAIT"
+
+ with m.State("EXECUTE_WAIT"):
+ # wait on "core stop" release, at instruction end
+ # need to do this here, in case we are in a VL>1 loop
+ with m.If(~dbg.core_stop_o & ~core_rst):
+ comb += exec_pc_i_ready.eq(1)
+ # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+ # the exception info needs to be blatted into
+ # pdecode.ldst_exc, and the instruction "re-run".
+ # when ldst_exc.happened is set, the PowerDecoder2
+ # reacts very differently: it re-writes the instruction
+ # with a "trap" (calls PowerDecoder2.trap()) which
+ # will *overwrite* whatever was requested and jump the
+ # PC to the exception address, as well as alter MSR.
+ # nothing else needs to be done other than to note
+ # the change of PC and MSR (and, later, SVSTATE)
+ with m.If(exc_happened):
+ mmu = core.fus.get_exc("mmu0")
+ ldst = core.fus.get_exc("ldst0")
+ if mmu is not None:
+ with m.If(fetch_failed):
+ # instruction fetch: exception is from MMU
+ # reset instr_fault (highest priority)
+ sync += pdecode2.ldst_exc.eq(mmu)
+ sync += pdecode2.instr_fault.eq(0)
+ if flush_needed:
+ # request icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ with m.If(~fetch_failed):
+ # otherwise assume it was a LDST exception
+ sync += pdecode2.ldst_exc.eq(ldst)
+
+ with m.If(exec_pc_o_valid):
+
+ # return directly to Decode if Execute generated an
+ # exception.
+ with m.If(pdecode2.ldst_exc.happened):
+ m.next = "DECODE_SV"
+
+ # if MSR, PC or SVSTATE were changed by the previous
+ # instruction, go directly back to Fetch, without
+ # updating either MSR PC or SVSTATE
+ with m.Elif(self.msr_changed | self.pc_changed |
+ self.sv_changed):
+ m.next = "ISSUE_START"
+
+ with m.Else():
+ # before going back to fetch, update the PC state
+ # register with the NIA.
+ # ok here we are not reading the branch unit.
+ # TODO: this just blithely overwrites whatever
+ # pipeline updated the PC
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.i_data.eq(nia)
+ m.next = "ISSUE_START"
+
+ with m.Else():
+ comb += dbg.core_stopped_i.eq(1)
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+
+ def execute_fsm(self, m, core,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready):
+ """execute FSM
+
+ execute FSM. this interacts with the "issue" FSM
+ through exec_insn_ready/valid (incoming) and exec_pc_ready/valid
+ (outgoing). SVP64 RM prefixes have already been set up by the
+ "issue" phase, so execute is fairly straightforward.
+ """
+
+ comb = m.d.comb
+ sync = m.d.sync
+ pdecode2 = self.pdecode2
+
+ # temporaries
+ core_busy_o = core.n.o_data.busy_o # core is busy
+ core_ivalid_i = core.p.i_valid # instruction is valid
+
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ else:
+ fetch_failed = Const(0, 1)
+
+ with m.FSM(name="exec_fsm"):
+
+ # waiting for instruction bus (stays there until not busy)
+ with m.State("INSN_START"):
+ comb += exec_insn_o_ready.eq(1)
+ with m.If(exec_insn_i_valid):
+ comb += core_ivalid_i.eq(1) # instruction is valid/issued
+ sync += self.sv_changed.eq(0)
+ sync += self.pc_changed.eq(0)
+ sync += self.msr_changed.eq(0)
+ with m.If(core.p.o_ready): # only move if accepted
+ m.next = "INSN_ACTIVE" # move to "wait completion"
+
+ # instruction started: must wait till it finishes
+ with m.State("INSN_ACTIVE"):
+ # note changes to MSR, PC and SVSTATE
+ # XXX oops, really must monitor *all* State Regfile write
+ # ports looking for changes!
+ with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+ sync += self.sv_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+ sync += self.msr_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+ sync += self.pc_changed.eq(1)
+ with m.If(~core_busy_o): # instruction done!
+ comb += exec_pc_o_valid.eq(1)
+ with m.If(exec_pc_i_ready):
+ # when finished, indicate "done".
+ # however, if there was an exception, the instruction
+ # is *not* yet done. this is an implementation
+ # detail: we choose to implement exceptions by
+ # taking the exception information from the LDST
+ # unit, putting that *back* into the PowerDecoder2,
+ # and *re-running the entire instruction*.
+ # if we erroneously indicate "done" here, it is as if
+ # there were *TWO* instructions:
+ # 1) the failed LDST 2) a TRAP.
+ with m.If(~pdecode2.ldst_exc.happened &
+ ~fetch_failed):
+ comb += self.insn_done.eq(1)
+ m.next = "INSN_START" # back to fetch
+
+ def elaborate(self, platform):
+ m = super().elaborate(platform)
+ # convenience
+ comb, sync = m.d.comb, m.d.sync
+ cur_state = self.cur_state
+ pdecode2 = self.pdecode2
+ dbg = self.dbg
+ core = self.core
+
+ # set up peripherals and core
+ core_rst = self.core_rst
+
+ # indicate to outside world if any FU is still executing
+ comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing
+
+ # address of the next instruction, in the absence of a branch
+ # depends on the instruction size
+ nia = Signal(64)
+
+ # connect up debug signals
+ with m.If(core.o.core_terminate_o):
+ comb += dbg.terminate_i.eq(1)
+
+ # there are *THREE^WFOUR-if-SVP64-enabled* FSMs, fetch (32/64-bit)
+ # issue, decode/execute, now joined by "Predicate fetch/calculate".
+ # these are the handshake signals between each
+
+ # fetch FSM can run as soon as the PC is valid
+ fetch_pc_i_valid = Signal() # Execute tells Fetch "start next read"
+ fetch_pc_o_ready = Signal() # Fetch Tells SVSTATE "proceed"
+
+ # fetch FSM hands over the instruction to be decoded / issued
+ fetch_insn_o_valid = Signal()
+ fetch_insn_i_ready = Signal()
+
+ # issue FSM delivers the instruction to the be executed
+ exec_insn_i_valid = Signal()
+ exec_insn_o_ready = Signal()
+
+ # execute FSM, hands over the PC/SVSTATE back to the issue FSM
+ exec_pc_o_valid = Signal()
+ exec_pc_i_ready = Signal()
+
+ # the FSMs here are perhaps unusual in that they detect conditions
+ # then "hold" information, combinatorially, for the core
+ # (as opposed to using sync - which would be on a clock's delay)
+ # this includes the actual opcode, valid flags and so on.
+
+ # Fetch, then predicate fetch, then Issue, then Execute.
+ # Issue is where the VL for-loop # lives. the ready/valid
+ # signalling is used to communicate between the four.
+
+ # set up Fetch FSM
+ fetch = FetchFSM(self.allow_overlap,
+ self.imem, core_rst, pdecode2, cur_state,
+ dbg, core,
+ dbg.state.svstate, # combinatorially same
+ nia)
+ m.submodules.fetch = fetch
+ # connect up in/out data to existing Signals
+ comb += fetch.p.i_data.pc.eq(dbg.state.pc) # combinatorially same
+ comb += fetch.p.i_data.msr.eq(dbg.state.msr) # combinatorially same
+ # and the ready/valid signalling
+ comb += fetch_pc_o_ready.eq(fetch.p.o_ready)
+ comb += fetch.p.i_valid.eq(fetch_pc_i_valid)
+ comb += fetch_insn_o_valid.eq(fetch.n.o_valid)
+ comb += fetch.n.i_ready.eq(fetch_insn_i_ready)
+
+ self.issue_fsm(m, core, nia,
+ dbg, core_rst,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready)
+
+ self.execute_fsm(m, core,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready)
+
+ return m
+
+
+# XXX TODO: update this
+
+if __name__ == '__main__':
+ units = {'alu': 1, 'cr': 1, 'branch': 1, 'trap': 1, 'logical': 1,
+ 'spr': 1,
+ 'div': 1,
+ 'mul': 1,
+ 'shiftrot': 1
+ }
+ pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=64,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+ dut = TestIssuer(pspec)
+ vl = main(dut, ports=dut.ports(), name="test_issuer")
+
+ if len(sys.argv) == 1:
+ vl = rtlil.convert(dut, ports=dut.external_ports(), name="test_issuer")
+ with open("test_issuer.il", "w") as f:
+ f.write(vl)
from nmigen.cli import main
import sys
+from nmutil.singlepipe import ControlBase
+from soc.simple.core_data import FetchOutput, FetchInput
+
from nmigen.lib.coding import PriorityEncoder
from openpower.decoder.power_decoder import create_pdecode
from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
from openpower.decoder.decode2execute1 import Data
from openpower.decoder.power_enums import (MicrOp, SVP64PredInt, SVP64PredCR,
- SVP64PredMode)
+ SVP64PredMode)
from openpower.state import CoreState
-from openpower.consts import (CR, SVP64CROffs)
-from soc.experiment.testmem import TestMemory # test only for instructions
+from openpower.consts import (CR, SVP64CROffs, MSR)
+from soc.experiment.testmem import TestMemory # test only for instructions
from soc.regfile.regfiles import StateRegs, FastRegs
from soc.simple.core import NonProductionCore
from soc.config.test.test_loadstore import TestMemPspec
from soc.clock.select import ClockSelect
from soc.clock.dummypll import DummyPLL
from openpower.sv.svstate import SVSTATERec
-
+from soc.experiment.icache import ICache
from nmutil.util import rising_edge
+
def get_insn(f_instr_o, pc):
if f_instr_o.width == 32:
return f_instr_o
return f_instr_o.word_select(pc[2], 32)
# gets state input or reads from state regfile
-def state_get(m, core_rst, state_i, name, regfile, regnum):
+
+
+def state_get(m, res, core_rst, state_i, name, regfile, regnum):
comb = m.d.comb
sync = m.d.sync
- # read the PC
- res = Signal(64, reset_less=True, name=name)
+ # read the {insert state variable here}
res_ok_delay = Signal(name="%s_ok_delay" % name)
with m.If(~core_rst):
sync += res_ok_delay.eq(~state_i.ok)
# incoming override (start from pc_i)
comb += res.eq(state_i.data)
with m.Else():
- # otherwise read StateRegs regfile for PC...
- comb += regfile.ren.eq(1<<regnum)
+ # otherwise read StateRegs regfile for {insert state here}...
+ comb += regfile.ren.eq(1 << regnum)
# ... but on a 1-clock delay
with m.If(res_ok_delay):
- comb += res.eq(regfile.data_o)
- return res
+ comb += res.eq(regfile.o_data)
+
def get_predint(m, mask, name):
"""decode SVP64 predicate integer mask field to reg number and invert
comb += invert.eq(1)
return regread, invert, unary, all1s
+
def get_predcr(m, mask, name):
"""decode SVP64 predicate CR to reg number field and invert status
this is identical to _get_predcr in ISACaller
return idx, invert
-class TestIssuerInternal(Elaboratable):
- """TestIssuer - reads instructions from TestMemory and issues them
+class TestIssuerBase(Elaboratable):
+ """TestIssuerBase - common base class for Issuers
- efficiency and speed is not the main goal here: functional correctness
- and code clarity is. optimisations (which almost 100% interfere with
- easy understanding) come later.
+ takes care of power-on reset, peripherals, debug, DEC/TB,
+ and gets PC/MSR/SVSTATE from the State Regfile etc.
"""
+
def __init__(self, pspec):
+ # test if microwatt compatibility is to be enabled
+ self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+ (pspec.microwatt_compat == True))
+ self.alt_reset = Signal(reset_less=True) # not connected yet (microwatt)
+ # test if fabric compatibility is to be enabled
+ self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+ (pspec.fabric_compat == True))
+
+ if self.microwatt_compat or self.fabric_compat:
+
+ if hasattr(pspec, "microwatt_old"):
+ self.microwatt_old = pspec.microwatt_old
+ else:
+ self.microwatt_old = True # PLEASE DO NOT ALTER THIS
+
+ if hasattr(pspec, "microwatt_debug"):
+ self.microwatt_debug = pspec.microwatt_debug
+ else:
+ self.microwatt_debug = True # set to False when using an FPGA
+
# test is SVP64 is to be enabled
self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
# and if regfiles are reduced
self.regreduce_en = (hasattr(pspec, "regreduce") and
- (pspec.regreduce == True))
+ (pspec.regreduce == True))
+
+ # and if overlap requested
+ self.allow_overlap = (hasattr(pspec, "allow_overlap") and
+ (pspec.allow_overlap == True))
+
+ # and get the core domain
+ self.core_domain = "coresync"
+ if (hasattr(pspec, "core_domain") and
+ isinstance(pspec.core_domain, str)):
+ self.core_domain = pspec.core_domain
# JTAG interface. add this right at the start because if it's
# added it *modifies* the pspec, by adding enable/disable signals
# for parts of the rest of the core
self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag'
- self.dbg_domain = "sync" # sigh "dbgsunc" too problematic
- #self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
+ #self.dbg_domain = "sync" # sigh "dbgsunc" too problematic
+ self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
if self.jtag_en:
- # XXX MUST keep this up-to-date with litex, and
+ # XXX MUST keep this up-to-date with fabric, and
# soc-cocotb-sim, and err.. all needs sorting out, argh
subset = ['uart',
'mtwi',
'eint', 'gpio', 'mspi0',
# 'mspi1', - disabled for now
# 'pwm', 'sd0', - disabled for now
- 'sdr']
+ 'sdr']
self.jtag = JTAG(get_pinspecs(subset=subset),
domain=self.dbg_domain)
# add signals to pspec to enable/disable icache and dcache
self.sram4k = []
for i in range(4):
self.sram4k.append(SPBlock512W64B8W(name="sram4k_%d" % i,
- #features={'err'}
+ # features={'err'}
))
# add interrupt controller?
self.xics_icp = XICS_ICP()
self.xics_ics = XICS_ICS()
self.int_level_i = self.xics_ics.int_level_i
+ else:
+ self.ext_irq = Signal()
# add GPIO peripheral?
self.gpio = hasattr(pspec, "gpio") and pspec.gpio == True
# main instruction core. suitable for prototyping / demo only
self.core = core = NonProductionCore(pspec)
- self.core_rst = ResetSignal("coresync")
+ self.core_rst = ResetSignal(self.core_domain)
# instruction decoder. goes into Trap Record
- pdecode = create_pdecode()
- self.cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
- self.pdecode2 = PowerDecode2(pdecode, state=self.cur_state,
+ #pdecode = create_pdecode()
+ self.cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
+ self.pdecode2 = PowerDecode2(None, state=self.cur_state,
opkls=IssuerDecode2ToOperand,
svp64_en=self.svp64_en,
regreduce_en=self.regreduce_en)
+ pdecode = self.pdecode2.dec
+
if self.svp64_en:
- self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix
+ self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix
+
+ self.update_svstate = Signal() # set this if updating svstate
+ self.new_svstate = new_svstate = SVSTATERec("new_svstate")
# Test Instruction memory
+ if hasattr(core, "icache"):
+ # XXX BLECH! use pspec to transfer the I-Cache to ConfigFetchUnit
+ # truly dreadful. needs a huge reorg.
+ pspec.icache = core.icache
self.imem = ConfigFetchUnit(pspec).fu
# DMI interface
self.dbg = CoreDebug()
+ self.dbg_rst_i = Signal(reset_less=True)
# instruction go/monitor
self.pc_o = Signal(64, reset_less=True)
- self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me"
- self.svstate_i = Data(32, "svstate_i") # ditto
- self.core_bigendian_i = Signal() # TODO: set based on MSR.LE
+ self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me"
+ self.msr_i = Data(64, "msr_i") # set "ok" to indicate "please change me"
+ self.svstate_i = Data(64, "svstate_i") # ditto
+ self.core_bigendian_i = Signal() # TODO: set based on MSR.LE
self.busy_o = Signal(reset_less=True)
self.memerr_o = Signal(reset_less=True)
# STATE regfile read /write ports for PC, MSR, SVSTATE
staterf = self.core.regs.rf['state']
- self.state_r_pc = staterf.r_ports['cia'] # PC rd
- self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr
- self.state_r_msr = staterf.r_ports['msr'] # MSR rd
- self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd
- self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr
+ self.state_r_msr = staterf.r_ports['msr'] # MSR rd
+ self.state_r_pc = staterf.r_ports['cia'] # PC rd
+ self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd
+
+ self.state_w_msr = staterf.w_ports['d_wr2'] # MSR wr
+ self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr
+ self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr
# DMI interface access
intrf = self.core.regs.rf['int']
+ fastrf = self.core.regs.rf['fast']
crrf = self.core.regs.rf['cr']
xerrf = self.core.regs.rf['xer']
- self.int_r = intrf.r_ports['dmi'] # INT read
- self.cr_r = crrf.r_ports['full_cr_dbg'] # CR read
- self.xer_r = xerrf.r_ports['full_xer'] # XER read
+ self.int_r = intrf.r_ports['dmi'] # INT DMI read
+ self.cr_r = crrf.r_ports['full_cr_dbg'] # CR DMI read
+ self.xer_r = xerrf.r_ports['full_xer'] # XER DMI read
+ self.fast_r = fastrf.r_ports['dmi'] # FAST DMI read
if self.svp64_en:
# for predication
- self.int_pred = intrf.r_ports['pred'] # INT predicate read
- self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
+ self.int_pred = intrf.r_ports['pred'] # INT predicate read
+ self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
# hack method of keeping an eye on whether branch/trap set the PC
self.state_nia = self.core.regs.rf['state'].w_ports['nia']
self.state_nia.wen.name = 'state_nia_wen'
+ # and whether SPR pipeline sets DEC or TB (fu/spr/main_stage.py)
+ self.state_spr = self.core.regs.rf['state'].w_ports['state1']
# pulse to synchronize the simulator at instruction end
self.insn_done = Signal()
+ # indicate any instruction still outstanding, in execution
+ self.any_busy = Signal()
+
if self.svp64_en:
# store copies of predicate masks
self.srcmask = Signal(64)
self.dstmask = Signal(64)
- def fetch_fsm(self, m, core, pc, svstate, nia, is_svp64_mode,
- fetch_pc_ready_o, fetch_pc_valid_i,
- fetch_insn_valid_o, fetch_insn_ready_i):
+ # sigh, the wishbone addresses are not wishbone-compliant
+ # in old versions of microwatt, tplaten_3d_game is a new one
+ if self.microwatt_compat or self.fabric_compat:
+ self.ibus_adr = Signal(32, name='wishbone_insn_out.adr')
+ self.dbus_adr = Signal(32, name='wishbone_data_out.adr')
+
+ # add an output of the PC and instruction, and whether it was requested
+ # this is for verilator debug purposes
+ if self.microwatt_compat or self.fabric_compat:
+ self.nia = Signal(64)
+ self.msr_o = Signal(64)
+ self.nia_req = Signal(1)
+ self.insn = Signal(32)
+ self.ldst_req = Signal(1)
+ self.ldst_addr = Signal(1)
+
+ # for pausing dec/tb during an SPR pipeline event, this
+ # ensures that an SPR write (mtspr) to TB or DEC does not
+ # get overwritten by the DEC/TB FSM
+ self.pause_dec_tb = Signal()
+
+ def setup_peripherals(self, m):
+ comb, sync = m.d.comb, m.d.sync
+
+ # okaaaay so the debug module must be in coresync clock domain
+ # but NOT its reset signal. to cope with this, set every single
+ # submodule explicitly in coresync domain, debug and JTAG
+ # in their own one but using *external* reset.
+ csd = DomainRenamer(self.core_domain)
+ dbd = DomainRenamer(self.dbg_domain)
+
+ if self.microwatt_compat or self.fabric_compat:
+ m.submodules.core = core = self.core
+ else:
+ m.submodules.core = core = csd(self.core)
+
+ # this _so_ needs sorting out. ICache is added down inside
+ # LoadStore1 and is already a submodule of LoadStore1
+ if not isinstance(self.imem, ICache):
+ m.submodules.imem = imem = csd(self.imem)
+
+ # set up JTAG Debug Module (in correct domain)
+ m.submodules.dbg = dbg = dbd(self.dbg)
+ if self.jtag_en:
+ m.submodules.jtag = jtag = dbd(self.jtag)
+ # TODO: UART2GDB mux, here, from external pin
+ # see https://bugs.libre-soc.org/show_bug.cgi?id=499
+ sync += dbg.dmi.connect_to(jtag.dmi)
+
+ # fixup the clocks in microwatt-compat mode (but leave resets alone
+ # so that microwatt soc.vhdl can pull a reset on the core or DMI
+ # can do it, just like in TestIssuer)
+ if self.microwatt_compat or self.fabric_compat:
+ intclk = ClockSignal(self.core_domain)
+ dbgclk = ClockSignal(self.dbg_domain)
+ if self.core_domain != 'sync':
+ comb += intclk.eq(ClockSignal())
+ if self.dbg_domain != 'sync':
+ comb += dbgclk.eq(ClockSignal())
+
+ # if using old version of microwatt
+ # drop the first 3 bits of the incoming wishbone addresses
+ if self.microwatt_compat or self.fabric_compat:
+ ibus = self.imem.ibus
+ dbus = self.core.l0.cmpi.wb_bus()
+ if self.microwatt_old:
+ comb += self.ibus_adr.eq(Cat(Const(0, 3), ibus.adr))
+ comb += self.dbus_adr.eq(Cat(Const(0, 3), dbus.adr))
+ else:
+ comb += self.ibus_adr.eq(ibus.adr)
+ comb += self.dbus_adr.eq(dbus.adr)
+ if self.microwatt_debug:
+ # microwatt verilator debug purposes
+ pi = self.core.l0.cmpi.pi.pi
+ comb += self.ldst_req.eq(pi.addr_ok_o)
+ comb += self.ldst_addr.eq(pi.addr)
+
+ cur_state = self.cur_state
+
+ # 4x 4k SRAM blocks. these simply "exist", they get routed in fabric
+ if self.sram4x4k:
+ for i, sram in enumerate(self.sram4k):
+ m.submodules["sram4k_%d" % i] = csd(sram)
+ comb += sram.enable.eq(self.wb_sram_en)
+
+ # XICS interrupt handler
+ if self.xics:
+ m.submodules.xics_icp = icp = csd(self.xics_icp)
+ m.submodules.xics_ics = ics = csd(self.xics_ics)
+ comb += icp.ics_i.eq(ics.icp_o) # connect ICS to ICP
+ sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core
+ else:
+ sync += cur_state.eint.eq(self.ext_irq) # connect externally
+
+ # GPIO test peripheral
+ if self.gpio:
+ m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
+
+ # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
+ # XXX causes fabric ECP5 test to get wrong idea about input and output
+ # (but works with verilator sim *sigh*)
+ # if self.gpio and self.xics:
+ # comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
+
+ # instruction decoder
+ pdecode = create_pdecode()
+ m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
+ if self.svp64_en:
+ m.submodules.svp64 = svp64 = csd(self.svp64)
+
+ # clock delay power-on reset
+ cd_por = ClockDomain(reset_less=True)
+ cd_sync = ClockDomain()
+ m.domains += cd_por, cd_sync
+ core_sync = ClockDomain(self.core_domain)
+ if self.core_domain != "sync":
+ m.domains += core_sync
+ if self.dbg_domain != "sync":
+ dbg_sync = ClockDomain(self.dbg_domain)
+ m.domains += dbg_sync
+
+ # create a delay, but remember it is in the power-on-reset clock domain!
+ ti_rst = Signal(reset_less=True)
+ delay = Signal(range(4), reset=3)
+ stop_delay = Signal(range(16), reset=5)
+ with m.If(delay != 0):
+ m.d.por += delay.eq(delay - 1) # decrement... in POR domain!
+ with m.If(stop_delay != 0):
+ m.d.por += stop_delay.eq(stop_delay - 1) # likewise
+ comb += cd_por.clk.eq(ClockSignal())
+
+ # power-on reset delay
+ core_rst = ResetSignal(self.core_domain)
+ if self.core_domain != "sync":
+ comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
+ comb += core_rst.eq(ti_rst)
+ else:
+ with m.If(delay != 0 | dbg.core_rst_o):
+ comb += core_rst.eq(1)
+ with m.If(stop_delay != 0):
+ # run DMI core-stop as well but on an extra couple of cycles
+ comb += dbg.core_stopped_i.eq(1)
+
+ # connect external reset signal to DMI Reset
+ if self.dbg_domain != "sync":
+ dbg_rst = ResetSignal(self.dbg_domain)
+ comb += dbg_rst.eq(self.dbg_rst_i)
+
+ # busy/halted signals from core
+ core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o # core is busy
+ comb += self.busy_o.eq(core_busy_o)
+ comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
+
+ # temporary hack: says "go" immediately for both address gen and ST
+ # XXX: st.go_i is set to 1 cycle delay to reduce combinatorial chains
+ l0 = core.l0
+ ldst = core.fus.fus['ldst0']
+ st_go_edge = rising_edge(m, ldst.st.rel_o)
+ # link addr-go direct to rel
+ m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)
+ m.d.sync += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel
+
+ def do_dmi(self, m, dbg):
+ """deals with DMI debug requests
+
+ currently only provides read requests for the INT regfile, CR and XER
+ it will later also deal with *writing* to these regfiles.
+ """
+ comb = m.d.comb
+ sync = m.d.sync
+ dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
+ d_fast = dbg.d_fast
+ intrf = self.core.regs.rf['int']
+ fastrf = self.core.regs.rf['fast']
+
+ with m.If(d_reg.req): # request for regfile access being made
+ # TODO: error-check this
+ # XXX should this be combinatorial? sync better?
+ if intrf.unary:
+ comb += self.int_r.ren.eq(1 << d_reg.addr)
+ else:
+ comb += self.int_r.addr.eq(d_reg.addr)
+ comb += self.int_r.ren.eq(1)
+ d_reg_delay = Signal()
+ sync += d_reg_delay.eq(d_reg.req)
+ with m.If(d_reg_delay):
+ # data arrives one clock later
+ comb += d_reg.data.eq(self.int_r.o_data)
+ comb += d_reg.ack.eq(1)
+
+ # fast regfile
+ with m.If(d_fast.req): # request for regfile access being made
+ if fastrf.unary:
+ comb += self.fast_r.ren.eq(1 << d_fast.addr)
+ else:
+ comb += self.fast_r.addr.eq(d_fast.addr)
+ comb += self.fast_r.ren.eq(1)
+ d_fast_delay = Signal()
+ sync += d_fast_delay.eq(d_fast.req)
+ with m.If(d_fast_delay):
+ # data arrives one clock later
+ comb += d_fast.data.eq(self.fast_r.o_data)
+ comb += d_fast.ack.eq(1)
+
+ # sigh same thing for CR debug
+ with m.If(d_cr.req): # request for regfile access being made
+ comb += self.cr_r.ren.eq(0b11111111) # enable all
+ d_cr_delay = Signal()
+ sync += d_cr_delay.eq(d_cr.req)
+ with m.If(d_cr_delay):
+ # data arrives one clock later
+ comb += d_cr.data.eq(self.cr_r.o_data)
+ comb += d_cr.ack.eq(1)
+
+ # aaand XER...
+ with m.If(d_xer.req): # request for regfile access being made
+ comb += self.xer_r.ren.eq(0b111111) # enable all
+ d_xer_delay = Signal()
+ sync += d_xer_delay.eq(d_xer.req)
+ with m.If(d_xer_delay):
+ # data arrives one clock later
+ comb += d_xer.data.eq(self.xer_r.o_data)
+ comb += d_xer.ack.eq(1)
+
+ def tb_dec_fsm(self, m, spr_dec):
+ """tb_dec_fsm
+
+ this is a FSM for updating either dec or tb. it runs alternately
+ DEC, TB, DEC, TB. note that SPR pipeline could have written a new
+ value to DEC, however the regfile has "passthrough" on it so this
+ *should* be ok.
+
+ see v3.0B p1097-1099 for Timer Resource and p1065 and p1076
+ """
+
+ comb, sync = m.d.comb, m.d.sync
+ state_rf = self.core.regs.rf['state']
+ state_r_dectb = state_rf.r_ports['issue'] # DEC/TB
+ state_w_dectb = state_rf.w_ports['issue'] # DEC/TB
+
+
+ with m.FSM() as fsm:
+
+ # initiates read of current DEC
+ with m.State("DEC_READ"):
+ comb += state_r_dectb.ren.eq(1<<StateRegs.DEC)
+ with m.If(~self.pause_dec_tb):
+ m.next = "DEC_WRITE"
+
+ # waits for DEC read to arrive (1 cycle), updates with new value
+ # respects if dec/tb writing has been paused
+ with m.State("DEC_WRITE"):
+ with m.If(self.pause_dec_tb):
+ # if paused, return to reading
+ m.next = "DEC_READ"
+ with m.Else():
+ new_dec = Signal(64)
+ # TODO: MSR.LPCR 32-bit decrement mode
+ comb += new_dec.eq(state_r_dectb.o_data - 1)
+ comb += state_w_dectb.wen.eq(1<<StateRegs.DEC)
+ comb += state_w_dectb.i_data.eq(new_dec)
+ # copy to cur_state for decoder, for an interrupt
+ sync += spr_dec.eq(new_dec)
+ m.next = "TB_READ"
+
+ # initiates read of current TB
+ with m.State("TB_READ"):
+ comb += state_r_dectb.ren.eq(1<<StateRegs.TB)
+ with m.If(~self.pause_dec_tb):
+ m.next = "TB_WRITE"
+
+ # waits for read TB to arrive, initiates write of current TB
+ # respects if dec/tb writing has been paused
+ with m.State("TB_WRITE"):
+ with m.If(self.pause_dec_tb):
+ # if paused, return to reading
+ m.next = "TB_READ"
+ with m.Else():
+ new_tb = Signal(64)
+ comb += new_tb.eq(state_r_dectb.o_data + 1)
+ comb += state_w_dectb.wen.eq(1<<StateRegs.TB)
+ comb += state_w_dectb.i_data.eq(new_tb)
+ m.next = "DEC_READ"
+
+ return m
+
+ def elaborate(self, platform):
+ m = Module()
+ # convenience
+ comb, sync = m.d.comb, m.d.sync
+ cur_state = self.cur_state
+ pdecode2 = self.pdecode2
+ dbg = self.dbg
+
+ # set up peripherals and core
+ core_rst = self.core_rst
+ self.setup_peripherals(m)
+
+ # reset current state if core reset requested
+ with m.If(core_rst):
+ m.d.sync += self.cur_state.eq(0)
+ # and, sigh, set configured values, which are also done in regfile
+ # XXX ??? what the hell is the shift for??
+ m.d.sync += self.cur_state.pc.eq(self.core.pc_at_reset)
+ m.d.sync += self.cur_state.msr.eq(self.core.msr_at_reset)
+
+ # check halted condition: requested PC to execute matches DMI stop addr
+ # and immediately stop. address of 0xffff_ffff_ffff_ffff can never
+ # match
+ halted = Signal()
+ comb += halted.eq(dbg.stop_addr_o == dbg.state.pc)
+ with m.If(halted):
+ comb += dbg.core_stopped_i.eq(1)
+ comb += dbg.terminate_i.eq(1)
+
+ # PC and instruction from I-Memory
+ comb += self.pc_o.eq(cur_state.pc)
+ self.pc_changed = Signal() # note write to PC
+ self.msr_changed = Signal() # note write to MSR
+ self.sv_changed = Signal() # note write to SVSTATE
+
+ # read state either from incoming override or from regfile
+ state = CoreState("get") # current state (MSR/PC/SVSTATE)
+ state_get(m, state.msr, core_rst, self.msr_i,
+ "msr", # read MSR
+ self.state_r_msr, StateRegs.MSR)
+ state_get(m, state.pc, core_rst, self.pc_i,
+ "pc", # read PC
+ self.state_r_pc, StateRegs.PC)
+ state_get(m, state.svstate, core_rst, self.svstate_i,
+ "svstate", # read SVSTATE
+ self.state_r_sv, StateRegs.SVSTATE)
+
+ # don't write pc every cycle
+ comb += self.state_w_pc.wen.eq(0)
+ comb += self.state_w_pc.i_data.eq(0)
+
+ # connect up debug state. note "combinatorially same" below,
+ # this is a bit naff, passing state over in the dbg class, but
+ # because it is combinatorial it achieves the desired goal
+ comb += dbg.state.eq(state)
+
+ # this bit doesn't have to be in the FSM: connect up to read
+ # regfiles on demand from DMI
+ self.do_dmi(m, dbg)
+
+ # DEC and TB inc/dec FSM. copy of DEC is put into CoreState,
+ # (which uses that in PowerDecoder2 to raise 0x900 exception)
+ self.tb_dec_fsm(m, cur_state.dec)
+
+ # while stopped, allow updating the MSR, PC and SVSTATE.
+ # these are mainly for debugging purposes (including DMI/JTAG)
+ with m.If(dbg.core_stopped_i):
+ with m.If(self.pc_i.ok):
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.i_data.eq(self.pc_i.data)
+ sync += self.pc_changed.eq(1)
+ with m.If(self.msr_i.ok):
+ comb += self.state_w_msr.wen.eq(1 << StateRegs.MSR)
+ comb += self.state_w_msr.i_data.eq(self.msr_i.data)
+ sync += self.msr_changed.eq(1)
+ with m.If(self.svstate_i.ok | self.update_svstate):
+ with m.If(self.svstate_i.ok): # over-ride from external source
+ comb += self.new_svstate.eq(self.svstate_i.data)
+ comb += self.state_w_sv.wen.eq(1 << StateRegs.SVSTATE)
+ comb += self.state_w_sv.i_data.eq(self.new_svstate)
+ sync += self.sv_changed.eq(1)
+
+ # start renaming some of the ports to match microwatt
+ if self.microwatt_compat or self.fabric_compat:
+ self.core.o.core_terminate_o.name = "terminated_out"
+ # names of DMI interface
+ self.dbg.dmi.addr_i.name = 'dmi_addr'
+ self.dbg.dmi.din.name = 'dmi_din'
+ self.dbg.dmi.dout.name = 'dmi_dout'
+ self.dbg.dmi.req_i.name = 'dmi_req'
+ self.dbg.dmi.we_i.name = 'dmi_wr'
+ self.dbg.dmi.ack_o.name = 'dmi_ack'
+ # wishbone instruction bus
+ ibus = self.imem.ibus
+ if self.microwatt_compat:
+ ibus.adr.name = 'wishbone_insn_out.adr'
+ ibus.dat_w.name = 'wishbone_insn_out.dat'
+ ibus.sel.name = 'wishbone_insn_out.sel'
+ ibus.cyc.name = 'wishbone_insn_out.cyc'
+ ibus.stb.name = 'wishbone_insn_out.stb'
+ ibus.we.name = 'wishbone_insn_out.we'
+ ibus.dat_r.name = 'wishbone_insn_in.dat'
+ ibus.ack.name = 'wishbone_insn_in.ack'
+ ibus.stall.name = 'wishbone_insn_in.stall'
+ # wishbone data bus
+ dbus = self.core.l0.cmpi.wb_bus()
+ if self.microwatt_compat:
+ dbus.adr.name = 'wishbone_data_out.adr'
+ dbus.dat_w.name = 'wishbone_data_out.dat'
+ dbus.sel.name = 'wishbone_data_out.sel'
+ dbus.cyc.name = 'wishbone_data_out.cyc'
+ dbus.stb.name = 'wishbone_data_out.stb'
+ dbus.we.name = 'wishbone_data_out.we'
+ dbus.dat_r.name = 'wishbone_data_in.dat'
+ dbus.ack.name = 'wishbone_data_in.ack'
+ dbus.stall.name = 'wishbone_data_in.stall'
+
+ return m
+
+ def __iter__(self):
+ yield from self.pc_i.ports()
+ yield from self.msr_i.ports()
+ yield self.pc_o
+ yield self.memerr_o
+ yield from self.core.ports()
+ yield from self.imem.ports()
+ yield self.core_bigendian_i
+ yield self.busy_o
+
+ def ports(self):
+ return list(self)
+
+ def external_ports(self):
+ if self.microwatt_compat or self.fabric_compat:
+ if self.fabric_compat:
+ ports = [self.core.o.core_terminate_o,
+ self.alt_reset, # not connected yet
+ self.nia, self.insn, self.nia_req, self.msr_o,
+ self.ldst_req, self.ldst_addr,
+ ClockSignal(),
+ ResetSignal(),
+ ]
+ else:
+ ports = [self.core.o.core_terminate_o,
+ self.ext_irq,
+ self.alt_reset, # not connected yet
+ self.nia, self.insn, self.nia_req, self.msr_o,
+ self.ldst_req, self.ldst_addr,
+ ClockSignal(),
+ ResetSignal(),
+ ]
+ ports += list(self.dbg.dmi.ports())
+ # for dbus/ibus microwatt, exclude err btw and cti
+ for name, sig in self.imem.ibus.fields.items():
+ if name not in ['err', 'bte', 'cti', 'adr']:
+ ports.append(sig)
+ for name, sig in self.core.l0.cmpi.wb_bus().fields.items():
+ if name not in ['err', 'bte', 'cti', 'adr']:
+ ports.append(sig)
+ # microwatt non-compliant with wishbone
+ ports.append(self.ibus_adr)
+ ports.append(self.dbus_adr)
+
+ if self.microwatt_compat:
+ # Ignore the remaining ports in microwatt compat mode
+ return ports
+
+ ports = self.pc_i.ports()
+ ports = self.msr_i.ports()
+ ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
+ ]
+
+ if self.jtag_en:
+ ports += list(self.jtag.external_ports())
+ else:
+ # don't add DMI if JTAG is enabled
+ ports += list(self.dbg.dmi.ports())
+
+ ports += list(self.imem.ibus.fields.values())
+ ports += list(self.core.l0.cmpi.wb_bus().fields.values())
+
+ if self.sram4x4k:
+ for sram in self.sram4k:
+ ports += list(sram.bus.fields.values())
+
+ if self.xics:
+ ports += list(self.xics_icp.bus.fields.values())
+ ports += list(self.xics_ics.bus.fields.values())
+ ports.append(self.int_level_i)
+ else:
+ ports.append(self.ext_irq)
+
+ if self.gpio:
+ ports += list(self.simple_gpio.bus.fields.values())
+ ports.append(self.gpio_o)
+
+ return ports
+
+ def ports(self):
+ return list(self)
+
+
+class TestIssuerInternal(TestIssuerBase):
+ """TestIssuer - reads instructions from TestMemory and issues them
+
+ efficiency and speed is not the main goal here: functional correctness
+ and code clarity is. optimisations (which almost 100% interfere with
+ easy understanding) come later.
+ """
+
+ def fetch_fsm(self, m, dbg, core, core_rst, nia, is_svp64_mode,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready):
"""fetch FSM
this FSM performs fetch of raw instruction data, partial-decodes
pdecode2 = self.pdecode2
cur_state = self.cur_state
dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+ pc, msr, svstate = cur_state.pc, cur_state.msr, cur_state.svstate
- msr_read = Signal(reset=1)
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ # set priv / virt mode on I-Cache, sigh
+ if isinstance(self.imem, ICache):
+ comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+ comb += self.imem.i_in.virt_mode.eq(msr[MSR.IR]) # Instr. Redir (VM)
with m.FSM(name='fetch_fsm'):
+ # allow fetch to not run at startup due to I-Cache reset not
+ # having time to settle. power-on-reset holds dbg.core_stopped_i
+ with m.State("PRE_IDLE"):
+ with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o & ~core_rst):
+ m.next = "IDLE"
+
# waiting (zzz)
with m.State("IDLE"):
- comb += fetch_pc_ready_o.eq(1)
- with m.If(fetch_pc_valid_i):
+ # fetch allowed if not failed and stopped but not stepping
+ # (see dmi.py for how core_stop_o is generated)
+ with m.If(~fetch_failed & ~dbg.core_stop_o):
+ comb += fetch_pc_o_ready.eq(1)
+ with m.If(fetch_pc_i_valid & ~pdecode2.instr_fault
+ & ~dbg.core_stop_o):
# instruction allowed to go: start by reading the PC
# capture the PC and also drop it into Insn Memory
# we have joined a pair of combinatorial memory
# lookups together. this is Generally Bad.
comb += self.imem.a_pc_i.eq(pc)
- comb += self.imem.a_valid_i.eq(1)
- comb += self.imem.f_valid_i.eq(1)
- sync += cur_state.pc.eq(pc)
- sync += cur_state.svstate.eq(svstate) # and svstate
-
- # initiate read of MSR. arrives one clock later
- comb += self.state_r_msr.ren.eq(1 << StateRegs.MSR)
- sync += msr_read.eq(0)
-
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
m.next = "INSN_READ" # move to "wait for bus" phase
# dummy pause to find out why simulation is not keeping up
with m.State("INSN_READ"):
- # one cycle later, msr/sv read arrives. valid only once.
- with m.If(~msr_read):
- sync += msr_read.eq(1) # yeah don't read it again
- sync += cur_state.msr.eq(self.state_r_msr.data_o)
- with m.If(self.imem.f_busy_o): # zzz...
- # busy: stay in wait-read
- comb += self.imem.a_valid_i.eq(1)
- comb += self.imem.f_valid_i.eq(1)
+ # when using "single-step" mode, checking dbg.stopping_o
+ # prevents progress. allow fetch to proceed once started
+ stopping = Const(0)
+ #if self.allow_overlap:
+ # stopping = dbg.stopping_o
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "IDLE"
with m.Else():
- # not busy: instruction fetched
- insn = get_insn(self.imem.f_instr_o, cur_state.pc)
- if self.svp64_en:
- svp64 = self.svp64
- # decode the SVP64 prefix, if any
- comb += svp64.raw_opcode_in.eq(insn)
- comb += svp64.bigendian.eq(self.core_bigendian_i)
- # pass the decoded prefix (if any) to PowerDecoder2
- sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
- sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
- # remember whether this is a prefixed instruction, so
- # the FSM can readily loop when VL==0
- sync += is_svp64_mode.eq(svp64.is_svp64_mode)
- # calculate the address of the following instruction
- insn_size = Mux(svp64.is_svp64_mode, 8, 4)
- sync += nia.eq(cur_state.pc + insn_size)
- with m.If(~svp64.is_svp64_mode):
- # with no prefix, store the instruction
- # and hand it directly to the next FSM
+ with m.If(self.imem.f_busy_o &
+ ~pdecode2.instr_fault): # zzz...
+ # busy but not fetch failed: stay in wait-read
+ comb += self.imem.a_pc_i.eq(pc)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ with m.Else():
+ # not busy (or fetch failed!): instruction fetched
+ # when fetch failed, the instruction gets ignored
+ # by the decoder
+ if hasattr(core, "icache"):
+ # blech, icache returns actual instruction
+ insn = self.imem.f_instr_o
+ else:
+ # but these return raw memory
+ insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+ if self.svp64_en:
+ svp64 = self.svp64
+ # decode the SVP64 prefix, if any
+ comb += svp64.raw_opcode_in.eq(insn)
+ comb += svp64.bigendian.eq(self.core_bigendian_i)
+ # pass the decoded prefix (if any) to PowerDecoder2
+ sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
+ sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
+ # remember whether this is a prefixed instruction,
+ # so the FSM can readily loop when VL==0
+ sync += is_svp64_mode.eq(svp64.is_svp64_mode)
+ # calculate the address of the following instruction
+ insn_size = Mux(svp64.is_svp64_mode, 8, 4)
+ sync += nia.eq(cur_state.pc + insn_size)
+ with m.If(~svp64.is_svp64_mode):
+ # with no prefix, store the instruction
+ # and hand it directly to the next FSM
+ sync += dec_opcode_i.eq(insn)
+ m.next = "INSN_READY"
+ with m.Else():
+ # fetch the rest of the instruction from memory
+ comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ m.next = "INSN_READ2"
+ else:
+ # not SVP64 - 32-bit only
+ sync += nia.eq(cur_state.pc + 4)
sync += dec_opcode_i.eq(insn)
- m.next = "INSN_READY"
- with m.Else():
- # fetch the rest of the instruction from memory
- comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
- comb += self.imem.a_valid_i.eq(1)
- comb += self.imem.f_valid_i.eq(1)
- m.next = "INSN_READ2"
- else:
- # not SVP64 - 32-bit only
- sync += nia.eq(cur_state.pc + 4)
- sync += dec_opcode_i.eq(insn)
- m.next = "INSN_READY"
+ if self.microwatt_compat or self.fabric_compat:
+ # for verilator debug purposes
+ comb += self.insn.eq(insn)
+ comb += self.nia.eq(cur_state.pc)
+ comb += self.msr_o.eq(cur_state.msr)
+ comb += self.nia_req.eq(1)
+ m.next = "INSN_READY"
with m.State("INSN_READ2"):
with m.If(self.imem.f_busy_o): # zzz...
# busy: stay in wait-read
- comb += self.imem.a_valid_i.eq(1)
- comb += self.imem.f_valid_i.eq(1)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
with m.Else():
# not busy: instruction fetched
- insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
+ if hasattr(core, "icache"):
+ # blech, icache returns actual instruction
+ insn = self.imem.f_instr_o
+ else:
+ insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
sync += dec_opcode_i.eq(insn)
m.next = "INSN_READY"
# TODO: probably can start looking at pdecode2.rm_dec
with m.State("INSN_READY"):
# hand over the instruction, to be decoded
- comb += fetch_insn_valid_o.eq(1)
- with m.If(fetch_insn_ready_i):
+ comb += fetch_insn_o_valid.eq(1)
+ with m.If(fetch_insn_i_ready):
m.next = "IDLE"
+
def fetch_predicate_fsm(self, m,
- pred_insn_valid_i, pred_insn_ready_o,
- pred_mask_valid_o, pred_mask_ready_i):
+ pred_insn_i_valid, pred_insn_o_ready,
+ pred_mask_o_valid, pred_mask_i_ready):
"""fetch_predicate_fsm - obtains (constructs in the case of CR)
src/dest predicate masks
comb = m.d.comb
sync = m.d.sync
pdecode2 = self.pdecode2
- rm_dec = pdecode2.rm_dec # SVP64RMModeDecode
+ rm_dec = pdecode2.rm_dec # SVP64RMModeDecode
predmode = rm_dec.predmode
srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred
cr_pred, int_pred = self.cr_pred, self.int_pred # read regfiles
with m.FSM(name="fetch_predicate"):
with m.State("FETCH_PRED_IDLE"):
- comb += pred_insn_ready_o.eq(1)
- with m.If(pred_insn_valid_i):
+ comb += pred_insn_o_ready.eq(1)
+ with m.If(pred_insn_i_valid):
with m.If(predmode == SVP64PredMode.INT):
# skip fetching destination mask register, when zero
with m.If(dall1s):
with m.If(dunary):
# set selected mask bit for 1<<r3 mode
dst_shift = Signal(range(64))
- comb += dst_shift.eq(self.int_pred.data_o & 0b111111)
+ comb += dst_shift.eq(self.int_pred.o_data & 0b111111)
sync += new_dstmask.eq(1 << dst_shift)
with m.Else():
# invert mask if requested
- sync += new_dstmask.eq(self.int_pred.data_o ^ inv)
+ sync += new_dstmask.eq(self.int_pred.o_data ^ inv)
# skip fetching source mask register, when zero
with m.If(sall1s):
sync += new_srcmask.eq(-1)
with m.If(sunary):
# set selected mask bit for 1<<r3 mode
src_shift = Signal(range(64))
- comb += src_shift.eq(self.int_pred.data_o & 0b111111)
+ comb += src_shift.eq(self.int_pred.o_data & 0b111111)
sync += new_srcmask.eq(1 << src_shift)
with m.Else():
# invert mask if requested
- sync += new_srcmask.eq(self.int_pred.data_o ^ inv)
+ sync += new_srcmask.eq(self.int_pred.o_data ^ inv)
m.next = "FETCH_PRED_SHIFT_MASK"
# fetch masks from the CR register file
cr_field = Signal(4)
scr_bit = Signal()
dcr_bit = Signal()
- comb += cr_field.eq(cr_pred.data_o)
- comb += scr_bit.eq(cr_field.bit_select(sidx, 1) ^ scrinvert)
- comb += dcr_bit.eq(cr_field.bit_select(didx, 1) ^ dcrinvert)
+ comb += cr_field.eq(cr_pred.o_data)
+ comb += scr_bit.eq(cr_field.bit_select(sidx, 1)
+ ^ scrinvert)
+ comb += dcr_bit.eq(cr_field.bit_select(didx, 1)
+ ^ dcrinvert)
# set the corresponding mask bit
bit_to_set = Signal.like(self.srcmask)
comb += bit_to_set.eq(1 << cur_cr_idx)
m.next = "FETCH_PRED_DONE"
with m.State("FETCH_PRED_DONE"):
- comb += pred_mask_valid_o.eq(1)
- with m.If(pred_mask_ready_i):
+ comb += pred_mask_o_valid.eq(1)
+ with m.If(pred_mask_i_ready):
m.next = "FETCH_PRED_IDLE"
- def issue_fsm(self, m, core, pc_changed, sv_changed, nia,
+ def issue_fsm(self, m, core, nia,
dbg, core_rst, is_svp64_mode,
- fetch_pc_ready_o, fetch_pc_valid_i,
- fetch_insn_valid_o, fetch_insn_ready_i,
- pred_insn_valid_i, pred_insn_ready_o,
- pred_mask_valid_o, pred_mask_ready_i,
- exec_insn_valid_i, exec_insn_ready_o,
- exec_pc_valid_o, exec_pc_ready_i):
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready,
+ pred_insn_i_valid, pred_insn_o_ready,
+ pred_mask_o_valid, pred_mask_i_ready,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready):
"""issue FSM
decode / issue FSM. this interacts with the "fetch" FSM
sync = m.d.sync
pdecode2 = self.pdecode2
cur_state = self.cur_state
+ new_svstate = self.new_svstate
# temporaries
- dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+ dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
# for updating svstate (things like srcstep etc.)
- update_svstate = Signal() # set this (below) if updating
- new_svstate = SVSTATERec("new_svstate")
comb += new_svstate.eq(cur_state.svstate)
# precalculate srcstep+1 and dststep+1
# note if an exception happened. in a pipelined or OoO design
# this needs to be accompanied by "shadowing" (or stalling)
- el = []
- for exc in core.fus.excs.values():
- el.append(exc.happened)
- exc_happened = Signal()
- if len(el) > 0: # at least one exception
- comb += exc_happened.eq(Cat(*el).bool())
+ exc_happened = self.core.o.exc_happened
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ # set to fault in decoder
+ # update (highest priority) instruction fault
+ rising_fetch_failed = rising_edge(m, fetch_failed)
+ with m.If(rising_fetch_failed):
+ sync += pdecode2.instr_fault.eq(1)
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ sync += fetch_pc_i_valid.eq(0)
with m.FSM(name="issue_fsm"):
+ with m.State("PRE_IDLE"):
+ with m.If(~dbg.core_stop_o & ~core_rst):
+ m.next = "ISSUE_START"
+
# sync with the "fetch" phase which is reading the instruction
# at this point, there is no instruction running, that
# could inadvertently update the PC.
with m.State("ISSUE_START"):
+ # reset instruction fault
+ sync += pdecode2.instr_fault.eq(0)
# wait on "core stop" release, before next fetch
# need to do this here, in case we are in a VL==0 loop
with m.If(~dbg.core_stop_o & ~core_rst):
- comb += fetch_pc_valid_i.eq(1) # tell fetch to start
- with m.If(fetch_pc_ready_o): # fetch acknowledged us
+ sync += fetch_pc_i_valid.eq(1) # tell fetch to start
+ sync += cur_state.pc.eq(dbg.state.pc)
+ sync += cur_state.svstate.eq(dbg.state.svstate)
+ sync += cur_state.msr.eq(dbg.state.msr)
+ with m.If(fetch_pc_o_ready): # fetch acknowledged us
m.next = "INSN_WAIT"
with m.Else():
# tell core it's stopped, and acknowledge debug handshake
comb += dbg.core_stopped_i.eq(1)
- # while stopped, allow updating the PC and SVSTATE
- with m.If(self.pc_i.ok):
- comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.data_i.eq(self.pc_i.data)
- sync += pc_changed.eq(1)
+ # while stopped, allow updating SVSTATE
with m.If(self.svstate_i.ok):
comb += new_svstate.eq(self.svstate_i.data)
- comb += update_svstate.eq(1)
- sync += sv_changed.eq(1)
+ comb += self.update_svstate.eq(1)
+ sync += self.sv_changed.eq(1)
# wait for an instruction to arrive from Fetch
with m.State("INSN_WAIT"):
- comb += fetch_insn_ready_i.eq(1)
- with m.If(fetch_insn_valid_o):
- # loop into ISSUE_START if it's a SVP64 instruction
- # and VL == 0. this because VL==0 is a for-loop
- # from 0 to 0 i.e. always, always a NOP.
- cur_vl = cur_state.svstate.vl
- with m.If(is_svp64_mode & (cur_vl == 0)):
- # update the PC before fetching the next instruction
- # since we are in a VL==0 loop, no instruction was
- # executed that we could be overwriting
- comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.data_i.eq(nia)
- comb += self.insn_done.eq(1)
- m.next = "ISSUE_START"
- with m.Else():
- if self.svp64_en:
- m.next = "PRED_START" # start fetching predicate
- else:
- m.next = "DECODE_SV" # skip predication
+ # when using "single-step" mode, checking dbg.stopping_o
+ # prevents progress. allow issue to proceed once started
+ stopping = Const(0)
+ #if self.allow_overlap:
+ # stopping = dbg.stopping_o
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "ISSUE_START"
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ with m.Else():
+ comb += fetch_insn_i_ready.eq(1)
+ with m.If(fetch_insn_o_valid):
+ # loop into ISSUE_START if it's a SVP64 instruction
+ # and VL == 0. this because VL==0 is a for-loop
+ # from 0 to 0 i.e. always, always a NOP.
+ cur_vl = cur_state.svstate.vl
+ with m.If(is_svp64_mode & (cur_vl == 0)):
+ # update the PC before fetching the next instruction
+ # since we are in a VL==0 loop, no instruction was
+ # executed that we could be overwriting
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.i_data.eq(nia)
+ comb += self.insn_done.eq(1)
+ m.next = "ISSUE_START"
+ with m.Else():
+ if self.svp64_en:
+ m.next = "PRED_START" # fetching predicate
+ else:
+ m.next = "DECODE_SV" # skip predication
with m.State("PRED_START"):
- comb += pred_insn_valid_i.eq(1) # tell fetch_pred to start
- with m.If(pred_insn_ready_o): # fetch_pred acknowledged us
+ comb += pred_insn_i_valid.eq(1) # tell fetch_pred to start
+ with m.If(pred_insn_o_ready): # fetch_pred acknowledged us
m.next = "MASK_WAIT"
with m.State("MASK_WAIT"):
- comb += pred_mask_ready_i.eq(1) # ready to receive the masks
- with m.If(pred_mask_valid_o): # predication masks are ready
+ comb += pred_mask_i_ready.eq(1) # ready to receive the masks
+ with m.If(pred_mask_o_valid): # predication masks are ready
m.next = "PRED_SKIP"
# skip zeros in predicate
(skip_dststep >= cur_vl)):
# end of VL loop. Update PC and reset src/dst step
comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.data_i.eq(nia)
+ comb += self.state_w_pc.i_data.eq(nia)
comb += new_svstate.srcstep.eq(0)
comb += new_svstate.dststep.eq(0)
- comb += update_svstate.eq(1)
+ comb += self.update_svstate.eq(1)
# synchronize with the simulator
comb += self.insn_done.eq(1)
# go back to Issue
# update new src/dst step
comb += new_svstate.srcstep.eq(skip_srcstep)
comb += new_svstate.dststep.eq(skip_dststep)
- comb += update_svstate.eq(1)
+ comb += self.update_svstate.eq(1)
# proceed to Decode
m.next = "DECODE_SV"
# pass predicate mask bits through to satellite decoders
# TODO: for SIMD this will be *multiple* bits
- sync += core.sv_pred_sm.eq(self.srcmask[0])
- sync += core.sv_pred_dm.eq(self.dstmask[0])
+ sync += core.i.sv_pred_sm.eq(self.srcmask[0])
+ sync += core.i.sv_pred_dm.eq(self.dstmask[0])
# after src/dst step have been updated, we are ready
# to decode the instruction
with m.State("DECODE_SV"):
# decode the instruction
- sync += core.e.eq(pdecode2.e)
- sync += core.state.eq(cur_state)
- sync += core.raw_insn_i.eq(dec_opcode_i)
- sync += core.bigendian_i.eq(self.core_bigendian_i)
+ with m.If(~fetch_failed):
+ sync += pdecode2.instr_fault.eq(0)
+ sync += core.i.e.eq(pdecode2.e)
+ sync += core.i.state.eq(cur_state)
+ sync += core.i.raw_insn_i.eq(dec_opcode_i)
+ sync += core.i.bigendian_i.eq(self.core_bigendian_i)
if self.svp64_en:
- sync += core.sv_rm.eq(pdecode2.sv_rm)
+ sync += core.i.sv_rm.eq(pdecode2.sv_rm)
# set RA_OR_ZERO detection in satellite decoders
- sync += core.sv_a_nz.eq(pdecode2.sv_a_nz)
+ sync += core.i.sv_a_nz.eq(pdecode2.sv_a_nz)
# and svp64 detection
- sync += core.is_svp64_mode.eq(is_svp64_mode)
+ sync += core.i.is_svp64_mode.eq(is_svp64_mode)
# and svp64 bit-rev'd ldst mode
ldst_dec = pdecode2.use_svp64_ldst_dec
- sync += core.use_svp64_ldst_dec.eq(ldst_dec)
+ sync += core.i.use_svp64_ldst_dec.eq(ldst_dec)
+ # after decoding, reset any previous exception condition,
+ # allowing it to be set again during the next execution
+ sync += pdecode2.ldst_exc.eq(0)
m.next = "INSN_EXECUTE" # move to "execute"
# handshake with execution FSM, move to "wait" once acknowledged
with m.State("INSN_EXECUTE"):
- comb += exec_insn_valid_i.eq(1) # trigger execute
- with m.If(exec_insn_ready_o): # execute acknowledged us
- m.next = "EXECUTE_WAIT"
+ # when using "single-step" mode, checking dbg.stopping_o
+ # prevents progress. allow execute to proceed once started
+ stopping = Const(0)
+ #if self.allow_overlap:
+ # stopping = dbg.stopping_o
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "ISSUE_START"
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ with m.Else():
+ comb += exec_insn_i_valid.eq(1) # trigger execute
+ with m.If(exec_insn_o_ready): # execute acknowledged us
+ m.next = "EXECUTE_WAIT"
with m.State("EXECUTE_WAIT"):
- # wait on "core stop" release, at instruction end
- # need to do this here, in case we are in a VL>1 loop
- with m.If(~dbg.core_stop_o & ~core_rst):
- comb += exec_pc_ready_i.eq(1)
- # see https://bugs.libre-soc.org/show_bug.cgi?id=636
- #with m.If(exec_pc_valid_o & exc_happened):
- # probably something like this:
- # sync += pdecode2.ldst_exc.eq(core.fus.get_exc("ldst0")
- # TODO: the exception info needs to be blatted
- # into pdecode.ldst_exc, and the instruction "re-run".
- # when ldst_exc.happened is set, the PowerDecoder2
- # reacts very differently: it re-writes the instruction
- # with a "trap" (calls PowerDecoder2.trap()) which
- # will *overwrite* whatever was requested and jump the
- # PC to the exception address, as well as alter MSR.
- # nothing else needs to be done other than to note
- # the change of PC and MSR (and, later, SVSTATE)
- #with m.Elif(exec_pc_valid_o):
- with m.If(exec_pc_valid_o): # replace with Elif (above)
-
- # was this the last loop iteration?
- is_last = Signal()
- cur_vl = cur_state.svstate.vl
- comb += is_last.eq(next_srcstep == cur_vl)
+ comb += exec_pc_i_ready.eq(1)
+ # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+ # the exception info needs to be blatted into
+ # pdecode.ldst_exc, and the instruction "re-run".
+ # when ldst_exc.happened is set, the PowerDecoder2
+ # reacts very differently: it re-writes the instruction
+ # with a "trap" (calls PowerDecoder2.trap()) which
+ # will *overwrite* whatever was requested and jump the
+ # PC to the exception address, as well as alter MSR.
+ # nothing else needs to be done other than to note
+ # the change of PC and MSR (and, later, SVSTATE)
+ with m.If(exc_happened):
+ mmu = core.fus.get_exc("mmu0")
+ ldst = core.fus.get_exc("ldst0")
+ if mmu is not None:
+ with m.If(fetch_failed):
+ # instruction fetch: exception is from MMU
+ # reset instr_fault (highest priority)
+ sync += pdecode2.ldst_exc.eq(mmu)
+ sync += pdecode2.instr_fault.eq(0)
+ if flush_needed:
+ # request icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ with m.If(~fetch_failed):
+ # otherwise assume it was a LDST exception
+ sync += pdecode2.ldst_exc.eq(ldst)
+
+ with m.If(exec_pc_o_valid):
+
+ # was this the last loop iteration?
+ is_last = Signal()
+ cur_vl = cur_state.svstate.vl
+ comb += is_last.eq(next_srcstep == cur_vl)
- # if either PC or SVSTATE were changed by the previous
- # instruction, go directly back to Fetch, without
- # updating either PC or SVSTATE
- with m.If(pc_changed | sv_changed):
- m.next = "ISSUE_START"
+ with m.If(pdecode2.instr_fault):
+ # reset instruction fault, try again
+ sync += pdecode2.instr_fault.eq(0)
+ m.next = "ISSUE_START"
- # also return to Fetch, when no output was a vector
- # (regardless of SRCSTEP and VL), or when the last
- # instruction was really the last one of the VL loop
- with m.Elif((~pdecode2.loop_continue) | is_last):
- # before going back to fetch, update the PC state
- # register with the NIA.
- # ok here we are not reading the branch unit.
- # TODO: this just blithely overwrites whatever
- # pipeline updated the PC
- comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.data_i.eq(nia)
- # reset SRCSTEP before returning to Fetch
- if self.svp64_en:
- with m.If(pdecode2.loop_continue):
- comb += new_svstate.srcstep.eq(0)
- comb += new_svstate.dststep.eq(0)
- comb += update_svstate.eq(1)
- else:
+ # return directly to Decode if Execute generated an
+ # exception.
+ with m.Elif(pdecode2.ldst_exc.happened):
+ m.next = "DECODE_SV"
+
+ # if MSR, PC or SVSTATE were changed by the previous
+ # instruction, go directly back to Fetch, without
+ # updating either MSR PC or SVSTATE
+ with m.Elif(self.msr_changed | self.pc_changed |
+ self.sv_changed):
+ m.next = "ISSUE_START"
+
+ # also return to Fetch, when no output was a vector
+ # (regardless of SRCSTEP and VL), or when the last
+ # instruction was really the last one of the VL loop
+ with m.Elif((~pdecode2.loop_continue) | is_last):
+ # before going back to fetch, update the PC state
+ # register with the NIA.
+ # ok here we are not reading the branch unit.
+ # TODO: this just blithely overwrites whatever
+ # pipeline updated the PC
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.i_data.eq(nia)
+ # reset SRCSTEP before returning to Fetch
+ if self.svp64_en:
+ with m.If(pdecode2.loop_continue):
comb += new_svstate.srcstep.eq(0)
comb += new_svstate.dststep.eq(0)
- comb += update_svstate.eq(1)
- m.next = "ISSUE_START"
+ comb += self.update_svstate.eq(1)
+ else:
+ comb += new_svstate.srcstep.eq(0)
+ comb += new_svstate.dststep.eq(0)
+ comb += self.update_svstate.eq(1)
+ m.next = "ISSUE_START"
- # returning to Execute? then, first update SRCSTEP
- with m.Else():
- comb += new_svstate.srcstep.eq(next_srcstep)
- comb += new_svstate.dststep.eq(next_dststep)
- comb += update_svstate.eq(1)
- # return to mask skip loop
- m.next = "PRED_SKIP"
+ # returning to Execute? then, first update SRCSTEP
+ with m.Else():
+ comb += new_svstate.srcstep.eq(next_srcstep)
+ comb += new_svstate.dststep.eq(next_dststep)
+ comb += self.update_svstate.eq(1)
+ # return to mask skip loop
+ m.next = "PRED_SKIP"
- with m.Else():
- comb += dbg.core_stopped_i.eq(1)
- # while stopped, allow updating the PC and SVSTATE
- with m.If(self.pc_i.ok):
- comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.data_i.eq(self.pc_i.data)
- sync += pc_changed.eq(1)
- with m.If(self.svstate_i.ok):
- comb += new_svstate.eq(self.svstate_i.data)
- comb += update_svstate.eq(1)
- sync += sv_changed.eq(1)
# check if svstate needs updating: if so, write it to State Regfile
- with m.If(update_svstate):
- comb += self.state_w_sv.wen.eq(1<<StateRegs.SVSTATE)
- comb += self.state_w_sv.data_i.eq(new_svstate)
- sync += cur_state.svstate.eq(new_svstate) # for next clock
-
- def execute_fsm(self, m, core, pc_changed, sv_changed,
- exec_insn_valid_i, exec_insn_ready_o,
- exec_pc_valid_o, exec_pc_ready_i):
+ with m.If(self.update_svstate):
+ sync += cur_state.svstate.eq(self.new_svstate) # for next clock
+
+ def execute_fsm(self, m, core,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready):
"""execute FSM
execute FSM. this interacts with the "issue" FSM
comb = m.d.comb
sync = m.d.sync
+ dbg = self.dbg
pdecode2 = self.pdecode2
+ cur_state = self.cur_state
# temporaries
- core_busy_o = core.busy_o # core is busy
- core_ivalid_i = core.ivalid_i # instruction is valid
- core_issue_i = core.issue_i # instruction is issued
- insn_type = core.e.do.insn_type # instruction MicroOp type
+ core_busy_o = core.n.o_data.busy_o # core is busy
+ core_ivalid_i = core.p.i_valid # instruction is valid
+
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ else:
+ fetch_failed = Const(0, 1)
with m.FSM(name="exec_fsm"):
# waiting for instruction bus (stays there until not busy)
with m.State("INSN_START"):
- comb += exec_insn_ready_o.eq(1)
- with m.If(exec_insn_valid_i):
- comb += core_ivalid_i.eq(1) # instruction is valid
- comb += core_issue_i.eq(1) # and issued
- sync += sv_changed.eq(0)
- sync += pc_changed.eq(0)
- m.next = "INSN_ACTIVE" # move to "wait completion"
+ comb += exec_insn_o_ready.eq(1)
+ with m.If(exec_insn_i_valid):
+ comb += core_ivalid_i.eq(1) # instruction is valid/issued
+ sync += self.sv_changed.eq(0)
+ sync += self.pc_changed.eq(0)
+ sync += self.msr_changed.eq(0)
+ with m.If(core.p.o_ready): # only move if accepted
+ m.next = "INSN_ACTIVE" # move to "wait completion"
# instruction started: must wait till it finishes
with m.State("INSN_ACTIVE"):
- with m.If(insn_type != MicrOp.OP_NOP):
- comb += core_ivalid_i.eq(1) # instruction is valid
- # note changes to PC and SVSTATE
- with m.If(self.state_nia.wen & (1<<StateRegs.SVSTATE)):
- sync += sv_changed.eq(1)
- with m.If(self.state_nia.wen & (1<<StateRegs.PC)):
- sync += pc_changed.eq(1)
- with m.If(~core_busy_o): # instruction done!
- comb += exec_pc_valid_o.eq(1)
- with m.If(exec_pc_ready_i):
- comb += self.insn_done.eq(1)
+ # note changes to MSR, PC and SVSTATE
+ with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+ sync += self.sv_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+ sync += self.msr_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+ sync += self.pc_changed.eq(1)
+ # and note changes to DEC/TB, to be passed to DEC/TB FSM
+ with m.If(self.state_spr.wen & (1 << StateRegs.TB)):
+ comb += self.pause_dec_tb.eq(1)
+ # but also zero-out the cur_state DEC so that, on
+ # the next instruction, if it is "enable interrupt"
+ # the delay between the DEC/TB FSM reading and updating
+ # cur_state.dec doesn't trigger a spurious interrupt.
+ # the DEC/TB FSM will read the regfile and update to
+ # the correct value, so having cur_state.dec set to zero
+ # for a while is no big deal.
+ with m.If(self.state_spr.wen & (1 << StateRegs.DEC)):
+ comb += self.pause_dec_tb.eq(1)
+ sync += cur_state.dec.eq(0) # only needs top bit clear
+ with m.If(~core_busy_o): # instruction done!
+ comb += exec_pc_o_valid.eq(1)
+ with m.If(exec_pc_i_ready):
+ # when finished, indicate "done".
+ # however, if there was an exception, the instruction
+ # is *not* yet done. this is an implementation
+ # detail: we choose to implement exceptions by
+ # taking the exception information from the LDST
+ # unit, putting that *back* into the PowerDecoder2,
+ # and *re-running the entire instruction*.
+ # if we erroneously indicate "done" here, it is as if
+ # there were *TWO* instructions:
+ # 1) the failed LDST 2) a TRAP.
+ with m.If(~pdecode2.ldst_exc.happened &
+ ~pdecode2.instr_fault):
+ comb += self.insn_done.eq(1)
m.next = "INSN_START" # back to fetch
-
- def setup_peripherals(self, m):
- comb, sync = m.d.comb, m.d.sync
-
- # okaaaay so the debug module must be in coresync clock domain
- # but NOT its reset signal. to cope with this, set every single
- # submodule explicitly in coresync domain, debug and JTAG
- # in their own one but using *external* reset.
- csd = DomainRenamer("coresync")
- dbd = DomainRenamer(self.dbg_domain)
-
- m.submodules.core = core = csd(self.core)
- m.submodules.imem = imem = csd(self.imem)
- m.submodules.dbg = dbg = dbd(self.dbg)
- if self.jtag_en:
- m.submodules.jtag = jtag = dbd(self.jtag)
- # TODO: UART2GDB mux, here, from external pin
- # see https://bugs.libre-soc.org/show_bug.cgi?id=499
- sync += dbg.dmi.connect_to(jtag.dmi)
-
- cur_state = self.cur_state
-
- # 4x 4k SRAM blocks. these simply "exist", they get routed in litex
- if self.sram4x4k:
- for i, sram in enumerate(self.sram4k):
- m.submodules["sram4k_%d" % i] = csd(sram)
- comb += sram.enable.eq(self.wb_sram_en)
-
- # XICS interrupt handler
- if self.xics:
- m.submodules.xics_icp = icp = csd(self.xics_icp)
- m.submodules.xics_ics = ics = csd(self.xics_ics)
- comb += icp.ics_i.eq(ics.icp_o) # connect ICS to ICP
- sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core
-
- # GPIO test peripheral
- if self.gpio:
- m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
-
- # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
- # XXX causes litex ECP5 test to get wrong idea about input and output
- # (but works with verilator sim *sigh*)
- #if self.gpio and self.xics:
- # comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
-
- # instruction decoder
- pdecode = create_pdecode()
- m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
- if self.svp64_en:
- m.submodules.svp64 = svp64 = csd(self.svp64)
-
- # convenience
- dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
- intrf = self.core.regs.rf['int']
-
- # clock delay power-on reset
- cd_por = ClockDomain(reset_less=True)
- cd_sync = ClockDomain()
- core_sync = ClockDomain("coresync")
- m.domains += cd_por, cd_sync, core_sync
- if self.dbg_domain != "sync":
- dbg_sync = ClockDomain(self.dbg_domain)
- m.domains += dbg_sync
-
- ti_rst = Signal(reset_less=True)
- delay = Signal(range(4), reset=3)
- with m.If(delay != 0):
- m.d.por += delay.eq(delay - 1)
- comb += cd_por.clk.eq(ClockSignal())
-
- # power-on reset delay
- core_rst = ResetSignal("coresync")
- comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
- comb += core_rst.eq(ti_rst)
-
- # debug clock is same as coresync, but reset is *main external*
- if self.dbg_domain != "sync":
- dbg_rst = ResetSignal(self.dbg_domain)
- comb += dbg_rst.eq(ResetSignal())
-
- # busy/halted signals from core
- comb += self.busy_o.eq(core.busy_o)
- comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
-
- # temporary hack: says "go" immediately for both address gen and ST
- l0 = core.l0
- ldst = core.fus.fus['ldst0']
- st_go_edge = rising_edge(m, ldst.st.rel_o)
- m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) # link addr-go direct to rel
- m.d.comb += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel
+ # terminate returns directly to INSN_START
+ with m.If(dbg.terminate_i):
+ # comb += self.insn_done.eq(1) - no because it's not
+ m.next = "INSN_START" # back to fetch
def elaborate(self, platform):
- m = Module()
+ m = super().elaborate(platform)
# convenience
comb, sync = m.d.comb, m.d.sync
cur_state = self.cur_state
# set up peripherals and core
core_rst = self.core_rst
- self.setup_peripherals(m)
- # reset current state if core reset requested
- with m.If(core_rst):
- m.d.sync += self.cur_state.eq(0)
-
- # PC and instruction from I-Memory
- comb += self.pc_o.eq(cur_state.pc)
- pc_changed = Signal() # note write to PC
- sv_changed = Signal() # note write to SVSTATE
-
- # read state either from incoming override or from regfile
- # TODO: really should be doing MSR in the same way
- pc = state_get(m, core_rst, self.pc_i,
- "pc", # read PC
- self.state_r_pc, StateRegs.PC)
- svstate = state_get(m, core_rst, self.svstate_i,
- "svstate", # read SVSTATE
- self.state_r_sv, StateRegs.SVSTATE)
-
- # don't write pc every cycle
- comb += self.state_w_pc.wen.eq(0)
- comb += self.state_w_pc.data_i.eq(0)
-
- # don't read msr every cycle
- comb += self.state_r_msr.ren.eq(0)
+ # indicate to outside world if any FU is still executing
+ comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing
# address of the next instruction, in the absence of a branch
# depends on the instruction size
nia = Signal(64)
# connect up debug signals
- # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o)
- comb += dbg.terminate_i.eq(core.core_terminate_o)
- comb += dbg.state.pc.eq(pc)
- comb += dbg.state.svstate.eq(svstate)
- comb += dbg.state.msr.eq(cur_state.msr)
+ with m.If(core.o.core_terminate_o):
+ comb += dbg.terminate_i.eq(1)
# pass the prefix mode from Fetch to Issue, so the latter can loop
# on VL==0
# these are the handshake signals between each
# fetch FSM can run as soon as the PC is valid
- fetch_pc_valid_i = Signal() # Execute tells Fetch "start next read"
- fetch_pc_ready_o = Signal() # Fetch Tells SVSTATE "proceed"
+ fetch_pc_i_valid = Signal() # Execute tells Fetch "start next read"
+ fetch_pc_o_ready = Signal() # Fetch Tells SVSTATE "proceed"
# fetch FSM hands over the instruction to be decoded / issued
- fetch_insn_valid_o = Signal()
- fetch_insn_ready_i = Signal()
+ fetch_insn_o_valid = Signal()
+ fetch_insn_i_ready = Signal()
# predicate fetch FSM decodes and fetches the predicate
- pred_insn_valid_i = Signal()
- pred_insn_ready_o = Signal()
+ pred_insn_i_valid = Signal()
+ pred_insn_o_ready = Signal()
# predicate fetch FSM delivers the masks
- pred_mask_valid_o = Signal()
- pred_mask_ready_i = Signal()
+ pred_mask_o_valid = Signal()
+ pred_mask_i_ready = Signal()
# issue FSM delivers the instruction to the be executed
- exec_insn_valid_i = Signal()
- exec_insn_ready_o = Signal()
+ exec_insn_i_valid = Signal()
+ exec_insn_o_ready = Signal()
# execute FSM, hands over the PC/SVSTATE back to the issue FSM
- exec_pc_valid_o = Signal()
- exec_pc_ready_i = Signal()
+ exec_pc_o_valid = Signal()
+ exec_pc_i_ready = Signal()
# the FSMs here are perhaps unusual in that they detect conditions
# then "hold" information, combinatorially, for the core
# Issue is where the VL for-loop # lives. the ready/valid
# signalling is used to communicate between the four.
- self.fetch_fsm(m, core, pc, svstate, nia, is_svp64_mode,
- fetch_pc_ready_o, fetch_pc_valid_i,
- fetch_insn_valid_o, fetch_insn_ready_i)
+ self.fetch_fsm(m, dbg, core, core_rst, nia, is_svp64_mode,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready)
- self.issue_fsm(m, core, pc_changed, sv_changed, nia,
+ self.issue_fsm(m, core, nia,
dbg, core_rst, is_svp64_mode,
- fetch_pc_ready_o, fetch_pc_valid_i,
- fetch_insn_valid_o, fetch_insn_ready_i,
- pred_insn_valid_i, pred_insn_ready_o,
- pred_mask_valid_o, pred_mask_ready_i,
- exec_insn_valid_i, exec_insn_ready_o,
- exec_pc_valid_o, exec_pc_ready_i)
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready,
+ pred_insn_i_valid, pred_insn_o_ready,
+ pred_mask_o_valid, pred_mask_i_ready,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready)
if self.svp64_en:
self.fetch_predicate_fsm(m,
- pred_insn_valid_i, pred_insn_ready_o,
- pred_mask_valid_o, pred_mask_ready_i)
+ pred_insn_i_valid, pred_insn_o_ready,
+ pred_mask_o_valid, pred_mask_i_ready)
- self.execute_fsm(m, core, pc_changed, sv_changed,
- exec_insn_valid_i, exec_insn_ready_o,
- exec_pc_valid_o, exec_pc_ready_i)
+ self.execute_fsm(m, core,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready)
- # whatever was done above, over-ride it if core reset is held
+ # whatever was done above, over-ride it if core reset is held.
+ # set NIA to pc_at_reset
with m.If(core_rst):
- sync += nia.eq(0)
-
- # this bit doesn't have to be in the FSM: connect up to read
- # regfiles on demand from DMI
- self.do_dmi(m, dbg)
-
- # DEC and TB inc/dec FSM. copy of DEC is put into CoreState,
- # (which uses that in PowerDecoder2 to raise 0x900 exception)
- self.tb_dec_fsm(m, cur_state.dec)
-
- return m
-
- def do_dmi(self, m, dbg):
- """deals with DMI debug requests
-
- currently only provides read requests for the INT regfile, CR and XER
- it will later also deal with *writing* to these regfiles.
- """
- comb = m.d.comb
- sync = m.d.sync
- dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
- intrf = self.core.regs.rf['int']
-
- with m.If(d_reg.req): # request for regfile access being made
- # TODO: error-check this
- # XXX should this be combinatorial? sync better?
- if intrf.unary:
- comb += self.int_r.ren.eq(1<<d_reg.addr)
- else:
- comb += self.int_r.addr.eq(d_reg.addr)
- comb += self.int_r.ren.eq(1)
- d_reg_delay = Signal()
- sync += d_reg_delay.eq(d_reg.req)
- with m.If(d_reg_delay):
- # data arrives one clock later
- comb += d_reg.data.eq(self.int_r.data_o)
- comb += d_reg.ack.eq(1)
-
- # sigh same thing for CR debug
- with m.If(d_cr.req): # request for regfile access being made
- comb += self.cr_r.ren.eq(0b11111111) # enable all
- d_cr_delay = Signal()
- sync += d_cr_delay.eq(d_cr.req)
- with m.If(d_cr_delay):
- # data arrives one clock later
- comb += d_cr.data.eq(self.cr_r.data_o)
- comb += d_cr.ack.eq(1)
-
- # aaand XER...
- with m.If(d_xer.req): # request for regfile access being made
- comb += self.xer_r.ren.eq(0b111111) # enable all
- d_xer_delay = Signal()
- sync += d_xer_delay.eq(d_xer.req)
- with m.If(d_xer_delay):
- # data arrives one clock later
- comb += d_xer.data.eq(self.xer_r.data_o)
- comb += d_xer.ack.eq(1)
-
- def tb_dec_fsm(self, m, spr_dec):
- """tb_dec_fsm
-
- this is a FSM for updating either dec or tb. it runs alternately
- DEC, TB, DEC, TB. note that SPR pipeline could have written a new
- value to DEC, however the regfile has "passthrough" on it so this
- *should* be ok.
-
- see v3.0B p1097-1099 for Timeer Resource and p1065 and p1076
- """
-
- comb, sync = m.d.comb, m.d.sync
- fast_rf = self.core.regs.rf['fast']
- fast_r_dectb = fast_rf.r_ports['issue'] # DEC/TB
- fast_w_dectb = fast_rf.w_ports['issue'] # DEC/TB
-
- with m.FSM() as fsm:
-
- # initiates read of current DEC
- with m.State("DEC_READ"):
- comb += fast_r_dectb.addr.eq(FastRegs.DEC)
- comb += fast_r_dectb.ren.eq(1)
- m.next = "DEC_WRITE"
-
- # waits for DEC read to arrive (1 cycle), updates with new value
- with m.State("DEC_WRITE"):
- new_dec = Signal(64)
- # TODO: MSR.LPCR 32-bit decrement mode
- comb += new_dec.eq(fast_r_dectb.data_o - 1)
- comb += fast_w_dectb.addr.eq(FastRegs.DEC)
- comb += fast_w_dectb.wen.eq(1)
- comb += fast_w_dectb.data_i.eq(new_dec)
- sync += spr_dec.eq(new_dec) # copy into cur_state for decoder
- m.next = "TB_READ"
-
- # initiates read of current TB
- with m.State("TB_READ"):
- comb += fast_r_dectb.addr.eq(FastRegs.TB)
- comb += fast_r_dectb.ren.eq(1)
- m.next = "TB_WRITE"
-
- # waits for read TB to arrive, initiates write of current TB
- with m.State("TB_WRITE"):
- new_tb = Signal(64)
- comb += new_tb.eq(fast_r_dectb.data_o + 1)
- comb += fast_w_dectb.addr.eq(FastRegs.TB)
- comb += fast_w_dectb.wen.eq(1)
- comb += fast_w_dectb.data_i.eq(new_tb)
- m.next = "DEC_READ"
+ sync += nia.eq(self.core.pc_at_reset)
return m
- def __iter__(self):
- yield from self.pc_i.ports()
- yield self.pc_o
- yield self.memerr_o
- yield from self.core.ports()
- yield from self.imem.ports()
- yield self.core_bigendian_i
- yield self.busy_o
-
- def ports(self):
- return list(self)
-
- def external_ports(self):
- ports = self.pc_i.ports()
- ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
- ]
-
- if self.jtag_en:
- ports += list(self.jtag.external_ports())
- else:
- # don't add DMI if JTAG is enabled
- ports += list(self.dbg.dmi.ports())
-
- ports += list(self.imem.ibus.fields.values())
- ports += list(self.core.l0.cmpi.wb_bus().fields.values())
-
- if self.sram4x4k:
- for sram in self.sram4k:
- ports += list(sram.bus.fields.values())
-
- if self.xics:
- ports += list(self.xics_icp.bus.fields.values())
- ports += list(self.xics_ics.bus.fields.values())
- ports.append(self.int_level_i)
-
- if self.gpio:
- ports += list(self.simple_gpio.bus.fields.values())
- ports.append(self.gpio_o)
-
- return ports
-
- def ports(self):
- return list(self)
-
class TestIssuer(Elaboratable):
def __init__(self, pspec):
self.ti = TestIssuerInternal(pspec)
self.pll = DummyPLL(instance=True)
+ self.dbg_rst_i = Signal(reset_less=True)
+
# PLL direct clock or not
self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll
if self.pll_en:
self.pll_test_o = Signal(reset_less=True)
self.pll_vco_o = Signal(reset_less=True)
self.clk_sel_i = Signal(2, reset_less=True)
- self.ref_clk = ClockSignal() # can't rename it but that's ok
+ self.ref_clk = ClockSignal() # can't rename it but that's ok
self.pllclk_clk = ClockSignal("pllclk")
def elaborate(self, platform):
# internal clock is set to selector clock-out. has the side-effect of
# running TestIssuer at this speed (see DomainRenamer("intclk") above)
# debug clock runs at coresync internal clock
- cd_coresync = ClockDomain("coresync")
- #m.domains += cd_coresync
if self.ti.dbg_domain != 'sync':
cd_dbgsync = ClockDomain("dbgsync")
- #m.domains += cd_dbgsync
- intclk = ClockSignal("coresync")
+ intclk = ClockSignal(self.ti.core_domain)
dbgclk = ClockSignal(self.ti.dbg_domain)
# XXX BYPASS PLL XXX
# XXX BYPASS PLL XXX
# XXX BYPASS PLL XXX
if self.pll_en:
comb += intclk.eq(self.ref_clk)
+ assert self.ti.core_domain != 'sync', \
+ "cannot set core_domain to sync and use pll at the same time"
else:
- comb += intclk.eq(ClockSignal())
+ if self.ti.core_domain != 'sync':
+ comb += intclk.eq(ClockSignal())
if self.ti.dbg_domain != 'sync':
dbgclk = ClockSignal(self.ti.dbg_domain)
comb += dbgclk.eq(intclk)
+ comb += self.ti.dbg_rst_i.eq(self.dbg_rst_i)
return m
def ports(self):
return list(self.ti.ports()) + list(self.pll.ports()) + \
- [ClockSignal(), ResetSignal()]
+ [ClockSignal(), ResetSignal()]
def external_ports(self):
ports = self.ti.external_ports()
'div': 1,
'mul': 1,
'shiftrot': 1
- }
+ }
pspec = TestMemPspec(ldst_ifacetype='bare_wb',
imem_ifacetype='bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64,
units=units)
import argparse
from nmigen.cli import verilog
+from openpower.consts import MSR
from soc.config.test.test_loadstore import TestMemPspec
-from soc.simple.issuer import TestIssuer
+from soc.simple.issuer import TestIssuer, TestIssuerInternal
if __name__ == '__main__':
parser.add_argument("--disable-svp64", dest='svp64', action="store_false",
help="disable SVP64",
default=False)
+ parser.add_argument("--pc-reset", default="0",
+ help="Set PC at reset (default 0)")
+ parser.add_argument("--xlen", default=64, type=int,
+ help="Set register width [default 64]")
+ # create a module that's directly compatible as a drop-in replacement
+ # in microwatt.v
+ parser.add_argument("--microwatt-compat", dest='mwcompat',
+ action="store_true",
+ help="generate microwatt-compatible interface",
+ default=False)
+ parser.add_argument("--microwatt-compat-svp64", dest='mwcompatsvp64',
+ action="store_true",
+ help="generate microwatt-compatible interface + SVP64",
+ default=False)
+ parser.add_argument("--old-microwatt-compat", dest='old_mwcompat',
+ action="store_true",
+ help="generate old microwatt-compatible interface",
+ default=True)
+ parser.add_argument("--microwatt-debug", dest='mwdebug',
+ action="store_true",
+ help="generate old microwatt-compatible interface",
+ default=False)
+ # create a module with Fabric compatibility
+ parser.add_argument("--fabric-compat", dest='fabriccompat',
+ action="store_true",
+ help="generate Fabric-compatible interface",
+ default=False)
+ # small cache option
+ parser.add_argument("--small-cache", dest='smallcache',
+ action="store_true",
+ help="generate small caches",
+ default=False)
+
+ # allow overlaps in TestIssuer
+ parser.add_argument("--allow-overlap", dest='allow_overlap',
+ action="store_true",
+ help="allow overlap in TestIssuer",
+ default=False)
args = parser.parse_args()
+ # convenience: set some defaults
+ if args.mwcompat:
+ args.pll = False
+ args.debug = 'dmi'
+ args.core = True
+ args.xics = False
+ args.gpio = False
+ args.sram4x4kblock = False
+ args.svp64 = False
+
+ # Yes, this is duplicating mwcompat, but for the sake of simplicity
+ # adding support for svp64 like this
+ if args.mwcompatsvp64:
+ args.pll = False
+ args.debug = 'dmi'
+ args.core = True
+ args.xics = False
+ args.gpio = False
+ args.sram4x4kblock = False
+ args.svp64 = True
+ args.mwcompat = True # Ensures TestMemPspec gets the expected value
+
print(args)
units = {'alu': 1,
# decide which memory type to configure
if args.mmu:
ldst_ifacetype = 'mmu_cache_wb'
+ imem_ifacetype = 'mmu_cache_wb'
else:
ldst_ifacetype = 'bare_wb'
- imem_ifacetype = 'bare_wb'
+ imem_ifacetype = 'bare_wb'
+
+ # default MSR
+ msr_reset = (1<<MSR.LE) | (1<<MSR.SF) # 64-bit, little-endian default
+
+ # default PC
+ if args.pc_reset.startswith("0x"):
+ pc_reset = int(args.pc_reset, 16)
+ else:
+ pc_reset = int(args.pc_reset)
pspec = TestMemPspec(ldst_ifacetype=ldst_ifacetype,
imem_ifacetype=imem_ifacetype,
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
+ # pipeline and integer register file width
+ XLEN=args.xlen,
# must leave at 64
reg_wid=64,
# set to 32 for instruction-memory width=32
sram4x4kblock=args.enable_sram4x4kblock, # add SRAMs
debug=args.debug, # set to jtag or dmi
svp64=args.svp64, # enable SVP64
- mmu=args.mmu, # enable MMU
- units=units)
+ microwatt_mmu=args.mmu, # enable MMU
+ microwatt_compat=args.mwcompat, # microwatt compatible
+ microwatt_old=args.old_mwcompat, # old microwatt api
+ microwatt_debug=args.mwdebug, # microwatt debug signals
+ fabric_compat=args.fabriccompat, # fabric compatible (overlaps with microwatt compat)
+ small_cache=args.smallcache, # small cache/TLB sizes
+ allow_overlap=args.allow_overlap, # allow overlap
+ units=units,
+ msr_reset=msr_reset,
+ pc_reset=pc_reset)
+ #if args.mwcompat:
+ # pspec.core_domain = 'sync'
- print("mmu", pspec.__dict__["mmu"])
+ print("mmu", pspec.__dict__["microwatt_mmu"])
print("nocore", pspec.__dict__["nocore"])
print("regreduce", pspec.__dict__["regreduce"])
print("gpio", pspec.__dict__["gpio"])
print("use_pll", pspec.__dict__["use_pll"])
print("debug", pspec.__dict__["debug"])
print("SVP64", pspec.__dict__["svp64"])
+ print("XLEN", pspec.__dict__["XLEN"])
+ print("MSR@reset", hex(pspec.__dict__["msr_reset"]))
+ print("PC@reset", hex(pspec.__dict__["pc_reset"]))
+ print("Microwatt compatibility", pspec.__dict__["microwatt_compat"])
+ print("Old Microwatt compatibility", pspec.__dict__["microwatt_old"])
+ print("Microwatt debug", pspec.__dict__["microwatt_debug"])
+ print("Fabric compatibility", pspec.__dict__["fabric_compat"])
+ print("Small Cache/TLB", pspec.__dict__["small_cache"])
- dut = TestIssuer(pspec)
+ if args.mwcompat:
+ dut = TestIssuerInternal(pspec)
+ name = "external_core_top"
+ else:
+ dut = TestIssuer(pspec)
+ name = "test_issuer"
- vl = verilog.convert(dut, ports=dut.external_ports(), name="test_issuer")
+ vl = verilog.convert(dut, ports=dut.external_ports(), name=name)
with open(args.output_filename, "w") as f:
f.write(vl)
related bugs:
* https://bugs.libre-soc.org/show_bug.cgi?id=363
+ * https://bugs.libre-soc.org/show_bug.cgi?id=686
"""
+
from nmigen import Module, Signal, Cat
from nmigen.back.pysim import Simulator, Delay, Settle
from nmutil.formaltest import FHDLTestCase
from nmigen.cli import rtlil
import unittest
+from openpower.test.state import (SimState, teststate_check_regs,
+ teststate_check_mem)
+from soc.simple.test.teststate import HDLState
from openpower.decoder.isa.caller import special_sprs
from openpower.decoder.power_decoder import create_pdecode
from openpower.decoder.power_decoder2 import PowerDecode2
from openpower.decoder.selectable_int import SelectableInt
from openpower.decoder.isa.all import ISA
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from openpower.state import CoreState
# note that using SPRreduced has to be done to match the
# PowerDecoder2 SPR map
from openpower.decoder.power_enums import spr_dict, Function, XER_bits
from soc.config.test.test_loadstore import TestMemPspec
from openpower.endian import bigendian
+from soc.regfile.regfiles import StateRegs
from soc.simple.core import NonProductionCore
from soc.experiment.compalu_multi import find_ok # hack
-from soc.fu.compunits.test.test_compunit import (setup_test_memory,
+from soc.fu.compunits.test.test_compunit import (setup_tst_memory,
check_sim_memory)
# test with ALU data and Logical data
from soc.fu.cr.test.test_pipe_caller import CRTestCase
from soc.fu.branch.test.test_pipe_caller import BranchTestCase
from soc.fu.ldst.test.test_pipe_caller import LDSTTestCase
-from openpower.util import spr_to_fast_reg
+from openpower.test.general.overlap_hazards import (HazardTestCase,
+ RandomHazardTestCase)
+from openpower.util import spr_to_fast_reg, spr_to_state_reg
+
+from openpower.consts import StateRegsEnum
# list of SPRs that are controlled and managed by the MMU
-mmu_sprs = ["PRTBL", "DSISR", "DAR", "PIDR"]
+mmu_sprs = ["PRTBL", "PIDR"]
+ldst_sprs = ["DAR", "DSISR"]
+
-def set_mmu_spr(name, i, val, core): #important keep pep8 formatting
- fsm = core.fus.get_fu("mmu0").alu
- yield fsm.mmu.l_in.mtspr.eq(1)
- yield fsm.mmu.l_in.sprn.eq(i)
- yield fsm.mmu.l_in.rs.eq(val)
+def set_mmu_spr(name, i, val, core): # important keep pep8 formatting
+ fsm = core.fus.get_fu("mmu0").alu
+ yield fsm.mmu.l_in.mtspr.eq(1)
+ yield fsm.mmu.l_in.sprn.eq(i)
+ yield fsm.mmu.l_in.rs.eq(val)
+ yield
+ yield fsm.mmu.l_in.mtspr.eq(0)
+ while True:
+ done = yield fsm.mmu.l_out.done
+ if done:
+ break
yield
- yield fsm.mmu.l_in.mtspr.eq(0)
- print("mmu_spr was updated")
+ yield
+ print("mmu_spr %s %d was updated %x" % (name, i, val))
+
+
+def set_ldst_spr(name, i, val, core): # important keep pep8 formatting
+ ldst = core.fus.get_fu("mmu0").alu.ldst # awkward to get at but it works
+ yield ldst.sprval_in.eq(val)
+ yield ldst.mmu_set_spr.eq(1)
+ if name == 'DAR':
+ yield ldst.mmu_set_dar.eq(1)
+ yield
+ yield ldst.mmu_set_dar.eq(0)
+ else:
+ yield ldst.mmu_set_dsisr.eq(1)
+ yield
+ yield ldst.mmu_set_dsisr.eq(0)
+ yield ldst.mmu_set_spr.eq(0)
+ print("ldst_spr %s %d was updated %x" % (name, i, val))
+
def setup_regs(pdecode2, core, test):
yield intregs.memory._array[i].eq(test.regs[i])
yield Settle()
+ # set up MSR in STATE regfile, "direct" write (bypass rd/write ports)
+ stateregs = core.regs.state
+ yield stateregs.regs[StateRegsEnum.MSR].reg.eq(test.msr)
+
# set up CR regfile, "direct" write across all CRs
cr = test.cr
crregs = core.regs.cr
print("setup cr reg", hex(cr))
for i in range(8):
#j = 7-i
- cri = (cr >> (i*4)) & 0xf
+ cri = (cr >> (i * 4)) & 0xf
#cri = int('{:04b}'.format(cri)[::-1], 2)
print("setup cr reg", hex(cri), i,
crregs.regs[i].reg.shape())
# setting both fast and slow SPRs from test data
fregs = core.regs.fast
+ stateregs = core.regs.state
sregs = core.regs.spr
for sprname, val in test.sprs.items():
if isinstance(val, SelectableInt):
sprname = spr_dict[sprname].SPR
if sprname == 'XER':
continue
+ print ('set spr %s val %x' % (sprname, val))
+
fast = spr_to_fast_reg(sprname)
- if fast is None:
+ state = spr_to_state_reg(sprname)
+
+ if fast is None and state is None:
# match behaviour of SPRMap in power_decoder2.py
for i, x in enumerate(SPR):
if sprname == x.name:
- print("setting slow SPR %d (%s) to %x" %
- (i, sprname, val))
- if not sprname in mmu_sprs:
- yield sregs.memory._array[i].eq(val)
+ print("setting slow SPR %d (%s/%d) to %x" %
+ (i, sprname, x.value, val))
+ if sprname in mmu_sprs:
+ yield from set_mmu_spr(sprname, x.value, val, core)
+ elif sprname in ldst_sprs:
+ yield from set_ldst_spr(sprname, x.value, val, core)
else:
- yield from set_mmu_spr(sprname, i, val, core)
+ yield sregs.memory._array[i].eq(val)
+ elif state is not None:
+ print("setting state reg %d (%s) to %x" %
+ (state, sprname, val))
+ if stateregs.unary:
+ rval = stateregs.regs[state].reg
+ else:
+ rval = stateregs.memory._array[state]
+ yield rval.eq(val)
else:
print("setting fast reg %d (%s) to %x" %
(fast, sprname, val))
def check_regs(dut, sim, core, test, code):
- # int regs
- intregs = []
- for i in range(32):
- if core.regs.int.unary:
- rval = yield core.regs.int.regs[i].reg
- else:
- rval = yield core.regs.int.memory._array[i]
- intregs.append(rval)
- print("int regs", list(map(hex, intregs)))
- for i in range(32):
- simregval = sim.gpr[i].asint()
- dut.assertEqual(simregval, intregs[i],
- "int reg %d not equal %s. got %x expected %x" % \
- (i, repr(code), simregval, intregs[i]))
-
- # CRs
- crregs = []
- for i in range(8):
- rval = yield core.regs.cr.regs[i].reg
- crregs.append(rval)
- print("cr regs", list(map(hex, crregs)))
- for i in range(8):
- rval = crregs[i]
- cri = sim.crl[7-i].get_range().value
- print("cr reg", i, hex(cri), i, hex(rval))
- # XXX https://bugs.libre-soc.org/show_bug.cgi?id=363
- dut.assertEqual(cri, rval,
- "cr reg %d not equal %s" % (i, repr(code)))
-
- # XER
- xregs = core.regs.xer
- so = yield xregs.regs[xregs.SO].reg
- ov = yield xregs.regs[xregs.OV].reg
- ca = yield xregs.regs[xregs.CA].reg
-
- print("sim SO", sim.spr['XER'][XER_bits['SO']])
- e_so = sim.spr['XER'][XER_bits['SO']].value
- e_ov = sim.spr['XER'][XER_bits['OV']].value
- e_ov32 = sim.spr['XER'][XER_bits['OV32']].value
- e_ca = sim.spr['XER'][XER_bits['CA']].value
- e_ca32 = sim.spr['XER'][XER_bits['CA32']].value
-
- e_ov = e_ov | (e_ov32 << 1)
- e_ca = e_ca | (e_ca32 << 1)
+ # create the two states and compare
+ testdic = {'sim': sim, 'hdl': core}
+ yield from teststate_check_regs(dut, testdic, test, code)
- print("after: so/ov-32/ca-32", so, bin(ov), bin(ca))
- dut.assertEqual(e_so, so, "so mismatch %s" % (repr(code)))
- dut.assertEqual(e_ov, ov, "ov mismatch %s" % (repr(code)))
- dut.assertEqual(e_ca, ca, "ca mismatch %s" % (repr(code)))
- # Check the PC as well
- state = core.regs.state
- pc = yield state.r_ports['cia'].data_o
- e_pc = sim.pc.CIA.value
- dut.assertEqual(e_pc, pc)
+def check_mem(dut, sim, core, test, code):
+ # create the two states and compare mem
+ testdic = {'sim': sim, 'hdl': core}
+ yield from teststate_check_mem(dut, testdic, test, code)
def wait_for_busy_hi(cu):
def wait_for_busy_clear(cu):
while True:
- busy_o = yield cu.busy_o
- terminate_o = yield cu.core_terminate_o
+ busy_o = yield cu.o.busy_o
+ terminate_o = yield cu.o.core_terminate_o
if not busy_o:
print("busy/terminate:", busy_o, terminate_o)
break
m = Module()
comb = m.d.comb
instruction = Signal(32)
- ivalid_i = Signal()
+
+ units = {'alu': 3, 'cr': 1, 'branch': 1, 'trap': 1,
+ 'spr': 1,
+ 'logical': 1,
+ 'mul': 3,
+ 'div': 1, 'shiftrot': 1}
pspec = TestMemPspec(ldst_ifacetype='testpi',
imem_ifacetype='',
addr_wid=48,
mask_wid=8,
+ units=units,
+ allow_overlap=True,
reg_wid=64)
+ cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
+ pdecode2 = PowerDecode2(None, state=cur_state,
+ #opkls=IssuerDecode2ToOperand,
+ svp64_en=True, # self.svp64_en,
+ regreduce_en=False, #self.regreduce_en
+ )
+
m.submodules.core = core = NonProductionCore(pspec)
- pdecode2 = core.pdecode2
+ m.submodules.pdecode2 = pdecode2
+ core.pdecode2 = pdecode2
l0 = core.l0
- comb += core.raw_opcode_i.eq(instruction)
- comb += core.ivalid_i.eq(ivalid_i)
+ comb += pdecode2.dec.raw_opcode_in.eq(instruction)
+ comb += pdecode2.dec.bigendian.eq(bigendian) # little / big?
+ comb += core.i.e.eq(pdecode2.e)
+ comb += core.i.state.eq(cur_state)
+ comb += core.i.raw_insn_i.eq(instruction)
+ comb += core.i.bigendian_i.eq(bigendian)
+
+ # set the PC StateRegs read port to always send back the PC
+ stateregs = core.regs.state
+ pc_regnum = StateRegs.PC
+ comb += stateregs.r_ports['cia'].ren.eq(1<<pc_regnum)
# temporary hack: says "go" immediately for both address gen and ST
ldst = core.fus.fus['ldst0']
- m.d.comb += ldst.ad.go.eq(ldst.ad.rel) # link addr-go direct to rel
- m.d.comb += ldst.st.go.eq(ldst.st.rel) # link store-go direct to rel
+ m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) # link addr-go to rel
+ m.d.comb += ldst.st.go_i.eq(ldst.st.rel_o) # link store-go to rel
# nmigen Simulation
sim = Simulator(m)
sim.add_clock(1e-6)
def process():
- yield core.issue_i.eq(0)
yield
for test in self.test_data:
print(test.name)
program = test.program
- self.subTest(test.name)
- sim = ISA(pdecode2, test.regs, test.sprs, test.cr, test.mem,
- test.msr,
- bigendian=bigendian)
- gen = program.generate_instructions()
- instructions = list(zip(gen, program.assembly.splitlines()))
-
- yield from setup_test_memory(l0, sim)
- yield from setup_regs(core, test)
-
- index = sim.pc.CIA.value//4
- while index < len(instructions):
- ins, code = instructions[index]
-
- print("instruction: 0x{:X}".format(ins & 0xffffffff))
- print(code)
-
- # ask the decoder to decode this binary data (endian'd)
- yield core.bigendian_i.eq(bigendian) # little / big?
- yield instruction.eq(ins) # raw binary instr.
- yield ivalid_i.eq(1)
- yield Settle()
- # fn_unit = yield pdecode2.e.fn_unit
- #fuval = self.funit.value
- #self.assertEqual(fn_unit & fuval, fuval)
-
- # set operand and get inputs
- yield from set_issue(core, pdecode2, sim)
- yield Settle()
-
- yield from wait_for_busy_clear(core)
- yield ivalid_i.eq(0)
- yield
-
- print("sim", code)
- # call simulated operation
- opname = code.split(' ')[0]
- yield from sim.call(opname)
- index = sim.pc.CIA.value//4
-
- # register check
- yield from check_regs(self, sim, core, test, code)
-
- # Memory check
- yield from check_sim_memory(self, l0, sim, code)
+ with self.subTest(test.name):
+ sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
+ test.mem,
+ test.msr,
+ bigendian=bigendian)
+ gen = program.generate_instructions()
+ instructions = list(zip(gen, program.assembly.splitlines()))
+
+ yield from setup_tst_memory(l0, test.mem)
+ yield from setup_regs(pdecode2, core, test)
+
+ index = sim.pc.CIA.value // 4
+ while index < len(instructions):
+ ins, code = instructions[index]
+
+ print("instruction: 0x{:X}".format(ins & 0xffffffff))
+ print(code)
+
+ # ask the decoder to decode this binary data (endian'd)
+ yield instruction.eq(ins) # raw binary instr.
+ yield Settle()
+
+ print("sim", code)
+ # call simulated operation
+ opname = code.split(' ')[0]
+ yield from sim.call(opname)
+ pc = sim.pc.CIA.value
+ nia = sim.pc.NIA.value
+ index = pc // 4
+
+ # set the PC to the same simulated value
+ # (core is not able to do this itself, except
+ # for branch / TRAP)
+ print ("after call, pc nia", pc, nia)
+ yield stateregs.regs[pc_regnum].reg.eq(pc)
+ yield Settle()
+
+ yield core.p.i_valid.eq(1)
+ yield
+ o_ready = yield core.p.o_ready
+ while True:
+ if o_ready:
+ break
+ yield
+ o_ready = yield core.p.o_ready
+ yield core.p.i_valid.eq(0)
+
+ # set operand and get inputs
+ yield from wait_for_busy_clear(core)
+
+ # synchronised (non-overlap) is fine to check
+ if not core.allow_overlap:
+ # register check
+ yield from check_regs(self, sim, core, test, code)
+
+ # Memory check
+ yield from check_mem(self, sim, core, test, code)
+
+ # non-overlap mode is only fine to check right at the end
+ if core.allow_overlap:
+ # wait until all settled
+ # XXX really this should be in DMI, which should in turn
+ # use issuer.any_busy to not send back "stopped" signal
+ while (yield core.o.any_busy_o):
+ yield
+ yield Settle()
+
+ # register check
+ yield from check_regs(self, sim, core, test, code)
+
+ # Memory check
+ yield from check_mem(self, sim, core, test, code)
+
+ # give a couple extra clock cycles for gtkwave display to be happy
+ yield
+ yield
sim.add_sync_process(process)
with sim.write_vcd("core_simulator.vcd", "core_simulator.gtkw",
if __name__ == "__main__":
unittest.main(exit=False)
suite = unittest.TestSuite()
- suite.addTest(TestRunner(LDSTTestCase().test_data))
- suite.addTest(TestRunner(CRTestCase().test_data))
- suite.addTest(TestRunner(ShiftRotTestCase().test_data))
- suite.addTest(TestRunner(LogicalTestCase().test_data))
- suite.addTest(TestRunner(ALUTestCase().test_data))
- suite.addTest(TestRunner(BranchTestCase().test_data))
+ suite.addTest(TestRunner(HazardTestCase().test_data))
+ suite.addTest(TestRunner(RandomHazardTestCase().test_data))
+ #suite.addTest(TestRunner(LDSTTestCase().test_data))
+ #suite.addTest(TestRunner(CRTestCase().test_data))
+ #suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+ #suite.addTest(TestRunner(LogicalTestCase().test_data))
+ #suite.addTest(TestRunner(ALUTestCase().test_data))
+ #suite.addTest(TestRunner(BranchTestCase().test_data))
runner = unittest.TextTestRunner()
runner.run(suite)
# test with ALU data and Logical data
from openpower.test.alu.alu_cases import ALUTestCase
+from openpower.test.general.overlap_hazards import HazardTestCase
from openpower.test.div.div_cases import DivTestCases
+from openpower.test.mul.mul_cases import MulTestCases2Arg
from openpower.test.logical.logical_cases import LogicalTestCase
from openpower.test.shift_rot.shift_rot_cases import ShiftRotTestCase
+from openpower.test.shift_rot.shift_rot_cases2 import ShiftRotTestCase2
from openpower.test.cr.cr_cases import CRTestCase
from openpower.test.branch.branch_cases import BranchTestCase
-# from soc.fu.spr.test.test_pipe_caller import SPRTestCase
+from soc.fu.spr.test.test_pipe_caller import SPRTestCase
from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.trap.trap_cases import TrapTestCase
from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
-# from openpower.simulator.test_helloworld_sim import HelloTestCases
+from openpower.simulator.test_helloworld_sim import HelloTestCases
if __name__ == "__main__":
svp64 = True
- if len(sys.argv) == 2:
- if sys.argv[1] == 'nosvp64':
- svp64 = False
- sys.argv.pop()
+ if len(sys.argv) > 1 and sys.argv[1] == 'nosvp64':
+ svp64 = False
+ del sys.argv[1]
- print ("SVP64 test mode enabled", svp64)
+ # detect overlap case
+ allow_overlap = False
+ if len(sys.argv) >= 2 and sys.argv[1] == '--allow-overlap':
+ allow_overlap = True
+ del sys.argv[1]
+
+ # use in-order issuer, instead of the original FSM based one
+ inorder = False
+ if len(sys.argv) >= 2 and sys.argv[1] == '--inorder':
+ inorder = True
+ del sys.argv[1]
+
+ # allow list of testing to be selected by command-line
+ testing = []
+ for i in reversed(range(1, len(sys.argv))):
+ if not sys.argv[i].startswith('-'):
+ testing.append(sys.argv.pop(i))
+
+ if not testing:
+ testing = ['general', 'ldst', 'cr', 'shiftrot', 'shiftrot2',
+ 'logical', 'alu',
+ 'branch', 'div', 'mul', 'hazard']
+
+ print("SVP64 test mode enabled", svp64, "overlap",
+ allow_overlap, "in-order", inorder, "testing", testing)
unittest.main(exit=False)
suite = unittest.TestSuite()
- # suite.addTest(TestRunner(HelloTestCases.test_data, svp64=svp64))
- suite.addTest(TestRunner(DivTestCases().test_data, svp64=svp64))
- # suite.addTest(TestRunner(AttnTestCase.test_data, svp64=svp64))
- suite.addTest(TestRunner(GeneralTestCases.test_data, svp64=svp64))
- suite.addTest(TestRunner(LDSTTestCase().test_data, svp64=svp64))
- suite.addTest(TestRunner(CRTestCase().test_data, svp64=svp64))
- suite.addTest(TestRunner(ShiftRotTestCase().test_data, svp64=svp64))
- suite.addTest(TestRunner(LogicalTestCase().test_data, svp64=svp64))
- suite.addTest(TestRunner(ALUTestCase().test_data, svp64=svp64))
- suite.addTest(TestRunner(BranchTestCase().test_data, svp64=svp64))
- # suite.addTest(TestRunner(SPRTestCase.test_data, svp64=svp64))
+
+ # dictionary of data for tests
+ tests = {'hello': HelloTestCases.test_data,
+ 'div': DivTestCases().test_data,
+ 'mul': MulTestCases2Arg().test_data,
+ 'attn': AttnTestCase.test_data,
+ 'general': GeneralTestCases.test_data,
+ 'ldst': LDSTTestCase().test_data,
+ 'cr': CRTestCase().test_data,
+ 'shiftrot': ShiftRotTestCase().test_data,
+ 'shiftrot2': ShiftRotTestCase2().test_data,
+ 'logical': LogicalTestCase().test_data,
+ 'hazard': HazardTestCase().test_data,
+ 'alu': ALUTestCase().test_data,
+ 'branch': BranchTestCase().test_data,
+ 'trap': TrapTestCase().test_data,
+ 'spr': SPRTestCase().test_data
+ }
+
+ # walk through all tests, those requested get added
+ for tname, data in tests.items():
+ if tname in testing:
+ suite.addTest(TestRunner(data, svp64=svp64, inorder=inorder,
+ allow_overlap=allow_overlap))
runner = unittest.TextTestRunner()
runner.run(suite)
--- /dev/null
+"""dcbz test case
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=51
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator. it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+##########
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+class DCBZTestCase(TestAccumulatorBase):
+
+ def case_1_dcbz(self):
+ lst = ["dcbz 1, 2"]
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x0004
+ initial_regs[2] = 0x0008
+ initial_mem = {0x0000: (0x5432123412345678, 8),
+ 0x0008: (0xabcdef0187654321, 8),
+ 0x0020: (0x1828384822324252, 8),
+ }
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem)
+##########
+
+
+if __name__ == "__main__":
+ svp64 = False
+
+ unittest.main(exit=False)
+ suite = unittest.TestSuite()
+
+ # add other test cases later
+ suite.addTest(TestRunner(DCBZTestCase().test_data, svp64=svp64,
+ microwatt_mmu=True))
+
+ runner = unittest.TextTestRunner()
+ runner.run(suite)
--- /dev/null
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator. it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+ def case_first_vm_enabled(self):
+ lst = [
+ "std 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[2] = 0xc0000000005fc190
+ initial_regs[6] = 0x0101
+
+ # memory same as microwatt test
+ initial_mem = pagetables.microwatt_linux_5_7_boot
+
+ # set virtual and non-privileged
+ # msr: 8000000000000011
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.LE # little-endian
+ initial_msr |= 1 << MSR.SF # 64-bit
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+ # set PRTBL to 0xe000000
+ initial_sprs = {720: 0xe000000, # PRTBL
+ 48: 1 # PIDR
+ }
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+
+ def case_first_vm_enabled_2(self):
+ lst = [
+ "std 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[2] = 0xc000000000598000
+ initial_regs[6] = 0x0101
+
+ # memory same as microwatt test
+ initial_mem = pagetables.microwatt_linux_5_7_boot
+
+ # set virtual and non-privileged
+ # msr: 8000000000000011
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.LE # little-endian
+ initial_msr |= 1 << MSR.SF # 64-bit
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+ # set PRTBL to 0xe000000
+ initial_sprs = {720: 0xe00000c, # PRTBL
+ 48: 1 # PIDR
+ }
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+ svp64 = True
+ if len(sys.argv) == 2:
+ if sys.argv[1] == 'nosvp64':
+ svp64 = False
+ sys.argv.pop()
+
+ print ("SVP64 test mode enabled", svp64)
+
+ unittest.main(exit=False)
+ suite = unittest.TestSuite()
+
+ # MMU/DCache integration tests
+ suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+ microwatt_mmu=True,
+ rom=pagetables.microwatt_linux_5_7_boot))
+
+ runner = unittest.TextTestRunner()
+ runner.run(suite)
# step and comparison.
from soc.simple.test.test_runner import TestRunner
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
# test with MMU
from openpower.test.mmu.mmu_cases import MMUTestCase
from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+class MMUTestCase(TestAccumulatorBase):
+
+ # now working correctly
+ def case_1_dcbz(self):
+ lst = ["dcbz 1, 2", # MMUTEST.DCBZ: EA from adder 12
+ "dcbz 1, 3"] # MMUTEST.DCBZ: EA from adder 11
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x0004
+ initial_regs[2] = 0x0008
+ initial_regs[3] = 0x0007
+ initial_mem = {}
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem)
+
+ # MMUTEST: OP_TLBIE: insn_bits=39
+ def case_2_tlbie(self):
+ lst = ["tlbie 1,1,1,1,1"] # tlbie RB,RS,RIC,PRS,R
+ initial_regs = [0] * 32
+ initial_mem = {}
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem)
+
+ # OP_MTSPR: spr=720
+ def case_3_mtspr(self):
+ lst = ["mtspr 720,1"] # mtspr PRTBL,r1
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x1234
+ initial_mem = {}
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem)
+
+ # OP_MFSPR: spr=18/19
+ def case_4_mfspr(self):
+ lst = ["mfspr 1,18", # mtspr r1,DSISR
+ "mfspr 2,19"] # mtspr r2,DAR
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x1234
+ initial_regs[2] = 0x3456
+ initial_mem = {}
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem)
+
+ # new testcase for all sprs
+ def case_5_allsprs(self):
+ lst = ["mtspr 720,1", #MMUTEST: OP_MTSPR: spr=720
+ "mtspr 48,2", #MMUTEST: OP_MTSPR: spr=48
+ "mtspr 18,3", #MMUTEST: OP_MTSPR: spr=18
+ "mtspr 19,4", #MMUTEST: OP_MTSPR: spr=19
+ "mfspr 5,720", #MMUTEST: OP_MFSPR: spr=720 returns=4660
+ "mfspr 6,48", #MMUTEST: OP_MFSPR: spr=48 returns=13398
+ "mfspr 7,18", #MMUTEST: OP_MFSPR: spr=18 returns=17185
+ "mfspr 8,19" #MMUTEST: OP_MFSPR: spr=19 returns=25923
+ ]
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x1234
+ initial_regs[2] = 0x3456
+ initial_regs[3] = 0x4321
+ initial_regs[4] = 0x6543
+ initial_mem = {}
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem)
+
+ # MMUTEST: initial_msr= 16384
+ # msr 16384
+ # ISACaller initial_msr 16384
+ # FIXME msr does not get passed to LoadStore1
+ def case_5_ldst_exception(self):
+ lst = ["stb 10,0(2)"]
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x1234
+ initial_regs[2] = 0x3456
+ initial_regs[3] = 0x4321
+ initial_regs[4] = 0x6543
+ initial_regs[10] = 0xfe
+ initial_mem = {}
+ #enable virtmode
+ initial_msr = 1 << MSR.PR # must set "problem" state for virtual memory
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,initial_msr=initial_msr)
+
+ # deliberately misalign
+ def case_6_ldst_misalign(self):
+ lst = ["std 10,0(2)"]
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x1234
+ initial_regs[2] = 0x3456
+ initial_regs[3] = 0x4321
+ initial_regs[4] = 0x6543
+ initial_regs[10] = 0x0123456789abcdef
+ initial_mem = {}
+ #enable virtmode
+ initial_msr = 1 << MSR.PR # must set "problem" state for virtual memory
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,initial_msr=initial_msr)
+
if __name__ == "__main__":
svp64 = True
if len(sys.argv) == 2:
unittest.main(exit=False)
suite = unittest.TestSuite()
- #suite.addTest(TestRunner(GeneralTestCases.test_data, svp64=svp64,
- # microwatt_mmu=True))
- #suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
- # microwatt_mmu=True))
-
- # without ROM set
- #suite.addTest(TestRunner(MMUTestCaseROM().test_data, svp64=svp64,
- # microwatt_mmu=True))
- # LD/ST tests should all still work
- suite.addTest(TestRunner(LDSTTestCase().test_data, svp64=svp64,
+ # MMU/DCache integration tests
+ suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
microwatt_mmu=True))
- # LD/ST exception cases
- #suite.addTest(TestRunner(LDSTExceptionTestCase().test_data, svp64=svp64,
- # microwatt_mmu=True))
-
runner = unittest.TextTestRunner()
runner.run(suite)
--- /dev/null
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator. it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+ def case_virtual_ld_st(self):
+ lst = ["stb 10,0(2)",
+ "addi 10,0, -4",
+ "stb 10,0(5)",
+ "lhz 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x1000000 # hm, was going to do mtspr 720,1 with this
+ initial_regs[2] = 0x3456
+ initial_regs[3] = 0x4321
+ initial_regs[4] = 0x6543
+ initial_regs[5] = 0x3457
+ initial_regs[10] = 0xfe
+
+ # no pre-loaded memory here
+ initial_mem = {}
+
+ # set virtual and non-privileged
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ #initial_msr |= 1 << MSR.DR # set "virtual" state for data
+ initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+ initial_msr |= 1 << MSR.LE # set little-endian
+
+ # set PRTBL to 0x1000000
+ initial_sprs = {720: 0x1000000} # PRTBL
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+ def case_virtual_invalid_no_prtbl(self):
+ """virtual memory test but with no PRTBL set it is expected
+ to throw an "invalid" exception
+ """
+ lst = ["stb 10,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+
+ # set virtual and non-privileged
+ initial_msr = 1 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+ initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_msr=initial_msr,
+ stop_at_pc=0x400) # stop at this exception addr
+
+if __name__ == "__main__":
+ svp64 = True
+ if len(sys.argv) == 2:
+ if sys.argv[1] == 'nosvp64':
+ svp64 = False
+ sys.argv.pop()
+
+ print ("SVP64 test mode enabled", svp64)
+
+ unittest.main(exit=False)
+ suite = unittest.TestSuite()
+
+ # MMU/DCache integration tests
+ suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+ microwatt_mmu=True,
+ rom=pagetables.test1))
+
+ runner = unittest.TextTestRunner()
+ runner.run(suite)
--- /dev/null
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator. it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+ def case_microwatt_test_3_mmu_ld(self):
+ lst = [
+ "ld 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[2] = 0x124108
+
+ # memory same as microwatt test
+ initial_mem = pagetables.microwatt_test2
+
+ # set virtual and non-privileged
+ # msr: 8000000000000011
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.LE # little-endian
+ initial_msr |= 1 << MSR.SF # 64-bit
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+ # set PRTBL to 0x12000
+ initial_sprs = {720: 0x12000, # PRTBL
+ 48: 1 # PIDR
+ }
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+ svp64 = True
+ if len(sys.argv) == 2:
+ if sys.argv[1] == 'nosvp64':
+ svp64 = False
+ sys.argv.pop()
+
+ print ("SVP64 test mode enabled", svp64)
+
+ unittest.main(exit=False)
+ suite = unittest.TestSuite()
+
+ # MMU/DCache integration tests
+ suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+ microwatt_mmu=True,
+ rom=pagetables.microwatt_test2))
+
+ runner = unittest.TextTestRunner()
+ runner.run(suite)
from soc.simple.test.test_core import (setup_regs, check_regs,
wait_for_busy_clear,
wait_for_busy_hi)
-from soc.fu.compunits.test.test_compunit import (setup_test_memory,
- check_sim_memory,
+from soc.fu.compunits.test.test_compunit import (check_sim_memory,
get_l0_mem)
-from soc.simple.test.test_issuer import setup_i_memory
+from soc.simple.test.test_runner import setup_i_memory
+
+from pathlib import Path
import sys
sys.setrecursionlimit(10**6)
with Program("1.bin", bigendian) as program:
self.run_tst_program(program)
+ @unittest.skipUnless(Path("hello_world.bin").exists(),
+ "missing hello_world.bin")
def test_binary(self):
with Program("hello_world.bin", bigendian) as program:
self.run_tst_program(program)
pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
imem_ifacetype='test_bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64,
imem_test_depth=32768,
# blech! put the same listing into the data memory
data_mem = get_l0_mem(l0)
yield from setup_i_memory(data_mem, pc, instructions)
- # yield from setup_test_memory(l0, sim)
yield from setup_regs(core, test)
yield pc_i.eq(pc)
related bugs:
* https://bugs.libre-soc.org/show_bug.cgi?id=363
+ * https://bugs.libre-soc.org/show_bug.cgi?id=686#c51
"""
-from nmigen import Module, Signal, Cat, ClockSignal
+from nmigen import Module, Signal
from nmigen.hdl.xfrm import ResetInserter
+from copy import copy
+from pprint import pprint
# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle
-from nmutil.formaltest import FHDLTestCase
-from nmutil.gtkw import write_gtkw
-from nmigen.cli import rtlil
-from openpower.decoder.isa.caller import special_sprs, SVP64State
+from openpower.decoder.isa.caller import SVP64State
from openpower.decoder.isa.all import ISA
from openpower.endian import bigendian
-from openpower.decoder.power_decoder import create_pdecode
-from openpower.decoder.power_decoder2 import PowerDecode2
-from soc.regfile.regfiles import StateRegs
-
from soc.simple.issuer import TestIssuerInternal
+from soc.simple.inorder import TestIssuerInternalInOrder
-from soc.config.test.test_loadstore import TestMemPspec
-from soc.simple.test.test_core import (setup_regs, check_regs,
+from soc.simple.test.test_core import (setup_regs, check_regs, check_mem,
wait_for_busy_clear,
wait_for_busy_hi)
-from soc.fu.compunits.test.test_compunit import (setup_test_memory,
+from soc.fu.compunits.test.test_compunit import (setup_tst_memory,
check_sim_memory)
from soc.debug.dmi import DBGCore, DBGCtrl, DBGStat
from nmutil.util import wrap
-from soc.experiment.test.test_mmu_dcache import wb_get
+from openpower.test.state import TestState, StateRunner
+from openpower.test.runner import TestRunnerBase
+
+
+def insert_into_rom(startaddr, instructions, rom):
+ print("insn before, init rom", len(instructions))
+ pprint(rom)
+
+ startaddr //= 4 # instructions are 32-bit
+
+ # 64 bit
+ mask = ((1 << 64)-1)
+ for ins in instructions:
+ if isinstance(ins, tuple):
+ insn, code = ins
+ else:
+ insn, code = ins, ''
+ insn = insn & 0xffffffff
+ msbs = (startaddr >> 1) & mask
+ lsb = 1 if (startaddr & 1) else 0
+ print ("insn", hex(insn), hex(msbs), hex(lsb))
+
+ val = rom.get(msbs<<3, 0)
+ if insn != 0:
+ print("before set", hex(4*startaddr),
+ hex(msbs), hex(val), hex(insn))
+ val = (val | (insn << (lsb*32)))
+ val = val & mask
+ rom[msbs<<3] = val
+ if insn != 0:
+ print("after set", hex(4*startaddr), hex(msbs), hex(val))
+ print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
+ startaddr += 1
+ startaddr = startaddr & mask
+ print ("after insn insert")
+ pprint(rom)
-def setup_i_memory(imem, startaddr, instructions):
+
+def setup_i_memory(imem, startaddr, instructions, rom):
mem = imem
print("insn before, init mem", mem.depth, mem.width, mem,
len(instructions))
- for i in range(mem.depth):
- yield mem._array[i].eq(0)
- yield Settle()
+
+ if not rom:
+ # initialise mem array to zero
+ for i in range(mem.depth):
+ yield mem._array[i].eq(0)
+ yield Settle()
+
startaddr //= 4 # instructions are 32-bit
if mem.width == 32:
+ assert rom is None, "cannot do 32-bit from wb_get ROM yet"
mask = ((1 << 32)-1)
for ins in instructions:
if isinstance(ins, tuple):
insn, code = ins, ''
insn = insn & 0xffffffff
msbs = (startaddr >> 1) & mask
- val = yield mem._array[msbs]
+ lsb = 1 if (startaddr & 1) else 0
+
+ if rom: # must put the value into the wb_get area
+ val = rom[msbs<<1]
+ else:
+ val = yield mem._array[msbs]
if insn != 0:
print("before set", hex(4*startaddr),
hex(msbs), hex(val), hex(insn))
- lsb = 1 if (startaddr & 1) else 0
val = (val | (insn << (lsb*32)))
val = val & mask
- yield mem._array[msbs].eq(val)
- yield Settle()
+ if rom: # must put the value into the wb_get area
+ rom[msbs<<1] = val
+ else:
+ yield mem._array[msbs].eq(val)
+ yield Settle()
if insn != 0:
print("after set", hex(4*startaddr), hex(msbs), hex(val))
print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
return data
-class TestRunner(FHDLTestCase):
- def __init__(self, tst_data, microwatt_mmu=False, rom=None,
- svp64=True):
- super().__init__("run_all")
- self.test_data = tst_data
- self.microwatt_mmu = microwatt_mmu
- self.rom = rom
- self.svp64 = svp64
-
- def run_all(self):
- m = Module()
- comb = m.d.comb
- pc_i = Signal(32)
- svstate_i = Signal(32)
+class HDLRunner(StateRunner):
+ """HDLRunner: Implements methods for the setup, preparation, and
+ running of tests using nmigen HDL simulation.
+ """
+
+ def __init__(self, dut, m, pspec):
+ super().__init__("hdl", HDLRunner)
+
+ self.dut = dut
+ self.pspec = pspec
+ self.pc_i = Signal(32)
+ self.svstate_i = Signal(64)
- if self.microwatt_mmu:
- ldst_ifacetype = 'test_mmu_cache_wb'
- else:
- ldst_ifacetype = 'test_bare_wb'
- imem_ifacetype = 'test_bare_wb'
-
- pspec = TestMemPspec(ldst_ifacetype=ldst_ifacetype,
- imem_ifacetype=imem_ifacetype,
- addr_wid=48,
- mask_wid=8,
- imem_reg_wid=64,
- # wb_data_width=32,
- use_pll=False,
- nocore=False,
- xics=False,
- gpio=False,
- regreduce=True,
- svp64=self.svp64,
- mmu=self.microwatt_mmu,
- reg_wid=64)
#hard_reset = Signal(reset_less=True)
- issuer = TestIssuerInternal(pspec)
+ if pspec.inorder:
+ self.issuer = TestIssuerInternalInOrder(pspec)
+ else:
+ self.issuer = TestIssuerInternal(pspec)
# use DMI RESET command instead, this does actually work though
- #issuer = ResetInserter({'coresync': hard_reset,
+ # issuer = ResetInserter({'coresync': hard_reset,
# 'sync': hard_reset})(issuer)
- m.submodules.issuer = issuer
- imem = issuer.imem._get_memory()
- core = issuer.core
- dmi = issuer.dbg.dmi
- pdecode2 = issuer.pdecode2
- l0 = core.l0
- regreduce_en = pspec.regreduce_en == True
+ m.submodules.issuer = self.issuer
+ self.dmi = self.issuer.dbg.dmi
- # copy of the decoder for simulator
- simdec = create_pdecode()
- simdec2 = PowerDecode2(simdec, regreduce_en=regreduce_en)
- m.submodules.simdec2 = simdec2 # pain in the neck
+ comb = m.d.comb
+ comb += self.issuer.pc_i.data.eq(self.pc_i)
+ comb += self.issuer.svstate_i.data.eq(self.svstate_i)
- # run core clock at same rate as test clock
- intclk = ClockSignal("coresync")
- comb += intclk.eq(ClockSignal())
+ def prepare_for_test(self, test):
+ self.test = test
+ #print ("preparing for test name", test.name)
- comb += issuer.pc_i.data.eq(pc_i)
- comb += issuer.svstate_i.data.eq(svstate_i)
+ # set up bigendian (TODO: don't do this, use MSR)
+ yield self.issuer.core_bigendian_i.eq(bigendian)
+ yield Settle()
- # nmigen Simulation
- sim = Simulator(m)
- sim.add_clock(1e-6)
+ yield
+ yield
+ yield
+ yield
+ #print ("end of test preparation", test.name)
+
+ def setup_during_test(self):
+ # first run a manual hard-reset of the debug interface.
+ # core is counting down on a 3-clock delay at this point
+ yield self.issuer.dbg_rst_i.eq(1)
+ yield
+ yield self.issuer.dbg_rst_i.eq(0)
- def process():
+ # now run a DMI-interface reset. because DMI is running
+ # in dbgsync domain its reset is *NOT* connected to
+ # core reset (hence the dbg_rst_i blip, above)
+ yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
+ yield
+ #print("test setup")
+
+ def run_test(self, instructions):
+ """run_hdl_state - runs a TestIssuer nmigen HDL simulation
+ """
+
+ #print("starting test")
+
+ if self.dut.rom is None:
+ imem = self.issuer.imem._get_memory()
+ #print("got memory", imem)
+ else:
+ print("skipping memory get due to rom")
+ pprint(self.dut.rom)
+ core = self.issuer.core
+ dmi = self.issuer.dbg.dmi
+ pdecode2 = self.issuer.pdecode2
+ l0 = core.l0
+ hdl_states = []
+
+ # establish the TestIssuer context (mem, regs etc)
+
+ pc = 0 # start address
+ counter = 0 # test to pause/start
+
+ # XXX for now, when ROM (run under wb_get) is detected,
+ # skip setup of memories. must be done a different way
+ if self.dut.rom is None:
+ yield from setup_i_memory(imem, pc, instructions, self.dut.rom)
+ yield from setup_tst_memory(l0, self.test.mem)
+ else:
+ insert_into_rom(pc, instructions, self.dut.default_mem)
+ print("about to setup regs")
+ yield from setup_regs(pdecode2, core, self.test)
+ #print("setup mem and regs done")
+
+ # set PC and SVSTATE
+ yield self.pc_i.eq(pc)
+ yield self.issuer.pc_i.ok.eq(1)
+
+ # copy initial SVSTATE
+ initial_svstate = copy(self.test.svstate)
+ if isinstance(initial_svstate, int):
+ initial_svstate = SVP64State(initial_svstate)
+ yield self.svstate_i.eq(initial_svstate.value)
+ yield self.issuer.svstate_i.ok.eq(1)
+ yield
- # start in stopped
- yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
- yield
+ print("instructions", instructions)
- # get each test, completely reset the core, and run it
+ # before starting the simulation, set the core stop address to be
+ # just after the last instruction. if a load of an instruction is
+ # requested at this address, the core is immediately put into "halt"
+ # XXX: keep an eye out for in-order problems
+ hard_stop_addr = self.test.stop_at_pc
+ if hard_stop_addr is None:
+ hard_stop_addr = len(instructions)*4
+ yield from set_dmi(dmi, DBGCore.STOPADDR, hard_stop_addr)
- for test in self.test_data:
+ # run the loop of the instructions on the current test
+ index = (yield self.issuer.cur_state.pc) // 4
+ while index < len(instructions):
+ ins, code = instructions[index]
- # set up bigendian (TODO: don't do this, use MSR)
- yield issuer.core_bigendian_i.eq(bigendian)
- yield Settle()
+ print("hdl instr: 0x{:X}".format(ins & 0xffffffff))
+ print(index, code)
+ if counter == 0:
+ # start the core
yield
+ yield from set_dmi(dmi, DBGCore.CTRL,
+ 1 << DBGCtrl.START)
+ yield self.issuer.pc_i.ok.eq(0) # no change PC after this
+ yield self.issuer.svstate_i.ok.eq(0) # ditto
yield
yield
+
+ counter = counter + 1
+
+ # wait until executed
+ while not ((yield self.issuer.insn_done) or
+ (yield self.issuer.dbg.terminated_o)):
yield
- print(test.name)
- program = test.program
- with self.subTest(test.name):
- print("regs", test.regs)
- print("sprs", test.sprs)
- print("cr", test.cr)
- print("mem", test.mem)
- print("msr", test.msr)
- print("assem", program.assembly)
- gen = list(program.generate_instructions())
- insncode = program.assembly.splitlines()
- instructions = list(zip(gen, insncode))
-
- # set up the Simulator (which must track TestIssuer exactly)
- sim = ISA(simdec2, test.regs, test.sprs, test.cr, test.mem,
- test.msr,
- initial_insns=gen, respect_pc=True,
- disassembly=insncode,
- bigendian=bigendian,
- initial_svstate=test.svstate)
-
- # establish the TestIssuer context (mem, regs etc)
-
- pc = 0 # start address
- counter = 0 # test to pause/start
-
- yield from setup_i_memory(imem, pc, instructions)
- yield from setup_test_memory(l0, sim)
- yield from setup_regs(pdecode2, core, test)
-
- # set PC and SVSTATE
- yield pc_i.eq(pc)
- yield issuer.pc_i.ok.eq(1)
-
- initial_svstate = test.svstate
- if isinstance(initial_svstate, int):
- initial_svstate = SVP64State(initial_svstate)
- yield svstate_i.eq(initial_svstate.spr.value)
- yield issuer.svstate_i.ok.eq(1)
- yield
+ # okaaay long story: in overlap mode, PC is updated one cycle
+ # late.
+ if self.dut.allow_overlap:
+ yield
+ yield Settle()
- print("instructions", instructions)
-
- # run the loop of the instructions on the current test
- index = sim.pc.CIA.value//4
- while index < len(instructions):
- ins, code = instructions[index]
-
- print("instruction: 0x{:X}".format(ins & 0xffffffff))
- print(index, code)
-
- if counter == 0:
- # start the core
- yield
- yield from set_dmi(dmi, DBGCore.CTRL,
- 1<<DBGCtrl.START)
- yield issuer.pc_i.ok.eq(0) # no change PC after this
- yield issuer.svstate_i.ok.eq(0) # ditto
- yield
- yield
-
- counter = counter + 1
-
- # wait until executed
- while not (yield issuer.insn_done):
- yield
-
- # set up simulated instruction (in simdec2)
- try:
- yield from sim.setup_one()
- except KeyError: # instruction not in imem: stop
- break
- yield Settle()
-
- # call simulated operation
- print("sim", code)
- yield from sim.execute_one()
- yield Settle()
- index = sim.pc.CIA.value//4
-
- terminated = yield issuer.dbg.terminated_o
- print("terminated", terminated)
-
- if index >= len(instructions):
- print ("index over, send dmi stop")
- # stop at end
- yield from set_dmi(dmi, DBGCore.CTRL,
- 1<<DBGCtrl.STOP)
- yield
- yield
-
- # register check
- yield from check_regs(self, sim, core, test, code)
-
- # Memory check
- yield from check_sim_memory(self, l0, sim, code)
-
- terminated = yield issuer.dbg.terminated_o
- print("terminated(2)", terminated)
- if terminated:
- break
+ index = (yield self.issuer.cur_state.pc) // 4
+ terminated = yield self.issuer.dbg.terminated_o
+ print("terminated", terminated, index, len(instructions))
+
+ if index < len(instructions):
+ # Get HDL mem and state
+ state = yield from TestState("hdl", core, self.dut,
+ code)
+ hdl_states.append(state)
+
+ if index >= len(instructions):
+ print("index over, send dmi stop")
# stop at end
- yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+ yield from set_dmi(dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
yield
yield
+ # hmm really should use DMI status check here but hey it's quick
+ while True:
+ stopped = yield self.issuer.dbg.core_stop_o
+ if stopped:
+ break
+ yield
+ break
+
+ terminated = yield self.issuer.dbg.terminated_o
+ print("terminated(2)", terminated)
+ if terminated:
+ break
+
+ if self.dut.allow_overlap: # or not self.dut.rom: ??
+ # wait until all settled
+ # XXX really this should be in DMI, which should in turn
+ # use issuer.any_busy to not send back "stopped" signal
+ while (yield self.issuer.any_busy):
+ yield
- # get CR
- cr = yield from get_dmi(dmi, DBGCore.CR)
- print("after test %s cr value %x" % (test.name, cr))
+ if self.dut.allow_overlap:
+ # get last state, at end of run
+ state = yield from TestState("hdl", core, self.dut,
+ code)
+ hdl_states.append(state)
- # get XER
- xer = yield from get_dmi(dmi, DBGCore.XER)
- print("after test %s XER value %x" % (test.name, xer))
+ return hdl_states
- # test of dmi reg get
- for int_reg in range(32):
- yield from set_dmi(dmi, DBGCore.GSPR_IDX, int_reg)
- value = yield from get_dmi(dmi, DBGCore.GSPR_DATA)
+ def end_test(self):
+ yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
+ yield
+ yield
- print("after test %s reg %2d value %x" %
- (test.name, int_reg, value))
+ # TODO, here is where the static (expected) results
+ # can be checked: register check (TODO, memory check)
+ # see https://bugs.libre-soc.org/show_bug.cgi?id=686#c51
+ # yield from check_regs(self, sim, core, test, code,
+ # >>>expected_data<<<)
- # pull a reset
- yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.RESET)
- yield
+ # get CR
+ cr = yield from get_dmi(self.dmi, DBGCore.CR)
+ print("after test %s cr value %x" % (self.test.name, cr))
+
+ # get XER
+ xer = yield from get_dmi(self.dmi, DBGCore.XER)
+ print("after test %s XER value %x" % (self.test.name, xer))
+
+ # get MSR
+ msr = yield from get_dmi(self.dmi, DBGCore.MSR)
+ print("after test %s MSR value %x" % (self.test.name, msr))
+
+ # test of dmi reg get
+ for int_reg in range(32):
+ yield from set_dmi(self.dmi, DBGCore.GSPR_IDX, int_reg)
+ value = yield from get_dmi(self.dmi, DBGCore.GSPR_DATA)
- styles = {
- 'dec': {'base': 'dec'},
- 'bin': {'base': 'bin'},
- 'closed': {'closed': True}
- }
-
- traces = [
- 'clk',
- ('state machines', 'closed', [
- 'fetch_pc_valid_i', 'fetch_pc_ready_o',
- 'fetch_fsm_state',
- 'fetch_insn_valid_o', 'fetch_insn_ready_i',
- 'pred_insn_valid_i', 'pred_insn_ready_o',
- 'fetch_predicate_state',
- 'pred_mask_valid_o', 'pred_mask_ready_i',
- 'issue_fsm_state',
- 'exec_insn_valid_i', 'exec_insn_ready_o',
- 'exec_fsm_state',
- 'exec_pc_valid_o', 'exec_pc_ready_i',
- 'insn_done', 'core_stop_o', 'pc_i_ok', 'pc_changed',
- 'is_last', 'dec2.no_out_vec']),
- {'comment': 'fetch and decode'},
- (None, 'dec', [
- 'cia[63:0]', 'nia[63:0]', 'pc[63:0]',
- 'cur_pc[63:0]', 'core_core_cia[63:0]']),
- 'raw_insn_i[31:0]',
- 'raw_opcode_in[31:0]', 'insn_type',
- ('svp64 decoding', 'closed', [
- 'svp64_rm[23:0]', ('dec2.extra[8:0]', 'bin'),
- 'dec2.sv_rm_dec.mode', 'dec2.sv_rm_dec.predmode',
- 'dec2.sv_rm_dec.ptype_in',
- 'dec2.sv_rm_dec.dstpred[2:0]', 'dec2.sv_rm_dec.srcpred[2:0]',
- 'dstmask[63:0]', 'srcmask[63:0]',
- 'dregread[4:0]', 'dinvert',
- 'sregread[4:0]', 'sinvert',
- 'core.int.pred__addr[4:0]', 'core.int.pred__data_o[63:0]',
- 'core.int.pred__ren']),
- ('register augmentation', 'dec', 'closed', [
- {'comment': 'v3.0b registers'},
- 'dec2.dec_o.RT[4:0]',
- 'dec2.dec_a.RA[4:0]',
- 'dec2.dec_b.RB[4:0]',
- ('Rdest', [
- 'dec2.o_svdec.reg_in[4:0]',
- ('dec2.o_svdec.spec[2:0]', 'bin'),
- 'dec2.o_svdec.reg_out[6:0]']),
- ('Rsrc1', [
- 'dec2.in1_svdec.reg_in[4:0]',
- ('dec2.in1_svdec.spec[2:0]', 'bin'),
- 'dec2.in1_svdec.reg_out[6:0]']),
- ('Rsrc1', [
- 'dec2.in2_svdec.reg_in[4:0]',
- ('dec2.in2_svdec.spec[2:0]', 'bin'),
- 'dec2.in2_svdec.reg_out[6:0]']),
- {'comment': 'SVP64 registers'},
- 'dec2.rego[6:0]', 'dec2.reg1[6:0]', 'dec2.reg2[6:0]'
- ]),
- {'comment': 'svp64 context'},
- 'core_core_vl[6:0]', 'core_core_maxvl[6:0]',
- 'core_core_srcstep[6:0]', 'next_srcstep[6:0]',
- 'core_core_dststep[6:0]',
- {'comment': 'issue and execute'},
- 'core.core_core_insn_type',
- (None, 'dec', [
- 'core_rego[6:0]', 'core_reg1[6:0]', 'core_reg2[6:0]']),
- 'issue_i', 'busy_o',
- {'comment': 'dmi'},
- 'dbg.dmi_req_i', 'dbg.dmi_ack_o',
- {'comment': 'instruction memory'},
- 'imem.sram.rdport.memory(0)[63:0]',
- {'comment': 'registers'},
- # match with soc.regfile.regfiles.IntRegs port names
- 'core.int.rp_src1.memory(0)[63:0]',
- 'core.int.rp_src1.memory(1)[63:0]',
- 'core.int.rp_src1.memory(2)[63:0]',
- 'core.int.rp_src1.memory(3)[63:0]',
- 'core.int.rp_src1.memory(4)[63:0]',
- 'core.int.rp_src1.memory(5)[63:0]',
- 'core.int.rp_src1.memory(6)[63:0]',
- 'core.int.rp_src1.memory(7)[63:0]',
- 'core.int.rp_src1.memory(9)[63:0]',
- 'core.int.rp_src1.memory(10)[63:0]',
- 'core.int.rp_src1.memory(13)[63:0]',
- ]
-
- if self.microwatt_mmu:
- traces += [
- {'comment': 'microwatt_mmu'},
- 'core.fus.mmu0.alu_mmu0.illegal',
- 'core.fus.mmu0.alu_mmu0.debug0[3:0]',
- 'core.fus.mmu0.alu_mmu0.mmu.state',
- 'core.fus.mmu0.alu_mmu0.mmu.pid[31:0]',
- 'core.fus.mmu0.alu_mmu0.mmu.prtbl[63:0]',
- {'comment': 'wishbone_memory'},
- 'core.fus.mmu0.alu_mmu0.dcache.stb',
- 'core.fus.mmu0.alu_mmu0.dcache.cyc',
- 'core.fus.mmu0.alu_mmu0.dcache.we',
- 'core.fus.mmu0.alu_mmu0.dcache.ack',
- 'core.fus.mmu0.alu_mmu0.dcache.stall,'
- ]
-
- write_gtkw("issuer_simulator.gtkw",
- "issuer_simulator.vcd",
- traces, styles, module='top.issuer')
-
- # add run of instructions
- sim.add_sync_process(process)
-
- # optionally, if a wishbone-based ROM is passed in, run that as an
- # extra emulated process
- if self.rom is not None:
- dcache = core.fus.fus["mmu0"].alu.dcache
- default_mem = self.rom
- sim.add_sync_process(wrap(wb_get(dcache, default_mem, "DCACHE")))
-
- with sim.write_vcd("issuer_simulator.vcd"):
- sim.run()
+ print("after test %s reg %2d value %x" %
+ (self.test.name, int_reg, value))
+
+ # pull a reset
+ yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.RESET)
+ yield
+
+
+class TestRunner(TestRunnerBase):
+ def __init__(self, tst_data, microwatt_mmu=False, rom=None,
+ svp64=True, inorder=False, run_hdl=True, run_sim=True,
+ allow_overlap=False):
+ if run_hdl:
+ run_hdl = HDLRunner
+ super().__init__(tst_data, microwatt_mmu=microwatt_mmu,
+ rom=rom, inorder=inorder,
+ svp64=svp64, run_hdl=run_hdl, run_sim=run_sim,
+ allow_overlap=allow_overlap)
--- /dev/null
+""" Power ISA test API
+
+This module implements the creation, inspection and comparison
+of test states for TestIssuer HDL
+
+"""
+
+from openpower.decoder.power_enums import XER_bits
+from openpower.util import log
+from openpower.test.state import (State, state_add, state_factory,
+ TestState,)
+from soc.fu.compunits.test.test_compunit import get_l0_mem
+
+class HDLState(State):
+ """HDLState: Obtains registers and memory from an nmigen simulator
+ object by implementing State class methods.
+ """
+ def __init__(self, core):
+ super().__init__()
+ self.core = core
+
+ def get_fpregs(self):
+ if False:
+ yield
+ self.fpregs = []
+ for i in range(32):
+ self.fpregs.append(0)
+
+ def get_intregs(self):
+ self.intregs = []
+ for i in range(32):
+ if self.core.regs.int.unary:
+ rval = yield self.core.regs.int.regs[i].reg
+ else:
+ rval = yield self.core.regs.int.memory._array[i]
+ self.intregs.append(rval)
+ log("class hdl int regs", list(map(hex, self.intregs)))
+
+ def get_crregs(self):
+ self.crregs = []
+ for i in range(8):
+ rval = yield self.core.regs.cr.regs[7-i].reg
+ self.crregs.append(rval)
+ log("class hdl cr regs", list(map(hex, self.crregs)))
+
+ def get_xregs(self):
+ self.xregs = []
+ self.xr = self.core.regs.xer
+ self.so = yield self.xr.regs[self.xr.SO].reg
+ self.ov = yield self.xr.regs[self.xr.OV].reg
+ self.ca = yield self.xr.regs[self.xr.CA].reg
+ self.xregs.extend((self.so, self.ov, self.ca))
+ log("class hdl xregs", list(map(hex, self.xregs)))
+
+ def get_pc(self):
+ self.pcl = []
+ self.state = self.core.regs.state
+ # relies on the state.r_port being permanently held as PC
+ self.pc = yield self.state.r_ports['cia'].o_data
+ self.pcl.append(self.pc)
+ log("class hdl pc", hex(self.pc))
+
+ def get_mem(self):
+ self.mem = {}
+ # get the underlying HDL-simulated memory from the L0CacheBuffer
+ if hasattr(self.core, "icache"):
+ # err temporarily ignore memory
+ return # XXX have to work out how to deal with wb_get
+ hdlmem = get_l0_mem(self.core.l0)
+ for i in range(hdlmem.depth):
+ value = yield hdlmem._array[i] # should not really do this
+ self.mem[i*8] = value
+
+
+# add to State Factory
+state_add('hdl', HDLState)
+++ /dev/null
-*.wpr
-__pycache__
+++ /dev/null
-from nmigen import Module, Signal, Elaboratable
-from nmigen.lib.coding import Encoder, PriorityEncoder
-
-
-class AddressEncoder(Elaboratable):
- """Address Encoder
-
- The purpose of this module is to take in a vector and
- encode the bits that are one hot into an address. This module
- combines both nmigen's Encoder and PriorityEncoder and will state
- whether the input line has a single bit hot, multiple bits hot,
- or no bits hot. The output line will always have the lowest value
- address output.
-
- Usage:
- The output is valid when either single or multiple match is high.
- Otherwise output is 0.
- """
-
- def __init__(self, width):
- """ Arguments:
- * width: The desired length of the input vector
- """
- # Internal
- self.encoder = Encoder(width)
- self.p_encoder = PriorityEncoder(width)
-
- # Input
- self.i = Signal(width)
-
- # Output
- self.single_match = Signal(1)
- self.multiple_match = Signal(1)
- self.o = Signal(range(width))
-
- def elaborate(self, platform=None):
- m = Module()
-
- # Add internal submodules
- m.submodules.encoder = self.encoder
- m.submodules.p_encoder = self.p_encoder
-
- m.d.comb += [
- self.encoder.i.eq(self.i),
- self.p_encoder.i.eq(self.i)
- ]
-
- # Steps:
- # 1. check if the input vector is non-zero
- # 2. if non-zero, check if single match or multiple match
- # 3. set output line to be lowest value address output
-
- # If the priority encoder recieves an input of 0
- # If n is 1 then the output is not valid
- with m.If(self.p_encoder.n):
- m.d.comb += [
- self.single_match.eq(0),
- self.multiple_match.eq(0),
- self.o.eq(0)
- ]
- # If the priority encoder recieves an input > 0
- with m.Else():
- # Multiple Match if encoder n is invalid
- with m.If(self.encoder.n):
- m.d.comb += [
- self.single_match.eq(0),
- self.multiple_match.eq(1)
- ]
- # Single Match if encoder n is valid
- with m.Else():
- m.d.comb += [
- self.single_match.eq(1),
- self.multiple_match.eq(0)
- ]
- # Always set output based on priority encoder output
- m.d.comb += self.o.eq(self.p_encoder.o)
- return m
+++ /dev/null
-from nmigen import Array, Cat, Module, Signal, Elaboratable
-from nmigen.lib.coding import Decoder
-from nmigen.cli import main # , verilog
-
-from .CamEntry import CamEntry
-from .AddressEncoder import AddressEncoder
-
-
-class Cam(Elaboratable):
- """ Content Addressable Memory (CAM)
-
- The purpose of this module is to quickly look up whether an
- entry exists given a data key.
- This module will search for the given data in all internal entries
- and output whether a single or multiple match was found.
- If an single entry is found the address be returned and single_match
- is set HIGH. If multiple entries are found the lowest address is
- returned and multiple_match is set HIGH. If neither single_match or
- multiple_match are HIGH this implies no match was found. To write
- to the CAM set the address bus to the desired entry and set write_enable
- HIGH. Entry managment should be performed one level above this block
- as lookup is performed within.
-
- Notes:
- The read and write operations take one clock cycle to complete.
- Currently the read_warning line is present for interfacing but
- is not necessary for this design. This module is capable of writing
- in the first cycle, reading on the second, and output the correct
- address on the third.
- """
-
- def __init__(self, data_size, cam_size):
- """ Arguments:
- * data_size: (bits) The bit size of the data
- * cam_size: (number) The number of entries in the CAM
- """
-
- # Internal
- self.cam_size = cam_size
- self.encoder = AddressEncoder(cam_size)
- self.decoder = Decoder(cam_size)
- self.entry_array = Array(CamEntry(data_size) for x in range(cam_size))
-
- # Input
- self.enable = Signal(1)
- self.write_enable = Signal(1)
- self.data_in = Signal(data_size) # The data to be written
- self.data_mask = Signal(data_size) # mask for ternary writes
- # address of CAM Entry to write
- self.address_in = Signal(range(cam_size))
-
- # Output
- self.read_warning = Signal(1) # High when a read interrupts a write
- self.single_match = Signal(1) # High when there is only one match
- self.multiple_match = Signal(1) # High when there at least two matches
- # The lowest address matched
- self.match_address = Signal(range(cam_size))
-
- def elaborate(self, platform=None):
- m = Module()
- # AddressEncoder for match types and output address
- m.submodules.AddressEncoder = self.encoder
- # Decoder is used to select which entry will be written to
- m.submodules.Decoder = self.decoder
- # CamEntry Array Submodules
- # Note these area added anonymously
- entry_array = self.entry_array
- m.submodules += entry_array
-
- # Decoder logic
- m.d.comb += [
- self.decoder.i.eq(self.address_in),
- self.decoder.n.eq(0)
- ]
-
- encoder_vector = []
- with m.If(self.enable):
- # Set the key value for every CamEntry
- for index in range(self.cam_size):
-
- # Write Operation
- with m.If(self.write_enable):
- with m.If(self.decoder.o[index]):
- m.d.comb += entry_array[index].command.eq(2)
- with m.Else():
- m.d.comb += entry_array[index].command.eq(0)
-
- # Read Operation
- with m.Else():
- m.d.comb += entry_array[index].command.eq(1)
-
- # Send data input to all entries
- m.d.comb += entry_array[index].data_in.eq(self.data_in)
- # Send all entry matches to encoder
- ematch = entry_array[index].match
- encoder_vector.append(ematch)
-
- # Give input to and accept output from encoder module
- m.d.comb += [
- self.encoder.i.eq(Cat(*encoder_vector)),
- self.single_match.eq(self.encoder.single_match),
- self.multiple_match.eq(self.encoder.multiple_match),
- self.match_address.eq(self.encoder.o)
- ]
-
- # If the CAM is not enabled set all outputs to 0
- with m.Else():
- m.d.comb += [
- self.read_warning.eq(0),
- self.single_match.eq(0),
- self.multiple_match.eq(0),
- self.match_address.eq(0)
- ]
-
- return m
-
- def ports(self):
- return [self.enable, self.write_enable,
- self.data_in, self.data_mask,
- self.read_warning, self.single_match,
- self.multiple_match, self.match_address]
-
-
-if __name__ == '__main__':
- cam = Cam(4, 4)
- main(cam, ports=cam.ports())
+++ /dev/null
-from nmigen import Module, Signal, Elaboratable
-
-
-class CamEntry(Elaboratable):
- """ Content Addressable Memory (CAM) Entry
-
- The purpose of this module is to represent an entry within a CAM.
- This module when given a read command will compare the given data
- and output whether a match was found or not. When given a write
- command it will write the given data into internal registers.
- """
-
- def __init__(self, data_size):
- """ Arguments:
- * data_size: (bit count) The size of the data
- """
- # Input
- self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset
- self.data_in = Signal(data_size) # Data input when writing
-
- # Output
- self.match = Signal(1) # Result of the internal/input key comparison
- self.data = Signal(data_size)
-
- def elaborate(self, platform=None):
- m = Module()
- with m.Switch(self.command):
- with m.Case("00"):
- m.d.sync += self.match.eq(0)
- with m.Case("01"):
- with m.If(self.data == self.data_in):
- m.d.sync += self.match.eq(1)
- with m.Else():
- m.d.sync += self.match.eq(0)
- with m.Case("10"):
- m.d.sync += [
- self.data.eq(self.data_in),
- self.match.eq(0)
- ]
- with m.Case():
- m.d.sync += [
- self.match.eq(0),
- self.data.eq(0)
- ]
-
- return m
+++ /dev/null
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen.cli import verilog, rtlil
-
-
-class LFSRPolynomial(set):
- """ implements a polynomial for use in LFSR
- """
- def __init__(self, exponents=()):
- for e in exponents:
- assert isinstance(e, int), TypeError("%s must be an int" % repr(e))
- assert (e >= 0), ValueError("%d must not be negative" % e)
- set.__init__(self, set(exponents).union({0})) # must contain zero
-
- @property
- def max_exponent(self):
- return max(self) # derived from set, so this returns the max exponent
-
- @property
- def exponents(self):
- exponents = list(self) # get elements of set as a list
- exponents.sort(reverse=True)
- return exponents
-
- def __str__(self):
- expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2)
- retval = map(lambda i: expd[min(i,2)].format(i), self.exponents)
- return " + ".join(retval)
-
- def __repr__(self):
- return "LFSRPolynomial(%s)" % self.exponents
-
-
-# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs # noqa
-LFSR_POLY_2 = LFSRPolynomial([2, 1, 0])
-LFSR_POLY_3 = LFSRPolynomial([3, 2, 0])
-LFSR_POLY_4 = LFSRPolynomial([4, 3, 0])
-LFSR_POLY_5 = LFSRPolynomial([5, 3, 0])
-LFSR_POLY_6 = LFSRPolynomial([6, 5, 0])
-LFSR_POLY_7 = LFSRPolynomial([7, 6, 0])
-LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0])
-LFSR_POLY_9 = LFSRPolynomial([9, 5, 0])
-LFSR_POLY_10 = LFSRPolynomial([10, 7, 0])
-LFSR_POLY_11 = LFSRPolynomial([11, 9, 0])
-LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0])
-LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0])
-LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0])
-LFSR_POLY_15 = LFSRPolynomial([15, 14, 0])
-LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0])
-LFSR_POLY_17 = LFSRPolynomial([17, 14, 0])
-LFSR_POLY_18 = LFSRPolynomial([18, 11, 0])
-LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0])
-LFSR_POLY_20 = LFSRPolynomial([20, 17, 0])
-LFSR_POLY_21 = LFSRPolynomial([21, 19, 0])
-LFSR_POLY_22 = LFSRPolynomial([22, 21, 0])
-LFSR_POLY_23 = LFSRPolynomial([23, 18, 0])
-LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0])
-
-
-class LFSR(LFSRPolynomial, Elaboratable):
- """ implements a Linear Feedback Shift Register
- """
- def __init__(self, polynomial):
- """ Inputs:
- ------
- :polynomial: the polynomial to feedback on. may be a LFSRPolynomial
- instance or an iterable of ints (list/tuple/generator)
- :enable: enable (set LO to disable. NOTE: defaults to HI)
-
- Outputs:
- -------
- :state: the LFSR state. bitwidth is taken from the polynomial
- maximum exponent.
-
- Note: if an LFSRPolynomial is passed in as the input, because
- LFSRPolynomial is derived from set() it's ok:
- LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p)
- """
- LFSRPolynomial.__init__(self, polynomial)
- self.state = Signal(self.max_exponent, reset=1)
- self.enable = Signal(reset=1)
-
- def elaborate(self, platform):
- m = Module()
- # do absolutely nothing if the polynomial is empty (always has a zero)
- if self.max_exponent <= 1:
- return m
-
- # create XOR-bunch, select bits from state based on exponent
- feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain)
- for exponent in self:
- if exponent > 0: # don't have to skip, saves CPU cycles though
- feedback ^= self.state[exponent - 1]
-
- # if enabled, shift-and-feedback
- with m.If(self.enable):
- # shift up lower bits by Cat'ing in a new bit zero (feedback)
- newstate = Cat(feedback, self.state[:-1])
- m.d.sync += self.state.eq(newstate)
-
- return m
-
-
-# example: Poly24
-if __name__ == '__main__':
- p24 = rtlil.convert(LFSR(LFSR_POLY_24))
- with open("lfsr2_p24.il", "w") as f:
- f.write(p24)
+++ /dev/null
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from nmigen import Module
-from typing import Iterable, Optional, Iterator, Any, Union
-from typing_extensions import final
-
-
-@final
-class LFSRPolynomial(set):
- def __init__(self, exponents: Iterable[int] = ()):
- def elements() -> Iterable[int]: ...
- @property
- def exponents(self) -> list[int]: ...
- def __str__(self) -> str: ...
- def __repr__(self) -> str: ...
-
-
-@final
-class LFSR:
- def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ...
- @property
- def width(self) -> int: ...
- def elaborate(self, platform: Any) -> Module: ...
+++ /dev/null
-verilog:
- python3 Cam.py generate -t v > Cam.v
+++ /dev/null
-from nmigen import Cat, Memory, Module, Signal, Elaboratable
-from nmigen.cli import main
-from nmigen.cli import verilog, rtlil
-
-
-class MemorySet(Elaboratable):
- def __init__(self, data_size, tag_size, set_count, active):
- self.active = active
- input_size = tag_size + data_size # Size of the input data
- memory_width = input_size + 1 # The width of the cache memory
- self.active = active
- self.data_size = data_size
- self.tag_size = tag_size
-
- # XXX TODO, use rd-enable and wr-enable?
- self.mem = Memory(width=memory_width, depth=set_count)
- self.r = self.mem.read_port()
- self.w = self.mem.write_port()
-
- # inputs (address)
- self.cset = Signal(range(set_count)) # The set to be checked
- self.tag = Signal(tag_size) # The tag to find
- self.data_i = Signal(data_size) # Incoming data
-
- # outputs
- self.valid = Signal()
- self.data_o = Signal(data_size) # Outgoing data (excludes tag)
-
- def elaborate(self, platform):
- m = Module()
- m.submodules.mem = self.mem
- m.submodules.r = self.r
- m.submodules.w = self.w
-
- # temporaries
- active_bit = Signal()
- tag_valid = Signal()
- data_start = self.active + 1
- data_end = data_start + self.data_size
- tag_start = data_end
- tag_end = tag_start + self.tag_size
-
- # connect the read port address to the set/entry
- read_port = self.r
- m.d.comb += read_port.addr.eq(self.cset)
- # Pull out active bit from data
- data = read_port.data
- m.d.comb += active_bit.eq(data[self.active])
- # Validate given tag vs stored tag
- tag = data[tag_start:tag_end]
- m.d.comb += tag_valid.eq(self.tag == tag)
- # An entry is only valid if the tags match AND
- # is marked as a valid entry
- m.d.comb += self.valid.eq(tag_valid & active_bit)
-
- # output data: TODO, check rd-enable?
- m.d.comb += self.data_o.eq(data[data_start:data_end])
-
- # connect the write port addr to the set/entry (only if write enabled)
- # (which is only done on a match, see SAC.write_entry below)
- write_port = self.w
- with m.If(write_port.en):
- m.d.comb += write_port.addr.eq(self.cset)
- m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag))
-
- return m
+++ /dev/null
-from nmigen import Module, Signal, Elaboratable
-from nmigen.cli import main
-
-from soc.TLB.PteEntry import PteEntry
-
-
-class PermissionValidator(Elaboratable):
- """ The purpose of this Module is to check the Permissions of a given PTE
- against the requested access permissions.
-
- This module will either validate (by setting the valid bit HIGH)
- the request or find a permission fault and invalidate (by setting
- the valid bit LOW) the request
- """
-
- def __init__(self, asid_size, pte_size):
- """ Arguments:
- * asid_size: (bit count) The size of the asid to be processed
- * pte_size: (bit count) The size of the pte to be processed
-
- Return:
- * valid HIGH when permissions are correct
- """
- # Internal
- self.pte_entry = PteEntry(asid_size, pte_size)
-
- # Input
- self.data = Signal(asid_size + pte_size)
- self.xwr = Signal(3) # Execute, Write, Read
- self.super_mode = Signal(1) # Supervisor Mode
- self.super_access = Signal(1) # Supervisor Access
- self.asid = Signal(15) # Address Space IDentifier (ASID)
-
- # Output
- self.valid = Signal(1) # Denotes if the permissions are correct
-
- def elaborate(self, platform=None):
- m = Module()
-
- m.submodules.pte_entry = self.pte_entry
-
- m.d.comb += self.pte_entry.i.eq(self.data)
-
- # Check if the entry is valid
- with m.If(self.pte_entry.v):
- # ASID match or Global Permission
- # Note that the MSB bound is exclusive
- with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g):
- # Check Execute, Write, Read (XWR) Permissions
- with m.If(self.pte_entry.xwr == self.xwr):
- # Supervisor Logic
- with m.If(self.super_mode):
- # Valid if entry is not in user mode or supervisor
- # has Supervisor User Memory (SUM) access via the
- # SUM bit in the sstatus register
- m.d.comb += self.valid.eq((~self.pte_entry.u)
- | self.super_access)
- # User logic
- with m.Else():
- # Valid if the entry is in user mode only
- m.d.comb += self.valid.eq(self.pte_entry.u)
- with m.Else():
- m.d.comb += self.valid.eq(0)
- with m.Else():
- m.d.comb += self.valid.eq(0)
- with m.Else():
- m.d.comb += self.valid.eq(0)
- return m
+++ /dev/null
-from nmigen import Module, Signal, Elaboratable
-from nmigen.cli import main
-
-
-class PteEntry(Elaboratable):
- """ The purpose of this Module is to centralize the parsing of Page
- Table Entries (PTE) into one module to prevent common mistakes
- and duplication of code. The control bits are parsed out for
- ease of use.
-
- This module parses according to the standard PTE given by the
- Volume II: RISC-V Privileged Architectures V1.10 Pg 60.
- The Address Space IDentifier (ASID) is appended to the MSB of the input
- and is parsed out as such.
-
- An valid input Signal would be:
- ASID PTE
- Bits:[78-64][63-0]
-
- The output PTE value will include the control bits.
- """
- def __init__(self, asid_size, pte_size):
- """ Arguments:
- * asid_size: (bit count) The size of the asid to be processed
- * pte_size: (bit count) The size of the pte to be processed
-
- Return:
- * d The Dirty bit from the PTE portion of i
- * a The Accessed bit from the PTE portion of i
- * g The Global bit from the PTE portion of i
- * u The User Mode bit from the PTE portion of i
- * xwr The Execute/Write/Read bit from the PTE portion of i
- * v The Valid bit from the PTE portion of i
- * asid The asid portion of i
- * pte The pte portion of i
- """
- # Internal
- self.asid_start = pte_size
- self.asid_end = pte_size + asid_size
-
- # Input
- self.i = Signal(asid_size + pte_size)
-
- # Output
- self.d = Signal(1) # Dirty bit (From pte)
- self.a = Signal(1) # Accessed bit (From pte)
- self.g = Signal(1) # Global Access (From pte)
- self.u = Signal(1) # User Mode (From pte)
- self.xwr = Signal(3) # Execute Read Write (From pte)
- self.v = Signal(1) # Valid (From pte)
- self.asid = Signal(asid_size) # Associated Address Space IDentifier
- self.pte = Signal(pte_size) # Full Page Table Entry
-
- def elaborate(self, platform=None):
- m = Module()
- # Pull out all control bites from PTE
- m.d.comb += [
- self.d.eq(self.i[7]),
- self.a.eq(self.i[6]),
- self.g.eq(self.i[5]),
- self.u.eq(self.i[4]),
- self.xwr.eq(self.i[1:4]),
- self.v.eq(self.i[0])
- ]
- m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end])
- m.d.comb += self.pte.eq(self.i[0:self.asid_start])
- return m
+++ /dev/null
-"""
-
-Online simulator of 4-way set-associative cache:
-http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html
-
-Python simulator of a N-way set-associative cache:
-https://github.com/vaskevich/CacheSim/blob/master/cachesim.py
-"""
-
-from nmigen import Array, Cat, Memory, Module, Signal, Mux, Elaboratable
-from nmigen.compat.genlib import fsm
-from nmigen.cli import main
-from nmigen.cli import verilog, rtlil
-
-from .AddressEncoder import AddressEncoder
-from .MemorySet import MemorySet
-
-# TODO: use a LFSR that advances continuously and picking the bottom
-# few bits from it to select which cache line to replace, instead of PLRU
-# http://bugs.libre-riscv.org/show_bug.cgi?id=71
-from .ariane.plru import PLRU
-from .LFSR import LFSR, LFSR_POLY_24
-
-SA_NA = "00" # no action (none)
-SA_RD = "01" # read
-SA_WR = "10" # write
-
-
-class SetAssociativeCache(Elaboratable):
- """ Set Associative Cache Memory
-
- The purpose of this module is to generate a memory cache given the
- constraints passed in. This will create a n-way set associative cache.
- It is expected for the SV TLB that the VMA will provide the set number
- while the ASID provides the tag (still to be decided).
-
- """
-
- def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False):
- """ Arguments
- * tag_size (bits): The bit count of the tag
- * data_size (bits): The bit count of the data to be stored
- * set_count (number): The number of sets/entries in the cache
- * way_count (number): The number of slots a data can be stored
- in one set
- * lfsr: if set, use an LFSR for (pseudo-randomly) selecting
- set/entry to write to. otherwise, use a PLRU
- """
- # Internals
- self.lfsr_mode = lfsr
- self.way_count = way_count # The number of slots in one set
- self.tag_size = tag_size # The bit count of the tag
- self.data_size = data_size # The bit count of the data to be stored
-
- # set up Memory array
- self.mem_array = Array() # memory array
- for i in range(way_count):
- ms = MemorySet(data_size, tag_size, set_count, active=0)
- self.mem_array.append(ms)
-
- # Finds valid entries
- self.encoder = AddressEncoder(way_count)
-
- # setup PLRU or LFSR
- if lfsr:
- # LFSR mode
- self.lfsr = LFSR(LFSR_POLY_24)
- else:
- # PLRU mode
- # One block to handle plru calculations
- self.plru = PLRU(way_count)
- self.plru_array = Array() # PLRU data on each set
- for i in range(set_count):
- name = "plru%d" % i
- self.plru_array.append(Signal(self.plru.TLBSZ, name=name))
-
- # Input
- self.enable = Signal(1) # Whether the cache is enabled
- self.command = Signal(2) # 00=None, 01=Read, 10=Write (see SA_XX)
- self.cset = Signal(range(set_count)) # The set to be checked
- self.tag = Signal(tag_size) # The tag to find
- self.data_i = Signal(data_size) # The input data
-
- # Output
- self.ready = Signal(1) # 0 => Processing 1 => Ready for commands
- self.hit = Signal(1) # Tag matched one way in the given set
- # Tag matched many ways in the given set
- self.multiple_hit = Signal(1)
- self.data_o = Signal(data_size) # The data linked to the matched tag
-
- def check_tags(self, m):
- """ Validate the tags in the selected set. If one and only one
- tag matches set its state to zero and increment all others
- by one. We only advance to next state if a single hit is found.
- """
- # Vector to store way valid results
- # A zero denotes a way is invalid
- valid_vector = []
- # Loop through memory to prep read/write ports and set valid_vector
- for i in range(self.way_count):
- valid_vector.append(self.mem_array[i].valid)
-
- # Pass encoder the valid vector
- m.d.comb += self.encoder.i.eq(Cat(*valid_vector))
-
- # Only one entry should be marked
- # This is due to already verifying the tags
- # matched and the valid bit is high
- with m.If(self.hit):
- m.next = "FINISHED_READ"
- # Pull out data from the read port
- data = self.mem_array[self.encoder.o].data_o
- m.d.comb += self.data_o.eq(data)
- if not self.lfsr_mode:
- self.access_plru(m)
-
- # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k
- with m.Elif(self.multiple_hit):
- # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
- m.d.comb += self.data_o.eq(0)
-
- # No tag matches means no data
- with m.Else():
- # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
- m.d.comb += self.data_o.eq(0)
-
- def access_plru(self, m):
- """ An entry was accessed and the plru tree must now be updated
- """
- # Pull out the set's entry being edited
- plru_entry = self.plru_array[self.cset]
- m.d.comb += [
- # Set the plru data to the current state
- self.plru.plru_tree.eq(plru_entry),
- # Set that the cache was accessed
- self.plru.lu_access_i.eq(1)
- ]
-
- def read(self, m):
- """ Go through the read process of the cache.
- This takes two cycles to complete. First it checks for a valid tag
- and secondly it updates the LRU values.
- """
- with m.FSM() as fsm_read:
- with m.State("READY"):
- m.d.comb += self.ready.eq(0)
- # check_tags will set the state if the conditions are met
- self.check_tags(m)
- with m.State("FINISHED_READ"):
- m.next = "READY"
- m.d.comb += self.ready.eq(1)
- if not self.lfsr_mode:
- plru_tree_o = self.plru.plru_tree_o
- m.d.sync += self.plru_array[self.cset].eq(plru_tree_o)
-
- def write_entry(self, m):
- if not self.lfsr_mode:
- m.d.comb += [ # set cset (mem address) into PLRU
- self.plru.plru_tree.eq(self.plru_array[self.cset]),
- # and connect plru to encoder for write
- self.encoder.i.eq(self.plru.replace_en_o)
- ]
- write_port = self.mem_array[self.encoder.o].w
- else:
- # use the LFSR to generate a random(ish) one of the mem array
- lfsr_output = Signal(range(self.way_count))
- lfsr_random = Signal(range(self.way_count))
- m.d.comb += lfsr_output.eq(self.lfsr.state) # lose some bits
- # address too big, limit to range of array
- m.d.comb += lfsr_random.eq(Mux(lfsr_output > self.way_count,
- lfsr_output - self.way_count,
- lfsr_output))
- write_port = self.mem_array[lfsr_random].w
-
- # then if there is a match from the encoder, enable the selected write
- with m.If(self.encoder.single_match):
- m.d.comb += write_port.en.eq(1)
-
- def write(self, m):
- """ Go through the write process of the cache.
- This takes two cycles to complete. First it writes the entry,
- and secondly it updates the PLRU (in plru mode)
- """
- with m.FSM() as fsm_write:
- with m.State("READY"):
- m.d.comb += self.ready.eq(0)
- self.write_entry(m)
- m.next = "FINISHED_WRITE"
- with m.State("FINISHED_WRITE"):
- m.d.comb += self.ready.eq(1)
- if not self.lfsr_mode:
- plru_entry = self.plru_array[self.cset]
- m.d.sync += plru_entry.eq(self.plru.plru_tree_o)
- m.next = "READY"
-
- def elaborate(self, platform=None):
- m = Module()
-
- # ----
- # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array
- # ----
-
- m.submodules.AddressEncoder = self.encoder
- if self.lfsr_mode:
- m.submodules.LFSR = self.lfsr
- else:
- m.submodules.PLRU = self.plru
-
- for i, mem in enumerate(self.mem_array):
- setattr(m.submodules, "mem%d" % i, mem)
-
- # ----
- # select mode: PLRU connect to encoder, LFSR do... something
- # ----
-
- if not self.lfsr_mode:
- # Set what entry was hit
- m.d.comb += self.plru.lu_hit.eq(self.encoder.o)
- else:
- # enable LFSR
- m.d.comb += self.lfsr.enable.eq(self.enable)
-
- # ----
- # connect hit/multiple hit to encoder output
- # ----
-
- m.d.comb += [
- self.hit.eq(self.encoder.single_match),
- self.multiple_hit.eq(self.encoder.multiple_match),
- ]
-
- # ----
- # connect incoming data/tag/cset(addr) to mem_array
- # ----
-
- for mem in self.mem_array:
- write_port = mem.w
- m.d.comb += [mem.cset.eq(self.cset),
- mem.tag.eq(self.tag),
- mem.data_i.eq(self.data_i),
- write_port.en.eq(0), # default: disable write
- ]
- # ----
- # Commands: READ/WRITE/TODO
- # ----
-
- with m.If(self.enable):
- with m.Switch(self.command):
- # Search all sets at a particular tag
- with m.Case(SA_RD):
- self.read(m)
- with m.Case(SA_WR):
- self.write(m)
- # Maybe catch multiple tags write here?
- # TODO
- # TODO: invalidate/flush, flush-all?
-
- return m
-
- def ports(self):
- return [self.enable, self.command, self.cset, self.tag, self.data_i,
- self.ready, self.hit, self.multiple_hit, self.data_o]
-
-
-if __name__ == '__main__':
- sac = SetAssociativeCache(4, 8, 4, 6)
- vl = rtlil.convert(sac, ports=sac.ports())
- with open("SetAssociativeCache.il", "w") as f:
- f.write(vl)
-
- sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True)
- vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports())
- with open("SetAssociativeCacheLFSR.il", "w") as f:
- f.write(vl)
+++ /dev/null
-""" TLB Module
-
- The expected form of the data is:
- * Item (Bits)
- * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0)
-"""
-
-from nmigen import Memory, Module, Signal, Cat, Elaboratable
-from nmigen.cli import main
-
-from .PermissionValidator import PermissionValidator
-from .Cam import Cam
-
-
-class TLB(Elaboratable):
- def __init__(self, asid_size, vma_size, pte_size, L1_size):
- """ Arguments
- * asid_size: Address Space IDentifier (ASID) typically 15 bits
- * vma_size: Virtual Memory Address (VMA) typically 36 bits
- * pte_size: Page Table Entry (PTE) typically 64 bits
-
- Notes:
- These arguments should represent the largest possible size
- defined by the MODE settings. See
- Volume II: RISC-V Privileged Architectures V1.10 Page 57
- """
-
- # Internal
- self.state = 0
- # L1 Cache Modules
- self.cam_L1 = Cam(vma_size, L1_size)
- self.mem_L1 = Memory(width=asid_size + pte_size, depth=L1_size)
-
- # Permission Validator
- self.perm_validator = PermissionValidator(asid_size, pte_size)
-
- # Inputs
- self.supermode = Signal(1) # Supervisor Mode
- self.super_access = Signal(1) # Supervisor Access
- # 00=None, 01=Search, 10=Write L1, 11=Write L2
- self.command = Signal(2)
- self.xwr = Signal(3) # Execute, Write, Read
- self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
- self.address_L1 = Signal(range(L1_size))
- self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
- self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
- self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
-
- # Outputs
- self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
- self.perm_valid = Signal(1) # Denotes if the permissions are correct
- self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
-
- def search(self, m, read_L1, write_L1):
- """ searches the TLB
- """
- m.d.comb += [
- write_L1.en.eq(0),
- self.cam_L1.write_enable.eq(0),
- self.cam_L1.data_in.eq(self.vma)
- ]
- # Match found in L1 CAM
- match_found = Signal(reset_less=True)
- m.d.comb += match_found.eq(self.cam_L1.single_match
- | self.cam_L1.multiple_match)
- with m.If(match_found):
- # Memory shortcut variables
- mem_address = self.cam_L1.match_address
- # Memory Logic
- m.d.comb += read_L1.addr.eq(mem_address)
- # Permission Validator Logic
- m.d.comb += [
- self.hit.eq(1),
- # Set permission validator data to the correct
- # register file data according to CAM match
- # address
- self.perm_validator.data.eq(read_L1.data),
- # Execute, Read, Write
- self.perm_validator.xwr.eq(self.xwr),
- # Supervisor Mode
- self.perm_validator.super_mode.eq(self.supermode),
- # Supverisor Access
- self.perm_validator.super_access.eq(self.super_access),
- # Address Space IDentifier (ASID)
- self.perm_validator.asid.eq(self.asid),
- # Output result of permission validation
- self.perm_valid.eq(self.perm_validator.valid)
- ]
- # Only output PTE if permissions are valid
- with m.If(self.perm_validator.valid):
- # XXX TODO - dummy for now
- reg_data = Signal.like(self.pte_out)
- m.d.comb += [
- self.pte_out.eq(reg_data)
- ]
- with m.Else():
- m.d.comb += [
- self.pte_out.eq(0)
- ]
- # Miss Logic
- with m.Else():
- m.d.comb += [
- self.hit.eq(0),
- self.perm_valid.eq(0),
- self.pte_out.eq(0)
- ]
-
- def write_l1(self, m, read_L1, write_L1):
- """ writes to the L1 cache
- """
- # Memory_L1 Logic
- m.d.comb += [
- write_L1.en.eq(1),
- write_L1.addr.eq(self.address_L1),
- # The Cat places arguments from LSB -> MSB
- write_L1.data.eq(Cat(self.pte_in, self.asid))
- ]
- # CAM_L1 Logic
- m.d.comb += [
- self.cam_L1.write_enable.eq(1),
- self.cam_L1.data_in.eq(self.vma), # data_in is sent to all entries
- # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected
-
- ]
-
- def elaborate(self, platform):
- m = Module()
- # Add submodules
- # Submodules for L1 Cache
- m.submodules.cam_L1 = self.cam_L1
- m.submodules.read_L1 = read_L1 = self.mem_L1.read_port()
- m.submodules.write_L1 = write_L1 = self.mem_L1.write_port()
-
- # Permission Validator Submodule
- m.submodules.perm_valididator = self.perm_validator
-
- # When MODE specifies translation
- # TODO add in different bit length handling ie prefix 0s
- tlb_enable = Signal(reset_less=True)
- m.d.comb += tlb_enable.eq(self.mode != 0)
-
- with m.If(tlb_enable):
- m.d.comb += [
- self.cam_L1.enable.eq(1)
- ]
- with m.Switch(self.command):
- # Search
- with m.Case("01"):
- self.search(m, read_L1, write_L1)
-
- # Write L1
- # Expected that the miss will be handled in software
- with m.Case("10"):
- self.write_l1(m, read_L1, write_L1)
-
- # TODO
- # with m.Case("11"):
-
- # When disabled
- with m.Else():
- m.d.comb += [
- self.cam_L1.enable.eq(0),
- # XXX TODO - self.reg_file.enable.eq(0),
- self.hit.eq(0),
- self.perm_valid.eq(0), # XXX TODO, check this
- self.pte_out.eq(0)
- ]
- return m
-
-
-if __name__ == '__main__':
- tlb = TLB(15, 36, 64, 4)
- main(tlb, ports=[tlb.supermode, tlb.super_access, tlb.command,
- tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid,
- tlb.vma, tlb.pte_in,
- tlb.hit, tlb.perm_valid, tlb.pte_out,
- ] + tlb.cam_L1.ports())
+++ /dev/null
-#include <cstdint>
-#include <iostream>
-#include <cmath>
-
-
-#define NWAY 4
-#define NLINE 256
-#define HIT 0
-#define MISS 1
-#define MS 1000
-/*
-Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing
-Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt
-four-way set associative - three bits
- each bit represents one branch point in a binary decision tree; let 1
- represent that the left side has been referenced more recently than the
- right side, and 0 vice-versa
- are all 4 lines valid?
- / \
- yes no, use an invalid line
- |
- |
- |
- bit_0 == 0? state | replace ref to | next state
- / \ ------+-------- -------+-----------
- y n 00x | line_0 line_0 | 11_
- / \ 01x | line_1 line_1 | 10_
- bit_1 == 0? bit_2 == 0? 1x0 | line_2 line_2 | 0_1
- / \ / \ 1x1 | line_3 line_3 | 0_0
- y n y n
- / \ / \ ('x' means ('_' means unchanged)
- line_0 line_1 line_2 line_3 don't care)
- 8-way set associative - 7 = 1+2+4 bits
-16-way set associative - 15 = 1+2+4+8 bits
-32-way set associative - 31 = 1+2+4+8+16 bits
-64-way set associative - 63 = 1+2+4+8+16+32 bits
-*/
-using namespace std;
-struct AddressField {
- uint64_t wd_idx : 2;//Unused
- uint64_t offset : 4;//Unused
- uint64_t index : 8;//NLINE = 256 = 2^8
- uint64_t tag : 50;
-};
-
-union Address {
- uint32_t* p;
- AddressField fields;
-};
-
-struct Cell {
- bool v;
- uint64_t tag;
-
- Cell() : v(false), tag(0) {}
-
- bool isHit(uint64_t tag) {
- return v && (tag == this->tag);
- }
-
- void fetch(uint32_t* address) {
- Address addr;
- addr.p = address;
- addr.fields.offset = 0;
- addr.fields.wd_idx = 0;
- tag = addr.fields.tag;
- v = true;
- }
-};
-
-ostream& operator<<(ostream & out, const Cell& cell) {
- out << " v:" << cell.v << " tag:" << hex << cell.tag;
- return out;
-}
-
-struct Block {
- Cell cell[NWAY];
- uint32_t state;
- uint64_t *mask;//Mask the state to get accurate value for specified 1 bit.
- uint64_t *value;
- uint64_t *next_value;
-
- Block() : state(0) {
- switch (NWAY) {
- case 4:
- mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101};
- value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101};
- next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000};
- break;
- case 8:
- mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001,
- 0b1010001};
- value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000,
- 0b1010001};
- next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000,
- 0b0000001, 0b0000000};
- break;
- //TODO - more NWAY goes here.
- default:
- std::cout << "Error definition NWAY = " << NWAY << std::endl;
- }
- }
-
- uint32_t *getByTag(uint64_t tag, uint32_t *pway) {
- for (int i = 0; i < NWAY; ++i) {
- if (cell[i].isHit(tag)) {
- *pway = i;
- return pway;
- }
- }
- return NULL;
- }
-
- void setLRU(uint32_t *address) {
- int way = 0;
- uint32_t st = state;
- for (int i = 0; i < NWAY; ++i) {
- if ((state & mask[i]) == value[i]) {
- state ^= mask[i];
- way = i;
- break;
- }
- }
- cell[way].fetch(address);
- cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl;
- }
-
- uint32_t *get(uint32_t *address, uint32_t *pway) {
- Address addr;
- addr.p = address;
- uint32_t *d = getByTag(addr.fields.tag, pway);
- if (d != NULL) {
- return &d[addr.fields.offset];
- }
- return d;
- }
-
- int set(uint32_t *address) {
- uint32_t way = 0;
- uint32_t *p = get(address, &way);
- if (p != NULL) {
- printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state);
- state &= ~mask[way];
- printf("%X --> ", state);
- state |= next_value[way];
- printf("%X\n", state);
- // *p = *address; //skip since address is fake.
- return HIT;
- } else {
- setLRU(address);
- return MISS;
- }
- }
-};
-
-ostream& operator<<(ostream & out, const Block& block) {
- out << "state:" << block.state << " ";
- for (int i = 0; i<NWAY; i++) {
- out << block.cell[i];
- }
- return out;
-}
-
-struct Cache {
- Block block[NLINE];
- uint32_t count[2];
- Cache() { count[HIT] = 0; count[MISS] = 0; }
-
- void access(uint32_t* address) {
- Address addr;
- addr.p = address;
- Block& b = block[addr.fields.index];
- ++count[b.set(address)];
- }
-
-};
-ostream& operator<<(ostream & out, const Cache& cache) {
- out << "\n==Summary==\n\tHit: " << cache.count[HIT] << " Miss: " << cache.count[MISS] << std::endl;
- for (int i = 0; i < NLINE; i++) {
- out << cache.block[i] << endl;
- }
- return out;
-}
-
-Cache cache;
-void multiply(uint32_t* m1, uint32_t* m2, uint32_t* res)
-{
- int x, i, j;
- for (i = 0; i < MS; i++) {
- for (j = 0; j < MS; j++) {
- cache.access(res + i*MS +j);
- for (x = 0; x < MS; x++) {
- cache.access(m1 + i*MS + x);
- cache.access(m2 + x*MS + j);
- cache.access(res + i*MS +j);
- // res[i][j] += m1[i][x] * m2[x][j];
- cache.access(res + i*MS +j);
- }
- }
- }
-}
-
-int main()
-{
- uint32_t* m1 = (uint32_t*) 0xFACE00A000000000LL; // fake virtual address; don’t access it
- uint32_t* m2 = (uint32_t*) 0xFACE00B000000000LL; // fake virtual address; don’t access it
- uint32_t* res = (uint32_t*) 0xFACE00C000000000LL; // fake virtual address; don’t access it
- multiply(m1, m2, res);
- cout << cache << endl;
- return 0;
-}
+++ /dev/null
-from nmigen import Const
-
-INSTR_ADDR_MISALIGNED = Const(0, 64)
-INSTR_ACCESS_FAULT = Const(1, 64)
-ILLEGAL_INSTR = Const(2, 64)
-BREAKPOINT = Const(3, 64)
-LD_ADDR_MISALIGNED = Const(4, 64)
-LD_ACCESS_FAULT = Const(5, 64)
-ST_ADDR_MISALIGNED = Const(6, 64)
-ST_ACCESS_FAULT = Const(7, 64)
-ENV_CALL_UMODE = Const(8, 64) # environment call from user mode
-ENV_CALL_SMODE = Const(9, 64) # environment call from supervisor mode
-ENV_CALL_MMODE = Const(11, 64) # environment call from machine mode
-INSTR_PAGE_FAULT = Const(12, 64) # Instruction page fault
-LOAD_PAGE_FAULT = Const(13, 64) # Load page fault
-STORE_PAGE_FAULT = Const(15, 64) # Store page fault
+++ /dev/null
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: Florian Zaruba, ETH Zurich
-# Date: 12.11.2017
-# Description: Handles cache misses.
-from nmigen.lib.coding import Encoder, PriorityEncoder
-
-
-# --------------
-# MISS Handler
-# --------------
-import ariane_pkg::*;
-import std_cache_pkg::*;
-
-unsigned NR_PORTS = 3
-
-class MissReq(RecordObject):
- def __init__(self, name=None):
- Record.__init__(self, name)
- self.valid = Signal()
- self.addr = Signal(64)
- self.be = Signal(8)
- self.size = Signal(2)
- self.we = Signal()
- self.wdata = Signal(64)
- bypass = Signal()
-
-class CacheLine:
- def __init__(self):
- self.tag = Signal(DCACHE_TAG_WIDTH) # tag array
- self.data = Signal(DCACHE_LINE_WIDTH) # data array
- self.valid = Signal() # state array
- self.dirty = Signal() # state array
-
-# cache line byte enable
-class CLBE:
- def __init__(self):
- self.tag = Signal(DCACHE_TAG_WIDTH+7)//8) # byte enable into tag array
- self.data = Signal(DCACHE_LINE_WIDTH+7)//8) # byte enable data array
- # bit enable into state array (valid for a pair of dirty/valid bits)
- self.vldrty = Signal(DCACHE_SET_ASSOC)
- } cl_be_t;
-
-
-
- # FSM states
-"""
- enum logic [3:0] {
- IDLE, # 0
- FLUSHING, # 1
- FLUSH, # 2
- WB_CACHELINE_FLUSH, # 3
- FLUSH_REQ_STATUS, # 4
- WB_CACHELINE_MISS, # 5
- WAIT_GNT_SRAM, # 6
- MISS, # 7
- REQ_CACHELINE, # 8
- MISS_REPL, # 9
- SAVE_CACHELINE, # A
- INIT, # B
- AMO_LOAD, # C
- AMO_SAVE_LOAD, # D
- AMO_STORE # E
- } state_d, state_q;
-"""
-
-class MissHandler(Elaboratable):
- def __init__(self, NR_PORTS):
- self.NR_PORTS = NR_PORTS
- self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
- self.flush_i = Signal() # flush request
- self.flush_ack_o = Signal() # acknowledge successful flush
- self.miss_o = Signal()
- self.busy_i = Signal() # dcache is busy with something
-
- # Bypass or miss
- self.miss_req_i = Array(MissReq(name="missreq") for i in range(NR_PORTS))
- # Bypass handling
- self.bypass_gnt_o = Signal(NR_PORTS)
- self.bypass_valid_o = Signal(NR_PORTS)
- self.bypass_data_o = Array(Signal(name="bdata_o", 64) \
- for i in range(NR_PORTS))
-
- # AXI port
- output ariane_axi::req_t axi_bypass_o,
- input ariane_axi::resp_t axi_bypass_i,
-
- # Miss handling (~> cacheline refill)
- self.miss_gnt_o = Signal(NR_PORTS)
- self.active_serving_o = Signal(NR_PORTS)
-
- self.critical_word_o = Signal(64)
- self.critical_word_valid_o = Signal()
- output ariane_axi::req_t axi_data_o,
- input ariane_axi::resp_t axi_data_i,
-
- self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \
- for i in range(NR_PORTS))
- self.mshr_addr_matches_o = Signal(NR_PORTS)
- self.mshr_index_matches_o = Signal(NR_PORTS)
-
- # AMO
- self.amo_req_i = AMOReq()
- self.amo_resp_o = AMOResp()
- # Port to SRAMs, for refill and eviction
- self.req_o = Signal(DCACHE_SET_ASSOC)
- self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array
- self.data_o = CacheLine()
- self.be_o = CLBE()
- self.data_i = Array(CacheLine() \
- for i in range(DCACHE_SET_ASSOC))
- self.we_o = Signal()
-
- def elaborate(self, platform):
- # Registers
- mshr_t mshr_d, mshr_q;
- logic [DCACHE_INDEX_WIDTH-1:0] cnt_d, cnt_q;
- logic [DCACHE_SET_ASSOC-1:0] evict_way_d, evict_way_q;
- # cache line to evict
- cache_line_t evict_cl_d, evict_cl_q;
-
- logic serve_amo_d, serve_amo_q;
- # Request from one FSM
- miss_req_valid = Signal(self.NR_PORTS)
- miss_req_bypass = Signal(self.NR_PORTS)
- miss_req_addr = Array(Signal(name="miss_req_addr", 64) \
- for i in range(NR_PORTS))
- miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \
- for i in range(NR_PORTS))
- miss_req_we = Signal(self.NR_PORTS)
- miss_req_be = Array(Signal(name="miss_req_be", 8) \
- for i in range(NR_PORTS))
- miss_req_size = Array(Signal(name="miss_req_size", 2) \
- for i in range(NR_PORTS))
-
- # Cache Line Refill <-> AXI
- req_fsm_miss_valid = Signal()
- req_fsm_miss_addr = Signal(64)
- req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH)
- req_fsm_miss_we = Signal()
- req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8)
- ariane_axi::ad_req_t req_fsm_miss_req;
- req_fsm_miss_size = Signal(2)
-
- gnt_miss_fsm = Signal()
- valid_miss_fsm = Signal()
- nmiss = DCACHE_LINE_WIDTH//64
- data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \
- for i in range(nmiss))
-
- # Cache Management <-> LFSR
- lfsr_enable = Signal()
- lfsr_oh = Signal(DCACHE_SET_ASSOC)
- lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1))
- # AMOs
- ariane_pkg::amo_t amo_op;
- amo_operand_a = Signal(64)
- amo_operand_b = Signal(64)
- amo_result_o = Signal(64)
-
- struct packed {
- logic [63:3] address;
- logic valid;
- } reservation_d, reservation_q;
-
- # ------------------------------
- # Cache Management
- # ------------------------------
- evict_way = Signal(DCACHE_SET_ASSOC)
- valid_way = Signal(DCACHE_SET_ASSOC)
-
- for (i in range(DCACHE_SET_ASSOC):
- comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty)
- comb += valid_way[i].eq(data_i[i].valid)
-
- # ----------------------
- # Default Assignments
- # ----------------------
- # to AXI refill
- req_fsm_miss_req = ariane_axi::CACHE_LINE_REQ;
- req_fsm_miss_size = Const(0b11, 2)
- # core
- serve_amo_d = serve_amo_q;
- # --------------------------------
- # Flush and Miss operation
- # --------------------------------
- state_d = state_q;
- cnt_d = cnt_q;
- evict_way_d = evict_way_q;
- evict_cl_d = evict_cl_q;
- mshr_d = mshr_q;
- # communicate to the requester which unit we are currently serving
- active_serving_o[mshr_q.id] = mshr_q.valid;
- # AMOs
- # silence the unit when not used
- amo_op = amo_req_i.amo_op;
-
- reservation_d = reservation_q;
- with m.FSM() as state_q:
-
- with m.Case("IDLE"):
- # lowest priority are AMOs, wait until everything else
- # is served before going for the AMOs
- with m.If (amo_req_i.req & ~busy_i):
- # 1. Flush the cache
- with m.If(~serve_amo_q):
- m.next = "FLUSH_REQ_STATUS"
- serve_amo_d.eq(0b1
- cnt_d.eq(0
- # 2. Do the AMO
- with m.Else():
- m.next = "AMO_LOAD"
- serve_amo_d.eq(0b0
-
- # check if we want to flush and can flush
- # e.g.: we are not busy anymore
- # TODO: Check that the busy flag is indeed needed
- with m.If (flush_i & ~busy_i):
- m.next = "FLUSH_REQ_STATUS"
- cnt_d = 0
-
- # check if one of the state machines missed
- for i in range(NR_PORTS):
- # here comes the refill portion of code
- with m.If (miss_req_valid[i] & ~miss_req_bypass[i]):
- m.next = "MISS"
- # we are taking another request so don't
- # take the AMO
- serve_amo_d = 0b0;
- # save to MSHR
- wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
- comb += [ mshr_d.valid.eq(0b1),
- mshr_d.we.eq(miss_req_we[i]),
- mshr_d.id.eq(i),
- mshr_d.addr.eq(miss_req_addr[i][0:wid]),
- mshr_d.wdata.eq(miss_req_wdata[i]),
- mshr_d.be.eq(miss_req_be[i]),
- ]
- break
-
- # ~> we missed on the cache
- with m.Case("MISS"):
- # 1. Check if there is an empty cache-line
- # 2. If not -> evict one
- comb += req_o.eq(1)
- sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]
- m.next = "MISS_REPL"
- comb += miss_o.eq(1)
-
- # ~> second miss cycle
- with m.Case("MISS_REPL"):
- # if all are valid we need to evict one,
- # pseudo random from LFSR
- with m.If(~(~valid_way).bool()):
- comb += lfsr_enable.eq(0b1)
- comb += evict_way_d.eq(lfsr_oh)
- # do we need to write back the cache line?
- with m.If(data_i[lfsr_bin].dirty):
- state_d = WB_CACHELINE_MISS;
- comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag)
- comb += evict_cl_d.data.eq(data_i[lfsr_bin].data)
- comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
- # no - we can request a cache line now
- with m.Else():
- m.next = "REQ_CACHELINE"
- # we have at least one free way
- with m.Else():
- # get victim cache-line by looking for the
- # first non-valid bit
- comb += evict_way_d.eq(get_victim_cl(~valid_way)
- m.next = "REQ_CACHELINE"
-
- # ~> we can just load the cache-line,
- # the way is store in evict_way_q
- with m.Case("REQ_CACHELINE"):
- comb += req_fsm_miss_valid .eq(1)
- sync += req_fsm_miss_addr .eq(mshr_q.addr)
-
- with m.If (gnt_miss_fsm):
- m.next = "SAVE_CACHELINE"
- comb += miss_gnt_o[mshr_q.id].eq(1)
-
- # ~> replace the cacheline
- with m.Case("SAVE_CACHELINE"):
- # calculate cacheline offset
- automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset;
- sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6)
- # we've got a valid response from refill unit
- with m.If (valid_miss_fsm):
- wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
- sync += addr_o .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
- sync += req_o .eq(evict_way_q)
- comb += we_o .eq(1)
- comb += be_o .eq(1)
- sync += be_o.vldrty .eq(evict_way_q)
- sync += data_o.tag .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid]
- comb += data_o.data .eq(data_miss_fsm)
- comb += data_o.valid.eq(1)
- comb += data_o.dirty.eq(0)
-
- # is this a write?
- with m.If (mshr_q.we):
- # Yes, so safe the updated data now
- for i in range(8):
- # check if we really want to write
- # the corresponding byte
- with m.If (mshr_q.be[i]):
- sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i];
- # it's immediately dirty if we write
- comb += data_o.dirty.eq(1)
-
- # reset MSHR
- comb += mshr_d.valid.eq(0)
- # go back to idle
- m.next = 'IDLE'
-
- # ------------------------------
- # Write Back Operation
- # ------------------------------
- # ~> evict a cache line from way saved in evict_way_q
- with m.Case("WB_CACHELINE_FLUSH"):
- with m.Case("WB_CACHELINE_MISS"):
-
- comb += req_fsm_miss_valid .eq(0b1)
- sync += req_fsm_miss_addr .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}};
- comb += req_fsm_miss_be .eq(1)
- comb += req_fsm_miss_we .eq(0b1)
- sync += req_fsm_miss_wdata .eq(evict_cl_q.data;
-
- # we've got a grant --> this is timing critical, think about it
- if (gnt_miss_fsm) begin
- # write status array
- sync += addr_o .eq(cnt_q)
- comb += req_o .eq(0b1)
- comb += we_o .eq(0b1)
- comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 0b0 : 0b1)
- # invalidate
- sync += be_o.vldrty.eq(evict_way_q)
- # go back to handling the miss or flushing,
- # depending on where we came from
- with m.If(state_q == WB_CACHELINE_MISS):
- m.next = "MISS"
- with m.Else():
- m.next = "FLUSH_REQ_STATUS"
-
- # ------------------------------
- # Flushing & Initialization
- # ------------------------------
- # ~> make another request to check the same
- # cache-line if there are still some valid entries
- with m.Case("FLUSH_REQ_STATUS"):
- comb += req_o .eq(1)
- sync += addr_o .eq(cnt_q)
- m.next = "FLUSHING"
-
- with m.Case("FLUSHING"):
- # this has priority
- # at least one of the cache lines is dirty
- with m.If(~evict_way):
- # evict cache line, look for the first
- # cache-line which is dirty
- comb += evict_way_d.eq(get_victim_cl(evict_way))
- comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)])
- state_d = WB_CACHELINE_FLUSH;
- # not dirty ~> increment and continue
- with m.Else():
- # increment and re-request
- sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
- m.next = "FLUSH_REQ_STATUS"
- sync += addr_o .eq(cnt_q)
- comb += req_o .eq(1)
- comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 1 : 0)
- comb += we_o .eq(1)
- # finished with flushing operation, go back to idle
- with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
- == DCACHE_NUM_WORDS-1):
- # only acknowledge if the flush wasn't
- # triggered by an atomic
- sync += flush_ack_o.eq(~serve_amo_q)
- m.next = "IDLE"
-
- # ~> only called after reset
- with m.Case("INIT"):
- # initialize status array
- sync += addr_o.eq(cnt_q)
- comb += req_o .eq(1)
- comb += we_o .eq(1)
- # only write the dirty array
- comb += be_o.vldrty.eq(1)
- sync += cnt_d .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
- # finished initialization
- with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
- == DCACHE_NUM_WORDS-1)
- m.next = "IDLE"
-
- # ----------------------
- # AMOs
- # ----------------------
- # TODO(zarubaf) Move this closer to memory
- # ~> we are here because we need to do the AMO,
- # the cache is clean at this point
- # start by executing the load
- with m.Case("AMO_LOAD"):
- comb += req_fsm_miss_valid.eq(1)
- # address is in operand a
- comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
- comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ)
- comb += req_fsm_miss_size.eq(amo_req_i.size)
- # the request has been granted
- with m.If(gnt_miss_fsm):
- m.next = "AMO_SAVE_LOAD"
- # save the load value
- with m.Case("AMO_SAVE_LOAD"):
- with m.If (valid_miss_fsm):
- # we are only concerned about the lower 64-bit
- comb += mshr_d.wdata.eq(data_miss_fsm[0])
- m.next = "AMO_STORE"
- # and do the store
- with m.Case("AMO_STORE"):
- load_data = Signal(64)
- # re-align load data
- comb += load_data.eq(data_align(amo_req_i.operand_a[:3],
- mshr_q.wdata))
- # Sign-extend for word operation
- with m.If (amo_req_i.size == 0b10):
- comb += amo_operand_a.eq(sext32(load_data[:32]))
- comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32]))
- with m.Else():
- comb += amo_operand_a.eq(load_data)
- comb += amo_operand_b.eq(amo_req_i.operand_b)
-
- # we do not need a store request for load reserved
- # or a failing store conditional
- # we can bail-out without making any further requests
- with m.If ((amo_req_i.amo_op == AMO_LR) | \
- ((amo_req_i.amo_op == AMO_SC) & \
- ((reservation_q.valid & \
- (reservation_q.address != \
- amo_req_i.operand_a[3:64])) | \
- ~reservation_q.valid))):
- comb += req_fsm_miss_valid.eq(0)
- m.next = "IDLE"
- comb += amo_resp_o.ack.eq(1)
- # write-back the result
- comb += amo_resp_o.result.eq(amo_operand_a)
- # we know that the SC failed
- with m.If (amo_req_i.amo_op == AMO_SC):
- comb += amo_resp_o.result.eq(1)
- # also clear the reservation
- comb += reservation_d.valid.eq(0)
- with m.Else():
- comb += req_fsm_miss_valid.eq(1)
-
- comb += req_fsm_miss_we .eq(1)
- comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ)
- comb += req_fsm_miss_size.eq(amo_req_i.size)
- comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
-
- comb += req_fsm_miss_wdata.eq(
- data_align(amo_req_i.operand_a[0:3], amo_result_o))
- comb += req_fsm_miss_be.eq(
- be_gen(amo_req_i.operand_a[0:3], amo_req_i.size))
-
- # place a reservation on the memory
- with m.If (amo_req_i.amo_op == AMO_LR):
- comb += reservation_d.address.eq(amo_req_i.operand_a[3:64])
- comb += reservation_d.valid.eq(1)
-
- # the request is valid or we didn't need to go for another store
- with m.If (valid_miss_fsm):
- m.next = "IDLE"
- comb += amo_resp_o.ack.eq(1)
- # write-back the result
- comb += amo_resp_o.result.eq(amo_operand_a;
-
- if (amo_req_i.amo_op == AMO_SC) begin
- comb += amo_resp_o.result.eq(0)
- # An SC must fail if there is another SC
- # (to any address) between the LR and the SC in
- # program order (even to the same address).
- # in any case destroy the reservation
- comb += reservation_d.valid.eq(0)
-
- # check MSHR for aliasing
-
- comb += mshr_addr_matches_o .eq(0)
- comb += mshr_index_matches_o.eq()
-
- for i in range(NR_PORTS):
- # check mshr for potential matching of other units,
- # exclude the unit currently being served
- with m.If (mshr_q.valid & \
- (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \
- mshr_q.addr[DCACHE_BYTE_OFFSET:56])):
- comb += mshr_addr_matches_o[i].eq(1)
-
- # same as previous, but checking only the index
- with m.If (mshr_q.valid & \
- (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \
- mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])):
- mshr_index_matches_o[i].eq(1)
-
- # --------------------
- # Sequential Process
- # --------------------
-
- """
- #pragma translate_off
- `ifndef VERILATOR
- # assert that cache only hits on one way
- assert property (
- @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded");
- `endif
- #pragma translate_on
- """
-
- # ----------------------
- # Bypass Arbiter
- # ----------------------
- # Connection Arbiter <-> AXI
- req_fsm_bypass_valid = Signal()
- req_fsm_bypass_addr = Signal(64)
- req_fsm_bypass_wdata = Signal(64)
- req_fsm_bypass_we = Signal()
- req_fsm_bypass_be = Signal(8)
- req_fsm_bypass_size = Signal(2)
- gnt_bypass_fsm = Signal()
- valid_bypass_fsm = Signal()
- data_bypass_fsm = Signal(64)
- logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass;
- logic [3:0] id_bypass_fsm;
- logic [3:0] gnt_id_bypass_fsm;
-
- i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64)
- comb += [
- # Master Side
- ib.data_req_i .eq( miss_req_valid & miss_req_bypass ),
- ib.address_i .eq( miss_req_addr ),
- ib.data_wdata_i .eq( miss_req_wdata ),
- ib.data_we_i .eq( miss_req_we ),
- ib.data_be_i .eq( miss_req_be ),
- ib.data_size_i .eq( miss_req_size ),
- ib.data_gnt_o .eq( bypass_gnt_o ),
- ib.data_rvalid_o .eq( bypass_valid_o ),
- ib.data_rdata_o .eq( bypass_data_o ),
- # Slave Sid
- ib.id_i .eq( id_bypass_fsm[$clog2(NR_PORTS)-1:0] ),
- ib.id_o .eq( id_fsm_bypass ),
- ib.gnt_id_i .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0] ),
- ib.address_o .eq( req_fsm_bypass_addr ),
- ib.data_wdata_o .eq( req_fsm_bypass_wdata ),
- ib.data_req_o .eq( req_fsm_bypass_valid ),
- ib.data_we_o .eq( req_fsm_bypass_we ),
- ib.data_be_o .eq( req_fsm_bypass_be ),
- ib.data_size_o .eq( req_fsm_bypass_size ),
- ib.data_gnt_i .eq( gnt_bypass_fsm ),
- ib.data_rvalid_i .eq( valid_bypass_fsm ),
- ib.data_rdata_i .eq( data_bypass_fsm ),
- ]
-
- axi_adapter #(
- .DATA_WIDTH ( 64 ),
- .AXI_ID_WIDTH ( 4 ),
- .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
- ) i_bypass_axi_adapter (
- .clk_i,
- .rst_ni,
- .req_i ( req_fsm_bypass_valid ),
- .type_i ( ariane_axi::SINGLE_REQ ),
- .gnt_o ( gnt_bypass_fsm ),
- .addr_i ( req_fsm_bypass_addr ),
- .we_i ( req_fsm_bypass_we ),
- .wdata_i ( req_fsm_bypass_wdata ),
- .be_i ( req_fsm_bypass_be ),
- .size_i ( req_fsm_bypass_size ),
- .id_i ( Cat(id_fsm_bypass, 0, 0) ),
- .valid_o ( valid_bypass_fsm ),
- .rdata_o ( data_bypass_fsm ),
- .gnt_id_o ( gnt_id_bypass_fsm ),
- .id_o ( id_bypass_fsm ),
- .critical_word_o ( ), # not used for single requests
- .critical_word_valid_o ( ), # not used for single requests
- .axi_req_o ( axi_bypass_o ),
- .axi_resp_i ( axi_bypass_i )
- );
-
- # ----------------------
- # Cache Line AXI Refill
- # ----------------------
- axi_adapter #(
- .DATA_WIDTH ( DCACHE_LINE_WIDTH ),
- .AXI_ID_WIDTH ( 4 ),
- .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
- ) i_miss_axi_adapter (
- .clk_i,
- .rst_ni,
- .req_i ( req_fsm_miss_valid ),
- .type_i ( req_fsm_miss_req ),
- .gnt_o ( gnt_miss_fsm ),
- .addr_i ( req_fsm_miss_addr ),
- .we_i ( req_fsm_miss_we ),
- .wdata_i ( req_fsm_miss_wdata ),
- .be_i ( req_fsm_miss_be ),
- .size_i ( req_fsm_miss_size ),
- .id_i ( Const(0b1100, 4) ),
- .gnt_id_o ( ), # open
- .valid_o ( valid_miss_fsm ),
- .rdata_o ( data_miss_fsm ),
- .id_o ( ),
- .critical_word_o,
- .critical_word_valid_o,
- .axi_req_o ( axi_data_o ),
- .axi_resp_i ( axi_data_i )
- );
-
- # -----------------
- # Replacement LFSR
- # -----------------
- lfsr_8bit #(.WIDTH (DCACHE_SET_ASSOC)) i_lfsr (
- .en_i ( lfsr_enable ),
- .refill_way_oh ( lfsr_oh ),
- .refill_way_bin ( lfsr_bin ),
- .*
- );
-
- # -----------------
- # AMO ALU
- # -----------------
- amo_alu i_amo_alu (
- .amo_op_i ( amo_op ),
- .amo_operand_a_i ( amo_operand_a ),
- .amo_operand_b_i ( amo_operand_b ),
- .amo_result_o ( amo_result_o )
- );
-
- # -----------------
- # Struct Split
- # -----------------
-
- for i in range(NR_PORTS):
- miss_req = MissReq()
- comb += miss_req.eq(miss_req_i[i]);
- comb += miss_req_valid [i] .eq(miss_req.valid)
- comb += miss_req_bypass [i] .eq(miss_req.bypass)
- comb += miss_req_addr [i] .eq(miss_req.addr)
- comb += miss_req_wdata [i] .eq(miss_req.wdata)
- comb += miss_req_we [i] .eq(miss_req.we)
- comb += miss_req_be [i] .eq(miss_req.be)
- comb += miss_req_size [i] .eq(miss_req.size)
-
- # --------------
- # AXI Arbiter
- # --------------s
- #
- # Description: Arbitrates access to AXI refill/bypass
- #
-class AXIArbiter:
- def __init__(self, NR_PORTS = 3, DATA_WIDTH = 64):
- self.NR_PORTS = NR_PORTS
- self.DATA_WIDTH = DATA_WIDTH
- self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
- rst_ni = ResetSignal() # Asynchronous reset active low
- # master ports
- self.data_req_i = Signal(NR_PORTS)
- self.address_i = Array(Signal(name="address_i", 64) \
- for i in range(NR_PORTS))
- self.data_wdata_i = Array(Signal(name="data_wdata_i", 64) \
- for i in range(NR_PORTS))
- self.data_we_i = Signal(NR_PORTS)
- self.data_be_i = Array(Signal(name="data_wdata_i", DATA_WIDTH/8) \
- for i in range(NR_PORTS))
- self.data_size_i = Array(Signal(name="data_size_i", 2) \
- for i in range(NR_PORTS))
- self.data_gnt_o = Signal(NR_PORTS)
- self.data_rvalid_o = Signal(NR_PORTS)
- self.data_rdata_o = Array(Signal(name="data_rdata_o", 64) \
- for i in range(NR_PORTS))
-
- # slave port
- self.id_i = Signal(pwid)
- self.id_o = Signal(pwid)
- self.gnt_id_i = Signal(pwid)
- self.data_req_o = Signal()
- self.address_o = Signal(64)
- self.data_wdata_o = Signal(DATA_WIDTH)
- self.data_we_o = Signal()
- self.data_be_o = Signal(DATA_WIDTH/8)
- self.data_size_o = Signal(2)
- self.data_gnt_i = Signal()
- self.data_rvalid_i = Signal()
- self.data_rdata_i = Signal(DATA_WIDTH)
-
- def elaborate(self, platform):
- #enum logic [1:0] { IDLE, REQ, SERVING } state_d, state_q;
-
- class Packet:
- def __init__(self, pwid, DATA_WIDTH):
- self.id = Signal(pwid)
- self.address = Signal(64)
- self.data = Signal(64)
- self.size = Signal(2)
- self.be = Signal(DATA_WIDTH/8)
- self.we = Signal()
-
- request_index = Signal(self.pwid)
- req_q = Packet(self.pwid, self.DATA_WIDTH)
- req_d = Packet(self.pwid, self.DATA_WIDTH)
-
- # request register
- sync += req_q.eq(req_d)
-
- # request port
- comb += self.address_o .eq(req_q.address)
- comb += self.data_wdata_o .eq(req_q.data)
- comb += self.data_be_o .eq(req_q.be)
- comb += self.data_size_o .eq(req_q.size)
- comb += self.data_we_o .eq(req_q.we)
- comb += self.id_o .eq(req_q.id)
- comb += self.data_gnt_o .eq(0)
- # read port
- comb += self.data_rvalid_o .eq(0)
- comb += self.data_rdata_o .eq(0)
- comb += self.data_rdata_o[req_q.id].eq(data_rdata_i)
-
- m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS)
- comb += pp.i.eq(self.data_req_i) # select one request (priority-based)
- comb += request_index.eq(pp.o)
-
- with m.Switch("state") as s:
-
- with m.Case("IDLE"):
- # wait for incoming requests (priority encoder data_req_i)
- with m.If(~pp.n): # one output valid from encoder
- comb += self.data_req_o .eq(self.data_req_i[i])
- comb += self.data_gnt_o[i].eq(self.data_req_i[i])
- # save the request
- comb += req_d.address.eq(self.address_i[i])
- comb += req_d.id.eq(request_index)
- comb += req_d.data.eq(self.data_wdata_i[i])
- comb += req_d.size.eq(self.data_size_i[i])
- comb += req_d.be.eq(self.data_be_i[i])
- comb += req_d.we.eq(self.data_we_i[i])
- m.next = "SERVING"
-
- comb += self.address_o .eq(self.address_i[request_index])
- comb += self.data_wdata_o .eq(self.data_wdata_i[request_index])
- comb += self.data_be_o .eq(self.data_be_i[request_index])
- comb += self.data_size_o .eq(self.data_size_i[request_index])
- comb += self.data_we_o .eq(self.data_we_i[request_index])
- comb += self.id_o .eq(request_index)
-
- with m.Case("SERVING"):
- comb += self.data_req_o.eq(1)
- with m.If (self.data_rvalid_i):
- comb += self.data_rvalid_o[req_q.id].eq(1)
- m.next = "IDLE"
-
- # ------------
- # Assertions
- # ------------
-
- """
-#pragma translate_off
-`ifndef VERILATOR
-# make sure that we eventually get an rvalid after we received a grant
-assert property (@(posedge clk_i) data_gnt_i |-> ##[1:$] data_rvalid_i )
- else begin $error("There was a grant without a rvalid"); $stop(); end
-# assert that there is no grant without a request
-assert property (@(negedge clk_i) data_gnt_i |-> data_req_o)
- else begin $error("There was a grant without a request."); $stop(); end
-# assert that the address does not contain X when request is sent
-assert property ( @(posedge clk_i) (data_req_o) |-> (!$isunknown(address_o)) )
- else begin $error("address contains X when request is set"); $stop(); end
-
-`endif
-#pragma translate_on
- """
-
+++ /dev/null
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: Florian Zaruba, ETH Zurich
-# Date: 19/04/2017
-# Description: Memory Management Unit for Ariane, contains TLB and
-# address translation unit. SV48 as defined in
-# Volume II: RISC-V Privileged Architectures V1.10 Page 63
-
-import ariane_pkg::*;
-"""
-
-from nmigen import Const, Signal, Cat, Module, Mux
-from nmigen.cli import verilog, rtlil
-
-from ptw import DCacheReqI, DCacheReqO, TLBUpdate, PTE, PTW
-from tlb import TLB
-from exceptcause import (INSTR_ACCESS_FAULT, INSTR_PAGE_FAULT,
- LOAD_PAGE_FAULT, STORE_PAGE_FAULT)
-
-PRIV_LVL_M = Const(0b11, 2)
-PRIV_LVL_S = Const(0b01, 2)
-PRIV_LVL_U = Const(0b00, 2)
-
-
-class RVException:
- def __init__(self):
- self.cause = Signal(64) # cause of exception
- self.tval = Signal(64) # more info of causing exception
- # (e.g.: instruction causing it),
- # address of LD/ST fault
- self.valid = Signal()
-
- def eq(self, inp):
- res = []
- for (o, i) in zip(self.ports(), inp.ports()):
- res.append(o.eq(i))
- return res
-
- def __iter__(self):
- yield self.cause
- yield self.tval
- yield self.valid
-
- def ports(self):
- return list(self)
-
-
-class ICacheReqI:
- def __init__(self):
- self.fetch_valid = Signal() # address translation valid
- self.fetch_paddr = Signal(64) # physical address in
- self.fetch_exception = RVException() # exception occurred during fetch
-
- def __iter__(self):
- yield self.fetch_valid
- yield self.fetch_paddr
- yield from self.fetch_exception
-
- def ports(self):
- return list(self)
-
-
-class ICacheReqO:
- def __init__(self):
- self.fetch_req = Signal() # address translation request
- self.fetch_vaddr = Signal(64) # virtual address out
-
- def __iter__(self):
- yield self.fetch_req
- yield self.fetch_vaddr
-
- def ports(self):
- return list(self)
-
-
-class MMU:
- def __init__(self, instr_tlb_entries = 4,
- data_tlb_entries = 4,
- asid_width = 1):
- self.instr_tlb_entries = instr_tlb_entries
- self.data_tlb_entries = data_tlb_entries
- self.asid_width = asid_width
-
- self.flush_i = Signal()
- self.enable_translation_i = Signal()
- self.en_ld_st_translation_i = Signal() # enable VM translation for LD/ST
- # IF interface
- self.icache_areq_i = ICacheReqO()
- self.icache_areq_o = ICacheReqI()
- # LSU interface
- # this is a more minimalistic interface because the actual addressing
- # logic is handled in the LSU as we distinguish load and stores,
- # what we do here is simple address translation
- self.misaligned_ex_i = RVException()
- self.lsu_req_i = Signal() # request address translation
- self.lsu_vaddr_i = Signal(64) # virtual address in
- self.lsu_is_store_i = Signal() # the translation is requested by a store
- # if we need to walk the page table we can't grant in the same cycle
-
- # Cycle 0
- self.lsu_dtlb_hit_o = Signal() # sent in the same cycle as the request
- # if translation hits in the DTLB
- # Cycle 1
- self.lsu_valid_o = Signal() # translation is valid
- self.lsu_paddr_o = Signal(64) # translated address
- self.lsu_exception_o = RVException() # addr translate threw exception
-
- # General control signals
- self.priv_lvl_i = Signal(2)
- self.ld_st_priv_lvl_i = Signal(2)
- self.sum_i = Signal()
- self.mxr_i = Signal()
- # input logic flag_mprv_i,
- self.satp_ppn_i = Signal(44)
- self.asid_i = Signal(self.asid_width)
- self.flush_tlb_i = Signal()
- # Performance counters
- self.itlb_miss_o = Signal()
- self.dtlb_miss_o = Signal()
- # PTW memory interface
- self.req_port_i = DCacheReqO()
- self.req_port_o = DCacheReqI()
-
- def elaborate(self, platform):
- m = Module()
-
- iaccess_err = Signal() # insufficient priv to access instr page
- daccess_err = Signal() # insufficient priv to access data page
- ptw_active = Signal() # PTW is currently walking a page table
- walking_instr = Signal() # PTW is walking because of an ITLB miss
- ptw_error = Signal() # PTW threw an exception
-
- update_vaddr = Signal(48) # guessed
- uaddr64 = Cat(update_vaddr, Const(0, 25)) # extend to 64bit with zeros
- update_ptw_itlb = TLBUpdate(self.asid_width)
- update_ptw_dtlb = TLBUpdate(self.asid_width)
-
- itlb_lu_access = Signal()
- itlb_content = PTE()
- itlb_is_2M = Signal()
- itlb_is_1G = Signal()
- itlb_is_512G = Signal()
- itlb_lu_hit = Signal()
-
- dtlb_lu_access = Signal()
- dtlb_content = PTE()
- dtlb_is_2M = Signal()
- dtlb_is_1G = Signal()
- dtlb_is_512G = Signal()
- dtlb_lu_hit = Signal()
-
- # Assignments
- m.d.comb += [itlb_lu_access.eq(self.icache_areq_i.fetch_req),
- dtlb_lu_access.eq(self.lsu_req_i)
- ]
-
- # ITLB
- m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries,
- self.asid_width)
- m.d.comb += [i_tlb.flush_i.eq(self.flush_tlb_i),
- i_tlb.update_i.eq(update_ptw_itlb),
- i_tlb.lu_access_i.eq(itlb_lu_access),
- i_tlb.lu_asid_i.eq(self.asid_i),
- i_tlb.lu_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
- itlb_content.eq(i_tlb.lu_content_o),
- itlb_is_2M.eq(i_tlb.lu_is_2M_o),
- itlb_is_1G.eq(i_tlb.lu_is_1G_o),
- itlb_is_512G.eq(i_tlb.lu_is_512G_o),
- itlb_lu_hit.eq(i_tlb.lu_hit_o),
- ]
-
- # DTLB
- m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries,
- self.asid_width)
- m.d.comb += [d_tlb.flush_i.eq(self.flush_tlb_i),
- d_tlb.update_i.eq(update_ptw_dtlb),
- d_tlb.lu_access_i.eq(dtlb_lu_access),
- d_tlb.lu_asid_i.eq(self.asid_i),
- d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i),
- dtlb_content.eq(d_tlb.lu_content_o),
- dtlb_is_2M.eq(d_tlb.lu_is_2M_o),
- dtlb_is_1G.eq(d_tlb.lu_is_1G_o),
- dtlb_is_512G.eq(d_tlb.lu_is_512G_o),
- dtlb_lu_hit.eq(d_tlb.lu_hit_o),
- ]
-
- # PTW
- m.submodules.ptw = ptw = PTW(self.asid_width)
- m.d.comb += [ptw_active.eq(ptw.ptw_active_o),
- walking_instr.eq(ptw.walking_instr_o),
- ptw_error.eq(ptw.ptw_error_o),
- ptw.enable_translation_i.eq(self.enable_translation_i),
-
- update_vaddr.eq(ptw.update_vaddr_o),
- update_ptw_itlb.eq(ptw.itlb_update_o),
- update_ptw_dtlb.eq(ptw.dtlb_update_o),
-
- ptw.itlb_access_i.eq(itlb_lu_access),
- ptw.itlb_hit_i.eq(itlb_lu_hit),
- ptw.itlb_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
-
- ptw.dtlb_access_i.eq(dtlb_lu_access),
- ptw.dtlb_hit_i.eq(dtlb_lu_hit),
- ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i),
-
- ptw.req_port_i.eq(self.req_port_i),
- self.req_port_o.eq(ptw.req_port_o),
- ]
-
- # ila_1 i_ila_1 (
- # .clk(clk_i), # input wire clk
- # .probe0({req_port_o.address_tag, req_port_o.address_index}),
- # .probe1(req_port_o.data_req), # input wire [63:0] probe1
- # .probe2(req_port_i.data_gnt), # input wire [0:0] probe2
- # .probe3(req_port_i.data_rdata), # input wire [0:0] probe3
- # .probe4(req_port_i.data_rvalid), # input wire [0:0] probe4
- # .probe5(ptw_error), # input wire [1:0] probe5
- # .probe6(update_vaddr), # input wire [0:0] probe6
- # .probe7(update_ptw_itlb.valid), # input wire [0:0] probe7
- # .probe8(update_ptw_dtlb.valid), # input wire [0:0] probe8
- # .probe9(dtlb_lu_access), # input wire [0:0] probe9
- # .probe10(lsu_vaddr_i), # input wire [0:0] probe10
- # .probe11(dtlb_lu_hit), # input wire [0:0] probe11
- # .probe12(itlb_lu_access), # input wire [0:0] probe12
- # .probe13(icache_areq_i.fetch_vaddr), # input wire [0:0] probe13
- # .probe14(itlb_lu_hit) # input wire [0:0] probe13
- # );
-
- #-----------------------
- # Instruction Interface
- #-----------------------
- # The instruction interface is a simple request response interface
-
- # MMU disabled: just pass through
- m.d.comb += [self.icache_areq_o.fetch_valid.eq(
- self.icache_areq_i.fetch_req),
- # play through in case we disabled address translation
- self.icache_areq_o.fetch_paddr.eq(
- self.icache_areq_i.fetch_vaddr)
- ]
- # two potential exception sources:
- # 1. HPTW threw an exception -> signal with a page fault exception
- # 2. We got an access error because of insufficient permissions ->
- # throw an access exception
- m.d.comb += self.icache_areq_o.fetch_exception.valid.eq(0)
- # Check whether we are allowed to access this memory region
- # from a fetch perspective
-
- # PLATEN TODO: use PermissionValidator instead [we like modules]
- m.d.comb += iaccess_err.eq(self.icache_areq_i.fetch_req & \
- (((self.priv_lvl_i == PRIV_LVL_U) & \
- ~itlb_content.u) | \
- ((self.priv_lvl_i == PRIV_LVL_S) & \
- itlb_content.u)))
-
- # MMU enabled: address from TLB, request delayed until hit.
- # Error when TLB hit and no access right or TLB hit and
- # translated address not valid (e.g. AXI decode error),
- # or when PTW performs walk due to ITLB miss and raises
- # an error.
- with m.If (self.enable_translation_i):
- # we work with SV48, so if VM is enabled, check that
- # all bits [47:38] are equal
- with m.If (self.icache_areq_i.fetch_req & \
- ~(((~self.icache_areq_i.fetch_vaddr[47:64]) == 0) | \
- (self.icache_areq_i.fetch_vaddr[47:64]) == 0)):
- fe = self.icache_areq_o.fetch_exception
- m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
- fe.tval.eq(self.icache_areq_i.fetch_vaddr),
- fe.valid.eq(1)
- ]
-
- m.d.comb += self.icache_areq_o.fetch_valid.eq(0)
-
- # 4K page
- paddr = Signal.like(self.icache_areq_o.fetch_paddr)
- paddr4k = Cat(self.icache_areq_i.fetch_vaddr[0:12],
- itlb_content.ppn)
- m.d.comb += paddr.eq(paddr4k)
- # Mega page
- with m.If(itlb_is_2M):
- m.d.comb += paddr[12:21].eq(
- self.icache_areq_i.fetch_vaddr[12:21])
- # Giga page
- with m.If(itlb_is_1G):
- m.d.comb += paddr[12:30].eq(
- self.icache_areq_i.fetch_vaddr[12:30])
- m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
- # Tera page
- with m.If(itlb_is_512G):
- m.d.comb += paddr[12:39].eq(
- self.icache_areq_i.fetch_vaddr[12:39])
- m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
-
- # ---------
- # ITLB Hit
- # --------
- # if we hit the ITLB output the request signal immediately
- with m.If(itlb_lu_hit):
- m.d.comb += self.icache_areq_o.fetch_valid.eq(
- self.icache_areq_i.fetch_req)
- # we got an access error
- with m.If (iaccess_err):
- # throw a page fault
- fe = self.icache_areq_o.fetch_exception
- m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
- fe.tval.eq(self.icache_areq_i.fetch_vaddr),
- fe.valid.eq(1)
- ]
- # ---------
- # ITLB Miss
- # ---------
- # watch out for exceptions happening during walking the page table
- with m.Elif(ptw_active & walking_instr):
- m.d.comb += self.icache_areq_o.fetch_valid.eq(ptw_error)
- fe = self.icache_areq_o.fetch_exception
- m.d.comb += [fe.cause.eq(INSTR_PAGE_FAULT),
- fe.tval.eq(uaddr64),
- fe.valid.eq(1)
- ]
-
- #-----------------------
- # Data Interface
- #-----------------------
-
- lsu_vaddr = Signal(64)
- dtlb_pte = PTE()
- misaligned_ex = RVException()
- lsu_req = Signal()
- lsu_is_store = Signal()
- dtlb_hit = Signal()
- #dtlb_is_2M = Signal()
- #dtlb_is_1G = Signal()
- #dtlb_is_512 = Signal()
-
- # check if we need to do translation or if we are always
- # ready (e.g.: we are not translating anything)
- m.d.comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i,
- dtlb_lu_hit, 1))
-
- # The data interface is simpler and only consists of a
- # request/response interface
- m.d.comb += [
- # save request and DTLB response
- lsu_vaddr.eq(self.lsu_vaddr_i),
- lsu_req.eq(self.lsu_req_i),
- misaligned_ex.eq(self.misaligned_ex_i),
- dtlb_pte.eq(dtlb_content),
- dtlb_hit.eq(dtlb_lu_hit),
- lsu_is_store.eq(self.lsu_is_store_i),
- #dtlb_is_2M.eq(dtlb_is_2M),
- #dtlb_is_1G.eq(dtlb_is_1G),
- ##dtlb_is_512.eq(self.dtlb_is_512G) #????
- ]
- m.d.sync += [
- self.lsu_paddr_o.eq(lsu_vaddr),
- self.lsu_valid_o.eq(lsu_req),
- self.lsu_exception_o.eq(misaligned_ex),
- ]
-
- sverr = Signal()
- usrerr = Signal()
-
- m.d.comb += [
- # mute misaligned exceptions if there is no request
- # otherwise they will throw accidental exceptions
- misaligned_ex.valid.eq(self.misaligned_ex_i.valid & self.lsu_req_i),
-
- # SUM is not set and we are trying to access a user
- # page in supervisor mode
- sverr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_S & ~self.sum_i & \
- dtlb_pte.u),
- # this is not a user page but we are in user mode and
- # trying to access it
- usrerr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_U & ~dtlb_pte.u),
-
- # Check if the User flag is set, then we may only
- # access it in supervisor mode if SUM is enabled
- daccess_err.eq(sverr | usrerr),
- ]
-
- # translation is enabled and no misaligned exception occurred
- with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid):
- m.d.comb += lsu_req.eq(0)
- # 4K page
- paddr = Signal.like(lsu_vaddr)
- paddr4k = Cat(lsu_vaddr[0:12], itlb_content.ppn)
- m.d.comb += paddr.eq(paddr4k)
- # Mega page
- with m.If(dtlb_is_2M):
- m.d.comb += paddr[12:21].eq(lsu_vaddr[12:21])
- # Giga page
- with m.If(dtlb_is_1G):
- m.d.comb += paddr[12:30].eq(lsu_vaddr[12:30])
- m.d.sync += self.lsu_paddr_o.eq(paddr)
- # TODO platen tera_page
-
- # ---------
- # DTLB Hit
- # --------
- with m.If(dtlb_hit & lsu_req):
- m.d.comb += lsu_req.eq(1)
- # this is a store
- with m.If (lsu_is_store):
- # check if the page is write-able and
- # we are not violating privileges
- # also check if the dirty flag is set
- with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d):
- le = self.lsu_exception_o
- m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
- le.tval.eq(lsu_vaddr),
- le.valid.eq(1)
- ]
-
- # this is a load, check for sufficient access
- # privileges - throw a page fault if necessary
- with m.Elif(daccess_err):
- le = self.lsu_exception_o
- m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
- le.tval.eq(lsu_vaddr),
- le.valid.eq(1)
- ]
- # ---------
- # DTLB Miss
- # ---------
- # watch out for exceptions
- with m.Elif (ptw_active & ~walking_instr):
- # page table walker threw an exception
- with m.If (ptw_error):
- # an error makes the translation valid
- m.d.comb += lsu_req.eq(1)
- # the page table walker can only throw page faults
- with m.If (lsu_is_store):
- le = self.lsu_exception_o
- m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
- le.tval.eq(uaddr64),
- le.valid.eq(1)
- ]
- with m.Else():
- m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
- le.tval.eq(uaddr64),
- le.valid.eq(1)
- ]
-
- return m
-
- def ports(self):
- return [self.flush_i, self.enable_translation_i,
- self.en_ld_st_translation_i,
- self.lsu_req_i,
- self.lsu_vaddr_i, self.lsu_is_store_i, self.lsu_dtlb_hit_o,
- self.lsu_valid_o, self.lsu_paddr_o,
- self.priv_lvl_i, self.ld_st_priv_lvl_i, self.sum_i, self.mxr_i,
- self.satp_ppn_i, self.asid_i, self.flush_tlb_i,
- self.itlb_miss_o, self.dtlb_miss_o] + \
- self.icache_areq_i.ports() + self.icache_areq_o.ports() + \
- self.req_port_i.ports() + self.req_port_o.ports() + \
- self.misaligned_ex_i.ports() + self.lsu_exception_o.ports()
-
-if __name__ == '__main__':
- mmu = MMU()
- vl = rtlil.convert(mmu, ports=mmu.ports())
- with open("test_mmu.il", "w") as f:
- f.write(vl)
-
+++ /dev/null
-pseudo-LRU
-
-two-way set associative - one bit
-
- indicates which line of the two has been reference more recently
-
-
-four-way set associative - three bits
-
- each bit represents one branch point in a binary decision tree; let 1
- represent that the left side has been referenced more recently than the
- right side, and 0 vice-versa
-
- are all 4 lines valid?
- / \
- yes no, use an invalid line
- |
- |
- |
- bit_0 == 0? state | replace ref to | next state
- / \ ------+-------- -------+-----------
- y n 00x | line_0 line_0 | 11_
- / \ 01x | line_1 line_1 | 10_
- bit_1 == 0? bit_2 == 0? 1x0 | line_2 line_2 | 0_1
- / \ / \ 1x1 | line_3 line_3 | 0_0
- y n y n
- / \ / \ ('x' means ('_' means unchanged)
- line_0 line_1 line_2 line_3 don't care)
-
- (see Figure 3-7, p. 3-18, in Intel Embedded Pentium Processor Family Dev.
- Manual, 1998, http://www.intel.com/design/intarch/manuals/273204.htm)
-
-
-note that there is a 6-bit encoding for true LRU for four-way set associative
-
- bit 0: bank[1] more recently used than bank[0]
- bit 1: bank[2] more recently used than bank[0]
- bit 2: bank[2] more recently used than bank[1]
- bit 3: bank[3] more recently used than bank[0]
- bit 4: bank[3] more recently used than bank[1]
- bit 5: bank[3] more recently used than bank[2]
-
- this results in 24 valid bit patterns within the 64 possible bit patterns
- (4! possible valid traces for bank references)
-
- e.g., a trace of 0 1 2 3, where 0 is LRU and 3 is MRU, is encoded as 111111
-
- you can implement a state machine with a 256x6 ROM (6-bit state encoding
- appended with a 2-bit bank reference input will yield a new 6-bit state),
- and you can implement an LRU bank indicator with a 64x2 ROM
-
+++ /dev/null
-# moved to nmutil https://git.libre-soc.org/?p=nmutil.git;a=tree
-from nmutil.plru import PLRU
+++ /dev/null
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: David Schaffenrath, TU Graz
-# Author: Florian Zaruba, ETH Zurich
-# Date: 24.4.2017
-# Description: Hardware-PTW
-
-/* verilator lint_off WIDTH */
-import ariane_pkg::*;
-
-see linux kernel source:
-
-* "arch/riscv/include/asm/page.h"
-* "arch/riscv/include/asm/mmu_context.h"
-* "arch/riscv/Kconfig" (CONFIG_PAGE_OFFSET)
-
-"""
-
-from nmigen import Const, Signal, Cat, Module, Elaboratable
-from nmigen.hdl.ast import ArrayProxy
-from nmigen.cli import verilog, rtlil
-from math import log2
-
-
-DCACHE_SET_ASSOC = 8
-CONFIG_L1D_SIZE = 32*1024
-DCACHE_INDEX_WIDTH = int(log2(CONFIG_L1D_SIZE / DCACHE_SET_ASSOC))
-DCACHE_TAG_WIDTH = 56 - DCACHE_INDEX_WIDTH
-
-ASID_WIDTH = 8
-
-
-class DCacheReqI:
- def __init__(self):
- self.address_index = Signal(DCACHE_INDEX_WIDTH)
- self.address_tag = Signal(DCACHE_TAG_WIDTH)
- self.data_wdata = Signal(64)
- self.data_req = Signal()
- self.data_we = Signal()
- self.data_be = Signal(8)
- self.data_size = Signal(2)
- self.kill_req = Signal()
- self.tag_valid = Signal()
-
- def eq(self, inp):
- res = []
- for (o, i) in zip(self.ports(), inp.ports()):
- res.append(o.eq(i))
- return res
-
- def ports(self):
- return [self.address_index, self.address_tag,
- self.data_wdata, self.data_req,
- self.data_we, self.data_be, self.data_size,
- self.kill_req, self.tag_valid,
- ]
-
-class DCacheReqO:
- def __init__(self):
- self.data_gnt = Signal()
- self.data_rvalid = Signal()
- self.data_rdata = Signal(64) # actually in PTE object format
-
- def eq(self, inp):
- res = []
- for (o, i) in zip(self.ports(), inp.ports()):
- res.append(o.eq(i))
- return res
-
- def ports(self):
- return [self.data_gnt, self.data_rvalid, self.data_rdata]
-
-
-class PTE: #(RecordObject):
- def __init__(self):
- self.v = Signal()
- self.r = Signal()
- self.w = Signal()
- self.x = Signal()
- self.u = Signal()
- self.g = Signal()
- self.a = Signal()
- self.d = Signal()
- self.rsw = Signal(2)
- self.ppn = Signal(44)
- self.reserved = Signal(10)
-
- def flatten(self):
- return Cat(*self.ports())
-
- def eq(self, x):
- if isinstance(x, ArrayProxy):
- res = []
- for o in self.ports():
- i = getattr(x, o.name)
- res.append(i)
- x = Cat(*res)
- else:
- x = x.flatten()
- return self.flatten().eq(x)
-
- def __iter__(self):
- """ order is critical so that flatten creates LSB to MSB
- """
- yield self.v
- yield self.r
- yield self.w
- yield self.x
- yield self.u
- yield self.g
- yield self.a
- yield self.d
- yield self.rsw
- yield self.ppn
- yield self.reserved
-
- def ports(self):
- return list(self)
-
-
-class TLBUpdate:
- def __init__(self, asid_width):
- self.valid = Signal() # valid flag
- self.is_2M = Signal()
- self.is_1G = Signal()
- self.is_512G = Signal()
- self.vpn = Signal(36)
- self.asid = Signal(asid_width)
- self.content = PTE()
-
- def flatten(self):
- return Cat(*self.ports())
-
- def eq(self, x):
- return self.flatten().eq(x.flatten())
-
- def ports(self):
- return [self.valid, self.is_2M, self.is_1G, self.vpn, self.asid] + \
- self.content.ports()
-
-
-# SV48 defines four levels of page tables
-LVL1 = Const(0, 2) # defined to 0 so that ptw_lvl default-resets to LVL1
-LVL2 = Const(1, 2)
-LVL3 = Const(2, 2)
-LVL4 = Const(3, 2)
-
-
-class PTW(Elaboratable):
- def __init__(self, asid_width=8):
- self.asid_width = asid_width
-
- self.flush_i = Signal() # flush everything, we need to do this because
- # actually everything we do is speculative at this stage
- # e.g.: there could be a CSR instruction that changes everything
- self.ptw_active_o = Signal(reset=1) # active if not IDLE
- self.walking_instr_o = Signal() # set when walking for TLB
- self.ptw_error_o = Signal() # set when an error occurred
- self.enable_translation_i = Signal() # CSRs indicate to enable SV48
- self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st
-
- self.lsu_is_store_i = Signal() # translation triggered by store
- # PTW memory interface
- self.req_port_i = DCacheReqO()
- self.req_port_o = DCacheReqI()
-
- # to TLBs, update logic
- self.itlb_update_o = TLBUpdate(asid_width)
- self.dtlb_update_o = TLBUpdate(asid_width)
-
- self.update_vaddr_o = Signal(48)
-
- self.asid_i = Signal(self.asid_width)
- # from TLBs
- # did we miss?
- self.itlb_access_i = Signal()
- self.itlb_hit_i = Signal()
- self.itlb_vaddr_i = Signal(64)
-
- self.dtlb_access_i = Signal()
- self.dtlb_hit_i = Signal()
- self.dtlb_vaddr_i = Signal(64)
- # from CSR file
- self.satp_ppn_i = Signal(44) # ppn from satp
- self.mxr_i = Signal()
- # Performance counters
- self.itlb_miss_o = Signal()
- self.dtlb_miss_o = Signal()
-
- def ports(self):
- return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o,
- ]
- return [
- self.enable_translation_i, self.en_ld_st_translation_i,
- self.lsu_is_store_i, self.req_port_i, self.req_port_o,
- self.update_vaddr_o,
- self.asid_i,
- self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i,
- self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i,
- self.satp_ppn_i, self.mxr_i,
- self.itlb_miss_o, self.dtlb_miss_o
- ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports()
-
- def elaborate(self, platform):
- m = Module()
-
- # input registers
- data_rvalid = Signal()
- data_rdata = Signal(64)
-
- # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata
- # is spec'd in 64-bit binary-format: better to spec as Record?
- pte = PTE()
- m.d.comb += pte.flatten().eq(data_rdata)
-
- # SV48 defines four levels of page tables
- ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above)
- ptw_lvl1 = Signal()
- ptw_lvl2 = Signal()
- ptw_lvl3 = Signal()
- ptw_lvl4 = Signal()
- m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1),
- ptw_lvl2.eq(ptw_lvl == LVL2),
- ptw_lvl3.eq(ptw_lvl == LVL3),
- ptw_lvl4.eq(ptw_lvl == LVL4)
- ]
-
- # is this an instruction page table walk?
- is_instr_ptw = Signal()
- global_mapping = Signal()
- # latched tag signal
- tag_valid = Signal()
- # register the ASID
- tlb_update_asid = Signal(self.asid_width)
- # register VPN we need to walk, SV48 defines a 48 bit virtual addr
- vaddr = Signal(64)
- # 4 byte aligned physical pointer
- ptw_pptr = Signal(56)
-
- end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH
- m.d.sync += [
- # Assignments
- self.update_vaddr_o.eq(vaddr),
-
- self.walking_instr_o.eq(is_instr_ptw),
- # directly output the correct physical address
- self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]),
- self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]),
- # we are never going to kill this request
- self.req_port_o.kill_req.eq(0), # XXX assign comb?
- # we are never going to write with the HPTW
- self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb?
- # -----------
- # TLB Update
- # -----------
- self.itlb_update_o.vpn.eq(vaddr[12:48]),
- self.dtlb_update_o.vpn.eq(vaddr[12:48]),
- # update the correct page table level
- self.itlb_update_o.is_2M.eq(ptw_lvl3),
- self.itlb_update_o.is_1G.eq(ptw_lvl2),
- self.itlb_update_o.is_512G.eq(ptw_lvl1),
- self.dtlb_update_o.is_2M.eq(ptw_lvl3),
- self.dtlb_update_o.is_1G.eq(ptw_lvl2),
- self.dtlb_update_o.is_512G.eq(ptw_lvl1),
-
- # output the correct ASID
- self.itlb_update_o.asid.eq(tlb_update_asid),
- self.dtlb_update_o.asid.eq(tlb_update_asid),
- # set the global mapping bit
- self.itlb_update_o.content.eq(pte),
- self.itlb_update_o.content.g.eq(global_mapping),
- self.dtlb_update_o.content.eq(pte),
- self.dtlb_update_o.content.g.eq(global_mapping),
-
- self.req_port_o.tag_valid.eq(tag_valid),
- ]
-
- #-------------------
- # Page table walker #needs update
- #-------------------
- # A virtual address va is translated into a physical address pa as
- # follows:
- # 1. Let a be sptbr.ppn × PAGESIZE, and let i = LEVELS-1. (For Sv48,
- # PAGESIZE=2^12 and LEVELS=4.)
- # 2. Let pte be the value of the PTE at address a+va.vpn[i]×PTESIZE.
- # (For Sv32, PTESIZE=4.)
- # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an
- # access exception.
- # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to
- # step 5. Otherwise, this PTE is a pointer to the next level of
- # the page table.
- # Let i=i-1. If i < 0, stop and raise an access exception.
- # Otherwise, let a = pte.ppn × PAGESIZE and go to step 2.
- # 5. A leaf PTE has been found. Determine if the requested memory
- # access is allowed by the pte.r, pte.w, and pte.x bits. If not,
- # stop and raise an access exception. Otherwise, the translation is
- # successful. Set pte.a to 1, and, if the memory access is a
- # store, set pte.d to 1.
- # The translated physical address is given as follows:
- # - pa.pgoff = va.pgoff.
- # - If i > 0, then this is a superpage translation and
- # pa.ppn[i-1:0] = va.vpn[i-1:0].
- # - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i].
- # 6. If i > 0 and pa.ppn[i − 1 : 0] != 0, this is a misaligned
- # superpage stop and raise a page-fault exception.
-
- m.d.sync += tag_valid.eq(0)
-
- # default assignments
- m.d.comb += [
- # PTW memory interface
- self.req_port_o.data_req.eq(0),
- self.req_port_o.data_be.eq(Const(0xFF, 8)),
- self.req_port_o.data_size.eq(Const(0b11, 2)),
- self.req_port_o.data_we.eq(0),
- self.ptw_error_o.eq(0),
- self.itlb_update_o.valid.eq(0),
- self.dtlb_update_o.valid.eq(0),
-
- self.itlb_miss_o.eq(0),
- self.dtlb_miss_o.eq(0),
- ]
-
- # ------------
- # State Machine
- # ------------
-
- with m.FSM() as fsm:
-
- with m.State("IDLE"):
- self.idle(m, is_instr_ptw, ptw_lvl, global_mapping,
- ptw_pptr, vaddr, tlb_update_asid)
-
- with m.State("WAIT_GRANT"):
- self.grant(m, tag_valid, data_rvalid)
-
- with m.State("PTE_LOOKUP"):
- # we wait for the valid signal
- with m.If(data_rvalid):
- self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
- data_rvalid, global_mapping,
- is_instr_ptw, ptw_pptr)
-
- # Propagate error to MMU/LSU
- with m.State("PROPAGATE_ERROR"):
- m.next = "IDLE"
- m.d.comb += self.ptw_error_o.eq(1)
-
- # wait for the rvalid before going back to IDLE
- with m.State("WAIT_RVALID"):
- with m.If(data_rvalid):
- m.next = "IDLE"
-
- m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata),
- data_rvalid.eq(self.req_port_i.data_rvalid)
- ]
-
- return m
-
- def set_grant_state(self, m):
- # should we have flushed before we got an rvalid,
- # wait for it until going back to IDLE
- with m.If(self.flush_i):
- with m.If (self.req_port_i.data_gnt):
- m.next = "WAIT_RVALID"
- with m.Else():
- m.next = "IDLE"
- with m.Else():
- m.next = "WAIT_GRANT"
-
- def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping,
- ptw_pptr, vaddr, tlb_update_asid):
- # by default we start with the top-most page table
- m.d.sync += [is_instr_ptw.eq(0),
- ptw_lvl.eq(LVL1),
- global_mapping.eq(0),
- self.ptw_active_o.eq(0), # deactive (IDLE)
- ]
- # work out itlb/dtlb miss
- m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \
- self.itlb_access_i & \
- ~self.itlb_hit_i & \
- ~self.dtlb_access_i)
- m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \
- self.dtlb_access_i & \
- ~self.dtlb_hit_i)
- # we got an ITLB miss?
- with m.If(self.itlb_miss_o):
- pptr = Cat(Const(0, 3), self.itlb_vaddr_i[30:48],
- self.satp_ppn_i)
- m.d.sync += [ptw_pptr.eq(pptr),
- is_instr_ptw.eq(1),
- vaddr.eq(self.itlb_vaddr_i),
- tlb_update_asid.eq(self.asid_i),
- ]
- self.set_grant_state(m)
-
- # we got a DTLB miss?
- with m.Elif(self.dtlb_miss_o):
- pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:48],
- self.satp_ppn_i)
- m.d.sync += [ptw_pptr.eq(pptr),
- vaddr.eq(self.dtlb_vaddr_i),
- tlb_update_asid.eq(self.asid_i),
- ]
- self.set_grant_state(m)
-
- def grant(self, m, tag_valid, data_rvalid):
- # we've got a data WAIT_GRANT so tell the
- # cache that the tag is valid
-
- # send a request out
- m.d.comb += self.req_port_o.data_req.eq(1)
- # wait for the WAIT_GRANT
- with m.If(self.req_port_i.data_gnt):
- # send the tag valid signal one cycle later
- m.d.sync += tag_valid.eq(1)
- # should we have flushed before we got an rvalid,
- # wait for it until going back to IDLE
- with m.If(self.flush_i):
- with m.If (~data_rvalid):
- m.next = "WAIT_RVALID"
- with m.Else():
- m.next = "IDLE"
- with m.Else():
- m.next = "PTE_LOOKUP"
-
- def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
- data_rvalid, global_mapping,
- is_instr_ptw, ptw_pptr):
- # temporaries
- pte_rx = Signal(reset_less=True)
- pte_exe = Signal(reset_less=True)
- pte_inv = Signal(reset_less=True)
- pte_a = Signal(reset_less=True)
- st_wd = Signal(reset_less=True)
- m.d.comb += [pte_rx.eq(pte.r | pte.x),
- pte_exe.eq(~pte.x | ~pte.a),
- pte_inv.eq(~pte.v | (~pte.r & pte.w)),
- pte_a.eq(pte.a & (pte.r | (pte.x & self.mxr_i))),
- st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))]
-
- l1err = Signal(reset_less=True)
- l2err = Signal(reset_less=True)
- l3err = Signal(reset_less=True)
- m.d.comb += [l3err.eq((ptw_lvl3) & pte.ppn[0:9] != Const(0,0)),
- l2err.eq((ptw_lvl2) & pte.ppn[0:18] != Const(0, 18)),
- l1err.eq((ptw_lvl1) & pte.ppn[0:27] != Const(0, 27))]
-
- # check if the global mapping bit is set
- with m.If (pte.g):
- m.d.sync += global_mapping.eq(1)
-
- m.next = "IDLE"
-
- # -------------
- # Invalid PTE
- # -------------
- # If pte.v = 0, or if pte.r = 0 and pte.w = 1,
- # stop and raise a page-fault exception.
- with m.If (pte_inv):
- m.next = "PROPAGATE_ERROR"
-
- # -----------
- # Valid PTE
- # -----------
-
- # it is a valid PTE
- # if pte.r = 1 or pte.x = 1 it is a valid PTE
- with m.Elif (pte_rx):
- # Valid translation found (either 1G, 2M or 4K)
- with m.If(is_instr_ptw):
- # ------------
- # Update ITLB
- # ------------
- # If page not executable, we can directly raise error.
- # This doesn't put a useless entry into the TLB.
- # The same idea applies to the access flag since we let
- # the access flag be managed by SW.
- with m.If (pte_exe):
- m.next = "IDLE"
- with m.Else():
- m.d.comb += self.itlb_update_o.valid.eq(1)
-
- with m.Else():
- # ------------
- # Update DTLB
- # ------------
- # Check if the access flag has been set, otherwise
- # throw page-fault and let software handle those bits.
- # If page not readable (there are no write-only pages)
- # directly raise an error. This doesn't put a useless
- # entry into the TLB.
- with m.If(pte_a):
- m.d.comb += self.dtlb_update_o.valid.eq(1)
- with m.Else():
- m.next = "PROPAGATE_ERROR"
- # Request is a store: perform additional checks
- # If the request was a store and the page not
- # write-able, raise an error
- # the same applies if the dirty flag is not set
- with m.If (st_wd):
- m.d.comb += self.dtlb_update_o.valid.eq(0)
- m.next = "PROPAGATE_ERROR"
-
- # check if the ppn is correctly aligned: Case (6)
- with m.If(l1err | l2err | l3err):
- m.next = "PROPAGATE_ERROR"
- m.d.comb += [self.dtlb_update_o.valid.eq(0),
- self.itlb_update_o.valid.eq(0)]
-
- # this is a pointer to the next TLB level
- with m.Else():
- # pointer to next level of page table
- with m.If (ptw_lvl1):
- # we are in the second level now
- pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn)
- m.d.sync += [ptw_pptr.eq(pptr),
- ptw_lvl.eq(LVL2)
- ]
- with m.If(ptw_lvl2):
- # here we received a pointer to the third level
- pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn)
- m.d.sync += [ptw_pptr.eq(pptr),
- ptw_lvl.eq(LVL3)
- ]
- with m.If(ptw_lvl3): #guess: shift page levels by one
- # here we received a pointer to the fourth level
- # the last one is near the page offset
- pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn)
- m.d.sync += [ptw_pptr.eq(pptr),
- ptw_lvl.eq(LVL4)
- ]
- self.set_grant_state(m)
-
- with m.If (ptw_lvl4):
- # Should already be the last level
- # page table => Error
- m.d.sync += ptw_lvl.eq(LVL4)
- m.next = "PROPAGATE_ERROR"
-
-
-if __name__ == '__main__':
- ptw = PTW()
- vl = rtlil.convert(ptw, ports=ptw.ports())
- with open("test_ptw.il", "w") as f:
- f.write(vl)
+++ /dev/null
-import sys
-from soc.TLB.ariane.plru import PLRU
-from nmigen.compat.sim import run_simulation
-
-
-def tbench(dut):
- yield
-
-
-if __name__ == "__main__":
- dut = PLRU(4)
- run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd")
- print("PLRU Unit Test Success")
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-from soc.TLB.ariane.ptw import PTW, PTE
-
-# unit was changed, test needs to be changed
-
-
-def tbench(dut):
-
- addr = 0x8000000
-
- #pte = PTE()
- # yield pte.v.eq(1)
- # yield pte.r.eq(1)
-
- yield dut.req_port_i.data_gnt.eq(1)
- yield dut.req_port_i.data_rvalid.eq(1)
- yield dut.req_port_i.data_rdata.eq(0x43) # pte.flatten())
-
- # data lookup
- yield dut.en_ld_st_translation_i.eq(1)
- yield dut.asid_i.eq(1)
-
- yield dut.dtlb_access_i.eq(1)
- yield dut.dtlb_hit_i.eq(0)
- yield dut.dtlb_vaddr_i.eq(0x400000000)
-
- yield
- yield
- yield
-
- yield dut.dtlb_access_i.eq(1)
- yield dut.dtlb_hit_i.eq(0)
- yield dut.dtlb_vaddr_i.eq(0x200000)
-
- yield
- yield
- yield
-
- yield dut.req_port_i.data_gnt.eq(0)
- yield dut.dtlb_access_i.eq(1)
- yield dut.dtlb_hit_i.eq(0)
- yield dut.dtlb_vaddr_i.eq(0x400000011)
-
- yield
- yield dut.req_port_i.data_gnt.eq(1)
- yield
- yield
-
- # data lookup, PTW levels 1-2-3
- addr = 0x4000000
- yield dut.dtlb_vaddr_i.eq(addr)
- yield dut.mxr_i.eq(0x1)
- yield dut.req_port_i.data_gnt.eq(1)
- yield dut.req_port_i.data_rvalid.eq(1)
- # pte.flatten())
- yield dut.req_port_i.data_rdata.eq(0x41 | (addr >> 12) << 10)
-
- yield dut.en_ld_st_translation_i.eq(1)
- yield dut.asid_i.eq(1)
-
- yield dut.dtlb_access_i.eq(1)
- yield dut.dtlb_hit_i.eq(0)
- yield dut.dtlb_vaddr_i.eq(addr)
-
- yield
- yield
- yield
- yield
- yield
- yield
- yield
- yield
-
- yield dut.req_port_i.data_gnt.eq(0)
- yield dut.dtlb_access_i.eq(1)
- yield dut.dtlb_hit_i.eq(0)
- yield dut.dtlb_vaddr_i.eq(0x400000011)
-
- yield
- yield dut.req_port_i.data_gnt.eq(1)
- yield
- yield
- yield
- yield
-
- # instruction lookup
- yield dut.en_ld_st_translation_i.eq(0)
- yield dut.enable_translation_i.eq(1)
- yield dut.asid_i.eq(1)
-
- yield dut.itlb_access_i.eq(1)
- yield dut.itlb_hit_i.eq(0)
- yield dut.itlb_vaddr_i.eq(0x800000)
-
- yield
- yield
- yield
-
- yield dut.itlb_access_i.eq(1)
- yield dut.itlb_hit_i.eq(0)
- yield dut.itlb_vaddr_i.eq(0x200000)
-
- yield
- yield
- yield
-
- yield dut.req_port_i.data_gnt.eq(0)
- yield dut.itlb_access_i.eq(1)
- yield dut.itlb_hit_i.eq(0)
- yield dut.itlb_vaddr_i.eq(0x800011)
-
- yield
- yield dut.req_port_i.data_gnt.eq(1)
- yield
- yield
-
- yield
-
-
-def test_ptw():
- dut = PTW()
- run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd")
- print("PTW Unit Test Success")
-
-
-if __name__ == "__main__":
- test_ptw()
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.ariane.tlb import TLB
-
-
-def set_vaddr(addr):
- yield dut.lu_vaddr_i.eq(addr)
- yield dut.update_i.vpn.eq(addr >> 12)
-
-
-def tbench(dut):
- yield dut.lu_access_i.eq(1)
- yield dut.lu_asid_i.eq(1)
- yield dut.update_i.valid.eq(1)
- yield dut.update_i.is_1G.eq(0)
- yield dut.update_i.is_2M.eq(0)
- yield dut.update_i.asid.eq(1)
- yield dut.update_i.content.ppn.eq(0)
- yield dut.update_i.content.rsw.eq(0)
- yield dut.update_i.content.r.eq(1)
-
- yield
-
- addr = 0x80000
- yield from set_vaddr(addr)
- yield
-
- addr = 0x90001
- yield from set_vaddr(addr)
- yield
-
- addr = 0x28000000
- yield from set_vaddr(addr)
- yield
-
- addr = 0x28000001
- yield from set_vaddr(addr)
-
- addr = 0x28000001
- yield from set_vaddr(addr)
- yield
-
- addr = 0x1000040000
- yield from set_vaddr(addr)
- yield
-
- addr = 0x1000040001
- yield from set_vaddr(addr)
- yield
-
- yield dut.update_i.is_1G.eq(1)
- addr = 0x2040000
- yield from set_vaddr(addr)
- yield
-
- yield dut.update_i.is_1G.eq(1)
- addr = 0x2040001
- yield from set_vaddr(addr)
- yield
-
- yield
-
-
-if __name__ == "__main__":
- dut = TLB()
- run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd")
- print("TLB Unit Test Success")
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.ariane.tlb_content import TLBContent
-from soc.TestUtil.test_helper import assert_op, assert_eq
-
-
-def update(dut, a, t, g, m):
- yield dut.replace_en_i.eq(1)
- yield dut.update_i.valid.eq(1)
- yield dut.update_i.is_512G.eq(t)
- yield dut.update_i.is_1G.eq(g)
- yield dut.update_i.is_2M.eq(m)
- yield dut.update_i.vpn.eq(a)
- yield
- yield
-
-
-def check_hit(dut, hit, pagesize):
- hit_d = yield dut.lu_hit_o
- assert_eq("hit", hit_d, hit)
-
- if(hit):
- if(pagesize == "t"):
- hitp = yield dut.lu_is_512G_o
- assert_eq("lu_is_512G_o", hitp, 1)
- elif(pagesize == "g"):
- hitp = yield dut.lu_is_1G_o
- assert_eq("lu_is_1G_o", hitp, 1)
- elif(pagesize == "m"):
- hitp = yield dut.lu_is_2M_o
- assert_eq("lu_is_2M_o", hitp, 1)
-
-
-def addr(a, b, c, d):
- return a | b << 9 | c << 18 | d << 27
-
-
-def tbench(dut):
- yield dut.vpn0.eq(0x0A)
- yield dut.vpn1.eq(0x0B)
- yield dut.vpn2.eq(0x0C)
- yield dut.vpn3.eq(0x0D)
- yield from update(dut, addr(0xFF, 0xFF, 0xFF, 0x0D), 1, 0, 0)
- yield from check_hit(dut, 1, "t")
-
- yield from update(dut, addr(0xFF, 0xFF, 0x0C, 0x0D), 0, 1, 0)
- yield from check_hit(dut, 1, "g")
-
- yield from update(dut, addr(0xFF, 0x0B, 0x0C, 0x0D), 0, 0, 1)
- yield from check_hit(dut, 1, "m")
-
- yield from update(dut, addr(0x0A, 0x0B, 0x0C, 0x0D), 0, 0, 0)
- yield from check_hit(dut, 1, "")
-
- yield from update(dut, addr(0xAA, 0xBB, 0xCC, 0xDD), 0, 0, 0)
- yield from check_hit(dut, 0, "miss")
-
-
-if __name__ == "__main__":
- dut = TLBContent(4, 4)
- #
- run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd")
- print("TLBContent Unit Test Success")
+++ /dev/null
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: David Schaffenrath, TU Graz
-# Author: Florian Zaruba, ETH Zurich
-# Date: 21.4.2017
-# Description: Translation Lookaside Buffer, SV48
-# fully set-associative
-
-Implementation in c++:
-https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp
-
-Text description:
-https://people.cs.clemson.edu/~mark/464/p_lru.txt
-
-Online simulator:
-http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html
-"""
-from math import log2
-from nmigen import Signal, Module, Cat, Const, Array, Elaboratable
-from nmigen.cli import verilog, rtlil
-from nmigen.lib.coding import Encoder
-
-from soc.TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH
-from soc.TLB.ariane.plru import PLRU
-from soc.TLB.ariane.tlb_content import TLBContent
-
-TLB_ENTRIES = 8
-
-
-class TLB(Elaboratable):
- def __init__(self, tlb_entries=8, asid_width=8):
- self.tlb_entries = tlb_entries
- self.asid_width = asid_width
-
- self.flush_i = Signal() # Flush signal
- # Lookup signals
- self.lu_access_i = Signal()
- self.lu_asid_i = Signal(self.asid_width)
- self.lu_vaddr_i = Signal(64)
- self.lu_content_o = PTE()
- self.lu_is_2M_o = Signal()
- self.lu_is_1G_o = Signal()
- self.lu_is_512G_o = Signal()
- self.lu_hit_o = Signal()
- # Update TLB
- self.pte_width = len(self.lu_content_o.flatten())
- self.update_i = TLBUpdate(asid_width)
-
- def elaborate(self, platform):
- m = Module()
-
- vpn3 = Signal(9) # FIXME unused signal
- vpn2 = Signal(9)
- vpn1 = Signal(9)
- vpn0 = Signal(9)
-
- # -------------
- # Translation
- # -------------
-
- # SV48 defines four levels of page tables
- m.d.comb += [vpn0.eq(self.lu_vaddr_i[12:21]),
- vpn1.eq(self.lu_vaddr_i[21:30]),
- vpn2.eq(self.lu_vaddr_i[30:39]),
- vpn3.eq(self.lu_vaddr_i[39:48]), # FIXME
- ]
-
- tc = []
- for i in range(self.tlb_entries):
- tlc = TLBContent(self.pte_width, self.asid_width)
- setattr(m.submodules, "tc%d" % i, tlc)
- tc.append(tlc)
- # connect inputs
- tlc.update_i = self.update_i # saves a lot of graphviz links
- m.d.comb += [tlc.vpn0.eq(vpn0),
- tlc.vpn1.eq(vpn1),
- tlc.vpn2.eq(vpn2),
- # TODO 4th
- tlc.flush_i.eq(self.flush_i),
- # tlc.update_i.eq(self.update_i),
- tlc.lu_asid_i.eq(self.lu_asid_i)]
- tc = Array(tc)
-
- # --------------
- # Select hit
- # --------------
-
- # use Encoder to select hit index
- # XXX TODO: assert that there's only one valid entry (one lu_hit)
- hitsel = Encoder(self.tlb_entries)
- m.submodules.hitsel = hitsel
-
- hits = []
- for i in range(self.tlb_entries):
- hits.append(tc[i].lu_hit_o)
- m.d.comb += hitsel.i.eq(Cat(*hits)) # (goes into plru as well)
- idx = hitsel.o
-
- active = Signal(reset_less=True)
- m.d.comb += active.eq(~hitsel.n)
- with m.If(active):
- # active hit, send selected as output
- m.d.comb += [self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o),
- self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o),
- self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o),
- self.lu_hit_o.eq(1),
- self.lu_content_o.flatten().eq(tc[idx].lu_content_o),
- ]
-
- # --------------
- # PLRU.
- # --------------
-
- p = PLRU(self.tlb_entries)
- plru_tree = Signal(p.TLBSZ)
- m.submodules.plru = p
-
- # connect PLRU inputs/outputs
- # XXX TODO: assert that there's only one valid entry (one replace_en)
- en = []
- for i in range(self.tlb_entries):
- en.append(tc[i].replace_en_i)
- m.d.comb += [Cat(*en).eq(p.replace_en_o), # output from PLRU into tags
- p.lu_hit.eq(hitsel.i),
- p.lu_access_i.eq(self.lu_access_i),
- p.plru_tree.eq(plru_tree)]
- m.d.sync += plru_tree.eq(p.plru_tree_o)
-
- # --------------
- # Sanity checks
- # --------------
-
- assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \
- "TLB size must be a multiple of 2 and greater than 1"
- assert (self.asid_width >= 1), \
- "ASID width must be at least 1"
-
- return m
-
- """
- # Just for checking
- function int countSetBits(logic[self.tlb_entries-1:0] vector);
- automatic int count = 0;
- foreach (vector[idx]) begin
- count += vector[idx];
- end
- return count;
- endfunction
-
- assert property (@(posedge clk_i)(countSetBits(lu_hit) <= 1))
- else $error("More then one hit in TLB!"); $stop(); end
- assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1))
- else $error("More then one TLB entry selected for next replace!");
- """
-
- def ports(self):
- return [self.flush_i, self.lu_access_i,
- self.lu_asid_i, self.lu_vaddr_i,
- self.lu_is_2M_o, self.lu_1G_o, self.lu_is_512G_o, self.lu_hit_o
- ] + self.lu_content_o.ports() + self.update_i.ports()
-
-
-if __name__ == '__main__':
- tlb = TLB()
- vl = rtlil.convert(tlb, ports=tlb.ports())
- with open("test_tlb.il", "w") as f:
- f.write(vl)
+++ /dev/null
-from nmigen import Signal, Module, Cat, Const, Elaboratable
-
-from soc.TLB.ariane.ptw import TLBUpdate, PTE
-
-
-class TLBEntry:
- def __init__(self, asid_width):
- self.asid = Signal(asid_width, name="ent_asid")
- # SV48 defines four levels of page tables
- self.vpn0 = Signal(9, name="ent_vpn0")
- self.vpn1 = Signal(9, name="ent_vpn1")
- self.vpn2 = Signal(9, name="ent_vpn2")
- self.vpn3 = Signal(9, name="ent_vpn3")
- self.is_2M = Signal(name="ent_is_2M")
- self.is_1G = Signal(name="ent_is_1G")
- self.is_512G = Signal(name="ent_is_512G")
- self.valid = Signal(name="ent_valid")
-
- def flatten(self):
- return Cat(*self.ports())
-
- def eq(self, x):
- return self.flatten().eq(x.flatten())
-
- def ports(self):
- return [self.asid, self.vpn0, self.vpn1, self.vpn2,
- self.is_2M, self.is_1G, self.valid]
-
-
-class TLBContent(Elaboratable):
- def __init__(self, pte_width, asid_width):
- self.asid_width = asid_width
- self.pte_width = pte_width
- self.flush_i = Signal() # Flush signal
- # Update TLB
- self.update_i = TLBUpdate(asid_width)
- self.vpn3 = Signal(9)
- self.vpn2 = Signal(9)
- self.vpn1 = Signal(9)
- self.vpn0 = Signal(9)
- self.replace_en_i = Signal() # replace the following entry,
- # set by replacement strategy
- # Lookup signals
- self.lu_asid_i = Signal(asid_width)
- self.lu_content_o = Signal(pte_width)
- self.lu_is_512G_o = Signal()
- self.lu_is_2M_o = Signal()
- self.lu_is_1G_o = Signal()
- self.lu_hit_o = Signal()
-
- def elaborate(self, platform):
- m = Module()
-
- tags = TLBEntry(self.asid_width)
-
- content = Signal(self.pte_width)
-
- m.d.comb += [self.lu_hit_o.eq(0),
- self.lu_is_512G_o.eq(0),
- self.lu_is_2M_o.eq(0),
- self.lu_is_1G_o.eq(0)]
-
- # temporaries for lookup
- asid_ok = Signal(reset_less=True)
- # tags_ok = Signal(reset_less=True)
-
- vpn3_ok = Signal(reset_less=True)
- vpn2_ok = Signal(reset_less=True)
- vpn1_ok = Signal(reset_less=True)
- vpn0_ok = Signal(reset_less=True)
-
- #tags_2M = Signal(reset_less=True)
- vpn0_or_2M = Signal(reset_less=True)
-
- m.d.comb += [
- # compare asid and vpn*
- asid_ok.eq(tags.asid == self.lu_asid_i),
- vpn3_ok.eq(tags.vpn3 == self.vpn3),
- vpn2_ok.eq(tags.vpn2 == self.vpn2),
- vpn1_ok.eq(tags.vpn1 == self.vpn1),
- vpn0_ok.eq(tags.vpn0 == self.vpn0),
- vpn0_or_2M.eq(tags.is_2M | vpn0_ok)
- ]
-
- with m.If(asid_ok & tags.valid):
- # first level, only vpn3 needs to match
- with m.If(tags.is_512G & vpn3_ok):
- m.d.comb += [self.lu_content_o.eq(content),
- self.lu_is_512G_o.eq(1),
- self.lu_hit_o.eq(1),
- ]
- # second level , second level vpn2 and vpn3 need to match
- with m.Elif(tags.is_1G & vpn2_ok & vpn3_ok):
- m.d.comb += [self.lu_content_o.eq(content),
- self.lu_is_1G_o.eq(1),
- self.lu_hit_o.eq(1),
- ]
- # not a giga page hit nor a tera page hit so check further
- with m.Elif(vpn1_ok):
- # this could be a 2 mega page hit or a 4 kB hit
- # output accordingly
- with m.If(vpn0_or_2M):
- m.d.comb += [self.lu_content_o.eq(content),
- self.lu_is_2M_o.eq(tags.is_2M),
- self.lu_hit_o.eq(1),
- ]
- # ------------------
- # Update or Flush
- # ------------------
-
- # temporaries
- replace_valid = Signal(reset_less=True)
- m.d.comb += replace_valid.eq(self.update_i.valid & self.replace_en_i)
-
- # flush
- with m.If(self.flush_i):
- # invalidate (flush) conditions: all if zero or just this ASID
- with m.If(self.lu_asid_i == Const(0, self.asid_width) |
- (self.lu_asid_i == tags.asid)):
- m.d.sync += tags.valid.eq(0)
-
- # normal replacement
- with m.Elif(replace_valid):
- m.d.sync += [ # update tag array
- tags.asid.eq(self.update_i.asid),
- tags.vpn3.eq(self.update_i.vpn[27:36]),
- tags.vpn2.eq(self.update_i.vpn[18:27]),
- tags.vpn1.eq(self.update_i.vpn[9:18]),
- tags.vpn0.eq(self.update_i.vpn[0:9]),
- tags.is_512G.eq(self.update_i.is_512G),
- tags.is_1G.eq(self.update_i.is_1G),
- tags.is_2M.eq(self.update_i.is_2M),
- tags.valid.eq(1),
- # and content as well
- content.eq(self.update_i.content.flatten())
- ]
- return m
-
- def ports(self):
- return [self.flush_i,
- self.lu_asid_i,
- self.lu_is_2M_o, self.lu_is_1G_o, self.lu_is_512G_o, self.lu_hit_o,
- ] + self.update_i.content.ports() + self.update_i.ports()
+++ /dev/null
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from soc.TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3
-
-from nmigen.back.pysim import Simulator, Delay, Tick
-import unittest
-
-
-class TestLFSR(unittest.TestCase):
- def test_poly(self):
- v = LFSRPolynomial()
- self.assertEqual(repr(v), "LFSRPolynomial([0])")
- self.assertEqual(str(v), "1")
- v = LFSRPolynomial([1])
- self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
- self.assertEqual(str(v), "x + 1")
- v = LFSRPolynomial([0, 1])
- self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
- self.assertEqual(str(v), "x + 1")
- v = LFSRPolynomial([1, 2])
- self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])")
- self.assertEqual(str(v), "x^2 + x + 1")
- v = LFSRPolynomial([2])
- self.assertEqual(repr(v), "LFSRPolynomial([2, 0])")
- self.assertEqual(str(v), "x^2 + 1")
- self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1")
-
- def test_lfsr_3(self):
- module = LFSR(LFSR_POLY_3)
- traces = [module.state, module.enable]
- with Simulator(module,
- vcd_file=open("Waveforms/test_LFSR2.vcd", "w"),
- gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"),
- traces=traces) as sim:
- sim.add_clock(1e-6, phase=0.25e-6)
- delay = Delay(1e-7)
-
- def async_process():
- yield module.enable.eq(0)
- yield Tick()
- self.assertEqual((yield module.state), 0x1)
- yield Tick()
- self.assertEqual((yield module.state), 0x1)
- yield module.enable.eq(1)
- yield Tick()
- yield delay
- self.assertEqual((yield module.state), 0x2)
- yield Tick()
- yield delay
- self.assertEqual((yield module.state), 0x5)
- yield Tick()
- yield delay
- self.assertEqual((yield module.state), 0x3)
- yield Tick()
- yield delay
- self.assertEqual((yield module.state), 0x7)
- yield Tick()
- yield delay
- self.assertEqual((yield module.state), 0x6)
- yield Tick()
- yield delay
- self.assertEqual((yield module.state), 0x4)
- yield Tick()
- yield delay
- self.assertEqual((yield module.state), 0x1)
- yield Tick()
-
- sim.add_process(async_process)
- sim.run()
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-from soc.TLB.AddressEncoder import AddressEncoder
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-
-# This function allows for the easy setting of values to the AddressEncoder
-# Arguments:
-# dut: The AddressEncoder being tested
-# i (Input): The array of single bits to be written
-def set_encoder(dut, i):
- yield dut.i.eq(i)
- yield
-
-# Checks the single match of the AddressEncoder
-# Arguments:
-# dut: The AddressEncoder being tested
-# sm (Single Match): The expected match result
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_single_match(dut, sm, op):
- out_sm = yield dut.single_match
- assert_op("Single Match", out_sm, sm, op)
-
-# Checks the multiple match of the AddressEncoder
-# Arguments:
-# dut: The AddressEncoder being tested
-# mm (Multiple Match): The expected match result
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_multiple_match(dut, mm, op):
- out_mm = yield dut.multiple_match
- assert_op("Multiple Match", out_mm, mm, op)
-
-# Checks the output of the AddressEncoder
-# Arguments:
-# dut: The AddressEncoder being tested
-# o (Output): The expected output
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_output(dut, o, op):
- out_o = yield dut.o
- assert_op("Output", out_o, o, op)
-
-# Checks the state of the AddressEncoder
-# Arguments:
-# dut: The AddressEncoder being tested
-# sm (Single Match): The expected match result
-# mm (Multiple Match): The expected match result
-# o (Output): The expected output
-# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-# o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-
-
-def check_all(dut, sm, mm, o, sm_op, mm_op, o_op):
- yield from check_single_match(dut, sm, sm_op)
- yield from check_multiple_match(dut, mm, mm_op)
- yield from check_output(dut, o, o_op)
-
-
-def tbench(dut):
- # Check invalid input
- in_val = 0b000
- single_match = 0
- multiple_match = 0
- output = 0
- yield from set_encoder(dut, in_val)
- yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
- # Check single bit
- in_val = 0b001
- single_match = 1
- multiple_match = 0
- output = 0
- yield from set_encoder(dut, in_val)
- yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
- # Check another single bit
- in_val = 0b100
- single_match = 1
- multiple_match = 0
- output = 2
- yield from set_encoder(dut, in_val)
- yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
- # Check multiple match
- # We expected the lowest bit to be returned which is address 0
- in_val = 0b101
- single_match = 0
- multiple_match = 1
- output = 0
- yield from set_encoder(dut, in_val)
- yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
- # Check another multiple match
- # We expected the lowest bit to be returned which is address 1
- in_val = 0b110
- single_match = 0
- multiple_match = 1
- output = 1
- yield from set_encoder(dut, in_val)
- yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-
-def test_addr():
- dut = AddressEncoder(4)
- run_simulation(dut, tbench(dut),
- vcd_name="Waveforms/test_address_encoder.vcd")
- print("AddressEncoder Unit Test Success")
-
-
-if __name__ == "__main__":
- test_addr()
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.Cam import Cam
-
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-# This function allows for the easy setting of values to the Cam
-# Arguments:
-# dut: The Cam being tested
-# e (Enable): Whether the block is going to be enabled
-# we (Write Enable): Whether the Cam will write on the next cycle
-# a (Address): Where the data will be written if write enable is high
-# d (Data): Either what we are looking for or will write to the address
-
-
-def set_cam(dut, e, we, a, d):
- yield dut.enable.eq(e)
- yield dut.write_enable.eq(we)
- yield dut.address_in.eq(a)
- yield dut.data_in.eq(d)
- yield
-
-# Checks the multiple match of the Cam
-# Arguments:
-# dut: The Cam being tested
-# mm (Multiple Match): The expected match result
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_multiple_match(dut, mm, op):
- out_mm = yield dut.multiple_match
- assert_op("Multiple Match", out_mm, mm, op)
-
-# Checks the single match of the Cam
-# Arguments:
-# dut: The Cam being tested
-# sm (Single Match): The expected match result
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_single_match(dut, sm, op):
- out_sm = yield dut.single_match
- assert_op("Single Match", out_sm, sm, op)
-
-# Checks the address output of the Cam
-# Arguments:
-# dut: The Cam being tested
-# ma (Match Address): The expected match result
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_match_address(dut, ma, op):
- out_ma = yield dut.match_address
- assert_op("Match Address", out_ma, ma, op)
-
-# Checks the state of the Cam
-# Arguments:
-# dut: The Cam being tested
-# sm (Single Match): The expected match result
-# mm (Multiple Match): The expected match result
-# ma: (Match Address): The expected address output
-# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-# ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=)
-
-
-def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op):
- yield from check_multiple_match(dut, mm, mm_op)
- yield from check_single_match(dut, sm, sm_op)
- yield from check_match_address(dut, ma, ma_op)
-
-
-def tbench(dut):
- # NA
- enable = 0
- write_enable = 0
- address = 0
- data = 0
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_single_match(dut, single_match, 0)
-
- # Read Miss Multiple
- # Note that the default starting entry data bits are all 0
- enable = 1
- write_enable = 0
- address = 0
- data = 0
- multiple_match = 1
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_multiple_match(dut, multiple_match, 0)
-
- # Read Miss
- # Note that the default starting entry data bits are all 0
- enable = 1
- write_enable = 0
- address = 0
- data = 1
- multiple_match = 0
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_single_match(dut, single_match, 0)
-
- # Write Entry 0
- enable = 1
- write_enable = 1
- address = 0
- data = 4
- multiple_match = 0
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_single_match(dut, single_match, 0)
-
- # Read Hit Entry 0
- enable = 1
- write_enable = 0
- address = 0
- data = 4
- multiple_match = 0
- single_match = 1
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
- # Search Hit
- enable = 1
- write_enable = 0
- address = 0
- data = 4
- multiple_match = 0
- single_match = 1
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
- # Search Miss
- enable = 1
- write_enable = 0
- address = 0
- data = 5
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_single_match(dut, single_match, 0)
-
- # Multiple Match test
- # Write Entry 1
- enable = 1
- write_enable = 1
- address = 1
- data = 5
- multiple_match = 0
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_single_match(dut, single_match, 0)
-
- # Write Entry 2
- # Same data as Entry 1
- enable = 1
- write_enable = 1
- address = 2
- data = 5
- multiple_match = 0
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_single_match(dut, single_match, 0)
-
- # Read Hit Data 5
- enable = 1
- write_enable = 0
- address = 1
- data = 5
- multiple_match = 1
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
- # Verify read_warning is not caused
- # Write Entry 0
- enable = 1
- write_enable = 1
- address = 0
- data = 7
- multiple_match = 0
- single_match = 0
- yield from set_cam(dut, enable, write_enable, address, data)
- # Note there is no yield we immediately attempt to read in the next cycle
-
- # Read Hit Data 7
- enable = 1
- write_enable = 0
- address = 0
- data = 7
- multiple_match = 0
- single_match = 1
- yield from set_cam(dut, enable, write_enable, address, data)
- yield
- yield from check_single_match(dut, single_match, 0)
-
- yield
-
-
-def test_cam():
- dut = Cam(4, 4)
- run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd")
- print("Cam Unit Test Success")
-
-
-if __name__ == "__main__":
- test_cam()
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-from soc.TLB.CamEntry import CamEntry
-
-# This function allows for the easy setting of values to the Cam Entry
-# Arguments:
-# dut: The CamEntry being tested
-# c (command): NA (0), Read (1), Write (2), Reserve (3)
-# d (data): The data to be set
-
-
-def set_cam_entry(dut, c, d):
- # Write desired values
- yield dut.command.eq(c)
- yield dut.data_in.eq(d)
- yield
- # Reset all lines
- yield dut.command.eq(0)
- yield dut.data_in.eq(0)
- yield
-
-# Checks the data state of the CAM entry
-# Arguments:
-# dut: The CamEntry being tested
-# d (Data): The expected data
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_data(dut, d, op):
- out_d = yield dut.data
- assert_op("Data", out_d, d, op)
-
-# Checks the match state of the CAM entry
-# Arguments:
-# dut: The CamEntry being tested
-# m (Match): The expected match
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_match(dut, m, op):
- out_m = yield dut.match
- assert_op("Match", out_m, m, op)
-
-# Checks the state of the CAM entry
-# Arguments:
-# dut: The CamEntry being tested
-# d (data): The expected data
-# m (match): The expected match
-# d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=)
-# m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-
-
-def check_all(dut, d, m, d_op, m_op):
- yield from check_data(dut, d, d_op)
- yield from check_match(dut, m, m_op)
-
-# This tbench goes through the paces of testing the CamEntry module
-# It is done by writing and then reading various combinations of key/data pairs
-# and reading the results with varying keys to verify the resulting stored
-# data is correct.
-
-
-def tbench(dut):
- # Check write
- command = 2
- data = 1
- match = 0
- yield from set_cam_entry(dut, command, data)
- yield from check_all(dut, data, match, 0, 0)
-
- # Check read miss
- command = 1
- data = 2
- match = 0
- yield from set_cam_entry(dut, command, data)
- yield from check_all(dut, data, match, 1, 0)
-
- # Check read hit
- command = 1
- data = 1
- match = 1
- yield from set_cam_entry(dut, command, data)
- yield from check_all(dut, data, match, 0, 0)
-
- # Check overwrite
- command = 2
- data = 5
- match = 0
- yield from set_cam_entry(dut, command, data)
- yield
- yield from check_all(dut, data, match, 0, 0)
-
- # Check read hit
- command = 1
- data = 5
- match = 1
- yield from set_cam_entry(dut, command, data)
- yield from check_all(dut, data, match, 0, 0)
-
- # Check reset
- command = 3
- data = 0
- match = 0
- yield from set_cam_entry(dut, command, data)
- yield from check_all(dut, data, match, 0, 0)
-
- # Extra clock cycle for waveform
- yield
-
-
-def test_camentry():
- dut = CamEntry(4)
- run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd")
- print("CamEntry Unit Test Success")
-
-
-if __name__ == "__main__":
- test_camentry()
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.PermissionValidator import PermissionValidator
-
-from soc.TestUtil.test_helper import assert_op
-
-
-def set_validator(dut, d, xwr, sm, sa, asid):
- yield dut.data.eq(d)
- yield dut.xwr.eq(xwr)
- yield dut.super_mode.eq(sm)
- yield dut.super_access.eq(sa)
- yield dut.asid.eq(asid)
- yield
-
-
-def check_valid(dut, v, op):
- out_v = yield dut.valid
- assert_op("Valid", out_v, v, op)
-
-
-def tbench(dut):
- # 80 bits represented. Ignore the MSB as it will be truncated
- # ASID is bits first 4 hex values (bits 64 - 78)
-
- # Test user mode entry valid
- # Global Bit matching ASID
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000031
- # Ignore MSB it will be truncated
- asid = 0x7FFF
- super_mode = 0
- super_access = 0
- xwr = 0
- valid = 1
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test user mode entry valid
- # Global Bit nonmatching ASID
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000031
- # Ignore MSB it will be truncated
- asid = 0x7FF6
- super_mode = 0
- super_access = 0
- xwr = 0
- valid = 1
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test user mode entry invalid
- # Global Bit nonmatching ASID
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000021
- # Ignore MSB it will be truncated
- asid = 0x7FF6
- super_mode = 0
- super_access = 0
- xwr = 0
- valid = 0
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test user mode entry valid
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000011
- # Ignore MSB it will be truncated
- asid = 0x7FFF
- super_mode = 0
- super_access = 0
- xwr = 0
- valid = 1
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test user mode entry invalid
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000011
- # Ignore MSB it will be truncated
- asid = 0x7FF6
- super_mode = 0
- super_access = 0
- xwr = 0
- valid = 0
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test supervisor mode entry valid
- # The entry is NOT in user mode
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000001
- # Ignore MSB it will be truncated
- asid = 0x7FFF
- super_mode = 1
- super_access = 0
- xwr = 0
- valid = 1
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test supervisor mode entry invalid
- # The entry is in user mode
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000011
- # Ignore MSB it will be truncated
- asid = 0x7FFF
- super_mode = 1
- super_access = 0
- xwr = 0
- valid = 0
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test supervisor mode entry valid
- # The entry is NOT in user mode with access
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000001
- # Ignore MSB it will be truncated
- asid = 0x7FFF
- super_mode = 1
- super_access = 1
- xwr = 0
- valid = 1
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
- # Test supervisor mode entry valid
- # The entry is in user mode with access
- # Ensure that user mode and valid is enabled!
- data = 0x7FFF0000000000000011
- # Ignore MSB it will be truncated
- asid = 0x7FFF
- super_mode = 1
- super_access = 1
- xwr = 0
- valid = 1
- yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
- yield from check_valid(dut, valid, 0)
-
-
-def test_permv():
- dut = PermissionValidator(15, 64)
- run_simulation(dut, tbench(
- dut), vcd_name="Waveforms/test_permission_validator.vcd")
- print("PermissionValidator Unit Test Success")
-
-
-if __name__ == "__main__":
- test_permv()
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.PteEntry import PteEntry
-
-from soc.TestUtil.test_helper import assert_op
-
-
-def set_entry(dut, i):
- yield dut.i.eq(i)
- yield
-
-
-def check_dirty(dut, d, op):
- out_d = yield dut.d
- assert_op("Dirty", out_d, d, op)
-
-
-def check_accessed(dut, a, op):
- out_a = yield dut.a
- assert_op("Accessed", out_a, a, op)
-
-
-def check_global(dut, o, op):
- out = yield dut.g
- assert_op("Global", out, o, op)
-
-
-def check_user(dut, o, op):
- out = yield dut.u
- assert_op("User Mode", out, o, op)
-
-
-def check_xwr(dut, o, op):
- out = yield dut.xwr
- assert_op("XWR", out, o, op)
-
-
-def check_asid(dut, o, op):
- out = yield dut.asid
- assert_op("ASID", out, o, op)
-
-
-def check_pte(dut, o, op):
- out = yield dut.pte
- assert_op("ASID", out, o, op)
-
-
-def check_valid(dut, v, op):
- out_v = yield dut.v
- assert_op("Valid", out_v, v, op)
-
-
-def check_all(dut, d, a, g, u, xwr, v, asid, pte):
- yield from check_dirty(dut, d, 0)
- yield from check_accessed(dut, a, 0)
- yield from check_global(dut, g, 0)
- yield from check_user(dut, u, 0)
- yield from check_xwr(dut, xwr, 0)
- yield from check_asid(dut, asid, 0)
- yield from check_pte(dut, pte, 0)
- yield from check_valid(dut, v, 0)
-
-
-def tbench(dut):
- # 80 bits represented. Ignore the MSB as it will be truncated
- # ASID is bits first 4 hex values (bits 64 - 78)
-
- i = 0x7FFF0000000000000031
- dirty = 0
- access = 0
- glob = 1
- user = 1
- xwr = 0
- valid = 1
- asid = 0x7FFF
- pte = 0x0000000000000031
- yield from set_entry(dut, i)
- yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
- i = 0x0FFF00000000000000FF
- dirty = 1
- access = 1
- glob = 1
- user = 1
- xwr = 7
- valid = 1
- asid = 0x0FFF
- pte = 0x00000000000000FF
- yield from set_entry(dut, i)
- yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
- i = 0x0721000000001100001F
- dirty = 0
- access = 0
- glob = 0
- user = 1
- xwr = 7
- valid = 1
- asid = 0x0721
- pte = 0x000000001100001F
- yield from set_entry(dut, i)
- yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
- yield
-
-
-def test_pteentry():
- dut = PteEntry(15, 64)
- run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd")
- print("PteEntry Unit Test Success")
-
-
-if __name__ == "__main__":
- test_pteentry()
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.SetAssociativeCache import SetAssociativeCache
-
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-
-def set_sac(dut, e, c, s, t, d):
- yield dut.enable.eq(e)
- yield dut.command.eq(c)
- yield dut.cset.eq(s)
- yield dut.tag.eq(t)
- yield dut.data_i.eq(d)
- yield
-
-
-def tbench(dut):
- enable = 1
- command = 2
- cset = 1
- tag = 2
- data = 3
- yield from set_sac(dut, enable, command, cset, tag, data)
- yield
-
- enable = 1
- command = 2
- cset = 1
- tag = 5
- data = 8
- yield from set_sac(dut, enable, command, cset, tag, data)
- yield
-
-
-def test_assoc_cache():
- dut = SetAssociativeCache(4, 4, 4, 4)
- run_simulation(dut, tbench(
- dut), vcd_name="Waveforms/test_set_associative_cache.vcd")
- print("Set Associative Cache Unit Test Success")
-
-
-if __name__ == "__main__":
- test_assoc_cache()
+++ /dev/null
-#import tracemalloc
-# tracemalloc.start()
-
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.TLB import TLB
-
-from soc.TestUtil.test_helper import assert_op, assert_eq
-
-# self.supermode = Signal(1) # Supervisor Mode
-# self.super_access = Signal(1) # Supervisor Access
-# self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
-# self.xwr = Signal(3) # Execute, Write, Read
-# self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
-#self.address_L1 = Signal(range(L1_size))
-# self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
-# self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
-# self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
-#
-# self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
-# self.perm_valid = Signal(1) # Denotes if the permissions are correct
-# self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
-
-COMMAND_READ = 1
-COMMAND_WRITE_L1 = 2
-
-# Checks the data state of the CAM entry
-# Arguments:
-# dut: The CamEntry being tested
-# d (Data): The expected data
-# op (Operation): (0 => ==), (1 => !=)
-
-
-def check_hit(dut, d):
- hit_d = yield dut.hit
- #assert_eq("hit", hit_d, d)
-
-
-def tst_command(dut, cmd, xwr, cycles):
- yield dut.command.eq(cmd)
- yield dut.xwr.eq(xwr)
- for i in range(0, cycles):
- yield
-
-
-def tst_write_L1(dut, vma, address_L1, asid, pte_in):
- yield dut.address_L1.eq(address_L1)
- yield dut.asid.eq(asid)
- yield dut.vma.eq(vma)
- yield dut.pte_in.eq(pte_in)
- yield from tst_command(dut, COMMAND_WRITE_L1, 7, 2)
-
-
-def tst_search(dut, vma, found):
- yield dut.vma.eq(vma)
- yield from tst_command(dut, COMMAND_READ, 7, 1)
- yield from check_hit(dut, found)
-
-
-def zero(dut):
- yield dut.supermode.eq(0)
- yield dut.super_access.eq(0)
- yield dut.mode.eq(0)
- yield dut.address_L1.eq(0)
- yield dut.asid.eq(0)
- yield dut.vma.eq(0)
- yield dut.pte_in.eq(0)
-
-
-def tbench(dut):
- yield from zero(dut)
- yield dut.mode.eq(0xF) # enable TLB
- # test hit
- yield from tst_write_L1(dut, 0xFEEDFACE, 0, 0xFFFF, 0xF0F0)
- yield from tst_search(dut, 0xFEEDFACE, 1)
- yield from tst_search(dut, 0xFACEFEED, 0)
-
-
-def test_tlb():
- dut = TLB(15, 36, 64, 8)
- run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd")
- print("TLB Unit Test Success")
-
-
-if __name__ == "__main__":
- test_tlb()
+++ /dev/null
-class DualPortSplitter(Elaboratable):
- """DualPortSplitter
-
- * one incoming PortInterface
- * two *OUTGOING* PortInterfaces
- * uses LDSTSplitter to do it
-
- (actually, thinking about it LDSTSplitter could simply be
- modified to conform to PortInterface: one in, two out)
-
- once that is done each pair of ports may be wired directly
- to the dual ports of L0CacheBuffer
-
- The split is carried out so that, regardless of alignment or
- mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
- of the address, whilst outgoing PortInterface[1] takes
- bit 4 == 1.
-
- PortInterface *may* need to be changed so that the length is
- a binary number (accepting values 1-16).
- """
-
- def __init__(self,inp):
- self.outp = [PortInterface(name="outp_0"),
- PortInterface(name="outp_1")]
- print(self.outp)
-
- def elaborate(self, platform):
- m = Module()
- comb = m.d.comb
- m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
- self.inp = splitter.pi
- comb += splitter.addr_i.eq(self.inp.addr) # XXX
- #comb += splitter.len_i.eq()
- #comb += splitter.valid_i.eq()
- comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
- comb += splitter.is_st_i.eq(self.inp.is_st_i)
- #comb += splitter.st_data_i.eq()
- #comb += splitter.sld_valid_i.eq()
- #comb += splitter.sld_data_i.eq()
- #comb += splitter.sst_valid_i.eq()
- return m
+++ /dev/null
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-# module axi4_ar_buffer
-# #(
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# input logic [AXI_ID_WIDTH-1:0] s_axi4_arid,
-# input logic [31:0] s_axi4_araddr,
-# input logic s_axi4_arvalid,
-# output logic s_axi4_arready,
-# input logic [7:0] s_axi4_arlen,
-# input logic [2:0] s_axi4_arsize,
-# input logic [1:0] s_axi4_arburst,
-# input logic s_axi4_arlock,
-# input logic [2:0] s_axi4_arprot,
-# input logic [3:0] s_axi4_arcache,
-# input logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-# output logic [AXI_ID_WIDTH-1:0] m_axi4_arid,
-# output logic [31:0] m_axi4_araddr,
-# output logic m_axi4_arvalid,
-# input logic m_axi4_arready,
-# output logic [7:0] m_axi4_arlen,
-# output logic [2:0] m_axi4_arsize,
-# output logic [1:0] m_axi4_arburst,
-# output logic m_axi4_arlock,
-# output logic [2:0] m_axi4_arprot,
-# output logic [3:0] m_axi4_arcache,
-# output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
-# );
-
-
-class axi4_ar_buffer(Elaboratable):
-
- def __init__(self):
- # self.axi4_aclk = Signal() # input
- # self.axi4_arstn = Signal() # input
- self.s_axi4_arid = Signal(AXI_ID_WIDTH) # input
- self.s_axi4_araddr = Signal(32) # input
- self.s_axi4_arvalid = Signal() # input
- self.s_axi4_arready = Signal() # output
- self.s_axi4_arlen = Signal(8) # input
- self.s_axi4_arsize = Signal(3) # input
- self.s_axi4_arburst = Signal(2) # input
- self.s_axi4_arlock = Signal() # input
- self.s_axi4_arprot = Signal(3) # input
- self.s_axi4_arcache = Signal(4) # input
- self.s_axi4_aruser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_arid = Signal(AXI_ID_WIDTH) # output
- self.m_axi4_araddr = Signal(32) # output
- self.m_axi4_arvalid = Signal() # output
- self.m_axi4_arready = Signal() # input
- self.m_axi4_arlen = Signal(8) # output
- self.m_axi4_arsize = Signal(3) # output
- self.m_axi4_arburst = Signal(2) # output
- self.m_axi4_arlock = Signal() # output
- self.m_axi4_arprot = Signal(3) # output
- self.m_axi4_arcache = Signal(4) # output
- self.m_axi4_aruser = Signal(AXI_USER_WIDTH) # output
-
- def elaborate(self, platform=None):
- m = Module()
- # #TODO use record types here
- # wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_in;
- # wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_out;
-
- # assign data_in [3:0] = s_axi4_arcache;
- # assign data_in [6:4] = s_axi4_arprot;
- # assign data_in [7] = s_axi4_arlock;
- # assign data_in [9:8] = s_axi4_arburst;
- # assign data_in [12:10] = s_axi4_arsize;
- # assign data_in [20:13] = s_axi4_arlen;
- # assign data_in [52:21] = s_axi4_araddr;
- # assign data_in [52+AXI_ID_WIDTH:53] = s_axi4_arid;
- # assign data_in[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH] = s_axi4_aruser;
- #
- # assign m_axi4_arcache = data_out[3:0];
- # assign m_axi4_arprot = data_out[6:4];
- # assign m_axi4_arlock = data_out[7];
- # assign m_axi4_arburst = data_out[9:8];
- # assign m_axi4_arsize = data_out[12:10];
- # assign m_axi4_arlen = data_out[20:13];
- # assign m_axi4_araddr = data_out[52:21];
- # assign m_axi4_arid = data_out[52+AXI_ID_WIDTH:53];
- # assign m_axi4_aruser = data_out[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH];
-
- # m.d.comb += self.m_axi4_arcache.eq(..)
- # m.d.comb += self.m_axi4_arprot.eq(..)
- # m.d.comb += self.m_axi4_arlock.eq(..)
- # m.d.comb += self.m_axi4_arburst.eq(..)
- # m.d.comb += self.m_axi4_arsize.eq(..)
- # m.d.comb += self.m_axi4_arlen.eq(..)
- # m.d.comb += self.m_axi4_araddr.eq(..)
- # m.d.comb += self.m_axi4_arid.eq(..)
- # m.d.comb += self.m_axi4_aruser.eq(..)
- return m
-
-# TODO convert axi_buffer_rab.sv
-#
-# axi_buffer_rab
-# #(
-# .DATA_WIDTH ( AXI_ID_WIDTH+AXI_USER_WIDTH+53 ),
-# .BUFFER_DEPTH ( 4 )
-# )
-# u_buffer
-# (
-# .clk ( axi4_aclk ),
-# .rstn ( axi4_arstn ),
-# .valid_out ( m_axi4_arvalid ),
-# .data_out ( data_out ),
-# .ready_in ( m_axi4_arready ),
-# .valid_in ( s_axi4_arvalid ),
-# .data_in ( data_in ),
-# .ready_out ( s_axi4_arready )
-# );
-#
-
-# endmodule
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_ar_sender(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.l1_done_o = Signal() # output
- self.l1_accept_i = Signal() # input
- self.l1_drop_i = Signal() # input
- self.l1_save_i = Signal() # input
- self.l2_done_o = Signal() # output
- self.l2_accept_i = Signal() # input
- self.l2_drop_i = Signal() # input
- self.l2_sending_o = Signal() # output
- self.l1_araddr_i = Signal(AXI_ADDR_WIDTH) # input
- self.l2_araddr_i = Signal(AXI_ADDR_WIDTH) # input
- self.s_axi4_arid = Signal(AXI_ID_WIDTH) # input
- self.s_axi4_arvalid = Signal() # input
- self.s_axi4_arready = Signal() # output
- self.s_axi4_arlen = Signal(8) # input
- self.s_axi4_arsize = Signal(3) # input
- self.s_axi4_arburst = Signal(2) # input
- self.s_axi4_arlock = Signal() # input
- self.s_axi4_arprot = Signal(3) # input
- self.s_axi4_arcache = Signal(4) # input
- self.s_axi4_aruser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_arid = Signal(AXI_ID_WIDTH) # output
- self.m_axi4_araddr = Signal(AXI_ADDR_WIDTH) # output
- self.m_axi4_arvalid = Signal() # output
- self.m_axi4_arready = Signal() # input
- self.m_axi4_arlen = Signal(8) # output
- self.m_axi4_arsize = Signal(3) # output
- self.m_axi4_arburst = Signal(2) # output
- self.m_axi4_arlock = Signal() # output
- self.m_axi4_arprot = Signal(3) # output
- self.m_axi4_arcache = Signal(4) # output
- self.m_axi4_aruser = Signal(AXI_USER_WIDTH) # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.l1_save.eq(self.None)
- m.d.comb += self.l1_done_o.eq(self.None)
- m.d.comb += self.m_axi4_arvalid.eq(self.None)
- m.d.comb += self.s_axi4_arready.eq(self.None)
- m.d.comb += self.m_axi4_aruser.eq(self.None)
- m.d.comb += self.m_axi4_arcache.eq(self.None)
- m.d.comb += self.m_axi4_arprot.eq(self.None)
- m.d.comb += self.m_axi4_arlock.eq(self.None)
- m.d.comb += self.m_axi4_arburst.eq(self.None)
- m.d.comb += self.m_axi4_arsize.eq(self.None)
- m.d.comb += self.m_axi4_arlen.eq(self.None)
- m.d.comb += self.m_axi4_araddr.eq(self.None)
- m.d.comb += self.m_axi4_arid.eq(self.None)
- m.d.comb += self.l2_sending_o.eq(self.None)
- m.d.comb += self.l2_sent.eq(self.None)
- m.d.comb += self.l2_done_o.eq(self.None)
- m.d.comb += self.m_axi4_aruser.eq(self.s_axi4_aruser)
- m.d.comb += self.m_axi4_arcache.eq(self.s_axi4_arcache)
- m.d.comb += self.m_axi4_arprot.eq(self.s_axi4_arprot)
- m.d.comb += self.m_axi4_arlock.eq(self.s_axi4_arlock)
- m.d.comb += self.m_axi4_arburst.eq(self.s_axi4_arburst)
- m.d.comb += self.m_axi4_arsize.eq(self.s_axi4_arsize)
- m.d.comb += self.m_axi4_arlen.eq(self.s_axi4_arlen)
- m.d.comb += self.m_axi4_araddr.eq(self.l1_araddr_i)
- m.d.comb += self.m_axi4_arid.eq(self.s_axi4_arid)
- m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
- m.d.comb += self.l2_available_q.eq(self.1: 'b0)
- m.d.comb += self.l2_done_o.eq(self.1: 'b0)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_ar_sender
-# #(
-# parameter AXI_ADDR_WIDTH = 40,
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4,
-# parameter ENABLE_L2TLB = 0
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# output logic l1_done_o,
-# input logic l1_accept_i,
-# input logic l1_drop_i,
-# input logic l1_save_i,
-#
-# output logic l2_done_o,
-# input logic l2_accept_i,
-# input logic l2_drop_i,
-# output logic l2_sending_o,
-#
-# input logic [AXI_ADDR_WIDTH-1:0] l1_araddr_i,
-# input logic [AXI_ADDR_WIDTH-1:0] l2_araddr_i,
-#
-# input logic [AXI_ID_WIDTH-1:0] s_axi4_arid,
-# input logic s_axi4_arvalid,
-# output logic s_axi4_arready,
-# input logic [7:0] s_axi4_arlen,
-# input logic [2:0] s_axi4_arsize,
-# input logic [1:0] s_axi4_arburst,
-# input logic s_axi4_arlock,
-# input logic [2:0] s_axi4_arprot,
-# input logic [3:0] s_axi4_arcache,
-# input logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-# output logic [AXI_ID_WIDTH-1:0] m_axi4_arid,
-# output logic [AXI_ADDR_WIDTH-1:0] m_axi4_araddr,
-# output logic m_axi4_arvalid,
-# input logic m_axi4_arready,
-# output logic [7:0] m_axi4_arlen,
-# output logic [2:0] m_axi4_arsize,
-# output logic [1:0] m_axi4_arburst,
-# output logic m_axi4_arlock,
-# output logic [2:0] m_axi4_arprot,
-# output logic [3:0] m_axi4_arcache,
-# output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
-# );
-#
-# logic l1_save;
-#
-# logic l2_sent;
-# logic l2_available_q;
-#
-# assign l1_save = l1_save_i & l2_available_q;
-#
-# assign l1_done_o = s_axi4_arvalid & s_axi4_arready ;
-#
-# // if 1: accept and forward a transaction translated by L1
-# // 2: drop or save request (if L2 slot not occupied already)
-# assign m_axi4_arvalid = (s_axi4_arvalid & l1_accept_i) |
-# l2_sending_o;
-# assign s_axi4_arready = (m_axi4_arvalid & m_axi4_arready & ~l2_sending_o) |
-# (s_axi4_arvalid & (l1_drop_i | l1_save));
-#
-# generate
-# if (ENABLE_L2TLB == 1) begin
-# logic [AXI_USER_WIDTH-1:0] l2_axi4_aruser ;
-# logic [3:0] l2_axi4_arcache ;
-# logic [3:0] l2_axi4_arregion;
-# logic [3:0] l2_axi4_arqos ;
-# logic [2:0] l2_axi4_arprot ;
-# logic l2_axi4_arlock ;
-# logic [1:0] l2_axi4_arburst ;
-# logic [2:0] l2_axi4_arsize ;
-# logic [7:0] l2_axi4_arlen ;
-# logic [AXI_ID_WIDTH-1:0] l2_axi4_arid ;
-#
-# assign m_axi4_aruser = l2_sending_o ? l2_axi4_aruser : s_axi4_aruser;
-# assign m_axi4_arcache = l2_sending_o ? l2_axi4_arcache : s_axi4_arcache;
-# assign m_axi4_arprot = l2_sending_o ? l2_axi4_arprot : s_axi4_arprot;
-# assign m_axi4_arlock = l2_sending_o ? l2_axi4_arlock : s_axi4_arlock;
-# assign m_axi4_arburst = l2_sending_o ? l2_axi4_arburst : s_axi4_arburst;
-# assign m_axi4_arsize = l2_sending_o ? l2_axi4_arsize : s_axi4_arsize;
-# assign m_axi4_arlen = l2_sending_o ? l2_axi4_arlen : s_axi4_arlen;
-# assign m_axi4_araddr = l2_sending_o ? l2_araddr_i : l1_araddr_i;
-# assign m_axi4_arid = l2_sending_o ? l2_axi4_arid : s_axi4_arid;
-#
-# // Buffer AXI signals in case of L1 miss
-# always @(posedge axi4_aclk or negedge axi4_arstn) begin
-# if (axi4_arstn == 1'b0) begin
-# l2_axi4_aruser <= 'b0;
-# l2_axi4_arcache <= 'b0;
-# l2_axi4_arprot <= 'b0;
-# l2_axi4_arlock <= 1'b0;
-# l2_axi4_arburst <= 'b0;
-# l2_axi4_arsize <= 'b0;
-# l2_axi4_arlen <= 'b0;
-# l2_axi4_arid <= 'b0;
-# end else if (l1_save) begin
-# l2_axi4_aruser <= s_axi4_aruser;
-# l2_axi4_arcache <= s_axi4_arcache;
-# l2_axi4_arprot <= s_axi4_arprot;
-# l2_axi4_arlock <= s_axi4_arlock;
-# l2_axi4_arburst <= s_axi4_arburst;
-# l2_axi4_arsize <= s_axi4_arsize;
-# l2_axi4_arlen <= s_axi4_arlen;
-# l2_axi4_arid <= s_axi4_arid;
-# end
-# end
-#
-# // signal that an l1_save_i can be accepted
-# always @(posedge axi4_aclk or negedge axi4_arstn) begin
-# if (axi4_arstn == 1'b0) begin
-# l2_available_q <= 1'b1;
-# end else if (l2_sent | l2_drop_i) begin
-# l2_available_q <= 1'b1;
-# end else if (l1_save) begin
-# l2_available_q <= 1'b0;
-# end
-# end
-#
-# assign l2_sending_o = l2_accept_i & ~l2_available_q;
-# assign l2_sent = l2_sending_o & m_axi4_arvalid & m_axi4_arready;
-#
-# // if 1: having sent out a transaction translated by L2
-# // 2: drop request (L2 slot is available again)
-# assign l2_done_o = l2_sent | l2_drop_i;
-#
-# end else begin // !`ifdef ENABLE_L2TLB
-# assign m_axi4_aruser = s_axi4_aruser;
-# assign m_axi4_arcache = s_axi4_arcache;
-# assign m_axi4_arprot = s_axi4_arprot;
-# assign m_axi4_arlock = s_axi4_arlock;
-# assign m_axi4_arburst = s_axi4_arburst;
-# assign m_axi4_arsize = s_axi4_arsize;
-# assign m_axi4_arlen = s_axi4_arlen;
-# assign m_axi4_araddr = l1_araddr_i;
-# assign m_axi4_arid = s_axi4_arid;
-#
-# assign l2_sending_o = 1'b0;
-# assign l2_available_q = 1'b0;
-# assign l2_done_o = 1'b0;
-# end // else: !if(ENABLE_L2TLB == 1)
-# endgenerate
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_aw_buffer(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.s_axi4_awid = Signal(AXI_ID_WIDTH) # input
- self.s_axi4_awaddr = Signal(32) # input
- self.s_axi4_awvalid = Signal() # input
- self.s_axi4_awready = Signal() # output
- self.s_axi4_awlen = Signal(8) # input
- self.s_axi4_awsize = Signal(3) # input
- self.s_axi4_awburst = Signal(2) # input
- self.s_axi4_awlock = Signal() # input
- self.s_axi4_awprot = Signal(3) # input
- self.s_axi4_awcache = Signal(4) # input
- self.s_axi4_awregion = Signal(4) # input
- self.s_axi4_awqos = Signal(4) # input
- self.s_axi4_awuser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_awid = Signal(AXI_ID_WIDTH) # output
- self.m_axi4_awaddr = Signal(32) # output
- self.m_axi4_awvalid = Signal() # output
- self.m_axi4_awready = Signal() # input
- self.m_axi4_awlen = Signal(8) # output
- self.m_axi4_awsize = Signal(3) # output
- self.m_axi4_awburst = Signal(2) # output
- self.m_axi4_awlock = Signal() # output
- self.m_axi4_awprot = Signal(3) # output
- self.m_axi4_awcache = Signal(4) # output
- self.m_axi4_awregion = Signal(4) # output
- self.m_axi4_awqos = Signal(4) # output
- self.m_axi4_awuser = Signal(AXI_USER_WIDTH) # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.None.eq(self.s_axi4_awcache)
- m.d.comb += self.None.eq(self.s_axi4_awprot)
- m.d.comb += self.None.eq(self.s_axi4_awlock)
- m.d.comb += self.None.eq(self.s_axi4_awburst)
- m.d.comb += self.None.eq(self.s_axi4_awsize)
- m.d.comb += self.None.eq(self.s_axi4_awlen)
- m.d.comb += self.None.eq(self.s_axi4_awaddr)
- m.d.comb += self.None.eq(self.s_axi4_awregion)
- m.d.comb += self.None.eq(self.s_axi4_awqos)
- m.d.comb += self.None.eq(self.s_axi4_awid)
- m.d.comb += self.None.eq(self.s_axi4_awuser)
- m.d.comb += self.m_axi4_awcache.eq(self.None)
- m.d.comb += self.m_axi4_awprot.eq(self.None)
- m.d.comb += self.m_axi4_awlock.eq(self.None)
- m.d.comb += self.m_axi4_awburst.eq(self.None)
- m.d.comb += self.m_axi4_awsize.eq(self.None)
- m.d.comb += self.m_axi4_awlen.eq(self.None)
- m.d.comb += self.m_axi4_awaddr.eq(self.None)
- m.d.comb += self.m_axi4_awregion.eq(self.None)
- m.d.comb += self.m_axi4_awqos.eq(self.None)
- m.d.comb += self.m_axi4_awid.eq(self.None)
- m.d.comb += self.m_axi4_awuser.eq(self.None)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_aw_buffer
-# #(
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# input logic [AXI_ID_WIDTH-1:0] s_axi4_awid,
-# input logic [31:0] s_axi4_awaddr,
-# input logic s_axi4_awvalid,
-# output logic s_axi4_awready,
-# input logic [7:0] s_axi4_awlen,
-# input logic [2:0] s_axi4_awsize,
-# input logic [1:0] s_axi4_awburst,
-# input logic s_axi4_awlock,
-# input logic [2:0] s_axi4_awprot,
-# input logic [3:0] s_axi4_awcache,
-# input logic [3:0] s_axi4_awregion,
-# input logic [3:0] s_axi4_awqos,
-# input logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-# output logic [AXI_ID_WIDTH-1:0] m_axi4_awid,
-# output logic [31:0] m_axi4_awaddr,
-# output logic m_axi4_awvalid,
-# input logic m_axi4_awready,
-# output logic [7:0] m_axi4_awlen,
-# output logic [2:0] m_axi4_awsize,
-# output logic [1:0] m_axi4_awburst,
-# output logic m_axi4_awlock,
-# output logic [2:0] m_axi4_awprot,
-# output logic [3:0] m_axi4_awcache,
-# output logic [3:0] m_axi4_awregion,
-# output logic [3:0] m_axi4_awqos,
-# output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
-# );
-#
-# wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_in;
-# wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_out;
-#
-# assign data_in [3:0] = s_axi4_awcache;
-# assign data_in [6:4] = s_axi4_awprot;
-# assign data_in [7] = s_axi4_awlock;
-# assign data_in [9:8] = s_axi4_awburst;
-# assign data_in [12:10] = s_axi4_awsize;
-# assign data_in [20:13] = s_axi4_awlen;
-# assign data_in [52:21] = s_axi4_awaddr;
-# assign data_in [56:53] = s_axi4_awregion;
-# assign data_in [60:57] = s_axi4_awqos;
-# assign data_in [60+AXI_ID_WIDTH:61] = s_axi4_awid;
-# assign data_in [60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH] = s_axi4_awuser;
-#
-# assign m_axi4_awcache = data_out[3:0];
-# assign m_axi4_awprot = data_out[6:4];
-# assign m_axi4_awlock = data_out[7];
-# assign m_axi4_awburst = data_out[9:8];
-# assign m_axi4_awsize = data_out[12:10];
-# assign m_axi4_awlen = data_out[20:13];
-# assign m_axi4_awaddr = data_out[52:21];
-# assign m_axi4_awregion = data_out[56:53];
-# assign m_axi4_awqos = data_out[60:57];
-# assign m_axi4_awid = data_out[60+AXI_ID_WIDTH:61];
-# assign m_axi4_awuser = data_out[60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH];
-#
-# axi_buffer_rab
-# #(
-# .DATA_WIDTH ( AXI_ID_WIDTH+AXI_USER_WIDTH+61 ),
-# .BUFFER_DEPTH ( 4 )
-# )
-# u_buffer
-# (
-# .clk ( axi4_aclk ),
-# .rstn ( axi4_arstn ),
-# .valid_out ( m_axi4_awvalid ),
-# .data_out ( data_out ),
-# .ready_in ( m_axi4_awready ),
-# .valid_in ( s_axi4_awvalid ),
-# .data_in ( data_in ),
-# .ready_out ( s_axi4_awready )
-# );
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_aw_sender(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.l1_done_o = Signal() # output
- self.l1_accept_i = Signal() # input
- self.l1_drop_i = Signal() # input
- self.l1_save_i = Signal() # input
- self.l2_done_o = Signal() # output
- self.l2_accept_i = Signal() # input
- self.l2_drop_i = Signal() # input
- self.l2_sending_o = Signal() # output
- self.l1_awaddr_i = Signal(AXI_ADDR_WIDTH) # input
- self.l2_awaddr_i = Signal(AXI_ADDR_WIDTH) # input
- self.s_axi4_awid = Signal(AXI_ID_WIDTH) # input
- self.s_axi4_awvalid = Signal() # input
- self.s_axi4_awready = Signal() # output
- self.s_axi4_awlen = Signal(8) # input
- self.s_axi4_awsize = Signal(3) # input
- self.s_axi4_awburst = Signal(2) # input
- self.s_axi4_awlock = Signal() # input
- self.s_axi4_awprot = Signal(3) # input
- self.s_axi4_awcache = Signal(4) # input
- self.s_axi4_awregion = Signal(4) # input
- self.s_axi4_awqos = Signal(4) # input
- self.s_axi4_awuser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_awid = Signal(AXI_ID_WIDTH) # output
- self.m_axi4_awaddr = Signal(AXI_ADDR_WIDTH) # output
- self.m_axi4_awvalid = Signal() # output
- self.m_axi4_awready = Signal() # input
- self.m_axi4_awlen = Signal(8) # output
- self.m_axi4_awsize = Signal(3) # output
- self.m_axi4_awburst = Signal(2) # output
- self.m_axi4_awlock = Signal() # output
- self.m_axi4_awprot = Signal(3) # output
- self.m_axi4_awcache = Signal(4) # output
- self.m_axi4_awregion = Signal(4) # output
- self.m_axi4_awqos = Signal(4) # output
- self.m_axi4_awuser = Signal(AXI_USER_WIDTH) # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.l1_save.eq(self.None)
- m.d.comb += self.l1_done_o.eq(self.None)
- m.d.comb += self.m_axi4_awvalid.eq(self.None)
- m.d.comb += self.s_axi4_awready.eq(self.None)
- m.d.comb += self.m_axi4_awuser.eq(self.None)
- m.d.comb += self.m_axi4_awcache.eq(self.None)
- m.d.comb += self.m_axi4_awregion.eq(self.None)
- m.d.comb += self.m_axi4_awqos.eq(self.None)
- m.d.comb += self.m_axi4_awprot.eq(self.None)
- m.d.comb += self.m_axi4_awlock.eq(self.None)
- m.d.comb += self.m_axi4_awburst.eq(self.None)
- m.d.comb += self.m_axi4_awsize.eq(self.None)
- m.d.comb += self.m_axi4_awlen.eq(self.None)
- m.d.comb += self.m_axi4_awaddr.eq(self.None)
- m.d.comb += self.m_axi4_awid.eq(self.None)
- m.d.comb += self.l2_sending_o.eq(self.None)
- m.d.comb += self.l2_sent.eq(self.None)
- m.d.comb += self.l2_done_o.eq(self.None)
- m.d.comb += self.m_axi4_awuser.eq(self.s_axi4_awuser)
- m.d.comb += self.m_axi4_awcache.eq(self.s_axi4_awcache)
- m.d.comb += self.m_axi4_awregion.eq(self.s_axi4_awregion)
- m.d.comb += self.m_axi4_awqos.eq(self.s_axi4_awqos)
- m.d.comb += self.m_axi4_awprot.eq(self.s_axi4_awprot)
- m.d.comb += self.m_axi4_awlock.eq(self.s_axi4_awlock)
- m.d.comb += self.m_axi4_awburst.eq(self.s_axi4_awburst)
- m.d.comb += self.m_axi4_awsize.eq(self.s_axi4_awsize)
- m.d.comb += self.m_axi4_awlen.eq(self.s_axi4_awlen)
- m.d.comb += self.m_axi4_awaddr.eq(self.l1_awaddr_i)
- m.d.comb += self.m_axi4_awid.eq(self.s_axi4_awid)
- m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
- m.d.comb += self.l2_available_q.eq(self.1: 'b0)
- m.d.comb += self.l2_done_o.eq(self.1: 'b0)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_aw_sender
-# #(
-# parameter AXI_ADDR_WIDTH = 40,
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4,
-# parameter ENABLE_L2TLB = 0
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# output logic l1_done_o,
-# input logic l1_accept_i,
-# input logic l1_drop_i,
-# input logic l1_save_i,
-#
-# output logic l2_done_o,
-# input logic l2_accept_i,
-# input logic l2_drop_i,
-# output logic l2_sending_o,
-#
-# input logic [AXI_ADDR_WIDTH-1:0] l1_awaddr_i,
-# input logic [AXI_ADDR_WIDTH-1:0] l2_awaddr_i,
-#
-# input logic [AXI_ID_WIDTH-1:0] s_axi4_awid,
-# input logic s_axi4_awvalid,
-# output logic s_axi4_awready,
-# input logic [7:0] s_axi4_awlen,
-# input logic [2:0] s_axi4_awsize,
-# input logic [1:0] s_axi4_awburst,
-# input logic s_axi4_awlock,
-# input logic [2:0] s_axi4_awprot,
-# input logic [3:0] s_axi4_awcache,
-# input logic [3:0] s_axi4_awregion,
-# input logic [3:0] s_axi4_awqos,
-# input logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-# output logic [AXI_ID_WIDTH-1:0] m_axi4_awid,
-# output logic [AXI_ADDR_WIDTH-1:0] m_axi4_awaddr,
-# output logic m_axi4_awvalid,
-# input logic m_axi4_awready,
-# output logic [7:0] m_axi4_awlen,
-# output logic [2:0] m_axi4_awsize,
-# output logic [1:0] m_axi4_awburst,
-# output logic m_axi4_awlock,
-# output logic [2:0] m_axi4_awprot,
-# output logic [3:0] m_axi4_awcache,
-# output logic [3:0] m_axi4_awregion,
-# output logic [3:0] m_axi4_awqos,
-# output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
-# );
-#
-# logic l1_save;
-#
-# logic l2_sent;
-# logic l2_available_q;
-#
-# assign l1_save = l1_save_i & l2_available_q;
-#
-# assign l1_done_o = s_axi4_awvalid & s_axi4_awready ;
-#
-# // if 1: accept and forward a transaction translated by L1
-# // 2: drop or save request (if L2 slot not occupied already)
-# assign m_axi4_awvalid = (s_axi4_awvalid & l1_accept_i) |
-# l2_sending_o;
-# assign s_axi4_awready = (m_axi4_awvalid & m_axi4_awready & ~l2_sending_o) |
-# (s_axi4_awvalid & (l1_drop_i | l1_save));
-#
-# generate
-# if (ENABLE_L2TLB == 1) begin
-# logic [AXI_USER_WIDTH-1:0] l2_axi4_awuser ;
-# logic [3:0] l2_axi4_awcache ;
-# logic [3:0] l2_axi4_awregion;
-# logic [3:0] l2_axi4_awqos ;
-# logic [2:0] l2_axi4_awprot ;
-# logic l2_axi4_awlock ;
-# logic [1:0] l2_axi4_awburst ;
-# logic [2:0] l2_axi4_awsize ;
-# logic [7:0] l2_axi4_awlen ;
-# logic [AXI_ID_WIDTH-1:0] l2_axi4_awid ;
-#
-# assign m_axi4_awuser = l2_sending_o ? l2_axi4_awuser : s_axi4_awuser;
-# assign m_axi4_awcache = l2_sending_o ? l2_axi4_awcache : s_axi4_awcache;
-# assign m_axi4_awregion = l2_sending_o ? l2_axi4_awregion : s_axi4_awregion;
-# assign m_axi4_awqos = l2_sending_o ? l2_axi4_awqos : s_axi4_awqos;
-# assign m_axi4_awprot = l2_sending_o ? l2_axi4_awprot : s_axi4_awprot;
-# assign m_axi4_awlock = l2_sending_o ? l2_axi4_awlock : s_axi4_awlock;
-# assign m_axi4_awburst = l2_sending_o ? l2_axi4_awburst : s_axi4_awburst;
-# assign m_axi4_awsize = l2_sending_o ? l2_axi4_awsize : s_axi4_awsize;
-# assign m_axi4_awlen = l2_sending_o ? l2_axi4_awlen : s_axi4_awlen;
-# assign m_axi4_awaddr = l2_sending_o ? l2_awaddr_i : l1_awaddr_i;
-# assign m_axi4_awid = l2_sending_o ? l2_axi4_awid : s_axi4_awid;
-#
-# // buffer AXI signals in case of L1 miss
-# always @(posedge axi4_aclk or negedge axi4_arstn) begin
-# if (axi4_arstn == 1'b0) begin
-# l2_axi4_awuser <= 'b0;
-# l2_axi4_awcache <= 'b0;
-# l2_axi4_awregion <= 'b0;
-# l2_axi4_awqos <= 'b0;
-# l2_axi4_awprot <= 'b0;
-# l2_axi4_awlock <= 1'b0;
-# l2_axi4_awburst <= 'b0;
-# l2_axi4_awsize <= 'b0;
-# l2_axi4_awlen <= 'b0;
-# l2_axi4_awid <= 'b0;
-# end else if (l1_save) begin
-# l2_axi4_awuser <= s_axi4_awuser;
-# l2_axi4_awcache <= s_axi4_awcache;
-# l2_axi4_awregion <= s_axi4_awregion;
-# l2_axi4_awqos <= s_axi4_awqos;
-# l2_axi4_awprot <= s_axi4_awprot;
-# l2_axi4_awlock <= s_axi4_awlock;
-# l2_axi4_awburst <= s_axi4_awburst;
-# l2_axi4_awsize <= s_axi4_awsize;
-# l2_axi4_awlen <= s_axi4_awlen;
-# l2_axi4_awid <= s_axi4_awid;
-# end
-# end
-#
-# // signal that an l1_save_i can be accepted
-# always @(posedge axi4_aclk or negedge axi4_arstn) begin
-# if (axi4_arstn == 1'b0) begin
-# l2_available_q <= 1'b1;
-# end else if (l2_sent | l2_drop_i) begin
-# l2_available_q <= 1'b1;
-# end else if (l1_save) begin
-# l2_available_q <= 1'b0;
-# end
-# end
-#
-# assign l2_sending_o = l2_accept_i & ~l2_available_q;
-# assign l2_sent = l2_sending_o & m_axi4_awvalid & m_axi4_awready;
-#
-# // if 1: having sent out a transaction translated by L2
-# // 2: drop request (L2 slot is available again)
-# assign l2_done_o = l2_sent | l2_drop_i;
-#
-# end else begin // !`ifdef ENABLE_L2TLB
-# assign m_axi4_awuser = s_axi4_awuser;
-# assign m_axi4_awcache = s_axi4_awcache;
-# assign m_axi4_awregion = s_axi4_awregion;
-# assign m_axi4_awqos = s_axi4_awqos;
-# assign m_axi4_awprot = s_axi4_awprot;
-# assign m_axi4_awlock = s_axi4_awlock;
-# assign m_axi4_awburst = s_axi4_awburst;
-# assign m_axi4_awsize = s_axi4_awsize;
-# assign m_axi4_awlen = s_axi4_awlen;
-# assign m_axi4_awaddr = l1_awaddr_i;
-# assign m_axi4_awid = s_axi4_awid;
-#
-# assign l2_sending_o = 1'b0;
-# assign l2_available_q = 1'b0;
-# assign l2_done_o = 1'b0;
-# end // !`ifdef ENABLE_L2TLB
-# endgenerate
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_b_buffer(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.s_axi4_bid = Signal(AXI_ID_WIDTH) # output
- self.s_axi4_bresp = Signal(2) # output
- self.s_axi4_bvalid = Signal() # output
- self.s_axi4_buser = Signal(AXI_USER_WIDTH) # output
- self.s_axi4_bready = Signal() # input
- self.m_axi4_bid = Signal(AXI_ID_WIDTH) # input
- self.m_axi4_bresp = Signal(2) # input
- self.m_axi4_bvalid = Signal() # input
- self.m_axi4_buser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_bready = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.None.eq(self.m_axi4_bresp)
- m.d.comb += self.None.eq(self.m_axi4_bid)
- m.d.comb += self.None.eq(self.m_axi4_buser)
- m.d.comb += self.s_axi4_buser.eq(self.None)
- m.d.comb += self.s_axi4_bid.eq(self.None)
- m.d.comb += self.s_axi4_bresp.eq(self.None)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_b_buffer
-# #(
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# output logic [AXI_ID_WIDTH-1:0] s_axi4_bid,
-# output logic [1:0] s_axi4_bresp,
-# output logic s_axi4_bvalid,
-# output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
-# input logic s_axi4_bready,
-#
-# input logic [AXI_ID_WIDTH-1:0] m_axi4_bid,
-# input logic [1:0] m_axi4_bresp,
-# input logic m_axi4_bvalid,
-# input logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
-# output logic m_axi4_bready
-# );
-#
-# wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_in;
-# wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_out;
-#
-# assign data_in [1:0] = m_axi4_bresp;
-# assign data_in [AXI_ID_WIDTH+1:2] = m_axi4_bid;
-# assign data_in[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2] = m_axi4_buser;
-#
-# assign s_axi4_buser = data_out[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2];
-# assign s_axi4_bid = data_out[AXI_ID_WIDTH+1:2];
-# assign s_axi4_bresp = data_out[1:0];
-#
-# axi_buffer_rab
-# #(
-# .DATA_WIDTH ( AXI_ID_WIDTH+AXI_USER_WIDTH+2 ),
-# .BUFFER_DEPTH ( 4 )
-# )
-# u_buffer
-# (
-# .clk ( axi4_aclk ),
-# .rstn ( axi4_arstn ),
-# .valid_out( s_axi4_bvalid ),
-# .data_out ( data_out ),
-# .ready_in ( s_axi4_bready ),
-# .valid_in ( m_axi4_bvalid ),
-# .data_in ( data_in ),
-# .ready_out( m_axi4_bready )
-# );
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_b_sender(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.drop_i = Signal() # input
- self.done_o = Signal() # output
- self.id_i = Signal(AXI_ID_WIDTH) # input
- self.prefetch_i = Signal() # input
- self.hit_i = Signal() # input
- self.s_axi4_bid = Signal(AXI_ID_WIDTH) # output
- self.s_axi4_bresp = Signal(2) # output
- self.s_axi4_bvalid = Signal() # output
- self.s_axi4_buser = Signal(AXI_USER_WIDTH) # output
- self.s_axi4_bready = Signal() # input
- self.m_axi4_bid = Signal(AXI_ID_WIDTH) # input
- self.m_axi4_bresp = Signal(2) # input
- self.m_axi4_bvalid = Signal() # input
- self.m_axi4_buser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_bready = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.fifo_push.eq(self.None)
- m.d.comb += self.done_o.eq(self.fifo_push)
- m.d.comb += self.fifo_pop.eq(self.None)
- m.d.comb += self.s_axi4_buser.eq(self.None)
- m.d.comb += self.s_axi4_bid.eq(self.None)
- m.d.comb += self.s_axi4_bresp.eq(self.None)
- m.d.comb += self.s_axi4_bvalid.eq(self.None)
- m.d.comb += self.m_axi4_bready.eq(self.None)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_b_sender
-# #(
-# parameter AXI_ID_WIDTH = 10,
-# parameter AXI_USER_WIDTH = 4
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# input logic drop_i,
-# output logic done_o,
-# input logic [AXI_ID_WIDTH-1:0] id_i,
-# input logic prefetch_i,
-# input logic hit_i,
-#
-# output logic [AXI_ID_WIDTH-1:0] s_axi4_bid,
-# output logic [1:0] s_axi4_bresp,
-# output logic s_axi4_bvalid,
-# output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
-# input logic s_axi4_bready,
-#
-# input logic [AXI_ID_WIDTH-1:0] m_axi4_bid,
-# input logic [1:0] m_axi4_bresp,
-# input logic m_axi4_bvalid,
-# input logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
-# output logic m_axi4_bready
-# );
-#
-# logic fifo_valid;
-# logic fifo_pop;
-# logic fifo_push;
-# logic fifo_ready;
-# logic [AXI_ID_WIDTH-1:0] id;
-# logic prefetch;
-# logic hit;
-#
-# logic dropping;
-#
-# axi_buffer_rab
-# #(
-# .DATA_WIDTH ( 2+AXI_ID_WIDTH ),
-# .BUFFER_DEPTH ( 4 )
-# )
-# u_fifo
-# (
-# .clk ( axi4_aclk ),
-# .rstn ( axi4_arstn ),
-# // Pop
-# .data_out ( {prefetch, hit, id} ),
-# .valid_out ( fifo_valid ),
-# .ready_in ( fifo_pop ),
-# // Push
-# .valid_in ( fifo_push ),
-# .data_in ( {prefetch_i, hit_i, id_i} ),
-# .ready_out ( fifo_ready )
-# );
-#
-# assign fifo_push = drop_i & fifo_ready;
-# assign done_o = fifo_push;
-#
-# assign fifo_pop = dropping & s_axi4_bready;
-#
-# always @ (posedge axi4_aclk or negedge axi4_arstn) begin
-# if (axi4_arstn == 1'b0) begin
-# dropping <= 1'b0;
-# end else begin
-# if (fifo_valid && ~dropping)
-# dropping <= 1'b1;
-# else if (fifo_pop)
-# dropping <= 1'b0;
-# end
-# end
-#
-# assign s_axi4_buser = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_buser;
-# assign s_axi4_bid = dropping ? id : m_axi4_bid;
-#
-# assign s_axi4_bresp = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
-# (dropping & prefetch ) ? 2'b10 : // prefetch miss
-# (dropping & hit) ? 2'b10 : // non-prefetch multi, prot
-# (dropping ) ? 2'b10 : // non-prefetch miss
-# m_axi4_bresp;
-#
-# assign s_axi4_bvalid = dropping | m_axi4_bvalid;
-# assign m_axi4_bready = ~dropping & s_axi4_bready;
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_r_buffer(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.s_axi4_rid = Signal(AXI_ID_WIDTH) # output
- self.s_axi4_rresp = Signal(2) # output
- self.s_axi4_rdata = Signal(AXI_DATA_WIDTH) # output
- self.s_axi4_rlast = Signal() # output
- self.s_axi4_rvalid = Signal() # output
- self.s_axi4_ruser = Signal(AXI_USER_WIDTH) # output
- self.s_axi4_rready = Signal() # input
- self.m_axi4_rid = Signal(AXI_ID_WIDTH) # input
- self.m_axi4_rresp = Signal(2) # input
- self.m_axi4_rdata = Signal(AXI_DATA_WIDTH) # input
- self.m_axi4_rlast = Signal() # input
- self.m_axi4_rvalid = Signal() # input
- self.m_axi4_ruser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_rready = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.None.eq(self.m_axi4_rresp)
- m.d.comb += self.None.eq(self.m_axi4_rlast)
- m.d.comb += self.None.eq(self.m_axi4_rid)
- m.d.comb += self.None.eq(self.m_axi4_rdata)
- m.d.comb += self.None.eq(self.m_axi4_ruser)
- m.d.comb += self.s_axi4_rresp.eq(self.None)
- m.d.comb += self.s_axi4_rlast.eq(self.None)
- m.d.comb += self.s_axi4_rid.eq(self.None)
- m.d.comb += self.s_axi4_rdata.eq(self.None)
- m.d.comb += self.s_axi4_ruser.eq(self.None)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_r_buffer
-# #(
-# parameter AXI_DATA_WIDTH = 32,
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# output logic [AXI_ID_WIDTH-1:0] s_axi4_rid,
-# output logic [1:0] s_axi4_rresp,
-# output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-# output logic s_axi4_rlast,
-# output logic s_axi4_rvalid,
-# output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-# input logic s_axi4_rready,
-#
-# input logic [AXI_ID_WIDTH-1:0] m_axi4_rid,
-# input logic [1:0] m_axi4_rresp,
-# input logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
-# input logic m_axi4_rlast,
-# input logic m_axi4_rvalid,
-# input logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
-# output logic m_axi4_rready
-# );
-#
-# wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_in;
-# wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_out;
-#
-# localparam ID_START = 3;
-# localparam ID_END = AXI_ID_WIDTH-1 + ID_START;
-# localparam DATA_START = ID_END + 1;
-# localparam DATA_END = AXI_DATA_WIDTH-1 + DATA_START;
-# localparam USER_START = DATA_END + 1;
-# localparam USER_END = AXI_USER_WIDTH-1 + USER_START;
-#
-# assign data_in [1:0] = m_axi4_rresp;
-# assign data_in [2] = m_axi4_rlast;
-# assign data_in [ID_END:ID_START] = m_axi4_rid;
-# assign data_in[DATA_END:DATA_START] = m_axi4_rdata;
-# assign data_in[USER_END:USER_START] = m_axi4_ruser;
-#
-# assign s_axi4_rresp = data_out [1:0];
-# assign s_axi4_rlast = data_out [2];
-# assign s_axi4_rid = data_out [ID_END:ID_START];
-# assign s_axi4_rdata = data_out[DATA_END:DATA_START];
-# assign s_axi4_ruser = data_out[USER_END:USER_START];
-#
-# axi_buffer_rab
-# #(
-# .DATA_WIDTH ( AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3 ),
-# .BUFFER_DEPTH ( 4 )
-# )
-# u_buffer
-# (
-# .clk ( axi4_aclk ),
-# .rstn ( axi4_arstn ),
-# // Pop
-# .valid_out ( s_axi4_rvalid ),
-# .data_out ( data_out ),
-# .ready_in ( s_axi4_rready ),
-# // Push
-# .valid_in ( m_axi4_rvalid ),
-# .data_in ( data_in ),
-# .ready_out ( m_axi4_rready )
-# );
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_r_sender(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.drop_i = Signal() # input
- self.drop_len_i = Signal(8) # input
- self.done_o = Signal() # output
- self.id_i = Signal(AXI_ID_WIDTH) # input
- self.prefetch_i = Signal() # input
- self.hit_i = Signal() # input
- self.s_axi4_rid = Signal(AXI_ID_WIDTH) # output
- self.s_axi4_rresp = Signal(2) # output
- self.s_axi4_rdata = Signal(AXI_DATA_WIDTH) # output
- self.s_axi4_rlast = Signal() # output
- self.s_axi4_rvalid = Signal() # output
- self.s_axi4_ruser = Signal(AXI_USER_WIDTH) # output
- self.s_axi4_rready = Signal() # input
- self.m_axi4_rid = Signal(AXI_ID_WIDTH) # input
- self.m_axi4_rresp = Signal(2) # input
- self.m_axi4_rdata = Signal(AXI_DATA_WIDTH) # input
- self.m_axi4_rlast = Signal() # input
- self.m_axi4_rvalid = Signal() # input
- self.m_axi4_ruser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_rready = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.fifo_push.eq(self.None)
- m.d.comb += self.done_o.eq(self.fifo_push)
- m.d.comb += self.s_axi4_rdata.eq(self.m_axi4_rdata)
- m.d.comb += self.s_axi4_ruser.eq(self.None)
- m.d.comb += self.s_axi4_rid.eq(self.None)
- m.d.comb += self.s_axi4_rresp.eq(self.None)
- m.d.comb += self.s_axi4_rvalid.eq(self.None)
- m.d.comb += self.m_axi4_rready.eq(self.None)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# module axi4_r_sender
-# #(
-# parameter AXI_DATA_WIDTH = 32,
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# input logic drop_i,
-# input logic [7:0] drop_len_i,
-# output logic done_o,
-# input logic [AXI_ID_WIDTH-1:0] id_i,
-# input logic prefetch_i,
-# input logic hit_i,
-#
-# output logic [AXI_ID_WIDTH-1:0] s_axi4_rid,
-# output logic [1:0] s_axi4_rresp,
-# output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-# output logic s_axi4_rlast,
-# output logic s_axi4_rvalid,
-# output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-# input logic s_axi4_rready,
-#
-# input logic [AXI_ID_WIDTH-1:0] m_axi4_rid,
-# input logic [1:0] m_axi4_rresp,
-# input logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
-# input logic m_axi4_rlast,
-# input logic m_axi4_rvalid,
-# input logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
-# output logic m_axi4_rready
-# );
-#
-# localparam BUFFER_DEPTH = 16;
-#
-# logic fifo_valid;
-# logic fifo_pop;
-# logic fifo_push;
-# logic fifo_ready;
-# logic [AXI_ID_WIDTH-1:0] id;
-# logic [7:0] len;
-# logic prefetch;
-# logic hit;
-#
-# logic dropping;
-#
-# enum logic [1:0] { FORWARDING, DROPPING }
-# state_d, state_q;
-# logic burst_ongoing_d, burst_ongoing_q;
-# logic [7:0] drop_cnt_d, drop_cnt_q;
-#
-# axi_buffer_rab
-# #(
-# .DATA_WIDTH ( 2+AXI_ID_WIDTH+8 ),
-# .BUFFER_DEPTH ( BUFFER_DEPTH )
-# )
-# u_fifo
-# (
-# .clk ( axi4_aclk ),
-# .rstn ( axi4_arstn ),
-# // Pop
-# .data_out ( {prefetch, hit, id, len} ),
-# .valid_out ( fifo_valid ),
-# .ready_in ( fifo_pop ),
-# // Push
-# .valid_in ( fifo_push ),
-# .data_in ( {prefetch_i, hit_i, id_i, drop_len_i} ),
-# .ready_out ( fifo_ready )
-# );
-#
-# assign fifo_push = drop_i & fifo_ready;
-# assign done_o = fifo_push;
-#
-# always_comb begin
-# burst_ongoing_d = burst_ongoing_q;
-# drop_cnt_d = drop_cnt_q;
-# dropping = 1'b0;
-# s_axi4_rlast = 1'b0;
-# fifo_pop = 1'b0;
-# state_d = state_q;
-#
-# case (state_q)
-# FORWARDING: begin
-# s_axi4_rlast = m_axi4_rlast;
-# // Remember whether there is currently a burst ongoing.
-# if (m_axi4_rvalid && m_axi4_rready) begin
-# if (m_axi4_rlast) begin
-# burst_ongoing_d = 1'b0;
-# end else begin
-# burst_ongoing_d = 1'b1;
-# end
-# end
-# // If there is no burst ongoing and the FIFO has a drop request ready, process it.
-# if (!burst_ongoing_d && fifo_valid) begin
-# drop_cnt_d = len;
-# state_d = DROPPING;
-# end
-# end
-#
-# DROPPING: begin
-# dropping = 1'b1;
-# s_axi4_rlast = (drop_cnt_q == '0);
-# // Handshake on slave interface
-# if (s_axi4_rready) begin
-# drop_cnt_d -= 1;
-# if (drop_cnt_q == '0) begin
-# drop_cnt_d = '0;
-# fifo_pop = 1'b1;
-# state_d = FORWARDING;
-# end
-# end
-# end
-#
-# default: begin
-# state_d = FORWARDING;
-# end
-# endcase
-# end
-#
-# assign s_axi4_rdata = m_axi4_rdata;
-#
-# assign s_axi4_ruser = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_ruser;
-# assign s_axi4_rid = dropping ? id : m_axi4_rid;
-#
-# assign s_axi4_rresp = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
-# (dropping & prefetch ) ? 2'b10 : // prefetch miss
-# (dropping & hit) ? 2'b10 : // non-prefetch multi, prot
-# (dropping ) ? 2'b10 : // non-prefetch miss
-# m_axi4_rresp;
-#
-# assign s_axi4_rvalid = dropping | m_axi4_rvalid;
-# assign m_axi4_rready = ~dropping & s_axi4_rready;
-#
-# always_ff @(posedge axi4_aclk, negedge axi4_arstn) begin
-# if (axi4_arstn == 1'b0) begin
-# burst_ongoing_q <= 1'b0;
-# drop_cnt_q <= 'b0;
-# state_q <= FORWARDING;
-# end else begin
-# burst_ongoing_q <= burst_ongoing_d;
-# drop_cnt_q <= drop_cnt_d;
-# state_q <= state_d;
-# end
-# end
-#
-# endmodule
-#
-#
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_w_buffer(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.l1_done_o = Signal() # output
- self.l1_accept_i = Signal() # input
- self.l1_save_i = Signal() # input
- self.l1_drop_i = Signal() # input
- self.l1_master_i = Signal() # input
- self.l1_id_i = Signal(AXI_ID_WIDTH) # input
- self.l1_len_i = Signal(8) # input
- self.l1_prefetch_i = Signal() # input
- self.l1_hit_i = Signal() # input
- self.l2_done_o = Signal() # output
- self.l2_accept_i = Signal() # input
- self.l2_drop_i = Signal() # input
- self.l2_master_i = Signal() # input
- self.l2_id_i = Signal(AXI_ID_WIDTH) # input
- self.l2_len_i = Signal(8) # input
- self.l2_prefetch_i = Signal() # input
- self.l2_hit_i = Signal() # input
- self.master_select_o = Signal() # output
- self.input_stall_o = Signal() # output
- self.output_stall_o = Signal() # output
- self.b_drop_o = Signal() # output
- self.b_done_i = Signal() # input
- self.id_o = Signal(AXI_ID_WIDTH) # output
- self.prefetch_o = Signal() # output
- self.hit_o = Signal() # output
- self.s_axi4_wdata = Signal(AXI_DATA_WIDTH) # input
- self.s_axi4_wvalid = Signal() # input
- self.s_axi4_wready = Signal() # output
- self.s_axi4_wstrb = Signal(1+ERROR p_expression_25) # input
- self.s_axi4_wlast = Signal() # input
- self.s_axi4_wuser = Signal(AXI_USER_WIDTH) # input
- self.m_axi4_wdata = Signal(AXI_DATA_WIDTH) # output
- self.m_axi4_wvalid = Signal() # output
- self.m_axi4_wready = Signal() # input
- self.m_axi4_wstrb = Signal(1+ERROR p_expression_25) # output
- self.m_axi4_wlast = Signal() # output
- self.m_axi4_wuser = Signal(AXI_USER_WIDTH) # output
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-#
-# //import CfMath::log2;
-#
-# module axi4_w_buffer
-# #(
-# parameter AXI_DATA_WIDTH = 32,
-# parameter AXI_ID_WIDTH = 4,
-# parameter AXI_USER_WIDTH = 4,
-# parameter ENABLE_L2TLB = 0,
-# parameter HUM_BUFFER_DEPTH = 16
-# )
-# (
-# input logic axi4_aclk,
-# input logic axi4_arstn,
-#
-# // L1 & L2 interfaces
-# output logic l1_done_o,
-# input logic l1_accept_i,
-# input logic l1_save_i,
-# input logic l1_drop_i,
-# input logic l1_master_i,
-# input logic [AXI_ID_WIDTH-1:0] l1_id_i,
-# input logic [7:0] l1_len_i,
-# input logic l1_prefetch_i,
-# input logic l1_hit_i,
-#
-# output logic l2_done_o,
-# input logic l2_accept_i,
-# input logic l2_drop_i,
-# input logic l2_master_i,
-# input logic [AXI_ID_WIDTH-1:0] l2_id_i,
-# input logic [7:0] l2_len_i,
-# input logic l2_prefetch_i,
-# input logic l2_hit_i,
-#
-# output logic master_select_o,
-# output logic input_stall_o,
-# output logic output_stall_o,
-#
-# // B sender interface
-# output logic b_drop_o,
-# input logic b_done_i,
-# output logic [AXI_ID_WIDTH-1:0] id_o,
-# output logic prefetch_o,
-# output logic hit_o,
-#
-# // AXI W channel interfaces
-# input logic [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-# input logic s_axi4_wvalid,
-# output logic s_axi4_wready,
-# input logic [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-# input logic s_axi4_wlast,
-# input logic [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-# output logic [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
-# output logic m_axi4_wvalid,
-# input logic m_axi4_wready,
-# output logic [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
-# output logic m_axi4_wlast,
-# output logic [AXI_USER_WIDTH-1:0] m_axi4_wuser
-# );
-#
-"""
-
- localparam BUFFER_WIDTH = AXI_DATA_WIDTH+AXI_USER_WIDTH+AXI_DATA_WIDTH/8+1;
-
- localparam INPUT_BUFFER_DEPTH = 4;
- localparam L1_FIFO_DEPTH = 8;
- localparam L2_FIFO_DEPTH = 4;
-
- logic [AXI_DATA_WIDTH-1:0] axi4_wdata;
- logic axi4_wvalid;
- logic axi4_wready;
- logic [AXI_DATA_WIDTH/8-1:0] axi4_wstrb;
- logic axi4_wlast;
- logic [AXI_USER_WIDTH-1:0] axi4_wuser;
-
- logic l1_fifo_valid_out;
- logic l1_fifo_ready_in;
- logic l1_fifo_valid_in;
- logic l1_fifo_ready_out;
-
- logic l1_req;
- logic l1_accept_cur, l1_save_cur, l1_drop_cur;
- logic l1_master_cur;
- logic [AXI_ID_WIDTH-1:0] l1_id_cur;
- logic [7:0] l1_len_cur;
- logic l1_hit_cur, l1_prefetch_cur;
- logic l1_save_in, l1_save_out;
- logic [log2(L1_FIFO_DEPTH)-1:0] n_l1_save_SP;
-
- logic l2_fifo_valid_out;
- logic l2_fifo_ready_in;
- logic l2_fifo_valid_in;
- logic l2_fifo_ready_out;
-
- logic l2_req;
- logic l2_accept_cur, l2_drop_cur;
- logic l2_master_cur;
- logic [AXI_ID_WIDTH-1:0] l2_id_cur;
- logic [7:0] l2_len_cur;
- logic l2_hit_cur, l2_prefetch_cur;
-
- logic fifo_select, fifo_select_SN, fifo_select_SP;
- logic w_done;
- logic b_drop_set;
-
- // HUM buffer signals
- logic hum_buf_ready_out;
- logic hum_buf_valid_in;
- logic hum_buf_ready_in;
- logic hum_buf_valid_out;
- logic hum_buf_underfull;
-
- logic [AXI_DATA_WIDTH-1:0] hum_buf_wdata;
- logic [AXI_DATA_WIDTH/8-1:0] hum_buf_wstrb;
- logic hum_buf_wlast;
- logic [AXI_USER_WIDTH-1:0] hum_buf_wuser;
-
- logic hum_buf_drop_req_SN, hum_buf_drop_req_SP;
- logic [7:0] hum_buf_drop_len_SN, hum_buf_drop_len_SP;
- logic hum_buf_almost_full;
-
- logic stop_store;
- logic wlast_in, wlast_out;
- logic signed [3:0] n_wlast_SN, n_wlast_SP;
- logic block_forwarding;
-
- // Search FSM
- typedef enum logic [3:0] {STORE, BYPASS,
- WAIT_L1_BYPASS_YES, WAIT_L2_BYPASS_YES,
- WAIT_L1_BYPASS_NO, WAIT_L2_BYPASS_NO,
- FLUSH, DISCARD,
- DISCARD_FINISH}
- hum_buf_state_t;
- hum_buf_state_t hum_buf_SP; // Present state
- hum_buf_state_tbg hum_buf_SN; // Next State
-
- axi_buffer_rab
- #(
- .DATA_WIDTH ( BUFFER_WIDTH ),
- .BUFFER_DEPTH ( INPUT_BUFFER_DEPTH )
- )
- u_input_buf
- (
- .clk ( axi4_aclk ),
- .rstn ( axi4_arstn ),
- // Push
- .data_in ( {s_axi4_wuser, s_axi4_wstrb, s_axi4_wdata, s_axi4_wlast} ),
- .valid_in ( s_axi4_wvalid ),
- .ready_out ( s_axi4_wready ),
- // Pop
- .data_out ( {axi4_wuser, axi4_wstrb, axi4_wdata, axi4_wlast} ),
- .valid_out ( axi4_wvalid ),
- .ready_in ( axi4_wready )
- );
-
- axi_buffer_rab
- #(
- .DATA_WIDTH ( 2+AXI_ID_WIDTH+8+4 ),
- .BUFFER_DEPTH ( L1_FIFO_DEPTH )
- )
- u_l1_fifo
- (
- .clk ( axi4_aclk ),
- .rstn ( axi4_arstn ),
- // Push
- .data_in ( {l1_prefetch_i, l1_hit_i, l1_id_i, l1_len_i, l1_master_i, l1_accept_i, l1_save_i, l1_drop_i} ),
- .valid_in ( l1_fifo_valid_in ),
- .ready_out ( l1_fifo_ready_out ),
- // Pop
- .data_out ( {l1_prefetch_cur, l1_hit_cur, l1_id_cur, l1_len_cur, l1_master_cur, l1_accept_cur, l1_save_cur, l1_drop_cur} ),
- .valid_out ( l1_fifo_valid_out ),
- .ready_in ( l1_fifo_ready_in )
- );
-
- // Push upon receiving new requests from the TLB.
- assign l1_req = l1_accept_i | l1_save_i | l1_drop_i;
- assign l1_fifo_valid_in = l1_req & l1_fifo_ready_out;
-
- // Signal handshake
- assign l1_done_o = l1_fifo_valid_in;
- assign l2_done_o = l2_fifo_valid_in;
-
- // Stall AW input of L1 TLB
- assign input_stall_o = ~(l1_fifo_ready_out & l2_fifo_ready_out);
-
- // Interface b_drop signals + handshake
- always_comb begin
- if (fifo_select == 1'b0) begin
- prefetch_o = l1_prefetch_cur;
- hit_o = l1_hit_cur;
- id_o = l1_id_cur;
-
- l1_fifo_ready_in = w_done | b_done_i;
- l2_fifo_ready_in = 1'b0;
- end else begin
- prefetch_o = l2_prefetch_cur;
- hit_o = l2_hit_cur;
- id_o = l2_id_cur;
-
- l1_fifo_ready_in = 1'b0;
- l2_fifo_ready_in = w_done | b_done_i;
- end
- end
-
- // Detect when an L1 transaction save request enters or exits the L1 FIFO.
- assign l1_save_in = l1_fifo_valid_in & l1_save_i;
- assign l1_save_out = l1_fifo_ready_in & l1_save_cur;
-
- // Count the number of L1 transaction to save in the L1 FIFO.
- always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
- if (axi4_arstn == 0) begin
- n_l1_save_SP <= '0;
- end else if (l1_save_in ^ l1_save_out) begin
- if (l1_save_in) begin
- n_l1_save_SP <= n_l1_save_SP + 1'b1;
- end else if (l1_save_out) begin
- n_l1_save_SP <= n_l1_save_SP - 1'b1;
- end
- end
- end
-
- // Stall forwarding of AW L1 hits if:
- // 1. The HUM buffer does not allow to be bypassed.
- // 2. There are multiple L1 save requests in the FIFO, i.e., multiple L2 outputs pending.
- assign output_stall_o = (n_l1_save_SP > 1) || (block_forwarding == 1'b1);
-
- generate
- if (ENABLE_L2TLB == 1) begin : HUM_BUFFER
-
- axi_buffer_rab_bram
- #(
- .DATA_WIDTH ( BUFFER_WIDTH ),
- .BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
- )
- u_hum_buf
- (
- .clk ( axi4_aclk ),
- .rstn ( axi4_arstn ),
- // Push
- .data_in ( {axi4_wuser, axi4_wstrb, axi4_wdata, axi4_wlast} ),
- .valid_in ( hum_buf_valid_in ),
- .ready_out ( hum_buf_ready_out ),
- // Pop
- .data_out ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ),
- .valid_out ( hum_buf_valid_out ),
- .ready_in ( hum_buf_ready_in ),
- // Clear
- .almost_full ( hum_buf_almost_full ),
- .underfull ( hum_buf_underfull ),
- .drop_req ( hum_buf_drop_req_SP ),
- .drop_len ( hum_buf_drop_len_SP )
- );
-
- axi_buffer_rab
- #(
- .DATA_WIDTH ( 2+AXI_ID_WIDTH+8+3 ),
- .BUFFER_DEPTH ( L2_FIFO_DEPTH )
- )
- u_l2_fifo
- (
- .clk ( axi4_aclk ),
- .rstn ( axi4_arstn ),
- // Push
- .data_in ( {l2_prefetch_i, l2_hit_i, l2_id_i, l2_len_i, l2_master_i, l2_accept_i, l2_drop_i} ),
- .valid_in ( l2_fifo_valid_in ),
- .ready_out ( l2_fifo_ready_out ),
- // Pop
- .data_out ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur} ),
- .valid_out ( l2_fifo_valid_out ),
- .ready_in ( l2_fifo_ready_in )
- );
-
- // Push upon receiving new result from TLB.
- assign l2_req = l2_accept_i | l2_drop_i;
- assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out;
-
- assign wlast_in = axi4_wlast & hum_buf_valid_in & hum_buf_ready_out;
- assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in;
-
- always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
- if (axi4_arstn == 0) begin
- fifo_select_SP <= 1'b0;
- hum_buf_drop_len_SP <= 'b0;
- hum_buf_drop_req_SP <= 1'b0;
- hum_buf_SP <= STORE;
- n_wlast_SP <= 'b0;
- end else begin
- fifo_select_SP <= fifo_select_SN;
- hum_buf_drop_len_SP <= hum_buf_drop_len_SN;
- hum_buf_drop_req_SP <= hum_buf_drop_req_SN;
- hum_buf_SP <= hum_buf_SN;
- n_wlast_SP <= n_wlast_SN;
- end
- end
-
- always_comb begin
- n_wlast_SN = n_wlast_SP;
- if (hum_buf_drop_req_SP) begin // Happens exactly once per burst to be dropped.
- n_wlast_SN -= 1;
- end
- if (wlast_in) begin
- n_wlast_SN += 1;
- end
- if (wlast_out) begin
- n_wlast_SN -= 1;
- end
- end
-
- always_comb begin : HUM_BUFFER_FSM
- hum_buf_SN = hum_buf_SP;
-
- m_axi4_wlast = 1'b0;
- m_axi4_wdata = 'b0;
- m_axi4_wstrb = 'b0;
- m_axi4_wuser = 'b0;
-
- m_axi4_wvalid = 1'b0;
- axi4_wready = 1'b0;
-
- hum_buf_valid_in = 1'b0;
- hum_buf_ready_in = 1'b0;
-
- hum_buf_drop_req_SN = hum_buf_drop_req_SP;
- hum_buf_drop_len_SN = hum_buf_drop_len_SP;
- master_select_o = 1'b0;
-
- w_done = 1'b0; // read from FIFO without handshake with B sender
- b_drop_o = 1'b0; // send data from FIFO to B sender (with handshake)
- fifo_select = 1'b0;
-
- fifo_select_SN = fifo_select_SP;
- stop_store = 1'b0;
-
- block_forwarding = 1'b0;
-
- unique case (hum_buf_SP)
-
- STORE : begin
- // Simply store the data in the buffer.
- hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out;
- axi4_wready = hum_buf_ready_out;
-
- // We have got a full burst in the HUM buffer, thus stop storing.
- if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin
- hum_buf_SN = WAIT_L1_BYPASS_YES;
-
- // The buffer is full, thus wait for decision.
- end else if (~hum_buf_ready_out) begin
- hum_buf_SN = WAIT_L1_BYPASS_NO;
- end
-
- // Avoid the forwarding of L1 hits until we know whether we can bypass.
- if (l1_fifo_valid_out & l1_save_cur) begin
- block_forwarding = 1'b1;
- end
- end
-
- WAIT_L1_BYPASS_YES : begin
- // Wait for orders from L1 TLB.
- if (l1_fifo_valid_out) begin
-
- // L1 hit - forward data from buffer
- if (l1_accept_cur) begin
- m_axi4_wlast = hum_buf_wlast;
- m_axi4_wdata = hum_buf_wdata;
- m_axi4_wstrb = hum_buf_wstrb;
- m_axi4_wuser = hum_buf_wuser;
-
- m_axi4_wvalid = hum_buf_valid_out;
- hum_buf_ready_in = m_axi4_wready;
-
- master_select_o = l1_master_cur;
-
- // Detect last data beat.
- if (wlast_out) begin
- fifo_select = 1'b0;
- w_done = 1'b1;
- hum_buf_SN = STORE;
- end
-
- // L1 miss - wait for L2
- end else if (l1_save_cur) begin
- fifo_select = 1'b0;
- w_done = 1'b1;
- hum_buf_SN = WAIT_L2_BYPASS_YES;
-
- // L1 prefetch, prot, multi - drop data
- end else if (l1_drop_cur) begin
- fifo_select_SN = 1'b0; // L1
- hum_buf_drop_req_SN = 1'b1;
- hum_buf_drop_len_SN = l1_len_cur;
- hum_buf_SN = FLUSH;
- end
- end
- end
-
- WAIT_L2_BYPASS_YES : begin
- // Wait for orders from L2 TLB.
- if (l2_fifo_valid_out) begin
-
- // L2 hit - forward data from buffer
- if (l2_accept_cur) begin
- m_axi4_wlast = hum_buf_wlast;
- m_axi4_wdata = hum_buf_wdata;
- m_axi4_wstrb = hum_buf_wstrb;
- m_axi4_wuser = hum_buf_wuser;
-
- m_axi4_wvalid = hum_buf_valid_out;
- hum_buf_ready_in = m_axi4_wready;
-
- master_select_o = l2_master_cur;
-
- // Detect last data beat.
- if (wlast_out) begin
- fifo_select = 1'b1;
- w_done = 1'b1;
- hum_buf_SN = STORE;
- end
-
- // L2 miss/prefetch hit
- end else if (l2_drop_cur) begin
- fifo_select_SN = 1'b1; // L2
- hum_buf_drop_req_SN = 1'b1;
- hum_buf_drop_len_SN = l2_len_cur;
- hum_buf_SN = FLUSH;
- end
-
- // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions.
- end else if (l1_fifo_valid_out) begin
-
- // L1 hit
- if (l1_accept_cur) begin
- hum_buf_SN = BYPASS;
-
- // L1 prefetch/prot/multi
- end else if (l1_drop_cur) begin
- hum_buf_SN = DISCARD;
- end
- end
- end
-
- FLUSH : begin
- // Clear HUM buffer flush request.
- hum_buf_drop_req_SN = 1'b0;
-
- // perform handshake with B sender
- fifo_select = fifo_select_SP;
- b_drop_o = 1'b1;
- if (b_done_i) begin
- hum_buf_SN = STORE;
- end
- end
-
- BYPASS : begin
- // Forward one full transaction from input buffer.
- m_axi4_wlast = axi4_wlast;
- m_axi4_wdata = axi4_wdata;
- m_axi4_wstrb = axi4_wstrb;
- m_axi4_wuser = axi4_wuser;
-
- m_axi4_wvalid = axi4_wvalid;
- axi4_wready = m_axi4_wready;
-
- master_select_o = l1_master_cur;
-
- // We have got a full transaction.
- if (axi4_wlast & axi4_wready & axi4_wvalid) begin
- fifo_select = 1'b0;
- w_done = 1'b1;
- hum_buf_SN = WAIT_L2_BYPASS_YES;
- end
- end
-
- DISCARD : begin
- // Discard one full transaction from input buffer.
- axi4_wready = 1'b1;
-
- // We have got a full transaction.
- if (axi4_wlast & axi4_wready & axi4_wvalid) begin
- // Try to perform handshake with B sender.
- fifo_select = 1'b0;
- b_drop_o = 1'b1;
- // We cannot wait here due to axi4_wready.
- if (b_done_i) begin
- hum_buf_SN = WAIT_L2_BYPASS_YES;
- end else begin
- hum_buf_SN = DISCARD_FINISH;
- end
- end
- end
-
- DISCARD_FINISH : begin
- // Perform handshake with B sender.
- fifo_select = 1'b0;
- b_drop_o = 1'b1;
- if (b_done_i) begin
- hum_buf_SN = WAIT_L2_BYPASS_YES;
- end
- end
-
- WAIT_L1_BYPASS_NO : begin
- // Do not allow the forwarding of L1 hits.
- block_forwarding = 1'b1;
-
- // Wait for orders from L1 TLB.
- if (l1_fifo_valid_out) begin
-
- // L1 hit - forward data from/through HUM buffer and refill the buffer
- if (l1_accept_cur) begin
- // Forward data from HUM buffer.
- m_axi4_wlast = hum_buf_wlast;
- m_axi4_wdata = hum_buf_wdata;
- m_axi4_wstrb = hum_buf_wstrb;
- m_axi4_wuser = hum_buf_wuser;
-
- m_axi4_wvalid = hum_buf_valid_out;
- hum_buf_ready_in = m_axi4_wready;
-
- master_select_o = l1_master_cur;
-
- // Refill the HUM buffer. Stop when buffer full.
- stop_store = ~hum_buf_ready_out;
- hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ;
- axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out;
-
- // Detect last data beat.
- if (wlast_out) begin
- fifo_select = 1'b0;
- w_done = 1'b1;
- if (~hum_buf_ready_out | hum_buf_almost_full) begin
- hum_buf_SN = WAIT_L1_BYPASS_NO;
- end else begin
- hum_buf_SN = STORE;
- end
- end
-
- // Allow the forwarding of L1 hits.
- block_forwarding = 1'b0;
-
- // L1 miss - wait for L2
- end else if (l1_save_cur) begin
- fifo_select = 1'b0;
- w_done = 1'b1;
- hum_buf_SN = WAIT_L2_BYPASS_NO;
-
- // L1 prefetch, prot, multi - drop data
- end else if (l1_drop_cur) begin
- fifo_select_SN = 1'b0; // L1
- hum_buf_drop_req_SN = 1'b1;
- hum_buf_drop_len_SN = l1_len_cur;
- hum_buf_SN = FLUSH;
-
- // Allow the forwarding of L1 hits.
- block_forwarding = 1'b0;
- end
- end
- end
-
- WAIT_L2_BYPASS_NO : begin
- // Do not allow the forwarding of L1 hits.
- block_forwarding = 1'b1;
-
- // Wait for orders from L2 TLB.
- if (l2_fifo_valid_out) begin
-
- // L2 hit - forward first part from HUM buffer, rest from input buffer
- if (l2_accept_cur) begin
- // Forward data from HUM buffer.
- m_axi4_wlast = hum_buf_wlast;
- m_axi4_wdata = hum_buf_wdata;
- m_axi4_wstrb = hum_buf_wstrb;
- m_axi4_wuser = hum_buf_wuser;
-
- m_axi4_wvalid = hum_buf_valid_out;
- hum_buf_ready_in = m_axi4_wready;
-
- master_select_o = l2_master_cur;
-
- // Refill the HUM buffer. Stop when buffer full.
- stop_store = ~hum_buf_ready_out;
- hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ;
- axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out;
-
- // Detect last data beat.
- if (wlast_out) begin
- fifo_select = 1'b1;
- w_done = 1'b1;
- if (~hum_buf_ready_out | hum_buf_almost_full) begin
- hum_buf_SN = WAIT_L1_BYPASS_NO;
- end else begin
- hum_buf_SN = STORE;
- end
- end
-
- // Allow the forwarding of L1 hits.
- block_forwarding = 1'b0;
-
- // L2 miss/prefetch hit - drop data
- end else if (l2_drop_cur) begin
- fifo_select_SN = 1'b1; // L2
- hum_buf_drop_req_SN = 1'b1;
- hum_buf_drop_len_SN = l2_len_cur;
- hum_buf_SN = FLUSH;
-
- // Allow the forwarding of L1 hits.
- block_forwarding = 1'b0;
- end
- end
- end
-
-
- default: begin
- hum_buf_SN = STORE;
- end
-
- endcase // hum_buf_SP
- end // HUM_BUFFER_FSM
-
- assign b_drop_set = 1'b0;
-
- end else begin // HUM_BUFFER
-
- // register to perform the handshake with B sender
- always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
- if (axi4_arstn == 0) begin
- b_drop_o <= 1'b0;
- end else if (b_done_i) begin
- b_drop_o <= 1'b0;
- end else if (b_drop_set) begin
- b_drop_o <= 1'b1;;
- end
- end
-
- always_comb begin : OUTPUT_CTRL
-
- fifo_select = 1'b0;
- w_done = 1'b0;
- b_drop_set = 1'b0;
-
- m_axi4_wlast = 1'b0;
- m_axi4_wdata = 'b0;
- m_axi4_wstrb = 'b0;
- m_axi4_wuser = 'b0;
-
- m_axi4_wvalid = 1'b0;
- axi4_wready = 1'b0;
-
- if (l1_fifo_valid_out) begin
- // forward data
- if (l1_accept_cur) begin
- m_axi4_wlast = axi4_wlast;
- m_axi4_wdata = axi4_wdata;
- m_axi4_wstrb = axi4_wstrb;
- m_axi4_wuser = axi4_wuser;
-
- m_axi4_wvalid = axi4_wvalid;
- axi4_wready = m_axi4_wready;
-
- // Simply pop from FIFO upon last data beat.
- w_done = axi4_wlast & axi4_wvalid & axi4_wready;
-
- // discard entire burst
- end else if (b_drop_o == 1'b0) begin
- axi4_wready = 1'b1;
-
- // Simply pop from FIFO upon last data beat. Perform handshake with B sender.
- if (axi4_wlast & axi4_wvalid & axi4_wready)
- b_drop_set = 1'b1;
- end
- end
-
- end // OUTPUT_CTRL
-
- assign master_select_o = l1_master_cur;
- assign l2_fifo_ready_out = 1'b1;
- assign block_forwarding = 1'b0;
-
- // unused signals
- assign hum_buf_ready_out = 1'b0;
- assign hum_buf_valid_in = 1'b0;
- assign hum_buf_ready_in = 1'b0;
- assign hum_buf_valid_out = 1'b0;
- assign hum_buf_wdata = 'b0;
- assign hum_buf_wstrb = 'b0;
- assign hum_buf_wlast = 1'b0;
- assign hum_buf_wuser = 'b0;
- assign hum_buf_drop_len_SN = 'b0;
- assign hum_buf_drop_req_SN = 1'b0;
- assign hum_buf_almost_full = 1'b0;
-
- assign l2_fifo_valid_in = 1'b0;
- assign l2_fifo_valid_out = 1'b0;
- assign l2_prefetch_cur = 1'b0;
- assign l2_hit_cur = 1'b0;
- assign l2_id_cur = 'b0;
- assign l2_len_cur = 'b0;
- assign l2_master_cur = 1'b0;
- assign l2_accept_cur = 1'b0;
- assign l2_drop_cur = 1'b0;
-
- assign l2_req = 1'b0;
-
- assign fifo_select_SN = 1'b0;
- assign fifo_select_SP = 1'b0;
-
- assign stop_store = 1'b0;
- assign n_wlast_SP = 'b0;
- assign wlast_in = 1'b0;
- assign wlast_out = 1'b0;
-
- end // HUM_BUFFER
-
- endgenerate
-"""
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_w_sender(Elaboratable):
-
- def __init__(self):
- self.axi4_aclk = Signal() # input
- self.axi4_arstn = Signal() # input
- self.s_axi4_wdata = Signal() # input
- self.s_axi4_wvalid = Signal() # input
- self.s_axi4_wready = Signal() # output
- self.s_axi4_wstrb = Signal() # input
- self.s_axi4_wlast = Signal() # input
- self.s_axi4_wuser = Signal() # input
- self.m_axi4_wdata = Signal() # output
- self.m_axi4_wvalid = Signal() # output
- self.m_axi4_wready = Signal() # input
- self.m_axi4_wstrb = Signal() # output
- self.m_axi4_wlast = Signal() # output
- self.m_axi4_wuser = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata)
- m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb)
- m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast)
- m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser)
- m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid)
- m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_w_sender
-# #(
-# parameter AXI_DATA_WIDTH = 32,
-# parameter AXI_USER_WIDTH = 2
-# )
-# (
-# input axi4_aclk,
-# input axi4_arstn,
-#
-# input [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-# input s_axi4_wvalid,
-# output s_axi4_wready,
-# input [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-# input s_axi4_wlast,
-# input [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-# output [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
-# output m_axi4_wvalid,
-# input m_axi4_wready,
-# output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
-# output m_axi4_wlast,
-# output [AXI_USER_WIDTH-1:0] m_axi4_wuser
-# );
-#
-# assign m_axi4_wdata = s_axi4_wdata;
-# assign m_axi4_wstrb = s_axi4_wstrb;
-# assign m_axi4_wlast = s_axi4_wlast;
-# assign m_axi4_wuser = s_axi4_wuser;
-#
-# assign m_axi4_wvalid = s_axi4_wvalid;
-# assign s_axi4_wready = m_axi4_wready;
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_buffer_rab(Elaboratable):
-
- def __init__(self):
- self.clk = Signal() # input
- self.rstn = Signal() # input
- self.data_out = Signal(DATA_WIDTH) # output
- self.valid_out = Signal() # output
- self.ready_in = Signal() # input
- self.valid_in = Signal() # input
- self.data_in = Signal(DATA_WIDTH) # input
- self.ready_out = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- m.d.comb += self.full.eq(self.None)
- m.d.comb += self.data_out.eq(self.None)
- m.d.comb += self.valid_out.eq(self.None)
- m.d.comb += self.ready_out.eq(self.None)
- return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# module axi_buffer_rab
-# //#(
-# // parameter DATA_WIDTH,
-# // parameter BUFFER_DEPTH
-# //)
-# (
-# input logic clk,
-# input logic rstn,
-#
-# // Downstream port
-# output logic [DATA_WIDTH-1:0] data_out,
-# output logic valid_out,
-# input logic ready_in,
-#
-# // Upstream port
-# input logic valid_in,
-# input logic [DATA_WIDTH-1:0] data_in,
-# output logic ready_out
-# );
-#
-# localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH);
-#
-# // Internal data structures
-# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in; // location to which we last wrote
-# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out; // location from which we last sent
-# reg [LOG_BUFFER_DEPTH : 0] elements; // number of elements in the buffer
-# reg [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0];
-#
-# wire full;
-#
-# integer loop1;
-#
-# assign full = (elements == BUFFER_DEPTH);
-#
-# always @(posedge clk or negedge rstn)
-# begin: elements_sequential
-# if (rstn == 1'b0)
-# elements <= 0;
-# else
-# begin
-# // ------------------
-# // Are we filling up?
-# // ------------------
-# // One out, none in
-# if (ready_in && valid_out && (!valid_in || full))
-# elements <= elements - 1;
-# // None out, one in
-# else if ((!valid_out || !ready_in) && valid_in && !full)
-# elements <= elements + 1;
-# // Else, either one out and one in, or none out and none in - stays unchanged
-# end
-# end
-#
-# always @(posedge clk or negedge rstn)
-# begin: buffers_sequential
-# if (rstn == 1'b0)
-# begin
-# for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1)
-# buffer[loop1] <= 0;
-# end
-# else
-# begin
-# // Update the memory
-# if (valid_in && !full)
-# buffer[pointer_in] <= data_in;
-# end
-# end
-#
-# always @(posedge clk or negedge rstn)
-# begin: sequential
-# if (rstn == 1'b0)
-# begin
-# pointer_out <= 0;
-# pointer_in <= 0;
-# end
-# else
-# begin
-# // ------------------------------------
-# // Check what to do with the input side
-# // ------------------------------------
-# // We have some input, increase by 1 the input pointer
-# if (valid_in && !full)
-# begin
-# if (pointer_in == $unsigned(BUFFER_DEPTH - 1))
-# pointer_in <= 0;
-# else
-# pointer_in <= pointer_in + 1;
-# end
-# // Else we don't have any input, the input pointer stays the same
-#
-# // -------------------------------------
-# // Check what to do with the output side
-# // -------------------------------------
-# // We had pushed one flit out, we can try to go for the next one
-# if (ready_in && valid_out)
-# begin
-# if (pointer_out == $unsigned(BUFFER_DEPTH - 1))
-# pointer_out <= 0;
-# else
-# pointer_out <= pointer_out + 1;
-# end
-# // Else stay on the same output location
-# end
-# end
-#
-# // Update output ports
-# assign data_out = buffer[pointer_out];
-# assign valid_out = (elements != 0);
-#
-# assign ready_out = ~full;
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_buffer_rab_bram(Elaboratable):
-
- def __init__(self):
- self.clk = Signal() # input
- self.rstn = Signal() # input
- self.data_out = Signal(DATA_WIDTH) # output
- self.valid_out = Signal() # output
- self.ready_in = Signal() # input
- self.valid_in = Signal() # input
- self.data_in = Signal(DATA_WIDTH) # input
- self.ready_out = Signal() # output
- self.almost_full = Signal() # output
- self.underfull = Signal() # output
- self.drop_req = Signal() # input
- self.drop_len = Signal(8) # input
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# ////import CfMath::log2;
-#
-# module axi_buffer_rab_bram
-# //#(
-# // parameter DATA_WIDTH,
-# // parameter BUFFER_DEPTH
-# // )
-# (
-# input logic clk,
-# input logic rstn,
-#
-# // Downstream port
-# output logic [DATA_WIDTH-1:0] data_out,
-# output logic valid_out,
-# input logic ready_in,
-#
-# // Upstream port
-# input logic valid_in,
-# input logic [DATA_WIDTH-1:0] data_in,
-# output logic ready_out,
-#
-# // Status and drop control
-# output logic almost_full,
-# output logic underfull,
-# input logic drop_req,
-# // Number of items to drop. As for AXI lengths, counting starts at zero, i.e., `drop_len == 0`
-# // and `drop_req` means drop one item.
-# input logic [7:0] drop_len
-# );
-#
-""" #docstring_begin
- // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior.
- // To still push and pop simultaneously if the buffer is full, we internally increase the
- // buffer depth by 1.
- localparam ACT_BUFFER_DEPTH = BUFFER_DEPTH+1;
- localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1);
-
- /**
- * Internal data structures
- */
- // Location to which we last wrote
- logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d, ptr_in_q;
- // Location from which we last sent
- logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d, ptr_out_q;
- // Required for fall-through behavior on the first word
- logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram;
- // Number of elements in the buffer. Can be negative if elements that have been dropped have not
- // yet been written.
- logic signed [ACT_LOG_BUFFER_DEPTH:0] n_elems_d, n_elems_q;
-
- logic [DATA_WIDTH-1:0] data_out_bram, data_out_q;
- logic valid_out_q;
-
- logic full;
-
- assign almost_full = (n_elems_q == BUFFER_DEPTH-1);
- assign full = (n_elems_q == BUFFER_DEPTH);
-
- always_ff @(posedge clk, negedge rstn) begin
- if (~rstn) begin
- n_elems_q <= '0;
- ptr_in_q <= '0;
- ptr_out_q <= '0;
- end else begin
- n_elems_q <= n_elems_d;
- ptr_in_q <= ptr_in_d;
- ptr_out_q <= ptr_out_d;
- end
- end
-
- // Update the number of elements.
- always_comb begin
- n_elems_d = n_elems_q;
- if (drop_req) begin
- n_elems_d -= (drop_len + 1);
- end
- if (valid_in && ready_out) begin
- n_elems_d += 1;
- end
- if (valid_out && ready_in) begin
- n_elems_d -= 1;
- end
- end
-
- // Update the output pointer.
- always_comb begin
- ptr_out_d = ptr_out_q;
- if (drop_req) begin
- if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin
- ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q);
- end else begin
- ptr_out_d += (drop_len + 1);
- end
- end
- if (valid_out && ready_in) begin
- if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin
- ptr_out_d = '0;
- end else begin
- ptr_out_d += 1;
- end
- end
- end
-
- // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for
- // first-word fall-through FIFO behavior.
- //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1);
- assign ptr_out_bram = ptr_out_d;
-
- // Update the input pointer.
- always_comb begin
- ptr_in_d = ptr_in_q;
- if (valid_in && ready_out) begin
- if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin
- ptr_in_d = '0;
- end else begin
- ptr_in_d += 1;
- end
- end
- end
-
- // Update output ports.
- assign valid_out = (n_elems_q > $signed(0));
- assign underfull = (n_elems_q < $signed(0));
- assign ready_out = ~full;
-
- ram_tp_write_first #(
- .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ),
- .DATA_WIDTH ( DATA_WIDTH )
- )
- ram_tp_write_first_0
- (
- .clk ( clk ),
- .we ( valid_in & ~full ),
- .addr0 ( ptr_in_q ),
- .addr1 ( ptr_out_bram ),
- .d_i ( data_in ),
- .d0_o ( ),
- .d1_o ( data_out_bram )
- );
-
- // When reading from/writing two the same address on both ports ("Write-Read Collision"),
- // the data on the read port is invalid (during the write cycle). In this implementation,
- // this can happen only when the buffer is empty. Thus, we forward the data from an
- // register in this case.
- always @(posedge clk) begin
- if (rstn == 1'b0) begin
- data_out_q <= 'b0;
- end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin
- data_out_q <= data_in;
- end
- end
-
- always @(posedge clk) begin
- if (rstn == 1'b0) begin
- valid_out_q <= 'b0;
- end else begin
- valid_out_q <= valid_out;
- end
- end
-
- // Drive output data
- always_comb begin
- if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO
- data_out = data_out_q;
- end else begin
- data_out = data_out_bram;
- end
- end
-
-"""
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_rab_cfg(Elaboratable):
-
- def __init__(self):
- self.Clk_CI = Signal() # input
- self.Rst_RBI = Signal() # input
- self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH) # input
- self.s_axi_awvalid = Signal() # input
- self.s_axi_awready = Signal() # output
- self.s_axi_wdata = Signal() # input
- self.s_axi_wstrb = Signal(1+ERROR p_expression_25) # input
- self.s_axi_wvalid = Signal() # input
- self.s_axi_wready = Signal() # output
- self.s_axi_bresp = Signal(2) # output
- self.s_axi_bvalid = Signal() # output
- self.s_axi_bready = Signal() # input
- self.s_axi_araddr = Signal(AXI_ADDR_WIDTH) # input
- self.s_axi_arvalid = Signal() # input
- self.s_axi_arready = Signal() # output
- self.s_axi_rdata = Signal(AXI_DATA_WIDTH) # output
- self.s_axi_rresp = Signal(2) # output
- self.s_axi_rvalid = Signal() # output
- self.s_axi_rready = Signal() # input
- self.L1Cfg_DO = Signal() # output
- self.L1AllowMultiHit_SO = Signal() # output
- self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT) # input
- self.MissMeta_DI = Signal(MISS_META_WIDTH) # input
- self.Miss_SI = Signal() # input
- self.MhFifoFull_SO = Signal() # output
- self.wdata_l2 = Signal() # output
- self.waddr_l2 = Signal() # output
- self.wren_l2 = Signal(N_PORTS) # output
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# // --=========================================================================--
-# //
-# // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ██████╗███████╗ ██████╗
-# // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔════╝██╔════╝
-# // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ █████╗ ██║ ███╗
-# // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██╔══╝ ██║ ██║
-# // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ╚██████╗██║ ╚██████╔╝
-# // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝╚═╝ ╚═════╝
-# //
-# //
-# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch
-# //
-# // Purpose : AXI4-Lite configuration and miss handling interface for RAB
-# //
-# // --=========================================================================--
-#
-# //import CfMath::log2;
-#
-# module axi_rab_cfg
-# #(
-# parameter N_PORTS = 3,
-# parameter N_REGS = 196,
-# parameter N_L2_SETS = 32,
-# parameter N_L2_SET_ENTRIES= 32,
-# parameter ADDR_WIDTH_PHYS = 40,
-# parameter ADDR_WIDTH_VIRT = 32,
-# parameter N_FLAGS = 4,
-# parameter AXI_DATA_WIDTH = 64,
-# parameter AXI_ADDR_WIDTH = 32,
-# parameter MISS_META_WIDTH = 10, // <= FIFO_WIDTH
-# parameter MH_FIFO_DEPTH = 16
-# )
-# (
-# input logic Clk_CI,
-# input logic Rst_RBI,
-#
-# // AXI Lite interface
-# input logic [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
-# input logic s_axi_awvalid,
-# output logic s_axi_awready,
-# input logic [AXI_DATA_WIDTH/8-1:0][7:0] s_axi_wdata,
-# input logic [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
-# input logic s_axi_wvalid,
-# output logic s_axi_wready,
-# output logic [1:0] s_axi_bresp,
-# output logic s_axi_bvalid,
-# input logic s_axi_bready,
-# input logic [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
-# input logic s_axi_arvalid,
-# output logic s_axi_arready,
-# output logic [AXI_DATA_WIDTH-1:0] s_axi_rdata,
-# output logic [1:0] s_axi_rresp,
-# output logic s_axi_rvalid,
-# input logic s_axi_rready,
-#
-# // Slice configuration
-# output logic [N_REGS-1:0][63:0] L1Cfg_DO,
-# output logic L1AllowMultiHit_SO,
-#
-# // Miss handling
-# input logic [ADDR_WIDTH_VIRT-1:0] MissAddr_DI,
-# input logic [MISS_META_WIDTH-1:0] MissMeta_DI,
-# input logic Miss_SI,
-# output logic MhFifoFull_SO,
-#
-# // L2 TLB
-# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2,
-# output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2,
-# output logic [N_PORTS-1:0] wren_l2
-# );
-#
-""" #docstring_begin
-
- localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32,
- // because RAB slices are 64 bit wide.
- localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1;
-
- localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2
-
- localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES;
-
- localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2;
-
- logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit]
- genvar j;
-
- // █████╗ ██╗ ██╗██╗██╗ ██╗ ██╗ ██╗████████╗███████╗
- // ██╔══██╗╚██╗██╔╝██║██║ ██║ ██║ ██║╚══██╔══╝██╔════╝
- // ███████║ ╚███╔╝ ██║███████║█████╗██║ ██║ ██║ █████╗
- // ██╔══██║ ██╔██╗ ██║╚════██║╚════╝██║ ██║ ██║ ██╔══╝
- // ██║ ██║██╔╝ ██╗██║ ██║ ███████╗██║ ██║ ███████╗
- // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ ╚══════╝
- //
- logic [AXI_ADDR_WIDTH-1:0] awaddr_reg;
- logic awaddr_done_rise;
- logic awaddr_done_reg;
- logic awaddr_done_reg_dly;
-
- logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg;
- logic [AXI_DATA_WIDTH/8-1:0] wstrb_reg;
- logic wdata_done_rise;
- logic wdata_done_reg;
- logic wdata_done_reg_dly;
-
- logic wresp_done_reg;
- logic wresp_running_reg;
-
- logic [AXI_ADDR_WIDTH-1:0] araddr_reg;
- logic araddr_done_reg;
-
- logic [AXI_DATA_WIDTH-1:0] rdata_reg;
- logic rresp_done_reg;
- logic rresp_running_reg;
-
- logic awready;
- logic wready;
- logic bvalid;
-
- logic arready;
- logic rvalid;
-
- logic wren;
- logic wren_l1;
-
- assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg );
- assign wdata_done_rise = wdata_done_reg & ~wdata_done_reg_dly;
- assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly;
-
- // reg_dly
- always @(posedge Clk_CI or negedge Rst_RBI)
- begin
- if (!Rst_RBI)
- begin
- wdata_done_reg_dly <= 1'b0;
- awaddr_done_reg_dly <= 1'b0;
- end
- else
- begin
- wdata_done_reg_dly <= wdata_done_reg;
- awaddr_done_reg_dly <= awaddr_done_reg;
- end
- end
-
- // AW Channel
- always @(posedge Clk_CI or negedge Rst_RBI)
- begin
- if (!Rst_RBI)
- begin
- awaddr_done_reg <= 1'b0;
- awaddr_reg <= '0;
- awready <= 1'b1;
- end
- else
- begin
- if (awready && s_axi_awvalid)
- begin
- awready <= 1'b0;
- awaddr_done_reg <= 1'b1;
- awaddr_reg <= s_axi_awaddr;
- end
- else if (awaddr_done_reg && wresp_done_reg)
- begin
- awready <= 1'b1;
- awaddr_done_reg <= 1'b0;
- end
- end
- end
-
- // W Channel
- always @(posedge Clk_CI or negedge Rst_RBI)
- begin
- if (!Rst_RBI)
- begin
- wdata_done_reg <= 1'b0;
- wready <= 1'b1;
- wdata_reg <= '0;
- wstrb_reg <= '0;
- end
- else
- begin
- if (wready && s_axi_wvalid)
- begin
- wready <= 1'b0;
- wdata_done_reg <= 1'b1;
- wdata_reg <= s_axi_wdata;
- wstrb_reg <= s_axi_wstrb;
- end
- else if (wdata_done_reg && wresp_done_reg)
- begin
- wready <= 1'b1;
- wdata_done_reg <= 1'b0;
- end
- end
- end
-
- // B Channel
- always @(posedge Clk_CI or negedge Rst_RBI)
- begin
- if (!Rst_RBI)
- begin
- bvalid <= 1'b0;
- wresp_done_reg <= 1'b0;
- wresp_running_reg <= 1'b0;
- end
- else
- begin
- if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg)
- begin
- if (!wresp_running_reg)
- begin
- bvalid <= 1'b1;
- wresp_running_reg <= 1'b1;
- end
- else if (s_axi_bready)
- begin
- bvalid <= 1'b0;
- wresp_done_reg <= 1'b1;
- wresp_running_reg <= 1'b0;
- end
- end
- else
- begin
- bvalid <= 1'b0;
- wresp_done_reg <= 1'b0;
- wresp_running_reg <= 1'b0;
- end
- end
- end
-
- // AR Channel
- always @(posedge Clk_CI or negedge Rst_RBI)
- begin
- if (!Rst_RBI)
- begin
- araddr_done_reg <= 1'b0;
- arready <= 1'b1;
- araddr_reg <= '0;
- end
- else
- begin
- if (arready && s_axi_arvalid)
- begin
- arready <= 1'b0;
- araddr_done_reg <= 1'b1;
- araddr_reg <= s_axi_araddr;
- end
- else if (araddr_done_reg && rresp_done_reg)
- begin
- arready <= 1'b1;
- araddr_done_reg <= 1'b0;
- end
- end
- end
-
- // R Channel
- always @(posedge Clk_CI or negedge Rst_RBI)
- begin
- if (!Rst_RBI)
- begin
- rresp_done_reg <= 1'b0;
- rvalid <= 1'b0;
- rresp_running_reg <= 1'b0;
- end
- else
- begin
- if (araddr_done_reg && !rresp_done_reg)
- begin
- if (!rresp_running_reg)
- begin
- rvalid <= 1'b1;
- rresp_running_reg <= 1'b1;
- end
- else if (s_axi_rready)
- begin
- rvalid <= 1'b0;
- rresp_done_reg <= 1'b1;
- rresp_running_reg <= 1'b0;
- end
- end
- else
- begin
- rvalid <= 1'b0;
- rresp_done_reg <= 1'b0;
- rresp_running_reg <= 1'b0;
- end
- end
- end
-
- // ██╗ ██╗ ██████╗███████╗ ██████╗ ██████╗ ███████╗ ██████╗
- // ██║ ███║ ██╔════╝██╔════╝██╔════╝ ██╔══██╗██╔════╝██╔════╝
- // ██║ ╚██║ ██║ █████╗ ██║ ███╗ ██████╔╝█████╗ ██║ ███╗
- // ██║ ██║ ██║ ██╔══╝ ██║ ██║ ██╔══██╗██╔══╝ ██║ ██║
- // ███████╗██║ ╚██████╗██║ ╚██████╔╝ ██║ ██║███████╗╚██████╔╝
- // ╚══════╝╚═╝ ╚═════╝╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ ╚═════╝
- //
- assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE);
-
- always @( posedge Clk_CI or negedge Rst_RBI )
- begin
- var integer idx_reg, idx_byte;
- if ( Rst_RBI == 1'b0 )
- begin
- for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ )
- L1Cfg_DP[idx_reg] <= '0;
- end
- else if ( wren_l1 )
- begin
- if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin // VIRT_ADDR
- for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
- if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin
- if ( wstrb_reg[idx_byte] ) begin
- L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
- end
- end
- else begin // Let synthesizer optimize away unused registers.
- L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
- end
- end
- end
- else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin // PHYS_ADDR
- for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
- if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin
- if ( wstrb_reg[idx_byte] ) begin
- L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
- end
- end
- else begin // Let synthesizer optimize away unused registers.
- L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
- end
- end
- end
- else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 ) // FLAGS
- for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
- if ( (idx_byte < 1) ) begin
- if ( wstrb_reg[idx_byte] ) begin
- L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} };
- end
- end
- else begin // Let synthesizer optimize away unused registers.
- L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
- end
- end
- end
- end
- end // always @ ( posedge Clk_CI or negedge Rst_RBI )
-
- generate
- // Mask unused bits -> Synthesizer should optimize away unused registers
- for( j=0; j<N_REGS; j++ ) begin
- if ( j[1] == 1'b0 ) // VIRT_ADDR
- assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_VIRT}{1'b0}},{ADDR_WIDTH_VIRT{1'b1}} } & L1Cfg_DP[j];
- else if ( j[1:0] == 2'b10 ) // PHYS_ADDR
- assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_PHYS}{1'b0}},{ADDR_WIDTH_PHYS{1'b1}} } & L1Cfg_DP[j];
- else // if ( j[1:0] == 2'b11 ) // FLAGS
- assign L1Cfg_DO[j] = { {{64-N_FLAGS}{1'b0}},{N_FLAGS{1'b1}} } & L1Cfg_DP[j];
- end
- endgenerate
-
- always_comb
- begin
- if ( araddr_reg[ADDR_LSB-1] == 1'b1 ) // read upper 32 bit, for debugging over 32-bit interface
- rdata_reg = { {32'h00000000},{L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]][63:32]} };
- else
- rdata_reg = L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]];
- end
-
- assign s_axi_awready = awready;
- assign s_axi_wready = wready;
-
- assign s_axi_bresp = 2'b00;
- assign s_axi_bvalid = bvalid;
-
- assign s_axi_arready = arready;
- assign s_axi_rresp = 2'b00;
- assign s_axi_rvalid = rvalid;
-
- // ██╗ ██████╗ ██████╗███████╗ ██████╗
- // ██║ ╚════██╗ ██╔════╝██╔════╝██╔════╝
- // ██║ █████╔╝ ██║ █████╗ ██║ ███╗
- // ██║ ██╔═══╝ ██║ ██╔══╝ ██║ ██║
- // ███████╗███████╗ ╚██████╗██║ ╚██████╔╝
- // ╚══════╝╚══════╝ ╚═════╝╚═╝ ╚═════╝
- //
- logic [N_PORTS-1:0] l2_addr_is_in_va_rams;
- logic [N_PORTS-1:0] upper_word_is_written;
- logic [N_PORTS-1:0] lower_word_is_written;
- generate
- for( j=0; j< N_PORTS; j++)
- begin
- if (AXI_DATA_WIDTH == 64) begin
- assign l2_addr_is_in_va_rams[j] = (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR);
- assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000);
- assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000);
- end else begin
- assign l2_addr_is_in_va_rams[j] = 1'b0;
- assign upper_word_is_written[j] = 1'b0;
- assign lower_word_is_written[j] = 1'b0;
- end
-
- always @( posedge Clk_CI or negedge Rst_RBI ) begin
- var integer idx_byte, off_byte;
- if ( Rst_RBI == 1'b0 )
- begin
- wren_l2[j] <= 1'b0;
- wdata_l2[j] <= '0;
- end
- else if (wren)
- begin
- if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) )
- wren_l2[j] <= 1'b1;
- if (AXI_DATA_WIDTH == 32) begin
- for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ )
- wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}};
- end
- else if (AXI_DATA_WIDTH == 64) begin
- if (lower_word_is_written[j] == 1'b1)
- off_byte = 0;
- else
- off_byte = 4;
- // always put the payload in the lower word and set upper word to 0
- for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ )
- wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}};
- wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0;
- end
- // pragma translate_off
- else
- $fatal(1, "Unsupported AXI_DATA_WIDTH!");
- // pragma translate_on
- end
- else
- wren_l2[j] <= '0;
- end // always @ ( posedge Clk_CI or negedge Rst_RBI )
-
- // Properly align the 32-bit word address when writing from 64-bit interface:
- // Depending on the system, the incoming address is (non-)aligned to the 64-bit
- // word when writing the upper 32-bit word.
- always_comb begin
- waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4;
- if (wren_l2[j]) begin
- if (AXI_DATA_WIDTH == 64) begin
- if (upper_word_is_written[j] == 1'b1) begin
- // address must be non-aligned
- waddr_l2[j][0] = 1'b1;
- end
- end
- // pragma translate_off
- else if (AXI_DATA_WIDTH != 32) begin
- $fatal(1, "Unsupported AXI_DATA_WIDTH!");
- end
- // pragma translate_on
- end
- end
-
- // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data
- // systems.
- // pragma translate_off
- always_ff @ (posedge Clk_CI) begin
- if (AXI_DATA_WIDTH == 64) begin
- if (l2_addr_is_in_va_rams[j]) begin
- if (upper_word_is_written[j]) begin
- assert (!lower_word_is_written[j])
- else $error("Unsupported write across two 32-bit words to VA RAMs!");
- end
- else if (lower_word_is_written[j]) begin
- assert (!upper_word_is_written[j])
- else $error("Unsupported write across two 32-bit words to VA RAMs!");
- end
- end
- end
- end
- // pragma translate_on
-
- end // for (j=0; j< N_PORTS; j++)
- endgenerate
-
- // ███╗ ███╗██╗ ██╗ ███████╗██╗███████╗ ██████╗ ███████╗
- // ████╗ ████║██║ ██║ ██╔════╝██║██╔════╝██╔═══██╗██╔════╝
- // ██╔████╔██║███████║ █████╗ ██║█████╗ ██║ ██║███████╗
- // ██║╚██╔╝██║██╔══██║ ██╔══╝ ██║██╔══╝ ██║ ██║╚════██║
- // ██║ ╚═╝ ██║██║ ██║ ██║ ██║██║ ╚██████╔╝███████║
- // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝╚═╝ ╚═════╝ ╚══════╝
- //
- logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D;
- logic AddrFifoWen_S;
- logic AddrFifoRen_S;
- logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D;
- logic AddrFifoFull_S;
- logic AddrFifoEmpty_S;
- logic AddrFifoEmpty_SB;
- logic AddrFifoFull_SB;
-
- logic [MISS_META_WIDTH-1:0] MetaFifoDin_D;
- logic MetaFifoWen_S;
- logic MetaFifoRen_S;
- logic [MISS_META_WIDTH-1:0] MetaFifoDout_D;
- logic MetaFifoFull_S;
- logic MetaFifoEmpty_S;
- logic MetaFifoEmpty_SB;
- logic MetaFifoFull_SB;
-
- logic FifosDisabled_S;
- logic ConfRegWen_S;
- logic [1:0] ConfReg_DN;
- logic [1:0] ConfReg_DP;
-
- logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec;
-
- assign FifosDisabled_S = ConfReg_DP[0];
- assign L1AllowMultiHit_SO = ConfReg_DP[1];
-
- assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB;
- assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB;
-
- assign AddrFifoFull_S = ~AddrFifoFull_SB;
- assign MetaFifoFull_S = ~MetaFifoFull_SB;
-
- assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S);
-
- generate
- for ( j=0; j<AXI_DATA_WIDTH/8; j++ )
- assign wdata_reg_vec[(j+1)*8-1:j*8] = wdata_reg[j];
- endgenerate
-
- // write address FIFO
- always_comb
- begin
- AddrFifoWen_S = 1'b0;
- AddrFifoDin_D = 'b0;
- if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
- begin
- AddrFifoWen_S = 1'b1;
- AddrFifoDin_D = MissAddr_DI;
- end
- else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 'b0) && (FifosDisabled_S == 1'b0)) // write request from AXI interface
- begin
- AddrFifoWen_S = 1'b1;
- AddrFifoDin_D = wdata_reg_vec[ADDR_WIDTH_VIRT-1:0];
- end
- end
-
- // write meta FIFO
- always_comb
- begin
- MetaFifoWen_S = 1'b0;
- MetaFifoDin_D = 'b0;
- if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
- begin
- MetaFifoWen_S = 1'b1;
- MetaFifoDin_D[MISS_META_WIDTH-1:0] = MissMeta_DI;
- end
- else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 4'h8) && (FifosDisabled_S == 1'b0) ) // write request from AXI interface
- begin
- MetaFifoWen_S = 1'b1;
- MetaFifoDin_D = wdata_reg_vec[MISS_META_WIDTH-1:0];
- end
- end
-
- // write configuration register
- always_comb
- begin
- ConfRegWen_S = 1'b0;
- ConfReg_DN = 1'b0;
- if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 8'h10) ) // write request from AXI interface
- begin
- ConfRegWen_S = 1'b1;
- ConfReg_DN = wdata_reg_vec[$high(ConfReg_DN):0];
- end
- end
-
- // AXI read data
- always_comb
- begin
- s_axi_rdata = rdata_reg; // read L1 config
- AddrFifoRen_S = 1'b0;
- MetaFifoRen_S = 1'b0;
- if ( rvalid == 1'b1 )
- begin
- // read address FIFO
- if ( araddr_reg[ADDR_MSB:0] == 'b0 )
- begin
- s_axi_rdata = {AXI_DATA_WIDTH{1'b0}};
- s_axi_rdata[ADDR_WIDTH_VIRT-1:0] = AddrFifoDout_D;
- if ( AddrFifoEmpty_S == 1'b0 )
- AddrFifoRen_S = 1'b1;
- end
- // read meta FIFO
- else if ( araddr_reg[ADDR_MSB:0] == 4'h8 )
- begin
- s_axi_rdata = {AXI_DATA_WIDTH{1'b0}};
- s_axi_rdata[31] = MetaFifoEmpty_S;
- s_axi_rdata[MISS_META_WIDTH-1:0] = MetaFifoDout_D;
- if ( MetaFifoEmpty_S == 1'b0 )
- MetaFifoRen_S = 1'b1;
- end
- // read configuration register
- else if ( araddr_reg[ADDR_MSB:0] == 8'h10 )
- begin
- s_axi_rdata = {AXI_DATA_WIDTH{1'b0}};
- s_axi_rdata[$high(ConfReg_DP):0] = ConfReg_DP;
- end
- end // if ( rvalid == 1'b1 )
- end // always_comb begin
-
- // configuration register
- always_ff @(posedge Clk_CI or negedge Rst_RBI) begin
- if (Rst_RBI == 1'b0)
- begin
- ConfReg_DP <= 'b0;
- end
- else if (ConfRegWen_S == 1'b1)
- begin
- ConfReg_DP <= ConfReg_DN;
- end
- end
-
- generic_fifo
- #(
- .DATA_WIDTH ( ADDR_WIDTH_VIRT ),
- .DATA_DEPTH ( MH_FIFO_DEPTH )
- )
- fifo_addr_i
- (
- .clk ( Clk_CI ),
- .rst_n ( Rst_RBI ),
- .data_i ( AddrFifoDin_D ),
- .valid_i ( AddrFifoWen_S & AddrFifoFull_SB ),
- .grant_o ( AddrFifoFull_SB ),
- .data_o ( AddrFifoDout_D ),
- .valid_o ( AddrFifoEmpty_SB ),
- .grant_i ( AddrFifoRen_S ),
- .test_mode_i ( 1'b0 )
- );
-
- generic_fifo
- #(
- .DATA_WIDTH ( MISS_META_WIDTH ),
- .DATA_DEPTH ( MH_FIFO_DEPTH )
- )
- fifo_meta_i
- (
- .clk ( Clk_CI ),
- .rst_n ( Rst_RBI ),
- .data_i ( MetaFifoDin_D ),
- .valid_i ( MetaFifoWen_S & MetaFifoFull_SB ),
- .grant_o ( MetaFifoFull_SB ),
- .data_o ( MetaFifoDout_D ),
- .valid_o ( MetaFifoEmpty_SB ),
- .grant_i ( MetaFifoRen_S ),
- .test_mode_i ( 1'b0 )
- );
-"""
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_rab_top(Elaboratable):
-
- def __init__(self):
- self.Clk_CI = Signal() # input
- self.NonGatedClk_CI = Signal() # input
- self.Rst_RBI = Signal() # input
- self.s_axi4_awid = Signal() # input
- self.s_axi4_awaddr = Signal() # input
- self.s_axi4_awvalid = Signal(N_PORTS) # input
- self.s_axi4_awready = Signal(N_PORTS) # output
- self.s_axi4_awlen = Signal() # input
- self.s_axi4_awsize = Signal() # input
- self.s_axi4_awburst = Signal() # input
- self.s_axi4_awlock = Signal(N_PORTS) # input
- self.s_axi4_awprot = Signal() # input
- self.s_axi4_awcache = Signal() # input
- self.s_axi4_awregion = Signal() # input
- self.s_axi4_awqos = Signal() # input
- self.s_axi4_awuser = Signal() # input
- self.s_axi4_wdata = Signal() # input
- self.s_axi4_wvalid = Signal(N_PORTS) # input
- self.s_axi4_wready = Signal(N_PORTS) # output
- self.s_axi4_wstrb = Signal() # input
- self.s_axi4_wlast = Signal(N_PORTS) # input
- self.s_axi4_wuser = Signal() # input
- self.s_axi4_bid = Signal() # output
- self.s_axi4_bresp = Signal() # output
- self.s_axi4_bvalid = Signal(N_PORTS) # output
- self.s_axi4_buser = Signal() # output
- self.s_axi4_bready = Signal(N_PORTS) # input
- self.s_axi4_arid = Signal() # input
- self.s_axi4_araddr = Signal() # input
- self.s_axi4_arvalid = Signal(N_PORTS) # input
- self.s_axi4_arready = Signal(N_PORTS) # output
- self.s_axi4_arlen = Signal() # input
- self.s_axi4_arsize = Signal() # input
- self.s_axi4_arburst = Signal() # input
- self.s_axi4_arlock = Signal(N_PORTS) # input
- self.s_axi4_arprot = Signal() # input
- self.s_axi4_arcache = Signal() # input
- self.s_axi4_aruser = Signal() # input
- self.s_axi4_rid = Signal() # output
- self.s_axi4_rdata = Signal() # output
- self.s_axi4_rresp = Signal() # output
- self.s_axi4_rvalid = Signal(N_PORTS) # output
- self.s_axi4_rready = Signal(N_PORTS) # input
- self.s_axi4_rlast = Signal(N_PORTS) # output
- self.s_axi4_ruser = Signal() # output
- self.m0_axi4_awid = Signal() # output
- self.m0_axi4_awaddr = Signal() # output
- self.m0_axi4_awvalid = Signal(N_PORTS) # output
- self.m0_axi4_awready = Signal(N_PORTS) # input
- self.m0_axi4_awlen = Signal() # output
- self.m0_axi4_awsize = Signal() # output
- self.m0_axi4_awburst = Signal() # output
- self.m0_axi4_awlock = Signal(N_PORTS) # output
- self.m0_axi4_awprot = Signal() # output
- self.m0_axi4_awcache = Signal() # output
- self.m0_axi4_awregion = Signal() # output
- self.m0_axi4_awqos = Signal() # output
- self.m0_axi4_awuser = Signal() # output
- self.m0_axi4_wdata = Signal() # output
- self.m0_axi4_wvalid = Signal(N_PORTS) # output
- self.m0_axi4_wready = Signal(N_PORTS) # input
- self.m0_axi4_wstrb = Signal() # output
- self.m0_axi4_wlast = Signal(N_PORTS) # output
- self.m0_axi4_wuser = Signal() # output
- self.m0_axi4_bid = Signal() # input
- self.m0_axi4_bresp = Signal() # input
- self.m0_axi4_bvalid = Signal(N_PORTS) # input
- self.m0_axi4_buser = Signal() # input
- self.m0_axi4_bready = Signal(N_PORTS) # output
- self.m0_axi4_arid = Signal() # output
- self.m0_axi4_araddr = Signal() # output
- self.m0_axi4_arvalid = Signal(N_PORTS) # output
- self.m0_axi4_arready = Signal(N_PORTS) # input
- self.m0_axi4_arlen = Signal() # output
- self.m0_axi4_arsize = Signal() # output
- self.m0_axi4_arburst = Signal() # output
- self.m0_axi4_arlock = Signal(N_PORTS) # output
- self.m0_axi4_arprot = Signal() # output
- self.m0_axi4_arcache = Signal() # output
- self.m0_axi4_aruser = Signal() # output
- self.m0_axi4_rid = Signal() # input
- self.m0_axi4_rdata = Signal() # input
- self.m0_axi4_rresp = Signal() # input
- self.m0_axi4_rvalid = Signal(N_PORTS) # input
- self.m0_axi4_rready = Signal(N_PORTS) # output
- self.m0_axi4_rlast = Signal(N_PORTS) # input
- self.m0_axi4_ruser = Signal() # input
- self.m1_axi4_awid = Signal() # output
- self.m1_axi4_awaddr = Signal() # output
- self.m1_axi4_awvalid = Signal(N_PORTS) # output
- self.m1_axi4_awready = Signal(N_PORTS) # input
- self.m1_axi4_awlen = Signal() # output
- self.m1_axi4_awsize = Signal() # output
- self.m1_axi4_awburst = Signal() # output
- self.m1_axi4_awlock = Signal(N_PORTS) # output
- self.m1_axi4_awprot = Signal() # output
- self.m1_axi4_awcache = Signal() # output
- self.m1_axi4_awregion = Signal() # output
- self.m1_axi4_awqos = Signal() # output
- self.m1_axi4_awuser = Signal() # output
- self.m1_axi4_wdata = Signal() # output
- self.m1_axi4_wvalid = Signal(N_PORTS) # output
- self.m1_axi4_wready = Signal(N_PORTS) # input
- self.m1_axi4_wstrb = Signal() # output
- self.m1_axi4_wlast = Signal(N_PORTS) # output
- self.m1_axi4_wuser = Signal() # output
- self.m1_axi4_bid = Signal() # input
- self.m1_axi4_bresp = Signal() # input
- self.m1_axi4_bvalid = Signal(N_PORTS) # input
- self.m1_axi4_buser = Signal() # input
- self.m1_axi4_bready = Signal(N_PORTS) # output
- self.m1_axi4_arid = Signal() # output
- self.m1_axi4_araddr = Signal() # output
- self.m1_axi4_arvalid = Signal(N_PORTS) # output
- self.m1_axi4_arready = Signal(N_PORTS) # input
- self.m1_axi4_arlen = Signal() # output
- self.m1_axi4_arsize = Signal() # output
- self.m1_axi4_arburst = Signal() # output
- self.m1_axi4_arlock = Signal(N_PORTS) # output
- self.m1_axi4_arprot = Signal() # output
- self.m1_axi4_arcache = Signal() # output
- self.m1_axi4_aruser = Signal() # output
- self.m1_axi4_rid = Signal() # input
- self.m1_axi4_rdata = Signal() # input
- self.m1_axi4_rresp = Signal() # input
- self.m1_axi4_rvalid = Signal(N_PORTS) # input
- self.m1_axi4_rready = Signal(N_PORTS) # output
- self.m1_axi4_rlast = Signal(N_PORTS) # input
- self.m1_axi4_ruser = Signal() # input
- self.s_axi4lite_awaddr = Signal(AXI_LITE_ADDR_WIDTH) # input
- self.s_axi4lite_awvalid = Signal() # input
- self.s_axi4lite_awready = Signal() # output
- self.s_axi4lite_wdata = Signal(AXI_LITE_DATA_WIDTH) # input
- self.s_axi4lite_wvalid = Signal() # input
- self.s_axi4lite_wready = Signal() # output
- self.s_axi4lite_wstrb = Signal(1+ERROR p_expression_25) # input
- self.s_axi4lite_bresp = Signal(2) # output
- self.s_axi4lite_bvalid = Signal() # output
- self.s_axi4lite_bready = Signal() # input
- self.s_axi4lite_araddr = Signal(AXI_LITE_ADDR_WIDTH) # input
- self.s_axi4lite_arvalid = Signal() # input
- self.s_axi4lite_arready = Signal() # output
- self.s_axi4lite_rdata = Signal(AXI_LITE_DATA_WIDTH) # output
- self.s_axi4lite_rresp = Signal(2) # output
- self.s_axi4lite_rvalid = Signal() # output
- self.s_axi4lite_rready = Signal() # input
- self.int_miss = Signal(N_PORTS) # output
- self.int_multi = Signal(N_PORTS) # output
- self.int_prot = Signal(N_PORTS) # output
- self.int_mhf_full = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# // --=========================================================================--
-# //
-# // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ████████╗ ██████╗ ██████╗
-# // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ╚══██╔══╝██╔═══██╗██╔══██╗
-# // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ ██║ ██║██████╔╝
-# // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██║ ██║██╔═══╝
-# // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ██║ ╚██████╔╝██║
-# // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═════╝ ╚═╝
-# //
-# // --=========================================================================--
-# /*
-# * axi_rab_top
-# *
-# * The remapping address block (RAB) performs address translation for AXI
-# * transactions arriving at the input port and forwards them to different
-# * downstream AXI ports.
-# *
-# * The five axi channels are each buffered on the input side using a FIFO,
-# * described in axi4_XX_buffer. The RAB lookup result is merged into the
-# * AXI transaction via the axi4_XX_sender instances, which manages upstream
-# * error signaling for failed lookups.
-# *
-# * Address translation is performed based on data stored in up to two
-# * translation lookaside buffers (TLBs), which are private per RAB port (each
-# * of which having two AXI master ports and one AXI slave port). These TLBs
-# * are managed in software through the AXI-Lite interface.
-# *
-# * If ACP is enabled, the `cache_coherent` flag in the TLBs is used to
-# * multiplex between the two ports. If ACP is disabled, only the first master
-# * port is used. In this case, the `cache_coherent` flag is used to set the
-# * AxCACHE signals of the AXI bus accordingly.
-# *
-# * Authors:
-# * Antonio Pullini <pullinia@iis.ee.ethz.ch>
-# * Conrad Burchert <bconrad@ethz.ch>
-# * Maheshwara Sharma <msharma@student.ethz.ch>
-# * Andreas Kurth <akurth@iis.ee.ethz.ch>
-# * Johannes Weinbuch <jweinbuch@student.ethz.ch>
-# * Pirmin Vogel <vogelpi@iis.ee.ethz.ch>
-# */
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# module axi_rab_top
-#
-# // Parameters {{{
-# #(
-# parameter N_PORTS = 2,
-# parameter N_L2_SETS = 32,
-# parameter N_L2_SET_ENTRIES = 32,
-# parameter AXI_DATA_WIDTH = 64,
-# parameter AXI_S_ADDR_WIDTH = 32,
-# parameter AXI_M_ADDR_WIDTH = 40,
-# parameter AXI_LITE_DATA_WIDTH = 64,
-# parameter AXI_LITE_ADDR_WIDTH = 32,
-# parameter AXI_ID_WIDTH = 10,
-# parameter AXI_USER_WIDTH = 6,
-# parameter MH_FIFO_DEPTH = 16
-# )
-# // }}}
-#
-# // Ports {{{
-# (
-#
-# input logic Clk_CI, // This clock may be gated.
-# input logic NonGatedClk_CI,
-# input logic Rst_RBI,
-#
-# // For every slave port there are two master ports. The master
-# // port to use can be set using the master_select flag of the protection
-# // bits of a slice
-#
-# // AXI4 Slave {{{
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_awid,
-# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr,
-# input logic [N_PORTS-1:0] s_axi4_awvalid,
-# output logic [N_PORTS-1:0] s_axi4_awready,
-# input logic [N_PORTS-1:0] [7:0] s_axi4_awlen,
-# input logic [N_PORTS-1:0] [2:0] s_axi4_awsize,
-# input logic [N_PORTS-1:0] [1:0] s_axi4_awburst,
-# input logic [N_PORTS-1:0] s_axi4_awlock,
-# input logic [N_PORTS-1:0] [2:0] s_axi4_awprot,
-# input logic [N_PORTS-1:0] [3:0] s_axi4_awcache,
-# input logic [N_PORTS-1:0] [3:0] s_axi4_awregion,
-# input logic [N_PORTS-1:0] [3:0] s_axi4_awqos,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-# input logic [N_PORTS-1:0] s_axi4_wvalid,
-# output logic [N_PORTS-1:0] s_axi4_wready,
-# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-# input logic [N_PORTS-1:0] s_axi4_wlast,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_bid,
-# output logic [N_PORTS-1:0] [1:0] s_axi4_bresp,
-# output logic [N_PORTS-1:0] s_axi4_bvalid,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_buser,
-# input logic [N_PORTS-1:0] s_axi4_bready,
-#
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_arid,
-# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr,
-# input logic [N_PORTS-1:0] s_axi4_arvalid,
-# output logic [N_PORTS-1:0] s_axi4_arready,
-# input logic [N_PORTS-1:0] [7:0] s_axi4_arlen,
-# input logic [N_PORTS-1:0] [2:0] s_axi4_arsize,
-# input logic [N_PORTS-1:0] [1:0] s_axi4_arburst,
-# input logic [N_PORTS-1:0] s_axi4_arlock,
-# input logic [N_PORTS-1:0] [2:0] s_axi4_arprot,
-# input logic [N_PORTS-1:0] [3:0] s_axi4_arcache,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_rid,
-# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-# output logic [N_PORTS-1:0] [1:0] s_axi4_rresp,
-# output logic [N_PORTS-1:0] s_axi4_rvalid,
-# input logic [N_PORTS-1:0] s_axi4_rready,
-# output logic [N_PORTS-1:0] s_axi4_rlast,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-# // }}}
-#
-# // AXI4 Master 0 {{{
-# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_awid,
-# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr,
-# output logic [N_PORTS-1:0] m0_axi4_awvalid,
-# input logic [N_PORTS-1:0] m0_axi4_awready,
-# output logic [N_PORTS-1:0] [7:0] m0_axi4_awlen,
-# output logic [N_PORTS-1:0] [2:0] m0_axi4_awsize,
-# output logic [N_PORTS-1:0] [1:0] m0_axi4_awburst,
-# output logic [N_PORTS-1:0] m0_axi4_awlock,
-# output logic [N_PORTS-1:0] [2:0] m0_axi4_awprot,
-# output logic [N_PORTS-1:0] [3:0] m0_axi4_awcache,
-# output logic [N_PORTS-1:0] [3:0] m0_axi4_awregion,
-# output logic [N_PORTS-1:0] [3:0] m0_axi4_awqos,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_awuser,
-#
-# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_wdata,
-# output logic [N_PORTS-1:0] m0_axi4_wvalid,
-# input logic [N_PORTS-1:0] m0_axi4_wready,
-# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb,
-# output logic [N_PORTS-1:0] m0_axi4_wlast,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_wuser,
-#
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_bid,
-# input logic [N_PORTS-1:0] [1:0] m0_axi4_bresp,
-# input logic [N_PORTS-1:0] m0_axi4_bvalid,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_buser,
-# output logic [N_PORTS-1:0] m0_axi4_bready,
-#
-# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_arid,
-# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr,
-# output logic [N_PORTS-1:0] m0_axi4_arvalid,
-# input logic [N_PORTS-1:0] m0_axi4_arready,
-# output logic [N_PORTS-1:0] [7:0] m0_axi4_arlen,
-# output logic [N_PORTS-1:0] [2:0] m0_axi4_arsize,
-# output logic [N_PORTS-1:0] [1:0] m0_axi4_arburst,
-# output logic [N_PORTS-1:0] m0_axi4_arlock,
-# output logic [N_PORTS-1:0] [2:0] m0_axi4_arprot,
-# output logic [N_PORTS-1:0] [3:0] m0_axi4_arcache,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_aruser,
-#
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_rid,
-# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_rdata,
-# input logic [N_PORTS-1:0] [1:0] m0_axi4_rresp,
-# input logic [N_PORTS-1:0] m0_axi4_rvalid,
-# output logic [N_PORTS-1:0] m0_axi4_rready,
-# input logic [N_PORTS-1:0] m0_axi4_rlast,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_ruser,
-# // }}}
-#
-# // AXI4 Master 1 {{{
-# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_awid,
-# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr,
-# output logic [N_PORTS-1:0] m1_axi4_awvalid,
-# input logic [N_PORTS-1:0] m1_axi4_awready,
-# output logic [N_PORTS-1:0] [7:0] m1_axi4_awlen,
-# output logic [N_PORTS-1:0] [2:0] m1_axi4_awsize,
-# output logic [N_PORTS-1:0] [1:0] m1_axi4_awburst,
-# output logic [N_PORTS-1:0] m1_axi4_awlock,
-# output logic [N_PORTS-1:0] [2:0] m1_axi4_awprot,
-# output logic [N_PORTS-1:0] [3:0] m1_axi4_awcache,
-# output logic [N_PORTS-1:0] [3:0] m1_axi4_awregion,
-# output logic [N_PORTS-1:0] [3:0] m1_axi4_awqos,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_awuser,
-#
-# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_wdata,
-# output logic [N_PORTS-1:0] m1_axi4_wvalid,
-# input logic [N_PORTS-1:0] m1_axi4_wready,
-# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb,
-# output logic [N_PORTS-1:0] m1_axi4_wlast,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_wuser,
-#
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_bid,
-# input logic [N_PORTS-1:0] [1:0] m1_axi4_bresp,
-# input logic [N_PORTS-1:0] m1_axi4_bvalid,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_buser,
-# output logic [N_PORTS-1:0] m1_axi4_bready,
-#
-# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_arid,
-# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr,
-# output logic [N_PORTS-1:0] m1_axi4_arvalid,
-# input logic [N_PORTS-1:0] m1_axi4_arready,
-# output logic [N_PORTS-1:0] [7:0] m1_axi4_arlen,
-# output logic [N_PORTS-1:0] [2:0] m1_axi4_arsize,
-# output logic [N_PORTS-1:0] [1:0] m1_axi4_arburst,
-# output logic [N_PORTS-1:0] m1_axi4_arlock,
-# output logic [N_PORTS-1:0] [2:0] m1_axi4_arprot,
-# output logic [N_PORTS-1:0] [3:0] m1_axi4_arcache,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_aruser,
-#
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_rid,
-# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_rdata,
-# input logic [N_PORTS-1:0] [1:0] m1_axi4_rresp,
-# input logic [N_PORTS-1:0] m1_axi4_rvalid,
-# output logic [N_PORTS-1:0] m1_axi4_rready,
-# input logic [N_PORTS-1:0] m1_axi4_rlast,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_ruser,
-# // }}}
-#
-# // AXI 4 Lite Slave (Configuration Interface) {{{
-# // AXI4-Lite port to setup the rab slices
-# // use this to program the configuration registers
-# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr,
-# input logic s_axi4lite_awvalid,
-# output logic s_axi4lite_awready,
-#
-# input logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata,
-# input logic s_axi4lite_wvalid,
-# output logic s_axi4lite_wready,
-# input logic [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb,
-#
-# output logic [1:0] s_axi4lite_bresp,
-# output logic s_axi4lite_bvalid,
-# input logic s_axi4lite_bready,
-#
-# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr,
-# input logic s_axi4lite_arvalid,
-# output logic s_axi4lite_arready,
-#
-# output logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata,
-# output logic [1:0] s_axi4lite_rresp,
-# output logic s_axi4lite_rvalid,
-# input logic s_axi4lite_rready,
-# // }}}
-#
-# // BRAMs {{{
-# //`ifdef RAB_AX_LOG_EN
-# // BramPort.Slave ArBram_PS,
-# // BramPort.Slave AwBram_PS,
-# //`endif
-# // }}}
-#
-# // Logger Control {{{
-# //`ifdef RAB_AX_LOG_EN
-# // input logic LogEn_SI,
-# // input logic ArLogClr_SI,
-# // input logic AwLogClr_SI,
-# // output logic ArLogRdy_SO,
-# // output logic AwLogRdy_SO,
-# //`endif
-# // }}}
-#
-# // Interrupt Outputs {{{
-# // Interrupt lines to handle misses, collisions of slices/multiple hits,
-# // protection faults and overflow of the miss handling fifo
-# //`ifdef RAB_AX_LOG_EN
-# // output logic int_ar_log_full,
-# // output logic int_aw_log_full,
-# //`endif
-# output logic [N_PORTS-1:0] int_miss,
-# output logic [N_PORTS-1:0] int_multi,
-# output logic [N_PORTS-1:0] int_prot,
-# output logic int_mhf_full
-# // }}}
-#
-# );
-#
-"""#docstring_begin
-
- // }}}
-
- // Signals {{{
- // ███████╗██╗ ██████╗ ███╗ ██╗ █████╗ ██╗ ███████╗
- // ██╔════╝██║██╔════╝ ████╗ ██║██╔══██╗██║ ██╔════╝
- // ███████╗██║██║ ███╗██╔██╗ ██║███████║██║ ███████╗
- // ╚════██║██║██║ ██║██║╚██╗██║██╔══██║██║ ╚════██║
- // ███████║██║╚██████╔╝██║ ╚████║██║ ██║███████╗███████║
- // ╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚══════╝
- //
-
- // Internal AXI4 lines, these connect buffers on the slave side to the rab core and
- // multiplexers which switch between the two master outputs
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_awid;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_awaddr;
- logic [N_PORTS-1:0] int_awvalid;
- logic [N_PORTS-1:0] int_awready;
- logic [N_PORTS-1:0] [7:0] int_awlen;
- logic [N_PORTS-1:0] [2:0] int_awsize;
- logic [N_PORTS-1:0] [1:0] int_awburst;
- logic [N_PORTS-1:0] int_awlock;
- logic [N_PORTS-1:0] [2:0] int_awprot;
- logic [N_PORTS-1:0] [3:0] int_awcache;
- logic [N_PORTS-1:0] [3:0] int_awregion;
- logic [N_PORTS-1:0] [3:0] int_awqos;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_awuser;
-
- logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_wdata;
- logic [N_PORTS-1:0] int_wvalid;
- logic [N_PORTS-1:0] int_wready;
- logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] int_wstrb;
- logic [N_PORTS-1:0] int_wlast;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_wuser;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_bid;
- logic [N_PORTS-1:0] [1:0] int_bresp;
- logic [N_PORTS-1:0] int_bvalid;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_buser;
- logic [N_PORTS-1:0] int_bready;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_arid;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_araddr;
- logic [N_PORTS-1:0] int_arvalid;
- logic [N_PORTS-1:0] int_arready;
- logic [N_PORTS-1:0] [7:0] int_arlen;
- logic [N_PORTS-1:0] [2:0] int_arsize;
- logic [N_PORTS-1:0] [1:0] int_arburst;
- logic [N_PORTS-1:0] int_arlock;
- logic [N_PORTS-1:0] [2:0] int_arprot;
- logic [N_PORTS-1:0] [3:0] int_arcache;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_aruser;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_rid;
- logic [N_PORTS-1:0] [1:0] int_rresp;
- logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_rdata;
- logic [N_PORTS-1:0] int_rlast;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_ruser;
- logic [N_PORTS-1:0] int_rvalid;
- logic [N_PORTS-1:0] int_rready;
-
- // rab_core outputs
- logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr;
- logic [N_PORTS-1:0] int_wtrans_accept;
- logic [N_PORTS-1:0] int_wtrans_drop;
- logic [N_PORTS-1:0] int_wtrans_miss;
- logic [N_PORTS-1:0] int_wtrans_sent;
- logic [N_PORTS-1:0] int_wtrans_cache_coherent;
- logic [N_PORTS-1:0] int_wmaster_select;
-
- logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr;
- logic [N_PORTS-1:0] int_rtrans_accept;
- logic [N_PORTS-1:0] int_rtrans_drop;
- logic [N_PORTS-1:0] int_rtrans_miss;
- logic [N_PORTS-1:0] int_rtrans_sent;
- logic [N_PORTS-1:0] int_rtrans_cache_coherent;
- logic [N_PORTS-1:0] int_rmaster_select;
-
- logic [N_PORTS-1:0] w_master_select;
-
- // Internal master0 AXI4 lines. These connect the first master port to the
- // multiplexers
- // For channels read address, write address and write data the other lines
- // are ignored if valid is not set, therefore we only need to multiplex those
- logic [N_PORTS-1:0] int_m0_awvalid;
- logic [N_PORTS-1:0] int_m0_awready;
-
- logic [N_PORTS-1:0] int_m0_wvalid;
- logic [N_PORTS-1:0] int_m0_wready;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_bid;
- logic [N_PORTS-1:0] [1:0] int_m0_bresp;
- logic [N_PORTS-1:0] int_m0_bvalid;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_buser;
- logic [N_PORTS-1:0] int_m0_bready;
-
- logic [N_PORTS-1:0] int_m0_arvalid;
- logic [N_PORTS-1:0] int_m0_arready;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_rid;
- logic [N_PORTS-1:0] [1:0] int_m0_rresp;
- logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m0_rdata;
- logic [N_PORTS-1:0] int_m0_rlast;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_ruser;
- logic [N_PORTS-1:0] int_m0_rready;
- logic [N_PORTS-1:0] int_m0_rvalid;
-
- logic [N_PORTS-1:0] l1_m0_ar_accept;
- logic [N_PORTS-1:0] l1_m0_ar_drop;
- logic [N_PORTS-1:0] l1_m0_ar_save;
- logic [N_PORTS-1:0] l1_m0_ar_done;
- logic [N_PORTS-1:0] l2_m0_ar_accept;
- logic [N_PORTS-1:0] l2_m0_ar_drop;
- logic [N_PORTS-1:0] l2_m0_ar_done;
- logic [N_PORTS-1:0] l2_m0_ar_sending;
-
- logic [N_PORTS-1:0] l1_m0_aw_accept;
- logic [N_PORTS-1:0] l1_m0_aw_drop;
- logic [N_PORTS-1:0] l1_m0_aw_save;
- logic [N_PORTS-1:0] l1_m0_aw_done;
- logic [N_PORTS-1:0] l2_m0_aw_accept;
- logic [N_PORTS-1:0] l2_m0_aw_drop;
- logic [N_PORTS-1:0] l2_m0_aw_done;
- logic [N_PORTS-1:0] l2_m0_aw_sending;
-
- // Internal master1 AXI4 lines. These connect the second master port to the
- // multiplexers
- // For channels read address, write address and write data the other lines
- // are ignored if valid is not set, therefore we only need to multiplex those
- logic [N_PORTS-1:0] int_m1_awvalid;
- logic [N_PORTS-1:0] int_m1_awready;
-
- logic [N_PORTS-1:0] int_m1_wvalid;
- logic [N_PORTS-1:0] int_m1_wready;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_bid;
- logic [N_PORTS-1:0] [1:0] int_m1_bresp;
- logic [N_PORTS-1:0] int_m1_bvalid;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_buser;
- logic [N_PORTS-1:0] int_m1_bready;
-
- logic [N_PORTS-1:0] int_m1_arvalid;
- logic [N_PORTS-1:0] int_m1_arready;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_rid;
- logic [N_PORTS-1:0] [1:0] int_m1_rresp;
- logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m1_rdata;
- logic [N_PORTS-1:0] int_m1_rlast;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_ruser;
- logic [N_PORTS-1:0] int_m1_rvalid;
- logic [N_PORTS-1:0] int_m1_rready;
-
- logic [N_PORTS-1:0] l1_m1_ar_accept;
- logic [N_PORTS-1:0] l1_m1_ar_drop;
- logic [N_PORTS-1:0] l1_m1_ar_save;
- logic [N_PORTS-1:0] l1_m1_ar_done;
- logic [N_PORTS-1:0] l2_m1_ar_accept;
- logic [N_PORTS-1:0] l2_m1_ar_drop;
- logic [N_PORTS-1:0] l2_m1_ar_done;
-
- logic [N_PORTS-1:0] l1_m1_aw_accept;
- logic [N_PORTS-1:0] l1_m1_aw_drop;
- logic [N_PORTS-1:0] l1_m1_aw_save;
- logic [N_PORTS-1:0] l1_m1_aw_done;
- logic [N_PORTS-1:0] l2_m1_aw_accept;
- logic [N_PORTS-1:0] l2_m1_aw_drop;
- logic [N_PORTS-1:0] l2_m1_aw_done;
-
- // L1 outputs
- logic [N_PORTS-1:0] rab_miss; // L1 RAB miss
- logic [N_PORTS-1:0] rab_prot;
- logic [N_PORTS-1:0] rab_multi;
- logic [N_PORTS-1:0] rab_prefetch;
-
- //
- // Signals used to support L2 TLB
- //
- // L2 RAM configuration signals
- logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D;
- logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D;
- logic [N_PORTS-1:0] L2CfgWE_S;
-
- // L1 output and drop Buffer
- logic [N_PORTS-1:0] L1OutRwType_D, L1DropRwType_DP;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP;
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP;
- logic [N_PORTS-1:0] [7:0] L1OutLen_D, L1DropLen_DP;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP;
- logic [N_PORTS-1:0] L1OutProt_D, L1DropProt_DP;
- logic [N_PORTS-1:0] L1OutMulti_D, L1DropMulti_DP;
- logic [N_PORTS-1:0] L1DropEn_S;
- logic [N_PORTS-1:0] L1DropPrefetch_S;
-
- logic [N_PORTS-1:0] L1DropValid_SN, L1DropValid_SP;
-
- // L2 input Buffer
- logic [N_PORTS-1:0] L2InRwType_DP;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2InUser_DP;
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2InId_DP;
- logic [N_PORTS-1:0] [7:0] L2InLen_DP;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP;
- logic [N_PORTS-1:0] L2InEn_S;
-
- // L2 output Buffer
- logic [N_PORTS-1:0] L2OutRwType_DP;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2OutUser_DP;
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2OutId_DP;
- logic [N_PORTS-1:0] [7:0] L2OutLen_DP;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP;
-
- logic [N_PORTS-1:0] L2OutHit_SN, L2OutHit_SP;
- logic [N_PORTS-1:0] L2OutMiss_SN, L2OutMiss_SP;
- logic [N_PORTS-1:0] L2OutProt_SN, L2OutProt_SP;
- logic [N_PORTS-1:0] L2OutMulti_SN, L2OutMulti_SP;
- logic [N_PORTS-1:0] L2OutCC_SN, L2OutCC_SP;
- logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP;
-
- logic [N_PORTS-1:0] L2OutValid_SN, L2OutValid_SP;
- logic [N_PORTS-1:0] L2OutPrefetch_S;
- logic [N_PORTS-1:0] L2OutReady_S;
- logic [N_PORTS-1:0] L2OutEn_S;
-
- // L2 outputs
- logic [N_PORTS-1:0] L2Busy_S;
- logic [N_PORTS-1:0] L2OutValid_S;
-
- logic [N_PORTS-1:0] L2Miss_S;
-
- // Signals for interfacing the AXI modules
- logic [N_PORTS-1:0] l1_ar_accept;
- logic [N_PORTS-1:0] l1_aw_accept;
- logic [N_PORTS-1:0] l1_w_accept;
- logic [N_PORTS-1:0] l1_xw_accept;
-
- logic [N_PORTS-1:0] l1_ar_drop;
- logic [N_PORTS-1:0] l1_aw_drop;
- logic [N_PORTS-1:0] l1_w_drop;
- logic [N_PORTS-1:0] l1_xw_drop;
-
- logic [N_PORTS-1:0] l1_ar_save;
- logic [N_PORTS-1:0] l1_aw_save;
- logic [N_PORTS-1:0] l1_w_save;
- logic [N_PORTS-1:0] l1_xw_save;
-
- logic [N_PORTS-1:0] l1_ar_done;
- logic [N_PORTS-1:0] l1_r_done;
- logic [N_PORTS-1:0] l1_r_drop;
- logic [N_PORTS-1:0] lx_r_drop;
- logic [N_PORTS-1:0] lx_r_done;
-
- logic [N_PORTS-1:0] l1_aw_done;
- logic [N_PORTS-1:0] l1_w_done;
- logic [N_PORTS-1:0] l1_xw_done;
- logic [N_PORTS-1:0] l1_aw_done_SP;
- logic [N_PORTS-1:0] l1_w_done_SP;
-
- logic [N_PORTS-1:0] l2_ar_accept;
- logic [N_PORTS-1:0] l2_aw_accept;
- logic [N_PORTS-1:0] l2_w_accept;
- logic [N_PORTS-1:0] l2_xw_accept;
-
- logic [N_PORTS-1:0] l2_ar_drop;
- logic [N_PORTS-1:0] l2_r_drop;
- logic [N_PORTS-1:0] l2_xr_drop;
- logic [N_PORTS-1:0] l2_aw_drop;
- logic [N_PORTS-1:0] l2_w_drop;
- logic [N_PORTS-1:0] l2_xw_drop;
-
- logic [N_PORTS-1:0] l2_aw_done;
- logic [N_PORTS-1:0] l2_w_done;
- logic [N_PORTS-1:0] l2_xw_done;
- logic [N_PORTS-1:0] l2_aw_done_SP;
- logic [N_PORTS-1:0] l2_w_done_SP;
-
- logic [N_PORTS-1:0] l2_ar_done;
- logic [N_PORTS-1:0] l2_r_done;
- logic [N_PORTS-1:0] l2_xr_done;
- logic [N_PORTS-1:0] l2_ar_done_SP;
- logic [N_PORTS-1:0] l2_r_done_SP;
-
- logic [N_PORTS-1:0] l1_mx_aw_done;
- logic [N_PORTS-1:0] l1_mx_ar_done;
- logic [N_PORTS-1:0] l1_m0_aw_done_SP;
- logic [N_PORTS-1:0] l1_m0_ar_done_SP;
- logic [N_PORTS-1:0] l1_m1_aw_done_SP;
- logic [N_PORTS-1:0] l1_m1_ar_done_SP;
-
- logic [N_PORTS-1:0] l2_mx_aw_done;
- logic [N_PORTS-1:0] l2_mx_ar_done;
- logic [N_PORTS-1:0] l2_m0_aw_done_SP;
- logic [N_PORTS-1:0] l2_m0_ar_done_SP;
- logic [N_PORTS-1:0] l2_m1_aw_done_SP;
- logic [N_PORTS-1:0] l2_m1_ar_done_SP;
-
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop;
- logic [N_PORTS-1:0] [7:0] l1_len_drop, lx_len_drop;
- logic [N_PORTS-1:0] l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop;
- logic [N_PORTS-1:0] l1_hit_drop, lx_hit_drop, b_hit_drop;
-
- logic [N_PORTS-1:0] b_drop;
- logic [N_PORTS-1:0] b_done;
-
- logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr;
- logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr;
-
- logic [N_PORTS-1:0] l2_cache_coherent;
- logic [N_PORTS-1:0] l2_master_select;
-
- logic [N_PORTS-1:0] aw_in_stall;
- logic [N_PORTS-1:0] aw_out_stall;
-
- genvar i;
-
- // RRESP FSM
- typedef enum logic {IDLE, BUSY} r_resp_mux_ctrl_state_t;
- r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP;
- logic [N_PORTS-1:0] RRespSel_SN, RRespSel_SP;
- logic [N_PORTS-1:0] RRespBurst_S;
- logic [N_PORTS-1:0] RRespSelIm_S;
-
- // }}}
-
- // Local parameters {{{
-
- // Enable L2 for select ports
- localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
-
- // L2TLB parameters
- localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13;
-
- // }}}
-
- // Derive `master_select` from cache coherency flag. {{{
- `ifdef EN_ACP
- assign int_wmaster_select = int_wtrans_cache_coherent;
- assign int_rmaster_select = int_rtrans_cache_coherent;
- assign l2_master_select = l2_cache_coherent;
- `else
- assign int_wmaster_select = '0;
- assign int_rmaster_select = '0;
- assign l2_master_select = '0;
- `endif
- // }}}
-
- // Buf and Send {{{
- // ██████╗ ██╗ ██╗███████╗ ██╗ ███████╗███████╗███╗ ██╗██████╗
- // ██╔══██╗██║ ██║██╔════╝ ██║ ██╔════╝██╔════╝████╗ ██║██╔══██╗
- // ██████╔╝██║ ██║█████╗ ████████╗ ███████╗█████╗ ██╔██╗ ██║██║ ██║
- // ██╔══██╗██║ ██║██╔══╝ ██╔═██╔═╝ ╚════██║██╔══╝ ██║╚██╗██║██║ ██║
- // ██████╔╝╚██████╔╝██║ ██████║ ███████║███████╗██║ ╚████║██████╔╝
- // ╚═════╝ ╚═════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝╚═╝ ╚═══╝╚═════╝
- //
- logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst;
- logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst;
-
- generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND
-
- // Write Address channel (aw) {{{
- /*
- * write address channel (aw)
- *
- * ██╗ ██╗██████╗ ██╗████████╗███████╗ █████╗ ██████╗ ██████╗ ██████╗
- * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
- * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ███████║██║ ██║██║ ██║██████╔╝
- * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██║██║ ██║██║ ██║██╔══██╗
- * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║██████╔╝██████╔╝██║ ██║
- * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝
- *
- */
-
- axi4_aw_buffer
- #(
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_aw_buffer
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_awid ( s_axi4_awid[i] ),
- .s_axi4_awaddr ( s_axi4_awaddr[i] ),
- .s_axi4_awvalid ( s_axi4_awvalid[i] ),
- .s_axi4_awready ( s_axi4_awready[i] ),
- .s_axi4_awlen ( s_axi4_awlen[i] ),
- .s_axi4_awsize ( s_axi4_awsize[i] ),
- .s_axi4_awburst ( s_axi4_awburst[i] ),
- .s_axi4_awlock ( s_axi4_awlock[i] ),
- .s_axi4_awprot ( s_axi4_awprot[i] ),
- .s_axi4_awcache ( s_axi4_awcache[i] ),
- .s_axi4_awregion ( s_axi4_awregion[i] ),
- .s_axi4_awqos ( s_axi4_awqos[i] ),
- .s_axi4_awuser ( s_axi4_awuser[i] ),
- .m_axi4_awid ( int_awid[i] ),
- .m_axi4_awaddr ( int_awaddr[i] ),
- .m_axi4_awvalid ( int_awvalid[i] ),
- .m_axi4_awready ( int_awready[i] ),
- .m_axi4_awlen ( int_awlen[i] ),
- .m_axi4_awsize ( int_awsize[i] ),
- .m_axi4_awburst ( int_awburst[i] ),
- .m_axi4_awlock ( int_awlock[i] ),
- .m_axi4_awprot ( int_awprot[i] ),
- .m_axi4_awcache ( int_awcache[i] ),
- .m_axi4_awregion ( int_awregion[i] ),
- .m_axi4_awqos ( int_awqos[i] ),
- .m_axi4_awuser ( int_awuser[i] )
- );
-
- axi4_aw_sender
- #(
- .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
- .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
- )
- u_aw_sender_m0
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .l1_done_o ( l1_m0_aw_done[i] ),
- .l1_accept_i ( l1_m0_aw_accept[i] ),
- .l1_drop_i ( l1_m0_aw_drop[i] ),
- .l1_save_i ( l1_m0_aw_save[i] ),
- .l2_done_o ( l2_m0_aw_done[i] ),
- .l2_accept_i ( l2_m0_aw_accept[i] ),
- .l2_drop_i ( l2_m0_aw_drop[i] ),
- .l2_sending_o ( l2_m0_aw_sending[i] ),
- .l1_awaddr_i ( int_wtrans_addr[i] ),
- .l2_awaddr_i ( l2_aw_addr[i] ),
- .s_axi4_awid ( int_awid[i] ),
- .s_axi4_awvalid ( int_m0_awvalid[i] ),
- .s_axi4_awready ( int_m0_awready[i] ),
- .s_axi4_awlen ( int_awlen[i] ),
- .s_axi4_awsize ( int_awsize[i] ),
- .s_axi4_awburst ( int_awburst[i] ),
- .s_axi4_awlock ( int_awlock[i] ),
- .s_axi4_awprot ( int_awprot[i] ),
- .s_axi4_awcache ( int_awcache[i] ),
- .s_axi4_awregion ( int_awregion[i] ),
- .s_axi4_awqos ( int_awqos[i] ),
- .s_axi4_awuser ( int_awuser[i] ),
- .m_axi4_awid ( m0_axi4_awid[i] ),
- .m_axi4_awaddr ( m0_axi4_awaddr[i] ),
- .m_axi4_awvalid ( m0_axi4_awvalid[i] ),
- .m_axi4_awready ( m0_axi4_awready[i] ),
- .m_axi4_awlen ( m0_axi4_awlen[i] ),
- .m_axi4_awsize ( m0_axi4_awsize[i] ),
- .m_axi4_awburst ( m0_axi4_awburst[i] ),
- .m_axi4_awlock ( m0_axi4_awlock[i] ),
- .m_axi4_awprot ( m0_axi4_awprot[i] ),
- .m_axi4_awcache ( ),
- .m_axi4_awregion ( m0_axi4_awregion[i] ),
- .m_axi4_awqos ( m0_axi4_awqos[i] ),
- .m_axi4_awuser ( m0_axi4_awuser[i] )
- );
-
- // The AXCACHE signals are set according to burstiness and cache coherence or statically
- // when not connected to ACP on Zynq (implemented below).
- assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00);
- `ifndef EN_ACP
- always_comb begin
- if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin
- if (m0_write_is_burst[i]) begin
- m0_axi4_awcache[i] = 4'b0111;
- end else begin
- m0_axi4_awcache[i] = 4'b1111;
- end
- end else begin
- m0_axi4_awcache[i] = 4'b0011;
- end
- end
- `else
- assign m0_axi4_awcache[i] = 4'b0011;
- `endif
-
- axi4_aw_sender
- #(
- .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
- .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
- )
- u_aw_sender_m1
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .l1_accept_i ( l1_m1_aw_accept[i] ),
- .l1_drop_i ( l1_m1_aw_drop[i] ),
- .l1_save_i ( l1_m1_aw_save[i] ),
- .l1_done_o ( l1_m1_aw_done[i] ),
- .l2_accept_i ( l2_m1_aw_accept[i] ),
- .l2_drop_i ( l2_m1_aw_drop[i] ),
- .l2_done_o ( l2_m1_aw_done[i] ),
- .l2_sending_o ( ), // just helps to set axcache
- .l1_awaddr_i ( int_wtrans_addr[i] ),
- .l2_awaddr_i ( l2_aw_addr[i] ),
- .s_axi4_awid ( int_awid[i] ),
- .s_axi4_awvalid ( int_m1_awvalid[i] ),
- .s_axi4_awready ( int_m1_awready[i] ),
- .s_axi4_awlen ( int_awlen[i] ),
- .s_axi4_awsize ( int_awsize[i] ),
- .s_axi4_awburst ( int_awburst[i] ),
- .s_axi4_awlock ( int_awlock[i] ),
- .s_axi4_awprot ( int_awprot[i] ),
- .s_axi4_awcache ( int_awcache[i] ),
- .s_axi4_awregion ( int_awregion[i] ),
- .s_axi4_awqos ( int_awqos[i] ),
- .s_axi4_awuser ( int_awuser[i] ),
- .m_axi4_awid ( m1_axi4_awid[i] ),
- .m_axi4_awaddr ( m1_axi4_awaddr[i] ),
- .m_axi4_awvalid ( m1_axi4_awvalid[i] ),
- .m_axi4_awready ( m1_axi4_awready[i] ),
- .m_axi4_awlen ( m1_axi4_awlen[i] ),
- .m_axi4_awsize ( m1_axi4_awsize[i] ),
- .m_axi4_awburst ( m1_axi4_awburst[i] ),
- .m_axi4_awlock ( m1_axi4_awlock[i] ),
- .m_axi4_awprot ( m1_axi4_awprot[i] ),
- .m_axi4_awcache ( ),
- .m_axi4_awregion ( m1_axi4_awregion[i] ),
- .m_axi4_awqos ( m1_axi4_awqos[i] ),
- .m_axi4_awuser ( m1_axi4_awuser[i] )
- );
-
- // The AXCACHE signals are set according to burstiness and cache coherence or statically
- // when not connected to ACP on Zynq (implemented below).
- assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00);
- `ifdef EN_ACP
- always_comb begin
- if (m1_write_is_burst[i]) begin
- m1_axi4_awcache[i] = 4'b1011;
- end else begin
- m1_axi4_awcache[i] = 4'b1111;
- end
- end
- `else
- assign m1_axi4_awcache[i] = 4'b0011;
- `endif
-
- // }}}
-
- // Write Data channel (w) {{{
- /*
- * write data channel (w)
- *
- * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ █████╗ ████████╗ █████╗
- * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗
- * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██║ ██║███████║ ██║ ███████║
- * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██║ ██║██╔══██║ ██║ ██╔══██║
- * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██████╔╝██║ ██║ ██║ ██║ ██║
- * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝
- *
- */
- axi4_w_buffer
- #(
- .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
- .ENABLE_L2TLB ( ENABLE_L2TLB[i] ),
- .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
- )
- u_w_buffer
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
-
- // L1 interface
- .l1_done_o ( l1_w_done[i] ),
- .l1_accept_i ( l1_w_accept[i] ),
- .l1_save_i ( l1_w_save[i] ),
- .l1_drop_i ( l1_w_drop[i] ),
- .l1_master_i ( int_wmaster_select[i] ),
- .l1_id_i ( l1_id_drop[i] ),
- .l1_len_i ( l1_len_drop[i] ),
- .l1_prefetch_i ( l1_prefetch_drop[i] ),
- .l1_hit_i ( l1_hit_drop[i] ),
-
- // L2 interface
- .l2_done_o ( l2_w_done[i] ),
- .l2_accept_i ( l2_w_accept[i] ),
- .l2_drop_i ( l2_w_drop[i] ),
- .l2_master_i ( l2_master_select[i] ),
- .l2_id_i ( lx_id_drop[i] ),
- .l2_len_i ( lx_len_drop[i] ),
- .l2_prefetch_i ( lx_prefetch_drop[i] ),
- .l2_hit_i ( lx_hit_drop[i] ),
-
- // Top-level control outputs
- .master_select_o ( w_master_select[i] ),
- .input_stall_o ( aw_in_stall[i] ), // stall L1 AW input if request buffers full
- .output_stall_o ( aw_out_stall[i] ), // stall L1 AW hit forwarding if bypass not possible
-
- // B sender interface
- .b_drop_o ( b_drop[i] ),
- .b_done_i ( b_done[i] ),
- .id_o ( b_id_drop[i] ),
- .prefetch_o ( b_prefetch_drop[i] ),
- .hit_o ( b_hit_drop[i] ),
-
- // AXI W channel interfaces
- .s_axi4_wdata ( s_axi4_wdata[i] ),
- .s_axi4_wvalid ( s_axi4_wvalid[i] ),
- .s_axi4_wready ( s_axi4_wready[i] ),
- .s_axi4_wstrb ( s_axi4_wstrb[i] ),
- .s_axi4_wlast ( s_axi4_wlast[i] ),
- .s_axi4_wuser ( s_axi4_wuser[i] ),
- .m_axi4_wdata ( int_wdata[i] ),
- .m_axi4_wvalid ( int_wvalid[i] ),
- .m_axi4_wready ( int_wready[i] ),
- .m_axi4_wstrb ( int_wstrb[i] ),
- .m_axi4_wlast ( int_wlast[i] ),
- .m_axi4_wuser ( int_wuser[i] )
- );
-
- axi4_w_sender
- #(
- .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_w_sender_m0
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_wdata ( int_wdata[i] ),
- .s_axi4_wvalid ( int_m0_wvalid[i] ),
- .s_axi4_wready ( int_m0_wready[i] ),
- .s_axi4_wstrb ( int_wstrb[i] ),
- .s_axi4_wlast ( int_wlast[i] ),
- .s_axi4_wuser ( int_wuser[i] ),
- .m_axi4_wdata ( m0_axi4_wdata[i] ),
- .m_axi4_wvalid ( m0_axi4_wvalid[i] ),
- .m_axi4_wready ( m0_axi4_wready[i] ),
- .m_axi4_wstrb ( m0_axi4_wstrb[i] ),
- .m_axi4_wlast ( m0_axi4_wlast[i] ),
- .m_axi4_wuser ( m0_axi4_wuser[i] )
- );
-
- axi4_w_sender
- #(
- .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-
- )
- u_w_sender_m1
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_wdata ( int_wdata[i] ),
- .s_axi4_wvalid ( int_m1_wvalid[i] ),
- .s_axi4_wready ( int_m1_wready[i] ),
- .s_axi4_wstrb ( int_wstrb[i] ),
- .s_axi4_wlast ( int_wlast[i] ),
- .s_axi4_wuser ( int_wuser[i] ),
- .m_axi4_wdata ( m1_axi4_wdata[i] ),
- .m_axi4_wvalid ( m1_axi4_wvalid[i] ),
- .m_axi4_wready ( m1_axi4_wready[i] ),
- .m_axi4_wstrb ( m1_axi4_wstrb[i] ),
- .m_axi4_wlast ( m1_axi4_wlast[i] ),
- .m_axi4_wuser ( m1_axi4_wuser[i] )
- );
-
- /*
- * Multiplexer to switch between the two output master ports on the write data (w) channel
- */
- always_comb begin
- /* Only one output can be selected at any time */
- if (w_master_select[i] == 1'b0) begin
- int_m0_wvalid[i] = int_wvalid[i];
- int_m1_wvalid[i] = 1'b0;
- int_wready[i] = int_m0_wready[i];
- end else begin
- int_m0_wvalid[i] = 1'b0;
- int_m1_wvalid[i] = int_wvalid[i];
- int_wready[i] = int_m1_wready[i];
- end
- end
-
- // }}}
-
- // Write Response channel (b) {{{
- /*
- * write response channel (b)
- *
- * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ ███████╗███████╗██████╗
- * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔════╝██╔════╝██╔══██╗
- * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██████╔╝█████╗ ███████╗██████╔╝
- * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██╗██╔══╝ ╚════██║██╔═══╝
- * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║███████╗███████║██║
- * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝
- *
- */
- axi4_b_buffer
- #(
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_b_buffer_m0
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_bid ( int_m0_bid[i] ),
- .s_axi4_bresp ( int_m0_bresp[i] ),
- .s_axi4_bvalid ( int_m0_bvalid[i] ),
- .s_axi4_buser ( int_m0_buser[i] ),
- .s_axi4_bready ( int_m0_bready[i] ),
- .m_axi4_bid ( m0_axi4_bid[i] ),
- .m_axi4_bresp ( m0_axi4_bresp[i] ),
- .m_axi4_bvalid ( m0_axi4_bvalid[i] ),
- .m_axi4_buser ( m0_axi4_buser[i] ),
- .m_axi4_bready ( m0_axi4_bready[i] )
- );
-
- axi4_b_buffer
- #(
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_b_buffer_m1
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_bid ( int_m1_bid[i] ),
- .s_axi4_bresp ( int_m1_bresp[i] ),
- .s_axi4_bvalid ( int_m1_bvalid[i] ),
- .s_axi4_buser ( int_m1_buser[i] ),
- .s_axi4_bready ( int_m1_bready[i] ),
- .m_axi4_bid ( m1_axi4_bid[i] ),
- .m_axi4_bresp ( m1_axi4_bresp[i] ),
- .m_axi4_bvalid ( m1_axi4_bvalid[i] ),
- .m_axi4_buser ( m1_axi4_buser[i] ),
- .m_axi4_bready ( m1_axi4_bready[i] )
- );
-
- axi4_b_sender
- #(
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_b_sender
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .drop_i ( b_drop[i] ),
- .done_o ( b_done[i] ),
- .id_i ( b_id_drop[i] ),
- .prefetch_i ( b_prefetch_drop[i] ),
- .hit_i ( b_hit_drop[i] ),
- .s_axi4_bid ( s_axi4_bid[i] ),
- .s_axi4_bresp ( s_axi4_bresp[i] ),
- .s_axi4_bvalid ( s_axi4_bvalid[i] ),
- .s_axi4_buser ( s_axi4_buser[i] ),
- .s_axi4_bready ( s_axi4_bready[i] ),
- .m_axi4_bid ( int_bid[i] ),
- .m_axi4_bresp ( int_bresp[i] ),
- .m_axi4_bvalid ( int_bvalid[i] ),
- .m_axi4_buser ( int_buser[i] ),
- .m_axi4_bready ( int_bready[i] )
- );
-
- /*
- * Multiplexer to switch between the two output master ports on the write response (b) channel
- */
- always_comb begin
- /* Output 1 always gets priority, so if it has something to send connect
- it and let output 0 wait using rready = 0 */
- if (int_m1_bvalid[i] == 1'b1) begin
- int_m0_bready[i] = 1'b0;
- int_m1_bready[i] = int_bready[i];
-
- int_bid[i] = int_m1_bid[i];
- int_bresp[i] = int_m1_bresp[i];
- int_buser[i] = int_m1_buser[i];
- int_bvalid[i] = int_m1_bvalid[i];
- end else begin
- int_m0_bready[i] = int_bready[i];
- int_m1_bready[i] = 1'b0;
-
- int_bid[i] = int_m0_bid[i];
- int_bresp[i] = int_m0_bresp[i];
- int_buser[i] = int_m0_buser[i];
- int_bvalid[i] = int_m0_bvalid[i];
- end
- end
-
- // }}}
-
- // Read Address channel (ar) {{{
- /*
- * read address channel (ar)
- *
- * ██████╗ ███████╗ █████╗ ██████╗ █████╗ ██████╗ ██████╗ ██████╗
- * ██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
- * ██████╔╝█████╗ ███████║██║ ██║ ███████║██║ ██║██║ ██║██████╔╝
- * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██║██║ ██║██║ ██║██╔══██╗
- * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║██████╔╝██████╔╝██║ ██║
- * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝
- *
- */
- axi4_ar_buffer
- #(
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_ar_buffer
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_arid ( s_axi4_arid[i] ),
- .s_axi4_araddr ( s_axi4_araddr[i] ),
- .s_axi4_arvalid ( s_axi4_arvalid[i] ),
- .s_axi4_arready ( s_axi4_arready[i] ),
- .s_axi4_arlen ( s_axi4_arlen[i] ),
- .s_axi4_arsize ( s_axi4_arsize[i] ),
- .s_axi4_arburst ( s_axi4_arburst[i] ),
- .s_axi4_arlock ( s_axi4_arlock[i] ),
- .s_axi4_arprot ( s_axi4_arprot[i] ),
- .s_axi4_arcache ( s_axi4_arcache[i] ),
- .s_axi4_aruser ( s_axi4_aruser[i] ),
- .m_axi4_arid ( int_arid[i] ),
- .m_axi4_araddr ( int_araddr[i] ),
- .m_axi4_arvalid ( int_arvalid[i] ),
- .m_axi4_arready ( int_arready[i] ),
- .m_axi4_arlen ( int_arlen[i] ),
- .m_axi4_arsize ( int_arsize[i] ),
- .m_axi4_arburst ( int_arburst[i] ),
- .m_axi4_arlock ( int_arlock[i] ),
- .m_axi4_arprot ( int_arprot[i] ),
- .m_axi4_arcache ( int_arcache[i] ),
- .m_axi4_aruser ( int_aruser[i] )
- );
-
- axi4_ar_sender
- #(
- .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
- .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
- )
- u_ar_sender_m0
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .l1_done_o ( l1_m0_ar_done[i] ),
- .l1_accept_i ( l1_m0_ar_accept[i] ),
- .l1_drop_i ( l1_m0_ar_drop[i] ),
- .l1_save_i ( l1_m0_ar_save[i] ),
- .l2_done_o ( l2_m0_ar_done[i] ),
- .l2_accept_i ( l2_m0_ar_accept[i] ),
- .l2_drop_i ( l2_m0_ar_drop[i] ),
- .l2_sending_o ( l2_m0_ar_sending[i] ),
- .l1_araddr_i ( int_rtrans_addr[i] ),
- .l2_araddr_i ( l2_ar_addr[i] ),
- .s_axi4_arid ( int_arid[i] ),
- .s_axi4_arvalid ( int_m0_arvalid[i] ),
- .s_axi4_arready ( int_m0_arready[i] ),
- .s_axi4_arlen ( int_arlen[i] ),
- .s_axi4_arsize ( int_arsize[i] ),
- .s_axi4_arburst ( int_arburst[i] ),
- .s_axi4_arlock ( int_arlock[i] ),
- .s_axi4_arprot ( int_arprot[i] ),
- .s_axi4_arcache ( int_arcache[i] ),
- .s_axi4_aruser ( int_aruser[i] ),
- .m_axi4_arid ( m0_axi4_arid[i] ),
- .m_axi4_araddr ( m0_axi4_araddr[i] ),
- .m_axi4_arvalid ( m0_axi4_arvalid[i] ),
- .m_axi4_arready ( m0_axi4_arready[i] ),
- .m_axi4_arlen ( m0_axi4_arlen[i] ),
- .m_axi4_arsize ( m0_axi4_arsize[i] ),
- .m_axi4_arburst ( m0_axi4_arburst[i] ),
- .m_axi4_arlock ( m0_axi4_arlock[i] ),
- .m_axi4_arprot ( m0_axi4_arprot[i] ),
- .m_axi4_arcache ( ),
- .m_axi4_aruser ( m0_axi4_aruser[i] )
- );
-
- // The AXCACHE signals are set according to burstiness and cache coherence or statically
- // when not connected to ACP on Zynq (implemented below).
- assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00);
- `ifndef EN_ACP
- always_comb begin
- if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin
- if (m0_read_is_burst[i]) begin
- m0_axi4_arcache[i] = 4'b1011;
- end else begin
- m0_axi4_arcache[i] = 4'b1111;
- end
- end else begin
- m0_axi4_arcache[i] = 4'b0011;
- end
- end
- `else
- assign m0_axi4_arcache[i] = 4'b0011;
- `endif
-
- axi4_ar_sender
- #(
- .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
- .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
- )
- u_ar_sender_m1
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .l1_done_o ( l1_m1_ar_done[i] ),
- .l1_accept_i ( l1_m1_ar_accept[i] ),
- .l1_drop_i ( l1_m1_ar_drop[i] ),
- .l1_save_i ( l1_m1_ar_save[i] ),
- .l2_done_o ( l2_m1_ar_done[i] ),
- .l2_accept_i ( l2_m1_ar_accept[i] ),
- .l2_drop_i ( l2_m1_ar_drop[i] ),
- .l2_sending_o ( ), // just helps to set axcache
- .l1_araddr_i ( int_rtrans_addr[i] ),
- .l2_araddr_i ( l2_ar_addr[i] ),
- .s_axi4_arid ( int_arid[i] ),
- .s_axi4_arvalid ( int_m1_arvalid[i] ),
- .s_axi4_arready ( int_m1_arready[i] ),
- .s_axi4_arlen ( int_arlen[i] ),
- .s_axi4_arsize ( int_arsize[i] ),
- .s_axi4_arburst ( int_arburst[i] ),
- .s_axi4_arlock ( int_arlock[i] ),
- .s_axi4_arprot ( int_arprot[i] ),
- .s_axi4_arcache ( int_arcache[i] ),
- .s_axi4_aruser ( int_aruser[i] ),
- .m_axi4_arid ( m1_axi4_arid[i] ),
- .m_axi4_araddr ( m1_axi4_araddr[i] ),
- .m_axi4_arvalid ( m1_axi4_arvalid[i] ),
- .m_axi4_arready ( m1_axi4_arready[i] ),
- .m_axi4_arlen ( m1_axi4_arlen[i] ),
- .m_axi4_arsize ( m1_axi4_arsize[i] ),
- .m_axi4_arburst ( m1_axi4_arburst[i] ),
- .m_axi4_arlock ( m1_axi4_arlock[i] ),
- .m_axi4_arprot ( m1_axi4_arprot[i] ),
- .m_axi4_arcache ( ),
- .m_axi4_aruser ( m1_axi4_aruser[i] )
- );
-
- // The AXCACHE signals are set according to burstiness and cache coherence or statically
- // when not connected to ACP on Zynq (implemented below).
- assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00);
- `ifdef EN_ACP
- always_comb begin
- if (m1_read_is_burst[i]) begin
- m1_axi4_arcache[i] = 4'b1011;
- end else begin
- m1_axi4_arcache[i] = 4'b1111;
- end
- end
- `else
- assign m1_axi4_arcache[i] = 4'b0011;
- `endif
-
- // }}}
-
- // Read Response channel (r) {{{
- /*
- * read response channel (r)
- *
- * ██████╗ ███████╗ █████╗ ██████╗ ██████╗ ███████╗███████╗██████╗
- * ██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔════╝██╔════╝██╔══██╗
- * ██████╔╝█████╗ ███████║██║ ██║ ██████╔╝█████╗ ███████╗██████╔╝
- * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██╗██╔══╝ ╚════██║██╔═══╝
- * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║███████╗███████║██║
- * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝
- *
- */
- axi4_r_buffer
- #(
- .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_r_buffer_m0
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_rid ( int_m0_rid[i] ),
- .s_axi4_rresp ( int_m0_rresp[i] ),
- .s_axi4_rdata ( int_m0_rdata[i] ),
- .s_axi4_rlast ( int_m0_rlast[i] ),
- .s_axi4_rvalid ( int_m0_rvalid[i] ),
- .s_axi4_ruser ( int_m0_ruser[i] ),
- .s_axi4_rready ( int_m0_rready[i] ),
- .m_axi4_rid ( m0_axi4_rid[i] ),
- .m_axi4_rresp ( m0_axi4_rresp[i] ),
- .m_axi4_rdata ( m0_axi4_rdata[i] ),
- .m_axi4_rlast ( m0_axi4_rlast[i] ),
- .m_axi4_rvalid ( m0_axi4_rvalid[i] ),
- .m_axi4_ruser ( m0_axi4_ruser[i] ),
- .m_axi4_rready ( m0_axi4_rready[i] )
- );
-
- axi4_r_buffer
- #(
- .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_r_buffer_m1
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .s_axi4_rid ( int_m1_rid[i] ),
- .s_axi4_rresp ( int_m1_rresp[i] ),
- .s_axi4_rdata ( int_m1_rdata[i] ),
- .s_axi4_rlast ( int_m1_rlast[i] ),
- .s_axi4_rvalid ( int_m1_rvalid[i] ),
- .s_axi4_ruser ( int_m1_ruser[i] ),
- .s_axi4_rready ( int_m1_rready[i] ),
- .m_axi4_rid ( m1_axi4_rid[i] ),
- .m_axi4_rresp ( m1_axi4_rresp[i] ),
- .m_axi4_rdata ( m1_axi4_rdata[i] ),
- .m_axi4_rlast ( m1_axi4_rlast[i] ),
- .m_axi4_rvalid ( m1_axi4_rvalid[i] ),
- .m_axi4_ruser ( m1_axi4_ruser[i] ),
- .m_axi4_rready ( m1_axi4_rready[i] )
- );
-
- axi4_r_sender
- #(
- .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_r_sender
- (
- .axi4_aclk ( Clk_CI ),
- .axi4_arstn ( Rst_RBI ),
- .drop_i ( lx_r_drop[i] ),
- .drop_len_i ( lx_len_drop[i] ),
- .done_o ( lx_r_done[i] ),
- .id_i ( lx_id_drop[i] ),
- .prefetch_i ( lx_prefetch_drop[i] ),
- .hit_i ( lx_hit_drop[i] ),
- .s_axi4_rid ( s_axi4_rid[i] ),
- .s_axi4_rresp ( s_axi4_rresp[i] ),
- .s_axi4_rdata ( s_axi4_rdata[i] ),
- .s_axi4_rlast ( s_axi4_rlast[i] ),
- .s_axi4_rvalid ( s_axi4_rvalid[i] ),
- .s_axi4_ruser ( s_axi4_ruser[i] ),
- .s_axi4_rready ( s_axi4_rready[i] ),
- .m_axi4_rid ( int_rid[i] ),
- .m_axi4_rresp ( int_rresp[i] ),
- .m_axi4_rdata ( int_rdata[i] ),
- .m_axi4_rlast ( int_rlast[i] ),
- .m_axi4_rvalid ( int_rvalid[i] ),
- .m_axi4_ruser ( int_ruser[i] ),
- .m_axi4_rready ( int_rready[i] )
- );
-
- /*
- * Multiplexer to switch between the two output master ports on the read response(r) channel
- *
- * Do not perform read burst interleaving as the DMA does not support it. This means we can only
- * switch between the two masters upon sending rlast or when idle.
- *
- * However, if the downstream already performs burst interleaving, this cannot be undone here.
- * Also, the downstream may interleave a burst reponse with a single-beat transaction. In this
- * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving
- * after such an event, it gives priority to the master which received the last burst in case
- * both have a have a burst ready (rvalid).
- *
- * Order of priority:
- * 1. Ongoing burst transaction
- * 2. Single-beat transaction on Master 1.
- * 3. Single-beat transaction on Master 0.
- * 4. Burst transaction on master that received the last burst.
- */
- // Select signal
- always_ff @(posedge Clk_CI) begin
- if (Rst_RBI == 0) begin
- RRespSel_SP[i] <= 1'b0;
- end else begin
- RRespSel_SP[i] <= RRespSel_SN[i];
- end
- end
-
- // FSM
- always_comb begin : RRespMuxFsm
- RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i];
- RRespSel_SN[i] = RRespSel_SP[i];
-
- RRespBurst_S[i] = 1'b0;
- RRespSelIm_S[i] = 1'b0;
-
- unique case (RRespMuxCtrl_SP[i])
-
- IDLE: begin
- // immediately forward single-beat transactions
- if (int_m1_rvalid[i] && int_m1_rlast[i])
- RRespSelIm_S[i] = 1'b1;
- else if (int_m0_rvalid[i] && int_m0_rlast[i])
- RRespSelIm_S[i] = 1'b0;
-
- // bursts - they also start immediately
- else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin
- RRespMuxCtrl_SN[i] = BUSY;
-
- // in case both are ready, continue with the master that had the last burst
- if (int_m1_rvalid[i] && int_m0_rvalid[i]) begin
- RRespSel_SN[i] = RRespSel_SP[i];
- RRespSelIm_S[i] = RRespSel_SP[i];
- end else if (int_m1_rvalid[i]) begin
- RRespSel_SN[i] = 1'b1;
- RRespSelIm_S[i] = 1'b1;
- end else begin
- RRespSel_SN[i] = 1'b0;
- RRespSelIm_S[i] = 1'b0;
- end
- end
- end
-
- BUSY: begin
- RRespBurst_S[i] = 1'b1;
- // detect last handshake of currently ongoing transfer
- if (int_rvalid[i] && int_rready[i] && int_rlast[i])
- RRespMuxCtrl_SN[i] = IDLE;
- end
-
- default: begin
- RRespMuxCtrl_SN[i] = IDLE;
- end
-
- endcase
- end
-
- // FSM state
- always_ff @(posedge Clk_CI) begin
- if (Rst_RBI == 0) begin
- RRespMuxCtrl_SP[i] <= IDLE;
- end else begin
- RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i];
- end
- end
-
- // Actual multiplexer
- always_comb begin
- if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin
- int_m0_rready[i] = 1'b0;
- int_m1_rready[i] = int_rready[i];
-
- int_rid[i] = int_m1_rid[i];
- int_rresp[i] = int_m1_rresp[i];
- int_rdata[i] = int_m1_rdata[i];
- int_rlast[i] = int_m1_rlast[i];
- int_ruser[i] = int_m1_ruser[i];
- int_rvalid[i] = int_m1_rvalid[i];
- end else begin
- int_m0_rready[i] = int_rready[i];
- int_m1_rready[i] = 1'b0;
-
- int_rid[i] = int_m0_rid[i];
- int_rresp[i] = int_m0_rresp[i];
- int_rdata[i] = int_m0_rdata[i];
- int_rlast[i] = int_m0_rlast[i];
- int_ruser[i] = int_m0_ruser[i];
- int_rvalid[i] = int_m0_rvalid[i];
- end
- end
-
- end // BUF & SEND
-
- // }}}
-
- endgenerate // BUF & SEND }}}
-
- // Log {{{
-
-`ifdef RAB_AX_LOG_EN
- AxiBramLogger
- #(
- .AXI_ID_BITW ( AXI_ID_WIDTH ),
- .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ),
- .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
- )
- u_aw_logger
- (
- .Clk_CI ( NonGatedClk_CI ),
- .TimestampClk_CI ( Clk_CI ),
- .Rst_RBI ( Rst_RBI ),
- .AxiValid_SI ( s_axi4_awvalid[1] ),
- .AxiReady_SI ( s_axi4_awready[1] ),
- .AxiId_DI ( s_axi4_awid[1] ),
- .AxiAddr_DI ( s_axi4_awaddr[1] ),
- .AxiLen_DI ( s_axi4_awlen[1] ),
- .Clear_SI ( AwLogClr_SI ),
- .LogEn_SI ( LogEn_SI ),
- .Full_SO ( int_aw_log_full ),
- .Ready_SO ( AwLogRdy_SO ),
- .Bram_PS ( AwBram_PS )
- );
-
- AxiBramLogger
- #(
- .AXI_ID_BITW ( AXI_ID_WIDTH ),
- .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ),
- .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
- )
- u_ar_logger
- (
- .Clk_CI ( NonGatedClk_CI ),
- .TimestampClk_CI ( Clk_CI ),
- .Rst_RBI ( Rst_RBI ),
- .AxiValid_SI ( s_axi4_arvalid[1] ),
- .AxiReady_SI ( s_axi4_arready[1] ),
- .AxiId_DI ( s_axi4_arid[1] ),
- .AxiAddr_DI ( s_axi4_araddr[1] ),
- .AxiLen_DI ( s_axi4_arlen[1] ),
- .Clear_SI ( ArLogClr_SI ),
- .LogEn_SI ( LogEn_SI ),
- .Full_SO ( int_ar_log_full ),
- .Ready_SO ( ArLogRdy_SO ),
- .Bram_PS ( ArBram_PS )
- );
-`endif
-
- // }}}
-
- // RAB Core {{{
- // ██████╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗ ███████╗
- // ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔═══██╗██╔══██╗██╔════╝
- // ██████╔╝███████║██████╔╝ ██║ ██║ ██║██████╔╝█████╗
- // ██╔══██╗██╔══██║██╔══██╗ ██║ ██║ ██║██╔══██╗██╔══╝
- // ██║ ██║██║ ██║██████╔╝ ╚██████╗╚██████╔╝██║ ██║███████╗
- // ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝
- //
- /*
- * rab_core
- *
- * The rab core translates addresses. It has two ports, which can be used
- * independently, however they will compete for time internally, as lookups
- * are serialized.
- *
- * type is the read(0) or write(1) used to check the protection flags. If they
- * don't match an interrupt is created on the int_prot line.
- */
-
- rab_core
- #(
- .N_PORTS ( N_PORTS ),
- .N_L2_SETS ( N_L2_SETS ),
- .N_L2_SET_ENTRIES ( N_L2_SET_ENTRIES ),
- .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
- .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
- .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
- .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
- .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
- .MH_FIFO_DEPTH ( MH_FIFO_DEPTH )
- )
- u_rab_core
- (
- .Clk_CI ( Clk_CI ),
- .Rst_RBI ( Rst_RBI ),
-
- // Config IF
- .s_axi_awaddr ( s_axi4lite_awaddr ),
- .s_axi_awvalid ( s_axi4lite_awvalid ),
- .s_axi_awready ( s_axi4lite_awready ),
- .s_axi_wdata ( s_axi4lite_wdata ),
- .s_axi_wstrb ( s_axi4lite_wstrb ),
- .s_axi_wvalid ( s_axi4lite_wvalid ),
- .s_axi_wready ( s_axi4lite_wready ),
- .s_axi_bresp ( s_axi4lite_bresp ),
- .s_axi_bvalid ( s_axi4lite_bvalid ),
- .s_axi_bready ( s_axi4lite_bready ),
- .s_axi_araddr ( s_axi4lite_araddr ),
- .s_axi_arvalid ( s_axi4lite_arvalid ),
- .s_axi_arready ( s_axi4lite_arready ),
- .s_axi_rready ( s_axi4lite_rready ),
- .s_axi_rdata ( s_axi4lite_rdata ),
- .s_axi_rresp ( s_axi4lite_rresp ),
- .s_axi_rvalid ( s_axi4lite_rvalid ),
-
- // L1 miss info outputs -> L2 TLB arbitration
- .int_miss ( rab_miss ),
- .int_multi ( rab_multi ),
- .int_prot ( rab_prot ),
- .int_prefetch ( rab_prefetch ),
- .int_mhf_full ( int_mhf_full ),
-
- // L1 transaction info outputs -> L2 TLB arbitration
- .int_axaddr_o ( L1OutAddr_D ),
- .int_axid_o ( L1OutId_D ),
- .int_axlen_o ( L1OutLen_D ),
- .int_axuser_o ( L1OutUser_D ),
-
- // Write Req IF
- .port1_addr ( int_awaddr ),
- .port1_id ( int_awid ),
- .port1_len ( int_awlen ),
- .port1_size ( int_awsize ),
- .port1_addr_valid ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests
- .port1_type ( {N_PORTS{1'b1}} ),
- .port1_user ( int_awuser ),
- .port1_sent ( int_wtrans_sent ), // signal done to L1 FSM
- .port1_out_addr ( int_wtrans_addr ),
- .port1_cache_coherent ( int_wtrans_cache_coherent ),
- .port1_accept ( int_wtrans_accept ),
- .port1_drop ( int_wtrans_drop ),
- .port1_miss ( int_wtrans_miss ),
-
- // Read Req IF
- .port2_addr ( int_araddr ),
- .port2_id ( int_arid ),
- .port2_len ( int_arlen ),
- .port2_size ( int_arsize ),
- .port2_addr_valid ( int_arvalid ),
- .port2_type ( {N_PORTS{1'b0}} ),
- .port2_user ( int_aruser ),
- .port2_sent ( int_rtrans_sent ), // signal done to L1 FSM
- .port2_out_addr ( int_rtrans_addr ),
- .port2_cache_coherent ( int_rtrans_cache_coherent ),
- .port2_accept ( int_rtrans_accept ),
- .port2_drop ( int_rtrans_drop ),
- .port2_miss ( int_rtrans_miss ),
-
- // L2 miss info inputs -> axi_rab_cfg
- .miss_l2_i ( L2Miss_S ),
- .miss_l2_addr_i ( L2OutInAddr_DP ),
- .miss_l2_id_i ( L2OutId_DP ),
- .miss_l2_user_i ( L2OutUser_DP ),
-
- // L2 config outputs
- .wdata_l2_o ( L2CfgWData_D ),
- .waddr_l2_o ( L2CfgWAddr_D ),
- .wren_l2_o ( L2CfgWE_S )
- );
-
- // }}}
-
- // AX SPLITS {{{
- // █████╗ ██╗ ██╗ ███████╗██████╗ ██╗ ██╗████████╗
- // ██╔══██╗╚██╗██╔╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝
- // ███████║ ╚███╔╝ ███████╗██████╔╝██║ ██║ ██║
- // ██╔══██║ ██╔██╗ ╚════██║██╔═══╝ ██║ ██║ ██║
- // ██║ ██║██╔╝ ██╗ ███████║██║ ███████╗██║ ██║
- // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝
- //
- /**
- * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels.
- *
- * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or
- * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be
- * saved until the L2 outputs are available.
- */
- generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT
-
- /*
- * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
- * be performed on any one of the two masters. Save requests must be performed by both masters.
- */
- always_comb begin : AW_L1_SPLIT
-
- // TLB handshake
- l1_m0_aw_accept[i] = 1'b0;
- l1_m1_aw_accept[i] = 1'b0;
- l1_m0_aw_drop[i] = 1'b0;
- l1_m1_aw_drop[i] = 1'b0;
- l1_m0_aw_save[i] = 1'b0;
- l1_m1_aw_save[i] = 1'b0;
-
- l1_mx_aw_done[i] = 1'b0;
-
- // AXI sender input handshake
- int_m0_awvalid[i] = 1'b0;
- int_m1_awvalid[i] = 1'b0;
- int_awready[i] = 1'b0;
-
- // accept on selected master only
- if (l1_aw_accept[i]) begin
- if (int_wmaster_select[i]) begin
- l1_m1_aw_accept[i] = 1'b1;
- l1_mx_aw_done[i] = l1_m1_aw_done[i];
-
- int_m1_awvalid[i] = int_awvalid[i];
- int_awready[i] = int_m1_awready[i];
-
- end else begin
- l1_m0_aw_accept[i] = 1'b1;
- l1_mx_aw_done[i] = l1_m0_aw_done[i];
-
- int_m0_awvalid[i] = int_awvalid[i];
- int_awready[i] = int_m0_awready[i];
- end
-
- // drop on Master 0 only
- end else if (l1_aw_drop[i]) begin
- l1_m0_aw_drop[i] = 1'b1;
- l1_mx_aw_done[i] = l1_m0_aw_done[i];
-
- int_m0_awvalid[i] = int_awvalid[i];
- int_awready[i] = l1_m0_aw_done[i];
-
- // save on both masters
- end else if (l1_aw_save[i]) begin
- // split save
- l1_m0_aw_save[i] = ~l1_m0_aw_done_SP[i];
- l1_m1_aw_save[i] = ~l1_m1_aw_done_SP[i];
-
- // combine done
- l1_mx_aw_done[i] = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i];
-
- int_m0_awvalid[i] = int_awvalid[i];
- int_m1_awvalid[i] = int_awvalid[i];
- int_awready[i] = l1_mx_aw_done[i];
- end
- end
-
- // signal back to handshake splitter
- assign l1_aw_done[i] = l1_mx_aw_done[i];
-
- always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG
- if (Rst_RBI == 0) begin
- l1_m0_aw_done_SP[i] <= 1'b0;
- l1_m1_aw_done_SP[i] <= 1'b0;
- end else if (l1_mx_aw_done[i]) begin
- l1_m0_aw_done_SP[i] <= 1'b0;
- l1_m1_aw_done_SP[i] <= 1'b0;
- end else begin
- l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i];
- l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i];
- end
- end
-
- /*
- * When accepting L2 transactions, we must drop the corresponding transaction from the other
- * master to make it available again for save requests from L1_DROP_SAVE.
- */
- always_comb begin : AW_L2_SPLIT
-
- l2_m0_aw_accept[i] = 1'b0;
- l2_m1_aw_accept[i] = 1'b0;
- l2_m0_aw_drop[i] = 1'b0;
- l2_m1_aw_drop[i] = 1'b0;
-
- // de-assert request signals individually upon handshakes
- if (l2_aw_accept[i]) begin
- if (l2_master_select[i]) begin
- l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i];
- l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i];
-
- end else begin
- l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i];
- l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i];
-
- end
- end else begin
- l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
- l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
-
- end
-
- // combine done
- l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i];
-
- l2_aw_done[i] = l2_mx_aw_done[i];
- end
-
- always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG
- if (Rst_RBI == 0) begin
- l2_m0_aw_done_SP[i] <= 1'b0;
- l2_m1_aw_done_SP[i] <= 1'b0;
- end else if (l2_mx_aw_done[i]) begin
- l2_m0_aw_done_SP[i] <= 1'b0;
- l2_m1_aw_done_SP[i] <= 1'b0;
- end else begin
- l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i];
- l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i];
- end
- end
-
- /*
- * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
- * be performed on any one of the two masters. Save requests must be performed by both masters.
- */
- always_comb begin : AR_L1_SPLIT
-
- // TLB handshake
- l1_m0_ar_accept[i] = 1'b0;
- l1_m1_ar_accept[i] = 1'b0;
- l1_m0_ar_drop[i] = 1'b0;
- l1_m1_ar_drop[i] = 1'b0;
- l1_m0_ar_save[i] = 1'b0;
- l1_m1_ar_save[i] = 1'b0;
-
- l1_mx_ar_done[i] = 1'b0;
-
- // AXI sender input handshake
- int_m0_arvalid[i] = 1'b0;
- int_m1_arvalid[i] = 1'b0;
- int_arready[i] = 1'b0;
-
- // accept on selected master only
- if (l1_ar_accept[i]) begin
- if (int_rmaster_select[i]) begin
- l1_m1_ar_accept[i] = 1'b1;
- l1_mx_ar_done[i] = l1_m1_ar_done[i];
-
- int_m1_arvalid[i] = int_arvalid[i];
- int_arready[i] = int_m1_arready[i];
-
- end else begin
- l1_m0_ar_accept[i] = 1'b1;
- l1_mx_ar_done[i] = l1_m0_ar_done[i];
-
- int_m0_arvalid[i] = int_arvalid[i];
- int_arready[i] = int_m0_arready[i];
- end
-
- // drop on Master 0 only
- end else if (l1_ar_drop[i]) begin
- l1_m0_ar_drop[i] = 1'b1;
- l1_mx_ar_done[i] = l1_m0_ar_done[i];
-
- int_m0_arvalid[i] = int_arvalid[i];
- int_arready[i] = l1_m0_ar_done[i];
-
- // save on both masters
- end else if (l1_ar_save[i]) begin
- // split save
- l1_m0_ar_save[i] = ~l1_m0_ar_done_SP[i];
- l1_m1_ar_save[i] = ~l1_m1_ar_done_SP[i];
-
- // combine done
- l1_mx_ar_done[i] = l1_m0_ar_done_SP[i] & l1_m1_ar_done_SP[i];
-
- int_m0_arvalid[i] = int_arvalid[i];
- int_m1_arvalid[i] = int_arvalid[i];
- int_arready[i] = l1_mx_ar_done[i];
- end
- end
-
- // signal back to handshake splitter
- assign l1_ar_done[i] = l1_mx_ar_done[i];
-
- always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG
- if (Rst_RBI == 0) begin
- l1_m0_ar_done_SP[i] <= 1'b0;
- l1_m1_ar_done_SP[i] <= 1'b0;
- end else if (l1_mx_ar_done[i]) begin
- l1_m0_ar_done_SP[i] <= 1'b0;
- l1_m1_ar_done_SP[i] <= 1'b0;
- end else begin
- l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i];
- l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i];
- end
- end
-
- /*
- * When accepting L2 transactions, we must drop the corresponding transaction from the other
- * master to make it available again for save requests from L1_DROP_SAVE.
- */
- always_comb begin : AR_L2_SPLIT
-
- l2_m0_ar_accept[i] = 1'b0;
- l2_m1_ar_accept[i] = 1'b0;
- l2_m0_ar_drop[i] = 1'b0;
- l2_m1_ar_drop[i] = 1'b0;
-
- // de-assert request signals individually upon handshakes
- if (l2_ar_accept[i]) begin
- if (l2_master_select[i]) begin
- l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i];
- l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i];
-
- end else begin
- l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i];
- l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i];
-
- end
- end else if (l2_ar_drop[i]) begin
- l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
- l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
-
- end
-
- // combine done
- l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i];
-
- l2_ar_done[i] = l2_mx_ar_done[i];
- end
-
- always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG
- if (Rst_RBI == 0) begin
- l2_m0_ar_done_SP[i] <= 1'b0;
- l2_m1_ar_done_SP[i] <= 1'b0;
- end else if (l2_mx_ar_done[i]) begin
- l2_m0_ar_done_SP[i] <= 1'b0;
- l2_m1_ar_done_SP[i] <= 1'b0;
- end else begin
- l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i];
- l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i];
- end
- end
-
- end // AX_SPLIT
- endgenerate // AX_SPLIT
-
- // }}}
-
- // HANDSHAKE SPLITS {{{
- // ██╗ ██╗███████╗ ███████╗██████╗ ██╗ ██╗████████╗
- // ██║ ██║██╔════╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝
- // ███████║███████╗ ███████╗██████╔╝██║ ██║ ██║
- // ██╔══██║╚════██║ ╚════██║██╔═══╝ ██║ ██║ ██║
- // ██║ ██║███████║ ███████║██║ ███████╗██║ ██║
- // ╚═╝ ╚═╝╚══════╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝
- //
- /*
- * We need to perform combined handshakes with multiple AXI modules
- * upon transactions drops, accepts, saves etc. from two TLBs.
- */
- generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT
-
- assign l1_xw_accept[i] = int_wtrans_accept[i] & ~aw_out_stall[i];
- assign int_wtrans_sent[i] = l1_xw_done[i];
-
- assign l1_ar_accept[i] = int_rtrans_accept[i];
- assign int_rtrans_sent[i] = l1_ar_done[i];
-
- /*
- * L1 AW sender + W buffer handshake split
- */
- // forward
- assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i];
- assign l1_w_accept[i] = l1_xw_accept[i] & ~l1_w_done_SP[i];
-
- assign l1_aw_save[i] = l1_xw_save[i] & ~l1_aw_done_SP[i];
- assign l1_w_save[i] = l1_xw_save[i] & ~l1_w_done_SP[i];
-
- assign l1_aw_drop[i] = l1_xw_drop[i] & ~l1_aw_done_SP[i];
- assign l1_w_drop[i] = l1_xw_drop[i] & ~l1_w_done_SP[i];
-
- // backward
- assign l1_xw_done[i] = l1_aw_done_SP[i] & l1_w_done_SP[i];
-
- always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT
- if (Rst_RBI == 0) begin
- l1_aw_done_SP[i] <= 1'b0;
- l1_w_done_SP[i] <= 1'b0;
- end else if (l1_xw_done[i]) begin
- l1_aw_done_SP[i] <= 1'b0;
- l1_w_done_SP[i] <= 1'b0;
- end else begin
- l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i];
- l1_w_done_SP[i] <= l1_w_done_SP[i] | l1_w_done[i];
- end
- end
-
- if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT
-
- /*
- * L1 AR sender + R sender handshake split
- *
- * AR and R do not need to be strictly in sync. We thus use separate handshakes.
- * But the handshake signals for the R sender are multiplexed with the those for
- * the L2. However, L2_ACCEPT_DROP_SAVE has always higher priority.
- */
- assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i];
- assign l1_r_done[i] = l2_r_drop[i] ? 1'b0 : lx_r_done[i];
- assign l2_r_done[i] = l2_r_drop[i] ? lx_r_done[i] : 1'b0;
-
- /*
- * L2 AW sender + W buffer handshake split
- */
- // forward
- assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i];
- assign l2_w_accept[i] = l2_xw_accept[i] & ~l2_w_done_SP[i];
-
- assign l2_aw_drop[i] = l2_xw_drop[i] & ~l2_aw_done_SP[i];
- assign l2_w_drop[i] = l2_xw_drop[i] & ~l2_w_done_SP[i];
-
- // backward
- assign l2_xw_done[i] = l2_aw_done_SP[i] & l2_w_done_SP[i];
-
- always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT
- if (Rst_RBI == 0) begin
- l2_aw_done_SP[i] <= 1'b0;
- l2_w_done_SP[i] <= 1'b0;
- end else if (l2_xw_done[i]) begin
- l2_aw_done_SP[i] <= 1'b0;
- l2_w_done_SP[i] <= 1'b0;
- end else begin
- l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i];
- l2_w_done_SP[i] <= l2_w_done_SP[i] | l2_w_done[i];
- end
- end
-
- /*
- * L2 AR + R sender handshake split
- */
- // forward
- assign l2_ar_drop[i] = l2_xr_drop[i] & ~l2_ar_done_SP[i];
- assign l2_r_drop[i] = l2_xr_drop[i] & ~l2_r_done_SP[i];
-
- // backward - make sure to always clear L2_XR_HS_SPLIT
- always_comb begin
- if (l2_xr_drop[i]) begin
- l2_xr_done[i] = l2_ar_done_SP[i] & l2_r_done_SP[i];
- end else begin
- l2_xr_done[i] = l2_ar_done_SP[i];
- end
- end
-
- always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT
- if (Rst_RBI == 0) begin
- l2_ar_done_SP[i] <= 1'b0;
- l2_r_done_SP[i] <= 1'b0;
- end else if (l2_xr_done[i]) begin
- l2_ar_done_SP[i] <= 1'b0;
- l2_r_done_SP[i] <= 1'b0;
- end else begin
- l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i];
- l2_r_done_SP[i] <= l2_r_done_SP[i] | l2_r_done[i];
- end
- end
-
- end else begin // if (ENABLE_L2TLB[i] == 1)
-
- assign lx_r_drop[i] = l1_r_drop[i];
- assign l1_r_done[i] = lx_r_done[i];
-
- assign l2_aw_accept[i] = 1'b0;
- assign l2_w_accept[i] = 1'b0;
- assign l2_aw_drop[i] = 1'b0;
- assign l2_w_drop[i] = 1'b0;
- assign l2_xw_done[i] = 1'b0;
- assign l2_aw_done_SP[i] = 1'b0;
- assign l2_w_done_SP[i] = 1'b0;
-
- assign l2_ar_accept[i] = 1'b0;
- assign l2_ar_drop[i] = 1'b0;
- assign l2_r_drop[i] = 1'b0;
- assign l2_xr_done[i] = 1'b0;
- assign l2_r_done[i] = 1'b0;
- assign l2_ar_done_SP[i] = 1'b0;
- assign l2_r_done_SP[i] = 1'b0;
-
- end // if (ENABLE_L2TLB[i] == 1)
-
- end // HANDSHAKE_SPLIT
- endgenerate // HANDSHAKE_SPLIT
-
- // }}}
-
- // L2 TLB {{{
- // ██╗ ██████╗ ████████╗██╗ ██████╗
- // ██║ ╚════██╗ ╚══██╔══╝██║ ██╔══██╗
- // ██║ █████╔╝ ██║ ██║ ██████╔╝
- // ██║ ██╔═══╝ ██║ ██║ ██╔══██╗
- // ███████╗███████╗ ██║ ███████╗██████╔╝
- // ╚══════╝╚══════╝ ╚═╝ ╚══════╝╚═════╝
- //
- /*
- * l2_tlb
- *
- * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core).
- *
- * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy,
- * the L1 is stalled untill the L2 is available again.
- *
- */
- generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB
- if (ENABLE_L2TLB[i] == 1) begin : L2_TLB
-
- /*
- * L1 output selector
- */
- assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0;
- assign L1OutProt_D[i] = rab_prot[i];
- assign L1OutMulti_D[i] = rab_multi[i];
-
- /*
- * L1 output control + L1_DROP_BUF, L2_IN_BUF management
- *
- * Forward the L1 drop request to AR/AW sender modules if
- * 1. the transactions needs to be dropped (L1 multi, prot, prefetch), or
- * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full.
- *
- * The AR/AW senders do not support more than 1 oustanding L1 miss. The push back towards
- * the upstream is realized by not accepting the save request (saving the L1 transaction)
- * in the senders as long as the L2 TLB is busy or has valid output. This ultimately
- * blocks the L1 TLB.
- *
- * Together with the AW drop/save, we also perform the W drop/save as AW and W need to
- * absolutely remain in order. In contrast, the R drop is performed
- */
- always_comb begin : L1_DROP_SAVE
-
- l1_ar_drop[i] = 1'b0;
- l1_ar_save[i] = 1'b0;
- l1_xw_drop[i] = 1'b0;
- l1_xw_save[i] = 1'b0;
-
- l1_id_drop[i] = L1OutId_D[i];
- l1_len_drop[i] = L1OutLen_D[i];
- l1_prefetch_drop[i] = rab_prefetch[i];
- l1_hit_drop[i] = 1'b1; // there are no drops for L1 misses
-
- L1DropEn_S[i] = 1'b0;
- L2InEn_S[i] = 1'b0;
-
- if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin
- // 1. Drop
- l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i];
- l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i];
-
- // Store to L1_DROP_BUF upon handshake
- L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) |
- (l1_xw_drop[i] & l1_xw_done[i]);
-
- end else if ( rab_miss[i] ) begin
- // 2. Save - Make sure L2 is really available.
- l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i];
- l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i];
-
- // Store to L2_IN_BUF upon handshake - triggers the L2 TLB
- L2InEn_S[i] = (l1_ar_save[i] & l1_ar_done[i]) |
- (l1_xw_save[i] & l1_xw_done[i]);
- end
- end
-
- /*
- * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control
- *
- * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs
- * require the B response to be sent only after consuming/discarding the corresponding data
- * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop
- * request to the B sender is then sent by the W buffer autonomously.
- *
- * L1 AW/W drop requests are managed by L1_DROP_SAVE.
- */
- always_comb begin : L2_ACCEPT_DROP_SAVE
-
- l2_ar_addr[i] = 'b0;
- l2_aw_addr[i] = 'b0;
- l2_ar_accept[i] = 1'b0;
- l2_xr_drop[i] = 1'b0;
- l2_xw_accept[i] = 1'b0;
- l2_xw_drop[i] = 1'b0;
-
- l1_r_drop[i] = 1'b0;
-
- lx_id_drop[i] = 'b0;
- lx_len_drop[i] = 'b0;
- lx_prefetch_drop[i] = 1'b0;
- lx_hit_drop[i] = 1'b0;
-
- L1DropValid_SN[i] = L1DropValid_SP[i] | L1DropEn_S[i];
- L2OutValid_SN[i] = L2OutValid_SP[i];
- L2OutReady_S[i] = 1'b0;
- L2OutEn_S[i] = 1'b0;
-
- L2Miss_S[i] = 1'b0;
- int_multi[i] = 1'b0;
- int_prot[i] = 1'b0;
-
- if (L2OutValid_SP[i] == 1'b0) begin
-
- // Drop L1 from R senders
- if (L1DropValid_SP[i] == 1'b1) begin
-
- // Only perform the R sender drop here.
- if (~L1DropRwType_DP[i]) begin
-
- l1_r_drop[i] = 1'b1;
- lx_id_drop[i] = L1DropId_DP[i];
- lx_len_drop[i] = L1DropLen_DP[i];
- lx_prefetch_drop[i] = L1DropPrefetch_S[i];
- lx_hit_drop[i] = 1'b1; // there are no drops for L1 misses
-
- // Invalidate L1_DROP_BUF upon handshake
- if ( l1_r_drop[i] & l1_r_done[i] ) begin
-
- L1DropValid_SN[i] = 1'b0;
- int_prot[i] = L1DropProt_DP[i];
- int_multi[i] = L1DropMulti_DP[i];
- end
-
- end else begin
- // Invalidate L1_DROP_BUF
- L1DropValid_SN[i] = 1'b0;
- int_prot[i] = L1DropProt_DP[i];
- int_multi[i] = L1DropMulti_DP[i];
- end
- end
-
- end else begin // L2_OUT_BUF has valid data
-
- if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin
-
- l2_ar_addr[i] = L2OutAddr_DP[i];
- l2_aw_addr[i] = L2OutAddr_DP[i];
-
- l2_ar_accept[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
- l2_xw_accept[i] = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
-
- // Invalidate L2_OUT_BUF upon handshake
- L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) |
- (l2_xw_accept[i] & l2_xw_done[i]) );
- end else begin
-
- lx_id_drop[i] = L2OutId_DP[i];
- lx_len_drop[i] = L2OutLen_DP[i];
- lx_prefetch_drop[i] = L2OutPrefetch_S[i];
- lx_hit_drop[i] = L2OutHit_SP[i];
-
- // The l2_xr_drop will also perform the handshake with the R sender
- l2_xr_drop[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
- l2_xw_drop[i] = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
-
- // Invalidate L1_DROP_BUF upon handshake
- if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin
-
- L2OutValid_SN[i] = 1'b0;
- L2Miss_S[i] = ~L2OutHit_SP[i];
- int_prot[i] = L2OutProt_SP[i];
- int_multi[i] = L2OutMulti_SP[i];
- end
- end
- end
-
- // Only accept new L2 output after ongoing drops have finished.
- if ( (l2_xr_drop[i] == l2_xr_done[i]) &
- (l2_xw_drop[i] == l2_xw_done[i]) &
- (l1_r_drop[i] == l1_r_done[i] ) ) begin
- // Store to L2_OUT_BUF upon handshake with L2 TLB module
- if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin
- L2OutValid_SN[i] = 1'b1;
- L2OutReady_S[i] = 1'b1;
- L2OutEn_S[i] = 1'b1;
- end
- end
- end
-
- /*
- * L1 drop buffer
- *
- * Used in case of multi, prot and prefetch hits in the L1 TLB.
- */
- always_ff @(posedge Clk_CI) begin : L1_DROP_BUF
- if (Rst_RBI == 0) begin
- L1DropProt_DP[i] <= 1'b0;
- L1DropMulti_DP[i] <= 1'b0;
- L1DropRwType_DP[i] <= 1'b0;
- L1DropUser_DP[i] <= 'b0;
- L1DropId_DP[i] <= 'b0;
- L1DropLen_DP[i] <= 'b0;
- L1DropAddr_DP[i] <= 'b0;
- end else if (L1DropEn_S[i] == 1'b1) begin
- L1DropProt_DP[i] <= L1OutProt_D[i] ;
- L1DropMulti_DP[i] <= L1OutMulti_D[i] ;
- L1DropRwType_DP[i] <= L1OutRwType_D[i];
- L1DropUser_DP[i] <= L1OutUser_D[i] ;
- L1DropId_DP[i] <= L1OutId_D[i] ;
- L1DropLen_DP[i] <= L1OutLen_D[i] ;
- L1DropAddr_DP[i] <= L1OutAddr_D[i] ;
- end
- end // always_ff @ (posedge Clk_CI)
-
- /*
- * L2 input buffer
- *
- * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
- */
- always_ff @(posedge Clk_CI) begin : L2_IN_BUF
- if (Rst_RBI == 0) begin
- L2InRwType_DP[i] <= 1'b0;
- L2InUser_DP[i] <= 'b0;
- L2InId_DP[i] <= 'b0;
- L2InLen_DP[i] <= 'b0;
- L2InAddr_DP[i] <= 'b0;
- end else if (L2InEn_S[i] == 1'b1) begin
- L2InRwType_DP[i] <= L1OutRwType_D[i];
- L2InUser_DP[i] <= L1OutUser_D[i] ;
- L2InId_DP[i] <= L1OutId_D[i] ;
- L2InLen_DP[i] <= L1OutLen_D[i] ;
- L2InAddr_DP[i] <= L1OutAddr_D[i] ;
- end
- end // always_ff @ (posedge Clk_CI)
-
- l2_tlb
- #(
- .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
- .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
- .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
- .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
- .N_SETS ( `RAB_L2_N_SETS ),
- .N_OFFSETS ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS ),
- .N_PAR_VA_RAMS ( `RAB_L2_N_PAR_VA_RAMS ),
- .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) )
- )
- u_l2_tlb
- (
- .clk_i ( Clk_CI ),
- .rst_ni ( Rst_RBI ),
-
- // Config inputs
- .we_i ( L2CfgWE_S[i] ),
- .waddr_i ( L2CfgWAddr_D[i] ),
- .wdata_i ( L2CfgWData_D[i] ),
-
- // Request input
- .start_i ( L2InEn_S[i] ),
- .busy_o ( L2Busy_S[i] ),
- .rw_type_i ( L2InRwType_DP[i] ),
- .in_addr_i ( L2InAddr_DP[i] ),
-
- // Response output
- .out_ready_i ( L2OutReady_S[i] ),
- .out_valid_o ( L2OutValid_S[i] ),
- .hit_o ( L2OutHit_SN[i] ),
- .miss_o ( L2OutMiss_SN[i] ),
- .prot_o ( L2OutProt_SN[i] ),
- .multi_o ( L2OutMulti_SN[i] ),
- .cache_coherent_o ( L2OutCC_SN[i] ),
- .out_addr_o ( L2OutAddr_DN[i] )
- );
-
- /*
- * L2 output buffer
- *
- * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
- */
- always_ff @(posedge Clk_CI) begin : L2_OUT_BUF
- if (Rst_RBI == 0) begin
- L2OutRwType_DP[i] <= 1'b0;
- L2OutUser_DP[i] <= 'b0;
- L2OutLen_DP[i] <= 'b0;
- L2OutId_DP[i] <= 'b0;
- L2OutInAddr_DP[i] <= 'b0;
-
- L2OutHit_SP[i] <= 1'b0;
- L2OutMiss_SP[i] <= 1'b0;
- L2OutProt_SP[i] <= 1'b0;
- L2OutMulti_SP[i] <= 1'b0;
- L2OutCC_SP[i] <= 1'b0;
- L2OutAddr_DP[i] <= 'b0;
- end else if (L2OutEn_S[i] == 1'b1) begin
- L2OutRwType_DP[i] <= L2InRwType_DP[i];
- L2OutUser_DP[i] <= L2InUser_DP[i] ;
- L2OutLen_DP[i] <= L2InLen_DP[i] ;
- L2OutId_DP[i] <= L2InId_DP[i] ;
- L2OutInAddr_DP[i] <= L2InAddr_DP[i] ;
-
- L2OutHit_SP[i] <= L2OutHit_SN[i] ;
- L2OutMiss_SP[i] <= L2OutMiss_SN[i] ;
- L2OutProt_SP[i] <= L2OutProt_SN[i] ;
- L2OutMulti_SP[i] <= L2OutMulti_SN[i];
- L2OutCC_SP[i] <= L2OutCC_SN[i] ;
- L2OutAddr_DP[i] <= L2OutAddr_DN[i] ;
- end
- end // always_ff @ (posedge Clk_CI)
-
- always_ff @(posedge Clk_CI) begin : BUF_VALID
- if (Rst_RBI == 0) begin
- L1DropValid_SP[i] = 1'b0;
- L2OutValid_SP[i] = 1'b0;
- end else begin
- L1DropValid_SP[i] = L1DropValid_SN[i];
- L2OutValid_SP[i] = L2OutValid_SN[i];
- end
- end
-
- always_comb begin : BUF_TO_PREFETCH
- // L1 Drop Buf
- if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
- L1DropPrefetch_S[i] = 1'b1;
- else
- L1DropPrefetch_S[i] = 1'b0;
-
- // L2 Out Buf
- if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
- L2OutPrefetch_S[i] = 1'b1;
- else
- L2OutPrefetch_S[i] = 1'b0;
- end
-
- assign l2_cache_coherent[i] = L2OutCC_SP[i];
- assign int_miss[i] = L2Miss_S[i];
-
- end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1)
-
- assign l1_ar_drop[i] = int_rtrans_drop[i];
- assign l1_r_drop[i] = int_rtrans_drop[i];
- assign l1_xw_drop[i] = int_wtrans_drop[i];
-
- assign l1_ar_save[i] = 1'b0;
- assign l1_xw_save[i] = 1'b0;
- assign l2_xw_accept[i] = 1'b0;
- assign l2_xr_drop[i] = 1'b0;
- assign l2_xw_drop[i] = 1'b0;
-
- assign l2_ar_addr[i] = 'b0;
- assign l2_aw_addr[i] = 'b0;
-
- assign l1_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] :
- int_rtrans_drop[i] ? int_arid[i] :
- '0;
- assign l1_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] :
- int_rtrans_drop[i] ? int_arlen[i] :
- '0;
- assign l1_prefetch_drop[i] = rab_prefetch[i];
- assign l1_hit_drop[i] = ~rab_miss[i];
-
- assign lx_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] :
- int_rtrans_drop[i] ? int_arid[i] :
- '0;
- assign lx_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] :
- int_rtrans_drop[i] ? int_arlen[i] :
- '0;
- assign lx_prefetch_drop[i] = rab_prefetch[i];
- assign lx_hit_drop[i] = ~rab_miss[i];
-
- assign l2_cache_coherent[i] = 1'b0;
-
- assign int_miss[i] = rab_miss[i];
- assign int_prot[i] = rab_prot[i];
- assign int_multi[i] = rab_multi[i];
-
- // unused signals
- assign L2Miss_S[i] = 1'b0;
-
- assign L1OutRwType_D[i] = 1'b0;
- assign L1OutProt_D[i] = 1'b0;
- assign L1OutMulti_D[i] = 1'b0;
-
- assign L1DropRwType_DP[i] = 1'b0;
- assign L1DropUser_DP[i] = 'b0;
- assign L1DropId_DP[i] = 'b0;
- assign L1DropLen_DP[i] = 'b0;
- assign L1DropAddr_DP[i] = 'b0;
- assign L1DropProt_DP[i] = 1'b0;
- assign L1DropMulti_DP[i] = 1'b0;
-
- assign L1DropEn_S[i] = 1'b0;
- assign L1DropPrefetch_S[i] = 1'b0;
- assign L1DropValid_SN[i] = 1'b0;
- assign L1DropValid_SP[i] = 1'b0;
-
- assign L2InRwType_DP[i] = 1'b0;
- assign L2InUser_DP[i] = 'b0;
- assign L2InId_DP[i] = 'b0;
- assign L2InLen_DP[i] = 'b0;
- assign L2InAddr_DP[i] = 'b0;
-
- assign L2InEn_S[i] = 1'b0;
-
- assign L2OutHit_SN[i] = 1'b0;
- assign L2OutMiss_SN[i] = 1'b0;
- assign L2OutProt_SN[i] = 1'b0;
- assign L2OutMulti_SN[i] = 1'b0;
- assign L2OutCC_SN[i] = 1'b0;
- assign L2OutAddr_DN[i] = 'b0;
-
- assign L2OutRwType_DP[i] = 1'b0;
- assign L2OutUser_DP[i] = 'b0;
- assign L2OutId_DP[i] = 'b0;
- assign L2OutLen_DP[i] = 'b0;
- assign L2OutInAddr_DP[i] = 'b0;
- assign L2OutHit_SP[i] = 1'b0;
- assign L2OutMiss_SP[i] = 1'b0;
- assign L2OutProt_SP[i] = 1'b0;
- assign L2OutMulti_SP[i] = 1'b0;
- assign L2OutCC_SP[i] = 1'b0;
- assign L2OutAddr_DP[i] = 'b0;
-
- assign L2OutEn_S[i] = 1'b0;
- assign L2OutPrefetch_S[i] = 1'b0;
- assign L2Busy_S[i] = 1'b0;
- assign L2OutValid_S[i] = 1'b0;
- assign L2OutValid_SN[i] = 1'b0;
- assign L2OutValid_SP[i] = 1'b0;
- assign L2OutReady_S[i] = 1'b0;
-
- end // !`ifdef ENABLE_L2TLB
- end // for (i = 0; i < N_PORTS; i++)
- endgenerate
-
-// }}}
-"""
-# endmodule
-#
-#
-# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class check_ram(Elaboratable):
-
- def __init__(self):
- self.clk_i = Signal() # input
- self.rst_ni = Signal() # input
- self.in_addr = Signal(ADDR_WIDTH) # input
- self.rw_type = Signal() # input
- self.ram_we = Signal() # input
- self.port0_addr = Signal(1+ERROR p_expression_25) # input
- self.port1_addr = Signal(1+ERROR p_expression_25) # input
- self.ram_wdata = Signal(RAM_DATA_WIDTH) # input
- self.output_sent = Signal() # input
- self.output_valid = Signal() # input
- self.offset_addr_d = Signal(OFFSET_WIDTH) # input
- self.hit_addr = Signal(1+ERROR p_expression_25) # output
- self.master = Signal() # output
- self.hit = Signal() # output
- self.multi_hit = Signal() # output
- self.prot = Signal() # output
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# //`define MULTI_HIT_FULL_SET
-#
-# module check_ram
-# //#(
-# // parameter ADDR_WIDTH = 32,
-# // parameter RAM_DATA_WIDTH = 32,
-# // parameter PAGE_SIZE = 4096, // 4kB
-# // parameter SET_WIDTH = 5,
-# // parameter OFFSET_WIDTH = 4
-# // )
-# (
-# input logic clk_i,
-# input logic rst_ni,
-# input logic [ADDR_WIDTH-1:0] in_addr,
-# input logic rw_type, // 1 => write, 0=> read
-# input logic ram_we,
-# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr,
-# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr,
-# input logic [RAM_DATA_WIDTH-1:0] ram_wdata,
-# input logic output_sent,
-# input logic output_valid,
-# input logic [OFFSET_WIDTH-1:0] offset_addr_d,
-# output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr,
-# output logic master,
-# output logic hit,
-# output logic multi_hit,
-# output logic prot
-# );
-#
-""" #docstring_begin
-
- localparam IGNORE_LSB = log2(PAGE_SIZE); // 12
-
- logic [RAM_DATA_WIDTH-1:0] port0_data_o, port1_data_o; // RAM read data outputs
- logic port0_hit, port1_hit; // Ram output matches in_addr
-
- logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved;
-
- // Hit FSM Signals
- typedef enum logic {SEARCH, HIT} hit_state_t;
- hit_state_t hit_SP; // Hit FSM state
- hit_state_t hit_SN; // Hit FSM next state
-
- // Multi Hit FSM signals
-`ifdef MULTI_HIT_FULL_SET
- typedef enum logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t;
- multi_state_t multi_SP; // Multi Hit FSM state
- multi_state_t multi_SN; // Multi Hit FSM next state
-
- logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved;
- logic master_saved;
-`endif
-
- //// --------------- Block RAM (Dual Port) -------------- ////
-
- // The outputs of the BRAMs are only valid if in the previous cycle:
- // 1. the inputs were valid, and
- // 2. the BRAM was not written to.
- // Otherwise, the outputs must be ignored which is controlled by the output_valid signal.
- // This signal is driven by the uppler level L2 TLB module.
- ram_tp_no_change #(
- .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ),
- .DATA_WIDTH( RAM_DATA_WIDTH )
- )
- ram_tp_no_change_0
- (
- .clk ( clk_i ),
- .we ( ram_we ),
- .addr0 ( port0_addr ),
- .addr1 ( port1_addr ),
- .d_i ( ram_wdata ),
- .d0_o ( port0_data_o ),
- .d1_o ( port1_data_o )
- );
-
- //// Check Ram Outputs
- assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]);
- assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]);
- //// ----------------------------------------------------- /////
-
- //// ------------------- Check if Hit ------------------------ ////
- // FSM
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- hit_SP <= SEARCH;
- end else begin
- hit_SP <= hit_SN;
- end
- end
-
- always_ff @(posedge clk_i, negedge rst_ni) begin
- if (!rst_ni) begin
- port0_addr_saved <= '0;
- port1_addr_saved <= '0;
- end else begin
- port0_addr_saved <= port0_addr;
- port1_addr_saved <= port1_addr;
- end
- end
-
- always_comb begin
- hit_SN = hit_SP;
- hit = 1'b0;
- hit_addr = 0;
- master = 1'b0;
- unique case(hit_SP)
- SEARCH :
- if (output_valid)
- if (port0_hit || port1_hit) begin
- hit_SN = HIT;
- hit = 1'b1;
- hit_addr = port0_hit ? {port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
- port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
- 0;
- master = port0_hit ? port0_data_o[3] :
- port1_hit ? port1_data_o[3] :
- 1'b0;
- end
-
- HIT : begin
-`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later.
- hit = 1'b1;
- hit_addr = hit_addr_saved;
- master = master_saved;
-`endif
- if (output_sent)
- hit_SN = SEARCH;
- end
-
- default : begin
- hit_SN = SEARCH;
- end
- endcase // case (hit_SP)
- end // always_comb begin
-
- //// ------------------------------------------- ////
-
- assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) :
- output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) :
- 1'b0;
-
- //// ------------------- Multi ------------------- ////
-`ifdef MULTI_HIT_FULL_SET
-
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- hit_addr_saved <= 0;
- master_saved <= 1'b0;
- end else if (output_valid) begin
- hit_addr_saved <= hit_addr;
- master_saved <= master;
- end
- end
-
- // FSM
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- multi_SP <= NO_HITS;
- end else begin
- multi_SP <= multi_SN;
- end
- end
-
- always_comb begin
- multi_SN = multi_SP;
- multi_hit = 1'b0;
- unique case(multi_SP)
- NO_HITS :
- if(output_valid && (port0_hit && port1_hit)) begin
- multi_SN = MULTI_HIT;
- multi_hit = 1'b1;
- end else if(output_valid && (port0_hit || port1_hit))
- multi_SN = ONE_HIT;
-
- ONE_HIT :
- if(output_valid && (port0_hit || port1_hit)) begin
- multi_SN = MULTI_HIT;
- multi_hit = 1'b1;
- end else if (output_sent)
- multi_SN = NO_HITS;
-
- MULTI_HIT : begin
- multi_hit = 1'b1;
- if (output_sent)
- multi_SN = NO_HITS;
- end
-
- endcase // case (multi_SP)
- end // always_comb begin
-
-`else // !`ifdef MULTI_HIT_FULL_SET
- assign multi_hit = output_valid && port0_hit && port1_hit;
-`endif // !`ifdef MULTI_HIT_FULL_SET
- //// ------------------------------------------- ////
-"""
-# endmodule
-#
-#
+++ /dev/null
-class CoreConfig:
- def __init__(self):
- self.N_SLICES = 16
- self.N_REGS = 4*self.N_SLICES
- self.ADDR_WIDTH_PHYS = 40
- self.ADDR_WIDTH_VIRT = 32
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class fsm(Elaboratable):
-
- def __init__(self):
- self.Clk_CI = Signal() # input
- self.Rst_RBI = Signal() # input
- self.port1_addr_valid_i = Signal() # input
- self.port2_addr_valid_i = Signal() # input
- self.port1_sent_i = Signal() # input
- self.port2_sent_i = Signal() # input
- self.select_i = Signal() # input
- self.no_hit_i = Signal() # input
- self.multi_hit_i = Signal() # input
- self.no_prot_i = Signal() # input
- self.prefetch_i = Signal() # input
- self.out_addr_i = Signal(AXI_M_ADDR_WIDTH) # input
- self.cache_coherent_i = Signal() # input
- self.port1_accept_o = Signal() # output
- self.port1_drop_o = Signal() # output
- self.port1_miss_o = Signal() # output
- self.port2_accept_o = Signal() # output
- self.port2_drop_o = Signal() # output
- self.port2_miss_o = Signal() # output
- self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output
- self.cache_coherent_o = Signal() # output
- self.miss_o = Signal() # output
- self.multi_o = Signal() # output
- self.prot_o = Signal() # output
- self.prefetch_o = Signal() # output
- self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input
- self.in_id_i = Signal(AXI_ID_WIDTH) # input
- self.in_len_i = Signal(8) # input
- self.in_user_i = Signal(AXI_USER_WIDTH) # input
- self.in_addr_o = Signal(AXI_S_ADDR_WIDTH) # output
- self.in_id_o = Signal(AXI_ID_WIDTH) # output
- self.in_len_o = Signal(8) # output
- self.in_user_o = Signal(AXI_USER_WIDTH) # output
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //`timescale 1ns / 1ps
-#
-# module fsm
-# #(
-# parameter AXI_M_ADDR_WIDTH = 40,
-# parameter AXI_S_ADDR_WIDTH = 32,
-# parameter AXI_ID_WIDTH = 8,
-# parameter AXI_USER_WIDTH = 6
-# )
-# (
-# input logic Clk_CI,
-# input logic Rst_RBI,
-#
-# input logic port1_addr_valid_i,
-# input logic port2_addr_valid_i,
-# input logic port1_sent_i,
-# input logic port2_sent_i,
-# input logic select_i,
-# input logic no_hit_i,
-# input logic multi_hit_i,
-# input logic no_prot_i,
-# input logic prefetch_i,
-# input logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i,
-# input logic cache_coherent_i,
-# output logic port1_accept_o,
-# output logic port1_drop_o,
-# output logic port1_miss_o,
-# output logic port2_accept_o,
-# output logic port2_drop_o,
-# output logic port2_miss_o,
-# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o,
-# output logic cache_coherent_o,
-# output logic miss_o,
-# output logic multi_o,
-# output logic prot_o,
-# output logic prefetch_o,
-# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
-# input logic [AXI_ID_WIDTH-1:0] in_id_i,
-# input logic [7:0] in_len_i,
-# input logic [AXI_USER_WIDTH-1:0] in_user_i,
-# output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o,
-# output logic [AXI_ID_WIDTH-1:0] in_id_o,
-# output logic [7:0] in_len_o,
-# output logic [AXI_USER_WIDTH-1:0] in_user_o
-# );
-#
-""" #docstring_begin
-
- //-------------Internal Signals----------------------
-
- typedef enum logic {IDLE, WAIT} state_t;
- logic state_SP; // Present state
- logic state_SN; // Next State
-
- logic port1_accept_SN;
- logic port1_drop_SN;
- logic port1_miss_SN;
- logic port2_accept_SN;
- logic port2_drop_SN;
- logic port2_miss_SN;
- logic miss_SN;
- logic multi_SN;
- logic prot_SN;
- logic prefetch_SN;
- logic cache_coherent_SN;
- logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN;
-
- logic out_reg_en_S;
-
- //----------FSM comb------------------------------
-
- always_comb begin: FSM_COMBO
- state_SN = state_SP;
-
- port1_accept_SN = 1'b0;
- port1_drop_SN = 1'b0;
- port1_miss_SN = 1'b0;
- port2_accept_SN = 1'b0;
- port2_drop_SN = 1'b0;
- port2_miss_SN = 1'b0;
- miss_SN = 1'b0;
- multi_SN = 1'b0;
- prot_SN = 1'b0;
- prefetch_SN = 1'b0;
- cache_coherent_SN = 1'b0;
- out_addr_DN = '0;
-
- out_reg_en_S = 1'b0; // by default hold register output
-
- unique case(state_SP)
- IDLE :
- if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin
- out_reg_en_S = 1'b1;
- state_SN = WAIT;
-
- // Select inputs for output registers
- if (port1_addr_valid_i & select_i) begin
- port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
- port1_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
- port1_miss_SN = no_hit_i;
- port2_accept_SN = 1'b0;
- port2_drop_SN = 1'b0;
- port2_miss_SN = 1'b0;
- end else if (port2_addr_valid_i & ~select_i) begin
- port1_accept_SN = 1'b0;
- port1_drop_SN = 1'b0;
- port1_miss_SN = 1'b0;
- port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
- port2_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
- port2_miss_SN = no_hit_i;
- end
-
- miss_SN = port1_miss_SN | port2_miss_SN;
- multi_SN = multi_hit_i;
- prot_SN = ~no_prot_i;
- prefetch_SN = ~no_hit_i & prefetch_i;
-
- cache_coherent_SN = cache_coherent_i;
- out_addr_DN = out_addr_i;
- end
-
- WAIT :
- if ( port1_sent_i | port2_sent_i ) begin
- out_reg_en_S = 1'b1; // "clear" the register
- state_SN = IDLE;
- end
-
- default : begin
- state_SN = IDLE;
- end
- endcase
- end
-
- //----------FSM seq-------------------------------
-
- always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ
- if (Rst_RBI == 1'b0)
- state_SP <= IDLE;
- else
- state_SP <= state_SN;
- end
-
- //----------Output seq--------------------------
-
- always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ
- if (Rst_RBI == 1'b0) begin
- port1_accept_o = 1'b0;
- port1_drop_o = 1'b0;
- port1_miss_o = 1'b0;
- port2_accept_o = 1'b0;
- port2_drop_o = 1'b0;
- port2_miss_o = 1'b0;
- miss_o = 1'b0;
- multi_o = 1'b0;
- prot_o = 1'b0;
- prefetch_o = 1'b0;
- cache_coherent_o = 1'b0;
- out_addr_o = '0;
- in_addr_o = '0;
- in_id_o = '0;
- in_len_o = '0;
- in_user_o = '0;
- end else if (out_reg_en_S == 1'b1) begin
- port1_accept_o = port1_accept_SN;
- port1_drop_o = port1_drop_SN;
- port1_miss_o = port1_miss_SN;
- port2_accept_o = port2_accept_SN;
- port2_drop_o = port2_drop_SN;
- port2_miss_o = port2_miss_SN;
- miss_o = miss_SN;
- multi_o = multi_SN;
- prot_o = prot_SN;
- prefetch_o = prefetch_SN;
- cache_coherent_o = cache_coherent_SN;
- out_addr_o = out_addr_DN;
- in_addr_o = in_addr_i;
- in_id_o = in_id_i;
- in_len_o = in_len_i;
- in_user_o = in_user_i;
- end
- end // block: OUTPUT_SEQ
-"""
-#
-# endmodule
-#
-#
+++ /dev/null
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class l2_tlb(Elaboratable):
-
- def __init__(self):
- self.clk_i = Signal() # input
- self.rst_ni = Signal() # input
- self.we_i = Signal() # input
- self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH) # input
- self.wdata_i = Signal(AXI_LITE_DATA_WIDTH) # input
- self.start_i = Signal() # input
- self.busy_o = Signal() # output
- self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input
- self.rw_type_i = Signal() # input
- self.out_ready_i = Signal() # input
- self.out_valid_o = Signal() # output
- self.hit_o = Signal() # output
- self.miss_o = Signal() # output
- self.prot_o = Signal() # output
- self.multi_o = Signal() # output
- self.cache_coherent_o = Signal() # output
- self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# //`define MULTI_HIT_FULL_SET // Enable full multi hit detection. Always the entire set is searched.
-# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected.
-#
-# //`ifdef MULTI_HIT_FULL_SET
-# // `ifndef MULTI_HIT_CUR_CYCLE
-# // `define MULTI_HIT_CUR_CYCLE
-# // `endif
-# //`endif
-#
-# module l2_tlb
-# //#(
-# // parameter AXI_S_ADDR_WIDTH = 32,
-# // parameter AXI_M_ADDR_WIDTH = 40,
-# // parameter AXI_LITE_DATA_WIDTH = 64,
-# // parameter AXI_LITE_ADDR_WIDTH = 32,
-# // parameter N_SETS = 32,
-# // parameter N_OFFSETS = 4, //per port. There are 2 ports.
-# // parameter PAGE_SIZE = 4096, // 4kB
-# // parameter N_PAR_VA_RAMS = 4,
-# // parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH
-# // )
-# (
-# input logic clk_i,
-# input logic rst_ni,
-#
-# input logic we_i,
-# input logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i,
-# input logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i,
-#
-# input logic start_i,
-# output logic busy_o,
-# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
-# input logic rw_type_i, //1 => write, 0=> read
-#
-# input logic out_ready_i,
-# output logic out_valid_o,
-# output logic hit_o,
-# output logic miss_o,
-# output logic prot_o,
-# output logic multi_o,
-# output logic cache_coherent_o,
-# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o
-# );
-#
-""" #docstring_begin
-
- localparam VA_RAM_DEPTH = N_SETS * N_OFFSETS * 2;
- localparam PA_RAM_DEPTH = VA_RAM_DEPTH * N_PAR_VA_RAMS;
- localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH);
- localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH);
- localparam SET_WIDTH = log2(N_SETS);
- localparam OFFSET_WIDTH = log2(N_OFFSETS);
- localparam LL_WIDTH = log2(N_PAR_VA_RAMS);
- localparam IGNORE_LSB = log2(PAGE_SIZE);
-
- localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4;
- localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB;
-
- logic [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent;
- logic [N_PAR_VA_RAMS-1:0] ram_we;
- logic last_search, last_search_next;
- logic first_search, first_search_next;
- logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr;
- logic [N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr;
- logic pa_ram_we;
- logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr;
- logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE;
- logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr
- logic [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data
- logic pa_ram_store_data_SN, pa_ram_store_data_SP;
- logic hit_top, prot_top, multi_hit_top, first_hit_top;
- logic output_sent;
- int hit_block_num;
-
- logic searching, search_done;
- logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr
- logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr
- logic [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d;
- logic [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr;
- logic [SET_WIDTH-1:0] set_num;
-
- logic va_output_valid;
- logic searching_q;
-
- genvar z;
-
- // Search FSM
- typedef enum logic [1:0] {IDLE, SEARCH, DONE} search_state_t;
- search_state_t search_SP; // Present state
- search_state_t search_SN; // Next State
-
- // Output FSM
- typedef enum logic [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t;
- out_state_t out_SP; // Present state
- out_state_t out_SN; // Next State
-
- logic miss_next;
- logic hit_next;
- logic prot_next;
- logic multi_next;
- logic cache_coherent_next;
-
- // Generate the VA Block rams and their surrounding logic
- generate
- for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS
- check_ram
- #(
- .ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
- .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ),
- .PAGE_SIZE ( PAGE_SIZE ),
- .SET_WIDTH ( SET_WIDTH ),
- .OFFSET_WIDTH ( OFFSET_WIDTH )
- )
- u_check_ram
- (
- .clk_i ( clk_i ),
- .rst_ni ( rst_ni ),
- .in_addr ( in_addr_i ),
- .rw_type ( rw_type_i ),
- .ram_we ( ram_we[z] ),
- .port0_addr ( port0_addr ),
- .port1_addr ( port1_addr ),
- .ram_wdata ( wdata_i[VA_RAM_DATA_WIDTH-1:0] ),
- .output_sent ( output_sent ),
- .output_valid ( va_output_valid ),
- .offset_addr_d ( offset_addr_d ),
- .hit_addr ( hit_addr[z] ),
- .master ( cache_coherent[z] ),
- .hit ( hit[z] ),
- .multi_hit ( multi_hit[z] ),
- .prot ( prot[z] )
- );
- end // for (z = 0; z < N_PORTS; z++)
- endgenerate
-
- ////////////////// ---------------- Control and Address --------------- ////////////////////////
- // FSM
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- search_SP <= IDLE;
- end else begin
- search_SP <= search_SN;
- end
- end
-
- always_comb begin : SEARCH_FSM
- search_SN = search_SP;
- busy_o = 1'b0;
- searching = 1'b0;
- search_done = 1'b0;
- last_search_next = 1'b0;
- first_search_next = first_search;
-
- unique case (search_SP)
- IDLE : begin
- if (start_i) begin
- search_SN = SEARCH;
- first_search_next = 1'b1;
- end
- end
-
- SEARCH : begin
- busy_o = 1'b1;
-
- // detect last search cycle
- if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) )
- last_search_next = 1'b1;
-
- // pause search during VA RAM reconfigration
- if (|ram_we) begin
- searching = 1'b0;
- end else begin
- searching = 1'b1;
- first_search_next = 1'b0;
- end
-
- if (va_output_valid) begin
- // stop search
-`ifdef MULTI_HIT_FULL_SET
- if (last_search | prot_top | multi_hit_top) begin
-`else
- if (last_search | prot_top | multi_hit_top | hit_top ) begin
-`endif
- search_SN = DONE;
- search_done = 1'b1;
- end
- end
- end
-
- DONE : begin
- busy_o = 1'b1;
- if (out_valid_o & out_ready_i)
- search_SN = IDLE;
- end
-
- default : begin
- search_SN = IDLE;
- end
- endcase // case (prot_SP)
- end // always_comb begin
-
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- last_search <= 1'b0;
- first_search <= 1'b0;
- end else begin
- last_search <= last_search_next;
- first_search <= first_search_next;
- end
- end
-
- /*
- * VA RAM address generation
- *
- * The input address and set number, and thus the offset start address, are available in the
- * cycle after the start signal. The buffered offset_addr becomes available one cycle later.
- * During the first search cycle, we therefore directly use offset_addr_start for the lookup.
- */
- assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB];
-
- assign port0_raddr[OFFSET_WIDTH] = 1'b0;
- assign port1_addr [OFFSET_WIDTH] = 1'b1;
-
- assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
- assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
-
- assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
- assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
-
- assign port0_addr = ram_we ? ram_waddr : port0_raddr;
-
- // The outputs of the BRAMs are only valid if in the previous cycle:
- // 1. the inputs were valid, and
- // 2. the BRAMs were not written to.
- // Otherwise, the outputs must be ignored.
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- searching_q <= 1'b0;
- end else begin
- searching_q <= searching;
- end
- end
- assign va_output_valid = searching_q;
-
- // Address offset for looking up the VA RAMs
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- offset_addr <= 0;
- end else if (first_search) begin
- offset_addr <= offset_start_addr + 1'b1;
- end else if (searching) begin
- offset_addr <= offset_addr + 1'b1;
- end
- end
-
- // Delayed address offest for looking up the PA RAM upon a hit in the VA RAMs
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- offset_addr_d <= 0;
- end else if (first_search) begin
- offset_addr_d <= offset_start_addr;
- end else if (searching) begin
- offset_addr_d <= offset_addr_d + 1'b1;
- end
- end
-
- // Store the offset addr for hit to reduce latency for next search.
- generate
- if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE
-`ifndef MULTI_HIT_FULL_SET
- logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET.
- logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_reg;
-
- assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} };
- assign offset_end_addr = hit_offset_addr[set_num]-1'b1;
-
- // Register the hit addr
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- hit_addr_reg <= 0;
- end else if (hit_top) begin
- hit_addr_reg <= hit_addr[hit_block_num];
- end
- end
-
- // Store hit addr for each set. The next search in the same set will start from the saved addr.
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- hit_offset_addr <= 0;
- end else if (hit_o) begin
- hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)];
- end
- end
-`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched.
- assign offset_start_addr = 0;
- assign offset_end_addr = {OFFSET_WIDTH{1'b1}};
-`endif
- end else begin // if (HIT_OFFSET_STORE_WIDTH > 0)
- assign offset_start_addr = 0;
- assign offset_end_addr = {OFFSET_WIDTH{1'b1}};
- end
- endgenerate
-
- assign prot_top = |prot;
-
- //////////////////////////////////////////////////////////////////////////////////////
- // check for hit, multi hit
- // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit.
- // In case of a multi hit in the same VA RAM, Port 0 is given priority.
- always_comb begin : HIT_CHECK
- hit_top = |hit;
- hit_block_num = 0;
- first_hit_top = 1'b0;
- multi_hit_top = 1'b0;
- for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin
- if (hit[i] == 1'b1) begin
-`ifdef MULTI_HIT_CUR_CYCLE
- if (multi_hit[i] | first_hit_top ) begin
- multi_hit_top = 1'b1;
- end
-`endif
- first_hit_top = 1'b1;
- hit_block_num = i;
- end
- end // for (int i=0; i<N_PAR_VA_RAMS; i++)
- end // always_comb begin
-
- ///////////////////// ------------- Outputs ------------ //////////////////////////////////
- //// FSM
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- out_SP <= OUT_IDLE;
- pa_ram_store_data_SP <= 1'b0;
- pa_port0_raddr_reg_SP <= 'b0;
- end else begin
- out_SP <= out_SN;
- pa_ram_store_data_SP <= pa_ram_store_data_SN;
- pa_port0_raddr_reg_SP <= pa_port0_raddr_reg_SN;
- end
- end
-
- always_comb begin : OUTPUT_FSM
- out_SN = out_SP;
-
- miss_next = miss_o;
- prot_next = prot_o;
- multi_next = multi_o;
- hit_next = hit_o;
- cache_coherent_next = cache_coherent_o;
- pa_port0_raddr_reg_SN = pa_port0_raddr_reg_SP;
-
- pa_port0_raddr = 'b0;
- pa_ram_store_data_SN = 1'b0;
-
- out_valid_o = 1'b0;
- output_sent = 1'b0;
-
- unique case (out_SP)
- OUT_IDLE : begin
- hit_next = 1'b0;
- miss_next = 1'b0;
- prot_next = 1'b0;
- multi_next = 1'b0;
- cache_coherent_next = 1'b0;
-
- // abort transaction
- if ((search_done & ~hit_top) | prot_top | multi_hit_top) begin
- out_SN = SEND_OUTPUT;
-
- if (search_done & ~hit_top) begin
- miss_next = 1'b1;
- end
- if (prot_top) begin
- prot_next = 1'b1;
- hit_next = 1'b1;
- end
- if (multi_hit_top) begin
- multi_next = 1'b1;
- hit_next = 1'b1;
- end
-
- // read PA RAM
- end else if (search_done & hit_top) begin
- hit_next = 1'b1;
- cache_coherent_next = cache_coherent[hit_block_num];
- pa_port0_raddr = (N_PAR_VA_RAMS * hit_addr[hit_block_num]) + hit_block_num;
- pa_port0_raddr_reg_SN = pa_port0_raddr;
-
- // read PA RAM now
- if (~pa_ram_we) begin
- out_SN = SEND_OUTPUT;
- pa_ram_store_data_SN = 1'b1;
-
- // read PA RAM after PA RAM reconfiguration
- end else begin // pa_ram_we
- out_SN = WAIT_ON_WRITE;
-
- end
- end
- end
-
- WAIT_ON_WRITE : begin
- if ( ~pa_ram_we ) begin
- out_SN = SEND_OUTPUT;
- pa_port0_raddr = pa_port0_raddr_reg_SP;
- pa_ram_store_data_SN = 1'b1;
- end
- end
-
- SEND_OUTPUT : begin
- out_valid_o = 1'b1;
- if (out_ready_i) begin
- out_SN = OUT_IDLE;
- output_sent = 1'b1;
- end
- end
-
- default : begin
- out_SN = OUT_IDLE;
- end
-
- endcase // case (out_SP)
- end // always_comb begin
-
- //// Output signals
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- miss_o <= 1'b0;
- prot_o <= 1'b0;
- multi_o <= 1'b0;
- hit_o <= 1'b0;
- cache_coherent_o <= 1'b0;
- end else begin
- miss_o <= miss_next;
- prot_o <= prot_next;
- multi_o <= multi_next;
- hit_o <= hit_next;
- cache_coherent_o <= cache_coherent_next;
- end
- end
-
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
- ///////////////////// --------------- Physical Address -------------- ////////////////////////////
-
- /// PA Block RAM
- ram_tp_no_change #(
- .ADDR_WIDTH( PA_RAM_ADDR_WIDTH ),
- .DATA_WIDTH( PA_RAM_DATA_WIDTH )
- )
- pa_ram
- (
- .clk ( clk_i ),
- .we ( pa_ram_we ),
- .addr0 ( pa_port0_addr ),
- .addr1 ( '0 ),
- .d_i ( wdata_i[PA_RAM_DATA_WIDTH-1:0] ),
- .d0_o ( pa_port0_data ),
- .d1_o ( )
- );
-
- assign out_addr_o[IGNORE_LSB-1:0] = in_addr_i[IGNORE_LSB-1:0];
- assign out_addr_o[AXI_M_ADDR_WIDTH-1:IGNORE_LSB] = pa_data;
-
- always_ff @(posedge clk_i) begin
- if (rst_ni == 0) begin
- pa_port0_data_reg <= 0;
- end else if (pa_ram_store_data_SP) begin
- pa_port0_data_reg <= pa_port0_data;
- end
- end
-
- assign pa_data = pa_ram_store_data_SP ? pa_port0_data : pa_port0_data_reg;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-///// Write enable for all block rams
-generate if (LL_WIDTH != 0) begin
- always_comb begin
- var reg[LL_WIDTH:0] para;
- var int para_int;
- for (para = 0; para < N_PAR_VA_RAMS; para=para+1'b1) begin
- para_int = int'(para);
- ram_we[para_int] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0) && (waddr_i[LL_WIDTH-1:0] == para);
- end
- end
-end else begin
- assign ram_we[0] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0);
-end
-
-endgenerate
-
-// Addresses are word, not byte addresses
-assign pa_ram_we = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b1); //waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] will be 0 for all VA writes and 1 for all PA writes
-assign ram_waddr = waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH-1:LL_WIDTH];
-assign pa_port0_waddr = waddr_i[PA_RAM_ADDR_WIDTH-1:0];
-assign pa_port0_addr = pa_ram_we ? pa_port0_waddr : pa_port0_raddr;
-
-"""
-# endmodule
-#
-# // vim: ts=3 sw=3 sts=3 et nosmartindent autoindent foldmethod=marker tw=100
-#
-#
+++ /dev/null
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# //`define MY_ARRAY_SUM(MY_ARRAY,ARRAY_SIZE) ( (ARRAY_SIZE==1) ? MY_ARRAY[0] : (ARRAY_SIZE==2) ? MY_ARRAY[0] + MY_ARRAY[1] : (ARRAY_SIZE==3) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] : (ARRAY_SIZE==4) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] + MY_ARRAY[3] : 0 )
-#
-
-# module rab_core
-# #(
-# parameter N_PORTS = 3,
-# parameter N_L2_SETS = 32,
-# parameter N_L2_SET_ENTRIES = 32,
-# parameter AXI_DATA_WIDTH = 64,
-# parameter AXI_S_ADDR_WIDTH = 32,
-# parameter AXI_M_ADDR_WIDTH = 40,
-# parameter AXI_LITE_DATA_WIDTH = 64,
-# parameter AXI_LITE_ADDR_WIDTH = 32,
-# parameter AXI_ID_WIDTH = 8,
-# parameter AXI_USER_WIDTH = 6,
-# parameter MH_FIFO_DEPTH = 16
-# )
-# (
-# input logic Clk_CI,
-# input logic Rst_RBI,
-#
-# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi_awaddr,
-# input logic s_axi_awvalid,
-# output logic s_axi_awready,
-#
-# input logic [AXI_LITE_DATA_WIDTH-1:0] s_axi_wdata,
-# input logic [AXI_LITE_DATA_WIDTH/8-1:0] s_axi_wstrb,
-# input logic s_axi_wvalid,
-# output logic s_axi_wready,
-#
-# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi_araddr,
-# input logic s_axi_arvalid,
-# output logic s_axi_arready,
-#
-# input logic s_axi_rready,
-# output logic [AXI_LITE_DATA_WIDTH-1:0] s_axi_rdata,
-# output logic [1:0] s_axi_rresp,
-# output logic s_axi_rvalid,
-#
-# output logic [1:0] s_axi_bresp,
-# output logic s_axi_bvalid,
-# input logic s_axi_bready,
-#
-# output logic [N_PORTS-1:0] int_miss,
-# output logic [N_PORTS-1:0] int_prot,
-# output logic [N_PORTS-1:0] int_multi,
-# output logic [N_PORTS-1:0] int_prefetch,
-# output logic int_mhf_full,
-#
-# output logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_axaddr_o,
-# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_axid_o,
-# output logic [N_PORTS-1:0] [7:0] int_axlen_o,
-# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_axuser_o,
-#
-# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] port1_addr,
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] port1_id,
-# input logic [N_PORTS-1:0] [7:0] port1_len,
-# input logic [N_PORTS-1:0] [2:0] port1_size,
-# input logic [N_PORTS-1:0] port1_addr_valid,
-# input logic [N_PORTS-1:0] port1_type,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] port1_user,
-# input logic [N_PORTS-1:0] port1_sent,
-# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] port1_out_addr,
-# output logic [N_PORTS-1:0] port1_cache_coherent,
-# output logic [N_PORTS-1:0] port1_accept,
-# output logic [N_PORTS-1:0] port1_drop,
-# output logic [N_PORTS-1:0] port1_miss,
-#
-# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] port2_addr,
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] port2_id,
-# input logic [N_PORTS-1:0] [7:0] port2_len,
-# input logic [N_PORTS-1:0] [2:0] port2_size,
-# input logic [N_PORTS-1:0] port2_addr_valid,
-# input logic [N_PORTS-1:0] port2_type,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] port2_user,
-# input logic [N_PORTS-1:0] port2_sent,
-# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] port2_out_addr,
-# output logic [N_PORTS-1:0] port2_cache_coherent,
-# output logic [N_PORTS-1:0] port2_accept,
-# output logic [N_PORTS-1:0] port2_drop,
-# output logic [N_PORTS-1:0] port2_miss,
-#
-# input logic [N_PORTS-1:0] miss_l2_i,
-# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] miss_l2_addr_i,
-# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] miss_l2_id_i,
-# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] miss_l2_user_i,
-#
-# output logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] wdata_l2_o,
-# output logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] waddr_l2_o,
-# output logic [N_PORTS-1:0] wren_l2_o
-# );
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class rab_core(Elaboratable):
-
- def __init__(self):
- self.s_axi_awaddr = Signal(AXI_LITE_ADDR_WIDTH) # input
- self.s_axi_awvalid = Signal() # input
- self.s_axi_awready = Signal() # output
- self.s_axi_wdata = Signal(AXI_LITE_DATA_WIDTH) # input
- self.s_axi_wstrb = Signal(FIXME) # input
- self.s_axi_wvalid = Signal() # input
- self.s_axi_wready = Signal() # output
- self.s_axi_araddr = Signal(AXI_LITE_ADDR_WIDTH) # input
- self.s_axi_arvalid = Signal() # input
- self.s_axi_arready = Signal() # output
- self.s_axi_rready = Signal() # input
- self.s_axi_rdata = Signal(AXI_LITE_DATA_WIDTH) # output
- self.s_axi_rresp = Signal(2) # output
- self.s_axi_rvalid = Signal() # output
- self.s_axi_bresp = Signal(2) # output
- self.s_axi_bvalid = Signal() # output
- self.s_axi_bready = Signal() # input
- self.int_miss = Signal(N_PORTS) # output
- self.int_prot = Signal(N_PORTS) # output
- self.int_multi = Signal(N_PORTS) # output
- self.int_prefetch = Signal(N_PORTS) # output
- self.int_mhf_full = Signal() # output
- self.int_axaddr_o = Signal() # output
- self.int_axid_o = Signal() # output
- self.int_axlen_o = Signal() # output
- self.int_axuser_o = Signal() # output
- self.port1_addr = Signal() # input
- self.port1_id = Signal() # input
- self.port1_len = Signal() # input
- self.port1_size = Signal() # input
- self.port1_addr_valid = Signal(N_PORTS) # input
- self.port1_type = Signal(N_PORTS) # input
- self.port1_user = Signal() # input
- self.port1_sent = Signal(N_PORTS) # input
- self.port1_out_addr = Signal() # output
- self.port1_cache_coherent = Signal(N_PORTS) # output
- self.port1_accept = Signal(N_PORTS) # output
- self.port1_drop = Signal(N_PORTS) # output
- self.port1_miss = Signal(N_PORTS) # output
- self.port2_addr = Signal() # input
- self.port2_id = Signal() # input
- self.port2_len = Signal() # input
- self.port2_size = Signal() # input
- self.port2_addr_valid = Signal(N_PORTS) # input
- self.port2_type = Signal(N_PORTS) # input
- self.port2_user = Signal() # input
- self.port2_sent = Signal(N_PORTS) # input
- self.port2_out_addr = Signal() # output
- self.port2_cache_coherent = Signal(N_PORTS) # output
- self.port2_accept = Signal(N_PORTS) # output
- self.port2_drop = Signal(N_PORTS) # output
- self.port2_miss = Signal(N_PORTS) # output
- self.miss_l2_i = Signal(N_PORTS) # input
- self.miss_l2_addr_i = Signal() # input
- self.miss_l2_id_i = Signal() # input
- self.miss_l2_user_i = Signal() # input
- self.wdata_l2_o = Signal() # output
- self.waddr_l2_o = Signal() # output
- self.wren_l2_o = Signal(N_PORTS) # output
-
- def elaborate(self, platform=None):
- m = Module()
- return m
-
-
-"""
-
-
- // ███████╗██╗ ██████╗ ███╗ ██╗ █████╗ ██╗ ███████╗
- // ██╔════╝██║██╔════╝ ████╗ ██║██╔══██╗██║ ██╔════╝
- // ███████╗██║██║ ███╗██╔██╗ ██║███████║██║ ███████╗
- // ╚════██║██║██║ ██║██║╚██╗██║██╔══██║██║ ╚════██║
- // ███████║██║╚██████╔╝██║ ╚████║██║ ██║███████╗███████║
- // ╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚══════╝
- // signals
-
- localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
-
- localparam integer N_SLICES[N_PORTS-1:0] = `N_SLICES_ARRAY;
- localparam N_SLICES_TOT = `MY_ARRAY_SUM(N_SLICES,N_PORTS);
- localparam N_SLICES_MAX = `N_SLICES_MAX;
-
- localparam N_REGS = 4*N_SLICES_TOT + 4;
- localparam AXI_SIZE_WIDTH = log2(AXI_DATA_WIDTH/8);
-
- localparam PORT_ID_WIDTH = (N_PORTS < 2) ? 1 : log2(N_PORTS);
- localparam MISS_META_WIDTH = PORT_ID_WIDTH + AXI_USER_WIDTH + AXI_ID_WIDTH;
-
- logic [N_PORTS-1:0] [15:0] p1_burst_size;
- logic [N_PORTS-1:0] [15:0] p2_burst_size;
-
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p1_align_addr;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p2_align_addr;
-
- logic [N_PORTS-1:0] [AXI_SIZE_WIDTH-1:0] p1_mask;
- logic [N_PORTS-1:0] [AXI_SIZE_WIDTH-1:0] p2_mask;
-
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p1_max_addr;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p2_max_addr;
-
- logic [N_PORTS-1:0] p1_prefetch;
- logic [N_PORTS-1:0] p2_prefetch;
-
- logic [N_PORTS-1:0] int_rw;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_addr_min;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_addr_max;
- logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_id;
- logic [N_PORTS-1:0] [7:0] int_len;
- logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_user;
-
- logic [N_PORTS-1:0] hit;
- logic [N_PORTS-1:0] prot;
- logic [N_PORTS-1:0] prefetch;
-
- logic [N_PORTS-1:0] no_hit;
- logic [N_PORTS-1:0] no_prot;
-
- logic [N_PORTS-1:0] [N_SLICES_MAX-1:0] hit_slices;
- logic [N_PORTS-1:0] [N_SLICES_MAX-1:0] prot_slices;
-
- logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] out_addr;
- logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] out_addr_reg;
-
- logic [N_PORTS-1:0] cache_coherent;
- logic [N_PORTS-1:0] cache_coherent_reg;
-
- logic [N_PORTS-1:0] select;
- reg [N_PORTS-1:0] curr_priority;
-
- reg [N_PORTS-1:0] multi_hit;
-
- logic [N_PORTS-1:0] miss_valid_mhf;
- logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] miss_addr_mhf;
- logic [N_PORTS-1:0] [MISS_META_WIDTH-1:0] miss_meta_mhf;
-
- logic [N_REGS-1:0] [63:0] int_cfg_regs;
- logic [N_PORTS-1:0] [4*N_SLICES_MAX-1:0] [63:0] int_cfg_regs_slices;
-
- logic L1AllowMultiHit_S;
-
- genvar z;
-
- // █████╗ ███████╗███████╗██╗ ██████╗ ███╗ ██╗███╗ ███╗███████╗███╗ ██╗████████╗███████╗
- // ██╔══██╗██╔════╝██╔════╝██║██╔════╝ ████╗ ██║████╗ ████║██╔════╝████╗ ██║╚══██╔══╝██╔════╝
- // ███████║███████╗███████╗██║██║ ███╗██╔██╗ ██║██╔████╔██║█████╗ ██╔██╗ ██║ ██║ ███████╗
- // ██╔══██║╚════██║╚════██║██║██║ ██║██║╚██╗██║██║╚██╔╝██║██╔══╝ ██║╚██╗██║ ██║ ╚════██║
- // ██║ ██║███████║███████║██║╚██████╔╝██║ ╚████║██║ ╚═╝ ██║███████╗██║ ╚████║ ██║ ███████║
- // ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝ ╚═╝ ╚══════╝
- // assignments
-
- always_comb
- begin : PORT_SELECT
- var integer idx;
-
- for (idx=0; idx<N_PORTS; idx++) begin
-
- // select = 1 -> port1 active
- // select = 0 -> port2 active
- select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx];
-
- p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx];
- p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx];
-
- // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary
- if (port1_size[idx] == 3'b001)
- p1_mask[idx] = 3'b110;
- else if (port1_size[idx] == 3'b010)
- p1_mask[idx] = 3'b100;
- else if (port1_size[idx] == 3'b011)
- p1_mask[idx] = 3'b000;
- else
- p1_mask[idx] = 3'b111;
-
- p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
- p1_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx];
-
- if (port2_size[idx] == 3'b001)
- p2_mask[idx] = 3'b110;
- else if (port2_size[idx] == 3'b010)
- p2_mask[idx] = 3'b100;
- else if (port2_size[idx] == 3'b011)
- p2_mask[idx] = 3'b000;
- else
- p2_mask[idx] = 3'b111;
-
- if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}})
- p1_prefetch[idx] = 1'b1;
- else
- p1_prefetch[idx] = 1'b0;
-
- if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}})
- p2_prefetch[idx] = 1'b1;
- else
- p2_prefetch[idx] = 1'b0;
-
- p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
- p2_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx];
-
- p1_max_addr[idx] = p1_align_addr[idx] + p1_burst_size[idx] - 1;
- p2_max_addr[idx] = p2_align_addr[idx] + p2_burst_size[idx] - 1;
-
- int_addr_min[idx] = select[idx] ? port1_addr[idx] : port2_addr[idx];
- int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx];
- int_rw[idx] = select[idx] ? port1_type[idx] : port2_type[idx];
- int_id[idx] = select[idx] ? port1_id[idx] : port2_id[idx];
- int_len[idx] = select[idx] ? port1_len[idx] : port2_len[idx];
- int_user[idx] = select[idx] ? port1_user[idx] : port2_user[idx];
- prefetch[idx] = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx];
-
- hit [idx] = | hit_slices [idx];
- prot[idx] = | prot_slices[idx];
-
- no_hit [idx] = ~hit [idx];
- no_prot[idx] = ~prot[idx];
-
- port1_out_addr[idx] = out_addr_reg[idx];
- port2_out_addr[idx] = out_addr_reg[idx];
-
- port1_cache_coherent[idx] = cache_coherent_reg[idx];
- port2_cache_coherent[idx] = cache_coherent_reg[idx];
- end
- end
-
- always_comb
- begin
- var integer idx_port, idx_slice;
- var integer reg_num;
- reg_num=0;
- for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin
- for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin
- int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num];
- reg_num++;
- end
- // int_cfg_regs_slices[idx_port][N_SLICES_MAX:N_SLICES[idx_port]] will be dangling
- // Fix to zero. Synthesis will remove these signals.
- // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0;
- end
- end
-
- always @(posedge Clk_CI or negedge Rst_RBI)
- begin : PORT_PRIORITY
- var integer idx;
- if (Rst_RBI == 1'b0)
- curr_priority = 'h0;
- else begin
- for (idx=0; idx<N_PORTS; idx++) begin
- if (port1_accept[idx] || port1_drop[idx])
- curr_priority[idx] = 1'b1;
- else if (port2_accept[idx] || port2_drop[idx])
- curr_priority[idx] = 1'b0;
- end
- end
- end
-
- // find port that misses
- logic [PORT_ID_WIDTH-1:0] PortIdx_D; // index of the first missing port
- var integer idx_miss;
- always_comb begin : MHF_PORT_SELECT
- PortIdx_D = 'b0;
- for (idx_miss = 0; idx_miss < N_PORTS; idx_miss++) begin
- if (miss_valid_mhf[idx_miss] == 1'b1) begin
- PortIdx_D = idx_miss;
- break;
- end
- end
- end // always_comb begin
-
- // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ██████╗███████╗ ██████╗
- // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔════╝██╔════╝
- // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ █████╗ ██║ ███╗
- // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██╔══╝ ██║ ██║
- // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ╚██████╗██║ ╚██████╔╝
- // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝╚═╝ ╚═════╝
- axi_rab_cfg
- #(
- .N_PORTS ( N_PORTS ),
- .N_REGS ( N_REGS ),
- .N_L2_SETS ( N_L2_SETS ),
- .N_L2_SET_ENTRIES( N_L2_SET_ENTRIES ),
- .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
- .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH ),
- .N_FLAGS ( 4 ),
- .AXI_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
- .AXI_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
- .MISS_META_WIDTH ( MISS_META_WIDTH ),
- .MH_FIFO_DEPTH ( MH_FIFO_DEPTH )
- )
- u_axi_rab_cfg
- (
- .Clk_CI ( Clk_CI ),
- .Rst_RBI ( Rst_RBI ),
- .s_axi_awaddr ( s_axi_awaddr ),
- .s_axi_awvalid ( s_axi_awvalid ),
- .s_axi_wdata ( s_axi_wdata ),
- .s_axi_wstrb ( s_axi_wstrb ),
- .s_axi_wvalid ( s_axi_wvalid ),
- .s_axi_bready ( s_axi_bready ),
- .s_axi_araddr ( s_axi_araddr ),
- .s_axi_arvalid ( s_axi_arvalid ),
- .s_axi_rready ( s_axi_rready ),
- .s_axi_arready ( s_axi_arready ),
- .s_axi_rdata ( s_axi_rdata ),
- .s_axi_rresp ( s_axi_rresp ),
- .s_axi_rvalid ( s_axi_rvalid ),
- .s_axi_wready ( s_axi_wready ),
- .s_axi_bresp ( s_axi_bresp ),
- .s_axi_bvalid ( s_axi_bvalid ),
- .s_axi_awready ( s_axi_awready ),
- .L1Cfg_DO ( int_cfg_regs ),
- .L1AllowMultiHit_SO ( L1AllowMultiHit_S ),
- .MissAddr_DI ( miss_addr_mhf[PortIdx_D] ),
- .MissMeta_DI ( miss_meta_mhf[PortIdx_D] ),
- .Miss_SI ( miss_valid_mhf[PortIdx_D] ),
- .MhFifoFull_SO ( int_mhf_full ),
- .wdata_l2 ( wdata_l2_o ),
- .waddr_l2 ( waddr_l2_o ),
- .wren_l2 ( wren_l2_o )
- );
-
- generate for (z = 0; z < N_PORTS; z++) begin : MHF_TLB_SELECT
- if (ENABLE_L2TLB[z] == 1) begin // L2 TLB is enabled
- assign miss_valid_mhf[z] = miss_l2_i[z];
- assign miss_addr_mhf[z] = miss_l2_addr_i[z];
- assign miss_meta_mhf[z] = {miss_l2_user_i[z], PortIdx_D, miss_l2_id_i[z]};
- end else begin// L2 TLB is disabled
- assign miss_valid_mhf[z] = int_miss[z];
- assign miss_addr_mhf[z] = int_addr_min[z];
- assign miss_meta_mhf[z] = {int_user[z], PortIdx_D, int_id[z]};
- end
- end
- endgenerate
-
- // ███████╗██╗ ██╗ ██████╗███████╗ ████████╗ ██████╗ ██████╗
- // ██╔════╝██║ ██║██╔════╝██╔════╝ ╚══██╔══╝██╔═══██╗██╔══██╗
- // ███████╗██║ ██║██║ █████╗ ██║ ██║ ██║██████╔╝
- // ╚════██║██║ ██║██║ ██╔══╝ ██║ ██║ ██║██╔═══╝
- // ███████║███████╗██║╚██████╗███████╗ ██║ ╚██████╔╝██║
- // ╚══════╝╚══════╝╚═╝ ╚═════╝╚══════╝ ╚═╝ ╚═════╝ ╚═╝
- generate for (z = 0; z < N_PORTS; z++) begin : SLICE_TOP_GEN
- slice_top
- #(
- .N_SLICES ( N_SLICES[z] ),
- .N_REGS ( 4*N_SLICES[z] ),
- .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
- .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH )
- )
- u_slice_top
- (
- .int_cfg_regs ( int_cfg_regs_slices[z][4*N_SLICES[z]-1:0] ),
- .int_rw ( int_rw[z] ),
- .int_addr_min ( int_addr_min[z] ),
- .int_addr_max ( int_addr_max[z] ),
- .multi_hit_allow ( L1AllowMultiHit_S ),
- .multi_hit ( multi_hit[z] ),
- .prot ( prot_slices[z][N_SLICES[z]-1:0] ),
- .hit ( hit_slices [z][N_SLICES[z]-1:0] ),
- .cache_coherent ( cache_coherent[z] ),
- .out_addr ( out_addr[z] )
- );
- // hit_slices [N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
- // prot_slices[N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
- // Fix to zero. Synthesis will remove these signals.
- if ( N_SLICES[z] < N_SLICES_MAX ) begin
- assign hit_slices [z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
- assign prot_slices[z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
- end
- end // for (z = 0; z < N_PORTS; z++)
- endgenerate
-
- // ███████╗███████╗███╗ ███╗
- // ██╔════╝██╔════╝████╗ ████║
- // █████╗ ███████╗██╔████╔██║
- // ██╔══╝ ╚════██║██║╚██╔╝██║
- // ██║ ███████║██║ ╚═╝ ██║
- // ╚═╝ ╚══════╝╚═╝ ╚═╝
- //
- generate for (z = 0; z < N_PORTS; z++) begin : FSM_GEN
- fsm
- #(
- .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
- .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
- .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
- .AXI_USER_WIDTH ( AXI_USER_WIDTH )
- )
- u_fsm
- (
- .Clk_CI ( Clk_CI ),
- .Rst_RBI ( Rst_RBI ),
- .port1_addr_valid_i ( port1_addr_valid[z] ),
- .port2_addr_valid_i ( port2_addr_valid[z] ),
- .port1_sent_i ( port1_sent[z] ),
- .port2_sent_i ( port2_sent[z] ),
- .select_i ( select[z] ),
- .no_hit_i ( no_hit[z] ),
- .multi_hit_i ( multi_hit[z] ),
- .no_prot_i ( no_prot[z] ),
- .prefetch_i ( prefetch[z] ),
- .out_addr_i ( out_addr[z] ),
- .cache_coherent_i ( cache_coherent[z] ),
- .port1_accept_o ( port1_accept[z] ),
- .port1_drop_o ( port1_drop[z] ),
- .port1_miss_o ( port1_miss[z] ),
- .port2_accept_o ( port2_accept[z] ),
- .port2_drop_o ( port2_drop[z] ),
- .port2_miss_o ( port2_miss[z] ),
- .out_addr_o ( out_addr_reg[z] ),
- .cache_coherent_o ( cache_coherent_reg[z] ),
- .miss_o ( int_miss[z] ),
- .multi_o ( int_multi[z] ),
- .prot_o ( int_prot[z] ),
- .prefetch_o ( int_prefetch[z] ),
- .in_addr_i ( int_addr_min[z] ),
- .in_id_i ( int_id[z] ),
- .in_len_i ( int_len[z] ),
- .in_user_i ( int_user[z] ),
- .in_addr_o ( int_axaddr_o[z] ),
- .in_id_o ( int_axid_o[z] ),
- .in_len_o ( int_axlen_o[z] ),
- .in_user_o ( int_axuser_o[z] )
- );
- end
- endgenerate
-
-"""
+++ /dev/null
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module rab_slice
-# #(
-# parameter ADDR_WIDTH_PHYS = 40,
-# parameter ADDR_WIDTH_VIRT = 32
-# )
-# (
-# input logic [ADDR_WIDTH_VIRT-1:0] cfg_min,
-# input logic [ADDR_WIDTH_VIRT-1:0] cfg_max,
-# input logic [ADDR_WIDTH_PHYS-1:0] cfg_offset,
-# input logic cfg_wen,
-# input logic cfg_ren,
-# input logic cfg_en,
-# input logic in_trans_type,
-# input logic [ADDR_WIDTH_VIRT-1:0] in_addr_min,
-# input logic [ADDR_WIDTH_VIRT-1:0] in_addr_max,
-# output logic out_hit,
-# output logic out_prot,
-# output logic [ADDR_WIDTH_PHYS-1:0] out_addr
-# );
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class rab_slice(Elaboratable):
-
- def __init__(self, params): # pass config object
- # TODO parameters
- self.params = params
- self.cfg_min = Signal(params.ADDR_WIDTH_VIRT) # input
- self.cfg_max = Signal(params.ADDR_WIDTH_VIRT) # input
- self.cfg_offset = Signal(params.ADDR_WIDTH_PHYS) # input
- self.cfg_wen = Signal() # input
- self.cfg_ren = Signal() # input
- self.cfg_en = Signal() # input
- self.in_trans_type = Signal() # input
- self.in_addr_min = Signal(params.ADDR_WIDTH_VIRT) # input
- self.in_addr_max = Signal(params.ADDR_WIDTH_VIRT) # input
- self.out_hit = Signal() # output
- self.out_prot = Signal() # output
- self.out_addr = Signal(params.ADDR_WIDTH_PHYS) # output
-
- def elaborate(self, platform=None):
- m = Module()
- min_above_min = Signal()
- min_below_max = Signal()
- max_below_max = Signal()
-
- # assign min_above_min = (in_addr_min >= cfg_min) ? 1'b1 : 1'b0;
- # assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0;
- # assign max_below_max = (in_addr_max <= cfg_max) ? 1'b1 : 1'b0;
- # assign out_hit = cfg_en & min_above_min & min_below_max & max_below_max;
- # assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren));
- # assign out_addr = in_addr_min - cfg_min + cfg_offset;
- m.d.comb += [
- min_above_min.eq(self.in_addr_min >= self.cfg_min),
- min_below_max.eq(self.in_addr_min <= self.cfg_max),
- max_below_max.eq(self.in_addr_max <= self.cfg_max),
- self.out_hit.eq(self.cfg_en & min_above_min &
- min_below_max & max_below_max),
- self.out_prot.eq(self.out_hit & (
- (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))),
- self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset)
- ]
-
- return m
+++ /dev/null
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# /*
-# * ram_tp_no_change
-# *
-# * This code implements a parameterizable two-port memory. Port 0 can read and
-# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with
-# * Port 0 in "no change" mode, i.e., during a write, it retains the last read
-# * value on the output. Port 1 (read-only) is in "write first" mode. Still, it
-# * outputs the old data during the write cycle. Note: Port 1 outputs invalid
-# * data in the cycle after the write when reading the same address.
-# *
-# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
-# */
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen import Memory
-
-import math
-
-#
-# module ram_tp_no_change
-# #(
-ADDR_WIDTH = 10
-DATA_WIDTH = 36
-# )
-# (
-# input clk,
-# input we,
-# input [ADDR_WIDTH-1:0] addr0,
-# input [ADDR_WIDTH-1:0] addr1,
-# input [DATA_WIDTH-1:0] d_i,
-# output [DATA_WIDTH-1:0] d0_o,
-# output [DATA_WIDTH-1:0] d1_o
-# );
-
-
-class ram_tp_no_change(Elaboratable):
-
- def __init__(self):
- self.we = Signal() # input
- self.addr0 = Signal(ADDR_WIDTH) # input
- self.addr1 = Signal(ADDR_WIDTH) # input
- self.d_i = Signal(DATA_WIDTH) # input
- self.d0_o = Signal(DATA_WIDTH) # output
- self.d1_o = Signal(DATA_WIDTH) # output
-
- DEPTH = int(math.pow(2, ADDR_WIDTH))
- self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
- #
- # localparam DEPTH = 2**ADDR_WIDTH;
- #
- # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
- # reg [DATA_WIDTH-1:0] d0;
- # reg [DATA_WIDTH-1:0] d1;
- #
- # always_ff @(posedge clk) begin
- # if(we == 1'b1) begin
- # ram[addr0] <= d_i;
- # end else begin
- # only change data if we==false
- # d0 <= ram[addr0];
- # end
- # d1 <= ram[addr1];
- # end
- #
- # assign d0_o = d0;
- # assign d1_o = d1;
- #
-
- def elaborate(self, platform=None):
- m = Module()
- m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
- m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
- m.submodules.write_ram = write_ram = self.ram.write_port()
-
- # write port
- m.d.comb += write_ram.en.eq(self.we)
- m.d.comb += write_ram.addr.eq(self.addr0)
- m.d.comb += write_ram.data.eq(self.d_i)
-
- # read ports
- m.d.comb += read_ram0.addr.eq(self.addr0)
- m.d.comb += read_ram1.addr.eq(self.addr1)
- with m.If(self.we == 0):
- m.d.sync += self.d0_o.eq(read_ram0.data)
- m.d.sync += self.d1_o.eq(read_ram1.data)
-
- return m
+++ /dev/null
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# /*
-# * ram_tp_write_first
-# *
-# * This code implements a parameterizable two-port memory. Port 0 can read and
-# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in
-# * "write first" mode, i.e., upon a read and write to the same address, the
-# * new value is read. Note: Port 1 outputs invalid data in the cycle after
-# * the write when reading the same address.
-# *
-# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
-# */
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen import Memory
-
-import math
-#
-# module ram_tp_write_first
-# #(
-ADDR_WIDTH = 10
-DATA_WIDTH = 36
-# )
-# (
-# input clk,
-# input we,
-# input [ADDR_WIDTH-1:0] addr0,
-# input [ADDR_WIDTH-1:0] addr1,
-# input [DATA_WIDTH-1:0] d_i,
-# output [DATA_WIDTH-1:0] d0_o,
-# output [DATA_WIDTH-1:0] d1_o
-# );
-
-
-class ram_tp_write_first(Elaboratable):
-
- def __init__(self):
- self.we = Signal() # input
- self.addr0 = Signal(ADDR_WIDTH) # input
- self.addr1 = Signal(ADDR_WIDTH) # input
- self.d_i = Signal(DATA_WIDTH) # input
- self.d0_o = Signal(DATA_WIDTH) # output
- self.d1_o = Signal(DATA_WIDTH) # output
-
- DEPTH = int(math.pow(2, ADDR_WIDTH))
- self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
-
- #
- # localparam DEPTH = 2**ADDR_WIDTH;
- #
- # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
- # reg [ADDR_WIDTH-1:0] raddr0;
- # reg [ADDR_WIDTH-1:0] raddr1;
- #
- # always_ff @(posedge clk) begin
- # if(we == 1'b1) begin
- # ram[addr0] <= d_i;
- # end
- # raddr0 <= addr0;
- # raddr1 <= addr1;
- # end
- #
- # assign d0_o = ram[raddr0];
- # assign d1_o = ram[raddr1];
- #
-
- def elaborate(self, platform=None):
- m = Module()
- m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
- m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
- m.submodules.write_ram = write_ram = self.ram.write_port()
-
- # write port
- m.d.comb += write_ram.en.eq(self.we)
- m.d.comb += write_ram.addr.eq(self.addr0)
- m.d.comb += write_ram.data.eq(self.d_i)
-
- # read ports
- m.d.comb += read_ram0.addr.eq(self.addr0)
- m.d.comb += read_ram1.addr.eq(self.addr1)
- m.d.sync += self.d0_o.eq(read_ram0.data)
- m.d.sync += self.d1_o.eq(read_ram1.data)
-
- return m
+++ /dev/null
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License. You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-import rab_slice
-import coreconfig
-
-#
-# module slice_top
-# //#(
-# // parameter N_SLICES = 16,
-# // parameter N_REGS = 4*N_SLICES,
-# // parameter ADDR_WIDTH_PHYS = 40,
-# // parameter ADDR_WIDTH_VIRT = 32
-# // )
-# (
-# input logic [N_REGS-1:0] [63:0] int_cfg_regs,
-# input logic int_rw,
-# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_min,
-# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_max,
-# input logic multi_hit_allow,
-# output logic multi_hit,
-# output logic [N_SLICES-1:0] prot,
-# output logic [N_SLICES-1:0] hit,
-# output logic cache_coherent,
-# output logic [ADDR_WIDTH_PHYS-1:0] out_addr
-# );
-#
-
-
-class slice_top(Elaboratable):
-
- def __init__(self):
- # FIXME self.int_cfg_regs = Signal() # input
- self.params = coreconfig.CoreConfig() # rename ?
- self.int_rw = Signal() # input
- self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT) # input
- self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT) # input
- self.multi_hit_allow = Signal() # input
- self.multi_hit = Signal() # output
- self.prot = Signal(self.params.N_SLICES) # output
- self.hit = Signal(self.params.N_SLICES) # output
- self.cache_coherent = Signal() # output
- self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS) # output
-
- def elaborate(self, platform=None):
- m = Module()
-
- first_hit = Signal()
-
- for i in range(self.params.N_SLICES):
- # TODO pass params / core config here
- u_slice = rab_slice.rab_slice(self.params)
- setattr(m.submodules, "u_slice%d" % i, u_slice)
- # TODO set param and connect ports
-
- # In case of a multi hit, the lowest slice with a hit is selected.
- # TODO always_comb begin : HIT_CHECK
- m.d.comb += [
- first_hit.eq(0),
- self.multi_hit.eq(0),
- self.out_addr.eq(0),
- self.cache_coherent.eq(0)]
-
- for j in range(self.params.N_SLICES):
- with m.If(self.hit[j] == 1):
- with m.If(first_hit == 1):
- with m.If(self.multi_hit_allow == 0):
- m.d.comb += [self.multi_hit.eq(1)]
- with m.Elif(first_hit == 1):
- m.d.comb += [first_hit.eq(1)
- # only output first slice that was hit
- # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]),
- # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]),
- ]
- return m
-
- # TODO translate generate statement
-
-
-"""
- logic [ADDR_WIDTH_PHYS*N_SLICES-1:0] slice_out_addr;
-
- generate
- for ( i=0; i<N_SLICES; i++ )
- begin
- rab_slice
- #(
- .ADDR_WIDTH_PHYS ( ADDR_WIDTH_PHYS ),
- .ADDR_WIDTH_VIRT ( ADDR_WIDTH_VIRT )
- )
- u_slice
- (
- .cfg_min ( int_cfg_regs[4*i] [ADDR_WIDTH_VIRT-1:0] ),
- .cfg_max ( int_cfg_regs[4*i+1][ADDR_WIDTH_VIRT-1:0] ),
- .cfg_offset ( int_cfg_regs[4*i+2][ADDR_WIDTH_PHYS-1:0] ),
- .cfg_wen ( int_cfg_regs[4*i+3][2] ),
- .cfg_ren ( int_cfg_regs[4*i+3][1] ),
- .cfg_en ( int_cfg_regs[4*i+3][0] ),
- .in_trans_type ( int_rw ),
- .in_addr_min ( int_addr_min ),
- .in_addr_max ( int_addr_max ),
- .out_addr ( slice_out_addr[ADDR_WIDTH_PHYS*i+ADDR_WIDTH_PHYS-1:ADDR_WIDTH_PHYS*i] ),
- .out_prot ( prot[i] ),
- .out_hit ( hit[i] )
- );
- end
- endgenerate
-
- // In case of a multi hit, the lowest slice with a hit is selected.
- always_comb begin : HIT_CHECK
- first_hit = 0;
- multi_hit = 0;
- out_addr = '0;
- cache_coherent = 0;
- for (j = 0; j < N_SLICES; j++) begin
- if (hit[j] == 1'b1) begin
- if (first_hit == 1'b1) begin
- if (multi_hit_allow == 1'b0) begin
- multi_hit = 1'b1;
- end
- end else begin
- first_hit = 1'b1;
- out_addr = slice_out_addr[ADDR_WIDTH_PHYS*j +: ADDR_WIDTH_PHYS];
- cache_coherent = int_cfg_regs[4*j+3][3];
- end
- end
- end
- end
-"""
-
-# sv 2 migen: TODO add translate code for generate statements and for loops inside always_comb
+++ /dev/null
-from ram_tp_write_first import ram_tp_write_first
-from nmigen.compat.sim import run_simulation
-import sys
-sys.path.append("../")
-
-
-def tbench(dut):
- yield dut.we.eq(1)
- for i in range(0, 255):
- yield dut.addr0.eq(i)
- yield dut.d_i.eq(i)
- yield
-
-
-if __name__ == "__main__":
- dut = ram_tp_write_first()
- run_simulation(dut, tbench(dut), vcd_name="ram_tp_write_first.vcd")
- print("ram_tp_write_first Unit Test Success")
+++ /dev/null
-from nmigen.compat.sim import run_simulation
-import sys
-sys.path.append("../")
-# sys.path.append("../../../TestUtil")
-from slice_top import slice_top
-
-def tbench(dut):
- yield
-
-
-if __name__ == "__main__":
- dut = slice_top()
- run_simulation(dut, tbench(dut), vcd_name="test_slice_top.vcd")
- print("slice_top Unit Test Success")
+++ /dev/null
-from soc.decoder.power_enums import (Function, Form, InternalOp,
- In1Sel, In2Sel, In3Sel, OutSel,
- RC, LdstLen, CryIn, get_csv,
- single_bit_flags,
- get_signal_name, default_values)
-import math
-
-
-class MemorySim:
- def __init__(self, bytes_per_word=8):
- self.mem = {}
- self.bytes_per_word = bytes_per_word
- self.word_log2 = math.ceil(math.log2(bytes_per_word))
-
- def _get_shifter_mask(self, width, remainder):
- shifter = ((self.bytes_per_word - width) - remainder) * \
- 8 # bits per byte
- mask = (1 << (width * 8)) - 1
- return shifter, mask
-
- # TODO: Implement ld/st of lesser width
- def ld(self, address, width=8):
- remainder = address & (self.bytes_per_word - 1)
- address = address >> self.word_log2
- assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
- if address in self.mem:
- val = self.mem[address]
- else:
- val = 0
-
- if width != self.bytes_per_word:
- shifter, mask = self._get_shifter_mask(width, remainder)
- val = val & (mask << shifter)
- val >>= shifter
- print("Read {:x} from addr {:x}".format(val, address))
- return val
-
- def st(self, address, value, width=8):
- remainder = address & (self.bytes_per_word - 1)
- address = address >> self.word_log2
- assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
- print("Writing {:x} to addr {:x}".format(value, address))
- if width != self.bytes_per_word:
- if address in self.mem:
- val = self.mem[address]
- else:
- val = 0
- shifter, mask = self._get_shifter_mask(width, remainder)
- val &= ~(mask << shifter)
- val |= value << shifter
- self.mem[address] = val
- else:
- self.mem[address] = value
-
-
-class RegFile:
- def __init__(self):
- self.regfile = [0] * 32
- self.sprs = {}
-
- def write_reg(self, regnum, value):
- all1s = (1 << 64)-1 # 64 bits worth of 1s
- value &= all1s
- print("Writing {:x} to reg r{}".format(value, regnum))
- self.regfile[regnum] = value
-
- def read_reg(self, regnum):
- val = self.regfile[regnum]
- print("Read {:x} from reg r{}".format(val, regnum))
- return val
-
- def assert_gpr(self, gpr, val):
- reg_val = self.read_reg(gpr)
- msg = "reg r{} got {:x}, expecting {:x}".format(
- gpr, reg_val, val)
- assert reg_val == val, msg
-
- def assert_gprs(self, gprs):
- for k, v in list(gprs.items()):
- self.assert_gpr(k, v)
-
- def set_xer(self, result, operanda, operandb):
- xer = 0
- if result & 1 << 64:
- xer |= XER.CA
-
- self.xer = xer
-
-
-class InternalOpSimulator:
- def __init__(self):
- self.mem_sim = MemorySim()
- self.regfile = RegFile()
-
- def execute_alu_op(self, op1, op2, internal_op, carry=0):
- print(internal_op)
- if internal_op == InternalOp.OP_ADD.value:
- return op1 + op2 + carry
- elif internal_op == InternalOp.OP_AND.value:
- return op1 & op2
- elif internal_op == InternalOp.OP_OR.value:
- return op1 | op2
- elif internal_op == InternalOp.OP_MUL_L64.value:
- return op1 * op2
- else:
- assert False, "Not implemented"
-
- def update_cr0(self, result):
- if result == 0:
- self.cr0 = 0b001
- elif result >> 63:
- self.cr0 = 0b100
- else:
- self.cr0 = 0b010
- print("update_cr0", self.cr0)
-
- def alu_op(self, pdecode2):
- all1s = (1 << 64)-1 # 64 bits worth of 1s
- internal_op = yield pdecode2.dec.op.internal_op
- operand1 = 0
- operand2 = 0
- result = 0
- carry = 0
- r1_ok = yield pdecode2.e.read_reg1.ok
- r2_ok = yield pdecode2.e.read_reg2.ok
- r3_ok = yield pdecode2.e.read_reg3.ok
- imm_ok = yield pdecode2.e.imm_data.ok
- if r1_ok:
- r1_sel = yield pdecode2.e.read_reg1.data
- operand1 = self.regfile.read_reg(r1_sel)
- elif r3_ok:
- r3_sel = yield pdecode2.e.read_reg3.data
- operand1 = self.regfile.read_reg(r3_sel)
- if r2_ok:
- r2_sel = yield pdecode2.e.read_reg2.data
- operand2 = self.regfile.read_reg(r2_sel)
- if imm_ok:
- operand2 = yield pdecode2.e.imm_data.data
-
- inv_a = yield pdecode2.dec.op.inv_a
- if inv_a:
- operand1 = (~operand1) & all1s
-
- cry_in = yield pdecode2.dec.op.cry_in
- if cry_in == CryIn.ONE.value:
- carry = 1
- elif cry_in == CryIn.CA.value:
- carry = self.carry_out
-
- # TODO rc_sel = yield pdecode2.dec.op.rc_sel
- result = self.execute_alu_op(operand1, operand2, internal_op,
- carry=carry)
-
- cry_out = yield pdecode2.dec.op.cry_out
- rc = yield pdecode2.e.rc.data
-
- if rc:
- self.update_cr0(result)
- if cry_out == 1:
- self.carry_out = (result >> 64)
- print("setting carry_out", self.carry_out)
-
- ro_ok = yield pdecode2.e.write_reg.ok
- if ro_ok:
- ro_sel = yield pdecode2.e.write_reg.data
- self.regfile.write_reg(ro_sel, result)
-
- def mem_op(self, pdecode2):
- internal_op = yield pdecode2.dec.op.internal_op
- addr_reg = yield pdecode2.e.read_reg1.data
- addr = self.regfile.read_reg(addr_reg)
-
- imm_ok = yield pdecode2.e.imm_data.ok
- r2_ok = yield pdecode2.e.read_reg2.ok
- width = yield pdecode2.e.data_len
- if imm_ok:
- imm = yield pdecode2.e.imm_data.data
- addr += imm
- elif r2_ok:
- r2_sel = yield pdecode2.e.read_reg2.data
- addr += self.regfile.read_reg(r2_sel)
- if internal_op == InternalOp.OP_STORE.value:
- val_reg = yield pdecode2.e.read_reg3.data
- val = self.regfile.read_reg(val_reg)
- self.mem_sim.st(addr, val, width)
- elif internal_op == InternalOp.OP_LOAD.value:
- dest_reg = yield pdecode2.e.write_reg.data
- val = self.mem_sim.ld(addr, width)
- self.regfile.write_reg(dest_reg, val)
-
- def execute_op(self, pdecode2):
- function = yield pdecode2.dec.op.function_unit
- if function == Function.ALU.value:
- yield from self.alu_op(pdecode2)
- elif function == Function.LDST.value:
- yield from self.mem_op(pdecode2)
+++ /dev/null
-from nmigen import Module, Signal
-from nmigen.back.pysim import Simulator, Delay
-from nmigen.test.utils import FHDLTestCase
-import unittest
-from soc.simulator.internalop_sim import InternalOpSimulator
-from soc.decoder.power_decoder import (create_pdecode)
-from soc.decoder.power_enums import (Function, InternalOp,
- In1Sel, In2Sel, In3Sel,
- OutSel, RC, LdstLen, CryIn,
- single_bit_flags, Form, SPR,
- get_signal_name, get_csv)
-from soc.decoder.power_decoder2 import (PowerDecode2)
-from soc.simulator.program import Program
-from soc.simulator.qemu import run_program
-
-
-class Register:
- def __init__(self, num):
- self.num = num
-
-
-class DecoderTestCase(FHDLTestCase):
-
- def run_tst(self, generator, simulator):
- m = Module()
- comb = m.d.comb
- instruction = Signal(32)
-
- pdecode = create_pdecode()
-
- m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- comb += pdecode2.dec.raw_opcode_in.eq(instruction)
- sim = Simulator(m)
- gen = generator.generate_instructions()
-
- def process():
- for ins in gen:
-
- print("0x{:X}".format(ins & 0xffffffff))
-
- # ask the decoder to decode this binary data (endian'd)
- yield pdecode2.dec.bigendian.eq(0) # little / big?
- yield instruction.eq(ins) # raw binary instr.
- yield Delay(1e-6)
- yield from simulator.execute_op(pdecode2)
-
- sim.add_process(process)
- with sim.write_vcd("simulator.vcd", "simulator.gtkw",
- traces=pdecode2.ports()):
- sim.run()
-
- def test_example(self):
- lst = ["addi 1, 0, 0x1234",
- "addi 2, 0, 0x5678",
- "add 3, 1, 2",
- "and 4, 1, 2"]
- with Program(lst) as program:
- self.run_tst_program(program, [1, 2, 3, 4])
-
- def test_ldst(self):
- lst = ["addi 1, 0, 0x1234",
- "addi 2, 0, 0x5678",
- "stw 1, 0(2)",
- "lwz 3, 0(2)"]
- with Program(lst) as program:
- self.run_tst_program(program, [1, 2, 3])
-
- def test_ldst_extended(self):
- lst = ["addi 1, 0, 0x1234",
- "addi 2, 0, 0x5678",
- "addi 4, 0, 0x40",
- "stw 1, 0x40(2)",
- "lwzx 3, 4, 2"]
- with Program(lst) as program:
- self.run_tst_program(program, [1, 2, 3])
-
- def test_ldst_widths(self):
- lst = [" lis 1, 0xdead",
- "ori 1, 1, 0xbeef",
- "addi 2, 0, 0x1000",
- "std 1, 0(2)",
- "lbz 1, 5(2)",
- "lhz 3, 4(2)",
- "lwz 4, 4(2)",
- "addi 5, 0, 0x12",
- "stb 5, 5(2)",
- "ld 5, 0(2)"]
- with Program(lst) as program:
- self.run_tst_program(program, [1, 2, 3, 4, 5])
-
- def test_sub(self):
- lst = ["addi 1, 0, 0x1234",
- "addi 2, 0, 0x5678",
- "subf 3, 1, 2",
- "subfic 4, 1, 0x1337",
- "neg 5, 1"]
- with Program(lst) as program:
- self.run_tst_program(program, [1, 2, 3, 4, 5])
-
- def test_add_with_carry(self):
- lst = ["addi 1, 0, 5",
- "neg 1, 1",
- "addi 2, 0, 7",
- "neg 2, 2",
- "addc 3, 2, 1",
- "addi 3, 3, 1"
- ]
- with Program(lst) as program:
- self.run_tst_program(program, [1, 2, 3])
-
- def test_addis(self):
- lst = ["addi 1, 0, 0x0FFF",
- "addis 1, 1, 0x0F"
- ]
- with Program(lst) as program:
- self.run_tst_program(program, [1])
-
- def test_mulli(self):
- lst = ["addi 1, 0, 3",
- "mulli 1, 1, 2"
- ]
- with Program(lst) as program:
- self.run_tst_program(program, [1])
-
- def run_tst_program(self, prog, reglist):
- simulator = InternalOpSimulator()
- self.run_tst(prog, simulator)
- prog.reset()
- with run_program(prog) as q:
- qemu_register_compare(simulator, q, reglist)
-
-
-def qemu_register_compare(simulator, qemu, regs):
- for reg in regs:
- qemu_val = qemu.get_register(reg)
- simulator.regfile.assert_gpr(reg, qemu_val)
-
-
-if __name__ == "__main__":
- unittest.main()
--- /dev/null
+*.wpr
+__pycache__
--- /dev/null
+from nmigen import Module, Signal, Elaboratable
+from nmigen.lib.coding import Encoder, PriorityEncoder
+
+
+class AddressEncoder(Elaboratable):
+ """Address Encoder
+
+ The purpose of this module is to take in a vector and
+ encode the bits that are one hot into an address. This module
+ combines both nmigen's Encoder and PriorityEncoder and will state
+ whether the input line has a single bit hot, multiple bits hot,
+ or no bits hot. The output line will always have the lowest value
+ address output.
+
+ Usage:
+ The output is valid when either single or multiple match is high.
+ Otherwise output is 0.
+ """
+
+ def __init__(self, width):
+ """ Arguments:
+ * width: The desired length of the input vector
+ """
+ # Internal
+ self.encoder = Encoder(width)
+ self.p_encoder = PriorityEncoder(width)
+
+ # Input
+ self.i = Signal(width)
+
+ # Output
+ self.single_match = Signal(1)
+ self.multiple_match = Signal(1)
+ self.o = Signal(range(width))
+
+ def elaborate(self, platform=None):
+ m = Module()
+
+ # Add internal submodules
+ m.submodules.encoder = self.encoder
+ m.submodules.p_encoder = self.p_encoder
+
+ m.d.comb += [
+ self.encoder.i.eq(self.i),
+ self.p_encoder.i.eq(self.i)
+ ]
+
+ # Steps:
+ # 1. check if the input vector is non-zero
+ # 2. if non-zero, check if single match or multiple match
+ # 3. set output line to be lowest value address output
+
+ # If the priority encoder recieves an input of 0
+ # If n is 1 then the output is not valid
+ with m.If(self.p_encoder.n):
+ m.d.comb += [
+ self.single_match.eq(0),
+ self.multiple_match.eq(0),
+ self.o.eq(0)
+ ]
+ # If the priority encoder recieves an input > 0
+ with m.Else():
+ # Multiple Match if encoder n is invalid
+ with m.If(self.encoder.n):
+ m.d.comb += [
+ self.single_match.eq(0),
+ self.multiple_match.eq(1)
+ ]
+ # Single Match if encoder n is valid
+ with m.Else():
+ m.d.comb += [
+ self.single_match.eq(1),
+ self.multiple_match.eq(0)
+ ]
+ # Always set output based on priority encoder output
+ m.d.comb += self.o.eq(self.p_encoder.o)
+ return m
--- /dev/null
+from nmigen import Array, Cat, Module, Signal, Elaboratable
+from nmigen.lib.coding import Decoder
+from nmigen.cli import main # , verilog
+
+from .CamEntry import CamEntry
+from .AddressEncoder import AddressEncoder
+
+
+class Cam(Elaboratable):
+ """ Content Addressable Memory (CAM)
+
+ The purpose of this module is to quickly look up whether an
+ entry exists given a data key.
+ This module will search for the given data in all internal entries
+ and output whether a single or multiple match was found.
+ If an single entry is found the address be returned and single_match
+ is set HIGH. If multiple entries are found the lowest address is
+ returned and multiple_match is set HIGH. If neither single_match or
+ multiple_match are HIGH this implies no match was found. To write
+ to the CAM set the address bus to the desired entry and set write_enable
+ HIGH. Entry managment should be performed one level above this block
+ as lookup is performed within.
+
+ Notes:
+ The read and write operations take one clock cycle to complete.
+ Currently the read_warning line is present for interfacing but
+ is not necessary for this design. This module is capable of writing
+ in the first cycle, reading on the second, and output the correct
+ address on the third.
+ """
+
+ def __init__(self, data_size, cam_size):
+ """ Arguments:
+ * data_size: (bits) The bit size of the data
+ * cam_size: (number) The number of entries in the CAM
+ """
+
+ # Internal
+ self.cam_size = cam_size
+ self.encoder = AddressEncoder(cam_size)
+ self.decoder = Decoder(cam_size)
+ self.entry_array = Array(CamEntry(data_size) for x in range(cam_size))
+
+ # Input
+ self.enable = Signal(1)
+ self.write_enable = Signal(1)
+ self.data_in = Signal(data_size) # The data to be written
+ self.data_mask = Signal(data_size) # mask for ternary writes
+ # address of CAM Entry to write
+ self.address_in = Signal(range(cam_size))
+
+ # Output
+ self.read_warning = Signal(1) # High when a read interrupts a write
+ self.single_match = Signal(1) # High when there is only one match
+ self.multiple_match = Signal(1) # High when there at least two matches
+ # The lowest address matched
+ self.match_address = Signal(range(cam_size))
+
+ def elaborate(self, platform=None):
+ m = Module()
+ # AddressEncoder for match types and output address
+ m.submodules.AddressEncoder = self.encoder
+ # Decoder is used to select which entry will be written to
+ m.submodules.Decoder = self.decoder
+ # CamEntry Array Submodules
+ # Note these area added anonymously
+ entry_array = self.entry_array
+ m.submodules += entry_array
+
+ # Decoder logic
+ m.d.comb += [
+ self.decoder.i.eq(self.address_in),
+ self.decoder.n.eq(0)
+ ]
+
+ encoder_vector = []
+ with m.If(self.enable):
+ # Set the key value for every CamEntry
+ for index in range(self.cam_size):
+
+ # Write Operation
+ with m.If(self.write_enable):
+ with m.If(self.decoder.o[index]):
+ m.d.comb += entry_array[index].command.eq(2)
+ with m.Else():
+ m.d.comb += entry_array[index].command.eq(0)
+
+ # Read Operation
+ with m.Else():
+ m.d.comb += entry_array[index].command.eq(1)
+
+ # Send data input to all entries
+ m.d.comb += entry_array[index].data_in.eq(self.data_in)
+ # Send all entry matches to encoder
+ ematch = entry_array[index].match
+ encoder_vector.append(ematch)
+
+ # Give input to and accept output from encoder module
+ m.d.comb += [
+ self.encoder.i.eq(Cat(*encoder_vector)),
+ self.single_match.eq(self.encoder.single_match),
+ self.multiple_match.eq(self.encoder.multiple_match),
+ self.match_address.eq(self.encoder.o)
+ ]
+
+ # If the CAM is not enabled set all outputs to 0
+ with m.Else():
+ m.d.comb += [
+ self.read_warning.eq(0),
+ self.single_match.eq(0),
+ self.multiple_match.eq(0),
+ self.match_address.eq(0)
+ ]
+
+ return m
+
+ def ports(self):
+ return [self.enable, self.write_enable,
+ self.data_in, self.data_mask,
+ self.read_warning, self.single_match,
+ self.multiple_match, self.match_address]
+
+
+if __name__ == '__main__':
+ cam = Cam(4, 4)
+ main(cam, ports=cam.ports())
--- /dev/null
+from nmigen import Module, Signal, Elaboratable
+
+
+class CamEntry(Elaboratable):
+ """ Content Addressable Memory (CAM) Entry
+
+ The purpose of this module is to represent an entry within a CAM.
+ This module when given a read command will compare the given data
+ and output whether a match was found or not. When given a write
+ command it will write the given data into internal registers.
+ """
+
+ def __init__(self, data_size):
+ """ Arguments:
+ * data_size: (bit count) The size of the data
+ """
+ # Input
+ self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset
+ self.data_in = Signal(data_size) # Data input when writing
+
+ # Output
+ self.match = Signal(1) # Result of the internal/input key comparison
+ self.data = Signal(data_size)
+
+ def elaborate(self, platform=None):
+ m = Module()
+ with m.Switch(self.command):
+ with m.Case("00"):
+ m.d.sync += self.match.eq(0)
+ with m.Case("01"):
+ with m.If(self.data == self.data_in):
+ m.d.sync += self.match.eq(1)
+ with m.Else():
+ m.d.sync += self.match.eq(0)
+ with m.Case("10"):
+ m.d.sync += [
+ self.data.eq(self.data_in),
+ self.match.eq(0)
+ ]
+ with m.Case():
+ m.d.sync += [
+ self.match.eq(0),
+ self.data.eq(0)
+ ]
+
+ return m
--- /dev/null
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen.cli import verilog, rtlil
+
+
+class LFSRPolynomial(set):
+ """ implements a polynomial for use in LFSR
+ """
+ def __init__(self, exponents=()):
+ for e in exponents:
+ assert isinstance(e, int), TypeError("%s must be an int" % repr(e))
+ assert (e >= 0), ValueError("%d must not be negative" % e)
+ set.__init__(self, set(exponents).union({0})) # must contain zero
+
+ @property
+ def max_exponent(self):
+ return max(self) # derived from set, so this returns the max exponent
+
+ @property
+ def exponents(self):
+ exponents = list(self) # get elements of set as a list
+ exponents.sort(reverse=True)
+ return exponents
+
+ def __str__(self):
+ expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2)
+ retval = map(lambda i: expd[min(i,2)].format(i), self.exponents)
+ return " + ".join(retval)
+
+ def __repr__(self):
+ return "LFSRPolynomial(%s)" % self.exponents
+
+
+# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs # noqa
+LFSR_POLY_2 = LFSRPolynomial([2, 1, 0])
+LFSR_POLY_3 = LFSRPolynomial([3, 2, 0])
+LFSR_POLY_4 = LFSRPolynomial([4, 3, 0])
+LFSR_POLY_5 = LFSRPolynomial([5, 3, 0])
+LFSR_POLY_6 = LFSRPolynomial([6, 5, 0])
+LFSR_POLY_7 = LFSRPolynomial([7, 6, 0])
+LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0])
+LFSR_POLY_9 = LFSRPolynomial([9, 5, 0])
+LFSR_POLY_10 = LFSRPolynomial([10, 7, 0])
+LFSR_POLY_11 = LFSRPolynomial([11, 9, 0])
+LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0])
+LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0])
+LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0])
+LFSR_POLY_15 = LFSRPolynomial([15, 14, 0])
+LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0])
+LFSR_POLY_17 = LFSRPolynomial([17, 14, 0])
+LFSR_POLY_18 = LFSRPolynomial([18, 11, 0])
+LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0])
+LFSR_POLY_20 = LFSRPolynomial([20, 17, 0])
+LFSR_POLY_21 = LFSRPolynomial([21, 19, 0])
+LFSR_POLY_22 = LFSRPolynomial([22, 21, 0])
+LFSR_POLY_23 = LFSRPolynomial([23, 18, 0])
+LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0])
+
+
+class LFSR(LFSRPolynomial, Elaboratable):
+ """ implements a Linear Feedback Shift Register
+ """
+ def __init__(self, polynomial):
+ """ Inputs:
+ ------
+ :polynomial: the polynomial to feedback on. may be a LFSRPolynomial
+ instance or an iterable of ints (list/tuple/generator)
+ :enable: enable (set LO to disable. NOTE: defaults to HI)
+
+ Outputs:
+ -------
+ :state: the LFSR state. bitwidth is taken from the polynomial
+ maximum exponent.
+
+ Note: if an LFSRPolynomial is passed in as the input, because
+ LFSRPolynomial is derived from set() it's ok:
+ LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p)
+ """
+ LFSRPolynomial.__init__(self, polynomial)
+ self.state = Signal(self.max_exponent, reset=1)
+ self.enable = Signal(reset=1)
+
+ def elaborate(self, platform):
+ m = Module()
+ # do absolutely nothing if the polynomial is empty (always has a zero)
+ if self.max_exponent <= 1:
+ return m
+
+ # create XOR-bunch, select bits from state based on exponent
+ feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain)
+ for exponent in self:
+ if exponent > 0: # don't have to skip, saves CPU cycles though
+ feedback ^= self.state[exponent - 1]
+
+ # if enabled, shift-and-feedback
+ with m.If(self.enable):
+ # shift up lower bits by Cat'ing in a new bit zero (feedback)
+ newstate = Cat(feedback, self.state[:-1])
+ m.d.sync += self.state.eq(newstate)
+
+ return m
+
+
+# example: Poly24
+if __name__ == '__main__':
+ p24 = rtlil.convert(LFSR(LFSR_POLY_24))
+ with open("lfsr2_p24.il", "w") as f:
+ f.write(p24)
--- /dev/null
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from nmigen import Module
+from typing import Iterable, Optional, Iterator, Any, Union
+from typing_extensions import final
+
+
+@final
+class LFSRPolynomial(set):
+ def __init__(self, exponents: Iterable[int] = ()):
+ def elements() -> Iterable[int]: ...
+ @property
+ def exponents(self) -> list[int]: ...
+ def __str__(self) -> str: ...
+ def __repr__(self) -> str: ...
+
+
+@final
+class LFSR:
+ def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ...
+ @property
+ def width(self) -> int: ...
+ def elaborate(self, platform: Any) -> Module: ...
--- /dev/null
+verilog:
+ python3 Cam.py generate -t v > Cam.v
--- /dev/null
+from nmigen import Cat, Memory, Module, Signal, Elaboratable
+from nmigen.cli import main
+from nmigen.cli import verilog, rtlil
+
+
+class MemorySet(Elaboratable):
+ def __init__(self, data_size, tag_size, set_count, active):
+ self.active = active
+ input_size = tag_size + data_size # Size of the input data
+ memory_width = input_size + 1 # The width of the cache memory
+ self.active = active
+ self.data_size = data_size
+ self.tag_size = tag_size
+
+ # XXX TODO, use rd-enable and wr-enable?
+ self.mem = Memory(width=memory_width, depth=set_count)
+ self.r = self.mem.read_port()
+ self.w = self.mem.write_port()
+
+ # inputs (address)
+ self.cset = Signal(range(set_count)) # The set to be checked
+ self.tag = Signal(tag_size) # The tag to find
+ self.data_i = Signal(data_size) # Incoming data
+
+ # outputs
+ self.valid = Signal()
+ self.data_o = Signal(data_size) # Outgoing data (excludes tag)
+
+ def elaborate(self, platform):
+ m = Module()
+ m.submodules.mem = self.mem
+ m.submodules.r = self.r
+ m.submodules.w = self.w
+
+ # temporaries
+ active_bit = Signal()
+ tag_valid = Signal()
+ data_start = self.active + 1
+ data_end = data_start + self.data_size
+ tag_start = data_end
+ tag_end = tag_start + self.tag_size
+
+ # connect the read port address to the set/entry
+ read_port = self.r
+ m.d.comb += read_port.addr.eq(self.cset)
+ # Pull out active bit from data
+ data = read_port.data
+ m.d.comb += active_bit.eq(data[self.active])
+ # Validate given tag vs stored tag
+ tag = data[tag_start:tag_end]
+ m.d.comb += tag_valid.eq(self.tag == tag)
+ # An entry is only valid if the tags match AND
+ # is marked as a valid entry
+ m.d.comb += self.valid.eq(tag_valid & active_bit)
+
+ # output data: TODO, check rd-enable?
+ m.d.comb += self.data_o.eq(data[data_start:data_end])
+
+ # connect the write port addr to the set/entry (only if write enabled)
+ # (which is only done on a match, see SAC.write_entry below)
+ write_port = self.w
+ with m.If(write_port.en):
+ m.d.comb += write_port.addr.eq(self.cset)
+ m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag))
+
+ return m
--- /dev/null
+from nmigen import Module, Signal, Elaboratable
+from nmigen.cli import main
+
+from soc.TLB.PteEntry import PteEntry
+
+
+class PermissionValidator(Elaboratable):
+ """ The purpose of this Module is to check the Permissions of a given PTE
+ against the requested access permissions.
+
+ This module will either validate (by setting the valid bit HIGH)
+ the request or find a permission fault and invalidate (by setting
+ the valid bit LOW) the request
+ """
+
+ def __init__(self, asid_size, pte_size):
+ """ Arguments:
+ * asid_size: (bit count) The size of the asid to be processed
+ * pte_size: (bit count) The size of the pte to be processed
+
+ Return:
+ * valid HIGH when permissions are correct
+ """
+ # Internal
+ self.pte_entry = PteEntry(asid_size, pte_size)
+
+ # Input
+ self.data = Signal(asid_size + pte_size)
+ self.xwr = Signal(3) # Execute, Write, Read
+ self.super_mode = Signal(1) # Supervisor Mode
+ self.super_access = Signal(1) # Supervisor Access
+ self.asid = Signal(15) # Address Space IDentifier (ASID)
+
+ # Output
+ self.valid = Signal(1) # Denotes if the permissions are correct
+
+ def elaborate(self, platform=None):
+ m = Module()
+
+ m.submodules.pte_entry = self.pte_entry
+
+ m.d.comb += self.pte_entry.i.eq(self.data)
+
+ # Check if the entry is valid
+ with m.If(self.pte_entry.v):
+ # ASID match or Global Permission
+ # Note that the MSB bound is exclusive
+ with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g):
+ # Check Execute, Write, Read (XWR) Permissions
+ with m.If(self.pte_entry.xwr == self.xwr):
+ # Supervisor Logic
+ with m.If(self.super_mode):
+ # Valid if entry is not in user mode or supervisor
+ # has Supervisor User Memory (SUM) access via the
+ # SUM bit in the sstatus register
+ m.d.comb += self.valid.eq((~self.pte_entry.u)
+ | self.super_access)
+ # User logic
+ with m.Else():
+ # Valid if the entry is in user mode only
+ m.d.comb += self.valid.eq(self.pte_entry.u)
+ with m.Else():
+ m.d.comb += self.valid.eq(0)
+ with m.Else():
+ m.d.comb += self.valid.eq(0)
+ with m.Else():
+ m.d.comb += self.valid.eq(0)
+ return m
--- /dev/null
+from nmigen import Module, Signal, Elaboratable
+from nmigen.cli import main
+
+
+class PteEntry(Elaboratable):
+ """ The purpose of this Module is to centralize the parsing of Page
+ Table Entries (PTE) into one module to prevent common mistakes
+ and duplication of code. The control bits are parsed out for
+ ease of use.
+
+ This module parses according to the standard PTE given by the
+ Volume II: RISC-V Privileged Architectures V1.10 Pg 60.
+ The Address Space IDentifier (ASID) is appended to the MSB of the input
+ and is parsed out as such.
+
+ An valid input Signal would be:
+ ASID PTE
+ Bits:[78-64][63-0]
+
+ The output PTE value will include the control bits.
+ """
+ def __init__(self, asid_size, pte_size):
+ """ Arguments:
+ * asid_size: (bit count) The size of the asid to be processed
+ * pte_size: (bit count) The size of the pte to be processed
+
+ Return:
+ * d The Dirty bit from the PTE portion of i
+ * a The Accessed bit from the PTE portion of i
+ * g The Global bit from the PTE portion of i
+ * u The User Mode bit from the PTE portion of i
+ * xwr The Execute/Write/Read bit from the PTE portion of i
+ * v The Valid bit from the PTE portion of i
+ * asid The asid portion of i
+ * pte The pte portion of i
+ """
+ # Internal
+ self.asid_start = pte_size
+ self.asid_end = pte_size + asid_size
+
+ # Input
+ self.i = Signal(asid_size + pte_size)
+
+ # Output
+ self.d = Signal(1) # Dirty bit (From pte)
+ self.a = Signal(1) # Accessed bit (From pte)
+ self.g = Signal(1) # Global Access (From pte)
+ self.u = Signal(1) # User Mode (From pte)
+ self.xwr = Signal(3) # Execute Read Write (From pte)
+ self.v = Signal(1) # Valid (From pte)
+ self.asid = Signal(asid_size) # Associated Address Space IDentifier
+ self.pte = Signal(pte_size) # Full Page Table Entry
+
+ def elaborate(self, platform=None):
+ m = Module()
+ # Pull out all control bites from PTE
+ m.d.comb += [
+ self.d.eq(self.i[7]),
+ self.a.eq(self.i[6]),
+ self.g.eq(self.i[5]),
+ self.u.eq(self.i[4]),
+ self.xwr.eq(self.i[1:4]),
+ self.v.eq(self.i[0])
+ ]
+ m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end])
+ m.d.comb += self.pte.eq(self.i[0:self.asid_start])
+ return m
--- /dev/null
+"""
+
+Online simulator of 4-way set-associative cache:
+http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html
+
+Python simulator of a N-way set-associative cache:
+https://github.com/vaskevich/CacheSim/blob/master/cachesim.py
+"""
+
+from nmigen import Array, Cat, Memory, Module, Signal, Mux, Elaboratable
+from nmigen.compat.genlib import fsm
+from nmigen.cli import main
+from nmigen.cli import verilog, rtlil
+
+from .AddressEncoder import AddressEncoder
+from .MemorySet import MemorySet
+
+# TODO: use a LFSR that advances continuously and picking the bottom
+# few bits from it to select which cache line to replace, instead of PLRU
+# http://bugs.libre-riscv.org/show_bug.cgi?id=71
+from .ariane.plru import PLRU
+from .LFSR import LFSR, LFSR_POLY_24
+
+SA_NA = "00" # no action (none)
+SA_RD = "01" # read
+SA_WR = "10" # write
+
+
+class SetAssociativeCache(Elaboratable):
+ """ Set Associative Cache Memory
+
+ The purpose of this module is to generate a memory cache given the
+ constraints passed in. This will create a n-way set associative cache.
+ It is expected for the SV TLB that the VMA will provide the set number
+ while the ASID provides the tag (still to be decided).
+
+ """
+
+ def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False):
+ """ Arguments
+ * tag_size (bits): The bit count of the tag
+ * data_size (bits): The bit count of the data to be stored
+ * set_count (number): The number of sets/entries in the cache
+ * way_count (number): The number of slots a data can be stored
+ in one set
+ * lfsr: if set, use an LFSR for (pseudo-randomly) selecting
+ set/entry to write to. otherwise, use a PLRU
+ """
+ # Internals
+ self.lfsr_mode = lfsr
+ self.way_count = way_count # The number of slots in one set
+ self.tag_size = tag_size # The bit count of the tag
+ self.data_size = data_size # The bit count of the data to be stored
+
+ # set up Memory array
+ self.mem_array = Array() # memory array
+ for i in range(way_count):
+ ms = MemorySet(data_size, tag_size, set_count, active=0)
+ self.mem_array.append(ms)
+
+ # Finds valid entries
+ self.encoder = AddressEncoder(way_count)
+
+ # setup PLRU or LFSR
+ if lfsr:
+ # LFSR mode
+ self.lfsr = LFSR(LFSR_POLY_24)
+ else:
+ # PLRU mode
+ # One block to handle plru calculations
+ self.plru = PLRU(way_count)
+ self.plru_array = Array() # PLRU data on each set
+ for i in range(set_count):
+ name = "plru%d" % i
+ self.plru_array.append(Signal(self.plru.TLBSZ, name=name))
+
+ # Input
+ self.enable = Signal(1) # Whether the cache is enabled
+ self.command = Signal(2) # 00=None, 01=Read, 10=Write (see SA_XX)
+ self.cset = Signal(range(set_count)) # The set to be checked
+ self.tag = Signal(tag_size) # The tag to find
+ self.data_i = Signal(data_size) # The input data
+
+ # Output
+ self.ready = Signal(1) # 0 => Processing 1 => Ready for commands
+ self.hit = Signal(1) # Tag matched one way in the given set
+ # Tag matched many ways in the given set
+ self.multiple_hit = Signal(1)
+ self.data_o = Signal(data_size) # The data linked to the matched tag
+
+ def check_tags(self, m):
+ """ Validate the tags in the selected set. If one and only one
+ tag matches set its state to zero and increment all others
+ by one. We only advance to next state if a single hit is found.
+ """
+ # Vector to store way valid results
+ # A zero denotes a way is invalid
+ valid_vector = []
+ # Loop through memory to prep read/write ports and set valid_vector
+ for i in range(self.way_count):
+ valid_vector.append(self.mem_array[i].valid)
+
+ # Pass encoder the valid vector
+ m.d.comb += self.encoder.i.eq(Cat(*valid_vector))
+
+ # Only one entry should be marked
+ # This is due to already verifying the tags
+ # matched and the valid bit is high
+ with m.If(self.hit):
+ m.next = "FINISHED_READ"
+ # Pull out data from the read port
+ data = self.mem_array[self.encoder.o].data_o
+ m.d.comb += self.data_o.eq(data)
+ if not self.lfsr_mode:
+ self.access_plru(m)
+
+ # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k
+ with m.Elif(self.multiple_hit):
+ # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
+ m.d.comb += self.data_o.eq(0)
+
+ # No tag matches means no data
+ with m.Else():
+ # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
+ m.d.comb += self.data_o.eq(0)
+
+ def access_plru(self, m):
+ """ An entry was accessed and the plru tree must now be updated
+ """
+ # Pull out the set's entry being edited
+ plru_entry = self.plru_array[self.cset]
+ m.d.comb += [
+ # Set the plru data to the current state
+ self.plru.plru_tree.eq(plru_entry),
+ # Set that the cache was accessed
+ self.plru.lu_access_i.eq(1)
+ ]
+
+ def read(self, m):
+ """ Go through the read process of the cache.
+ This takes two cycles to complete. First it checks for a valid tag
+ and secondly it updates the LRU values.
+ """
+ with m.FSM() as fsm_read:
+ with m.State("READY"):
+ m.d.comb += self.ready.eq(0)
+ # check_tags will set the state if the conditions are met
+ self.check_tags(m)
+ with m.State("FINISHED_READ"):
+ m.next = "READY"
+ m.d.comb += self.ready.eq(1)
+ if not self.lfsr_mode:
+ plru_tree_o = self.plru.plru_tree_o
+ m.d.sync += self.plru_array[self.cset].eq(plru_tree_o)
+
+ def write_entry(self, m):
+ if not self.lfsr_mode:
+ m.d.comb += [ # set cset (mem address) into PLRU
+ self.plru.plru_tree.eq(self.plru_array[self.cset]),
+ # and connect plru to encoder for write
+ self.encoder.i.eq(self.plru.replace_en_o)
+ ]
+ write_port = self.mem_array[self.encoder.o].w
+ else:
+ # use the LFSR to generate a random(ish) one of the mem array
+ lfsr_output = Signal(range(self.way_count))
+ lfsr_random = Signal(range(self.way_count))
+ m.d.comb += lfsr_output.eq(self.lfsr.state) # lose some bits
+ # address too big, limit to range of array
+ m.d.comb += lfsr_random.eq(Mux(lfsr_output > self.way_count,
+ lfsr_output - self.way_count,
+ lfsr_output))
+ write_port = self.mem_array[lfsr_random].w
+
+ # then if there is a match from the encoder, enable the selected write
+ with m.If(self.encoder.single_match):
+ m.d.comb += write_port.en.eq(1)
+
+ def write(self, m):
+ """ Go through the write process of the cache.
+ This takes two cycles to complete. First it writes the entry,
+ and secondly it updates the PLRU (in plru mode)
+ """
+ with m.FSM() as fsm_write:
+ with m.State("READY"):
+ m.d.comb += self.ready.eq(0)
+ self.write_entry(m)
+ m.next = "FINISHED_WRITE"
+ with m.State("FINISHED_WRITE"):
+ m.d.comb += self.ready.eq(1)
+ if not self.lfsr_mode:
+ plru_entry = self.plru_array[self.cset]
+ m.d.sync += plru_entry.eq(self.plru.plru_tree_o)
+ m.next = "READY"
+
+ def elaborate(self, platform=None):
+ m = Module()
+
+ # ----
+ # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array
+ # ----
+
+ m.submodules.AddressEncoder = self.encoder
+ if self.lfsr_mode:
+ m.submodules.LFSR = self.lfsr
+ else:
+ m.submodules.PLRU = self.plru
+
+ for i, mem in enumerate(self.mem_array):
+ setattr(m.submodules, "mem%d" % i, mem)
+
+ # ----
+ # select mode: PLRU connect to encoder, LFSR do... something
+ # ----
+
+ if not self.lfsr_mode:
+ # Set what entry was hit
+ m.d.comb += self.plru.lu_hit.eq(self.encoder.o)
+ else:
+ # enable LFSR
+ m.d.comb += self.lfsr.enable.eq(self.enable)
+
+ # ----
+ # connect hit/multiple hit to encoder output
+ # ----
+
+ m.d.comb += [
+ self.hit.eq(self.encoder.single_match),
+ self.multiple_hit.eq(self.encoder.multiple_match),
+ ]
+
+ # ----
+ # connect incoming data/tag/cset(addr) to mem_array
+ # ----
+
+ for mem in self.mem_array:
+ write_port = mem.w
+ m.d.comb += [mem.cset.eq(self.cset),
+ mem.tag.eq(self.tag),
+ mem.data_i.eq(self.data_i),
+ write_port.en.eq(0), # default: disable write
+ ]
+ # ----
+ # Commands: READ/WRITE/TODO
+ # ----
+
+ with m.If(self.enable):
+ with m.Switch(self.command):
+ # Search all sets at a particular tag
+ with m.Case(SA_RD):
+ self.read(m)
+ with m.Case(SA_WR):
+ self.write(m)
+ # Maybe catch multiple tags write here?
+ # TODO
+ # TODO: invalidate/flush, flush-all?
+
+ return m
+
+ def ports(self):
+ return [self.enable, self.command, self.cset, self.tag, self.data_i,
+ self.ready, self.hit, self.multiple_hit, self.data_o]
+
+
+if __name__ == '__main__':
+ sac = SetAssociativeCache(4, 8, 4, 6)
+ vl = rtlil.convert(sac, ports=sac.ports())
+ with open("SetAssociativeCache.il", "w") as f:
+ f.write(vl)
+
+ sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True)
+ vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports())
+ with open("SetAssociativeCacheLFSR.il", "w") as f:
+ f.write(vl)
--- /dev/null
+""" TLB Module
+
+ The expected form of the data is:
+ * Item (Bits)
+ * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0)
+"""
+
+from nmigen import Memory, Module, Signal, Cat, Elaboratable
+from nmigen.cli import main
+
+from .PermissionValidator import PermissionValidator
+from .Cam import Cam
+
+
+class TLB(Elaboratable):
+ def __init__(self, asid_size, vma_size, pte_size, L1_size):
+ """ Arguments
+ * asid_size: Address Space IDentifier (ASID) typically 15 bits
+ * vma_size: Virtual Memory Address (VMA) typically 36 bits
+ * pte_size: Page Table Entry (PTE) typically 64 bits
+
+ Notes:
+ These arguments should represent the largest possible size
+ defined by the MODE settings. See
+ Volume II: RISC-V Privileged Architectures V1.10 Page 57
+ """
+
+ # Internal
+ self.state = 0
+ # L1 Cache Modules
+ self.cam_L1 = Cam(vma_size, L1_size)
+ self.mem_L1 = Memory(width=asid_size + pte_size, depth=L1_size)
+
+ # Permission Validator
+ self.perm_validator = PermissionValidator(asid_size, pte_size)
+
+ # Inputs
+ self.supermode = Signal(1) # Supervisor Mode
+ self.super_access = Signal(1) # Supervisor Access
+ # 00=None, 01=Search, 10=Write L1, 11=Write L2
+ self.command = Signal(2)
+ self.xwr = Signal(3) # Execute, Write, Read
+ self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
+ self.address_L1 = Signal(range(L1_size))
+ self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
+ self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
+ self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
+
+ # Outputs
+ self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
+ self.perm_valid = Signal(1) # Denotes if the permissions are correct
+ self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
+
+ def search(self, m, read_L1, write_L1):
+ """ searches the TLB
+ """
+ m.d.comb += [
+ write_L1.en.eq(0),
+ self.cam_L1.write_enable.eq(0),
+ self.cam_L1.data_in.eq(self.vma)
+ ]
+ # Match found in L1 CAM
+ match_found = Signal(reset_less=True)
+ m.d.comb += match_found.eq(self.cam_L1.single_match
+ | self.cam_L1.multiple_match)
+ with m.If(match_found):
+ # Memory shortcut variables
+ mem_address = self.cam_L1.match_address
+ # Memory Logic
+ m.d.comb += read_L1.addr.eq(mem_address)
+ # Permission Validator Logic
+ m.d.comb += [
+ self.hit.eq(1),
+ # Set permission validator data to the correct
+ # register file data according to CAM match
+ # address
+ self.perm_validator.data.eq(read_L1.data),
+ # Execute, Read, Write
+ self.perm_validator.xwr.eq(self.xwr),
+ # Supervisor Mode
+ self.perm_validator.super_mode.eq(self.supermode),
+ # Supverisor Access
+ self.perm_validator.super_access.eq(self.super_access),
+ # Address Space IDentifier (ASID)
+ self.perm_validator.asid.eq(self.asid),
+ # Output result of permission validation
+ self.perm_valid.eq(self.perm_validator.valid)
+ ]
+ # Only output PTE if permissions are valid
+ with m.If(self.perm_validator.valid):
+ # XXX TODO - dummy for now
+ reg_data = Signal.like(self.pte_out)
+ m.d.comb += [
+ self.pte_out.eq(reg_data)
+ ]
+ with m.Else():
+ m.d.comb += [
+ self.pte_out.eq(0)
+ ]
+ # Miss Logic
+ with m.Else():
+ m.d.comb += [
+ self.hit.eq(0),
+ self.perm_valid.eq(0),
+ self.pte_out.eq(0)
+ ]
+
+ def write_l1(self, m, read_L1, write_L1):
+ """ writes to the L1 cache
+ """
+ # Memory_L1 Logic
+ m.d.comb += [
+ write_L1.en.eq(1),
+ write_L1.addr.eq(self.address_L1),
+ # The Cat places arguments from LSB -> MSB
+ write_L1.data.eq(Cat(self.pte_in, self.asid))
+ ]
+ # CAM_L1 Logic
+ m.d.comb += [
+ self.cam_L1.write_enable.eq(1),
+ self.cam_L1.data_in.eq(self.vma), # data_in is sent to all entries
+ # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected
+
+ ]
+
+ def elaborate(self, platform):
+ m = Module()
+ # Add submodules
+ # Submodules for L1 Cache
+ m.submodules.cam_L1 = self.cam_L1
+ m.submodules.read_L1 = read_L1 = self.mem_L1.read_port()
+ m.submodules.write_L1 = write_L1 = self.mem_L1.write_port()
+
+ # Permission Validator Submodule
+ m.submodules.perm_valididator = self.perm_validator
+
+ # When MODE specifies translation
+ # TODO add in different bit length handling ie prefix 0s
+ tlb_enable = Signal(reset_less=True)
+ m.d.comb += tlb_enable.eq(self.mode != 0)
+
+ with m.If(tlb_enable):
+ m.d.comb += [
+ self.cam_L1.enable.eq(1)
+ ]
+ with m.Switch(self.command):
+ # Search
+ with m.Case("01"):
+ self.search(m, read_L1, write_L1)
+
+ # Write L1
+ # Expected that the miss will be handled in software
+ with m.Case("10"):
+ self.write_l1(m, read_L1, write_L1)
+
+ # TODO
+ # with m.Case("11"):
+
+ # When disabled
+ with m.Else():
+ m.d.comb += [
+ self.cam_L1.enable.eq(0),
+ # XXX TODO - self.reg_file.enable.eq(0),
+ self.hit.eq(0),
+ self.perm_valid.eq(0), # XXX TODO, check this
+ self.pte_out.eq(0)
+ ]
+ return m
+
+
+if __name__ == '__main__':
+ tlb = TLB(15, 36, 64, 4)
+ main(tlb, ports=[tlb.supermode, tlb.super_access, tlb.command,
+ tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid,
+ tlb.vma, tlb.pte_in,
+ tlb.hit, tlb.perm_valid, tlb.pte_out,
+ ] + tlb.cam_L1.ports())
--- /dev/null
+#include <cstdint>
+#include <iostream>
+#include <cmath>
+
+
+#define NWAY 4
+#define NLINE 256
+#define HIT 0
+#define MISS 1
+#define MS 1000
+/*
+Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing
+Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt
+four-way set associative - three bits
+ each bit represents one branch point in a binary decision tree; let 1
+ represent that the left side has been referenced more recently than the
+ right side, and 0 vice-versa
+ are all 4 lines valid?
+ / \
+ yes no, use an invalid line
+ |
+ |
+ |
+ bit_0 == 0? state | replace ref to | next state
+ / \ ------+-------- -------+-----------
+ y n 00x | line_0 line_0 | 11_
+ / \ 01x | line_1 line_1 | 10_
+ bit_1 == 0? bit_2 == 0? 1x0 | line_2 line_2 | 0_1
+ / \ / \ 1x1 | line_3 line_3 | 0_0
+ y n y n
+ / \ / \ ('x' means ('_' means unchanged)
+ line_0 line_1 line_2 line_3 don't care)
+ 8-way set associative - 7 = 1+2+4 bits
+16-way set associative - 15 = 1+2+4+8 bits
+32-way set associative - 31 = 1+2+4+8+16 bits
+64-way set associative - 63 = 1+2+4+8+16+32 bits
+*/
+using namespace std;
+struct AddressField {
+ uint64_t wd_idx : 2;//Unused
+ uint64_t offset : 4;//Unused
+ uint64_t index : 8;//NLINE = 256 = 2^8
+ uint64_t tag : 50;
+};
+
+union Address {
+ uint32_t* p;
+ AddressField fields;
+};
+
+struct Cell {
+ bool v;
+ uint64_t tag;
+
+ Cell() : v(false), tag(0) {}
+
+ bool isHit(uint64_t tag) {
+ return v && (tag == this->tag);
+ }
+
+ void fetch(uint32_t* address) {
+ Address addr;
+ addr.p = address;
+ addr.fields.offset = 0;
+ addr.fields.wd_idx = 0;
+ tag = addr.fields.tag;
+ v = true;
+ }
+};
+
+ostream& operator<<(ostream & out, const Cell& cell) {
+ out << " v:" << cell.v << " tag:" << hex << cell.tag;
+ return out;
+}
+
+struct Block {
+ Cell cell[NWAY];
+ uint32_t state;
+ uint64_t *mask;//Mask the state to get accurate value for specified 1 bit.
+ uint64_t *value;
+ uint64_t *next_value;
+
+ Block() : state(0) {
+ switch (NWAY) {
+ case 4:
+ mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101};
+ value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101};
+ next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000};
+ break;
+ case 8:
+ mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001,
+ 0b1010001};
+ value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000,
+ 0b1010001};
+ next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000,
+ 0b0000001, 0b0000000};
+ break;
+ //TODO - more NWAY goes here.
+ default:
+ std::cout << "Error definition NWAY = " << NWAY << std::endl;
+ }
+ }
+
+ uint32_t *getByTag(uint64_t tag, uint32_t *pway) {
+ for (int i = 0; i < NWAY; ++i) {
+ if (cell[i].isHit(tag)) {
+ *pway = i;
+ return pway;
+ }
+ }
+ return NULL;
+ }
+
+ void setLRU(uint32_t *address) {
+ int way = 0;
+ uint32_t st = state;
+ for (int i = 0; i < NWAY; ++i) {
+ if ((state & mask[i]) == value[i]) {
+ state ^= mask[i];
+ way = i;
+ break;
+ }
+ }
+ cell[way].fetch(address);
+ cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl;
+ }
+
+ uint32_t *get(uint32_t *address, uint32_t *pway) {
+ Address addr;
+ addr.p = address;
+ uint32_t *d = getByTag(addr.fields.tag, pway);
+ if (d != NULL) {
+ return &d[addr.fields.offset];
+ }
+ return d;
+ }
+
+ int set(uint32_t *address) {
+ uint32_t way = 0;
+ uint32_t *p = get(address, &way);
+ if (p != NULL) {
+ printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state);
+ state &= ~mask[way];
+ printf("%X --> ", state);
+ state |= next_value[way];
+ printf("%X\n", state);
+ // *p = *address; //skip since address is fake.
+ return HIT;
+ } else {
+ setLRU(address);
+ return MISS;
+ }
+ }
+};
+
+ostream& operator<<(ostream & out, const Block& block) {
+ out << "state:" << block.state << " ";
+ for (int i = 0; i<NWAY; i++) {
+ out << block.cell[i];
+ }
+ return out;
+}
+
+struct Cache {
+ Block block[NLINE];
+ uint32_t count[2];
+ Cache() { count[HIT] = 0; count[MISS] = 0; }
+
+ void access(uint32_t* address) {
+ Address addr;
+ addr.p = address;
+ Block& b = block[addr.fields.index];
+ ++count[b.set(address)];
+ }
+
+};
+ostream& operator<<(ostream & out, const Cache& cache) {
+ out << "\n==Summary==\n\tHit: " << cache.count[HIT] << " Miss: " << cache.count[MISS] << std::endl;
+ for (int i = 0; i < NLINE; i++) {
+ out << cache.block[i] << endl;
+ }
+ return out;
+}
+
+Cache cache;
+void multiply(uint32_t* m1, uint32_t* m2, uint32_t* res)
+{
+ int x, i, j;
+ for (i = 0; i < MS; i++) {
+ for (j = 0; j < MS; j++) {
+ cache.access(res + i*MS +j);
+ for (x = 0; x < MS; x++) {
+ cache.access(m1 + i*MS + x);
+ cache.access(m2 + x*MS + j);
+ cache.access(res + i*MS +j);
+ // res[i][j] += m1[i][x] * m2[x][j];
+ cache.access(res + i*MS +j);
+ }
+ }
+ }
+}
+
+int main()
+{
+ uint32_t* m1 = (uint32_t*) 0xFACE00A000000000LL; // fake virtual address; don’t access it
+ uint32_t* m2 = (uint32_t*) 0xFACE00B000000000LL; // fake virtual address; don’t access it
+ uint32_t* res = (uint32_t*) 0xFACE00C000000000LL; // fake virtual address; don’t access it
+ multiply(m1, m2, res);
+ cout << cache << endl;
+ return 0;
+}
--- /dev/null
+from nmigen import Const
+
+INSTR_ADDR_MISALIGNED = Const(0, 64)
+INSTR_ACCESS_FAULT = Const(1, 64)
+ILLEGAL_INSTR = Const(2, 64)
+BREAKPOINT = Const(3, 64)
+LD_ADDR_MISALIGNED = Const(4, 64)
+LD_ACCESS_FAULT = Const(5, 64)
+ST_ADDR_MISALIGNED = Const(6, 64)
+ST_ACCESS_FAULT = Const(7, 64)
+ENV_CALL_UMODE = Const(8, 64) # environment call from user mode
+ENV_CALL_SMODE = Const(9, 64) # environment call from supervisor mode
+ENV_CALL_MMODE = Const(11, 64) # environment call from machine mode
+INSTR_PAGE_FAULT = Const(12, 64) # Instruction page fault
+LOAD_PAGE_FAULT = Const(13, 64) # Load page fault
+STORE_PAGE_FAULT = Const(15, 64) # Store page fault
--- /dev/null
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: Florian Zaruba, ETH Zurich
+# Date: 12.11.2017
+# Description: Handles cache misses.
+from nmigen.lib.coding import Encoder, PriorityEncoder
+
+
+# --------------
+# MISS Handler
+# --------------
+import ariane_pkg::*;
+import std_cache_pkg::*;
+
+unsigned NR_PORTS = 3
+
+class MissReq(RecordObject):
+ def __init__(self, name=None):
+ Record.__init__(self, name)
+ self.valid = Signal()
+ self.addr = Signal(64)
+ self.be = Signal(8)
+ self.size = Signal(2)
+ self.we = Signal()
+ self.wdata = Signal(64)
+ bypass = Signal()
+
+class CacheLine:
+ def __init__(self):
+ self.tag = Signal(DCACHE_TAG_WIDTH) # tag array
+ self.data = Signal(DCACHE_LINE_WIDTH) # data array
+ self.valid = Signal() # state array
+ self.dirty = Signal() # state array
+
+# cache line byte enable
+class CLBE:
+ def __init__(self):
+ self.tag = Signal(DCACHE_TAG_WIDTH+7)//8) # byte enable into tag array
+ self.data = Signal(DCACHE_LINE_WIDTH+7)//8) # byte enable data array
+ # bit enable into state array (valid for a pair of dirty/valid bits)
+ self.vldrty = Signal(DCACHE_SET_ASSOC)
+ } cl_be_t;
+
+
+
+ # FSM states
+"""
+ enum logic [3:0] {
+ IDLE, # 0
+ FLUSHING, # 1
+ FLUSH, # 2
+ WB_CACHELINE_FLUSH, # 3
+ FLUSH_REQ_STATUS, # 4
+ WB_CACHELINE_MISS, # 5
+ WAIT_GNT_SRAM, # 6
+ MISS, # 7
+ REQ_CACHELINE, # 8
+ MISS_REPL, # 9
+ SAVE_CACHELINE, # A
+ INIT, # B
+ AMO_LOAD, # C
+ AMO_SAVE_LOAD, # D
+ AMO_STORE # E
+ } state_d, state_q;
+"""
+
+class MissHandler(Elaboratable):
+ def __init__(self, NR_PORTS):
+ self.NR_PORTS = NR_PORTS
+ self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
+ self.flush_i = Signal() # flush request
+ self.flush_ack_o = Signal() # acknowledge successful flush
+ self.miss_o = Signal()
+ self.busy_i = Signal() # dcache is busy with something
+
+ # Bypass or miss
+ self.miss_req_i = Array(MissReq(name="missreq") for i in range(NR_PORTS))
+ # Bypass handling
+ self.bypass_gnt_o = Signal(NR_PORTS)
+ self.bypass_valid_o = Signal(NR_PORTS)
+ self.bypass_data_o = Array(Signal(name="bdata_o", 64) \
+ for i in range(NR_PORTS))
+
+ # AXI port
+ output ariane_axi::req_t axi_bypass_o,
+ input ariane_axi::resp_t axi_bypass_i,
+
+ # Miss handling (~> cacheline refill)
+ self.miss_gnt_o = Signal(NR_PORTS)
+ self.active_serving_o = Signal(NR_PORTS)
+
+ self.critical_word_o = Signal(64)
+ self.critical_word_valid_o = Signal()
+ output ariane_axi::req_t axi_data_o,
+ input ariane_axi::resp_t axi_data_i,
+
+ self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \
+ for i in range(NR_PORTS))
+ self.mshr_addr_matches_o = Signal(NR_PORTS)
+ self.mshr_index_matches_o = Signal(NR_PORTS)
+
+ # AMO
+ self.amo_req_i = AMOReq()
+ self.amo_resp_o = AMOResp()
+ # Port to SRAMs, for refill and eviction
+ self.req_o = Signal(DCACHE_SET_ASSOC)
+ self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array
+ self.data_o = CacheLine()
+ self.be_o = CLBE()
+ self.data_i = Array(CacheLine() \
+ for i in range(DCACHE_SET_ASSOC))
+ self.we_o = Signal()
+
+ def elaborate(self, platform):
+ # Registers
+ mshr_t mshr_d, mshr_q;
+ logic [DCACHE_INDEX_WIDTH-1:0] cnt_d, cnt_q;
+ logic [DCACHE_SET_ASSOC-1:0] evict_way_d, evict_way_q;
+ # cache line to evict
+ cache_line_t evict_cl_d, evict_cl_q;
+
+ logic serve_amo_d, serve_amo_q;
+ # Request from one FSM
+ miss_req_valid = Signal(self.NR_PORTS)
+ miss_req_bypass = Signal(self.NR_PORTS)
+ miss_req_addr = Array(Signal(name="miss_req_addr", 64) \
+ for i in range(NR_PORTS))
+ miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \
+ for i in range(NR_PORTS))
+ miss_req_we = Signal(self.NR_PORTS)
+ miss_req_be = Array(Signal(name="miss_req_be", 8) \
+ for i in range(NR_PORTS))
+ miss_req_size = Array(Signal(name="miss_req_size", 2) \
+ for i in range(NR_PORTS))
+
+ # Cache Line Refill <-> AXI
+ req_fsm_miss_valid = Signal()
+ req_fsm_miss_addr = Signal(64)
+ req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH)
+ req_fsm_miss_we = Signal()
+ req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8)
+ ariane_axi::ad_req_t req_fsm_miss_req;
+ req_fsm_miss_size = Signal(2)
+
+ gnt_miss_fsm = Signal()
+ valid_miss_fsm = Signal()
+ nmiss = DCACHE_LINE_WIDTH//64
+ data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \
+ for i in range(nmiss))
+
+ # Cache Management <-> LFSR
+ lfsr_enable = Signal()
+ lfsr_oh = Signal(DCACHE_SET_ASSOC)
+ lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1))
+ # AMOs
+ ariane_pkg::amo_t amo_op;
+ amo_operand_a = Signal(64)
+ amo_operand_b = Signal(64)
+ amo_result_o = Signal(64)
+
+ struct packed {
+ logic [63:3] address;
+ logic valid;
+ } reservation_d, reservation_q;
+
+ # ------------------------------
+ # Cache Management
+ # ------------------------------
+ evict_way = Signal(DCACHE_SET_ASSOC)
+ valid_way = Signal(DCACHE_SET_ASSOC)
+
+ for (i in range(DCACHE_SET_ASSOC):
+ comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty)
+ comb += valid_way[i].eq(data_i[i].valid)
+
+ # ----------------------
+ # Default Assignments
+ # ----------------------
+ # to AXI refill
+ req_fsm_miss_req = ariane_axi::CACHE_LINE_REQ;
+ req_fsm_miss_size = Const(0b11, 2)
+ # core
+ serve_amo_d = serve_amo_q;
+ # --------------------------------
+ # Flush and Miss operation
+ # --------------------------------
+ state_d = state_q;
+ cnt_d = cnt_q;
+ evict_way_d = evict_way_q;
+ evict_cl_d = evict_cl_q;
+ mshr_d = mshr_q;
+ # communicate to the requester which unit we are currently serving
+ active_serving_o[mshr_q.id] = mshr_q.valid;
+ # AMOs
+ # silence the unit when not used
+ amo_op = amo_req_i.amo_op;
+
+ reservation_d = reservation_q;
+ with m.FSM() as state_q:
+
+ with m.Case("IDLE"):
+ # lowest priority are AMOs, wait until everything else
+ # is served before going for the AMOs
+ with m.If (amo_req_i.req & ~busy_i):
+ # 1. Flush the cache
+ with m.If(~serve_amo_q):
+ m.next = "FLUSH_REQ_STATUS"
+ serve_amo_d.eq(0b1
+ cnt_d.eq(0
+ # 2. Do the AMO
+ with m.Else():
+ m.next = "AMO_LOAD"
+ serve_amo_d.eq(0b0
+
+ # check if we want to flush and can flush
+ # e.g.: we are not busy anymore
+ # TODO: Check that the busy flag is indeed needed
+ with m.If (flush_i & ~busy_i):
+ m.next = "FLUSH_REQ_STATUS"
+ cnt_d = 0
+
+ # check if one of the state machines missed
+ for i in range(NR_PORTS):
+ # here comes the refill portion of code
+ with m.If (miss_req_valid[i] & ~miss_req_bypass[i]):
+ m.next = "MISS"
+ # we are taking another request so don't
+ # take the AMO
+ serve_amo_d = 0b0;
+ # save to MSHR
+ wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
+ comb += [ mshr_d.valid.eq(0b1),
+ mshr_d.we.eq(miss_req_we[i]),
+ mshr_d.id.eq(i),
+ mshr_d.addr.eq(miss_req_addr[i][0:wid]),
+ mshr_d.wdata.eq(miss_req_wdata[i]),
+ mshr_d.be.eq(miss_req_be[i]),
+ ]
+ break
+
+ # ~> we missed on the cache
+ with m.Case("MISS"):
+ # 1. Check if there is an empty cache-line
+ # 2. If not -> evict one
+ comb += req_o.eq(1)
+ sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]
+ m.next = "MISS_REPL"
+ comb += miss_o.eq(1)
+
+ # ~> second miss cycle
+ with m.Case("MISS_REPL"):
+ # if all are valid we need to evict one,
+ # pseudo random from LFSR
+ with m.If(~(~valid_way).bool()):
+ comb += lfsr_enable.eq(0b1)
+ comb += evict_way_d.eq(lfsr_oh)
+ # do we need to write back the cache line?
+ with m.If(data_i[lfsr_bin].dirty):
+ state_d = WB_CACHELINE_MISS;
+ comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag)
+ comb += evict_cl_d.data.eq(data_i[lfsr_bin].data)
+ comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
+ # no - we can request a cache line now
+ with m.Else():
+ m.next = "REQ_CACHELINE"
+ # we have at least one free way
+ with m.Else():
+ # get victim cache-line by looking for the
+ # first non-valid bit
+ comb += evict_way_d.eq(get_victim_cl(~valid_way)
+ m.next = "REQ_CACHELINE"
+
+ # ~> we can just load the cache-line,
+ # the way is store in evict_way_q
+ with m.Case("REQ_CACHELINE"):
+ comb += req_fsm_miss_valid .eq(1)
+ sync += req_fsm_miss_addr .eq(mshr_q.addr)
+
+ with m.If (gnt_miss_fsm):
+ m.next = "SAVE_CACHELINE"
+ comb += miss_gnt_o[mshr_q.id].eq(1)
+
+ # ~> replace the cacheline
+ with m.Case("SAVE_CACHELINE"):
+ # calculate cacheline offset
+ automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset;
+ sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6)
+ # we've got a valid response from refill unit
+ with m.If (valid_miss_fsm):
+ wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
+ sync += addr_o .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
+ sync += req_o .eq(evict_way_q)
+ comb += we_o .eq(1)
+ comb += be_o .eq(1)
+ sync += be_o.vldrty .eq(evict_way_q)
+ sync += data_o.tag .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid]
+ comb += data_o.data .eq(data_miss_fsm)
+ comb += data_o.valid.eq(1)
+ comb += data_o.dirty.eq(0)
+
+ # is this a write?
+ with m.If (mshr_q.we):
+ # Yes, so safe the updated data now
+ for i in range(8):
+ # check if we really want to write
+ # the corresponding byte
+ with m.If (mshr_q.be[i]):
+ sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i];
+ # it's immediately dirty if we write
+ comb += data_o.dirty.eq(1)
+
+ # reset MSHR
+ comb += mshr_d.valid.eq(0)
+ # go back to idle
+ m.next = 'IDLE'
+
+ # ------------------------------
+ # Write Back Operation
+ # ------------------------------
+ # ~> evict a cache line from way saved in evict_way_q
+ with m.Case("WB_CACHELINE_FLUSH"):
+ with m.Case("WB_CACHELINE_MISS"):
+
+ comb += req_fsm_miss_valid .eq(0b1)
+ sync += req_fsm_miss_addr .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}};
+ comb += req_fsm_miss_be .eq(1)
+ comb += req_fsm_miss_we .eq(0b1)
+ sync += req_fsm_miss_wdata .eq(evict_cl_q.data;
+
+ # we've got a grant --> this is timing critical, think about it
+ if (gnt_miss_fsm) begin
+ # write status array
+ sync += addr_o .eq(cnt_q)
+ comb += req_o .eq(0b1)
+ comb += we_o .eq(0b1)
+ comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 0b0 : 0b1)
+ # invalidate
+ sync += be_o.vldrty.eq(evict_way_q)
+ # go back to handling the miss or flushing,
+ # depending on where we came from
+ with m.If(state_q == WB_CACHELINE_MISS):
+ m.next = "MISS"
+ with m.Else():
+ m.next = "FLUSH_REQ_STATUS"
+
+ # ------------------------------
+ # Flushing & Initialization
+ # ------------------------------
+ # ~> make another request to check the same
+ # cache-line if there are still some valid entries
+ with m.Case("FLUSH_REQ_STATUS"):
+ comb += req_o .eq(1)
+ sync += addr_o .eq(cnt_q)
+ m.next = "FLUSHING"
+
+ with m.Case("FLUSHING"):
+ # this has priority
+ # at least one of the cache lines is dirty
+ with m.If(~evict_way):
+ # evict cache line, look for the first
+ # cache-line which is dirty
+ comb += evict_way_d.eq(get_victim_cl(evict_way))
+ comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)])
+ state_d = WB_CACHELINE_FLUSH;
+ # not dirty ~> increment and continue
+ with m.Else():
+ # increment and re-request
+ sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
+ m.next = "FLUSH_REQ_STATUS"
+ sync += addr_o .eq(cnt_q)
+ comb += req_o .eq(1)
+ comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 1 : 0)
+ comb += we_o .eq(1)
+ # finished with flushing operation, go back to idle
+ with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
+ == DCACHE_NUM_WORDS-1):
+ # only acknowledge if the flush wasn't
+ # triggered by an atomic
+ sync += flush_ack_o.eq(~serve_amo_q)
+ m.next = "IDLE"
+
+ # ~> only called after reset
+ with m.Case("INIT"):
+ # initialize status array
+ sync += addr_o.eq(cnt_q)
+ comb += req_o .eq(1)
+ comb += we_o .eq(1)
+ # only write the dirty array
+ comb += be_o.vldrty.eq(1)
+ sync += cnt_d .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
+ # finished initialization
+ with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
+ == DCACHE_NUM_WORDS-1)
+ m.next = "IDLE"
+
+ # ----------------------
+ # AMOs
+ # ----------------------
+ # TODO(zarubaf) Move this closer to memory
+ # ~> we are here because we need to do the AMO,
+ # the cache is clean at this point
+ # start by executing the load
+ with m.Case("AMO_LOAD"):
+ comb += req_fsm_miss_valid.eq(1)
+ # address is in operand a
+ comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
+ comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ)
+ comb += req_fsm_miss_size.eq(amo_req_i.size)
+ # the request has been granted
+ with m.If(gnt_miss_fsm):
+ m.next = "AMO_SAVE_LOAD"
+ # save the load value
+ with m.Case("AMO_SAVE_LOAD"):
+ with m.If (valid_miss_fsm):
+ # we are only concerned about the lower 64-bit
+ comb += mshr_d.wdata.eq(data_miss_fsm[0])
+ m.next = "AMO_STORE"
+ # and do the store
+ with m.Case("AMO_STORE"):
+ load_data = Signal(64)
+ # re-align load data
+ comb += load_data.eq(data_align(amo_req_i.operand_a[:3],
+ mshr_q.wdata))
+ # Sign-extend for word operation
+ with m.If (amo_req_i.size == 0b10):
+ comb += amo_operand_a.eq(sext32(load_data[:32]))
+ comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32]))
+ with m.Else():
+ comb += amo_operand_a.eq(load_data)
+ comb += amo_operand_b.eq(amo_req_i.operand_b)
+
+ # we do not need a store request for load reserved
+ # or a failing store conditional
+ # we can bail-out without making any further requests
+ with m.If ((amo_req_i.amo_op == AMO_LR) | \
+ ((amo_req_i.amo_op == AMO_SC) & \
+ ((reservation_q.valid & \
+ (reservation_q.address != \
+ amo_req_i.operand_a[3:64])) | \
+ ~reservation_q.valid))):
+ comb += req_fsm_miss_valid.eq(0)
+ m.next = "IDLE"
+ comb += amo_resp_o.ack.eq(1)
+ # write-back the result
+ comb += amo_resp_o.result.eq(amo_operand_a)
+ # we know that the SC failed
+ with m.If (amo_req_i.amo_op == AMO_SC):
+ comb += amo_resp_o.result.eq(1)
+ # also clear the reservation
+ comb += reservation_d.valid.eq(0)
+ with m.Else():
+ comb += req_fsm_miss_valid.eq(1)
+
+ comb += req_fsm_miss_we .eq(1)
+ comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ)
+ comb += req_fsm_miss_size.eq(amo_req_i.size)
+ comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
+
+ comb += req_fsm_miss_wdata.eq(
+ data_align(amo_req_i.operand_a[0:3], amo_result_o))
+ comb += req_fsm_miss_be.eq(
+ be_gen(amo_req_i.operand_a[0:3], amo_req_i.size))
+
+ # place a reservation on the memory
+ with m.If (amo_req_i.amo_op == AMO_LR):
+ comb += reservation_d.address.eq(amo_req_i.operand_a[3:64])
+ comb += reservation_d.valid.eq(1)
+
+ # the request is valid or we didn't need to go for another store
+ with m.If (valid_miss_fsm):
+ m.next = "IDLE"
+ comb += amo_resp_o.ack.eq(1)
+ # write-back the result
+ comb += amo_resp_o.result.eq(amo_operand_a;
+
+ if (amo_req_i.amo_op == AMO_SC) begin
+ comb += amo_resp_o.result.eq(0)
+ # An SC must fail if there is another SC
+ # (to any address) between the LR and the SC in
+ # program order (even to the same address).
+ # in any case destroy the reservation
+ comb += reservation_d.valid.eq(0)
+
+ # check MSHR for aliasing
+
+ comb += mshr_addr_matches_o .eq(0)
+ comb += mshr_index_matches_o.eq()
+
+ for i in range(NR_PORTS):
+ # check mshr for potential matching of other units,
+ # exclude the unit currently being served
+ with m.If (mshr_q.valid & \
+ (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \
+ mshr_q.addr[DCACHE_BYTE_OFFSET:56])):
+ comb += mshr_addr_matches_o[i].eq(1)
+
+ # same as previous, but checking only the index
+ with m.If (mshr_q.valid & \
+ (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \
+ mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])):
+ mshr_index_matches_o[i].eq(1)
+
+ # --------------------
+ # Sequential Process
+ # --------------------
+
+ """
+ #pragma translate_off
+ `ifndef VERILATOR
+ # assert that cache only hits on one way
+ assert property (
+ @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded");
+ `endif
+ #pragma translate_on
+ """
+
+ # ----------------------
+ # Bypass Arbiter
+ # ----------------------
+ # Connection Arbiter <-> AXI
+ req_fsm_bypass_valid = Signal()
+ req_fsm_bypass_addr = Signal(64)
+ req_fsm_bypass_wdata = Signal(64)
+ req_fsm_bypass_we = Signal()
+ req_fsm_bypass_be = Signal(8)
+ req_fsm_bypass_size = Signal(2)
+ gnt_bypass_fsm = Signal()
+ valid_bypass_fsm = Signal()
+ data_bypass_fsm = Signal(64)
+ logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass;
+ logic [3:0] id_bypass_fsm;
+ logic [3:0] gnt_id_bypass_fsm;
+
+ i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64)
+ comb += [
+ # Master Side
+ ib.data_req_i .eq( miss_req_valid & miss_req_bypass ),
+ ib.address_i .eq( miss_req_addr ),
+ ib.data_wdata_i .eq( miss_req_wdata ),
+ ib.data_we_i .eq( miss_req_we ),
+ ib.data_be_i .eq( miss_req_be ),
+ ib.data_size_i .eq( miss_req_size ),
+ ib.data_gnt_o .eq( bypass_gnt_o ),
+ ib.data_rvalid_o .eq( bypass_valid_o ),
+ ib.data_rdata_o .eq( bypass_data_o ),
+ # Slave Sid
+ ib.id_i .eq( id_bypass_fsm[$clog2(NR_PORTS)-1:0] ),
+ ib.id_o .eq( id_fsm_bypass ),
+ ib.gnt_id_i .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0] ),
+ ib.address_o .eq( req_fsm_bypass_addr ),
+ ib.data_wdata_o .eq( req_fsm_bypass_wdata ),
+ ib.data_req_o .eq( req_fsm_bypass_valid ),
+ ib.data_we_o .eq( req_fsm_bypass_we ),
+ ib.data_be_o .eq( req_fsm_bypass_be ),
+ ib.data_size_o .eq( req_fsm_bypass_size ),
+ ib.data_gnt_i .eq( gnt_bypass_fsm ),
+ ib.data_rvalid_i .eq( valid_bypass_fsm ),
+ ib.data_rdata_i .eq( data_bypass_fsm ),
+ ]
+
+ axi_adapter #(
+ .DATA_WIDTH ( 64 ),
+ .AXI_ID_WIDTH ( 4 ),
+ .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
+ ) i_bypass_axi_adapter (
+ .clk_i,
+ .rst_ni,
+ .req_i ( req_fsm_bypass_valid ),
+ .type_i ( ariane_axi::SINGLE_REQ ),
+ .gnt_o ( gnt_bypass_fsm ),
+ .addr_i ( req_fsm_bypass_addr ),
+ .we_i ( req_fsm_bypass_we ),
+ .wdata_i ( req_fsm_bypass_wdata ),
+ .be_i ( req_fsm_bypass_be ),
+ .size_i ( req_fsm_bypass_size ),
+ .id_i ( Cat(id_fsm_bypass, 0, 0) ),
+ .valid_o ( valid_bypass_fsm ),
+ .rdata_o ( data_bypass_fsm ),
+ .gnt_id_o ( gnt_id_bypass_fsm ),
+ .id_o ( id_bypass_fsm ),
+ .critical_word_o ( ), # not used for single requests
+ .critical_word_valid_o ( ), # not used for single requests
+ .axi_req_o ( axi_bypass_o ),
+ .axi_resp_i ( axi_bypass_i )
+ );
+
+ # ----------------------
+ # Cache Line AXI Refill
+ # ----------------------
+ axi_adapter #(
+ .DATA_WIDTH ( DCACHE_LINE_WIDTH ),
+ .AXI_ID_WIDTH ( 4 ),
+ .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
+ ) i_miss_axi_adapter (
+ .clk_i,
+ .rst_ni,
+ .req_i ( req_fsm_miss_valid ),
+ .type_i ( req_fsm_miss_req ),
+ .gnt_o ( gnt_miss_fsm ),
+ .addr_i ( req_fsm_miss_addr ),
+ .we_i ( req_fsm_miss_we ),
+ .wdata_i ( req_fsm_miss_wdata ),
+ .be_i ( req_fsm_miss_be ),
+ .size_i ( req_fsm_miss_size ),
+ .id_i ( Const(0b1100, 4) ),
+ .gnt_id_o ( ), # open
+ .valid_o ( valid_miss_fsm ),
+ .rdata_o ( data_miss_fsm ),
+ .id_o ( ),
+ .critical_word_o,
+ .critical_word_valid_o,
+ .axi_req_o ( axi_data_o ),
+ .axi_resp_i ( axi_data_i )
+ );
+
+ # -----------------
+ # Replacement LFSR
+ # -----------------
+ lfsr_8bit #(.WIDTH (DCACHE_SET_ASSOC)) i_lfsr (
+ .en_i ( lfsr_enable ),
+ .refill_way_oh ( lfsr_oh ),
+ .refill_way_bin ( lfsr_bin ),
+ .*
+ );
+
+ # -----------------
+ # AMO ALU
+ # -----------------
+ amo_alu i_amo_alu (
+ .amo_op_i ( amo_op ),
+ .amo_operand_a_i ( amo_operand_a ),
+ .amo_operand_b_i ( amo_operand_b ),
+ .amo_result_o ( amo_result_o )
+ );
+
+ # -----------------
+ # Struct Split
+ # -----------------
+
+ for i in range(NR_PORTS):
+ miss_req = MissReq()
+ comb += miss_req.eq(miss_req_i[i]);
+ comb += miss_req_valid [i] .eq(miss_req.valid)
+ comb += miss_req_bypass [i] .eq(miss_req.bypass)
+ comb += miss_req_addr [i] .eq(miss_req.addr)
+ comb += miss_req_wdata [i] .eq(miss_req.wdata)
+ comb += miss_req_we [i] .eq(miss_req.we)
+ comb += miss_req_be [i] .eq(miss_req.be)
+ comb += miss_req_size [i] .eq(miss_req.size)
+
+ # --------------
+ # AXI Arbiter
+ # --------------s
+ #
+ # Description: Arbitrates access to AXI refill/bypass
+ #
+class AXIArbiter:
+ def __init__(self, NR_PORTS = 3, DATA_WIDTH = 64):
+ self.NR_PORTS = NR_PORTS
+ self.DATA_WIDTH = DATA_WIDTH
+ self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
+ rst_ni = ResetSignal() # Asynchronous reset active low
+ # master ports
+ self.data_req_i = Signal(NR_PORTS)
+ self.address_i = Array(Signal(name="address_i", 64) \
+ for i in range(NR_PORTS))
+ self.data_wdata_i = Array(Signal(name="data_wdata_i", 64) \
+ for i in range(NR_PORTS))
+ self.data_we_i = Signal(NR_PORTS)
+ self.data_be_i = Array(Signal(name="data_wdata_i", DATA_WIDTH/8) \
+ for i in range(NR_PORTS))
+ self.data_size_i = Array(Signal(name="data_size_i", 2) \
+ for i in range(NR_PORTS))
+ self.data_gnt_o = Signal(NR_PORTS)
+ self.data_rvalid_o = Signal(NR_PORTS)
+ self.data_rdata_o = Array(Signal(name="data_rdata_o", 64) \
+ for i in range(NR_PORTS))
+
+ # slave port
+ self.id_i = Signal(pwid)
+ self.id_o = Signal(pwid)
+ self.gnt_id_i = Signal(pwid)
+ self.data_req_o = Signal()
+ self.address_o = Signal(64)
+ self.data_wdata_o = Signal(DATA_WIDTH)
+ self.data_we_o = Signal()
+ self.data_be_o = Signal(DATA_WIDTH/8)
+ self.data_size_o = Signal(2)
+ self.data_gnt_i = Signal()
+ self.data_rvalid_i = Signal()
+ self.data_rdata_i = Signal(DATA_WIDTH)
+
+ def elaborate(self, platform):
+ #enum logic [1:0] { IDLE, REQ, SERVING } state_d, state_q;
+
+ class Packet:
+ def __init__(self, pwid, DATA_WIDTH):
+ self.id = Signal(pwid)
+ self.address = Signal(64)
+ self.data = Signal(64)
+ self.size = Signal(2)
+ self.be = Signal(DATA_WIDTH/8)
+ self.we = Signal()
+
+ request_index = Signal(self.pwid)
+ req_q = Packet(self.pwid, self.DATA_WIDTH)
+ req_d = Packet(self.pwid, self.DATA_WIDTH)
+
+ # request register
+ sync += req_q.eq(req_d)
+
+ # request port
+ comb += self.address_o .eq(req_q.address)
+ comb += self.data_wdata_o .eq(req_q.data)
+ comb += self.data_be_o .eq(req_q.be)
+ comb += self.data_size_o .eq(req_q.size)
+ comb += self.data_we_o .eq(req_q.we)
+ comb += self.id_o .eq(req_q.id)
+ comb += self.data_gnt_o .eq(0)
+ # read port
+ comb += self.data_rvalid_o .eq(0)
+ comb += self.data_rdata_o .eq(0)
+ comb += self.data_rdata_o[req_q.id].eq(data_rdata_i)
+
+ m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS)
+ comb += pp.i.eq(self.data_req_i) # select one request (priority-based)
+ comb += request_index.eq(pp.o)
+
+ with m.Switch("state") as s:
+
+ with m.Case("IDLE"):
+ # wait for incoming requests (priority encoder data_req_i)
+ with m.If(~pp.n): # one output valid from encoder
+ comb += self.data_req_o .eq(self.data_req_i[i])
+ comb += self.data_gnt_o[i].eq(self.data_req_i[i])
+ # save the request
+ comb += req_d.address.eq(self.address_i[i])
+ comb += req_d.id.eq(request_index)
+ comb += req_d.data.eq(self.data_wdata_i[i])
+ comb += req_d.size.eq(self.data_size_i[i])
+ comb += req_d.be.eq(self.data_be_i[i])
+ comb += req_d.we.eq(self.data_we_i[i])
+ m.next = "SERVING"
+
+ comb += self.address_o .eq(self.address_i[request_index])
+ comb += self.data_wdata_o .eq(self.data_wdata_i[request_index])
+ comb += self.data_be_o .eq(self.data_be_i[request_index])
+ comb += self.data_size_o .eq(self.data_size_i[request_index])
+ comb += self.data_we_o .eq(self.data_we_i[request_index])
+ comb += self.id_o .eq(request_index)
+
+ with m.Case("SERVING"):
+ comb += self.data_req_o.eq(1)
+ with m.If (self.data_rvalid_i):
+ comb += self.data_rvalid_o[req_q.id].eq(1)
+ m.next = "IDLE"
+
+ # ------------
+ # Assertions
+ # ------------
+
+ """
+#pragma translate_off
+`ifndef VERILATOR
+# make sure that we eventually get an rvalid after we received a grant
+assert property (@(posedge clk_i) data_gnt_i |-> ##[1:$] data_rvalid_i )
+ else begin $error("There was a grant without a rvalid"); $stop(); end
+# assert that there is no grant without a request
+assert property (@(negedge clk_i) data_gnt_i |-> data_req_o)
+ else begin $error("There was a grant without a request."); $stop(); end
+# assert that the address does not contain X when request is sent
+assert property ( @(posedge clk_i) (data_req_o) |-> (!$isunknown(address_o)) )
+ else begin $error("address contains X when request is set"); $stop(); end
+
+`endif
+#pragma translate_on
+ """
+
--- /dev/null
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: Florian Zaruba, ETH Zurich
+# Date: 19/04/2017
+# Description: Memory Management Unit for Ariane, contains TLB and
+# address translation unit. SV48 as defined in
+# Volume II: RISC-V Privileged Architectures V1.10 Page 63
+
+import ariane_pkg::*;
+"""
+
+from nmigen import Const, Signal, Cat, Module, Mux
+from nmigen.cli import verilog, rtlil
+
+from ptw import DCacheReqI, DCacheReqO, TLBUpdate, PTE, PTW
+from tlb import TLB
+from exceptcause import (INSTR_ACCESS_FAULT, INSTR_PAGE_FAULT,
+ LOAD_PAGE_FAULT, STORE_PAGE_FAULT)
+
+PRIV_LVL_M = Const(0b11, 2)
+PRIV_LVL_S = Const(0b01, 2)
+PRIV_LVL_U = Const(0b00, 2)
+
+
+class RVException:
+ def __init__(self):
+ self.cause = Signal(64) # cause of exception
+ self.tval = Signal(64) # more info of causing exception
+ # (e.g.: instruction causing it),
+ # address of LD/ST fault
+ self.valid = Signal()
+
+ def eq(self, inp):
+ res = []
+ for (o, i) in zip(self.ports(), inp.ports()):
+ res.append(o.eq(i))
+ return res
+
+ def __iter__(self):
+ yield self.cause
+ yield self.tval
+ yield self.valid
+
+ def ports(self):
+ return list(self)
+
+
+class ICacheReqI:
+ def __init__(self):
+ self.fetch_valid = Signal() # address translation valid
+ self.fetch_paddr = Signal(64) # physical address in
+ self.fetch_exception = RVException() # exception occurred during fetch
+
+ def __iter__(self):
+ yield self.fetch_valid
+ yield self.fetch_paddr
+ yield from self.fetch_exception
+
+ def ports(self):
+ return list(self)
+
+
+class ICacheReqO:
+ def __init__(self):
+ self.fetch_req = Signal() # address translation request
+ self.fetch_vaddr = Signal(64) # virtual address out
+
+ def __iter__(self):
+ yield self.fetch_req
+ yield self.fetch_vaddr
+
+ def ports(self):
+ return list(self)
+
+
+class MMU:
+ def __init__(self, instr_tlb_entries = 4,
+ data_tlb_entries = 4,
+ asid_width = 1):
+ self.instr_tlb_entries = instr_tlb_entries
+ self.data_tlb_entries = data_tlb_entries
+ self.asid_width = asid_width
+
+ self.flush_i = Signal()
+ self.enable_translation_i = Signal()
+ self.en_ld_st_translation_i = Signal() # enable VM translation for LD/ST
+ # IF interface
+ self.icache_areq_i = ICacheReqO()
+ self.icache_areq_o = ICacheReqI()
+ # LSU interface
+ # this is a more minimalistic interface because the actual addressing
+ # logic is handled in the LSU as we distinguish load and stores,
+ # what we do here is simple address translation
+ self.misaligned_ex_i = RVException()
+ self.lsu_req_i = Signal() # request address translation
+ self.lsu_vaddr_i = Signal(64) # virtual address in
+ self.lsu_is_store_i = Signal() # the translation is requested by a store
+ # if we need to walk the page table we can't grant in the same cycle
+
+ # Cycle 0
+ self.lsu_dtlb_hit_o = Signal() # sent in the same cycle as the request
+ # if translation hits in the DTLB
+ # Cycle 1
+ self.lsu_valid_o = Signal() # translation is valid
+ self.lsu_paddr_o = Signal(64) # translated address
+ self.lsu_exception_o = RVException() # addr translate threw exception
+
+ # General control signals
+ self.priv_lvl_i = Signal(2)
+ self.ld_st_priv_lvl_i = Signal(2)
+ self.sum_i = Signal()
+ self.mxr_i = Signal()
+ # input logic flag_mprv_i,
+ self.satp_ppn_i = Signal(44)
+ self.asid_i = Signal(self.asid_width)
+ self.flush_tlb_i = Signal()
+ # Performance counters
+ self.itlb_miss_o = Signal()
+ self.dtlb_miss_o = Signal()
+ # PTW memory interface
+ self.req_port_i = DCacheReqO()
+ self.req_port_o = DCacheReqI()
+
+ def elaborate(self, platform):
+ m = Module()
+
+ iaccess_err = Signal() # insufficient priv to access instr page
+ daccess_err = Signal() # insufficient priv to access data page
+ ptw_active = Signal() # PTW is currently walking a page table
+ walking_instr = Signal() # PTW is walking because of an ITLB miss
+ ptw_error = Signal() # PTW threw an exception
+
+ update_vaddr = Signal(48) # guessed
+ uaddr64 = Cat(update_vaddr, Const(0, 25)) # extend to 64bit with zeros
+ update_ptw_itlb = TLBUpdate(self.asid_width)
+ update_ptw_dtlb = TLBUpdate(self.asid_width)
+
+ itlb_lu_access = Signal()
+ itlb_content = PTE()
+ itlb_is_2M = Signal()
+ itlb_is_1G = Signal()
+ itlb_is_512G = Signal()
+ itlb_lu_hit = Signal()
+
+ dtlb_lu_access = Signal()
+ dtlb_content = PTE()
+ dtlb_is_2M = Signal()
+ dtlb_is_1G = Signal()
+ dtlb_is_512G = Signal()
+ dtlb_lu_hit = Signal()
+
+ # Assignments
+ m.d.comb += [itlb_lu_access.eq(self.icache_areq_i.fetch_req),
+ dtlb_lu_access.eq(self.lsu_req_i)
+ ]
+
+ # ITLB
+ m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries,
+ self.asid_width)
+ m.d.comb += [i_tlb.flush_i.eq(self.flush_tlb_i),
+ i_tlb.update_i.eq(update_ptw_itlb),
+ i_tlb.lu_access_i.eq(itlb_lu_access),
+ i_tlb.lu_asid_i.eq(self.asid_i),
+ i_tlb.lu_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
+ itlb_content.eq(i_tlb.lu_content_o),
+ itlb_is_2M.eq(i_tlb.lu_is_2M_o),
+ itlb_is_1G.eq(i_tlb.lu_is_1G_o),
+ itlb_is_512G.eq(i_tlb.lu_is_512G_o),
+ itlb_lu_hit.eq(i_tlb.lu_hit_o),
+ ]
+
+ # DTLB
+ m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries,
+ self.asid_width)
+ m.d.comb += [d_tlb.flush_i.eq(self.flush_tlb_i),
+ d_tlb.update_i.eq(update_ptw_dtlb),
+ d_tlb.lu_access_i.eq(dtlb_lu_access),
+ d_tlb.lu_asid_i.eq(self.asid_i),
+ d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i),
+ dtlb_content.eq(d_tlb.lu_content_o),
+ dtlb_is_2M.eq(d_tlb.lu_is_2M_o),
+ dtlb_is_1G.eq(d_tlb.lu_is_1G_o),
+ dtlb_is_512G.eq(d_tlb.lu_is_512G_o),
+ dtlb_lu_hit.eq(d_tlb.lu_hit_o),
+ ]
+
+ # PTW
+ m.submodules.ptw = ptw = PTW(self.asid_width)
+ m.d.comb += [ptw_active.eq(ptw.ptw_active_o),
+ walking_instr.eq(ptw.walking_instr_o),
+ ptw_error.eq(ptw.ptw_error_o),
+ ptw.enable_translation_i.eq(self.enable_translation_i),
+
+ update_vaddr.eq(ptw.update_vaddr_o),
+ update_ptw_itlb.eq(ptw.itlb_update_o),
+ update_ptw_dtlb.eq(ptw.dtlb_update_o),
+
+ ptw.itlb_access_i.eq(itlb_lu_access),
+ ptw.itlb_hit_i.eq(itlb_lu_hit),
+ ptw.itlb_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
+
+ ptw.dtlb_access_i.eq(dtlb_lu_access),
+ ptw.dtlb_hit_i.eq(dtlb_lu_hit),
+ ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i),
+
+ ptw.req_port_i.eq(self.req_port_i),
+ self.req_port_o.eq(ptw.req_port_o),
+ ]
+
+ # ila_1 i_ila_1 (
+ # .clk(clk_i), # input wire clk
+ # .probe0({req_port_o.address_tag, req_port_o.address_index}),
+ # .probe1(req_port_o.data_req), # input wire [63:0] probe1
+ # .probe2(req_port_i.data_gnt), # input wire [0:0] probe2
+ # .probe3(req_port_i.data_rdata), # input wire [0:0] probe3
+ # .probe4(req_port_i.data_rvalid), # input wire [0:0] probe4
+ # .probe5(ptw_error), # input wire [1:0] probe5
+ # .probe6(update_vaddr), # input wire [0:0] probe6
+ # .probe7(update_ptw_itlb.valid), # input wire [0:0] probe7
+ # .probe8(update_ptw_dtlb.valid), # input wire [0:0] probe8
+ # .probe9(dtlb_lu_access), # input wire [0:0] probe9
+ # .probe10(lsu_vaddr_i), # input wire [0:0] probe10
+ # .probe11(dtlb_lu_hit), # input wire [0:0] probe11
+ # .probe12(itlb_lu_access), # input wire [0:0] probe12
+ # .probe13(icache_areq_i.fetch_vaddr), # input wire [0:0] probe13
+ # .probe14(itlb_lu_hit) # input wire [0:0] probe13
+ # );
+
+ #-----------------------
+ # Instruction Interface
+ #-----------------------
+ # The instruction interface is a simple request response interface
+
+ # MMU disabled: just pass through
+ m.d.comb += [self.icache_areq_o.fetch_valid.eq(
+ self.icache_areq_i.fetch_req),
+ # play through in case we disabled address translation
+ self.icache_areq_o.fetch_paddr.eq(
+ self.icache_areq_i.fetch_vaddr)
+ ]
+ # two potential exception sources:
+ # 1. HPTW threw an exception -> signal with a page fault exception
+ # 2. We got an access error because of insufficient permissions ->
+ # throw an access exception
+ m.d.comb += self.icache_areq_o.fetch_exception.valid.eq(0)
+ # Check whether we are allowed to access this memory region
+ # from a fetch perspective
+
+ # PLATEN TODO: use PermissionValidator instead [we like modules]
+ m.d.comb += iaccess_err.eq(self.icache_areq_i.fetch_req & \
+ (((self.priv_lvl_i == PRIV_LVL_U) & \
+ ~itlb_content.u) | \
+ ((self.priv_lvl_i == PRIV_LVL_S) & \
+ itlb_content.u)))
+
+ # MMU enabled: address from TLB, request delayed until hit.
+ # Error when TLB hit and no access right or TLB hit and
+ # translated address not valid (e.g. AXI decode error),
+ # or when PTW performs walk due to ITLB miss and raises
+ # an error.
+ with m.If (self.enable_translation_i):
+ # we work with SV48, so if VM is enabled, check that
+ # all bits [47:38] are equal
+ with m.If (self.icache_areq_i.fetch_req & \
+ ~(((~self.icache_areq_i.fetch_vaddr[47:64]) == 0) | \
+ (self.icache_areq_i.fetch_vaddr[47:64]) == 0)):
+ fe = self.icache_areq_o.fetch_exception
+ m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
+ fe.tval.eq(self.icache_areq_i.fetch_vaddr),
+ fe.valid.eq(1)
+ ]
+
+ m.d.comb += self.icache_areq_o.fetch_valid.eq(0)
+
+ # 4K page
+ paddr = Signal.like(self.icache_areq_o.fetch_paddr)
+ paddr4k = Cat(self.icache_areq_i.fetch_vaddr[0:12],
+ itlb_content.ppn)
+ m.d.comb += paddr.eq(paddr4k)
+ # Mega page
+ with m.If(itlb_is_2M):
+ m.d.comb += paddr[12:21].eq(
+ self.icache_areq_i.fetch_vaddr[12:21])
+ # Giga page
+ with m.If(itlb_is_1G):
+ m.d.comb += paddr[12:30].eq(
+ self.icache_areq_i.fetch_vaddr[12:30])
+ m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
+ # Tera page
+ with m.If(itlb_is_512G):
+ m.d.comb += paddr[12:39].eq(
+ self.icache_areq_i.fetch_vaddr[12:39])
+ m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
+
+ # ---------
+ # ITLB Hit
+ # --------
+ # if we hit the ITLB output the request signal immediately
+ with m.If(itlb_lu_hit):
+ m.d.comb += self.icache_areq_o.fetch_valid.eq(
+ self.icache_areq_i.fetch_req)
+ # we got an access error
+ with m.If (iaccess_err):
+ # throw a page fault
+ fe = self.icache_areq_o.fetch_exception
+ m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
+ fe.tval.eq(self.icache_areq_i.fetch_vaddr),
+ fe.valid.eq(1)
+ ]
+ # ---------
+ # ITLB Miss
+ # ---------
+ # watch out for exceptions happening during walking the page table
+ with m.Elif(ptw_active & walking_instr):
+ m.d.comb += self.icache_areq_o.fetch_valid.eq(ptw_error)
+ fe = self.icache_areq_o.fetch_exception
+ m.d.comb += [fe.cause.eq(INSTR_PAGE_FAULT),
+ fe.tval.eq(uaddr64),
+ fe.valid.eq(1)
+ ]
+
+ #-----------------------
+ # Data Interface
+ #-----------------------
+
+ lsu_vaddr = Signal(64)
+ dtlb_pte = PTE()
+ misaligned_ex = RVException()
+ lsu_req = Signal()
+ lsu_is_store = Signal()
+ dtlb_hit = Signal()
+ #dtlb_is_2M = Signal()
+ #dtlb_is_1G = Signal()
+ #dtlb_is_512 = Signal()
+
+ # check if we need to do translation or if we are always
+ # ready (e.g.: we are not translating anything)
+ m.d.comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i,
+ dtlb_lu_hit, 1))
+
+ # The data interface is simpler and only consists of a
+ # request/response interface
+ m.d.comb += [
+ # save request and DTLB response
+ lsu_vaddr.eq(self.lsu_vaddr_i),
+ lsu_req.eq(self.lsu_req_i),
+ misaligned_ex.eq(self.misaligned_ex_i),
+ dtlb_pte.eq(dtlb_content),
+ dtlb_hit.eq(dtlb_lu_hit),
+ lsu_is_store.eq(self.lsu_is_store_i),
+ #dtlb_is_2M.eq(dtlb_is_2M),
+ #dtlb_is_1G.eq(dtlb_is_1G),
+ ##dtlb_is_512.eq(self.dtlb_is_512G) #????
+ ]
+ m.d.sync += [
+ self.lsu_paddr_o.eq(lsu_vaddr),
+ self.lsu_valid_o.eq(lsu_req),
+ self.lsu_exception_o.eq(misaligned_ex),
+ ]
+
+ sverr = Signal()
+ usrerr = Signal()
+
+ m.d.comb += [
+ # mute misaligned exceptions if there is no request
+ # otherwise they will throw accidental exceptions
+ misaligned_ex.valid.eq(self.misaligned_ex_i.valid & self.lsu_req_i),
+
+ # SUM is not set and we are trying to access a user
+ # page in supervisor mode
+ sverr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_S & ~self.sum_i & \
+ dtlb_pte.u),
+ # this is not a user page but we are in user mode and
+ # trying to access it
+ usrerr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_U & ~dtlb_pte.u),
+
+ # Check if the User flag is set, then we may only
+ # access it in supervisor mode if SUM is enabled
+ daccess_err.eq(sverr | usrerr),
+ ]
+
+ # translation is enabled and no misaligned exception occurred
+ with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid):
+ m.d.comb += lsu_req.eq(0)
+ # 4K page
+ paddr = Signal.like(lsu_vaddr)
+ paddr4k = Cat(lsu_vaddr[0:12], itlb_content.ppn)
+ m.d.comb += paddr.eq(paddr4k)
+ # Mega page
+ with m.If(dtlb_is_2M):
+ m.d.comb += paddr[12:21].eq(lsu_vaddr[12:21])
+ # Giga page
+ with m.If(dtlb_is_1G):
+ m.d.comb += paddr[12:30].eq(lsu_vaddr[12:30])
+ m.d.sync += self.lsu_paddr_o.eq(paddr)
+ # TODO platen tera_page
+
+ # ---------
+ # DTLB Hit
+ # --------
+ with m.If(dtlb_hit & lsu_req):
+ m.d.comb += lsu_req.eq(1)
+ # this is a store
+ with m.If (lsu_is_store):
+ # check if the page is write-able and
+ # we are not violating privileges
+ # also check if the dirty flag is set
+ with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d):
+ le = self.lsu_exception_o
+ m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
+ le.tval.eq(lsu_vaddr),
+ le.valid.eq(1)
+ ]
+
+ # this is a load, check for sufficient access
+ # privileges - throw a page fault if necessary
+ with m.Elif(daccess_err):
+ le = self.lsu_exception_o
+ m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
+ le.tval.eq(lsu_vaddr),
+ le.valid.eq(1)
+ ]
+ # ---------
+ # DTLB Miss
+ # ---------
+ # watch out for exceptions
+ with m.Elif (ptw_active & ~walking_instr):
+ # page table walker threw an exception
+ with m.If (ptw_error):
+ # an error makes the translation valid
+ m.d.comb += lsu_req.eq(1)
+ # the page table walker can only throw page faults
+ with m.If (lsu_is_store):
+ le = self.lsu_exception_o
+ m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
+ le.tval.eq(uaddr64),
+ le.valid.eq(1)
+ ]
+ with m.Else():
+ m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
+ le.tval.eq(uaddr64),
+ le.valid.eq(1)
+ ]
+
+ return m
+
+ def ports(self):
+ return [self.flush_i, self.enable_translation_i,
+ self.en_ld_st_translation_i,
+ self.lsu_req_i,
+ self.lsu_vaddr_i, self.lsu_is_store_i, self.lsu_dtlb_hit_o,
+ self.lsu_valid_o, self.lsu_paddr_o,
+ self.priv_lvl_i, self.ld_st_priv_lvl_i, self.sum_i, self.mxr_i,
+ self.satp_ppn_i, self.asid_i, self.flush_tlb_i,
+ self.itlb_miss_o, self.dtlb_miss_o] + \
+ self.icache_areq_i.ports() + self.icache_areq_o.ports() + \
+ self.req_port_i.ports() + self.req_port_o.ports() + \
+ self.misaligned_ex_i.ports() + self.lsu_exception_o.ports()
+
+if __name__ == '__main__':
+ mmu = MMU()
+ vl = rtlil.convert(mmu, ports=mmu.ports())
+ with open("test_mmu.il", "w") as f:
+ f.write(vl)
+
--- /dev/null
+pseudo-LRU
+
+two-way set associative - one bit
+
+ indicates which line of the two has been reference more recently
+
+
+four-way set associative - three bits
+
+ each bit represents one branch point in a binary decision tree; let 1
+ represent that the left side has been referenced more recently than the
+ right side, and 0 vice-versa
+
+ are all 4 lines valid?
+ / \
+ yes no, use an invalid line
+ |
+ |
+ |
+ bit_0 == 0? state | replace ref to | next state
+ / \ ------+-------- -------+-----------
+ y n 00x | line_0 line_0 | 11_
+ / \ 01x | line_1 line_1 | 10_
+ bit_1 == 0? bit_2 == 0? 1x0 | line_2 line_2 | 0_1
+ / \ / \ 1x1 | line_3 line_3 | 0_0
+ y n y n
+ / \ / \ ('x' means ('_' means unchanged)
+ line_0 line_1 line_2 line_3 don't care)
+
+ (see Figure 3-7, p. 3-18, in Intel Embedded Pentium Processor Family Dev.
+ Manual, 1998, http://www.intel.com/design/intarch/manuals/273204.htm)
+
+
+note that there is a 6-bit encoding for true LRU for four-way set associative
+
+ bit 0: bank[1] more recently used than bank[0]
+ bit 1: bank[2] more recently used than bank[0]
+ bit 2: bank[2] more recently used than bank[1]
+ bit 3: bank[3] more recently used than bank[0]
+ bit 4: bank[3] more recently used than bank[1]
+ bit 5: bank[3] more recently used than bank[2]
+
+ this results in 24 valid bit patterns within the 64 possible bit patterns
+ (4! possible valid traces for bank references)
+
+ e.g., a trace of 0 1 2 3, where 0 is LRU and 3 is MRU, is encoded as 111111
+
+ you can implement a state machine with a 256x6 ROM (6-bit state encoding
+ appended with a 2-bit bank reference input will yield a new 6-bit state),
+ and you can implement an LRU bank indicator with a 64x2 ROM
+
--- /dev/null
+# moved to nmutil https://git.libre-soc.org/?p=nmutil.git;a=tree
+from nmutil.plru import PLRU
--- /dev/null
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: David Schaffenrath, TU Graz
+# Author: Florian Zaruba, ETH Zurich
+# Date: 24.4.2017
+# Description: Hardware-PTW
+
+/* verilator lint_off WIDTH */
+import ariane_pkg::*;
+
+see linux kernel source:
+
+* "arch/riscv/include/asm/page.h"
+* "arch/riscv/include/asm/mmu_context.h"
+* "arch/riscv/Kconfig" (CONFIG_PAGE_OFFSET)
+
+"""
+
+from nmigen import Const, Signal, Cat, Module, Elaboratable
+from nmigen.hdl.ast import ArrayProxy
+from nmigen.cli import verilog, rtlil
+from math import log2
+
+
+DCACHE_SET_ASSOC = 8
+CONFIG_L1D_SIZE = 32*1024
+DCACHE_INDEX_WIDTH = int(log2(CONFIG_L1D_SIZE / DCACHE_SET_ASSOC))
+DCACHE_TAG_WIDTH = 56 - DCACHE_INDEX_WIDTH
+
+ASID_WIDTH = 8
+
+
+class DCacheReqI:
+ def __init__(self):
+ self.address_index = Signal(DCACHE_INDEX_WIDTH)
+ self.address_tag = Signal(DCACHE_TAG_WIDTH)
+ self.data_wdata = Signal(64)
+ self.data_req = Signal()
+ self.data_we = Signal()
+ self.data_be = Signal(8)
+ self.data_size = Signal(2)
+ self.kill_req = Signal()
+ self.tag_valid = Signal()
+
+ def eq(self, inp):
+ res = []
+ for (o, i) in zip(self.ports(), inp.ports()):
+ res.append(o.eq(i))
+ return res
+
+ def ports(self):
+ return [self.address_index, self.address_tag,
+ self.data_wdata, self.data_req,
+ self.data_we, self.data_be, self.data_size,
+ self.kill_req, self.tag_valid,
+ ]
+
+class DCacheReqO:
+ def __init__(self):
+ self.data_gnt = Signal()
+ self.data_rvalid = Signal()
+ self.data_rdata = Signal(64) # actually in PTE object format
+
+ def eq(self, inp):
+ res = []
+ for (o, i) in zip(self.ports(), inp.ports()):
+ res.append(o.eq(i))
+ return res
+
+ def ports(self):
+ return [self.data_gnt, self.data_rvalid, self.data_rdata]
+
+
+class PTE: #(RecordObject):
+ def __init__(self):
+ self.v = Signal()
+ self.r = Signal()
+ self.w = Signal()
+ self.x = Signal()
+ self.u = Signal()
+ self.g = Signal()
+ self.a = Signal()
+ self.d = Signal()
+ self.rsw = Signal(2)
+ self.ppn = Signal(44)
+ self.reserved = Signal(10)
+
+ def flatten(self):
+ return Cat(*self.ports())
+
+ def eq(self, x):
+ if isinstance(x, ArrayProxy):
+ res = []
+ for o in self.ports():
+ i = getattr(x, o.name)
+ res.append(i)
+ x = Cat(*res)
+ else:
+ x = x.flatten()
+ return self.flatten().eq(x)
+
+ def __iter__(self):
+ """ order is critical so that flatten creates LSB to MSB
+ """
+ yield self.v
+ yield self.r
+ yield self.w
+ yield self.x
+ yield self.u
+ yield self.g
+ yield self.a
+ yield self.d
+ yield self.rsw
+ yield self.ppn
+ yield self.reserved
+
+ def ports(self):
+ return list(self)
+
+
+class TLBUpdate:
+ def __init__(self, asid_width):
+ self.valid = Signal() # valid flag
+ self.is_2M = Signal()
+ self.is_1G = Signal()
+ self.is_512G = Signal()
+ self.vpn = Signal(36)
+ self.asid = Signal(asid_width)
+ self.content = PTE()
+
+ def flatten(self):
+ return Cat(*self.ports())
+
+ def eq(self, x):
+ return self.flatten().eq(x.flatten())
+
+ def ports(self):
+ return [self.valid, self.is_2M, self.is_1G, self.vpn, self.asid] + \
+ self.content.ports()
+
+
+# SV48 defines four levels of page tables
+LVL1 = Const(0, 2) # defined to 0 so that ptw_lvl default-resets to LVL1
+LVL2 = Const(1, 2)
+LVL3 = Const(2, 2)
+LVL4 = Const(3, 2)
+
+
+class PTW(Elaboratable):
+ def __init__(self, asid_width=8):
+ self.asid_width = asid_width
+
+ self.flush_i = Signal() # flush everything, we need to do this because
+ # actually everything we do is speculative at this stage
+ # e.g.: there could be a CSR instruction that changes everything
+ self.ptw_active_o = Signal(reset=1) # active if not IDLE
+ self.walking_instr_o = Signal() # set when walking for TLB
+ self.ptw_error_o = Signal() # set when an error occurred
+ self.enable_translation_i = Signal() # CSRs indicate to enable SV48
+ self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st
+
+ self.lsu_is_store_i = Signal() # translation triggered by store
+ # PTW memory interface
+ self.req_port_i = DCacheReqO()
+ self.req_port_o = DCacheReqI()
+
+ # to TLBs, update logic
+ self.itlb_update_o = TLBUpdate(asid_width)
+ self.dtlb_update_o = TLBUpdate(asid_width)
+
+ self.update_vaddr_o = Signal(48)
+
+ self.asid_i = Signal(self.asid_width)
+ # from TLBs
+ # did we miss?
+ self.itlb_access_i = Signal()
+ self.itlb_hit_i = Signal()
+ self.itlb_vaddr_i = Signal(64)
+
+ self.dtlb_access_i = Signal()
+ self.dtlb_hit_i = Signal()
+ self.dtlb_vaddr_i = Signal(64)
+ # from CSR file
+ self.satp_ppn_i = Signal(44) # ppn from satp
+ self.mxr_i = Signal()
+ # Performance counters
+ self.itlb_miss_o = Signal()
+ self.dtlb_miss_o = Signal()
+
+ def ports(self):
+ return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o,
+ ]
+ return [
+ self.enable_translation_i, self.en_ld_st_translation_i,
+ self.lsu_is_store_i, self.req_port_i, self.req_port_o,
+ self.update_vaddr_o,
+ self.asid_i,
+ self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i,
+ self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i,
+ self.satp_ppn_i, self.mxr_i,
+ self.itlb_miss_o, self.dtlb_miss_o
+ ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports()
+
+ def elaborate(self, platform):
+ m = Module()
+
+ # input registers
+ data_rvalid = Signal()
+ data_rdata = Signal(64)
+
+ # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata
+ # is spec'd in 64-bit binary-format: better to spec as Record?
+ pte = PTE()
+ m.d.comb += pte.flatten().eq(data_rdata)
+
+ # SV48 defines four levels of page tables
+ ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above)
+ ptw_lvl1 = Signal()
+ ptw_lvl2 = Signal()
+ ptw_lvl3 = Signal()
+ ptw_lvl4 = Signal()
+ m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1),
+ ptw_lvl2.eq(ptw_lvl == LVL2),
+ ptw_lvl3.eq(ptw_lvl == LVL3),
+ ptw_lvl4.eq(ptw_lvl == LVL4)
+ ]
+
+ # is this an instruction page table walk?
+ is_instr_ptw = Signal()
+ global_mapping = Signal()
+ # latched tag signal
+ tag_valid = Signal()
+ # register the ASID
+ tlb_update_asid = Signal(self.asid_width)
+ # register VPN we need to walk, SV48 defines a 48 bit virtual addr
+ vaddr = Signal(64)
+ # 4 byte aligned physical pointer
+ ptw_pptr = Signal(56)
+
+ end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH
+ m.d.sync += [
+ # Assignments
+ self.update_vaddr_o.eq(vaddr),
+
+ self.walking_instr_o.eq(is_instr_ptw),
+ # directly output the correct physical address
+ self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]),
+ self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]),
+ # we are never going to kill this request
+ self.req_port_o.kill_req.eq(0), # XXX assign comb?
+ # we are never going to write with the HPTW
+ self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb?
+ # -----------
+ # TLB Update
+ # -----------
+ self.itlb_update_o.vpn.eq(vaddr[12:48]),
+ self.dtlb_update_o.vpn.eq(vaddr[12:48]),
+ # update the correct page table level
+ self.itlb_update_o.is_2M.eq(ptw_lvl3),
+ self.itlb_update_o.is_1G.eq(ptw_lvl2),
+ self.itlb_update_o.is_512G.eq(ptw_lvl1),
+ self.dtlb_update_o.is_2M.eq(ptw_lvl3),
+ self.dtlb_update_o.is_1G.eq(ptw_lvl2),
+ self.dtlb_update_o.is_512G.eq(ptw_lvl1),
+
+ # output the correct ASID
+ self.itlb_update_o.asid.eq(tlb_update_asid),
+ self.dtlb_update_o.asid.eq(tlb_update_asid),
+ # set the global mapping bit
+ self.itlb_update_o.content.eq(pte),
+ self.itlb_update_o.content.g.eq(global_mapping),
+ self.dtlb_update_o.content.eq(pte),
+ self.dtlb_update_o.content.g.eq(global_mapping),
+
+ self.req_port_o.tag_valid.eq(tag_valid),
+ ]
+
+ #-------------------
+ # Page table walker #needs update
+ #-------------------
+ # A virtual address va is translated into a physical address pa as
+ # follows:
+ # 1. Let a be sptbr.ppn × PAGESIZE, and let i = LEVELS-1. (For Sv48,
+ # PAGESIZE=2^12 and LEVELS=4.)
+ # 2. Let pte be the value of the PTE at address a+va.vpn[i]×PTESIZE.
+ # (For Sv32, PTESIZE=4.)
+ # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an
+ # access exception.
+ # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to
+ # step 5. Otherwise, this PTE is a pointer to the next level of
+ # the page table.
+ # Let i=i-1. If i < 0, stop and raise an access exception.
+ # Otherwise, let a = pte.ppn × PAGESIZE and go to step 2.
+ # 5. A leaf PTE has been found. Determine if the requested memory
+ # access is allowed by the pte.r, pte.w, and pte.x bits. If not,
+ # stop and raise an access exception. Otherwise, the translation is
+ # successful. Set pte.a to 1, and, if the memory access is a
+ # store, set pte.d to 1.
+ # The translated physical address is given as follows:
+ # - pa.pgoff = va.pgoff.
+ # - If i > 0, then this is a superpage translation and
+ # pa.ppn[i-1:0] = va.vpn[i-1:0].
+ # - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i].
+ # 6. If i > 0 and pa.ppn[i − 1 : 0] != 0, this is a misaligned
+ # superpage stop and raise a page-fault exception.
+
+ m.d.sync += tag_valid.eq(0)
+
+ # default assignments
+ m.d.comb += [
+ # PTW memory interface
+ self.req_port_o.data_req.eq(0),
+ self.req_port_o.data_be.eq(Const(0xFF, 8)),
+ self.req_port_o.data_size.eq(Const(0b11, 2)),
+ self.req_port_o.data_we.eq(0),
+ self.ptw_error_o.eq(0),
+ self.itlb_update_o.valid.eq(0),
+ self.dtlb_update_o.valid.eq(0),
+
+ self.itlb_miss_o.eq(0),
+ self.dtlb_miss_o.eq(0),
+ ]
+
+ # ------------
+ # State Machine
+ # ------------
+
+ with m.FSM() as fsm:
+
+ with m.State("IDLE"):
+ self.idle(m, is_instr_ptw, ptw_lvl, global_mapping,
+ ptw_pptr, vaddr, tlb_update_asid)
+
+ with m.State("WAIT_GRANT"):
+ self.grant(m, tag_valid, data_rvalid)
+
+ with m.State("PTE_LOOKUP"):
+ # we wait for the valid signal
+ with m.If(data_rvalid):
+ self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
+ data_rvalid, global_mapping,
+ is_instr_ptw, ptw_pptr)
+
+ # Propagate error to MMU/LSU
+ with m.State("PROPAGATE_ERROR"):
+ m.next = "IDLE"
+ m.d.comb += self.ptw_error_o.eq(1)
+
+ # wait for the rvalid before going back to IDLE
+ with m.State("WAIT_RVALID"):
+ with m.If(data_rvalid):
+ m.next = "IDLE"
+
+ m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata),
+ data_rvalid.eq(self.req_port_i.data_rvalid)
+ ]
+
+ return m
+
+ def set_grant_state(self, m):
+ # should we have flushed before we got an rvalid,
+ # wait for it until going back to IDLE
+ with m.If(self.flush_i):
+ with m.If (self.req_port_i.data_gnt):
+ m.next = "WAIT_RVALID"
+ with m.Else():
+ m.next = "IDLE"
+ with m.Else():
+ m.next = "WAIT_GRANT"
+
+ def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping,
+ ptw_pptr, vaddr, tlb_update_asid):
+ # by default we start with the top-most page table
+ m.d.sync += [is_instr_ptw.eq(0),
+ ptw_lvl.eq(LVL1),
+ global_mapping.eq(0),
+ self.ptw_active_o.eq(0), # deactive (IDLE)
+ ]
+ # work out itlb/dtlb miss
+ m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \
+ self.itlb_access_i & \
+ ~self.itlb_hit_i & \
+ ~self.dtlb_access_i)
+ m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \
+ self.dtlb_access_i & \
+ ~self.dtlb_hit_i)
+ # we got an ITLB miss?
+ with m.If(self.itlb_miss_o):
+ pptr = Cat(Const(0, 3), self.itlb_vaddr_i[30:48],
+ self.satp_ppn_i)
+ m.d.sync += [ptw_pptr.eq(pptr),
+ is_instr_ptw.eq(1),
+ vaddr.eq(self.itlb_vaddr_i),
+ tlb_update_asid.eq(self.asid_i),
+ ]
+ self.set_grant_state(m)
+
+ # we got a DTLB miss?
+ with m.Elif(self.dtlb_miss_o):
+ pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:48],
+ self.satp_ppn_i)
+ m.d.sync += [ptw_pptr.eq(pptr),
+ vaddr.eq(self.dtlb_vaddr_i),
+ tlb_update_asid.eq(self.asid_i),
+ ]
+ self.set_grant_state(m)
+
+ def grant(self, m, tag_valid, data_rvalid):
+ # we've got a data WAIT_GRANT so tell the
+ # cache that the tag is valid
+
+ # send a request out
+ m.d.comb += self.req_port_o.data_req.eq(1)
+ # wait for the WAIT_GRANT
+ with m.If(self.req_port_i.data_gnt):
+ # send the tag valid signal one cycle later
+ m.d.sync += tag_valid.eq(1)
+ # should we have flushed before we got an rvalid,
+ # wait for it until going back to IDLE
+ with m.If(self.flush_i):
+ with m.If (~data_rvalid):
+ m.next = "WAIT_RVALID"
+ with m.Else():
+ m.next = "IDLE"
+ with m.Else():
+ m.next = "PTE_LOOKUP"
+
+ def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
+ data_rvalid, global_mapping,
+ is_instr_ptw, ptw_pptr):
+ # temporaries
+ pte_rx = Signal(reset_less=True)
+ pte_exe = Signal(reset_less=True)
+ pte_inv = Signal(reset_less=True)
+ pte_a = Signal(reset_less=True)
+ st_wd = Signal(reset_less=True)
+ m.d.comb += [pte_rx.eq(pte.r | pte.x),
+ pte_exe.eq(~pte.x | ~pte.a),
+ pte_inv.eq(~pte.v | (~pte.r & pte.w)),
+ pte_a.eq(pte.a & (pte.r | (pte.x & self.mxr_i))),
+ st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))]
+
+ l1err = Signal(reset_less=True)
+ l2err = Signal(reset_less=True)
+ l3err = Signal(reset_less=True)
+ m.d.comb += [l3err.eq((ptw_lvl3) & pte.ppn[0:9] != Const(0,0)),
+ l2err.eq((ptw_lvl2) & pte.ppn[0:18] != Const(0, 18)),
+ l1err.eq((ptw_lvl1) & pte.ppn[0:27] != Const(0, 27))]
+
+ # check if the global mapping bit is set
+ with m.If (pte.g):
+ m.d.sync += global_mapping.eq(1)
+
+ m.next = "IDLE"
+
+ # -------------
+ # Invalid PTE
+ # -------------
+ # If pte.v = 0, or if pte.r = 0 and pte.w = 1,
+ # stop and raise a page-fault exception.
+ with m.If (pte_inv):
+ m.next = "PROPAGATE_ERROR"
+
+ # -----------
+ # Valid PTE
+ # -----------
+
+ # it is a valid PTE
+ # if pte.r = 1 or pte.x = 1 it is a valid PTE
+ with m.Elif (pte_rx):
+ # Valid translation found (either 1G, 2M or 4K)
+ with m.If(is_instr_ptw):
+ # ------------
+ # Update ITLB
+ # ------------
+ # If page not executable, we can directly raise error.
+ # This doesn't put a useless entry into the TLB.
+ # The same idea applies to the access flag since we let
+ # the access flag be managed by SW.
+ with m.If (pte_exe):
+ m.next = "IDLE"
+ with m.Else():
+ m.d.comb += self.itlb_update_o.valid.eq(1)
+
+ with m.Else():
+ # ------------
+ # Update DTLB
+ # ------------
+ # Check if the access flag has been set, otherwise
+ # throw page-fault and let software handle those bits.
+ # If page not readable (there are no write-only pages)
+ # directly raise an error. This doesn't put a useless
+ # entry into the TLB.
+ with m.If(pte_a):
+ m.d.comb += self.dtlb_update_o.valid.eq(1)
+ with m.Else():
+ m.next = "PROPAGATE_ERROR"
+ # Request is a store: perform additional checks
+ # If the request was a store and the page not
+ # write-able, raise an error
+ # the same applies if the dirty flag is not set
+ with m.If (st_wd):
+ m.d.comb += self.dtlb_update_o.valid.eq(0)
+ m.next = "PROPAGATE_ERROR"
+
+ # check if the ppn is correctly aligned: Case (6)
+ with m.If(l1err | l2err | l3err):
+ m.next = "PROPAGATE_ERROR"
+ m.d.comb += [self.dtlb_update_o.valid.eq(0),
+ self.itlb_update_o.valid.eq(0)]
+
+ # this is a pointer to the next TLB level
+ with m.Else():
+ # pointer to next level of page table
+ with m.If (ptw_lvl1):
+ # we are in the second level now
+ pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn)
+ m.d.sync += [ptw_pptr.eq(pptr),
+ ptw_lvl.eq(LVL2)
+ ]
+ with m.If(ptw_lvl2):
+ # here we received a pointer to the third level
+ pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn)
+ m.d.sync += [ptw_pptr.eq(pptr),
+ ptw_lvl.eq(LVL3)
+ ]
+ with m.If(ptw_lvl3): #guess: shift page levels by one
+ # here we received a pointer to the fourth level
+ # the last one is near the page offset
+ pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn)
+ m.d.sync += [ptw_pptr.eq(pptr),
+ ptw_lvl.eq(LVL4)
+ ]
+ self.set_grant_state(m)
+
+ with m.If (ptw_lvl4):
+ # Should already be the last level
+ # page table => Error
+ m.d.sync += ptw_lvl.eq(LVL4)
+ m.next = "PROPAGATE_ERROR"
+
+
+if __name__ == '__main__':
+ ptw = PTW()
+ vl = rtlil.convert(ptw, ports=ptw.ports())
+ with open("test_ptw.il", "w") as f:
+ f.write(vl)
--- /dev/null
+import sys
+from soc.TLB.ariane.plru import PLRU
+from nmigen.compat.sim import run_simulation
+
+
+def tbench(dut):
+ yield
+
+
+if __name__ == "__main__":
+ dut = PLRU(4)
+ run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd")
+ print("PLRU Unit Test Success")
--- /dev/null
+from nmigen.compat.sim import run_simulation
+from soc.TLB.ariane.ptw import PTW, PTE
+
+# unit was changed, test needs to be changed
+
+
+def tbench(dut):
+
+ addr = 0x8000000
+
+ #pte = PTE()
+ # yield pte.v.eq(1)
+ # yield pte.r.eq(1)
+
+ yield dut.req_port_i.data_gnt.eq(1)
+ yield dut.req_port_i.data_rvalid.eq(1)
+ yield dut.req_port_i.data_rdata.eq(0x43) # pte.flatten())
+
+ # data lookup
+ yield dut.en_ld_st_translation_i.eq(1)
+ yield dut.asid_i.eq(1)
+
+ yield dut.dtlb_access_i.eq(1)
+ yield dut.dtlb_hit_i.eq(0)
+ yield dut.dtlb_vaddr_i.eq(0x400000000)
+
+ yield
+ yield
+ yield
+
+ yield dut.dtlb_access_i.eq(1)
+ yield dut.dtlb_hit_i.eq(0)
+ yield dut.dtlb_vaddr_i.eq(0x200000)
+
+ yield
+ yield
+ yield
+
+ yield dut.req_port_i.data_gnt.eq(0)
+ yield dut.dtlb_access_i.eq(1)
+ yield dut.dtlb_hit_i.eq(0)
+ yield dut.dtlb_vaddr_i.eq(0x400000011)
+
+ yield
+ yield dut.req_port_i.data_gnt.eq(1)
+ yield
+ yield
+
+ # data lookup, PTW levels 1-2-3
+ addr = 0x4000000
+ yield dut.dtlb_vaddr_i.eq(addr)
+ yield dut.mxr_i.eq(0x1)
+ yield dut.req_port_i.data_gnt.eq(1)
+ yield dut.req_port_i.data_rvalid.eq(1)
+ # pte.flatten())
+ yield dut.req_port_i.data_rdata.eq(0x41 | (addr >> 12) << 10)
+
+ yield dut.en_ld_st_translation_i.eq(1)
+ yield dut.asid_i.eq(1)
+
+ yield dut.dtlb_access_i.eq(1)
+ yield dut.dtlb_hit_i.eq(0)
+ yield dut.dtlb_vaddr_i.eq(addr)
+
+ yield
+ yield
+ yield
+ yield
+ yield
+ yield
+ yield
+ yield
+
+ yield dut.req_port_i.data_gnt.eq(0)
+ yield dut.dtlb_access_i.eq(1)
+ yield dut.dtlb_hit_i.eq(0)
+ yield dut.dtlb_vaddr_i.eq(0x400000011)
+
+ yield
+ yield dut.req_port_i.data_gnt.eq(1)
+ yield
+ yield
+ yield
+ yield
+
+ # instruction lookup
+ yield dut.en_ld_st_translation_i.eq(0)
+ yield dut.enable_translation_i.eq(1)
+ yield dut.asid_i.eq(1)
+
+ yield dut.itlb_access_i.eq(1)
+ yield dut.itlb_hit_i.eq(0)
+ yield dut.itlb_vaddr_i.eq(0x800000)
+
+ yield
+ yield
+ yield
+
+ yield dut.itlb_access_i.eq(1)
+ yield dut.itlb_hit_i.eq(0)
+ yield dut.itlb_vaddr_i.eq(0x200000)
+
+ yield
+ yield
+ yield
+
+ yield dut.req_port_i.data_gnt.eq(0)
+ yield dut.itlb_access_i.eq(1)
+ yield dut.itlb_hit_i.eq(0)
+ yield dut.itlb_vaddr_i.eq(0x800011)
+
+ yield
+ yield dut.req_port_i.data_gnt.eq(1)
+ yield
+ yield
+
+ yield
+
+
+def test_ptw():
+ dut = PTW()
+ run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd")
+ print("PTW Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_ptw()
--- /dev/null
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.ariane.tlb import TLB
+
+
+def set_vaddr(addr):
+ yield dut.lu_vaddr_i.eq(addr)
+ yield dut.update_i.vpn.eq(addr >> 12)
+
+
+def tbench(dut):
+ yield dut.lu_access_i.eq(1)
+ yield dut.lu_asid_i.eq(1)
+ yield dut.update_i.valid.eq(1)
+ yield dut.update_i.is_1G.eq(0)
+ yield dut.update_i.is_2M.eq(0)
+ yield dut.update_i.asid.eq(1)
+ yield dut.update_i.content.ppn.eq(0)
+ yield dut.update_i.content.rsw.eq(0)
+ yield dut.update_i.content.r.eq(1)
+
+ yield
+
+ addr = 0x80000
+ yield from set_vaddr(addr)
+ yield
+
+ addr = 0x90001
+ yield from set_vaddr(addr)
+ yield
+
+ addr = 0x28000000
+ yield from set_vaddr(addr)
+ yield
+
+ addr = 0x28000001
+ yield from set_vaddr(addr)
+
+ addr = 0x28000001
+ yield from set_vaddr(addr)
+ yield
+
+ addr = 0x1000040000
+ yield from set_vaddr(addr)
+ yield
+
+ addr = 0x1000040001
+ yield from set_vaddr(addr)
+ yield
+
+ yield dut.update_i.is_1G.eq(1)
+ addr = 0x2040000
+ yield from set_vaddr(addr)
+ yield
+
+ yield dut.update_i.is_1G.eq(1)
+ addr = 0x2040001
+ yield from set_vaddr(addr)
+ yield
+
+ yield
+
+
+if __name__ == "__main__":
+ dut = TLB()
+ run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd")
+ print("TLB Unit Test Success")
--- /dev/null
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.ariane.tlb_content import TLBContent
+from soc.TestUtil.test_helper import assert_op, assert_eq
+
+
+def update(dut, a, t, g, m):
+ yield dut.replace_en_i.eq(1)
+ yield dut.update_i.valid.eq(1)
+ yield dut.update_i.is_512G.eq(t)
+ yield dut.update_i.is_1G.eq(g)
+ yield dut.update_i.is_2M.eq(m)
+ yield dut.update_i.vpn.eq(a)
+ yield
+ yield
+
+
+def check_hit(dut, hit, pagesize):
+ hit_d = yield dut.lu_hit_o
+ assert_eq("hit", hit_d, hit)
+
+ if(hit):
+ if(pagesize == "t"):
+ hitp = yield dut.lu_is_512G_o
+ assert_eq("lu_is_512G_o", hitp, 1)
+ elif(pagesize == "g"):
+ hitp = yield dut.lu_is_1G_o
+ assert_eq("lu_is_1G_o", hitp, 1)
+ elif(pagesize == "m"):
+ hitp = yield dut.lu_is_2M_o
+ assert_eq("lu_is_2M_o", hitp, 1)
+
+
+def addr(a, b, c, d):
+ return a | b << 9 | c << 18 | d << 27
+
+
+def tbench(dut):
+ yield dut.vpn0.eq(0x0A)
+ yield dut.vpn1.eq(0x0B)
+ yield dut.vpn2.eq(0x0C)
+ yield dut.vpn3.eq(0x0D)
+ yield from update(dut, addr(0xFF, 0xFF, 0xFF, 0x0D), 1, 0, 0)
+ yield from check_hit(dut, 1, "t")
+
+ yield from update(dut, addr(0xFF, 0xFF, 0x0C, 0x0D), 0, 1, 0)
+ yield from check_hit(dut, 1, "g")
+
+ yield from update(dut, addr(0xFF, 0x0B, 0x0C, 0x0D), 0, 0, 1)
+ yield from check_hit(dut, 1, "m")
+
+ yield from update(dut, addr(0x0A, 0x0B, 0x0C, 0x0D), 0, 0, 0)
+ yield from check_hit(dut, 1, "")
+
+ yield from update(dut, addr(0xAA, 0xBB, 0xCC, 0xDD), 0, 0, 0)
+ yield from check_hit(dut, 0, "miss")
+
+
+if __name__ == "__main__":
+ dut = TLBContent(4, 4)
+ #
+ run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd")
+ print("TLBContent Unit Test Success")
--- /dev/null
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: David Schaffenrath, TU Graz
+# Author: Florian Zaruba, ETH Zurich
+# Date: 21.4.2017
+# Description: Translation Lookaside Buffer, SV48
+# fully set-associative
+
+Implementation in c++:
+https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp
+
+Text description:
+https://people.cs.clemson.edu/~mark/464/p_lru.txt
+
+Online simulator:
+http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html
+"""
+from math import log2
+from nmigen import Signal, Module, Cat, Const, Array, Elaboratable
+from nmigen.cli import verilog, rtlil
+from nmigen.lib.coding import Encoder
+
+from soc.TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH
+from soc.TLB.ariane.plru import PLRU
+from soc.TLB.ariane.tlb_content import TLBContent
+
+TLB_ENTRIES = 8
+
+
+class TLB(Elaboratable):
+ def __init__(self, tlb_entries=8, asid_width=8):
+ self.tlb_entries = tlb_entries
+ self.asid_width = asid_width
+
+ self.flush_i = Signal() # Flush signal
+ # Lookup signals
+ self.lu_access_i = Signal()
+ self.lu_asid_i = Signal(self.asid_width)
+ self.lu_vaddr_i = Signal(64)
+ self.lu_content_o = PTE()
+ self.lu_is_2M_o = Signal()
+ self.lu_is_1G_o = Signal()
+ self.lu_is_512G_o = Signal()
+ self.lu_hit_o = Signal()
+ # Update TLB
+ self.pte_width = len(self.lu_content_o.flatten())
+ self.update_i = TLBUpdate(asid_width)
+
+ def elaborate(self, platform):
+ m = Module()
+
+ vpn3 = Signal(9) # FIXME unused signal
+ vpn2 = Signal(9)
+ vpn1 = Signal(9)
+ vpn0 = Signal(9)
+
+ # -------------
+ # Translation
+ # -------------
+
+ # SV48 defines four levels of page tables
+ m.d.comb += [vpn0.eq(self.lu_vaddr_i[12:21]),
+ vpn1.eq(self.lu_vaddr_i[21:30]),
+ vpn2.eq(self.lu_vaddr_i[30:39]),
+ vpn3.eq(self.lu_vaddr_i[39:48]), # FIXME
+ ]
+
+ tc = []
+ for i in range(self.tlb_entries):
+ tlc = TLBContent(self.pte_width, self.asid_width)
+ setattr(m.submodules, "tc%d" % i, tlc)
+ tc.append(tlc)
+ # connect inputs
+ tlc.update_i = self.update_i # saves a lot of graphviz links
+ m.d.comb += [tlc.vpn0.eq(vpn0),
+ tlc.vpn1.eq(vpn1),
+ tlc.vpn2.eq(vpn2),
+ # TODO 4th
+ tlc.flush_i.eq(self.flush_i),
+ # tlc.update_i.eq(self.update_i),
+ tlc.lu_asid_i.eq(self.lu_asid_i)]
+ tc = Array(tc)
+
+ # --------------
+ # Select hit
+ # --------------
+
+ # use Encoder to select hit index
+ # XXX TODO: assert that there's only one valid entry (one lu_hit)
+ hitsel = Encoder(self.tlb_entries)
+ m.submodules.hitsel = hitsel
+
+ hits = []
+ for i in range(self.tlb_entries):
+ hits.append(tc[i].lu_hit_o)
+ m.d.comb += hitsel.i.eq(Cat(*hits)) # (goes into plru as well)
+ idx = hitsel.o
+
+ active = Signal(reset_less=True)
+ m.d.comb += active.eq(~hitsel.n)
+ with m.If(active):
+ # active hit, send selected as output
+ m.d.comb += [self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o),
+ self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o),
+ self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o),
+ self.lu_hit_o.eq(1),
+ self.lu_content_o.flatten().eq(tc[idx].lu_content_o),
+ ]
+
+ # --------------
+ # PLRU.
+ # --------------
+
+ p = PLRU(self.tlb_entries)
+ plru_tree = Signal(p.TLBSZ)
+ m.submodules.plru = p
+
+ # connect PLRU inputs/outputs
+ # XXX TODO: assert that there's only one valid entry (one replace_en)
+ en = []
+ for i in range(self.tlb_entries):
+ en.append(tc[i].replace_en_i)
+ m.d.comb += [Cat(*en).eq(p.replace_en_o), # output from PLRU into tags
+ p.lu_hit.eq(hitsel.i),
+ p.lu_access_i.eq(self.lu_access_i),
+ p.plru_tree.eq(plru_tree)]
+ m.d.sync += plru_tree.eq(p.plru_tree_o)
+
+ # --------------
+ # Sanity checks
+ # --------------
+
+ assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \
+ "TLB size must be a multiple of 2 and greater than 1"
+ assert (self.asid_width >= 1), \
+ "ASID width must be at least 1"
+
+ return m
+
+ """
+ # Just for checking
+ function int countSetBits(logic[self.tlb_entries-1:0] vector);
+ automatic int count = 0;
+ foreach (vector[idx]) begin
+ count += vector[idx];
+ end
+ return count;
+ endfunction
+
+ assert property (@(posedge clk_i)(countSetBits(lu_hit) <= 1))
+ else $error("More then one hit in TLB!"); $stop(); end
+ assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1))
+ else $error("More then one TLB entry selected for next replace!");
+ """
+
+ def ports(self):
+ return [self.flush_i, self.lu_access_i,
+ self.lu_asid_i, self.lu_vaddr_i,
+ self.lu_is_2M_o, self.lu_1G_o, self.lu_is_512G_o, self.lu_hit_o
+ ] + self.lu_content_o.ports() + self.update_i.ports()
+
+
+if __name__ == '__main__':
+ tlb = TLB()
+ vl = rtlil.convert(tlb, ports=tlb.ports())
+ with open("test_tlb.il", "w") as f:
+ f.write(vl)
--- /dev/null
+from nmigen import Signal, Module, Cat, Const, Elaboratable
+
+from soc.TLB.ariane.ptw import TLBUpdate, PTE
+
+
+class TLBEntry:
+ def __init__(self, asid_width):
+ self.asid = Signal(asid_width, name="ent_asid")
+ # SV48 defines four levels of page tables
+ self.vpn0 = Signal(9, name="ent_vpn0")
+ self.vpn1 = Signal(9, name="ent_vpn1")
+ self.vpn2 = Signal(9, name="ent_vpn2")
+ self.vpn3 = Signal(9, name="ent_vpn3")
+ self.is_2M = Signal(name="ent_is_2M")
+ self.is_1G = Signal(name="ent_is_1G")
+ self.is_512G = Signal(name="ent_is_512G")
+ self.valid = Signal(name="ent_valid")
+
+ def flatten(self):
+ return Cat(*self.ports())
+
+ def eq(self, x):
+ return self.flatten().eq(x.flatten())
+
+ def ports(self):
+ return [self.asid, self.vpn0, self.vpn1, self.vpn2,
+ self.is_2M, self.is_1G, self.valid]
+
+
+class TLBContent(Elaboratable):
+ def __init__(self, pte_width, asid_width):
+ self.asid_width = asid_width
+ self.pte_width = pte_width
+ self.flush_i = Signal() # Flush signal
+ # Update TLB
+ self.update_i = TLBUpdate(asid_width)
+ self.vpn3 = Signal(9)
+ self.vpn2 = Signal(9)
+ self.vpn1 = Signal(9)
+ self.vpn0 = Signal(9)
+ self.replace_en_i = Signal() # replace the following entry,
+ # set by replacement strategy
+ # Lookup signals
+ self.lu_asid_i = Signal(asid_width)
+ self.lu_content_o = Signal(pte_width)
+ self.lu_is_512G_o = Signal()
+ self.lu_is_2M_o = Signal()
+ self.lu_is_1G_o = Signal()
+ self.lu_hit_o = Signal()
+
+ def elaborate(self, platform):
+ m = Module()
+
+ tags = TLBEntry(self.asid_width)
+
+ content = Signal(self.pte_width)
+
+ m.d.comb += [self.lu_hit_o.eq(0),
+ self.lu_is_512G_o.eq(0),
+ self.lu_is_2M_o.eq(0),
+ self.lu_is_1G_o.eq(0)]
+
+ # temporaries for lookup
+ asid_ok = Signal(reset_less=True)
+ # tags_ok = Signal(reset_less=True)
+
+ vpn3_ok = Signal(reset_less=True)
+ vpn2_ok = Signal(reset_less=True)
+ vpn1_ok = Signal(reset_less=True)
+ vpn0_ok = Signal(reset_less=True)
+
+ #tags_2M = Signal(reset_less=True)
+ vpn0_or_2M = Signal(reset_less=True)
+
+ m.d.comb += [
+ # compare asid and vpn*
+ asid_ok.eq(tags.asid == self.lu_asid_i),
+ vpn3_ok.eq(tags.vpn3 == self.vpn3),
+ vpn2_ok.eq(tags.vpn2 == self.vpn2),
+ vpn1_ok.eq(tags.vpn1 == self.vpn1),
+ vpn0_ok.eq(tags.vpn0 == self.vpn0),
+ vpn0_or_2M.eq(tags.is_2M | vpn0_ok)
+ ]
+
+ with m.If(asid_ok & tags.valid):
+ # first level, only vpn3 needs to match
+ with m.If(tags.is_512G & vpn3_ok):
+ m.d.comb += [self.lu_content_o.eq(content),
+ self.lu_is_512G_o.eq(1),
+ self.lu_hit_o.eq(1),
+ ]
+ # second level , second level vpn2 and vpn3 need to match
+ with m.Elif(tags.is_1G & vpn2_ok & vpn3_ok):
+ m.d.comb += [self.lu_content_o.eq(content),
+ self.lu_is_1G_o.eq(1),
+ self.lu_hit_o.eq(1),
+ ]
+ # not a giga page hit nor a tera page hit so check further
+ with m.Elif(vpn1_ok):
+ # this could be a 2 mega page hit or a 4 kB hit
+ # output accordingly
+ with m.If(vpn0_or_2M):
+ m.d.comb += [self.lu_content_o.eq(content),
+ self.lu_is_2M_o.eq(tags.is_2M),
+ self.lu_hit_o.eq(1),
+ ]
+ # ------------------
+ # Update or Flush
+ # ------------------
+
+ # temporaries
+ replace_valid = Signal(reset_less=True)
+ m.d.comb += replace_valid.eq(self.update_i.valid & self.replace_en_i)
+
+ # flush
+ with m.If(self.flush_i):
+ # invalidate (flush) conditions: all if zero or just this ASID
+ with m.If(self.lu_asid_i == Const(0, self.asid_width) |
+ (self.lu_asid_i == tags.asid)):
+ m.d.sync += tags.valid.eq(0)
+
+ # normal replacement
+ with m.Elif(replace_valid):
+ m.d.sync += [ # update tag array
+ tags.asid.eq(self.update_i.asid),
+ tags.vpn3.eq(self.update_i.vpn[27:36]),
+ tags.vpn2.eq(self.update_i.vpn[18:27]),
+ tags.vpn1.eq(self.update_i.vpn[9:18]),
+ tags.vpn0.eq(self.update_i.vpn[0:9]),
+ tags.is_512G.eq(self.update_i.is_512G),
+ tags.is_1G.eq(self.update_i.is_1G),
+ tags.is_2M.eq(self.update_i.is_2M),
+ tags.valid.eq(1),
+ # and content as well
+ content.eq(self.update_i.content.flatten())
+ ]
+ return m
+
+ def ports(self):
+ return [self.flush_i,
+ self.lu_asid_i,
+ self.lu_is_2M_o, self.lu_is_1G_o, self.lu_is_512G_o, self.lu_hit_o,
+ ] + self.update_i.content.ports() + self.update_i.ports()
--- /dev/null
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from soc.TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3
+
+from nmigen.back.pysim import Simulator, Delay, Tick
+import unittest
+
+
+class TestLFSR(unittest.TestCase):
+ def test_poly(self):
+ v = LFSRPolynomial()
+ self.assertEqual(repr(v), "LFSRPolynomial([0])")
+ self.assertEqual(str(v), "1")
+ v = LFSRPolynomial([1])
+ self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
+ self.assertEqual(str(v), "x + 1")
+ v = LFSRPolynomial([0, 1])
+ self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
+ self.assertEqual(str(v), "x + 1")
+ v = LFSRPolynomial([1, 2])
+ self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])")
+ self.assertEqual(str(v), "x^2 + x + 1")
+ v = LFSRPolynomial([2])
+ self.assertEqual(repr(v), "LFSRPolynomial([2, 0])")
+ self.assertEqual(str(v), "x^2 + 1")
+ self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1")
+
+ def test_lfsr_3(self):
+ module = LFSR(LFSR_POLY_3)
+ traces = [module.state, module.enable]
+ with Simulator(module,
+ vcd_file=open("Waveforms/test_LFSR2.vcd", "w"),
+ gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"),
+ traces=traces) as sim:
+ sim.add_clock(1e-6, phase=0.25e-6)
+ delay = Delay(1e-7)
+
+ def async_process():
+ yield module.enable.eq(0)
+ yield Tick()
+ self.assertEqual((yield module.state), 0x1)
+ yield Tick()
+ self.assertEqual((yield module.state), 0x1)
+ yield module.enable.eq(1)
+ yield Tick()
+ yield delay
+ self.assertEqual((yield module.state), 0x2)
+ yield Tick()
+ yield delay
+ self.assertEqual((yield module.state), 0x5)
+ yield Tick()
+ yield delay
+ self.assertEqual((yield module.state), 0x3)
+ yield Tick()
+ yield delay
+ self.assertEqual((yield module.state), 0x7)
+ yield Tick()
+ yield delay
+ self.assertEqual((yield module.state), 0x6)
+ yield Tick()
+ yield delay
+ self.assertEqual((yield module.state), 0x4)
+ yield Tick()
+ yield delay
+ self.assertEqual((yield module.state), 0x1)
+ yield Tick()
+
+ sim.add_process(async_process)
+ sim.run()
--- /dev/null
+from nmigen.compat.sim import run_simulation
+from soc.TLB.AddressEncoder import AddressEncoder
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+
+# This function allows for the easy setting of values to the AddressEncoder
+# Arguments:
+# dut: The AddressEncoder being tested
+# i (Input): The array of single bits to be written
+def set_encoder(dut, i):
+ yield dut.i.eq(i)
+ yield
+
+# Checks the single match of the AddressEncoder
+# Arguments:
+# dut: The AddressEncoder being tested
+# sm (Single Match): The expected match result
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_single_match(dut, sm, op):
+ out_sm = yield dut.single_match
+ assert_op("Single Match", out_sm, sm, op)
+
+# Checks the multiple match of the AddressEncoder
+# Arguments:
+# dut: The AddressEncoder being tested
+# mm (Multiple Match): The expected match result
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_multiple_match(dut, mm, op):
+ out_mm = yield dut.multiple_match
+ assert_op("Multiple Match", out_mm, mm, op)
+
+# Checks the output of the AddressEncoder
+# Arguments:
+# dut: The AddressEncoder being tested
+# o (Output): The expected output
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_output(dut, o, op):
+ out_o = yield dut.o
+ assert_op("Output", out_o, o, op)
+
+# Checks the state of the AddressEncoder
+# Arguments:
+# dut: The AddressEncoder being tested
+# sm (Single Match): The expected match result
+# mm (Multiple Match): The expected match result
+# o (Output): The expected output
+# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+# o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+
+
+def check_all(dut, sm, mm, o, sm_op, mm_op, o_op):
+ yield from check_single_match(dut, sm, sm_op)
+ yield from check_multiple_match(dut, mm, mm_op)
+ yield from check_output(dut, o, o_op)
+
+
+def tbench(dut):
+ # Check invalid input
+ in_val = 0b000
+ single_match = 0
+ multiple_match = 0
+ output = 0
+ yield from set_encoder(dut, in_val)
+ yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+ # Check single bit
+ in_val = 0b001
+ single_match = 1
+ multiple_match = 0
+ output = 0
+ yield from set_encoder(dut, in_val)
+ yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+ # Check another single bit
+ in_val = 0b100
+ single_match = 1
+ multiple_match = 0
+ output = 2
+ yield from set_encoder(dut, in_val)
+ yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+ # Check multiple match
+ # We expected the lowest bit to be returned which is address 0
+ in_val = 0b101
+ single_match = 0
+ multiple_match = 1
+ output = 0
+ yield from set_encoder(dut, in_val)
+ yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+ # Check another multiple match
+ # We expected the lowest bit to be returned which is address 1
+ in_val = 0b110
+ single_match = 0
+ multiple_match = 1
+ output = 1
+ yield from set_encoder(dut, in_val)
+ yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+
+def test_addr():
+ dut = AddressEncoder(4)
+ run_simulation(dut, tbench(dut),
+ vcd_name="Waveforms/test_address_encoder.vcd")
+ print("AddressEncoder Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_addr()
--- /dev/null
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.Cam import Cam
+
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+# This function allows for the easy setting of values to the Cam
+# Arguments:
+# dut: The Cam being tested
+# e (Enable): Whether the block is going to be enabled
+# we (Write Enable): Whether the Cam will write on the next cycle
+# a (Address): Where the data will be written if write enable is high
+# d (Data): Either what we are looking for or will write to the address
+
+
+def set_cam(dut, e, we, a, d):
+ yield dut.enable.eq(e)
+ yield dut.write_enable.eq(we)
+ yield dut.address_in.eq(a)
+ yield dut.data_in.eq(d)
+ yield
+
+# Checks the multiple match of the Cam
+# Arguments:
+# dut: The Cam being tested
+# mm (Multiple Match): The expected match result
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_multiple_match(dut, mm, op):
+ out_mm = yield dut.multiple_match
+ assert_op("Multiple Match", out_mm, mm, op)
+
+# Checks the single match of the Cam
+# Arguments:
+# dut: The Cam being tested
+# sm (Single Match): The expected match result
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_single_match(dut, sm, op):
+ out_sm = yield dut.single_match
+ assert_op("Single Match", out_sm, sm, op)
+
+# Checks the address output of the Cam
+# Arguments:
+# dut: The Cam being tested
+# ma (Match Address): The expected match result
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_match_address(dut, ma, op):
+ out_ma = yield dut.match_address
+ assert_op("Match Address", out_ma, ma, op)
+
+# Checks the state of the Cam
+# Arguments:
+# dut: The Cam being tested
+# sm (Single Match): The expected match result
+# mm (Multiple Match): The expected match result
+# ma: (Match Address): The expected address output
+# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+# ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=)
+
+
+def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op):
+ yield from check_multiple_match(dut, mm, mm_op)
+ yield from check_single_match(dut, sm, sm_op)
+ yield from check_match_address(dut, ma, ma_op)
+
+
+def tbench(dut):
+ # NA
+ enable = 0
+ write_enable = 0
+ address = 0
+ data = 0
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_single_match(dut, single_match, 0)
+
+ # Read Miss Multiple
+ # Note that the default starting entry data bits are all 0
+ enable = 1
+ write_enable = 0
+ address = 0
+ data = 0
+ multiple_match = 1
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_multiple_match(dut, multiple_match, 0)
+
+ # Read Miss
+ # Note that the default starting entry data bits are all 0
+ enable = 1
+ write_enable = 0
+ address = 0
+ data = 1
+ multiple_match = 0
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_single_match(dut, single_match, 0)
+
+ # Write Entry 0
+ enable = 1
+ write_enable = 1
+ address = 0
+ data = 4
+ multiple_match = 0
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_single_match(dut, single_match, 0)
+
+ # Read Hit Entry 0
+ enable = 1
+ write_enable = 0
+ address = 0
+ data = 4
+ multiple_match = 0
+ single_match = 1
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+ # Search Hit
+ enable = 1
+ write_enable = 0
+ address = 0
+ data = 4
+ multiple_match = 0
+ single_match = 1
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+ # Search Miss
+ enable = 1
+ write_enable = 0
+ address = 0
+ data = 5
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_single_match(dut, single_match, 0)
+
+ # Multiple Match test
+ # Write Entry 1
+ enable = 1
+ write_enable = 1
+ address = 1
+ data = 5
+ multiple_match = 0
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_single_match(dut, single_match, 0)
+
+ # Write Entry 2
+ # Same data as Entry 1
+ enable = 1
+ write_enable = 1
+ address = 2
+ data = 5
+ multiple_match = 0
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_single_match(dut, single_match, 0)
+
+ # Read Hit Data 5
+ enable = 1
+ write_enable = 0
+ address = 1
+ data = 5
+ multiple_match = 1
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+ # Verify read_warning is not caused
+ # Write Entry 0
+ enable = 1
+ write_enable = 1
+ address = 0
+ data = 7
+ multiple_match = 0
+ single_match = 0
+ yield from set_cam(dut, enable, write_enable, address, data)
+ # Note there is no yield we immediately attempt to read in the next cycle
+
+ # Read Hit Data 7
+ enable = 1
+ write_enable = 0
+ address = 0
+ data = 7
+ multiple_match = 0
+ single_match = 1
+ yield from set_cam(dut, enable, write_enable, address, data)
+ yield
+ yield from check_single_match(dut, single_match, 0)
+
+ yield
+
+
+def test_cam():
+ dut = Cam(4, 4)
+ run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd")
+ print("Cam Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_cam()
--- /dev/null
+from nmigen.compat.sim import run_simulation
+
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+from soc.TLB.CamEntry import CamEntry
+
+# This function allows for the easy setting of values to the Cam Entry
+# Arguments:
+# dut: The CamEntry being tested
+# c (command): NA (0), Read (1), Write (2), Reserve (3)
+# d (data): The data to be set
+
+
+def set_cam_entry(dut, c, d):
+ # Write desired values
+ yield dut.command.eq(c)
+ yield dut.data_in.eq(d)
+ yield
+ # Reset all lines
+ yield dut.command.eq(0)
+ yield dut.data_in.eq(0)
+ yield
+
+# Checks the data state of the CAM entry
+# Arguments:
+# dut: The CamEntry being tested
+# d (Data): The expected data
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_data(dut, d, op):
+ out_d = yield dut.data
+ assert_op("Data", out_d, d, op)
+
+# Checks the match state of the CAM entry
+# Arguments:
+# dut: The CamEntry being tested
+# m (Match): The expected match
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_match(dut, m, op):
+ out_m = yield dut.match
+ assert_op("Match", out_m, m, op)
+
+# Checks the state of the CAM entry
+# Arguments:
+# dut: The CamEntry being tested
+# d (data): The expected data
+# m (match): The expected match
+# d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=)
+# m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+
+
+def check_all(dut, d, m, d_op, m_op):
+ yield from check_data(dut, d, d_op)
+ yield from check_match(dut, m, m_op)
+
+# This tbench goes through the paces of testing the CamEntry module
+# It is done by writing and then reading various combinations of key/data pairs
+# and reading the results with varying keys to verify the resulting stored
+# data is correct.
+
+
+def tbench(dut):
+ # Check write
+ command = 2
+ data = 1
+ match = 0
+ yield from set_cam_entry(dut, command, data)
+ yield from check_all(dut, data, match, 0, 0)
+
+ # Check read miss
+ command = 1
+ data = 2
+ match = 0
+ yield from set_cam_entry(dut, command, data)
+ yield from check_all(dut, data, match, 1, 0)
+
+ # Check read hit
+ command = 1
+ data = 1
+ match = 1
+ yield from set_cam_entry(dut, command, data)
+ yield from check_all(dut, data, match, 0, 0)
+
+ # Check overwrite
+ command = 2
+ data = 5
+ match = 0
+ yield from set_cam_entry(dut, command, data)
+ yield
+ yield from check_all(dut, data, match, 0, 0)
+
+ # Check read hit
+ command = 1
+ data = 5
+ match = 1
+ yield from set_cam_entry(dut, command, data)
+ yield from check_all(dut, data, match, 0, 0)
+
+ # Check reset
+ command = 3
+ data = 0
+ match = 0
+ yield from set_cam_entry(dut, command, data)
+ yield from check_all(dut, data, match, 0, 0)
+
+ # Extra clock cycle for waveform
+ yield
+
+
+def test_camentry():
+ dut = CamEntry(4)
+ run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd")
+ print("CamEntry Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_camentry()
--- /dev/null
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.PermissionValidator import PermissionValidator
+
+from soc.TestUtil.test_helper import assert_op
+
+
+def set_validator(dut, d, xwr, sm, sa, asid):
+ yield dut.data.eq(d)
+ yield dut.xwr.eq(xwr)
+ yield dut.super_mode.eq(sm)
+ yield dut.super_access.eq(sa)
+ yield dut.asid.eq(asid)
+ yield
+
+
+def check_valid(dut, v, op):
+ out_v = yield dut.valid
+ assert_op("Valid", out_v, v, op)
+
+
+def tbench(dut):
+ # 80 bits represented. Ignore the MSB as it will be truncated
+ # ASID is bits first 4 hex values (bits 64 - 78)
+
+ # Test user mode entry valid
+ # Global Bit matching ASID
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000031
+ # Ignore MSB it will be truncated
+ asid = 0x7FFF
+ super_mode = 0
+ super_access = 0
+ xwr = 0
+ valid = 1
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test user mode entry valid
+ # Global Bit nonmatching ASID
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000031
+ # Ignore MSB it will be truncated
+ asid = 0x7FF6
+ super_mode = 0
+ super_access = 0
+ xwr = 0
+ valid = 1
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test user mode entry invalid
+ # Global Bit nonmatching ASID
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000021
+ # Ignore MSB it will be truncated
+ asid = 0x7FF6
+ super_mode = 0
+ super_access = 0
+ xwr = 0
+ valid = 0
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test user mode entry valid
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000011
+ # Ignore MSB it will be truncated
+ asid = 0x7FFF
+ super_mode = 0
+ super_access = 0
+ xwr = 0
+ valid = 1
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test user mode entry invalid
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000011
+ # Ignore MSB it will be truncated
+ asid = 0x7FF6
+ super_mode = 0
+ super_access = 0
+ xwr = 0
+ valid = 0
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test supervisor mode entry valid
+ # The entry is NOT in user mode
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000001
+ # Ignore MSB it will be truncated
+ asid = 0x7FFF
+ super_mode = 1
+ super_access = 0
+ xwr = 0
+ valid = 1
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test supervisor mode entry invalid
+ # The entry is in user mode
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000011
+ # Ignore MSB it will be truncated
+ asid = 0x7FFF
+ super_mode = 1
+ super_access = 0
+ xwr = 0
+ valid = 0
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test supervisor mode entry valid
+ # The entry is NOT in user mode with access
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000001
+ # Ignore MSB it will be truncated
+ asid = 0x7FFF
+ super_mode = 1
+ super_access = 1
+ xwr = 0
+ valid = 1
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+ # Test supervisor mode entry valid
+ # The entry is in user mode with access
+ # Ensure that user mode and valid is enabled!
+ data = 0x7FFF0000000000000011
+ # Ignore MSB it will be truncated
+ asid = 0x7FFF
+ super_mode = 1
+ super_access = 1
+ xwr = 0
+ valid = 1
+ yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+ yield from check_valid(dut, valid, 0)
+
+
+def test_permv():
+ dut = PermissionValidator(15, 64)
+ run_simulation(dut, tbench(
+ dut), vcd_name="Waveforms/test_permission_validator.vcd")
+ print("PermissionValidator Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_permv()
--- /dev/null
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.PteEntry import PteEntry
+
+from soc.TestUtil.test_helper import assert_op
+
+
+def set_entry(dut, i):
+ yield dut.i.eq(i)
+ yield
+
+
+def check_dirty(dut, d, op):
+ out_d = yield dut.d
+ assert_op("Dirty", out_d, d, op)
+
+
+def check_accessed(dut, a, op):
+ out_a = yield dut.a
+ assert_op("Accessed", out_a, a, op)
+
+
+def check_global(dut, o, op):
+ out = yield dut.g
+ assert_op("Global", out, o, op)
+
+
+def check_user(dut, o, op):
+ out = yield dut.u
+ assert_op("User Mode", out, o, op)
+
+
+def check_xwr(dut, o, op):
+ out = yield dut.xwr
+ assert_op("XWR", out, o, op)
+
+
+def check_asid(dut, o, op):
+ out = yield dut.asid
+ assert_op("ASID", out, o, op)
+
+
+def check_pte(dut, o, op):
+ out = yield dut.pte
+ assert_op("ASID", out, o, op)
+
+
+def check_valid(dut, v, op):
+ out_v = yield dut.v
+ assert_op("Valid", out_v, v, op)
+
+
+def check_all(dut, d, a, g, u, xwr, v, asid, pte):
+ yield from check_dirty(dut, d, 0)
+ yield from check_accessed(dut, a, 0)
+ yield from check_global(dut, g, 0)
+ yield from check_user(dut, u, 0)
+ yield from check_xwr(dut, xwr, 0)
+ yield from check_asid(dut, asid, 0)
+ yield from check_pte(dut, pte, 0)
+ yield from check_valid(dut, v, 0)
+
+
+def tbench(dut):
+ # 80 bits represented. Ignore the MSB as it will be truncated
+ # ASID is bits first 4 hex values (bits 64 - 78)
+
+ i = 0x7FFF0000000000000031
+ dirty = 0
+ access = 0
+ glob = 1
+ user = 1
+ xwr = 0
+ valid = 1
+ asid = 0x7FFF
+ pte = 0x0000000000000031
+ yield from set_entry(dut, i)
+ yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+ i = 0x0FFF00000000000000FF
+ dirty = 1
+ access = 1
+ glob = 1
+ user = 1
+ xwr = 7
+ valid = 1
+ asid = 0x0FFF
+ pte = 0x00000000000000FF
+ yield from set_entry(dut, i)
+ yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+ i = 0x0721000000001100001F
+ dirty = 0
+ access = 0
+ glob = 0
+ user = 1
+ xwr = 7
+ valid = 1
+ asid = 0x0721
+ pte = 0x000000001100001F
+ yield from set_entry(dut, i)
+ yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+ yield
+
+
+def test_pteentry():
+ dut = PteEntry(15, 64)
+ run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd")
+ print("PteEntry Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_pteentry()
--- /dev/null
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.SetAssociativeCache import SetAssociativeCache
+
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+
+def set_sac(dut, e, c, s, t, d):
+ yield dut.enable.eq(e)
+ yield dut.command.eq(c)
+ yield dut.cset.eq(s)
+ yield dut.tag.eq(t)
+ yield dut.data_i.eq(d)
+ yield
+
+
+def tbench(dut):
+ enable = 1
+ command = 2
+ cset = 1
+ tag = 2
+ data = 3
+ yield from set_sac(dut, enable, command, cset, tag, data)
+ yield
+
+ enable = 1
+ command = 2
+ cset = 1
+ tag = 5
+ data = 8
+ yield from set_sac(dut, enable, command, cset, tag, data)
+ yield
+
+
+def test_assoc_cache():
+ dut = SetAssociativeCache(4, 4, 4, 4)
+ run_simulation(dut, tbench(
+ dut), vcd_name="Waveforms/test_set_associative_cache.vcd")
+ print("Set Associative Cache Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_assoc_cache()
--- /dev/null
+#import tracemalloc
+# tracemalloc.start()
+
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.TLB import TLB
+
+from soc.TestUtil.test_helper import assert_op, assert_eq
+
+# self.supermode = Signal(1) # Supervisor Mode
+# self.super_access = Signal(1) # Supervisor Access
+# self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
+# self.xwr = Signal(3) # Execute, Write, Read
+# self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
+#self.address_L1 = Signal(range(L1_size))
+# self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
+# self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
+# self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
+#
+# self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
+# self.perm_valid = Signal(1) # Denotes if the permissions are correct
+# self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
+
+COMMAND_READ = 1
+COMMAND_WRITE_L1 = 2
+
+# Checks the data state of the CAM entry
+# Arguments:
+# dut: The CamEntry being tested
+# d (Data): The expected data
+# op (Operation): (0 => ==), (1 => !=)
+
+
+def check_hit(dut, d):
+ hit_d = yield dut.hit
+ #assert_eq("hit", hit_d, d)
+
+
+def tst_command(dut, cmd, xwr, cycles):
+ yield dut.command.eq(cmd)
+ yield dut.xwr.eq(xwr)
+ for i in range(0, cycles):
+ yield
+
+
+def tst_write_L1(dut, vma, address_L1, asid, pte_in):
+ yield dut.address_L1.eq(address_L1)
+ yield dut.asid.eq(asid)
+ yield dut.vma.eq(vma)
+ yield dut.pte_in.eq(pte_in)
+ yield from tst_command(dut, COMMAND_WRITE_L1, 7, 2)
+
+
+def tst_search(dut, vma, found):
+ yield dut.vma.eq(vma)
+ yield from tst_command(dut, COMMAND_READ, 7, 1)
+ yield from check_hit(dut, found)
+
+
+def zero(dut):
+ yield dut.supermode.eq(0)
+ yield dut.super_access.eq(0)
+ yield dut.mode.eq(0)
+ yield dut.address_L1.eq(0)
+ yield dut.asid.eq(0)
+ yield dut.vma.eq(0)
+ yield dut.pte_in.eq(0)
+
+
+def tbench(dut):
+ yield from zero(dut)
+ yield dut.mode.eq(0xF) # enable TLB
+ # test hit
+ yield from tst_write_L1(dut, 0xFEEDFACE, 0, 0xFFFF, 0xF0F0)
+ yield from tst_search(dut, 0xFEEDFACE, 1)
+ yield from tst_search(dut, 0xFACEFEED, 0)
+
+
+def test_tlb():
+ dut = TLB(15, 36, 64, 8)
+ run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd")
+ print("TLB Unit Test Success")
+
+
+if __name__ == "__main__":
+ test_tlb()
--- /dev/null
+class DualPortSplitter(Elaboratable):
+ """DualPortSplitter
+
+ * one incoming PortInterface
+ * two *OUTGOING* PortInterfaces
+ * uses LDSTSplitter to do it
+
+ (actually, thinking about it LDSTSplitter could simply be
+ modified to conform to PortInterface: one in, two out)
+
+ once that is done each pair of ports may be wired directly
+ to the dual ports of L0CacheBuffer
+
+ The split is carried out so that, regardless of alignment or
+ mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
+ of the address, whilst outgoing PortInterface[1] takes
+ bit 4 == 1.
+
+ PortInterface *may* need to be changed so that the length is
+ a binary number (accepting values 1-16).
+ """
+
+ def __init__(self,inp):
+ self.outp = [PortInterface(name="outp_0"),
+ PortInterface(name="outp_1")]
+ print(self.outp)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+ m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
+ self.inp = splitter.pi
+ comb += splitter.addr_i.eq(self.inp.addr) # XXX
+ #comb += splitter.len_i.eq()
+ #comb += splitter.valid_i.eq()
+ comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
+ comb += splitter.is_st_i.eq(self.inp.is_st_i)
+ #comb += splitter.st_data_i.eq()
+ #comb += splitter.sld_valid_i.eq()
+ #comb += splitter.sld_data_i.eq()
+ #comb += splitter.sst_valid_i.eq()
+ return m
--- /dev/null
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+# module axi4_ar_buffer
+# #(
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# input logic [AXI_ID_WIDTH-1:0] s_axi4_arid,
+# input logic [31:0] s_axi4_araddr,
+# input logic s_axi4_arvalid,
+# output logic s_axi4_arready,
+# input logic [7:0] s_axi4_arlen,
+# input logic [2:0] s_axi4_arsize,
+# input logic [1:0] s_axi4_arburst,
+# input logic s_axi4_arlock,
+# input logic [2:0] s_axi4_arprot,
+# input logic [3:0] s_axi4_arcache,
+# input logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+# output logic [AXI_ID_WIDTH-1:0] m_axi4_arid,
+# output logic [31:0] m_axi4_araddr,
+# output logic m_axi4_arvalid,
+# input logic m_axi4_arready,
+# output logic [7:0] m_axi4_arlen,
+# output logic [2:0] m_axi4_arsize,
+# output logic [1:0] m_axi4_arburst,
+# output logic m_axi4_arlock,
+# output logic [2:0] m_axi4_arprot,
+# output logic [3:0] m_axi4_arcache,
+# output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
+# );
+
+
+class axi4_ar_buffer(Elaboratable):
+
+ def __init__(self):
+ # self.axi4_aclk = Signal() # input
+ # self.axi4_arstn = Signal() # input
+ self.s_axi4_arid = Signal(AXI_ID_WIDTH) # input
+ self.s_axi4_araddr = Signal(32) # input
+ self.s_axi4_arvalid = Signal() # input
+ self.s_axi4_arready = Signal() # output
+ self.s_axi4_arlen = Signal(8) # input
+ self.s_axi4_arsize = Signal(3) # input
+ self.s_axi4_arburst = Signal(2) # input
+ self.s_axi4_arlock = Signal() # input
+ self.s_axi4_arprot = Signal(3) # input
+ self.s_axi4_arcache = Signal(4) # input
+ self.s_axi4_aruser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_arid = Signal(AXI_ID_WIDTH) # output
+ self.m_axi4_araddr = Signal(32) # output
+ self.m_axi4_arvalid = Signal() # output
+ self.m_axi4_arready = Signal() # input
+ self.m_axi4_arlen = Signal(8) # output
+ self.m_axi4_arsize = Signal(3) # output
+ self.m_axi4_arburst = Signal(2) # output
+ self.m_axi4_arlock = Signal() # output
+ self.m_axi4_arprot = Signal(3) # output
+ self.m_axi4_arcache = Signal(4) # output
+ self.m_axi4_aruser = Signal(AXI_USER_WIDTH) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ # #TODO use record types here
+ # wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_in;
+ # wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_out;
+
+ # assign data_in [3:0] = s_axi4_arcache;
+ # assign data_in [6:4] = s_axi4_arprot;
+ # assign data_in [7] = s_axi4_arlock;
+ # assign data_in [9:8] = s_axi4_arburst;
+ # assign data_in [12:10] = s_axi4_arsize;
+ # assign data_in [20:13] = s_axi4_arlen;
+ # assign data_in [52:21] = s_axi4_araddr;
+ # assign data_in [52+AXI_ID_WIDTH:53] = s_axi4_arid;
+ # assign data_in[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH] = s_axi4_aruser;
+ #
+ # assign m_axi4_arcache = data_out[3:0];
+ # assign m_axi4_arprot = data_out[6:4];
+ # assign m_axi4_arlock = data_out[7];
+ # assign m_axi4_arburst = data_out[9:8];
+ # assign m_axi4_arsize = data_out[12:10];
+ # assign m_axi4_arlen = data_out[20:13];
+ # assign m_axi4_araddr = data_out[52:21];
+ # assign m_axi4_arid = data_out[52+AXI_ID_WIDTH:53];
+ # assign m_axi4_aruser = data_out[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH];
+
+ # m.d.comb += self.m_axi4_arcache.eq(..)
+ # m.d.comb += self.m_axi4_arprot.eq(..)
+ # m.d.comb += self.m_axi4_arlock.eq(..)
+ # m.d.comb += self.m_axi4_arburst.eq(..)
+ # m.d.comb += self.m_axi4_arsize.eq(..)
+ # m.d.comb += self.m_axi4_arlen.eq(..)
+ # m.d.comb += self.m_axi4_araddr.eq(..)
+ # m.d.comb += self.m_axi4_arid.eq(..)
+ # m.d.comb += self.m_axi4_aruser.eq(..)
+ return m
+
+# TODO convert axi_buffer_rab.sv
+#
+# axi_buffer_rab
+# #(
+# .DATA_WIDTH ( AXI_ID_WIDTH+AXI_USER_WIDTH+53 ),
+# .BUFFER_DEPTH ( 4 )
+# )
+# u_buffer
+# (
+# .clk ( axi4_aclk ),
+# .rstn ( axi4_arstn ),
+# .valid_out ( m_axi4_arvalid ),
+# .data_out ( data_out ),
+# .ready_in ( m_axi4_arready ),
+# .valid_in ( s_axi4_arvalid ),
+# .data_in ( data_in ),
+# .ready_out ( s_axi4_arready )
+# );
+#
+
+# endmodule
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_ar_sender(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.l1_done_o = Signal() # output
+ self.l1_accept_i = Signal() # input
+ self.l1_drop_i = Signal() # input
+ self.l1_save_i = Signal() # input
+ self.l2_done_o = Signal() # output
+ self.l2_accept_i = Signal() # input
+ self.l2_drop_i = Signal() # input
+ self.l2_sending_o = Signal() # output
+ self.l1_araddr_i = Signal(AXI_ADDR_WIDTH) # input
+ self.l2_araddr_i = Signal(AXI_ADDR_WIDTH) # input
+ self.s_axi4_arid = Signal(AXI_ID_WIDTH) # input
+ self.s_axi4_arvalid = Signal() # input
+ self.s_axi4_arready = Signal() # output
+ self.s_axi4_arlen = Signal(8) # input
+ self.s_axi4_arsize = Signal(3) # input
+ self.s_axi4_arburst = Signal(2) # input
+ self.s_axi4_arlock = Signal() # input
+ self.s_axi4_arprot = Signal(3) # input
+ self.s_axi4_arcache = Signal(4) # input
+ self.s_axi4_aruser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_arid = Signal(AXI_ID_WIDTH) # output
+ self.m_axi4_araddr = Signal(AXI_ADDR_WIDTH) # output
+ self.m_axi4_arvalid = Signal() # output
+ self.m_axi4_arready = Signal() # input
+ self.m_axi4_arlen = Signal(8) # output
+ self.m_axi4_arsize = Signal(3) # output
+ self.m_axi4_arburst = Signal(2) # output
+ self.m_axi4_arlock = Signal() # output
+ self.m_axi4_arprot = Signal(3) # output
+ self.m_axi4_arcache = Signal(4) # output
+ self.m_axi4_aruser = Signal(AXI_USER_WIDTH) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.l1_save.eq(self.None)
+ m.d.comb += self.l1_done_o.eq(self.None)
+ m.d.comb += self.m_axi4_arvalid.eq(self.None)
+ m.d.comb += self.s_axi4_arready.eq(self.None)
+ m.d.comb += self.m_axi4_aruser.eq(self.None)
+ m.d.comb += self.m_axi4_arcache.eq(self.None)
+ m.d.comb += self.m_axi4_arprot.eq(self.None)
+ m.d.comb += self.m_axi4_arlock.eq(self.None)
+ m.d.comb += self.m_axi4_arburst.eq(self.None)
+ m.d.comb += self.m_axi4_arsize.eq(self.None)
+ m.d.comb += self.m_axi4_arlen.eq(self.None)
+ m.d.comb += self.m_axi4_araddr.eq(self.None)
+ m.d.comb += self.m_axi4_arid.eq(self.None)
+ m.d.comb += self.l2_sending_o.eq(self.None)
+ m.d.comb += self.l2_sent.eq(self.None)
+ m.d.comb += self.l2_done_o.eq(self.None)
+ m.d.comb += self.m_axi4_aruser.eq(self.s_axi4_aruser)
+ m.d.comb += self.m_axi4_arcache.eq(self.s_axi4_arcache)
+ m.d.comb += self.m_axi4_arprot.eq(self.s_axi4_arprot)
+ m.d.comb += self.m_axi4_arlock.eq(self.s_axi4_arlock)
+ m.d.comb += self.m_axi4_arburst.eq(self.s_axi4_arburst)
+ m.d.comb += self.m_axi4_arsize.eq(self.s_axi4_arsize)
+ m.d.comb += self.m_axi4_arlen.eq(self.s_axi4_arlen)
+ m.d.comb += self.m_axi4_araddr.eq(self.l1_araddr_i)
+ m.d.comb += self.m_axi4_arid.eq(self.s_axi4_arid)
+ m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
+ m.d.comb += self.l2_available_q.eq(self.1: 'b0)
+ m.d.comb += self.l2_done_o.eq(self.1: 'b0)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_ar_sender
+# #(
+# parameter AXI_ADDR_WIDTH = 40,
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4,
+# parameter ENABLE_L2TLB = 0
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# output logic l1_done_o,
+# input logic l1_accept_i,
+# input logic l1_drop_i,
+# input logic l1_save_i,
+#
+# output logic l2_done_o,
+# input logic l2_accept_i,
+# input logic l2_drop_i,
+# output logic l2_sending_o,
+#
+# input logic [AXI_ADDR_WIDTH-1:0] l1_araddr_i,
+# input logic [AXI_ADDR_WIDTH-1:0] l2_araddr_i,
+#
+# input logic [AXI_ID_WIDTH-1:0] s_axi4_arid,
+# input logic s_axi4_arvalid,
+# output logic s_axi4_arready,
+# input logic [7:0] s_axi4_arlen,
+# input logic [2:0] s_axi4_arsize,
+# input logic [1:0] s_axi4_arburst,
+# input logic s_axi4_arlock,
+# input logic [2:0] s_axi4_arprot,
+# input logic [3:0] s_axi4_arcache,
+# input logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+# output logic [AXI_ID_WIDTH-1:0] m_axi4_arid,
+# output logic [AXI_ADDR_WIDTH-1:0] m_axi4_araddr,
+# output logic m_axi4_arvalid,
+# input logic m_axi4_arready,
+# output logic [7:0] m_axi4_arlen,
+# output logic [2:0] m_axi4_arsize,
+# output logic [1:0] m_axi4_arburst,
+# output logic m_axi4_arlock,
+# output logic [2:0] m_axi4_arprot,
+# output logic [3:0] m_axi4_arcache,
+# output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
+# );
+#
+# logic l1_save;
+#
+# logic l2_sent;
+# logic l2_available_q;
+#
+# assign l1_save = l1_save_i & l2_available_q;
+#
+# assign l1_done_o = s_axi4_arvalid & s_axi4_arready ;
+#
+# // if 1: accept and forward a transaction translated by L1
+# // 2: drop or save request (if L2 slot not occupied already)
+# assign m_axi4_arvalid = (s_axi4_arvalid & l1_accept_i) |
+# l2_sending_o;
+# assign s_axi4_arready = (m_axi4_arvalid & m_axi4_arready & ~l2_sending_o) |
+# (s_axi4_arvalid & (l1_drop_i | l1_save));
+#
+# generate
+# if (ENABLE_L2TLB == 1) begin
+# logic [AXI_USER_WIDTH-1:0] l2_axi4_aruser ;
+# logic [3:0] l2_axi4_arcache ;
+# logic [3:0] l2_axi4_arregion;
+# logic [3:0] l2_axi4_arqos ;
+# logic [2:0] l2_axi4_arprot ;
+# logic l2_axi4_arlock ;
+# logic [1:0] l2_axi4_arburst ;
+# logic [2:0] l2_axi4_arsize ;
+# logic [7:0] l2_axi4_arlen ;
+# logic [AXI_ID_WIDTH-1:0] l2_axi4_arid ;
+#
+# assign m_axi4_aruser = l2_sending_o ? l2_axi4_aruser : s_axi4_aruser;
+# assign m_axi4_arcache = l2_sending_o ? l2_axi4_arcache : s_axi4_arcache;
+# assign m_axi4_arprot = l2_sending_o ? l2_axi4_arprot : s_axi4_arprot;
+# assign m_axi4_arlock = l2_sending_o ? l2_axi4_arlock : s_axi4_arlock;
+# assign m_axi4_arburst = l2_sending_o ? l2_axi4_arburst : s_axi4_arburst;
+# assign m_axi4_arsize = l2_sending_o ? l2_axi4_arsize : s_axi4_arsize;
+# assign m_axi4_arlen = l2_sending_o ? l2_axi4_arlen : s_axi4_arlen;
+# assign m_axi4_araddr = l2_sending_o ? l2_araddr_i : l1_araddr_i;
+# assign m_axi4_arid = l2_sending_o ? l2_axi4_arid : s_axi4_arid;
+#
+# // Buffer AXI signals in case of L1 miss
+# always @(posedge axi4_aclk or negedge axi4_arstn) begin
+# if (axi4_arstn == 1'b0) begin
+# l2_axi4_aruser <= 'b0;
+# l2_axi4_arcache <= 'b0;
+# l2_axi4_arprot <= 'b0;
+# l2_axi4_arlock <= 1'b0;
+# l2_axi4_arburst <= 'b0;
+# l2_axi4_arsize <= 'b0;
+# l2_axi4_arlen <= 'b0;
+# l2_axi4_arid <= 'b0;
+# end else if (l1_save) begin
+# l2_axi4_aruser <= s_axi4_aruser;
+# l2_axi4_arcache <= s_axi4_arcache;
+# l2_axi4_arprot <= s_axi4_arprot;
+# l2_axi4_arlock <= s_axi4_arlock;
+# l2_axi4_arburst <= s_axi4_arburst;
+# l2_axi4_arsize <= s_axi4_arsize;
+# l2_axi4_arlen <= s_axi4_arlen;
+# l2_axi4_arid <= s_axi4_arid;
+# end
+# end
+#
+# // signal that an l1_save_i can be accepted
+# always @(posedge axi4_aclk or negedge axi4_arstn) begin
+# if (axi4_arstn == 1'b0) begin
+# l2_available_q <= 1'b1;
+# end else if (l2_sent | l2_drop_i) begin
+# l2_available_q <= 1'b1;
+# end else if (l1_save) begin
+# l2_available_q <= 1'b0;
+# end
+# end
+#
+# assign l2_sending_o = l2_accept_i & ~l2_available_q;
+# assign l2_sent = l2_sending_o & m_axi4_arvalid & m_axi4_arready;
+#
+# // if 1: having sent out a transaction translated by L2
+# // 2: drop request (L2 slot is available again)
+# assign l2_done_o = l2_sent | l2_drop_i;
+#
+# end else begin // !`ifdef ENABLE_L2TLB
+# assign m_axi4_aruser = s_axi4_aruser;
+# assign m_axi4_arcache = s_axi4_arcache;
+# assign m_axi4_arprot = s_axi4_arprot;
+# assign m_axi4_arlock = s_axi4_arlock;
+# assign m_axi4_arburst = s_axi4_arburst;
+# assign m_axi4_arsize = s_axi4_arsize;
+# assign m_axi4_arlen = s_axi4_arlen;
+# assign m_axi4_araddr = l1_araddr_i;
+# assign m_axi4_arid = s_axi4_arid;
+#
+# assign l2_sending_o = 1'b0;
+# assign l2_available_q = 1'b0;
+# assign l2_done_o = 1'b0;
+# end // else: !if(ENABLE_L2TLB == 1)
+# endgenerate
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_aw_buffer(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.s_axi4_awid = Signal(AXI_ID_WIDTH) # input
+ self.s_axi4_awaddr = Signal(32) # input
+ self.s_axi4_awvalid = Signal() # input
+ self.s_axi4_awready = Signal() # output
+ self.s_axi4_awlen = Signal(8) # input
+ self.s_axi4_awsize = Signal(3) # input
+ self.s_axi4_awburst = Signal(2) # input
+ self.s_axi4_awlock = Signal() # input
+ self.s_axi4_awprot = Signal(3) # input
+ self.s_axi4_awcache = Signal(4) # input
+ self.s_axi4_awregion = Signal(4) # input
+ self.s_axi4_awqos = Signal(4) # input
+ self.s_axi4_awuser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_awid = Signal(AXI_ID_WIDTH) # output
+ self.m_axi4_awaddr = Signal(32) # output
+ self.m_axi4_awvalid = Signal() # output
+ self.m_axi4_awready = Signal() # input
+ self.m_axi4_awlen = Signal(8) # output
+ self.m_axi4_awsize = Signal(3) # output
+ self.m_axi4_awburst = Signal(2) # output
+ self.m_axi4_awlock = Signal() # output
+ self.m_axi4_awprot = Signal(3) # output
+ self.m_axi4_awcache = Signal(4) # output
+ self.m_axi4_awregion = Signal(4) # output
+ self.m_axi4_awqos = Signal(4) # output
+ self.m_axi4_awuser = Signal(AXI_USER_WIDTH) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.None.eq(self.s_axi4_awcache)
+ m.d.comb += self.None.eq(self.s_axi4_awprot)
+ m.d.comb += self.None.eq(self.s_axi4_awlock)
+ m.d.comb += self.None.eq(self.s_axi4_awburst)
+ m.d.comb += self.None.eq(self.s_axi4_awsize)
+ m.d.comb += self.None.eq(self.s_axi4_awlen)
+ m.d.comb += self.None.eq(self.s_axi4_awaddr)
+ m.d.comb += self.None.eq(self.s_axi4_awregion)
+ m.d.comb += self.None.eq(self.s_axi4_awqos)
+ m.d.comb += self.None.eq(self.s_axi4_awid)
+ m.d.comb += self.None.eq(self.s_axi4_awuser)
+ m.d.comb += self.m_axi4_awcache.eq(self.None)
+ m.d.comb += self.m_axi4_awprot.eq(self.None)
+ m.d.comb += self.m_axi4_awlock.eq(self.None)
+ m.d.comb += self.m_axi4_awburst.eq(self.None)
+ m.d.comb += self.m_axi4_awsize.eq(self.None)
+ m.d.comb += self.m_axi4_awlen.eq(self.None)
+ m.d.comb += self.m_axi4_awaddr.eq(self.None)
+ m.d.comb += self.m_axi4_awregion.eq(self.None)
+ m.d.comb += self.m_axi4_awqos.eq(self.None)
+ m.d.comb += self.m_axi4_awid.eq(self.None)
+ m.d.comb += self.m_axi4_awuser.eq(self.None)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_aw_buffer
+# #(
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# input logic [AXI_ID_WIDTH-1:0] s_axi4_awid,
+# input logic [31:0] s_axi4_awaddr,
+# input logic s_axi4_awvalid,
+# output logic s_axi4_awready,
+# input logic [7:0] s_axi4_awlen,
+# input logic [2:0] s_axi4_awsize,
+# input logic [1:0] s_axi4_awburst,
+# input logic s_axi4_awlock,
+# input logic [2:0] s_axi4_awprot,
+# input logic [3:0] s_axi4_awcache,
+# input logic [3:0] s_axi4_awregion,
+# input logic [3:0] s_axi4_awqos,
+# input logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+# output logic [AXI_ID_WIDTH-1:0] m_axi4_awid,
+# output logic [31:0] m_axi4_awaddr,
+# output logic m_axi4_awvalid,
+# input logic m_axi4_awready,
+# output logic [7:0] m_axi4_awlen,
+# output logic [2:0] m_axi4_awsize,
+# output logic [1:0] m_axi4_awburst,
+# output logic m_axi4_awlock,
+# output logic [2:0] m_axi4_awprot,
+# output logic [3:0] m_axi4_awcache,
+# output logic [3:0] m_axi4_awregion,
+# output logic [3:0] m_axi4_awqos,
+# output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
+# );
+#
+# wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_in;
+# wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_out;
+#
+# assign data_in [3:0] = s_axi4_awcache;
+# assign data_in [6:4] = s_axi4_awprot;
+# assign data_in [7] = s_axi4_awlock;
+# assign data_in [9:8] = s_axi4_awburst;
+# assign data_in [12:10] = s_axi4_awsize;
+# assign data_in [20:13] = s_axi4_awlen;
+# assign data_in [52:21] = s_axi4_awaddr;
+# assign data_in [56:53] = s_axi4_awregion;
+# assign data_in [60:57] = s_axi4_awqos;
+# assign data_in [60+AXI_ID_WIDTH:61] = s_axi4_awid;
+# assign data_in [60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH] = s_axi4_awuser;
+#
+# assign m_axi4_awcache = data_out[3:0];
+# assign m_axi4_awprot = data_out[6:4];
+# assign m_axi4_awlock = data_out[7];
+# assign m_axi4_awburst = data_out[9:8];
+# assign m_axi4_awsize = data_out[12:10];
+# assign m_axi4_awlen = data_out[20:13];
+# assign m_axi4_awaddr = data_out[52:21];
+# assign m_axi4_awregion = data_out[56:53];
+# assign m_axi4_awqos = data_out[60:57];
+# assign m_axi4_awid = data_out[60+AXI_ID_WIDTH:61];
+# assign m_axi4_awuser = data_out[60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH];
+#
+# axi_buffer_rab
+# #(
+# .DATA_WIDTH ( AXI_ID_WIDTH+AXI_USER_WIDTH+61 ),
+# .BUFFER_DEPTH ( 4 )
+# )
+# u_buffer
+# (
+# .clk ( axi4_aclk ),
+# .rstn ( axi4_arstn ),
+# .valid_out ( m_axi4_awvalid ),
+# .data_out ( data_out ),
+# .ready_in ( m_axi4_awready ),
+# .valid_in ( s_axi4_awvalid ),
+# .data_in ( data_in ),
+# .ready_out ( s_axi4_awready )
+# );
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_aw_sender(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.l1_done_o = Signal() # output
+ self.l1_accept_i = Signal() # input
+ self.l1_drop_i = Signal() # input
+ self.l1_save_i = Signal() # input
+ self.l2_done_o = Signal() # output
+ self.l2_accept_i = Signal() # input
+ self.l2_drop_i = Signal() # input
+ self.l2_sending_o = Signal() # output
+ self.l1_awaddr_i = Signal(AXI_ADDR_WIDTH) # input
+ self.l2_awaddr_i = Signal(AXI_ADDR_WIDTH) # input
+ self.s_axi4_awid = Signal(AXI_ID_WIDTH) # input
+ self.s_axi4_awvalid = Signal() # input
+ self.s_axi4_awready = Signal() # output
+ self.s_axi4_awlen = Signal(8) # input
+ self.s_axi4_awsize = Signal(3) # input
+ self.s_axi4_awburst = Signal(2) # input
+ self.s_axi4_awlock = Signal() # input
+ self.s_axi4_awprot = Signal(3) # input
+ self.s_axi4_awcache = Signal(4) # input
+ self.s_axi4_awregion = Signal(4) # input
+ self.s_axi4_awqos = Signal(4) # input
+ self.s_axi4_awuser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_awid = Signal(AXI_ID_WIDTH) # output
+ self.m_axi4_awaddr = Signal(AXI_ADDR_WIDTH) # output
+ self.m_axi4_awvalid = Signal() # output
+ self.m_axi4_awready = Signal() # input
+ self.m_axi4_awlen = Signal(8) # output
+ self.m_axi4_awsize = Signal(3) # output
+ self.m_axi4_awburst = Signal(2) # output
+ self.m_axi4_awlock = Signal() # output
+ self.m_axi4_awprot = Signal(3) # output
+ self.m_axi4_awcache = Signal(4) # output
+ self.m_axi4_awregion = Signal(4) # output
+ self.m_axi4_awqos = Signal(4) # output
+ self.m_axi4_awuser = Signal(AXI_USER_WIDTH) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.l1_save.eq(self.None)
+ m.d.comb += self.l1_done_o.eq(self.None)
+ m.d.comb += self.m_axi4_awvalid.eq(self.None)
+ m.d.comb += self.s_axi4_awready.eq(self.None)
+ m.d.comb += self.m_axi4_awuser.eq(self.None)
+ m.d.comb += self.m_axi4_awcache.eq(self.None)
+ m.d.comb += self.m_axi4_awregion.eq(self.None)
+ m.d.comb += self.m_axi4_awqos.eq(self.None)
+ m.d.comb += self.m_axi4_awprot.eq(self.None)
+ m.d.comb += self.m_axi4_awlock.eq(self.None)
+ m.d.comb += self.m_axi4_awburst.eq(self.None)
+ m.d.comb += self.m_axi4_awsize.eq(self.None)
+ m.d.comb += self.m_axi4_awlen.eq(self.None)
+ m.d.comb += self.m_axi4_awaddr.eq(self.None)
+ m.d.comb += self.m_axi4_awid.eq(self.None)
+ m.d.comb += self.l2_sending_o.eq(self.None)
+ m.d.comb += self.l2_sent.eq(self.None)
+ m.d.comb += self.l2_done_o.eq(self.None)
+ m.d.comb += self.m_axi4_awuser.eq(self.s_axi4_awuser)
+ m.d.comb += self.m_axi4_awcache.eq(self.s_axi4_awcache)
+ m.d.comb += self.m_axi4_awregion.eq(self.s_axi4_awregion)
+ m.d.comb += self.m_axi4_awqos.eq(self.s_axi4_awqos)
+ m.d.comb += self.m_axi4_awprot.eq(self.s_axi4_awprot)
+ m.d.comb += self.m_axi4_awlock.eq(self.s_axi4_awlock)
+ m.d.comb += self.m_axi4_awburst.eq(self.s_axi4_awburst)
+ m.d.comb += self.m_axi4_awsize.eq(self.s_axi4_awsize)
+ m.d.comb += self.m_axi4_awlen.eq(self.s_axi4_awlen)
+ m.d.comb += self.m_axi4_awaddr.eq(self.l1_awaddr_i)
+ m.d.comb += self.m_axi4_awid.eq(self.s_axi4_awid)
+ m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
+ m.d.comb += self.l2_available_q.eq(self.1: 'b0)
+ m.d.comb += self.l2_done_o.eq(self.1: 'b0)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_aw_sender
+# #(
+# parameter AXI_ADDR_WIDTH = 40,
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4,
+# parameter ENABLE_L2TLB = 0
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# output logic l1_done_o,
+# input logic l1_accept_i,
+# input logic l1_drop_i,
+# input logic l1_save_i,
+#
+# output logic l2_done_o,
+# input logic l2_accept_i,
+# input logic l2_drop_i,
+# output logic l2_sending_o,
+#
+# input logic [AXI_ADDR_WIDTH-1:0] l1_awaddr_i,
+# input logic [AXI_ADDR_WIDTH-1:0] l2_awaddr_i,
+#
+# input logic [AXI_ID_WIDTH-1:0] s_axi4_awid,
+# input logic s_axi4_awvalid,
+# output logic s_axi4_awready,
+# input logic [7:0] s_axi4_awlen,
+# input logic [2:0] s_axi4_awsize,
+# input logic [1:0] s_axi4_awburst,
+# input logic s_axi4_awlock,
+# input logic [2:0] s_axi4_awprot,
+# input logic [3:0] s_axi4_awcache,
+# input logic [3:0] s_axi4_awregion,
+# input logic [3:0] s_axi4_awqos,
+# input logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+# output logic [AXI_ID_WIDTH-1:0] m_axi4_awid,
+# output logic [AXI_ADDR_WIDTH-1:0] m_axi4_awaddr,
+# output logic m_axi4_awvalid,
+# input logic m_axi4_awready,
+# output logic [7:0] m_axi4_awlen,
+# output logic [2:0] m_axi4_awsize,
+# output logic [1:0] m_axi4_awburst,
+# output logic m_axi4_awlock,
+# output logic [2:0] m_axi4_awprot,
+# output logic [3:0] m_axi4_awcache,
+# output logic [3:0] m_axi4_awregion,
+# output logic [3:0] m_axi4_awqos,
+# output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
+# );
+#
+# logic l1_save;
+#
+# logic l2_sent;
+# logic l2_available_q;
+#
+# assign l1_save = l1_save_i & l2_available_q;
+#
+# assign l1_done_o = s_axi4_awvalid & s_axi4_awready ;
+#
+# // if 1: accept and forward a transaction translated by L1
+# // 2: drop or save request (if L2 slot not occupied already)
+# assign m_axi4_awvalid = (s_axi4_awvalid & l1_accept_i) |
+# l2_sending_o;
+# assign s_axi4_awready = (m_axi4_awvalid & m_axi4_awready & ~l2_sending_o) |
+# (s_axi4_awvalid & (l1_drop_i | l1_save));
+#
+# generate
+# if (ENABLE_L2TLB == 1) begin
+# logic [AXI_USER_WIDTH-1:0] l2_axi4_awuser ;
+# logic [3:0] l2_axi4_awcache ;
+# logic [3:0] l2_axi4_awregion;
+# logic [3:0] l2_axi4_awqos ;
+# logic [2:0] l2_axi4_awprot ;
+# logic l2_axi4_awlock ;
+# logic [1:0] l2_axi4_awburst ;
+# logic [2:0] l2_axi4_awsize ;
+# logic [7:0] l2_axi4_awlen ;
+# logic [AXI_ID_WIDTH-1:0] l2_axi4_awid ;
+#
+# assign m_axi4_awuser = l2_sending_o ? l2_axi4_awuser : s_axi4_awuser;
+# assign m_axi4_awcache = l2_sending_o ? l2_axi4_awcache : s_axi4_awcache;
+# assign m_axi4_awregion = l2_sending_o ? l2_axi4_awregion : s_axi4_awregion;
+# assign m_axi4_awqos = l2_sending_o ? l2_axi4_awqos : s_axi4_awqos;
+# assign m_axi4_awprot = l2_sending_o ? l2_axi4_awprot : s_axi4_awprot;
+# assign m_axi4_awlock = l2_sending_o ? l2_axi4_awlock : s_axi4_awlock;
+# assign m_axi4_awburst = l2_sending_o ? l2_axi4_awburst : s_axi4_awburst;
+# assign m_axi4_awsize = l2_sending_o ? l2_axi4_awsize : s_axi4_awsize;
+# assign m_axi4_awlen = l2_sending_o ? l2_axi4_awlen : s_axi4_awlen;
+# assign m_axi4_awaddr = l2_sending_o ? l2_awaddr_i : l1_awaddr_i;
+# assign m_axi4_awid = l2_sending_o ? l2_axi4_awid : s_axi4_awid;
+#
+# // buffer AXI signals in case of L1 miss
+# always @(posedge axi4_aclk or negedge axi4_arstn) begin
+# if (axi4_arstn == 1'b0) begin
+# l2_axi4_awuser <= 'b0;
+# l2_axi4_awcache <= 'b0;
+# l2_axi4_awregion <= 'b0;
+# l2_axi4_awqos <= 'b0;
+# l2_axi4_awprot <= 'b0;
+# l2_axi4_awlock <= 1'b0;
+# l2_axi4_awburst <= 'b0;
+# l2_axi4_awsize <= 'b0;
+# l2_axi4_awlen <= 'b0;
+# l2_axi4_awid <= 'b0;
+# end else if (l1_save) begin
+# l2_axi4_awuser <= s_axi4_awuser;
+# l2_axi4_awcache <= s_axi4_awcache;
+# l2_axi4_awregion <= s_axi4_awregion;
+# l2_axi4_awqos <= s_axi4_awqos;
+# l2_axi4_awprot <= s_axi4_awprot;
+# l2_axi4_awlock <= s_axi4_awlock;
+# l2_axi4_awburst <= s_axi4_awburst;
+# l2_axi4_awsize <= s_axi4_awsize;
+# l2_axi4_awlen <= s_axi4_awlen;
+# l2_axi4_awid <= s_axi4_awid;
+# end
+# end
+#
+# // signal that an l1_save_i can be accepted
+# always @(posedge axi4_aclk or negedge axi4_arstn) begin
+# if (axi4_arstn == 1'b0) begin
+# l2_available_q <= 1'b1;
+# end else if (l2_sent | l2_drop_i) begin
+# l2_available_q <= 1'b1;
+# end else if (l1_save) begin
+# l2_available_q <= 1'b0;
+# end
+# end
+#
+# assign l2_sending_o = l2_accept_i & ~l2_available_q;
+# assign l2_sent = l2_sending_o & m_axi4_awvalid & m_axi4_awready;
+#
+# // if 1: having sent out a transaction translated by L2
+# // 2: drop request (L2 slot is available again)
+# assign l2_done_o = l2_sent | l2_drop_i;
+#
+# end else begin // !`ifdef ENABLE_L2TLB
+# assign m_axi4_awuser = s_axi4_awuser;
+# assign m_axi4_awcache = s_axi4_awcache;
+# assign m_axi4_awregion = s_axi4_awregion;
+# assign m_axi4_awqos = s_axi4_awqos;
+# assign m_axi4_awprot = s_axi4_awprot;
+# assign m_axi4_awlock = s_axi4_awlock;
+# assign m_axi4_awburst = s_axi4_awburst;
+# assign m_axi4_awsize = s_axi4_awsize;
+# assign m_axi4_awlen = s_axi4_awlen;
+# assign m_axi4_awaddr = l1_awaddr_i;
+# assign m_axi4_awid = s_axi4_awid;
+#
+# assign l2_sending_o = 1'b0;
+# assign l2_available_q = 1'b0;
+# assign l2_done_o = 1'b0;
+# end // !`ifdef ENABLE_L2TLB
+# endgenerate
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_b_buffer(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.s_axi4_bid = Signal(AXI_ID_WIDTH) # output
+ self.s_axi4_bresp = Signal(2) # output
+ self.s_axi4_bvalid = Signal() # output
+ self.s_axi4_buser = Signal(AXI_USER_WIDTH) # output
+ self.s_axi4_bready = Signal() # input
+ self.m_axi4_bid = Signal(AXI_ID_WIDTH) # input
+ self.m_axi4_bresp = Signal(2) # input
+ self.m_axi4_bvalid = Signal() # input
+ self.m_axi4_buser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_bready = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.None.eq(self.m_axi4_bresp)
+ m.d.comb += self.None.eq(self.m_axi4_bid)
+ m.d.comb += self.None.eq(self.m_axi4_buser)
+ m.d.comb += self.s_axi4_buser.eq(self.None)
+ m.d.comb += self.s_axi4_bid.eq(self.None)
+ m.d.comb += self.s_axi4_bresp.eq(self.None)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_b_buffer
+# #(
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# output logic [AXI_ID_WIDTH-1:0] s_axi4_bid,
+# output logic [1:0] s_axi4_bresp,
+# output logic s_axi4_bvalid,
+# output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
+# input logic s_axi4_bready,
+#
+# input logic [AXI_ID_WIDTH-1:0] m_axi4_bid,
+# input logic [1:0] m_axi4_bresp,
+# input logic m_axi4_bvalid,
+# input logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
+# output logic m_axi4_bready
+# );
+#
+# wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_in;
+# wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_out;
+#
+# assign data_in [1:0] = m_axi4_bresp;
+# assign data_in [AXI_ID_WIDTH+1:2] = m_axi4_bid;
+# assign data_in[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2] = m_axi4_buser;
+#
+# assign s_axi4_buser = data_out[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2];
+# assign s_axi4_bid = data_out[AXI_ID_WIDTH+1:2];
+# assign s_axi4_bresp = data_out[1:0];
+#
+# axi_buffer_rab
+# #(
+# .DATA_WIDTH ( AXI_ID_WIDTH+AXI_USER_WIDTH+2 ),
+# .BUFFER_DEPTH ( 4 )
+# )
+# u_buffer
+# (
+# .clk ( axi4_aclk ),
+# .rstn ( axi4_arstn ),
+# .valid_out( s_axi4_bvalid ),
+# .data_out ( data_out ),
+# .ready_in ( s_axi4_bready ),
+# .valid_in ( m_axi4_bvalid ),
+# .data_in ( data_in ),
+# .ready_out( m_axi4_bready )
+# );
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_b_sender(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.drop_i = Signal() # input
+ self.done_o = Signal() # output
+ self.id_i = Signal(AXI_ID_WIDTH) # input
+ self.prefetch_i = Signal() # input
+ self.hit_i = Signal() # input
+ self.s_axi4_bid = Signal(AXI_ID_WIDTH) # output
+ self.s_axi4_bresp = Signal(2) # output
+ self.s_axi4_bvalid = Signal() # output
+ self.s_axi4_buser = Signal(AXI_USER_WIDTH) # output
+ self.s_axi4_bready = Signal() # input
+ self.m_axi4_bid = Signal(AXI_ID_WIDTH) # input
+ self.m_axi4_bresp = Signal(2) # input
+ self.m_axi4_bvalid = Signal() # input
+ self.m_axi4_buser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_bready = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.fifo_push.eq(self.None)
+ m.d.comb += self.done_o.eq(self.fifo_push)
+ m.d.comb += self.fifo_pop.eq(self.None)
+ m.d.comb += self.s_axi4_buser.eq(self.None)
+ m.d.comb += self.s_axi4_bid.eq(self.None)
+ m.d.comb += self.s_axi4_bresp.eq(self.None)
+ m.d.comb += self.s_axi4_bvalid.eq(self.None)
+ m.d.comb += self.m_axi4_bready.eq(self.None)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_b_sender
+# #(
+# parameter AXI_ID_WIDTH = 10,
+# parameter AXI_USER_WIDTH = 4
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# input logic drop_i,
+# output logic done_o,
+# input logic [AXI_ID_WIDTH-1:0] id_i,
+# input logic prefetch_i,
+# input logic hit_i,
+#
+# output logic [AXI_ID_WIDTH-1:0] s_axi4_bid,
+# output logic [1:0] s_axi4_bresp,
+# output logic s_axi4_bvalid,
+# output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
+# input logic s_axi4_bready,
+#
+# input logic [AXI_ID_WIDTH-1:0] m_axi4_bid,
+# input logic [1:0] m_axi4_bresp,
+# input logic m_axi4_bvalid,
+# input logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
+# output logic m_axi4_bready
+# );
+#
+# logic fifo_valid;
+# logic fifo_pop;
+# logic fifo_push;
+# logic fifo_ready;
+# logic [AXI_ID_WIDTH-1:0] id;
+# logic prefetch;
+# logic hit;
+#
+# logic dropping;
+#
+# axi_buffer_rab
+# #(
+# .DATA_WIDTH ( 2+AXI_ID_WIDTH ),
+# .BUFFER_DEPTH ( 4 )
+# )
+# u_fifo
+# (
+# .clk ( axi4_aclk ),
+# .rstn ( axi4_arstn ),
+# // Pop
+# .data_out ( {prefetch, hit, id} ),
+# .valid_out ( fifo_valid ),
+# .ready_in ( fifo_pop ),
+# // Push
+# .valid_in ( fifo_push ),
+# .data_in ( {prefetch_i, hit_i, id_i} ),
+# .ready_out ( fifo_ready )
+# );
+#
+# assign fifo_push = drop_i & fifo_ready;
+# assign done_o = fifo_push;
+#
+# assign fifo_pop = dropping & s_axi4_bready;
+#
+# always @ (posedge axi4_aclk or negedge axi4_arstn) begin
+# if (axi4_arstn == 1'b0) begin
+# dropping <= 1'b0;
+# end else begin
+# if (fifo_valid && ~dropping)
+# dropping <= 1'b1;
+# else if (fifo_pop)
+# dropping <= 1'b0;
+# end
+# end
+#
+# assign s_axi4_buser = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_buser;
+# assign s_axi4_bid = dropping ? id : m_axi4_bid;
+#
+# assign s_axi4_bresp = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
+# (dropping & prefetch ) ? 2'b10 : // prefetch miss
+# (dropping & hit) ? 2'b10 : // non-prefetch multi, prot
+# (dropping ) ? 2'b10 : // non-prefetch miss
+# m_axi4_bresp;
+#
+# assign s_axi4_bvalid = dropping | m_axi4_bvalid;
+# assign m_axi4_bready = ~dropping & s_axi4_bready;
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_r_buffer(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.s_axi4_rid = Signal(AXI_ID_WIDTH) # output
+ self.s_axi4_rresp = Signal(2) # output
+ self.s_axi4_rdata = Signal(AXI_DATA_WIDTH) # output
+ self.s_axi4_rlast = Signal() # output
+ self.s_axi4_rvalid = Signal() # output
+ self.s_axi4_ruser = Signal(AXI_USER_WIDTH) # output
+ self.s_axi4_rready = Signal() # input
+ self.m_axi4_rid = Signal(AXI_ID_WIDTH) # input
+ self.m_axi4_rresp = Signal(2) # input
+ self.m_axi4_rdata = Signal(AXI_DATA_WIDTH) # input
+ self.m_axi4_rlast = Signal() # input
+ self.m_axi4_rvalid = Signal() # input
+ self.m_axi4_ruser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_rready = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.None.eq(self.m_axi4_rresp)
+ m.d.comb += self.None.eq(self.m_axi4_rlast)
+ m.d.comb += self.None.eq(self.m_axi4_rid)
+ m.d.comb += self.None.eq(self.m_axi4_rdata)
+ m.d.comb += self.None.eq(self.m_axi4_ruser)
+ m.d.comb += self.s_axi4_rresp.eq(self.None)
+ m.d.comb += self.s_axi4_rlast.eq(self.None)
+ m.d.comb += self.s_axi4_rid.eq(self.None)
+ m.d.comb += self.s_axi4_rdata.eq(self.None)
+ m.d.comb += self.s_axi4_ruser.eq(self.None)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_r_buffer
+# #(
+# parameter AXI_DATA_WIDTH = 32,
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# output logic [AXI_ID_WIDTH-1:0] s_axi4_rid,
+# output logic [1:0] s_axi4_rresp,
+# output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+# output logic s_axi4_rlast,
+# output logic s_axi4_rvalid,
+# output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+# input logic s_axi4_rready,
+#
+# input logic [AXI_ID_WIDTH-1:0] m_axi4_rid,
+# input logic [1:0] m_axi4_rresp,
+# input logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
+# input logic m_axi4_rlast,
+# input logic m_axi4_rvalid,
+# input logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
+# output logic m_axi4_rready
+# );
+#
+# wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_in;
+# wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_out;
+#
+# localparam ID_START = 3;
+# localparam ID_END = AXI_ID_WIDTH-1 + ID_START;
+# localparam DATA_START = ID_END + 1;
+# localparam DATA_END = AXI_DATA_WIDTH-1 + DATA_START;
+# localparam USER_START = DATA_END + 1;
+# localparam USER_END = AXI_USER_WIDTH-1 + USER_START;
+#
+# assign data_in [1:0] = m_axi4_rresp;
+# assign data_in [2] = m_axi4_rlast;
+# assign data_in [ID_END:ID_START] = m_axi4_rid;
+# assign data_in[DATA_END:DATA_START] = m_axi4_rdata;
+# assign data_in[USER_END:USER_START] = m_axi4_ruser;
+#
+# assign s_axi4_rresp = data_out [1:0];
+# assign s_axi4_rlast = data_out [2];
+# assign s_axi4_rid = data_out [ID_END:ID_START];
+# assign s_axi4_rdata = data_out[DATA_END:DATA_START];
+# assign s_axi4_ruser = data_out[USER_END:USER_START];
+#
+# axi_buffer_rab
+# #(
+# .DATA_WIDTH ( AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3 ),
+# .BUFFER_DEPTH ( 4 )
+# )
+# u_buffer
+# (
+# .clk ( axi4_aclk ),
+# .rstn ( axi4_arstn ),
+# // Pop
+# .valid_out ( s_axi4_rvalid ),
+# .data_out ( data_out ),
+# .ready_in ( s_axi4_rready ),
+# // Push
+# .valid_in ( m_axi4_rvalid ),
+# .data_in ( data_in ),
+# .ready_out ( m_axi4_rready )
+# );
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_r_sender(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.drop_i = Signal() # input
+ self.drop_len_i = Signal(8) # input
+ self.done_o = Signal() # output
+ self.id_i = Signal(AXI_ID_WIDTH) # input
+ self.prefetch_i = Signal() # input
+ self.hit_i = Signal() # input
+ self.s_axi4_rid = Signal(AXI_ID_WIDTH) # output
+ self.s_axi4_rresp = Signal(2) # output
+ self.s_axi4_rdata = Signal(AXI_DATA_WIDTH) # output
+ self.s_axi4_rlast = Signal() # output
+ self.s_axi4_rvalid = Signal() # output
+ self.s_axi4_ruser = Signal(AXI_USER_WIDTH) # output
+ self.s_axi4_rready = Signal() # input
+ self.m_axi4_rid = Signal(AXI_ID_WIDTH) # input
+ self.m_axi4_rresp = Signal(2) # input
+ self.m_axi4_rdata = Signal(AXI_DATA_WIDTH) # input
+ self.m_axi4_rlast = Signal() # input
+ self.m_axi4_rvalid = Signal() # input
+ self.m_axi4_ruser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_rready = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.fifo_push.eq(self.None)
+ m.d.comb += self.done_o.eq(self.fifo_push)
+ m.d.comb += self.s_axi4_rdata.eq(self.m_axi4_rdata)
+ m.d.comb += self.s_axi4_ruser.eq(self.None)
+ m.d.comb += self.s_axi4_rid.eq(self.None)
+ m.d.comb += self.s_axi4_rresp.eq(self.None)
+ m.d.comb += self.s_axi4_rvalid.eq(self.None)
+ m.d.comb += self.m_axi4_rready.eq(self.None)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# module axi4_r_sender
+# #(
+# parameter AXI_DATA_WIDTH = 32,
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# input logic drop_i,
+# input logic [7:0] drop_len_i,
+# output logic done_o,
+# input logic [AXI_ID_WIDTH-1:0] id_i,
+# input logic prefetch_i,
+# input logic hit_i,
+#
+# output logic [AXI_ID_WIDTH-1:0] s_axi4_rid,
+# output logic [1:0] s_axi4_rresp,
+# output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+# output logic s_axi4_rlast,
+# output logic s_axi4_rvalid,
+# output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+# input logic s_axi4_rready,
+#
+# input logic [AXI_ID_WIDTH-1:0] m_axi4_rid,
+# input logic [1:0] m_axi4_rresp,
+# input logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
+# input logic m_axi4_rlast,
+# input logic m_axi4_rvalid,
+# input logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
+# output logic m_axi4_rready
+# );
+#
+# localparam BUFFER_DEPTH = 16;
+#
+# logic fifo_valid;
+# logic fifo_pop;
+# logic fifo_push;
+# logic fifo_ready;
+# logic [AXI_ID_WIDTH-1:0] id;
+# logic [7:0] len;
+# logic prefetch;
+# logic hit;
+#
+# logic dropping;
+#
+# enum logic [1:0] { FORWARDING, DROPPING }
+# state_d, state_q;
+# logic burst_ongoing_d, burst_ongoing_q;
+# logic [7:0] drop_cnt_d, drop_cnt_q;
+#
+# axi_buffer_rab
+# #(
+# .DATA_WIDTH ( 2+AXI_ID_WIDTH+8 ),
+# .BUFFER_DEPTH ( BUFFER_DEPTH )
+# )
+# u_fifo
+# (
+# .clk ( axi4_aclk ),
+# .rstn ( axi4_arstn ),
+# // Pop
+# .data_out ( {prefetch, hit, id, len} ),
+# .valid_out ( fifo_valid ),
+# .ready_in ( fifo_pop ),
+# // Push
+# .valid_in ( fifo_push ),
+# .data_in ( {prefetch_i, hit_i, id_i, drop_len_i} ),
+# .ready_out ( fifo_ready )
+# );
+#
+# assign fifo_push = drop_i & fifo_ready;
+# assign done_o = fifo_push;
+#
+# always_comb begin
+# burst_ongoing_d = burst_ongoing_q;
+# drop_cnt_d = drop_cnt_q;
+# dropping = 1'b0;
+# s_axi4_rlast = 1'b0;
+# fifo_pop = 1'b0;
+# state_d = state_q;
+#
+# case (state_q)
+# FORWARDING: begin
+# s_axi4_rlast = m_axi4_rlast;
+# // Remember whether there is currently a burst ongoing.
+# if (m_axi4_rvalid && m_axi4_rready) begin
+# if (m_axi4_rlast) begin
+# burst_ongoing_d = 1'b0;
+# end else begin
+# burst_ongoing_d = 1'b1;
+# end
+# end
+# // If there is no burst ongoing and the FIFO has a drop request ready, process it.
+# if (!burst_ongoing_d && fifo_valid) begin
+# drop_cnt_d = len;
+# state_d = DROPPING;
+# end
+# end
+#
+# DROPPING: begin
+# dropping = 1'b1;
+# s_axi4_rlast = (drop_cnt_q == '0);
+# // Handshake on slave interface
+# if (s_axi4_rready) begin
+# drop_cnt_d -= 1;
+# if (drop_cnt_q == '0) begin
+# drop_cnt_d = '0;
+# fifo_pop = 1'b1;
+# state_d = FORWARDING;
+# end
+# end
+# end
+#
+# default: begin
+# state_d = FORWARDING;
+# end
+# endcase
+# end
+#
+# assign s_axi4_rdata = m_axi4_rdata;
+#
+# assign s_axi4_ruser = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_ruser;
+# assign s_axi4_rid = dropping ? id : m_axi4_rid;
+#
+# assign s_axi4_rresp = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
+# (dropping & prefetch ) ? 2'b10 : // prefetch miss
+# (dropping & hit) ? 2'b10 : // non-prefetch multi, prot
+# (dropping ) ? 2'b10 : // non-prefetch miss
+# m_axi4_rresp;
+#
+# assign s_axi4_rvalid = dropping | m_axi4_rvalid;
+# assign m_axi4_rready = ~dropping & s_axi4_rready;
+#
+# always_ff @(posedge axi4_aclk, negedge axi4_arstn) begin
+# if (axi4_arstn == 1'b0) begin
+# burst_ongoing_q <= 1'b0;
+# drop_cnt_q <= 'b0;
+# state_q <= FORWARDING;
+# end else begin
+# burst_ongoing_q <= burst_ongoing_d;
+# drop_cnt_q <= drop_cnt_d;
+# state_q <= state_d;
+# end
+# end
+#
+# endmodule
+#
+#
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_w_buffer(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.l1_done_o = Signal() # output
+ self.l1_accept_i = Signal() # input
+ self.l1_save_i = Signal() # input
+ self.l1_drop_i = Signal() # input
+ self.l1_master_i = Signal() # input
+ self.l1_id_i = Signal(AXI_ID_WIDTH) # input
+ self.l1_len_i = Signal(8) # input
+ self.l1_prefetch_i = Signal() # input
+ self.l1_hit_i = Signal() # input
+ self.l2_done_o = Signal() # output
+ self.l2_accept_i = Signal() # input
+ self.l2_drop_i = Signal() # input
+ self.l2_master_i = Signal() # input
+ self.l2_id_i = Signal(AXI_ID_WIDTH) # input
+ self.l2_len_i = Signal(8) # input
+ self.l2_prefetch_i = Signal() # input
+ self.l2_hit_i = Signal() # input
+ self.master_select_o = Signal() # output
+ self.input_stall_o = Signal() # output
+ self.output_stall_o = Signal() # output
+ self.b_drop_o = Signal() # output
+ self.b_done_i = Signal() # input
+ self.id_o = Signal(AXI_ID_WIDTH) # output
+ self.prefetch_o = Signal() # output
+ self.hit_o = Signal() # output
+ self.s_axi4_wdata = Signal(AXI_DATA_WIDTH) # input
+ self.s_axi4_wvalid = Signal() # input
+ self.s_axi4_wready = Signal() # output
+ self.s_axi4_wstrb = Signal(1+ERROR p_expression_25) # input
+ self.s_axi4_wlast = Signal() # input
+ self.s_axi4_wuser = Signal(AXI_USER_WIDTH) # input
+ self.m_axi4_wdata = Signal(AXI_DATA_WIDTH) # output
+ self.m_axi4_wvalid = Signal() # output
+ self.m_axi4_wready = Signal() # input
+ self.m_axi4_wstrb = Signal(1+ERROR p_expression_25) # output
+ self.m_axi4_wlast = Signal() # output
+ self.m_axi4_wuser = Signal(AXI_USER_WIDTH) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+#
+# //import CfMath::log2;
+#
+# module axi4_w_buffer
+# #(
+# parameter AXI_DATA_WIDTH = 32,
+# parameter AXI_ID_WIDTH = 4,
+# parameter AXI_USER_WIDTH = 4,
+# parameter ENABLE_L2TLB = 0,
+# parameter HUM_BUFFER_DEPTH = 16
+# )
+# (
+# input logic axi4_aclk,
+# input logic axi4_arstn,
+#
+# // L1 & L2 interfaces
+# output logic l1_done_o,
+# input logic l1_accept_i,
+# input logic l1_save_i,
+# input logic l1_drop_i,
+# input logic l1_master_i,
+# input logic [AXI_ID_WIDTH-1:0] l1_id_i,
+# input logic [7:0] l1_len_i,
+# input logic l1_prefetch_i,
+# input logic l1_hit_i,
+#
+# output logic l2_done_o,
+# input logic l2_accept_i,
+# input logic l2_drop_i,
+# input logic l2_master_i,
+# input logic [AXI_ID_WIDTH-1:0] l2_id_i,
+# input logic [7:0] l2_len_i,
+# input logic l2_prefetch_i,
+# input logic l2_hit_i,
+#
+# output logic master_select_o,
+# output logic input_stall_o,
+# output logic output_stall_o,
+#
+# // B sender interface
+# output logic b_drop_o,
+# input logic b_done_i,
+# output logic [AXI_ID_WIDTH-1:0] id_o,
+# output logic prefetch_o,
+# output logic hit_o,
+#
+# // AXI W channel interfaces
+# input logic [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+# input logic s_axi4_wvalid,
+# output logic s_axi4_wready,
+# input logic [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+# input logic s_axi4_wlast,
+# input logic [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+# output logic [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
+# output logic m_axi4_wvalid,
+# input logic m_axi4_wready,
+# output logic [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
+# output logic m_axi4_wlast,
+# output logic [AXI_USER_WIDTH-1:0] m_axi4_wuser
+# );
+#
+"""
+
+ localparam BUFFER_WIDTH = AXI_DATA_WIDTH+AXI_USER_WIDTH+AXI_DATA_WIDTH/8+1;
+
+ localparam INPUT_BUFFER_DEPTH = 4;
+ localparam L1_FIFO_DEPTH = 8;
+ localparam L2_FIFO_DEPTH = 4;
+
+ logic [AXI_DATA_WIDTH-1:0] axi4_wdata;
+ logic axi4_wvalid;
+ logic axi4_wready;
+ logic [AXI_DATA_WIDTH/8-1:0] axi4_wstrb;
+ logic axi4_wlast;
+ logic [AXI_USER_WIDTH-1:0] axi4_wuser;
+
+ logic l1_fifo_valid_out;
+ logic l1_fifo_ready_in;
+ logic l1_fifo_valid_in;
+ logic l1_fifo_ready_out;
+
+ logic l1_req;
+ logic l1_accept_cur, l1_save_cur, l1_drop_cur;
+ logic l1_master_cur;
+ logic [AXI_ID_WIDTH-1:0] l1_id_cur;
+ logic [7:0] l1_len_cur;
+ logic l1_hit_cur, l1_prefetch_cur;
+ logic l1_save_in, l1_save_out;
+ logic [log2(L1_FIFO_DEPTH)-1:0] n_l1_save_SP;
+
+ logic l2_fifo_valid_out;
+ logic l2_fifo_ready_in;
+ logic l2_fifo_valid_in;
+ logic l2_fifo_ready_out;
+
+ logic l2_req;
+ logic l2_accept_cur, l2_drop_cur;
+ logic l2_master_cur;
+ logic [AXI_ID_WIDTH-1:0] l2_id_cur;
+ logic [7:0] l2_len_cur;
+ logic l2_hit_cur, l2_prefetch_cur;
+
+ logic fifo_select, fifo_select_SN, fifo_select_SP;
+ logic w_done;
+ logic b_drop_set;
+
+ // HUM buffer signals
+ logic hum_buf_ready_out;
+ logic hum_buf_valid_in;
+ logic hum_buf_ready_in;
+ logic hum_buf_valid_out;
+ logic hum_buf_underfull;
+
+ logic [AXI_DATA_WIDTH-1:0] hum_buf_wdata;
+ logic [AXI_DATA_WIDTH/8-1:0] hum_buf_wstrb;
+ logic hum_buf_wlast;
+ logic [AXI_USER_WIDTH-1:0] hum_buf_wuser;
+
+ logic hum_buf_drop_req_SN, hum_buf_drop_req_SP;
+ logic [7:0] hum_buf_drop_len_SN, hum_buf_drop_len_SP;
+ logic hum_buf_almost_full;
+
+ logic stop_store;
+ logic wlast_in, wlast_out;
+ logic signed [3:0] n_wlast_SN, n_wlast_SP;
+ logic block_forwarding;
+
+ // Search FSM
+ typedef enum logic [3:0] {STORE, BYPASS,
+ WAIT_L1_BYPASS_YES, WAIT_L2_BYPASS_YES,
+ WAIT_L1_BYPASS_NO, WAIT_L2_BYPASS_NO,
+ FLUSH, DISCARD,
+ DISCARD_FINISH}
+ hum_buf_state_t;
+ hum_buf_state_t hum_buf_SP; // Present state
+ hum_buf_state_tbg hum_buf_SN; // Next State
+
+ axi_buffer_rab
+ #(
+ .DATA_WIDTH ( BUFFER_WIDTH ),
+ .BUFFER_DEPTH ( INPUT_BUFFER_DEPTH )
+ )
+ u_input_buf
+ (
+ .clk ( axi4_aclk ),
+ .rstn ( axi4_arstn ),
+ // Push
+ .data_in ( {s_axi4_wuser, s_axi4_wstrb, s_axi4_wdata, s_axi4_wlast} ),
+ .valid_in ( s_axi4_wvalid ),
+ .ready_out ( s_axi4_wready ),
+ // Pop
+ .data_out ( {axi4_wuser, axi4_wstrb, axi4_wdata, axi4_wlast} ),
+ .valid_out ( axi4_wvalid ),
+ .ready_in ( axi4_wready )
+ );
+
+ axi_buffer_rab
+ #(
+ .DATA_WIDTH ( 2+AXI_ID_WIDTH+8+4 ),
+ .BUFFER_DEPTH ( L1_FIFO_DEPTH )
+ )
+ u_l1_fifo
+ (
+ .clk ( axi4_aclk ),
+ .rstn ( axi4_arstn ),
+ // Push
+ .data_in ( {l1_prefetch_i, l1_hit_i, l1_id_i, l1_len_i, l1_master_i, l1_accept_i, l1_save_i, l1_drop_i} ),
+ .valid_in ( l1_fifo_valid_in ),
+ .ready_out ( l1_fifo_ready_out ),
+ // Pop
+ .data_out ( {l1_prefetch_cur, l1_hit_cur, l1_id_cur, l1_len_cur, l1_master_cur, l1_accept_cur, l1_save_cur, l1_drop_cur} ),
+ .valid_out ( l1_fifo_valid_out ),
+ .ready_in ( l1_fifo_ready_in )
+ );
+
+ // Push upon receiving new requests from the TLB.
+ assign l1_req = l1_accept_i | l1_save_i | l1_drop_i;
+ assign l1_fifo_valid_in = l1_req & l1_fifo_ready_out;
+
+ // Signal handshake
+ assign l1_done_o = l1_fifo_valid_in;
+ assign l2_done_o = l2_fifo_valid_in;
+
+ // Stall AW input of L1 TLB
+ assign input_stall_o = ~(l1_fifo_ready_out & l2_fifo_ready_out);
+
+ // Interface b_drop signals + handshake
+ always_comb begin
+ if (fifo_select == 1'b0) begin
+ prefetch_o = l1_prefetch_cur;
+ hit_o = l1_hit_cur;
+ id_o = l1_id_cur;
+
+ l1_fifo_ready_in = w_done | b_done_i;
+ l2_fifo_ready_in = 1'b0;
+ end else begin
+ prefetch_o = l2_prefetch_cur;
+ hit_o = l2_hit_cur;
+ id_o = l2_id_cur;
+
+ l1_fifo_ready_in = 1'b0;
+ l2_fifo_ready_in = w_done | b_done_i;
+ end
+ end
+
+ // Detect when an L1 transaction save request enters or exits the L1 FIFO.
+ assign l1_save_in = l1_fifo_valid_in & l1_save_i;
+ assign l1_save_out = l1_fifo_ready_in & l1_save_cur;
+
+ // Count the number of L1 transaction to save in the L1 FIFO.
+ always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+ if (axi4_arstn == 0) begin
+ n_l1_save_SP <= '0;
+ end else if (l1_save_in ^ l1_save_out) begin
+ if (l1_save_in) begin
+ n_l1_save_SP <= n_l1_save_SP + 1'b1;
+ end else if (l1_save_out) begin
+ n_l1_save_SP <= n_l1_save_SP - 1'b1;
+ end
+ end
+ end
+
+ // Stall forwarding of AW L1 hits if:
+ // 1. The HUM buffer does not allow to be bypassed.
+ // 2. There are multiple L1 save requests in the FIFO, i.e., multiple L2 outputs pending.
+ assign output_stall_o = (n_l1_save_SP > 1) || (block_forwarding == 1'b1);
+
+ generate
+ if (ENABLE_L2TLB == 1) begin : HUM_BUFFER
+
+ axi_buffer_rab_bram
+ #(
+ .DATA_WIDTH ( BUFFER_WIDTH ),
+ .BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
+ )
+ u_hum_buf
+ (
+ .clk ( axi4_aclk ),
+ .rstn ( axi4_arstn ),
+ // Push
+ .data_in ( {axi4_wuser, axi4_wstrb, axi4_wdata, axi4_wlast} ),
+ .valid_in ( hum_buf_valid_in ),
+ .ready_out ( hum_buf_ready_out ),
+ // Pop
+ .data_out ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ),
+ .valid_out ( hum_buf_valid_out ),
+ .ready_in ( hum_buf_ready_in ),
+ // Clear
+ .almost_full ( hum_buf_almost_full ),
+ .underfull ( hum_buf_underfull ),
+ .drop_req ( hum_buf_drop_req_SP ),
+ .drop_len ( hum_buf_drop_len_SP )
+ );
+
+ axi_buffer_rab
+ #(
+ .DATA_WIDTH ( 2+AXI_ID_WIDTH+8+3 ),
+ .BUFFER_DEPTH ( L2_FIFO_DEPTH )
+ )
+ u_l2_fifo
+ (
+ .clk ( axi4_aclk ),
+ .rstn ( axi4_arstn ),
+ // Push
+ .data_in ( {l2_prefetch_i, l2_hit_i, l2_id_i, l2_len_i, l2_master_i, l2_accept_i, l2_drop_i} ),
+ .valid_in ( l2_fifo_valid_in ),
+ .ready_out ( l2_fifo_ready_out ),
+ // Pop
+ .data_out ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur} ),
+ .valid_out ( l2_fifo_valid_out ),
+ .ready_in ( l2_fifo_ready_in )
+ );
+
+ // Push upon receiving new result from TLB.
+ assign l2_req = l2_accept_i | l2_drop_i;
+ assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out;
+
+ assign wlast_in = axi4_wlast & hum_buf_valid_in & hum_buf_ready_out;
+ assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in;
+
+ always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+ if (axi4_arstn == 0) begin
+ fifo_select_SP <= 1'b0;
+ hum_buf_drop_len_SP <= 'b0;
+ hum_buf_drop_req_SP <= 1'b0;
+ hum_buf_SP <= STORE;
+ n_wlast_SP <= 'b0;
+ end else begin
+ fifo_select_SP <= fifo_select_SN;
+ hum_buf_drop_len_SP <= hum_buf_drop_len_SN;
+ hum_buf_drop_req_SP <= hum_buf_drop_req_SN;
+ hum_buf_SP <= hum_buf_SN;
+ n_wlast_SP <= n_wlast_SN;
+ end
+ end
+
+ always_comb begin
+ n_wlast_SN = n_wlast_SP;
+ if (hum_buf_drop_req_SP) begin // Happens exactly once per burst to be dropped.
+ n_wlast_SN -= 1;
+ end
+ if (wlast_in) begin
+ n_wlast_SN += 1;
+ end
+ if (wlast_out) begin
+ n_wlast_SN -= 1;
+ end
+ end
+
+ always_comb begin : HUM_BUFFER_FSM
+ hum_buf_SN = hum_buf_SP;
+
+ m_axi4_wlast = 1'b0;
+ m_axi4_wdata = 'b0;
+ m_axi4_wstrb = 'b0;
+ m_axi4_wuser = 'b0;
+
+ m_axi4_wvalid = 1'b0;
+ axi4_wready = 1'b0;
+
+ hum_buf_valid_in = 1'b0;
+ hum_buf_ready_in = 1'b0;
+
+ hum_buf_drop_req_SN = hum_buf_drop_req_SP;
+ hum_buf_drop_len_SN = hum_buf_drop_len_SP;
+ master_select_o = 1'b0;
+
+ w_done = 1'b0; // read from FIFO without handshake with B sender
+ b_drop_o = 1'b0; // send data from FIFO to B sender (with handshake)
+ fifo_select = 1'b0;
+
+ fifo_select_SN = fifo_select_SP;
+ stop_store = 1'b0;
+
+ block_forwarding = 1'b0;
+
+ unique case (hum_buf_SP)
+
+ STORE : begin
+ // Simply store the data in the buffer.
+ hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out;
+ axi4_wready = hum_buf_ready_out;
+
+ // We have got a full burst in the HUM buffer, thus stop storing.
+ if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin
+ hum_buf_SN = WAIT_L1_BYPASS_YES;
+
+ // The buffer is full, thus wait for decision.
+ end else if (~hum_buf_ready_out) begin
+ hum_buf_SN = WAIT_L1_BYPASS_NO;
+ end
+
+ // Avoid the forwarding of L1 hits until we know whether we can bypass.
+ if (l1_fifo_valid_out & l1_save_cur) begin
+ block_forwarding = 1'b1;
+ end
+ end
+
+ WAIT_L1_BYPASS_YES : begin
+ // Wait for orders from L1 TLB.
+ if (l1_fifo_valid_out) begin
+
+ // L1 hit - forward data from buffer
+ if (l1_accept_cur) begin
+ m_axi4_wlast = hum_buf_wlast;
+ m_axi4_wdata = hum_buf_wdata;
+ m_axi4_wstrb = hum_buf_wstrb;
+ m_axi4_wuser = hum_buf_wuser;
+
+ m_axi4_wvalid = hum_buf_valid_out;
+ hum_buf_ready_in = m_axi4_wready;
+
+ master_select_o = l1_master_cur;
+
+ // Detect last data beat.
+ if (wlast_out) begin
+ fifo_select = 1'b0;
+ w_done = 1'b1;
+ hum_buf_SN = STORE;
+ end
+
+ // L1 miss - wait for L2
+ end else if (l1_save_cur) begin
+ fifo_select = 1'b0;
+ w_done = 1'b1;
+ hum_buf_SN = WAIT_L2_BYPASS_YES;
+
+ // L1 prefetch, prot, multi - drop data
+ end else if (l1_drop_cur) begin
+ fifo_select_SN = 1'b0; // L1
+ hum_buf_drop_req_SN = 1'b1;
+ hum_buf_drop_len_SN = l1_len_cur;
+ hum_buf_SN = FLUSH;
+ end
+ end
+ end
+
+ WAIT_L2_BYPASS_YES : begin
+ // Wait for orders from L2 TLB.
+ if (l2_fifo_valid_out) begin
+
+ // L2 hit - forward data from buffer
+ if (l2_accept_cur) begin
+ m_axi4_wlast = hum_buf_wlast;
+ m_axi4_wdata = hum_buf_wdata;
+ m_axi4_wstrb = hum_buf_wstrb;
+ m_axi4_wuser = hum_buf_wuser;
+
+ m_axi4_wvalid = hum_buf_valid_out;
+ hum_buf_ready_in = m_axi4_wready;
+
+ master_select_o = l2_master_cur;
+
+ // Detect last data beat.
+ if (wlast_out) begin
+ fifo_select = 1'b1;
+ w_done = 1'b1;
+ hum_buf_SN = STORE;
+ end
+
+ // L2 miss/prefetch hit
+ end else if (l2_drop_cur) begin
+ fifo_select_SN = 1'b1; // L2
+ hum_buf_drop_req_SN = 1'b1;
+ hum_buf_drop_len_SN = l2_len_cur;
+ hum_buf_SN = FLUSH;
+ end
+
+ // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions.
+ end else if (l1_fifo_valid_out) begin
+
+ // L1 hit
+ if (l1_accept_cur) begin
+ hum_buf_SN = BYPASS;
+
+ // L1 prefetch/prot/multi
+ end else if (l1_drop_cur) begin
+ hum_buf_SN = DISCARD;
+ end
+ end
+ end
+
+ FLUSH : begin
+ // Clear HUM buffer flush request.
+ hum_buf_drop_req_SN = 1'b0;
+
+ // perform handshake with B sender
+ fifo_select = fifo_select_SP;
+ b_drop_o = 1'b1;
+ if (b_done_i) begin
+ hum_buf_SN = STORE;
+ end
+ end
+
+ BYPASS : begin
+ // Forward one full transaction from input buffer.
+ m_axi4_wlast = axi4_wlast;
+ m_axi4_wdata = axi4_wdata;
+ m_axi4_wstrb = axi4_wstrb;
+ m_axi4_wuser = axi4_wuser;
+
+ m_axi4_wvalid = axi4_wvalid;
+ axi4_wready = m_axi4_wready;
+
+ master_select_o = l1_master_cur;
+
+ // We have got a full transaction.
+ if (axi4_wlast & axi4_wready & axi4_wvalid) begin
+ fifo_select = 1'b0;
+ w_done = 1'b1;
+ hum_buf_SN = WAIT_L2_BYPASS_YES;
+ end
+ end
+
+ DISCARD : begin
+ // Discard one full transaction from input buffer.
+ axi4_wready = 1'b1;
+
+ // We have got a full transaction.
+ if (axi4_wlast & axi4_wready & axi4_wvalid) begin
+ // Try to perform handshake with B sender.
+ fifo_select = 1'b0;
+ b_drop_o = 1'b1;
+ // We cannot wait here due to axi4_wready.
+ if (b_done_i) begin
+ hum_buf_SN = WAIT_L2_BYPASS_YES;
+ end else begin
+ hum_buf_SN = DISCARD_FINISH;
+ end
+ end
+ end
+
+ DISCARD_FINISH : begin
+ // Perform handshake with B sender.
+ fifo_select = 1'b0;
+ b_drop_o = 1'b1;
+ if (b_done_i) begin
+ hum_buf_SN = WAIT_L2_BYPASS_YES;
+ end
+ end
+
+ WAIT_L1_BYPASS_NO : begin
+ // Do not allow the forwarding of L1 hits.
+ block_forwarding = 1'b1;
+
+ // Wait for orders from L1 TLB.
+ if (l1_fifo_valid_out) begin
+
+ // L1 hit - forward data from/through HUM buffer and refill the buffer
+ if (l1_accept_cur) begin
+ // Forward data from HUM buffer.
+ m_axi4_wlast = hum_buf_wlast;
+ m_axi4_wdata = hum_buf_wdata;
+ m_axi4_wstrb = hum_buf_wstrb;
+ m_axi4_wuser = hum_buf_wuser;
+
+ m_axi4_wvalid = hum_buf_valid_out;
+ hum_buf_ready_in = m_axi4_wready;
+
+ master_select_o = l1_master_cur;
+
+ // Refill the HUM buffer. Stop when buffer full.
+ stop_store = ~hum_buf_ready_out;
+ hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ;
+ axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out;
+
+ // Detect last data beat.
+ if (wlast_out) begin
+ fifo_select = 1'b0;
+ w_done = 1'b1;
+ if (~hum_buf_ready_out | hum_buf_almost_full) begin
+ hum_buf_SN = WAIT_L1_BYPASS_NO;
+ end else begin
+ hum_buf_SN = STORE;
+ end
+ end
+
+ // Allow the forwarding of L1 hits.
+ block_forwarding = 1'b0;
+
+ // L1 miss - wait for L2
+ end else if (l1_save_cur) begin
+ fifo_select = 1'b0;
+ w_done = 1'b1;
+ hum_buf_SN = WAIT_L2_BYPASS_NO;
+
+ // L1 prefetch, prot, multi - drop data
+ end else if (l1_drop_cur) begin
+ fifo_select_SN = 1'b0; // L1
+ hum_buf_drop_req_SN = 1'b1;
+ hum_buf_drop_len_SN = l1_len_cur;
+ hum_buf_SN = FLUSH;
+
+ // Allow the forwarding of L1 hits.
+ block_forwarding = 1'b0;
+ end
+ end
+ end
+
+ WAIT_L2_BYPASS_NO : begin
+ // Do not allow the forwarding of L1 hits.
+ block_forwarding = 1'b1;
+
+ // Wait for orders from L2 TLB.
+ if (l2_fifo_valid_out) begin
+
+ // L2 hit - forward first part from HUM buffer, rest from input buffer
+ if (l2_accept_cur) begin
+ // Forward data from HUM buffer.
+ m_axi4_wlast = hum_buf_wlast;
+ m_axi4_wdata = hum_buf_wdata;
+ m_axi4_wstrb = hum_buf_wstrb;
+ m_axi4_wuser = hum_buf_wuser;
+
+ m_axi4_wvalid = hum_buf_valid_out;
+ hum_buf_ready_in = m_axi4_wready;
+
+ master_select_o = l2_master_cur;
+
+ // Refill the HUM buffer. Stop when buffer full.
+ stop_store = ~hum_buf_ready_out;
+ hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ;
+ axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out;
+
+ // Detect last data beat.
+ if (wlast_out) begin
+ fifo_select = 1'b1;
+ w_done = 1'b1;
+ if (~hum_buf_ready_out | hum_buf_almost_full) begin
+ hum_buf_SN = WAIT_L1_BYPASS_NO;
+ end else begin
+ hum_buf_SN = STORE;
+ end
+ end
+
+ // Allow the forwarding of L1 hits.
+ block_forwarding = 1'b0;
+
+ // L2 miss/prefetch hit - drop data
+ end else if (l2_drop_cur) begin
+ fifo_select_SN = 1'b1; // L2
+ hum_buf_drop_req_SN = 1'b1;
+ hum_buf_drop_len_SN = l2_len_cur;
+ hum_buf_SN = FLUSH;
+
+ // Allow the forwarding of L1 hits.
+ block_forwarding = 1'b0;
+ end
+ end
+ end
+
+
+ default: begin
+ hum_buf_SN = STORE;
+ end
+
+ endcase // hum_buf_SP
+ end // HUM_BUFFER_FSM
+
+ assign b_drop_set = 1'b0;
+
+ end else begin // HUM_BUFFER
+
+ // register to perform the handshake with B sender
+ always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+ if (axi4_arstn == 0) begin
+ b_drop_o <= 1'b0;
+ end else if (b_done_i) begin
+ b_drop_o <= 1'b0;
+ end else if (b_drop_set) begin
+ b_drop_o <= 1'b1;;
+ end
+ end
+
+ always_comb begin : OUTPUT_CTRL
+
+ fifo_select = 1'b0;
+ w_done = 1'b0;
+ b_drop_set = 1'b0;
+
+ m_axi4_wlast = 1'b0;
+ m_axi4_wdata = 'b0;
+ m_axi4_wstrb = 'b0;
+ m_axi4_wuser = 'b0;
+
+ m_axi4_wvalid = 1'b0;
+ axi4_wready = 1'b0;
+
+ if (l1_fifo_valid_out) begin
+ // forward data
+ if (l1_accept_cur) begin
+ m_axi4_wlast = axi4_wlast;
+ m_axi4_wdata = axi4_wdata;
+ m_axi4_wstrb = axi4_wstrb;
+ m_axi4_wuser = axi4_wuser;
+
+ m_axi4_wvalid = axi4_wvalid;
+ axi4_wready = m_axi4_wready;
+
+ // Simply pop from FIFO upon last data beat.
+ w_done = axi4_wlast & axi4_wvalid & axi4_wready;
+
+ // discard entire burst
+ end else if (b_drop_o == 1'b0) begin
+ axi4_wready = 1'b1;
+
+ // Simply pop from FIFO upon last data beat. Perform handshake with B sender.
+ if (axi4_wlast & axi4_wvalid & axi4_wready)
+ b_drop_set = 1'b1;
+ end
+ end
+
+ end // OUTPUT_CTRL
+
+ assign master_select_o = l1_master_cur;
+ assign l2_fifo_ready_out = 1'b1;
+ assign block_forwarding = 1'b0;
+
+ // unused signals
+ assign hum_buf_ready_out = 1'b0;
+ assign hum_buf_valid_in = 1'b0;
+ assign hum_buf_ready_in = 1'b0;
+ assign hum_buf_valid_out = 1'b0;
+ assign hum_buf_wdata = 'b0;
+ assign hum_buf_wstrb = 'b0;
+ assign hum_buf_wlast = 1'b0;
+ assign hum_buf_wuser = 'b0;
+ assign hum_buf_drop_len_SN = 'b0;
+ assign hum_buf_drop_req_SN = 1'b0;
+ assign hum_buf_almost_full = 1'b0;
+
+ assign l2_fifo_valid_in = 1'b0;
+ assign l2_fifo_valid_out = 1'b0;
+ assign l2_prefetch_cur = 1'b0;
+ assign l2_hit_cur = 1'b0;
+ assign l2_id_cur = 'b0;
+ assign l2_len_cur = 'b0;
+ assign l2_master_cur = 1'b0;
+ assign l2_accept_cur = 1'b0;
+ assign l2_drop_cur = 1'b0;
+
+ assign l2_req = 1'b0;
+
+ assign fifo_select_SN = 1'b0;
+ assign fifo_select_SP = 1'b0;
+
+ assign stop_store = 1'b0;
+ assign n_wlast_SP = 'b0;
+ assign wlast_in = 1'b0;
+ assign wlast_out = 1'b0;
+
+ end // HUM_BUFFER
+
+ endgenerate
+"""
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_w_sender(Elaboratable):
+
+ def __init__(self):
+ self.axi4_aclk = Signal() # input
+ self.axi4_arstn = Signal() # input
+ self.s_axi4_wdata = Signal() # input
+ self.s_axi4_wvalid = Signal() # input
+ self.s_axi4_wready = Signal() # output
+ self.s_axi4_wstrb = Signal() # input
+ self.s_axi4_wlast = Signal() # input
+ self.s_axi4_wuser = Signal() # input
+ self.m_axi4_wdata = Signal() # output
+ self.m_axi4_wvalid = Signal() # output
+ self.m_axi4_wready = Signal() # input
+ self.m_axi4_wstrb = Signal() # output
+ self.m_axi4_wlast = Signal() # output
+ self.m_axi4_wuser = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata)
+ m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb)
+ m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast)
+ m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser)
+ m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid)
+ m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_w_sender
+# #(
+# parameter AXI_DATA_WIDTH = 32,
+# parameter AXI_USER_WIDTH = 2
+# )
+# (
+# input axi4_aclk,
+# input axi4_arstn,
+#
+# input [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+# input s_axi4_wvalid,
+# output s_axi4_wready,
+# input [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+# input s_axi4_wlast,
+# input [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+# output [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
+# output m_axi4_wvalid,
+# input m_axi4_wready,
+# output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
+# output m_axi4_wlast,
+# output [AXI_USER_WIDTH-1:0] m_axi4_wuser
+# );
+#
+# assign m_axi4_wdata = s_axi4_wdata;
+# assign m_axi4_wstrb = s_axi4_wstrb;
+# assign m_axi4_wlast = s_axi4_wlast;
+# assign m_axi4_wuser = s_axi4_wuser;
+#
+# assign m_axi4_wvalid = s_axi4_wvalid;
+# assign s_axi4_wready = m_axi4_wready;
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_buffer_rab(Elaboratable):
+
+ def __init__(self):
+ self.clk = Signal() # input
+ self.rstn = Signal() # input
+ self.data_out = Signal(DATA_WIDTH) # output
+ self.valid_out = Signal() # output
+ self.ready_in = Signal() # input
+ self.valid_in = Signal() # input
+ self.data_in = Signal(DATA_WIDTH) # input
+ self.ready_out = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.d.comb += self.full.eq(self.None)
+ m.d.comb += self.data_out.eq(self.None)
+ m.d.comb += self.valid_out.eq(self.None)
+ m.d.comb += self.ready_out.eq(self.None)
+ return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# module axi_buffer_rab
+# //#(
+# // parameter DATA_WIDTH,
+# // parameter BUFFER_DEPTH
+# //)
+# (
+# input logic clk,
+# input logic rstn,
+#
+# // Downstream port
+# output logic [DATA_WIDTH-1:0] data_out,
+# output logic valid_out,
+# input logic ready_in,
+#
+# // Upstream port
+# input logic valid_in,
+# input logic [DATA_WIDTH-1:0] data_in,
+# output logic ready_out
+# );
+#
+# localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH);
+#
+# // Internal data structures
+# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in; // location to which we last wrote
+# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out; // location from which we last sent
+# reg [LOG_BUFFER_DEPTH : 0] elements; // number of elements in the buffer
+# reg [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0];
+#
+# wire full;
+#
+# integer loop1;
+#
+# assign full = (elements == BUFFER_DEPTH);
+#
+# always @(posedge clk or negedge rstn)
+# begin: elements_sequential
+# if (rstn == 1'b0)
+# elements <= 0;
+# else
+# begin
+# // ------------------
+# // Are we filling up?
+# // ------------------
+# // One out, none in
+# if (ready_in && valid_out && (!valid_in || full))
+# elements <= elements - 1;
+# // None out, one in
+# else if ((!valid_out || !ready_in) && valid_in && !full)
+# elements <= elements + 1;
+# // Else, either one out and one in, or none out and none in - stays unchanged
+# end
+# end
+#
+# always @(posedge clk or negedge rstn)
+# begin: buffers_sequential
+# if (rstn == 1'b0)
+# begin
+# for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1)
+# buffer[loop1] <= 0;
+# end
+# else
+# begin
+# // Update the memory
+# if (valid_in && !full)
+# buffer[pointer_in] <= data_in;
+# end
+# end
+#
+# always @(posedge clk or negedge rstn)
+# begin: sequential
+# if (rstn == 1'b0)
+# begin
+# pointer_out <= 0;
+# pointer_in <= 0;
+# end
+# else
+# begin
+# // ------------------------------------
+# // Check what to do with the input side
+# // ------------------------------------
+# // We have some input, increase by 1 the input pointer
+# if (valid_in && !full)
+# begin
+# if (pointer_in == $unsigned(BUFFER_DEPTH - 1))
+# pointer_in <= 0;
+# else
+# pointer_in <= pointer_in + 1;
+# end
+# // Else we don't have any input, the input pointer stays the same
+#
+# // -------------------------------------
+# // Check what to do with the output side
+# // -------------------------------------
+# // We had pushed one flit out, we can try to go for the next one
+# if (ready_in && valid_out)
+# begin
+# if (pointer_out == $unsigned(BUFFER_DEPTH - 1))
+# pointer_out <= 0;
+# else
+# pointer_out <= pointer_out + 1;
+# end
+# // Else stay on the same output location
+# end
+# end
+#
+# // Update output ports
+# assign data_out = buffer[pointer_out];
+# assign valid_out = (elements != 0);
+#
+# assign ready_out = ~full;
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_buffer_rab_bram(Elaboratable):
+
+ def __init__(self):
+ self.clk = Signal() # input
+ self.rstn = Signal() # input
+ self.data_out = Signal(DATA_WIDTH) # output
+ self.valid_out = Signal() # output
+ self.ready_in = Signal() # input
+ self.valid_in = Signal() # input
+ self.data_in = Signal(DATA_WIDTH) # input
+ self.ready_out = Signal() # output
+ self.almost_full = Signal() # output
+ self.underfull = Signal() # output
+ self.drop_req = Signal() # input
+ self.drop_len = Signal(8) # input
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# ////import CfMath::log2;
+#
+# module axi_buffer_rab_bram
+# //#(
+# // parameter DATA_WIDTH,
+# // parameter BUFFER_DEPTH
+# // )
+# (
+# input logic clk,
+# input logic rstn,
+#
+# // Downstream port
+# output logic [DATA_WIDTH-1:0] data_out,
+# output logic valid_out,
+# input logic ready_in,
+#
+# // Upstream port
+# input logic valid_in,
+# input logic [DATA_WIDTH-1:0] data_in,
+# output logic ready_out,
+#
+# // Status and drop control
+# output logic almost_full,
+# output logic underfull,
+# input logic drop_req,
+# // Number of items to drop. As for AXI lengths, counting starts at zero, i.e., `drop_len == 0`
+# // and `drop_req` means drop one item.
+# input logic [7:0] drop_len
+# );
+#
+""" #docstring_begin
+ // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior.
+ // To still push and pop simultaneously if the buffer is full, we internally increase the
+ // buffer depth by 1.
+ localparam ACT_BUFFER_DEPTH = BUFFER_DEPTH+1;
+ localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1);
+
+ /**
+ * Internal data structures
+ */
+ // Location to which we last wrote
+ logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d, ptr_in_q;
+ // Location from which we last sent
+ logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d, ptr_out_q;
+ // Required for fall-through behavior on the first word
+ logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram;
+ // Number of elements in the buffer. Can be negative if elements that have been dropped have not
+ // yet been written.
+ logic signed [ACT_LOG_BUFFER_DEPTH:0] n_elems_d, n_elems_q;
+
+ logic [DATA_WIDTH-1:0] data_out_bram, data_out_q;
+ logic valid_out_q;
+
+ logic full;
+
+ assign almost_full = (n_elems_q == BUFFER_DEPTH-1);
+ assign full = (n_elems_q == BUFFER_DEPTH);
+
+ always_ff @(posedge clk, negedge rstn) begin
+ if (~rstn) begin
+ n_elems_q <= '0;
+ ptr_in_q <= '0;
+ ptr_out_q <= '0;
+ end else begin
+ n_elems_q <= n_elems_d;
+ ptr_in_q <= ptr_in_d;
+ ptr_out_q <= ptr_out_d;
+ end
+ end
+
+ // Update the number of elements.
+ always_comb begin
+ n_elems_d = n_elems_q;
+ if (drop_req) begin
+ n_elems_d -= (drop_len + 1);
+ end
+ if (valid_in && ready_out) begin
+ n_elems_d += 1;
+ end
+ if (valid_out && ready_in) begin
+ n_elems_d -= 1;
+ end
+ end
+
+ // Update the output pointer.
+ always_comb begin
+ ptr_out_d = ptr_out_q;
+ if (drop_req) begin
+ if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin
+ ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q);
+ end else begin
+ ptr_out_d += (drop_len + 1);
+ end
+ end
+ if (valid_out && ready_in) begin
+ if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin
+ ptr_out_d = '0;
+ end else begin
+ ptr_out_d += 1;
+ end
+ end
+ end
+
+ // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for
+ // first-word fall-through FIFO behavior.
+ //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1);
+ assign ptr_out_bram = ptr_out_d;
+
+ // Update the input pointer.
+ always_comb begin
+ ptr_in_d = ptr_in_q;
+ if (valid_in && ready_out) begin
+ if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin
+ ptr_in_d = '0;
+ end else begin
+ ptr_in_d += 1;
+ end
+ end
+ end
+
+ // Update output ports.
+ assign valid_out = (n_elems_q > $signed(0));
+ assign underfull = (n_elems_q < $signed(0));
+ assign ready_out = ~full;
+
+ ram_tp_write_first #(
+ .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ),
+ .DATA_WIDTH ( DATA_WIDTH )
+ )
+ ram_tp_write_first_0
+ (
+ .clk ( clk ),
+ .we ( valid_in & ~full ),
+ .addr0 ( ptr_in_q ),
+ .addr1 ( ptr_out_bram ),
+ .d_i ( data_in ),
+ .d0_o ( ),
+ .d1_o ( data_out_bram )
+ );
+
+ // When reading from/writing two the same address on both ports ("Write-Read Collision"),
+ // the data on the read port is invalid (during the write cycle). In this implementation,
+ // this can happen only when the buffer is empty. Thus, we forward the data from an
+ // register in this case.
+ always @(posedge clk) begin
+ if (rstn == 1'b0) begin
+ data_out_q <= 'b0;
+ end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin
+ data_out_q <= data_in;
+ end
+ end
+
+ always @(posedge clk) begin
+ if (rstn == 1'b0) begin
+ valid_out_q <= 'b0;
+ end else begin
+ valid_out_q <= valid_out;
+ end
+ end
+
+ // Drive output data
+ always_comb begin
+ if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO
+ data_out = data_out_q;
+ end else begin
+ data_out = data_out_bram;
+ end
+ end
+
+"""
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_rab_cfg(Elaboratable):
+
+ def __init__(self):
+ self.Clk_CI = Signal() # input
+ self.Rst_RBI = Signal() # input
+ self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH) # input
+ self.s_axi_awvalid = Signal() # input
+ self.s_axi_awready = Signal() # output
+ self.s_axi_wdata = Signal() # input
+ self.s_axi_wstrb = Signal(1+ERROR p_expression_25) # input
+ self.s_axi_wvalid = Signal() # input
+ self.s_axi_wready = Signal() # output
+ self.s_axi_bresp = Signal(2) # output
+ self.s_axi_bvalid = Signal() # output
+ self.s_axi_bready = Signal() # input
+ self.s_axi_araddr = Signal(AXI_ADDR_WIDTH) # input
+ self.s_axi_arvalid = Signal() # input
+ self.s_axi_arready = Signal() # output
+ self.s_axi_rdata = Signal(AXI_DATA_WIDTH) # output
+ self.s_axi_rresp = Signal(2) # output
+ self.s_axi_rvalid = Signal() # output
+ self.s_axi_rready = Signal() # input
+ self.L1Cfg_DO = Signal() # output
+ self.L1AllowMultiHit_SO = Signal() # output
+ self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT) # input
+ self.MissMeta_DI = Signal(MISS_META_WIDTH) # input
+ self.Miss_SI = Signal() # input
+ self.MhFifoFull_SO = Signal() # output
+ self.wdata_l2 = Signal() # output
+ self.waddr_l2 = Signal() # output
+ self.wren_l2 = Signal(N_PORTS) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# // --=========================================================================--
+# //
+# // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ██████╗███████╗ ██████╗
+# // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔════╝██╔════╝
+# // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ █████╗ ██║ ███╗
+# // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██╔══╝ ██║ ██║
+# // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ╚██████╗██║ ╚██████╔╝
+# // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝╚═╝ ╚═════╝
+# //
+# //
+# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch
+# //
+# // Purpose : AXI4-Lite configuration and miss handling interface for RAB
+# //
+# // --=========================================================================--
+#
+# //import CfMath::log2;
+#
+# module axi_rab_cfg
+# #(
+# parameter N_PORTS = 3,
+# parameter N_REGS = 196,
+# parameter N_L2_SETS = 32,
+# parameter N_L2_SET_ENTRIES= 32,
+# parameter ADDR_WIDTH_PHYS = 40,
+# parameter ADDR_WIDTH_VIRT = 32,
+# parameter N_FLAGS = 4,
+# parameter AXI_DATA_WIDTH = 64,
+# parameter AXI_ADDR_WIDTH = 32,
+# parameter MISS_META_WIDTH = 10, // <= FIFO_WIDTH
+# parameter MH_FIFO_DEPTH = 16
+# )
+# (
+# input logic Clk_CI,
+# input logic Rst_RBI,
+#
+# // AXI Lite interface
+# input logic [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
+# input logic s_axi_awvalid,
+# output logic s_axi_awready,
+# input logic [AXI_DATA_WIDTH/8-1:0][7:0] s_axi_wdata,
+# input logic [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
+# input logic s_axi_wvalid,
+# output logic s_axi_wready,
+# output logic [1:0] s_axi_bresp,
+# output logic s_axi_bvalid,
+# input logic s_axi_bready,
+# input logic [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
+# input logic s_axi_arvalid,
+# output logic s_axi_arready,
+# output logic [AXI_DATA_WIDTH-1:0] s_axi_rdata,
+# output logic [1:0] s_axi_rresp,
+# output logic s_axi_rvalid,
+# input logic s_axi_rready,
+#
+# // Slice configuration
+# output logic [N_REGS-1:0][63:0] L1Cfg_DO,
+# output logic L1AllowMultiHit_SO,
+#
+# // Miss handling
+# input logic [ADDR_WIDTH_VIRT-1:0] MissAddr_DI,
+# input logic [MISS_META_WIDTH-1:0] MissMeta_DI,
+# input logic Miss_SI,
+# output logic MhFifoFull_SO,
+#
+# // L2 TLB
+# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2,
+# output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2,
+# output logic [N_PORTS-1:0] wren_l2
+# );
+#
+""" #docstring_begin
+
+ localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32,
+ // because RAB slices are 64 bit wide.
+ localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1;
+
+ localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2
+
+ localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES;
+
+ localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2;
+
+ logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit]
+ genvar j;
+
+ // █████╗ ██╗ ██╗██╗██╗ ██╗ ██╗ ██╗████████╗███████╗
+ // ██╔══██╗╚██╗██╔╝██║██║ ██║ ██║ ██║╚══██╔══╝██╔════╝
+ // ███████║ ╚███╔╝ ██║███████║█████╗██║ ██║ ██║ █████╗
+ // ██╔══██║ ██╔██╗ ██║╚════██║╚════╝██║ ██║ ██║ ██╔══╝
+ // ██║ ██║██╔╝ ██╗██║ ██║ ███████╗██║ ██║ ███████╗
+ // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ ╚══════╝
+ //
+ logic [AXI_ADDR_WIDTH-1:0] awaddr_reg;
+ logic awaddr_done_rise;
+ logic awaddr_done_reg;
+ logic awaddr_done_reg_dly;
+
+ logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg;
+ logic [AXI_DATA_WIDTH/8-1:0] wstrb_reg;
+ logic wdata_done_rise;
+ logic wdata_done_reg;
+ logic wdata_done_reg_dly;
+
+ logic wresp_done_reg;
+ logic wresp_running_reg;
+
+ logic [AXI_ADDR_WIDTH-1:0] araddr_reg;
+ logic araddr_done_reg;
+
+ logic [AXI_DATA_WIDTH-1:0] rdata_reg;
+ logic rresp_done_reg;
+ logic rresp_running_reg;
+
+ logic awready;
+ logic wready;
+ logic bvalid;
+
+ logic arready;
+ logic rvalid;
+
+ logic wren;
+ logic wren_l1;
+
+ assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg );
+ assign wdata_done_rise = wdata_done_reg & ~wdata_done_reg_dly;
+ assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly;
+
+ // reg_dly
+ always @(posedge Clk_CI or negedge Rst_RBI)
+ begin
+ if (!Rst_RBI)
+ begin
+ wdata_done_reg_dly <= 1'b0;
+ awaddr_done_reg_dly <= 1'b0;
+ end
+ else
+ begin
+ wdata_done_reg_dly <= wdata_done_reg;
+ awaddr_done_reg_dly <= awaddr_done_reg;
+ end
+ end
+
+ // AW Channel
+ always @(posedge Clk_CI or negedge Rst_RBI)
+ begin
+ if (!Rst_RBI)
+ begin
+ awaddr_done_reg <= 1'b0;
+ awaddr_reg <= '0;
+ awready <= 1'b1;
+ end
+ else
+ begin
+ if (awready && s_axi_awvalid)
+ begin
+ awready <= 1'b0;
+ awaddr_done_reg <= 1'b1;
+ awaddr_reg <= s_axi_awaddr;
+ end
+ else if (awaddr_done_reg && wresp_done_reg)
+ begin
+ awready <= 1'b1;
+ awaddr_done_reg <= 1'b0;
+ end
+ end
+ end
+
+ // W Channel
+ always @(posedge Clk_CI or negedge Rst_RBI)
+ begin
+ if (!Rst_RBI)
+ begin
+ wdata_done_reg <= 1'b0;
+ wready <= 1'b1;
+ wdata_reg <= '0;
+ wstrb_reg <= '0;
+ end
+ else
+ begin
+ if (wready && s_axi_wvalid)
+ begin
+ wready <= 1'b0;
+ wdata_done_reg <= 1'b1;
+ wdata_reg <= s_axi_wdata;
+ wstrb_reg <= s_axi_wstrb;
+ end
+ else if (wdata_done_reg && wresp_done_reg)
+ begin
+ wready <= 1'b1;
+ wdata_done_reg <= 1'b0;
+ end
+ end
+ end
+
+ // B Channel
+ always @(posedge Clk_CI or negedge Rst_RBI)
+ begin
+ if (!Rst_RBI)
+ begin
+ bvalid <= 1'b0;
+ wresp_done_reg <= 1'b0;
+ wresp_running_reg <= 1'b0;
+ end
+ else
+ begin
+ if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg)
+ begin
+ if (!wresp_running_reg)
+ begin
+ bvalid <= 1'b1;
+ wresp_running_reg <= 1'b1;
+ end
+ else if (s_axi_bready)
+ begin
+ bvalid <= 1'b0;
+ wresp_done_reg <= 1'b1;
+ wresp_running_reg <= 1'b0;
+ end
+ end
+ else
+ begin
+ bvalid <= 1'b0;
+ wresp_done_reg <= 1'b0;
+ wresp_running_reg <= 1'b0;
+ end
+ end
+ end
+
+ // AR Channel
+ always @(posedge Clk_CI or negedge Rst_RBI)
+ begin
+ if (!Rst_RBI)
+ begin
+ araddr_done_reg <= 1'b0;
+ arready <= 1'b1;
+ araddr_reg <= '0;
+ end
+ else
+ begin
+ if (arready && s_axi_arvalid)
+ begin
+ arready <= 1'b0;
+ araddr_done_reg <= 1'b1;
+ araddr_reg <= s_axi_araddr;
+ end
+ else if (araddr_done_reg && rresp_done_reg)
+ begin
+ arready <= 1'b1;
+ araddr_done_reg <= 1'b0;
+ end
+ end
+ end
+
+ // R Channel
+ always @(posedge Clk_CI or negedge Rst_RBI)
+ begin
+ if (!Rst_RBI)
+ begin
+ rresp_done_reg <= 1'b0;
+ rvalid <= 1'b0;
+ rresp_running_reg <= 1'b0;
+ end
+ else
+ begin
+ if (araddr_done_reg && !rresp_done_reg)
+ begin
+ if (!rresp_running_reg)
+ begin
+ rvalid <= 1'b1;
+ rresp_running_reg <= 1'b1;
+ end
+ else if (s_axi_rready)
+ begin
+ rvalid <= 1'b0;
+ rresp_done_reg <= 1'b1;
+ rresp_running_reg <= 1'b0;
+ end
+ end
+ else
+ begin
+ rvalid <= 1'b0;
+ rresp_done_reg <= 1'b0;
+ rresp_running_reg <= 1'b0;
+ end
+ end
+ end
+
+ // ██╗ ██╗ ██████╗███████╗ ██████╗ ██████╗ ███████╗ ██████╗
+ // ██║ ███║ ██╔════╝██╔════╝██╔════╝ ██╔══██╗██╔════╝██╔════╝
+ // ██║ ╚██║ ██║ █████╗ ██║ ███╗ ██████╔╝█████╗ ██║ ███╗
+ // ██║ ██║ ██║ ██╔══╝ ██║ ██║ ██╔══██╗██╔══╝ ██║ ██║
+ // ███████╗██║ ╚██████╗██║ ╚██████╔╝ ██║ ██║███████╗╚██████╔╝
+ // ╚══════╝╚═╝ ╚═════╝╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ ╚═════╝
+ //
+ assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE);
+
+ always @( posedge Clk_CI or negedge Rst_RBI )
+ begin
+ var integer idx_reg, idx_byte;
+ if ( Rst_RBI == 1'b0 )
+ begin
+ for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ )
+ L1Cfg_DP[idx_reg] <= '0;
+ end
+ else if ( wren_l1 )
+ begin
+ if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin // VIRT_ADDR
+ for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+ if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin
+ if ( wstrb_reg[idx_byte] ) begin
+ L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
+ end
+ end
+ else begin // Let synthesizer optimize away unused registers.
+ L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+ end
+ end
+ end
+ else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin // PHYS_ADDR
+ for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+ if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin
+ if ( wstrb_reg[idx_byte] ) begin
+ L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
+ end
+ end
+ else begin // Let synthesizer optimize away unused registers.
+ L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+ end
+ end
+ end
+ else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 ) // FLAGS
+ for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+ if ( (idx_byte < 1) ) begin
+ if ( wstrb_reg[idx_byte] ) begin
+ L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} };
+ end
+ end
+ else begin // Let synthesizer optimize away unused registers.
+ L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+ end
+ end
+ end
+ end
+ end // always @ ( posedge Clk_CI or negedge Rst_RBI )
+
+ generate
+ // Mask unused bits -> Synthesizer should optimize away unused registers
+ for( j=0; j<N_REGS; j++ ) begin
+ if ( j[1] == 1'b0 ) // VIRT_ADDR
+ assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_VIRT}{1'b0}},{ADDR_WIDTH_VIRT{1'b1}} } & L1Cfg_DP[j];
+ else if ( j[1:0] == 2'b10 ) // PHYS_ADDR
+ assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_PHYS}{1'b0}},{ADDR_WIDTH_PHYS{1'b1}} } & L1Cfg_DP[j];
+ else // if ( j[1:0] == 2'b11 ) // FLAGS
+ assign L1Cfg_DO[j] = { {{64-N_FLAGS}{1'b0}},{N_FLAGS{1'b1}} } & L1Cfg_DP[j];
+ end
+ endgenerate
+
+ always_comb
+ begin
+ if ( araddr_reg[ADDR_LSB-1] == 1'b1 ) // read upper 32 bit, for debugging over 32-bit interface
+ rdata_reg = { {32'h00000000},{L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]][63:32]} };
+ else
+ rdata_reg = L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]];
+ end
+
+ assign s_axi_awready = awready;
+ assign s_axi_wready = wready;
+
+ assign s_axi_bresp = 2'b00;
+ assign s_axi_bvalid = bvalid;
+
+ assign s_axi_arready = arready;
+ assign s_axi_rresp = 2'b00;
+ assign s_axi_rvalid = rvalid;
+
+ // ██╗ ██████╗ ██████╗███████╗ ██████╗
+ // ██║ ╚════██╗ ██╔════╝██╔════╝██╔════╝
+ // ██║ █████╔╝ ██║ █████╗ ██║ ███╗
+ // ██║ ██╔═══╝ ██║ ██╔══╝ ██║ ██║
+ // ███████╗███████╗ ╚██████╗██║ ╚██████╔╝
+ // ╚══════╝╚══════╝ ╚═════╝╚═╝ ╚═════╝
+ //
+ logic [N_PORTS-1:0] l2_addr_is_in_va_rams;
+ logic [N_PORTS-1:0] upper_word_is_written;
+ logic [N_PORTS-1:0] lower_word_is_written;
+ generate
+ for( j=0; j< N_PORTS; j++)
+ begin
+ if (AXI_DATA_WIDTH == 64) begin
+ assign l2_addr_is_in_va_rams[j] = (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR);
+ assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000);
+ assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000);
+ end else begin
+ assign l2_addr_is_in_va_rams[j] = 1'b0;
+ assign upper_word_is_written[j] = 1'b0;
+ assign lower_word_is_written[j] = 1'b0;
+ end
+
+ always @( posedge Clk_CI or negedge Rst_RBI ) begin
+ var integer idx_byte, off_byte;
+ if ( Rst_RBI == 1'b0 )
+ begin
+ wren_l2[j] <= 1'b0;
+ wdata_l2[j] <= '0;
+ end
+ else if (wren)
+ begin
+ if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) )
+ wren_l2[j] <= 1'b1;
+ if (AXI_DATA_WIDTH == 32) begin
+ for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ )
+ wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}};
+ end
+ else if (AXI_DATA_WIDTH == 64) begin
+ if (lower_word_is_written[j] == 1'b1)
+ off_byte = 0;
+ else
+ off_byte = 4;
+ // always put the payload in the lower word and set upper word to 0
+ for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ )
+ wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}};
+ wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0;
+ end
+ // pragma translate_off
+ else
+ $fatal(1, "Unsupported AXI_DATA_WIDTH!");
+ // pragma translate_on
+ end
+ else
+ wren_l2[j] <= '0;
+ end // always @ ( posedge Clk_CI or negedge Rst_RBI )
+
+ // Properly align the 32-bit word address when writing from 64-bit interface:
+ // Depending on the system, the incoming address is (non-)aligned to the 64-bit
+ // word when writing the upper 32-bit word.
+ always_comb begin
+ waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4;
+ if (wren_l2[j]) begin
+ if (AXI_DATA_WIDTH == 64) begin
+ if (upper_word_is_written[j] == 1'b1) begin
+ // address must be non-aligned
+ waddr_l2[j][0] = 1'b1;
+ end
+ end
+ // pragma translate_off
+ else if (AXI_DATA_WIDTH != 32) begin
+ $fatal(1, "Unsupported AXI_DATA_WIDTH!");
+ end
+ // pragma translate_on
+ end
+ end
+
+ // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data
+ // systems.
+ // pragma translate_off
+ always_ff @ (posedge Clk_CI) begin
+ if (AXI_DATA_WIDTH == 64) begin
+ if (l2_addr_is_in_va_rams[j]) begin
+ if (upper_word_is_written[j]) begin
+ assert (!lower_word_is_written[j])
+ else $error("Unsupported write across two 32-bit words to VA RAMs!");
+ end
+ else if (lower_word_is_written[j]) begin
+ assert (!upper_word_is_written[j])
+ else $error("Unsupported write across two 32-bit words to VA RAMs!");
+ end
+ end
+ end
+ end
+ // pragma translate_on
+
+ end // for (j=0; j< N_PORTS; j++)
+ endgenerate
+
+ // ███╗ ███╗██╗ ██╗ ███████╗██╗███████╗ ██████╗ ███████╗
+ // ████╗ ████║██║ ██║ ██╔════╝██║██╔════╝██╔═══██╗██╔════╝
+ // ██╔████╔██║███████║ █████╗ ██║█████╗ ██║ ██║███████╗
+ // ██║╚██╔╝██║██╔══██║ ██╔══╝ ██║██╔══╝ ██║ ██║╚════██║
+ // ██║ ╚═╝ ██║██║ ██║ ██║ ██║██║ ╚██████╔╝███████║
+ // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝╚═╝ ╚═════╝ ╚══════╝
+ //
+ logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D;
+ logic AddrFifoWen_S;
+ logic AddrFifoRen_S;
+ logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D;
+ logic AddrFifoFull_S;
+ logic AddrFifoEmpty_S;
+ logic AddrFifoEmpty_SB;
+ logic AddrFifoFull_SB;
+
+ logic [MISS_META_WIDTH-1:0] MetaFifoDin_D;
+ logic MetaFifoWen_S;
+ logic MetaFifoRen_S;
+ logic [MISS_META_WIDTH-1:0] MetaFifoDout_D;
+ logic MetaFifoFull_S;
+ logic MetaFifoEmpty_S;
+ logic MetaFifoEmpty_SB;
+ logic MetaFifoFull_SB;
+
+ logic FifosDisabled_S;
+ logic ConfRegWen_S;
+ logic [1:0] ConfReg_DN;
+ logic [1:0] ConfReg_DP;
+
+ logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec;
+
+ assign FifosDisabled_S = ConfReg_DP[0];
+ assign L1AllowMultiHit_SO = ConfReg_DP[1];
+
+ assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB;
+ assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB;
+
+ assign AddrFifoFull_S = ~AddrFifoFull_SB;
+ assign MetaFifoFull_S = ~MetaFifoFull_SB;
+
+ assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S);
+
+ generate
+ for ( j=0; j<AXI_DATA_WIDTH/8; j++ )
+ assign wdata_reg_vec[(j+1)*8-1:j*8] = wdata_reg[j];
+ endgenerate
+
+ // write address FIFO
+ always_comb
+ begin
+ AddrFifoWen_S = 1'b0;
+ AddrFifoDin_D = 'b0;
+ if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
+ begin
+ AddrFifoWen_S = 1'b1;
+ AddrFifoDin_D = MissAddr_DI;
+ end
+ else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 'b0) && (FifosDisabled_S == 1'b0)) // write request from AXI interface
+ begin
+ AddrFifoWen_S = 1'b1;
+ AddrFifoDin_D = wdata_reg_vec[ADDR_WIDTH_VIRT-1:0];
+ end
+ end
+
+ // write meta FIFO
+ always_comb
+ begin
+ MetaFifoWen_S = 1'b0;
+ MetaFifoDin_D = 'b0;
+ if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
+ begin
+ MetaFifoWen_S = 1'b1;
+ MetaFifoDin_D[MISS_META_WIDTH-1:0] = MissMeta_DI;
+ end
+ else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 4'h8) && (FifosDisabled_S == 1'b0) ) // write request from AXI interface
+ begin
+ MetaFifoWen_S = 1'b1;
+ MetaFifoDin_D = wdata_reg_vec[MISS_META_WIDTH-1:0];
+ end
+ end
+
+ // write configuration register
+ always_comb
+ begin
+ ConfRegWen_S = 1'b0;
+ ConfReg_DN = 1'b0;
+ if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 8'h10) ) // write request from AXI interface
+ begin
+ ConfRegWen_S = 1'b1;
+ ConfReg_DN = wdata_reg_vec[$high(ConfReg_DN):0];
+ end
+ end
+
+ // AXI read data
+ always_comb
+ begin
+ s_axi_rdata = rdata_reg; // read L1 config
+ AddrFifoRen_S = 1'b0;
+ MetaFifoRen_S = 1'b0;
+ if ( rvalid == 1'b1 )
+ begin
+ // read address FIFO
+ if ( araddr_reg[ADDR_MSB:0] == 'b0 )
+ begin
+ s_axi_rdata = {AXI_DATA_WIDTH{1'b0}};
+ s_axi_rdata[ADDR_WIDTH_VIRT-1:0] = AddrFifoDout_D;
+ if ( AddrFifoEmpty_S == 1'b0 )
+ AddrFifoRen_S = 1'b1;
+ end
+ // read meta FIFO
+ else if ( araddr_reg[ADDR_MSB:0] == 4'h8 )
+ begin
+ s_axi_rdata = {AXI_DATA_WIDTH{1'b0}};
+ s_axi_rdata[31] = MetaFifoEmpty_S;
+ s_axi_rdata[MISS_META_WIDTH-1:0] = MetaFifoDout_D;
+ if ( MetaFifoEmpty_S == 1'b0 )
+ MetaFifoRen_S = 1'b1;
+ end
+ // read configuration register
+ else if ( araddr_reg[ADDR_MSB:0] == 8'h10 )
+ begin
+ s_axi_rdata = {AXI_DATA_WIDTH{1'b0}};
+ s_axi_rdata[$high(ConfReg_DP):0] = ConfReg_DP;
+ end
+ end // if ( rvalid == 1'b1 )
+ end // always_comb begin
+
+ // configuration register
+ always_ff @(posedge Clk_CI or negedge Rst_RBI) begin
+ if (Rst_RBI == 1'b0)
+ begin
+ ConfReg_DP <= 'b0;
+ end
+ else if (ConfRegWen_S == 1'b1)
+ begin
+ ConfReg_DP <= ConfReg_DN;
+ end
+ end
+
+ generic_fifo
+ #(
+ .DATA_WIDTH ( ADDR_WIDTH_VIRT ),
+ .DATA_DEPTH ( MH_FIFO_DEPTH )
+ )
+ fifo_addr_i
+ (
+ .clk ( Clk_CI ),
+ .rst_n ( Rst_RBI ),
+ .data_i ( AddrFifoDin_D ),
+ .valid_i ( AddrFifoWen_S & AddrFifoFull_SB ),
+ .grant_o ( AddrFifoFull_SB ),
+ .data_o ( AddrFifoDout_D ),
+ .valid_o ( AddrFifoEmpty_SB ),
+ .grant_i ( AddrFifoRen_S ),
+ .test_mode_i ( 1'b0 )
+ );
+
+ generic_fifo
+ #(
+ .DATA_WIDTH ( MISS_META_WIDTH ),
+ .DATA_DEPTH ( MH_FIFO_DEPTH )
+ )
+ fifo_meta_i
+ (
+ .clk ( Clk_CI ),
+ .rst_n ( Rst_RBI ),
+ .data_i ( MetaFifoDin_D ),
+ .valid_i ( MetaFifoWen_S & MetaFifoFull_SB ),
+ .grant_o ( MetaFifoFull_SB ),
+ .data_o ( MetaFifoDout_D ),
+ .valid_o ( MetaFifoEmpty_SB ),
+ .grant_i ( MetaFifoRen_S ),
+ .test_mode_i ( 1'b0 )
+ );
+"""
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_rab_top(Elaboratable):
+
+ def __init__(self):
+ self.Clk_CI = Signal() # input
+ self.NonGatedClk_CI = Signal() # input
+ self.Rst_RBI = Signal() # input
+ self.s_axi4_awid = Signal() # input
+ self.s_axi4_awaddr = Signal() # input
+ self.s_axi4_awvalid = Signal(N_PORTS) # input
+ self.s_axi4_awready = Signal(N_PORTS) # output
+ self.s_axi4_awlen = Signal() # input
+ self.s_axi4_awsize = Signal() # input
+ self.s_axi4_awburst = Signal() # input
+ self.s_axi4_awlock = Signal(N_PORTS) # input
+ self.s_axi4_awprot = Signal() # input
+ self.s_axi4_awcache = Signal() # input
+ self.s_axi4_awregion = Signal() # input
+ self.s_axi4_awqos = Signal() # input
+ self.s_axi4_awuser = Signal() # input
+ self.s_axi4_wdata = Signal() # input
+ self.s_axi4_wvalid = Signal(N_PORTS) # input
+ self.s_axi4_wready = Signal(N_PORTS) # output
+ self.s_axi4_wstrb = Signal() # input
+ self.s_axi4_wlast = Signal(N_PORTS) # input
+ self.s_axi4_wuser = Signal() # input
+ self.s_axi4_bid = Signal() # output
+ self.s_axi4_bresp = Signal() # output
+ self.s_axi4_bvalid = Signal(N_PORTS) # output
+ self.s_axi4_buser = Signal() # output
+ self.s_axi4_bready = Signal(N_PORTS) # input
+ self.s_axi4_arid = Signal() # input
+ self.s_axi4_araddr = Signal() # input
+ self.s_axi4_arvalid = Signal(N_PORTS) # input
+ self.s_axi4_arready = Signal(N_PORTS) # output
+ self.s_axi4_arlen = Signal() # input
+ self.s_axi4_arsize = Signal() # input
+ self.s_axi4_arburst = Signal() # input
+ self.s_axi4_arlock = Signal(N_PORTS) # input
+ self.s_axi4_arprot = Signal() # input
+ self.s_axi4_arcache = Signal() # input
+ self.s_axi4_aruser = Signal() # input
+ self.s_axi4_rid = Signal() # output
+ self.s_axi4_rdata = Signal() # output
+ self.s_axi4_rresp = Signal() # output
+ self.s_axi4_rvalid = Signal(N_PORTS) # output
+ self.s_axi4_rready = Signal(N_PORTS) # input
+ self.s_axi4_rlast = Signal(N_PORTS) # output
+ self.s_axi4_ruser = Signal() # output
+ self.m0_axi4_awid = Signal() # output
+ self.m0_axi4_awaddr = Signal() # output
+ self.m0_axi4_awvalid = Signal(N_PORTS) # output
+ self.m0_axi4_awready = Signal(N_PORTS) # input
+ self.m0_axi4_awlen = Signal() # output
+ self.m0_axi4_awsize = Signal() # output
+ self.m0_axi4_awburst = Signal() # output
+ self.m0_axi4_awlock = Signal(N_PORTS) # output
+ self.m0_axi4_awprot = Signal() # output
+ self.m0_axi4_awcache = Signal() # output
+ self.m0_axi4_awregion = Signal() # output
+ self.m0_axi4_awqos = Signal() # output
+ self.m0_axi4_awuser = Signal() # output
+ self.m0_axi4_wdata = Signal() # output
+ self.m0_axi4_wvalid = Signal(N_PORTS) # output
+ self.m0_axi4_wready = Signal(N_PORTS) # input
+ self.m0_axi4_wstrb = Signal() # output
+ self.m0_axi4_wlast = Signal(N_PORTS) # output
+ self.m0_axi4_wuser = Signal() # output
+ self.m0_axi4_bid = Signal() # input
+ self.m0_axi4_bresp = Signal() # input
+ self.m0_axi4_bvalid = Signal(N_PORTS) # input
+ self.m0_axi4_buser = Signal() # input
+ self.m0_axi4_bready = Signal(N_PORTS) # output
+ self.m0_axi4_arid = Signal() # output
+ self.m0_axi4_araddr = Signal() # output
+ self.m0_axi4_arvalid = Signal(N_PORTS) # output
+ self.m0_axi4_arready = Signal(N_PORTS) # input
+ self.m0_axi4_arlen = Signal() # output
+ self.m0_axi4_arsize = Signal() # output
+ self.m0_axi4_arburst = Signal() # output
+ self.m0_axi4_arlock = Signal(N_PORTS) # output
+ self.m0_axi4_arprot = Signal() # output
+ self.m0_axi4_arcache = Signal() # output
+ self.m0_axi4_aruser = Signal() # output
+ self.m0_axi4_rid = Signal() # input
+ self.m0_axi4_rdata = Signal() # input
+ self.m0_axi4_rresp = Signal() # input
+ self.m0_axi4_rvalid = Signal(N_PORTS) # input
+ self.m0_axi4_rready = Signal(N_PORTS) # output
+ self.m0_axi4_rlast = Signal(N_PORTS) # input
+ self.m0_axi4_ruser = Signal() # input
+ self.m1_axi4_awid = Signal() # output
+ self.m1_axi4_awaddr = Signal() # output
+ self.m1_axi4_awvalid = Signal(N_PORTS) # output
+ self.m1_axi4_awready = Signal(N_PORTS) # input
+ self.m1_axi4_awlen = Signal() # output
+ self.m1_axi4_awsize = Signal() # output
+ self.m1_axi4_awburst = Signal() # output
+ self.m1_axi4_awlock = Signal(N_PORTS) # output
+ self.m1_axi4_awprot = Signal() # output
+ self.m1_axi4_awcache = Signal() # output
+ self.m1_axi4_awregion = Signal() # output
+ self.m1_axi4_awqos = Signal() # output
+ self.m1_axi4_awuser = Signal() # output
+ self.m1_axi4_wdata = Signal() # output
+ self.m1_axi4_wvalid = Signal(N_PORTS) # output
+ self.m1_axi4_wready = Signal(N_PORTS) # input
+ self.m1_axi4_wstrb = Signal() # output
+ self.m1_axi4_wlast = Signal(N_PORTS) # output
+ self.m1_axi4_wuser = Signal() # output
+ self.m1_axi4_bid = Signal() # input
+ self.m1_axi4_bresp = Signal() # input
+ self.m1_axi4_bvalid = Signal(N_PORTS) # input
+ self.m1_axi4_buser = Signal() # input
+ self.m1_axi4_bready = Signal(N_PORTS) # output
+ self.m1_axi4_arid = Signal() # output
+ self.m1_axi4_araddr = Signal() # output
+ self.m1_axi4_arvalid = Signal(N_PORTS) # output
+ self.m1_axi4_arready = Signal(N_PORTS) # input
+ self.m1_axi4_arlen = Signal() # output
+ self.m1_axi4_arsize = Signal() # output
+ self.m1_axi4_arburst = Signal() # output
+ self.m1_axi4_arlock = Signal(N_PORTS) # output
+ self.m1_axi4_arprot = Signal() # output
+ self.m1_axi4_arcache = Signal() # output
+ self.m1_axi4_aruser = Signal() # output
+ self.m1_axi4_rid = Signal() # input
+ self.m1_axi4_rdata = Signal() # input
+ self.m1_axi4_rresp = Signal() # input
+ self.m1_axi4_rvalid = Signal(N_PORTS) # input
+ self.m1_axi4_rready = Signal(N_PORTS) # output
+ self.m1_axi4_rlast = Signal(N_PORTS) # input
+ self.m1_axi4_ruser = Signal() # input
+ self.s_axi4lite_awaddr = Signal(AXI_LITE_ADDR_WIDTH) # input
+ self.s_axi4lite_awvalid = Signal() # input
+ self.s_axi4lite_awready = Signal() # output
+ self.s_axi4lite_wdata = Signal(AXI_LITE_DATA_WIDTH) # input
+ self.s_axi4lite_wvalid = Signal() # input
+ self.s_axi4lite_wready = Signal() # output
+ self.s_axi4lite_wstrb = Signal(1+ERROR p_expression_25) # input
+ self.s_axi4lite_bresp = Signal(2) # output
+ self.s_axi4lite_bvalid = Signal() # output
+ self.s_axi4lite_bready = Signal() # input
+ self.s_axi4lite_araddr = Signal(AXI_LITE_ADDR_WIDTH) # input
+ self.s_axi4lite_arvalid = Signal() # input
+ self.s_axi4lite_arready = Signal() # output
+ self.s_axi4lite_rdata = Signal(AXI_LITE_DATA_WIDTH) # output
+ self.s_axi4lite_rresp = Signal(2) # output
+ self.s_axi4lite_rvalid = Signal() # output
+ self.s_axi4lite_rready = Signal() # input
+ self.int_miss = Signal(N_PORTS) # output
+ self.int_multi = Signal(N_PORTS) # output
+ self.int_prot = Signal(N_PORTS) # output
+ self.int_mhf_full = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# // --=========================================================================--
+# //
+# // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ████████╗ ██████╗ ██████╗
+# // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ╚══██╔══╝██╔═══██╗██╔══██╗
+# // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ ██║ ██║██████╔╝
+# // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██║ ██║██╔═══╝
+# // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ██║ ╚██████╔╝██║
+# // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═════╝ ╚═╝
+# //
+# // --=========================================================================--
+# /*
+# * axi_rab_top
+# *
+# * The remapping address block (RAB) performs address translation for AXI
+# * transactions arriving at the input port and forwards them to different
+# * downstream AXI ports.
+# *
+# * The five axi channels are each buffered on the input side using a FIFO,
+# * described in axi4_XX_buffer. The RAB lookup result is merged into the
+# * AXI transaction via the axi4_XX_sender instances, which manages upstream
+# * error signaling for failed lookups.
+# *
+# * Address translation is performed based on data stored in up to two
+# * translation lookaside buffers (TLBs), which are private per RAB port (each
+# * of which having two AXI master ports and one AXI slave port). These TLBs
+# * are managed in software through the AXI-Lite interface.
+# *
+# * If ACP is enabled, the `cache_coherent` flag in the TLBs is used to
+# * multiplex between the two ports. If ACP is disabled, only the first master
+# * port is used. In this case, the `cache_coherent` flag is used to set the
+# * AxCACHE signals of the AXI bus accordingly.
+# *
+# * Authors:
+# * Antonio Pullini <pullinia@iis.ee.ethz.ch>
+# * Conrad Burchert <bconrad@ethz.ch>
+# * Maheshwara Sharma <msharma@student.ethz.ch>
+# * Andreas Kurth <akurth@iis.ee.ethz.ch>
+# * Johannes Weinbuch <jweinbuch@student.ethz.ch>
+# * Pirmin Vogel <vogelpi@iis.ee.ethz.ch>
+# */
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# module axi_rab_top
+#
+# // Parameters {{{
+# #(
+# parameter N_PORTS = 2,
+# parameter N_L2_SETS = 32,
+# parameter N_L2_SET_ENTRIES = 32,
+# parameter AXI_DATA_WIDTH = 64,
+# parameter AXI_S_ADDR_WIDTH = 32,
+# parameter AXI_M_ADDR_WIDTH = 40,
+# parameter AXI_LITE_DATA_WIDTH = 64,
+# parameter AXI_LITE_ADDR_WIDTH = 32,
+# parameter AXI_ID_WIDTH = 10,
+# parameter AXI_USER_WIDTH = 6,
+# parameter MH_FIFO_DEPTH = 16
+# )
+# // }}}
+#
+# // Ports {{{
+# (
+#
+# input logic Clk_CI, // This clock may be gated.
+# input logic NonGatedClk_CI,
+# input logic Rst_RBI,
+#
+# // For every slave port there are two master ports. The master
+# // port to use can be set using the master_select flag of the protection
+# // bits of a slice
+#
+# // AXI4 Slave {{{
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_awid,
+# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr,
+# input logic [N_PORTS-1:0] s_axi4_awvalid,
+# output logic [N_PORTS-1:0] s_axi4_awready,
+# input logic [N_PORTS-1:0] [7:0] s_axi4_awlen,
+# input logic [N_PORTS-1:0] [2:0] s_axi4_awsize,
+# input logic [N_PORTS-1:0] [1:0] s_axi4_awburst,
+# input logic [N_PORTS-1:0] s_axi4_awlock,
+# input logic [N_PORTS-1:0] [2:0] s_axi4_awprot,
+# input logic [N_PORTS-1:0] [3:0] s_axi4_awcache,
+# input logic [N_PORTS-1:0] [3:0] s_axi4_awregion,
+# input logic [N_PORTS-1:0] [3:0] s_axi4_awqos,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+# input logic [N_PORTS-1:0] s_axi4_wvalid,
+# output logic [N_PORTS-1:0] s_axi4_wready,
+# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+# input logic [N_PORTS-1:0] s_axi4_wlast,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_bid,
+# output logic [N_PORTS-1:0] [1:0] s_axi4_bresp,
+# output logic [N_PORTS-1:0] s_axi4_bvalid,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_buser,
+# input logic [N_PORTS-1:0] s_axi4_bready,
+#
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_arid,
+# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr,
+# input logic [N_PORTS-1:0] s_axi4_arvalid,
+# output logic [N_PORTS-1:0] s_axi4_arready,
+# input logic [N_PORTS-1:0] [7:0] s_axi4_arlen,
+# input logic [N_PORTS-1:0] [2:0] s_axi4_arsize,
+# input logic [N_PORTS-1:0] [1:0] s_axi4_arburst,
+# input logic [N_PORTS-1:0] s_axi4_arlock,
+# input logic [N_PORTS-1:0] [2:0] s_axi4_arprot,
+# input logic [N_PORTS-1:0] [3:0] s_axi4_arcache,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_rid,
+# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+# output logic [N_PORTS-1:0] [1:0] s_axi4_rresp,
+# output logic [N_PORTS-1:0] s_axi4_rvalid,
+# input logic [N_PORTS-1:0] s_axi4_rready,
+# output logic [N_PORTS-1:0] s_axi4_rlast,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+# // }}}
+#
+# // AXI4 Master 0 {{{
+# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_awid,
+# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr,
+# output logic [N_PORTS-1:0] m0_axi4_awvalid,
+# input logic [N_PORTS-1:0] m0_axi4_awready,
+# output logic [N_PORTS-1:0] [7:0] m0_axi4_awlen,
+# output logic [N_PORTS-1:0] [2:0] m0_axi4_awsize,
+# output logic [N_PORTS-1:0] [1:0] m0_axi4_awburst,
+# output logic [N_PORTS-1:0] m0_axi4_awlock,
+# output logic [N_PORTS-1:0] [2:0] m0_axi4_awprot,
+# output logic [N_PORTS-1:0] [3:0] m0_axi4_awcache,
+# output logic [N_PORTS-1:0] [3:0] m0_axi4_awregion,
+# output logic [N_PORTS-1:0] [3:0] m0_axi4_awqos,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_awuser,
+#
+# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_wdata,
+# output logic [N_PORTS-1:0] m0_axi4_wvalid,
+# input logic [N_PORTS-1:0] m0_axi4_wready,
+# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb,
+# output logic [N_PORTS-1:0] m0_axi4_wlast,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_wuser,
+#
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_bid,
+# input logic [N_PORTS-1:0] [1:0] m0_axi4_bresp,
+# input logic [N_PORTS-1:0] m0_axi4_bvalid,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_buser,
+# output logic [N_PORTS-1:0] m0_axi4_bready,
+#
+# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_arid,
+# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr,
+# output logic [N_PORTS-1:0] m0_axi4_arvalid,
+# input logic [N_PORTS-1:0] m0_axi4_arready,
+# output logic [N_PORTS-1:0] [7:0] m0_axi4_arlen,
+# output logic [N_PORTS-1:0] [2:0] m0_axi4_arsize,
+# output logic [N_PORTS-1:0] [1:0] m0_axi4_arburst,
+# output logic [N_PORTS-1:0] m0_axi4_arlock,
+# output logic [N_PORTS-1:0] [2:0] m0_axi4_arprot,
+# output logic [N_PORTS-1:0] [3:0] m0_axi4_arcache,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_aruser,
+#
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_rid,
+# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_rdata,
+# input logic [N_PORTS-1:0] [1:0] m0_axi4_rresp,
+# input logic [N_PORTS-1:0] m0_axi4_rvalid,
+# output logic [N_PORTS-1:0] m0_axi4_rready,
+# input logic [N_PORTS-1:0] m0_axi4_rlast,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_ruser,
+# // }}}
+#
+# // AXI4 Master 1 {{{
+# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_awid,
+# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr,
+# output logic [N_PORTS-1:0] m1_axi4_awvalid,
+# input logic [N_PORTS-1:0] m1_axi4_awready,
+# output logic [N_PORTS-1:0] [7:0] m1_axi4_awlen,
+# output logic [N_PORTS-1:0] [2:0] m1_axi4_awsize,
+# output logic [N_PORTS-1:0] [1:0] m1_axi4_awburst,
+# output logic [N_PORTS-1:0] m1_axi4_awlock,
+# output logic [N_PORTS-1:0] [2:0] m1_axi4_awprot,
+# output logic [N_PORTS-1:0] [3:0] m1_axi4_awcache,
+# output logic [N_PORTS-1:0] [3:0] m1_axi4_awregion,
+# output logic [N_PORTS-1:0] [3:0] m1_axi4_awqos,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_awuser,
+#
+# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_wdata,
+# output logic [N_PORTS-1:0] m1_axi4_wvalid,
+# input logic [N_PORTS-1:0] m1_axi4_wready,
+# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb,
+# output logic [N_PORTS-1:0] m1_axi4_wlast,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_wuser,
+#
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_bid,
+# input logic [N_PORTS-1:0] [1:0] m1_axi4_bresp,
+# input logic [N_PORTS-1:0] m1_axi4_bvalid,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_buser,
+# output logic [N_PORTS-1:0] m1_axi4_bready,
+#
+# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_arid,
+# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr,
+# output logic [N_PORTS-1:0] m1_axi4_arvalid,
+# input logic [N_PORTS-1:0] m1_axi4_arready,
+# output logic [N_PORTS-1:0] [7:0] m1_axi4_arlen,
+# output logic [N_PORTS-1:0] [2:0] m1_axi4_arsize,
+# output logic [N_PORTS-1:0] [1:0] m1_axi4_arburst,
+# output logic [N_PORTS-1:0] m1_axi4_arlock,
+# output logic [N_PORTS-1:0] [2:0] m1_axi4_arprot,
+# output logic [N_PORTS-1:0] [3:0] m1_axi4_arcache,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_aruser,
+#
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_rid,
+# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_rdata,
+# input logic [N_PORTS-1:0] [1:0] m1_axi4_rresp,
+# input logic [N_PORTS-1:0] m1_axi4_rvalid,
+# output logic [N_PORTS-1:0] m1_axi4_rready,
+# input logic [N_PORTS-1:0] m1_axi4_rlast,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_ruser,
+# // }}}
+#
+# // AXI 4 Lite Slave (Configuration Interface) {{{
+# // AXI4-Lite port to setup the rab slices
+# // use this to program the configuration registers
+# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr,
+# input logic s_axi4lite_awvalid,
+# output logic s_axi4lite_awready,
+#
+# input logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata,
+# input logic s_axi4lite_wvalid,
+# output logic s_axi4lite_wready,
+# input logic [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb,
+#
+# output logic [1:0] s_axi4lite_bresp,
+# output logic s_axi4lite_bvalid,
+# input logic s_axi4lite_bready,
+#
+# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr,
+# input logic s_axi4lite_arvalid,
+# output logic s_axi4lite_arready,
+#
+# output logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata,
+# output logic [1:0] s_axi4lite_rresp,
+# output logic s_axi4lite_rvalid,
+# input logic s_axi4lite_rready,
+# // }}}
+#
+# // BRAMs {{{
+# //`ifdef RAB_AX_LOG_EN
+# // BramPort.Slave ArBram_PS,
+# // BramPort.Slave AwBram_PS,
+# //`endif
+# // }}}
+#
+# // Logger Control {{{
+# //`ifdef RAB_AX_LOG_EN
+# // input logic LogEn_SI,
+# // input logic ArLogClr_SI,
+# // input logic AwLogClr_SI,
+# // output logic ArLogRdy_SO,
+# // output logic AwLogRdy_SO,
+# //`endif
+# // }}}
+#
+# // Interrupt Outputs {{{
+# // Interrupt lines to handle misses, collisions of slices/multiple hits,
+# // protection faults and overflow of the miss handling fifo
+# //`ifdef RAB_AX_LOG_EN
+# // output logic int_ar_log_full,
+# // output logic int_aw_log_full,
+# //`endif
+# output logic [N_PORTS-1:0] int_miss,
+# output logic [N_PORTS-1:0] int_multi,
+# output logic [N_PORTS-1:0] int_prot,
+# output logic int_mhf_full
+# // }}}
+#
+# );
+#
+"""#docstring_begin
+
+ // }}}
+
+ // Signals {{{
+ // ███████╗██╗ ██████╗ ███╗ ██╗ █████╗ ██╗ ███████╗
+ // ██╔════╝██║██╔════╝ ████╗ ██║██╔══██╗██║ ██╔════╝
+ // ███████╗██║██║ ███╗██╔██╗ ██║███████║██║ ███████╗
+ // ╚════██║██║██║ ██║██║╚██╗██║██╔══██║██║ ╚════██║
+ // ███████║██║╚██████╔╝██║ ╚████║██║ ██║███████╗███████║
+ // ╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚══════╝
+ //
+
+ // Internal AXI4 lines, these connect buffers on the slave side to the rab core and
+ // multiplexers which switch between the two master outputs
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_awid;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_awaddr;
+ logic [N_PORTS-1:0] int_awvalid;
+ logic [N_PORTS-1:0] int_awready;
+ logic [N_PORTS-1:0] [7:0] int_awlen;
+ logic [N_PORTS-1:0] [2:0] int_awsize;
+ logic [N_PORTS-1:0] [1:0] int_awburst;
+ logic [N_PORTS-1:0] int_awlock;
+ logic [N_PORTS-1:0] [2:0] int_awprot;
+ logic [N_PORTS-1:0] [3:0] int_awcache;
+ logic [N_PORTS-1:0] [3:0] int_awregion;
+ logic [N_PORTS-1:0] [3:0] int_awqos;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_awuser;
+
+ logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_wdata;
+ logic [N_PORTS-1:0] int_wvalid;
+ logic [N_PORTS-1:0] int_wready;
+ logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] int_wstrb;
+ logic [N_PORTS-1:0] int_wlast;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_wuser;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_bid;
+ logic [N_PORTS-1:0] [1:0] int_bresp;
+ logic [N_PORTS-1:0] int_bvalid;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_buser;
+ logic [N_PORTS-1:0] int_bready;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_arid;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_araddr;
+ logic [N_PORTS-1:0] int_arvalid;
+ logic [N_PORTS-1:0] int_arready;
+ logic [N_PORTS-1:0] [7:0] int_arlen;
+ logic [N_PORTS-1:0] [2:0] int_arsize;
+ logic [N_PORTS-1:0] [1:0] int_arburst;
+ logic [N_PORTS-1:0] int_arlock;
+ logic [N_PORTS-1:0] [2:0] int_arprot;
+ logic [N_PORTS-1:0] [3:0] int_arcache;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_aruser;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_rid;
+ logic [N_PORTS-1:0] [1:0] int_rresp;
+ logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_rdata;
+ logic [N_PORTS-1:0] int_rlast;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_ruser;
+ logic [N_PORTS-1:0] int_rvalid;
+ logic [N_PORTS-1:0] int_rready;
+
+ // rab_core outputs
+ logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr;
+ logic [N_PORTS-1:0] int_wtrans_accept;
+ logic [N_PORTS-1:0] int_wtrans_drop;
+ logic [N_PORTS-1:0] int_wtrans_miss;
+ logic [N_PORTS-1:0] int_wtrans_sent;
+ logic [N_PORTS-1:0] int_wtrans_cache_coherent;
+ logic [N_PORTS-1:0] int_wmaster_select;
+
+ logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr;
+ logic [N_PORTS-1:0] int_rtrans_accept;
+ logic [N_PORTS-1:0] int_rtrans_drop;
+ logic [N_PORTS-1:0] int_rtrans_miss;
+ logic [N_PORTS-1:0] int_rtrans_sent;
+ logic [N_PORTS-1:0] int_rtrans_cache_coherent;
+ logic [N_PORTS-1:0] int_rmaster_select;
+
+ logic [N_PORTS-1:0] w_master_select;
+
+ // Internal master0 AXI4 lines. These connect the first master port to the
+ // multiplexers
+ // For channels read address, write address and write data the other lines
+ // are ignored if valid is not set, therefore we only need to multiplex those
+ logic [N_PORTS-1:0] int_m0_awvalid;
+ logic [N_PORTS-1:0] int_m0_awready;
+
+ logic [N_PORTS-1:0] int_m0_wvalid;
+ logic [N_PORTS-1:0] int_m0_wready;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_bid;
+ logic [N_PORTS-1:0] [1:0] int_m0_bresp;
+ logic [N_PORTS-1:0] int_m0_bvalid;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_buser;
+ logic [N_PORTS-1:0] int_m0_bready;
+
+ logic [N_PORTS-1:0] int_m0_arvalid;
+ logic [N_PORTS-1:0] int_m0_arready;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_rid;
+ logic [N_PORTS-1:0] [1:0] int_m0_rresp;
+ logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m0_rdata;
+ logic [N_PORTS-1:0] int_m0_rlast;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_ruser;
+ logic [N_PORTS-1:0] int_m0_rready;
+ logic [N_PORTS-1:0] int_m0_rvalid;
+
+ logic [N_PORTS-1:0] l1_m0_ar_accept;
+ logic [N_PORTS-1:0] l1_m0_ar_drop;
+ logic [N_PORTS-1:0] l1_m0_ar_save;
+ logic [N_PORTS-1:0] l1_m0_ar_done;
+ logic [N_PORTS-1:0] l2_m0_ar_accept;
+ logic [N_PORTS-1:0] l2_m0_ar_drop;
+ logic [N_PORTS-1:0] l2_m0_ar_done;
+ logic [N_PORTS-1:0] l2_m0_ar_sending;
+
+ logic [N_PORTS-1:0] l1_m0_aw_accept;
+ logic [N_PORTS-1:0] l1_m0_aw_drop;
+ logic [N_PORTS-1:0] l1_m0_aw_save;
+ logic [N_PORTS-1:0] l1_m0_aw_done;
+ logic [N_PORTS-1:0] l2_m0_aw_accept;
+ logic [N_PORTS-1:0] l2_m0_aw_drop;
+ logic [N_PORTS-1:0] l2_m0_aw_done;
+ logic [N_PORTS-1:0] l2_m0_aw_sending;
+
+ // Internal master1 AXI4 lines. These connect the second master port to the
+ // multiplexers
+ // For channels read address, write address and write data the other lines
+ // are ignored if valid is not set, therefore we only need to multiplex those
+ logic [N_PORTS-1:0] int_m1_awvalid;
+ logic [N_PORTS-1:0] int_m1_awready;
+
+ logic [N_PORTS-1:0] int_m1_wvalid;
+ logic [N_PORTS-1:0] int_m1_wready;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_bid;
+ logic [N_PORTS-1:0] [1:0] int_m1_bresp;
+ logic [N_PORTS-1:0] int_m1_bvalid;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_buser;
+ logic [N_PORTS-1:0] int_m1_bready;
+
+ logic [N_PORTS-1:0] int_m1_arvalid;
+ logic [N_PORTS-1:0] int_m1_arready;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_rid;
+ logic [N_PORTS-1:0] [1:0] int_m1_rresp;
+ logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m1_rdata;
+ logic [N_PORTS-1:0] int_m1_rlast;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_ruser;
+ logic [N_PORTS-1:0] int_m1_rvalid;
+ logic [N_PORTS-1:0] int_m1_rready;
+
+ logic [N_PORTS-1:0] l1_m1_ar_accept;
+ logic [N_PORTS-1:0] l1_m1_ar_drop;
+ logic [N_PORTS-1:0] l1_m1_ar_save;
+ logic [N_PORTS-1:0] l1_m1_ar_done;
+ logic [N_PORTS-1:0] l2_m1_ar_accept;
+ logic [N_PORTS-1:0] l2_m1_ar_drop;
+ logic [N_PORTS-1:0] l2_m1_ar_done;
+
+ logic [N_PORTS-1:0] l1_m1_aw_accept;
+ logic [N_PORTS-1:0] l1_m1_aw_drop;
+ logic [N_PORTS-1:0] l1_m1_aw_save;
+ logic [N_PORTS-1:0] l1_m1_aw_done;
+ logic [N_PORTS-1:0] l2_m1_aw_accept;
+ logic [N_PORTS-1:0] l2_m1_aw_drop;
+ logic [N_PORTS-1:0] l2_m1_aw_done;
+
+ // L1 outputs
+ logic [N_PORTS-1:0] rab_miss; // L1 RAB miss
+ logic [N_PORTS-1:0] rab_prot;
+ logic [N_PORTS-1:0] rab_multi;
+ logic [N_PORTS-1:0] rab_prefetch;
+
+ //
+ // Signals used to support L2 TLB
+ //
+ // L2 RAM configuration signals
+ logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D;
+ logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D;
+ logic [N_PORTS-1:0] L2CfgWE_S;
+
+ // L1 output and drop Buffer
+ logic [N_PORTS-1:0] L1OutRwType_D, L1DropRwType_DP;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP;
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP;
+ logic [N_PORTS-1:0] [7:0] L1OutLen_D, L1DropLen_DP;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP;
+ logic [N_PORTS-1:0] L1OutProt_D, L1DropProt_DP;
+ logic [N_PORTS-1:0] L1OutMulti_D, L1DropMulti_DP;
+ logic [N_PORTS-1:0] L1DropEn_S;
+ logic [N_PORTS-1:0] L1DropPrefetch_S;
+
+ logic [N_PORTS-1:0] L1DropValid_SN, L1DropValid_SP;
+
+ // L2 input Buffer
+ logic [N_PORTS-1:0] L2InRwType_DP;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2InUser_DP;
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2InId_DP;
+ logic [N_PORTS-1:0] [7:0] L2InLen_DP;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP;
+ logic [N_PORTS-1:0] L2InEn_S;
+
+ // L2 output Buffer
+ logic [N_PORTS-1:0] L2OutRwType_DP;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2OutUser_DP;
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2OutId_DP;
+ logic [N_PORTS-1:0] [7:0] L2OutLen_DP;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP;
+
+ logic [N_PORTS-1:0] L2OutHit_SN, L2OutHit_SP;
+ logic [N_PORTS-1:0] L2OutMiss_SN, L2OutMiss_SP;
+ logic [N_PORTS-1:0] L2OutProt_SN, L2OutProt_SP;
+ logic [N_PORTS-1:0] L2OutMulti_SN, L2OutMulti_SP;
+ logic [N_PORTS-1:0] L2OutCC_SN, L2OutCC_SP;
+ logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP;
+
+ logic [N_PORTS-1:0] L2OutValid_SN, L2OutValid_SP;
+ logic [N_PORTS-1:0] L2OutPrefetch_S;
+ logic [N_PORTS-1:0] L2OutReady_S;
+ logic [N_PORTS-1:0] L2OutEn_S;
+
+ // L2 outputs
+ logic [N_PORTS-1:0] L2Busy_S;
+ logic [N_PORTS-1:0] L2OutValid_S;
+
+ logic [N_PORTS-1:0] L2Miss_S;
+
+ // Signals for interfacing the AXI modules
+ logic [N_PORTS-1:0] l1_ar_accept;
+ logic [N_PORTS-1:0] l1_aw_accept;
+ logic [N_PORTS-1:0] l1_w_accept;
+ logic [N_PORTS-1:0] l1_xw_accept;
+
+ logic [N_PORTS-1:0] l1_ar_drop;
+ logic [N_PORTS-1:0] l1_aw_drop;
+ logic [N_PORTS-1:0] l1_w_drop;
+ logic [N_PORTS-1:0] l1_xw_drop;
+
+ logic [N_PORTS-1:0] l1_ar_save;
+ logic [N_PORTS-1:0] l1_aw_save;
+ logic [N_PORTS-1:0] l1_w_save;
+ logic [N_PORTS-1:0] l1_xw_save;
+
+ logic [N_PORTS-1:0] l1_ar_done;
+ logic [N_PORTS-1:0] l1_r_done;
+ logic [N_PORTS-1:0] l1_r_drop;
+ logic [N_PORTS-1:0] lx_r_drop;
+ logic [N_PORTS-1:0] lx_r_done;
+
+ logic [N_PORTS-1:0] l1_aw_done;
+ logic [N_PORTS-1:0] l1_w_done;
+ logic [N_PORTS-1:0] l1_xw_done;
+ logic [N_PORTS-1:0] l1_aw_done_SP;
+ logic [N_PORTS-1:0] l1_w_done_SP;
+
+ logic [N_PORTS-1:0] l2_ar_accept;
+ logic [N_PORTS-1:0] l2_aw_accept;
+ logic [N_PORTS-1:0] l2_w_accept;
+ logic [N_PORTS-1:0] l2_xw_accept;
+
+ logic [N_PORTS-1:0] l2_ar_drop;
+ logic [N_PORTS-1:0] l2_r_drop;
+ logic [N_PORTS-1:0] l2_xr_drop;
+ logic [N_PORTS-1:0] l2_aw_drop;
+ logic [N_PORTS-1:0] l2_w_drop;
+ logic [N_PORTS-1:0] l2_xw_drop;
+
+ logic [N_PORTS-1:0] l2_aw_done;
+ logic [N_PORTS-1:0] l2_w_done;
+ logic [N_PORTS-1:0] l2_xw_done;
+ logic [N_PORTS-1:0] l2_aw_done_SP;
+ logic [N_PORTS-1:0] l2_w_done_SP;
+
+ logic [N_PORTS-1:0] l2_ar_done;
+ logic [N_PORTS-1:0] l2_r_done;
+ logic [N_PORTS-1:0] l2_xr_done;
+ logic [N_PORTS-1:0] l2_ar_done_SP;
+ logic [N_PORTS-1:0] l2_r_done_SP;
+
+ logic [N_PORTS-1:0] l1_mx_aw_done;
+ logic [N_PORTS-1:0] l1_mx_ar_done;
+ logic [N_PORTS-1:0] l1_m0_aw_done_SP;
+ logic [N_PORTS-1:0] l1_m0_ar_done_SP;
+ logic [N_PORTS-1:0] l1_m1_aw_done_SP;
+ logic [N_PORTS-1:0] l1_m1_ar_done_SP;
+
+ logic [N_PORTS-1:0] l2_mx_aw_done;
+ logic [N_PORTS-1:0] l2_mx_ar_done;
+ logic [N_PORTS-1:0] l2_m0_aw_done_SP;
+ logic [N_PORTS-1:0] l2_m0_ar_done_SP;
+ logic [N_PORTS-1:0] l2_m1_aw_done_SP;
+ logic [N_PORTS-1:0] l2_m1_ar_done_SP;
+
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop;
+ logic [N_PORTS-1:0] [7:0] l1_len_drop, lx_len_drop;
+ logic [N_PORTS-1:0] l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop;
+ logic [N_PORTS-1:0] l1_hit_drop, lx_hit_drop, b_hit_drop;
+
+ logic [N_PORTS-1:0] b_drop;
+ logic [N_PORTS-1:0] b_done;
+
+ logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr;
+ logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr;
+
+ logic [N_PORTS-1:0] l2_cache_coherent;
+ logic [N_PORTS-1:0] l2_master_select;
+
+ logic [N_PORTS-1:0] aw_in_stall;
+ logic [N_PORTS-1:0] aw_out_stall;
+
+ genvar i;
+
+ // RRESP FSM
+ typedef enum logic {IDLE, BUSY} r_resp_mux_ctrl_state_t;
+ r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP;
+ logic [N_PORTS-1:0] RRespSel_SN, RRespSel_SP;
+ logic [N_PORTS-1:0] RRespBurst_S;
+ logic [N_PORTS-1:0] RRespSelIm_S;
+
+ // }}}
+
+ // Local parameters {{{
+
+ // Enable L2 for select ports
+ localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
+
+ // L2TLB parameters
+ localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13;
+
+ // }}}
+
+ // Derive `master_select` from cache coherency flag. {{{
+ `ifdef EN_ACP
+ assign int_wmaster_select = int_wtrans_cache_coherent;
+ assign int_rmaster_select = int_rtrans_cache_coherent;
+ assign l2_master_select = l2_cache_coherent;
+ `else
+ assign int_wmaster_select = '0;
+ assign int_rmaster_select = '0;
+ assign l2_master_select = '0;
+ `endif
+ // }}}
+
+ // Buf and Send {{{
+ // ██████╗ ██╗ ██╗███████╗ ██╗ ███████╗███████╗███╗ ██╗██████╗
+ // ██╔══██╗██║ ██║██╔════╝ ██║ ██╔════╝██╔════╝████╗ ██║██╔══██╗
+ // ██████╔╝██║ ██║█████╗ ████████╗ ███████╗█████╗ ██╔██╗ ██║██║ ██║
+ // ██╔══██╗██║ ██║██╔══╝ ██╔═██╔═╝ ╚════██║██╔══╝ ██║╚██╗██║██║ ██║
+ // ██████╔╝╚██████╔╝██║ ██████║ ███████║███████╗██║ ╚████║██████╔╝
+ // ╚═════╝ ╚═════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝╚═╝ ╚═══╝╚═════╝
+ //
+ logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst;
+ logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst;
+
+ generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND
+
+ // Write Address channel (aw) {{{
+ /*
+ * write address channel (aw)
+ *
+ * ██╗ ██╗██████╗ ██╗████████╗███████╗ █████╗ ██████╗ ██████╗ ██████╗
+ * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
+ * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ███████║██║ ██║██║ ██║██████╔╝
+ * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██║██║ ██║██║ ██║██╔══██╗
+ * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║██████╔╝██████╔╝██║ ██║
+ * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝
+ *
+ */
+
+ axi4_aw_buffer
+ #(
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_aw_buffer
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_awid ( s_axi4_awid[i] ),
+ .s_axi4_awaddr ( s_axi4_awaddr[i] ),
+ .s_axi4_awvalid ( s_axi4_awvalid[i] ),
+ .s_axi4_awready ( s_axi4_awready[i] ),
+ .s_axi4_awlen ( s_axi4_awlen[i] ),
+ .s_axi4_awsize ( s_axi4_awsize[i] ),
+ .s_axi4_awburst ( s_axi4_awburst[i] ),
+ .s_axi4_awlock ( s_axi4_awlock[i] ),
+ .s_axi4_awprot ( s_axi4_awprot[i] ),
+ .s_axi4_awcache ( s_axi4_awcache[i] ),
+ .s_axi4_awregion ( s_axi4_awregion[i] ),
+ .s_axi4_awqos ( s_axi4_awqos[i] ),
+ .s_axi4_awuser ( s_axi4_awuser[i] ),
+ .m_axi4_awid ( int_awid[i] ),
+ .m_axi4_awaddr ( int_awaddr[i] ),
+ .m_axi4_awvalid ( int_awvalid[i] ),
+ .m_axi4_awready ( int_awready[i] ),
+ .m_axi4_awlen ( int_awlen[i] ),
+ .m_axi4_awsize ( int_awsize[i] ),
+ .m_axi4_awburst ( int_awburst[i] ),
+ .m_axi4_awlock ( int_awlock[i] ),
+ .m_axi4_awprot ( int_awprot[i] ),
+ .m_axi4_awcache ( int_awcache[i] ),
+ .m_axi4_awregion ( int_awregion[i] ),
+ .m_axi4_awqos ( int_awqos[i] ),
+ .m_axi4_awuser ( int_awuser[i] )
+ );
+
+ axi4_aw_sender
+ #(
+ .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
+ .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
+ )
+ u_aw_sender_m0
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .l1_done_o ( l1_m0_aw_done[i] ),
+ .l1_accept_i ( l1_m0_aw_accept[i] ),
+ .l1_drop_i ( l1_m0_aw_drop[i] ),
+ .l1_save_i ( l1_m0_aw_save[i] ),
+ .l2_done_o ( l2_m0_aw_done[i] ),
+ .l2_accept_i ( l2_m0_aw_accept[i] ),
+ .l2_drop_i ( l2_m0_aw_drop[i] ),
+ .l2_sending_o ( l2_m0_aw_sending[i] ),
+ .l1_awaddr_i ( int_wtrans_addr[i] ),
+ .l2_awaddr_i ( l2_aw_addr[i] ),
+ .s_axi4_awid ( int_awid[i] ),
+ .s_axi4_awvalid ( int_m0_awvalid[i] ),
+ .s_axi4_awready ( int_m0_awready[i] ),
+ .s_axi4_awlen ( int_awlen[i] ),
+ .s_axi4_awsize ( int_awsize[i] ),
+ .s_axi4_awburst ( int_awburst[i] ),
+ .s_axi4_awlock ( int_awlock[i] ),
+ .s_axi4_awprot ( int_awprot[i] ),
+ .s_axi4_awcache ( int_awcache[i] ),
+ .s_axi4_awregion ( int_awregion[i] ),
+ .s_axi4_awqos ( int_awqos[i] ),
+ .s_axi4_awuser ( int_awuser[i] ),
+ .m_axi4_awid ( m0_axi4_awid[i] ),
+ .m_axi4_awaddr ( m0_axi4_awaddr[i] ),
+ .m_axi4_awvalid ( m0_axi4_awvalid[i] ),
+ .m_axi4_awready ( m0_axi4_awready[i] ),
+ .m_axi4_awlen ( m0_axi4_awlen[i] ),
+ .m_axi4_awsize ( m0_axi4_awsize[i] ),
+ .m_axi4_awburst ( m0_axi4_awburst[i] ),
+ .m_axi4_awlock ( m0_axi4_awlock[i] ),
+ .m_axi4_awprot ( m0_axi4_awprot[i] ),
+ .m_axi4_awcache ( ),
+ .m_axi4_awregion ( m0_axi4_awregion[i] ),
+ .m_axi4_awqos ( m0_axi4_awqos[i] ),
+ .m_axi4_awuser ( m0_axi4_awuser[i] )
+ );
+
+ // The AXCACHE signals are set according to burstiness and cache coherence or statically
+ // when not connected to ACP on Zynq (implemented below).
+ assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00);
+ `ifndef EN_ACP
+ always_comb begin
+ if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin
+ if (m0_write_is_burst[i]) begin
+ m0_axi4_awcache[i] = 4'b0111;
+ end else begin
+ m0_axi4_awcache[i] = 4'b1111;
+ end
+ end else begin
+ m0_axi4_awcache[i] = 4'b0011;
+ end
+ end
+ `else
+ assign m0_axi4_awcache[i] = 4'b0011;
+ `endif
+
+ axi4_aw_sender
+ #(
+ .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
+ .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
+ )
+ u_aw_sender_m1
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .l1_accept_i ( l1_m1_aw_accept[i] ),
+ .l1_drop_i ( l1_m1_aw_drop[i] ),
+ .l1_save_i ( l1_m1_aw_save[i] ),
+ .l1_done_o ( l1_m1_aw_done[i] ),
+ .l2_accept_i ( l2_m1_aw_accept[i] ),
+ .l2_drop_i ( l2_m1_aw_drop[i] ),
+ .l2_done_o ( l2_m1_aw_done[i] ),
+ .l2_sending_o ( ), // just helps to set axcache
+ .l1_awaddr_i ( int_wtrans_addr[i] ),
+ .l2_awaddr_i ( l2_aw_addr[i] ),
+ .s_axi4_awid ( int_awid[i] ),
+ .s_axi4_awvalid ( int_m1_awvalid[i] ),
+ .s_axi4_awready ( int_m1_awready[i] ),
+ .s_axi4_awlen ( int_awlen[i] ),
+ .s_axi4_awsize ( int_awsize[i] ),
+ .s_axi4_awburst ( int_awburst[i] ),
+ .s_axi4_awlock ( int_awlock[i] ),
+ .s_axi4_awprot ( int_awprot[i] ),
+ .s_axi4_awcache ( int_awcache[i] ),
+ .s_axi4_awregion ( int_awregion[i] ),
+ .s_axi4_awqos ( int_awqos[i] ),
+ .s_axi4_awuser ( int_awuser[i] ),
+ .m_axi4_awid ( m1_axi4_awid[i] ),
+ .m_axi4_awaddr ( m1_axi4_awaddr[i] ),
+ .m_axi4_awvalid ( m1_axi4_awvalid[i] ),
+ .m_axi4_awready ( m1_axi4_awready[i] ),
+ .m_axi4_awlen ( m1_axi4_awlen[i] ),
+ .m_axi4_awsize ( m1_axi4_awsize[i] ),
+ .m_axi4_awburst ( m1_axi4_awburst[i] ),
+ .m_axi4_awlock ( m1_axi4_awlock[i] ),
+ .m_axi4_awprot ( m1_axi4_awprot[i] ),
+ .m_axi4_awcache ( ),
+ .m_axi4_awregion ( m1_axi4_awregion[i] ),
+ .m_axi4_awqos ( m1_axi4_awqos[i] ),
+ .m_axi4_awuser ( m1_axi4_awuser[i] )
+ );
+
+ // The AXCACHE signals are set according to burstiness and cache coherence or statically
+ // when not connected to ACP on Zynq (implemented below).
+ assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00);
+ `ifdef EN_ACP
+ always_comb begin
+ if (m1_write_is_burst[i]) begin
+ m1_axi4_awcache[i] = 4'b1011;
+ end else begin
+ m1_axi4_awcache[i] = 4'b1111;
+ end
+ end
+ `else
+ assign m1_axi4_awcache[i] = 4'b0011;
+ `endif
+
+ // }}}
+
+ // Write Data channel (w) {{{
+ /*
+ * write data channel (w)
+ *
+ * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ █████╗ ████████╗ █████╗
+ * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗
+ * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██║ ██║███████║ ██║ ███████║
+ * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██║ ██║██╔══██║ ██║ ██╔══██║
+ * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██████╔╝██║ ██║ ██║ ██║ ██║
+ * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝
+ *
+ */
+ axi4_w_buffer
+ #(
+ .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
+ .ENABLE_L2TLB ( ENABLE_L2TLB[i] ),
+ .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
+ )
+ u_w_buffer
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+
+ // L1 interface
+ .l1_done_o ( l1_w_done[i] ),
+ .l1_accept_i ( l1_w_accept[i] ),
+ .l1_save_i ( l1_w_save[i] ),
+ .l1_drop_i ( l1_w_drop[i] ),
+ .l1_master_i ( int_wmaster_select[i] ),
+ .l1_id_i ( l1_id_drop[i] ),
+ .l1_len_i ( l1_len_drop[i] ),
+ .l1_prefetch_i ( l1_prefetch_drop[i] ),
+ .l1_hit_i ( l1_hit_drop[i] ),
+
+ // L2 interface
+ .l2_done_o ( l2_w_done[i] ),
+ .l2_accept_i ( l2_w_accept[i] ),
+ .l2_drop_i ( l2_w_drop[i] ),
+ .l2_master_i ( l2_master_select[i] ),
+ .l2_id_i ( lx_id_drop[i] ),
+ .l2_len_i ( lx_len_drop[i] ),
+ .l2_prefetch_i ( lx_prefetch_drop[i] ),
+ .l2_hit_i ( lx_hit_drop[i] ),
+
+ // Top-level control outputs
+ .master_select_o ( w_master_select[i] ),
+ .input_stall_o ( aw_in_stall[i] ), // stall L1 AW input if request buffers full
+ .output_stall_o ( aw_out_stall[i] ), // stall L1 AW hit forwarding if bypass not possible
+
+ // B sender interface
+ .b_drop_o ( b_drop[i] ),
+ .b_done_i ( b_done[i] ),
+ .id_o ( b_id_drop[i] ),
+ .prefetch_o ( b_prefetch_drop[i] ),
+ .hit_o ( b_hit_drop[i] ),
+
+ // AXI W channel interfaces
+ .s_axi4_wdata ( s_axi4_wdata[i] ),
+ .s_axi4_wvalid ( s_axi4_wvalid[i] ),
+ .s_axi4_wready ( s_axi4_wready[i] ),
+ .s_axi4_wstrb ( s_axi4_wstrb[i] ),
+ .s_axi4_wlast ( s_axi4_wlast[i] ),
+ .s_axi4_wuser ( s_axi4_wuser[i] ),
+ .m_axi4_wdata ( int_wdata[i] ),
+ .m_axi4_wvalid ( int_wvalid[i] ),
+ .m_axi4_wready ( int_wready[i] ),
+ .m_axi4_wstrb ( int_wstrb[i] ),
+ .m_axi4_wlast ( int_wlast[i] ),
+ .m_axi4_wuser ( int_wuser[i] )
+ );
+
+ axi4_w_sender
+ #(
+ .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_w_sender_m0
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_wdata ( int_wdata[i] ),
+ .s_axi4_wvalid ( int_m0_wvalid[i] ),
+ .s_axi4_wready ( int_m0_wready[i] ),
+ .s_axi4_wstrb ( int_wstrb[i] ),
+ .s_axi4_wlast ( int_wlast[i] ),
+ .s_axi4_wuser ( int_wuser[i] ),
+ .m_axi4_wdata ( m0_axi4_wdata[i] ),
+ .m_axi4_wvalid ( m0_axi4_wvalid[i] ),
+ .m_axi4_wready ( m0_axi4_wready[i] ),
+ .m_axi4_wstrb ( m0_axi4_wstrb[i] ),
+ .m_axi4_wlast ( m0_axi4_wlast[i] ),
+ .m_axi4_wuser ( m0_axi4_wuser[i] )
+ );
+
+ axi4_w_sender
+ #(
+ .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+
+ )
+ u_w_sender_m1
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_wdata ( int_wdata[i] ),
+ .s_axi4_wvalid ( int_m1_wvalid[i] ),
+ .s_axi4_wready ( int_m1_wready[i] ),
+ .s_axi4_wstrb ( int_wstrb[i] ),
+ .s_axi4_wlast ( int_wlast[i] ),
+ .s_axi4_wuser ( int_wuser[i] ),
+ .m_axi4_wdata ( m1_axi4_wdata[i] ),
+ .m_axi4_wvalid ( m1_axi4_wvalid[i] ),
+ .m_axi4_wready ( m1_axi4_wready[i] ),
+ .m_axi4_wstrb ( m1_axi4_wstrb[i] ),
+ .m_axi4_wlast ( m1_axi4_wlast[i] ),
+ .m_axi4_wuser ( m1_axi4_wuser[i] )
+ );
+
+ /*
+ * Multiplexer to switch between the two output master ports on the write data (w) channel
+ */
+ always_comb begin
+ /* Only one output can be selected at any time */
+ if (w_master_select[i] == 1'b0) begin
+ int_m0_wvalid[i] = int_wvalid[i];
+ int_m1_wvalid[i] = 1'b0;
+ int_wready[i] = int_m0_wready[i];
+ end else begin
+ int_m0_wvalid[i] = 1'b0;
+ int_m1_wvalid[i] = int_wvalid[i];
+ int_wready[i] = int_m1_wready[i];
+ end
+ end
+
+ // }}}
+
+ // Write Response channel (b) {{{
+ /*
+ * write response channel (b)
+ *
+ * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ ███████╗███████╗██████╗
+ * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔════╝██╔════╝██╔══██╗
+ * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██████╔╝█████╗ ███████╗██████╔╝
+ * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██╗██╔══╝ ╚════██║██╔═══╝
+ * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║███████╗███████║██║
+ * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝
+ *
+ */
+ axi4_b_buffer
+ #(
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_b_buffer_m0
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_bid ( int_m0_bid[i] ),
+ .s_axi4_bresp ( int_m0_bresp[i] ),
+ .s_axi4_bvalid ( int_m0_bvalid[i] ),
+ .s_axi4_buser ( int_m0_buser[i] ),
+ .s_axi4_bready ( int_m0_bready[i] ),
+ .m_axi4_bid ( m0_axi4_bid[i] ),
+ .m_axi4_bresp ( m0_axi4_bresp[i] ),
+ .m_axi4_bvalid ( m0_axi4_bvalid[i] ),
+ .m_axi4_buser ( m0_axi4_buser[i] ),
+ .m_axi4_bready ( m0_axi4_bready[i] )
+ );
+
+ axi4_b_buffer
+ #(
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_b_buffer_m1
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_bid ( int_m1_bid[i] ),
+ .s_axi4_bresp ( int_m1_bresp[i] ),
+ .s_axi4_bvalid ( int_m1_bvalid[i] ),
+ .s_axi4_buser ( int_m1_buser[i] ),
+ .s_axi4_bready ( int_m1_bready[i] ),
+ .m_axi4_bid ( m1_axi4_bid[i] ),
+ .m_axi4_bresp ( m1_axi4_bresp[i] ),
+ .m_axi4_bvalid ( m1_axi4_bvalid[i] ),
+ .m_axi4_buser ( m1_axi4_buser[i] ),
+ .m_axi4_bready ( m1_axi4_bready[i] )
+ );
+
+ axi4_b_sender
+ #(
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_b_sender
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .drop_i ( b_drop[i] ),
+ .done_o ( b_done[i] ),
+ .id_i ( b_id_drop[i] ),
+ .prefetch_i ( b_prefetch_drop[i] ),
+ .hit_i ( b_hit_drop[i] ),
+ .s_axi4_bid ( s_axi4_bid[i] ),
+ .s_axi4_bresp ( s_axi4_bresp[i] ),
+ .s_axi4_bvalid ( s_axi4_bvalid[i] ),
+ .s_axi4_buser ( s_axi4_buser[i] ),
+ .s_axi4_bready ( s_axi4_bready[i] ),
+ .m_axi4_bid ( int_bid[i] ),
+ .m_axi4_bresp ( int_bresp[i] ),
+ .m_axi4_bvalid ( int_bvalid[i] ),
+ .m_axi4_buser ( int_buser[i] ),
+ .m_axi4_bready ( int_bready[i] )
+ );
+
+ /*
+ * Multiplexer to switch between the two output master ports on the write response (b) channel
+ */
+ always_comb begin
+ /* Output 1 always gets priority, so if it has something to send connect
+ it and let output 0 wait using rready = 0 */
+ if (int_m1_bvalid[i] == 1'b1) begin
+ int_m0_bready[i] = 1'b0;
+ int_m1_bready[i] = int_bready[i];
+
+ int_bid[i] = int_m1_bid[i];
+ int_bresp[i] = int_m1_bresp[i];
+ int_buser[i] = int_m1_buser[i];
+ int_bvalid[i] = int_m1_bvalid[i];
+ end else begin
+ int_m0_bready[i] = int_bready[i];
+ int_m1_bready[i] = 1'b0;
+
+ int_bid[i] = int_m0_bid[i];
+ int_bresp[i] = int_m0_bresp[i];
+ int_buser[i] = int_m0_buser[i];
+ int_bvalid[i] = int_m0_bvalid[i];
+ end
+ end
+
+ // }}}
+
+ // Read Address channel (ar) {{{
+ /*
+ * read address channel (ar)
+ *
+ * ██████╗ ███████╗ █████╗ ██████╗ █████╗ ██████╗ ██████╗ ██████╗
+ * ██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
+ * ██████╔╝█████╗ ███████║██║ ██║ ███████║██║ ██║██║ ██║██████╔╝
+ * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██║██║ ██║██║ ██║██╔══██╗
+ * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║██████╔╝██████╔╝██║ ██║
+ * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝
+ *
+ */
+ axi4_ar_buffer
+ #(
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_ar_buffer
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_arid ( s_axi4_arid[i] ),
+ .s_axi4_araddr ( s_axi4_araddr[i] ),
+ .s_axi4_arvalid ( s_axi4_arvalid[i] ),
+ .s_axi4_arready ( s_axi4_arready[i] ),
+ .s_axi4_arlen ( s_axi4_arlen[i] ),
+ .s_axi4_arsize ( s_axi4_arsize[i] ),
+ .s_axi4_arburst ( s_axi4_arburst[i] ),
+ .s_axi4_arlock ( s_axi4_arlock[i] ),
+ .s_axi4_arprot ( s_axi4_arprot[i] ),
+ .s_axi4_arcache ( s_axi4_arcache[i] ),
+ .s_axi4_aruser ( s_axi4_aruser[i] ),
+ .m_axi4_arid ( int_arid[i] ),
+ .m_axi4_araddr ( int_araddr[i] ),
+ .m_axi4_arvalid ( int_arvalid[i] ),
+ .m_axi4_arready ( int_arready[i] ),
+ .m_axi4_arlen ( int_arlen[i] ),
+ .m_axi4_arsize ( int_arsize[i] ),
+ .m_axi4_arburst ( int_arburst[i] ),
+ .m_axi4_arlock ( int_arlock[i] ),
+ .m_axi4_arprot ( int_arprot[i] ),
+ .m_axi4_arcache ( int_arcache[i] ),
+ .m_axi4_aruser ( int_aruser[i] )
+ );
+
+ axi4_ar_sender
+ #(
+ .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
+ .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
+ )
+ u_ar_sender_m0
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .l1_done_o ( l1_m0_ar_done[i] ),
+ .l1_accept_i ( l1_m0_ar_accept[i] ),
+ .l1_drop_i ( l1_m0_ar_drop[i] ),
+ .l1_save_i ( l1_m0_ar_save[i] ),
+ .l2_done_o ( l2_m0_ar_done[i] ),
+ .l2_accept_i ( l2_m0_ar_accept[i] ),
+ .l2_drop_i ( l2_m0_ar_drop[i] ),
+ .l2_sending_o ( l2_m0_ar_sending[i] ),
+ .l1_araddr_i ( int_rtrans_addr[i] ),
+ .l2_araddr_i ( l2_ar_addr[i] ),
+ .s_axi4_arid ( int_arid[i] ),
+ .s_axi4_arvalid ( int_m0_arvalid[i] ),
+ .s_axi4_arready ( int_m0_arready[i] ),
+ .s_axi4_arlen ( int_arlen[i] ),
+ .s_axi4_arsize ( int_arsize[i] ),
+ .s_axi4_arburst ( int_arburst[i] ),
+ .s_axi4_arlock ( int_arlock[i] ),
+ .s_axi4_arprot ( int_arprot[i] ),
+ .s_axi4_arcache ( int_arcache[i] ),
+ .s_axi4_aruser ( int_aruser[i] ),
+ .m_axi4_arid ( m0_axi4_arid[i] ),
+ .m_axi4_araddr ( m0_axi4_araddr[i] ),
+ .m_axi4_arvalid ( m0_axi4_arvalid[i] ),
+ .m_axi4_arready ( m0_axi4_arready[i] ),
+ .m_axi4_arlen ( m0_axi4_arlen[i] ),
+ .m_axi4_arsize ( m0_axi4_arsize[i] ),
+ .m_axi4_arburst ( m0_axi4_arburst[i] ),
+ .m_axi4_arlock ( m0_axi4_arlock[i] ),
+ .m_axi4_arprot ( m0_axi4_arprot[i] ),
+ .m_axi4_arcache ( ),
+ .m_axi4_aruser ( m0_axi4_aruser[i] )
+ );
+
+ // The AXCACHE signals are set according to burstiness and cache coherence or statically
+ // when not connected to ACP on Zynq (implemented below).
+ assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00);
+ `ifndef EN_ACP
+ always_comb begin
+ if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin
+ if (m0_read_is_burst[i]) begin
+ m0_axi4_arcache[i] = 4'b1011;
+ end else begin
+ m0_axi4_arcache[i] = 4'b1111;
+ end
+ end else begin
+ m0_axi4_arcache[i] = 4'b0011;
+ end
+ end
+ `else
+ assign m0_axi4_arcache[i] = 4'b0011;
+ `endif
+
+ axi4_ar_sender
+ #(
+ .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
+ .ENABLE_L2TLB ( ENABLE_L2TLB[i] )
+ )
+ u_ar_sender_m1
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .l1_done_o ( l1_m1_ar_done[i] ),
+ .l1_accept_i ( l1_m1_ar_accept[i] ),
+ .l1_drop_i ( l1_m1_ar_drop[i] ),
+ .l1_save_i ( l1_m1_ar_save[i] ),
+ .l2_done_o ( l2_m1_ar_done[i] ),
+ .l2_accept_i ( l2_m1_ar_accept[i] ),
+ .l2_drop_i ( l2_m1_ar_drop[i] ),
+ .l2_sending_o ( ), // just helps to set axcache
+ .l1_araddr_i ( int_rtrans_addr[i] ),
+ .l2_araddr_i ( l2_ar_addr[i] ),
+ .s_axi4_arid ( int_arid[i] ),
+ .s_axi4_arvalid ( int_m1_arvalid[i] ),
+ .s_axi4_arready ( int_m1_arready[i] ),
+ .s_axi4_arlen ( int_arlen[i] ),
+ .s_axi4_arsize ( int_arsize[i] ),
+ .s_axi4_arburst ( int_arburst[i] ),
+ .s_axi4_arlock ( int_arlock[i] ),
+ .s_axi4_arprot ( int_arprot[i] ),
+ .s_axi4_arcache ( int_arcache[i] ),
+ .s_axi4_aruser ( int_aruser[i] ),
+ .m_axi4_arid ( m1_axi4_arid[i] ),
+ .m_axi4_araddr ( m1_axi4_araddr[i] ),
+ .m_axi4_arvalid ( m1_axi4_arvalid[i] ),
+ .m_axi4_arready ( m1_axi4_arready[i] ),
+ .m_axi4_arlen ( m1_axi4_arlen[i] ),
+ .m_axi4_arsize ( m1_axi4_arsize[i] ),
+ .m_axi4_arburst ( m1_axi4_arburst[i] ),
+ .m_axi4_arlock ( m1_axi4_arlock[i] ),
+ .m_axi4_arprot ( m1_axi4_arprot[i] ),
+ .m_axi4_arcache ( ),
+ .m_axi4_aruser ( m1_axi4_aruser[i] )
+ );
+
+ // The AXCACHE signals are set according to burstiness and cache coherence or statically
+ // when not connected to ACP on Zynq (implemented below).
+ assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00);
+ `ifdef EN_ACP
+ always_comb begin
+ if (m1_read_is_burst[i]) begin
+ m1_axi4_arcache[i] = 4'b1011;
+ end else begin
+ m1_axi4_arcache[i] = 4'b1111;
+ end
+ end
+ `else
+ assign m1_axi4_arcache[i] = 4'b0011;
+ `endif
+
+ // }}}
+
+ // Read Response channel (r) {{{
+ /*
+ * read response channel (r)
+ *
+ * ██████╗ ███████╗ █████╗ ██████╗ ██████╗ ███████╗███████╗██████╗
+ * ██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔════╝██╔════╝██╔══██╗
+ * ██████╔╝█████╗ ███████║██║ ██║ ██████╔╝█████╗ ███████╗██████╔╝
+ * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██╗██╔══╝ ╚════██║██╔═══╝
+ * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║███████╗███████║██║
+ * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝
+ *
+ */
+ axi4_r_buffer
+ #(
+ .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_r_buffer_m0
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_rid ( int_m0_rid[i] ),
+ .s_axi4_rresp ( int_m0_rresp[i] ),
+ .s_axi4_rdata ( int_m0_rdata[i] ),
+ .s_axi4_rlast ( int_m0_rlast[i] ),
+ .s_axi4_rvalid ( int_m0_rvalid[i] ),
+ .s_axi4_ruser ( int_m0_ruser[i] ),
+ .s_axi4_rready ( int_m0_rready[i] ),
+ .m_axi4_rid ( m0_axi4_rid[i] ),
+ .m_axi4_rresp ( m0_axi4_rresp[i] ),
+ .m_axi4_rdata ( m0_axi4_rdata[i] ),
+ .m_axi4_rlast ( m0_axi4_rlast[i] ),
+ .m_axi4_rvalid ( m0_axi4_rvalid[i] ),
+ .m_axi4_ruser ( m0_axi4_ruser[i] ),
+ .m_axi4_rready ( m0_axi4_rready[i] )
+ );
+
+ axi4_r_buffer
+ #(
+ .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_r_buffer_m1
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .s_axi4_rid ( int_m1_rid[i] ),
+ .s_axi4_rresp ( int_m1_rresp[i] ),
+ .s_axi4_rdata ( int_m1_rdata[i] ),
+ .s_axi4_rlast ( int_m1_rlast[i] ),
+ .s_axi4_rvalid ( int_m1_rvalid[i] ),
+ .s_axi4_ruser ( int_m1_ruser[i] ),
+ .s_axi4_rready ( int_m1_rready[i] ),
+ .m_axi4_rid ( m1_axi4_rid[i] ),
+ .m_axi4_rresp ( m1_axi4_rresp[i] ),
+ .m_axi4_rdata ( m1_axi4_rdata[i] ),
+ .m_axi4_rlast ( m1_axi4_rlast[i] ),
+ .m_axi4_rvalid ( m1_axi4_rvalid[i] ),
+ .m_axi4_ruser ( m1_axi4_ruser[i] ),
+ .m_axi4_rready ( m1_axi4_rready[i] )
+ );
+
+ axi4_r_sender
+ #(
+ .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_r_sender
+ (
+ .axi4_aclk ( Clk_CI ),
+ .axi4_arstn ( Rst_RBI ),
+ .drop_i ( lx_r_drop[i] ),
+ .drop_len_i ( lx_len_drop[i] ),
+ .done_o ( lx_r_done[i] ),
+ .id_i ( lx_id_drop[i] ),
+ .prefetch_i ( lx_prefetch_drop[i] ),
+ .hit_i ( lx_hit_drop[i] ),
+ .s_axi4_rid ( s_axi4_rid[i] ),
+ .s_axi4_rresp ( s_axi4_rresp[i] ),
+ .s_axi4_rdata ( s_axi4_rdata[i] ),
+ .s_axi4_rlast ( s_axi4_rlast[i] ),
+ .s_axi4_rvalid ( s_axi4_rvalid[i] ),
+ .s_axi4_ruser ( s_axi4_ruser[i] ),
+ .s_axi4_rready ( s_axi4_rready[i] ),
+ .m_axi4_rid ( int_rid[i] ),
+ .m_axi4_rresp ( int_rresp[i] ),
+ .m_axi4_rdata ( int_rdata[i] ),
+ .m_axi4_rlast ( int_rlast[i] ),
+ .m_axi4_rvalid ( int_rvalid[i] ),
+ .m_axi4_ruser ( int_ruser[i] ),
+ .m_axi4_rready ( int_rready[i] )
+ );
+
+ /*
+ * Multiplexer to switch between the two output master ports on the read response(r) channel
+ *
+ * Do not perform read burst interleaving as the DMA does not support it. This means we can only
+ * switch between the two masters upon sending rlast or when idle.
+ *
+ * However, if the downstream already performs burst interleaving, this cannot be undone here.
+ * Also, the downstream may interleave a burst reponse with a single-beat transaction. In this
+ * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving
+ * after such an event, it gives priority to the master which received the last burst in case
+ * both have a have a burst ready (rvalid).
+ *
+ * Order of priority:
+ * 1. Ongoing burst transaction
+ * 2. Single-beat transaction on Master 1.
+ * 3. Single-beat transaction on Master 0.
+ * 4. Burst transaction on master that received the last burst.
+ */
+ // Select signal
+ always_ff @(posedge Clk_CI) begin
+ if (Rst_RBI == 0) begin
+ RRespSel_SP[i] <= 1'b0;
+ end else begin
+ RRespSel_SP[i] <= RRespSel_SN[i];
+ end
+ end
+
+ // FSM
+ always_comb begin : RRespMuxFsm
+ RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i];
+ RRespSel_SN[i] = RRespSel_SP[i];
+
+ RRespBurst_S[i] = 1'b0;
+ RRespSelIm_S[i] = 1'b0;
+
+ unique case (RRespMuxCtrl_SP[i])
+
+ IDLE: begin
+ // immediately forward single-beat transactions
+ if (int_m1_rvalid[i] && int_m1_rlast[i])
+ RRespSelIm_S[i] = 1'b1;
+ else if (int_m0_rvalid[i] && int_m0_rlast[i])
+ RRespSelIm_S[i] = 1'b0;
+
+ // bursts - they also start immediately
+ else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin
+ RRespMuxCtrl_SN[i] = BUSY;
+
+ // in case both are ready, continue with the master that had the last burst
+ if (int_m1_rvalid[i] && int_m0_rvalid[i]) begin
+ RRespSel_SN[i] = RRespSel_SP[i];
+ RRespSelIm_S[i] = RRespSel_SP[i];
+ end else if (int_m1_rvalid[i]) begin
+ RRespSel_SN[i] = 1'b1;
+ RRespSelIm_S[i] = 1'b1;
+ end else begin
+ RRespSel_SN[i] = 1'b0;
+ RRespSelIm_S[i] = 1'b0;
+ end
+ end
+ end
+
+ BUSY: begin
+ RRespBurst_S[i] = 1'b1;
+ // detect last handshake of currently ongoing transfer
+ if (int_rvalid[i] && int_rready[i] && int_rlast[i])
+ RRespMuxCtrl_SN[i] = IDLE;
+ end
+
+ default: begin
+ RRespMuxCtrl_SN[i] = IDLE;
+ end
+
+ endcase
+ end
+
+ // FSM state
+ always_ff @(posedge Clk_CI) begin
+ if (Rst_RBI == 0) begin
+ RRespMuxCtrl_SP[i] <= IDLE;
+ end else begin
+ RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i];
+ end
+ end
+
+ // Actual multiplexer
+ always_comb begin
+ if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin
+ int_m0_rready[i] = 1'b0;
+ int_m1_rready[i] = int_rready[i];
+
+ int_rid[i] = int_m1_rid[i];
+ int_rresp[i] = int_m1_rresp[i];
+ int_rdata[i] = int_m1_rdata[i];
+ int_rlast[i] = int_m1_rlast[i];
+ int_ruser[i] = int_m1_ruser[i];
+ int_rvalid[i] = int_m1_rvalid[i];
+ end else begin
+ int_m0_rready[i] = int_rready[i];
+ int_m1_rready[i] = 1'b0;
+
+ int_rid[i] = int_m0_rid[i];
+ int_rresp[i] = int_m0_rresp[i];
+ int_rdata[i] = int_m0_rdata[i];
+ int_rlast[i] = int_m0_rlast[i];
+ int_ruser[i] = int_m0_ruser[i];
+ int_rvalid[i] = int_m0_rvalid[i];
+ end
+ end
+
+ end // BUF & SEND
+
+ // }}}
+
+ endgenerate // BUF & SEND }}}
+
+ // Log {{{
+
+`ifdef RAB_AX_LOG_EN
+ AxiBramLogger
+ #(
+ .AXI_ID_BITW ( AXI_ID_WIDTH ),
+ .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ),
+ .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
+ )
+ u_aw_logger
+ (
+ .Clk_CI ( NonGatedClk_CI ),
+ .TimestampClk_CI ( Clk_CI ),
+ .Rst_RBI ( Rst_RBI ),
+ .AxiValid_SI ( s_axi4_awvalid[1] ),
+ .AxiReady_SI ( s_axi4_awready[1] ),
+ .AxiId_DI ( s_axi4_awid[1] ),
+ .AxiAddr_DI ( s_axi4_awaddr[1] ),
+ .AxiLen_DI ( s_axi4_awlen[1] ),
+ .Clear_SI ( AwLogClr_SI ),
+ .LogEn_SI ( LogEn_SI ),
+ .Full_SO ( int_aw_log_full ),
+ .Ready_SO ( AwLogRdy_SO ),
+ .Bram_PS ( AwBram_PS )
+ );
+
+ AxiBramLogger
+ #(
+ .AXI_ID_BITW ( AXI_ID_WIDTH ),
+ .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ),
+ .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
+ )
+ u_ar_logger
+ (
+ .Clk_CI ( NonGatedClk_CI ),
+ .TimestampClk_CI ( Clk_CI ),
+ .Rst_RBI ( Rst_RBI ),
+ .AxiValid_SI ( s_axi4_arvalid[1] ),
+ .AxiReady_SI ( s_axi4_arready[1] ),
+ .AxiId_DI ( s_axi4_arid[1] ),
+ .AxiAddr_DI ( s_axi4_araddr[1] ),
+ .AxiLen_DI ( s_axi4_arlen[1] ),
+ .Clear_SI ( ArLogClr_SI ),
+ .LogEn_SI ( LogEn_SI ),
+ .Full_SO ( int_ar_log_full ),
+ .Ready_SO ( ArLogRdy_SO ),
+ .Bram_PS ( ArBram_PS )
+ );
+`endif
+
+ // }}}
+
+ // RAB Core {{{
+ // ██████╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗ ███████╗
+ // ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔═══██╗██╔══██╗██╔════╝
+ // ██████╔╝███████║██████╔╝ ██║ ██║ ██║██████╔╝█████╗
+ // ██╔══██╗██╔══██║██╔══██╗ ██║ ██║ ██║██╔══██╗██╔══╝
+ // ██║ ██║██║ ██║██████╔╝ ╚██████╗╚██████╔╝██║ ██║███████╗
+ // ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝
+ //
+ /*
+ * rab_core
+ *
+ * The rab core translates addresses. It has two ports, which can be used
+ * independently, however they will compete for time internally, as lookups
+ * are serialized.
+ *
+ * type is the read(0) or write(1) used to check the protection flags. If they
+ * don't match an interrupt is created on the int_prot line.
+ */
+
+ rab_core
+ #(
+ .N_PORTS ( N_PORTS ),
+ .N_L2_SETS ( N_L2_SETS ),
+ .N_L2_SET_ENTRIES ( N_L2_SET_ENTRIES ),
+ .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+ .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
+ .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+ .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
+ .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH ),
+ .MH_FIFO_DEPTH ( MH_FIFO_DEPTH )
+ )
+ u_rab_core
+ (
+ .Clk_CI ( Clk_CI ),
+ .Rst_RBI ( Rst_RBI ),
+
+ // Config IF
+ .s_axi_awaddr ( s_axi4lite_awaddr ),
+ .s_axi_awvalid ( s_axi4lite_awvalid ),
+ .s_axi_awready ( s_axi4lite_awready ),
+ .s_axi_wdata ( s_axi4lite_wdata ),
+ .s_axi_wstrb ( s_axi4lite_wstrb ),
+ .s_axi_wvalid ( s_axi4lite_wvalid ),
+ .s_axi_wready ( s_axi4lite_wready ),
+ .s_axi_bresp ( s_axi4lite_bresp ),
+ .s_axi_bvalid ( s_axi4lite_bvalid ),
+ .s_axi_bready ( s_axi4lite_bready ),
+ .s_axi_araddr ( s_axi4lite_araddr ),
+ .s_axi_arvalid ( s_axi4lite_arvalid ),
+ .s_axi_arready ( s_axi4lite_arready ),
+ .s_axi_rready ( s_axi4lite_rready ),
+ .s_axi_rdata ( s_axi4lite_rdata ),
+ .s_axi_rresp ( s_axi4lite_rresp ),
+ .s_axi_rvalid ( s_axi4lite_rvalid ),
+
+ // L1 miss info outputs -> L2 TLB arbitration
+ .int_miss ( rab_miss ),
+ .int_multi ( rab_multi ),
+ .int_prot ( rab_prot ),
+ .int_prefetch ( rab_prefetch ),
+ .int_mhf_full ( int_mhf_full ),
+
+ // L1 transaction info outputs -> L2 TLB arbitration
+ .int_axaddr_o ( L1OutAddr_D ),
+ .int_axid_o ( L1OutId_D ),
+ .int_axlen_o ( L1OutLen_D ),
+ .int_axuser_o ( L1OutUser_D ),
+
+ // Write Req IF
+ .port1_addr ( int_awaddr ),
+ .port1_id ( int_awid ),
+ .port1_len ( int_awlen ),
+ .port1_size ( int_awsize ),
+ .port1_addr_valid ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests
+ .port1_type ( {N_PORTS{1'b1}} ),
+ .port1_user ( int_awuser ),
+ .port1_sent ( int_wtrans_sent ), // signal done to L1 FSM
+ .port1_out_addr ( int_wtrans_addr ),
+ .port1_cache_coherent ( int_wtrans_cache_coherent ),
+ .port1_accept ( int_wtrans_accept ),
+ .port1_drop ( int_wtrans_drop ),
+ .port1_miss ( int_wtrans_miss ),
+
+ // Read Req IF
+ .port2_addr ( int_araddr ),
+ .port2_id ( int_arid ),
+ .port2_len ( int_arlen ),
+ .port2_size ( int_arsize ),
+ .port2_addr_valid ( int_arvalid ),
+ .port2_type ( {N_PORTS{1'b0}} ),
+ .port2_user ( int_aruser ),
+ .port2_sent ( int_rtrans_sent ), // signal done to L1 FSM
+ .port2_out_addr ( int_rtrans_addr ),
+ .port2_cache_coherent ( int_rtrans_cache_coherent ),
+ .port2_accept ( int_rtrans_accept ),
+ .port2_drop ( int_rtrans_drop ),
+ .port2_miss ( int_rtrans_miss ),
+
+ // L2 miss info inputs -> axi_rab_cfg
+ .miss_l2_i ( L2Miss_S ),
+ .miss_l2_addr_i ( L2OutInAddr_DP ),
+ .miss_l2_id_i ( L2OutId_DP ),
+ .miss_l2_user_i ( L2OutUser_DP ),
+
+ // L2 config outputs
+ .wdata_l2_o ( L2CfgWData_D ),
+ .waddr_l2_o ( L2CfgWAddr_D ),
+ .wren_l2_o ( L2CfgWE_S )
+ );
+
+ // }}}
+
+ // AX SPLITS {{{
+ // █████╗ ██╗ ██╗ ███████╗██████╗ ██╗ ██╗████████╗
+ // ██╔══██╗╚██╗██╔╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝
+ // ███████║ ╚███╔╝ ███████╗██████╔╝██║ ██║ ██║
+ // ██╔══██║ ██╔██╗ ╚════██║██╔═══╝ ██║ ██║ ██║
+ // ██║ ██║██╔╝ ██╗ ███████║██║ ███████╗██║ ██║
+ // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝
+ //
+ /**
+ * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels.
+ *
+ * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or
+ * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be
+ * saved until the L2 outputs are available.
+ */
+ generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT
+
+ /*
+ * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
+ * be performed on any one of the two masters. Save requests must be performed by both masters.
+ */
+ always_comb begin : AW_L1_SPLIT
+
+ // TLB handshake
+ l1_m0_aw_accept[i] = 1'b0;
+ l1_m1_aw_accept[i] = 1'b0;
+ l1_m0_aw_drop[i] = 1'b0;
+ l1_m1_aw_drop[i] = 1'b0;
+ l1_m0_aw_save[i] = 1'b0;
+ l1_m1_aw_save[i] = 1'b0;
+
+ l1_mx_aw_done[i] = 1'b0;
+
+ // AXI sender input handshake
+ int_m0_awvalid[i] = 1'b0;
+ int_m1_awvalid[i] = 1'b0;
+ int_awready[i] = 1'b0;
+
+ // accept on selected master only
+ if (l1_aw_accept[i]) begin
+ if (int_wmaster_select[i]) begin
+ l1_m1_aw_accept[i] = 1'b1;
+ l1_mx_aw_done[i] = l1_m1_aw_done[i];
+
+ int_m1_awvalid[i] = int_awvalid[i];
+ int_awready[i] = int_m1_awready[i];
+
+ end else begin
+ l1_m0_aw_accept[i] = 1'b1;
+ l1_mx_aw_done[i] = l1_m0_aw_done[i];
+
+ int_m0_awvalid[i] = int_awvalid[i];
+ int_awready[i] = int_m0_awready[i];
+ end
+
+ // drop on Master 0 only
+ end else if (l1_aw_drop[i]) begin
+ l1_m0_aw_drop[i] = 1'b1;
+ l1_mx_aw_done[i] = l1_m0_aw_done[i];
+
+ int_m0_awvalid[i] = int_awvalid[i];
+ int_awready[i] = l1_m0_aw_done[i];
+
+ // save on both masters
+ end else if (l1_aw_save[i]) begin
+ // split save
+ l1_m0_aw_save[i] = ~l1_m0_aw_done_SP[i];
+ l1_m1_aw_save[i] = ~l1_m1_aw_done_SP[i];
+
+ // combine done
+ l1_mx_aw_done[i] = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i];
+
+ int_m0_awvalid[i] = int_awvalid[i];
+ int_m1_awvalid[i] = int_awvalid[i];
+ int_awready[i] = l1_mx_aw_done[i];
+ end
+ end
+
+ // signal back to handshake splitter
+ assign l1_aw_done[i] = l1_mx_aw_done[i];
+
+ always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG
+ if (Rst_RBI == 0) begin
+ l1_m0_aw_done_SP[i] <= 1'b0;
+ l1_m1_aw_done_SP[i] <= 1'b0;
+ end else if (l1_mx_aw_done[i]) begin
+ l1_m0_aw_done_SP[i] <= 1'b0;
+ l1_m1_aw_done_SP[i] <= 1'b0;
+ end else begin
+ l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i];
+ l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i];
+ end
+ end
+
+ /*
+ * When accepting L2 transactions, we must drop the corresponding transaction from the other
+ * master to make it available again for save requests from L1_DROP_SAVE.
+ */
+ always_comb begin : AW_L2_SPLIT
+
+ l2_m0_aw_accept[i] = 1'b0;
+ l2_m1_aw_accept[i] = 1'b0;
+ l2_m0_aw_drop[i] = 1'b0;
+ l2_m1_aw_drop[i] = 1'b0;
+
+ // de-assert request signals individually upon handshakes
+ if (l2_aw_accept[i]) begin
+ if (l2_master_select[i]) begin
+ l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i];
+ l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i];
+
+ end else begin
+ l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i];
+ l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i];
+
+ end
+ end else begin
+ l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
+ l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
+
+ end
+
+ // combine done
+ l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i];
+
+ l2_aw_done[i] = l2_mx_aw_done[i];
+ end
+
+ always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG
+ if (Rst_RBI == 0) begin
+ l2_m0_aw_done_SP[i] <= 1'b0;
+ l2_m1_aw_done_SP[i] <= 1'b0;
+ end else if (l2_mx_aw_done[i]) begin
+ l2_m0_aw_done_SP[i] <= 1'b0;
+ l2_m1_aw_done_SP[i] <= 1'b0;
+ end else begin
+ l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i];
+ l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i];
+ end
+ end
+
+ /*
+ * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
+ * be performed on any one of the two masters. Save requests must be performed by both masters.
+ */
+ always_comb begin : AR_L1_SPLIT
+
+ // TLB handshake
+ l1_m0_ar_accept[i] = 1'b0;
+ l1_m1_ar_accept[i] = 1'b0;
+ l1_m0_ar_drop[i] = 1'b0;
+ l1_m1_ar_drop[i] = 1'b0;
+ l1_m0_ar_save[i] = 1'b0;
+ l1_m1_ar_save[i] = 1'b0;
+
+ l1_mx_ar_done[i] = 1'b0;
+
+ // AXI sender input handshake
+ int_m0_arvalid[i] = 1'b0;
+ int_m1_arvalid[i] = 1'b0;
+ int_arready[i] = 1'b0;
+
+ // accept on selected master only
+ if (l1_ar_accept[i]) begin
+ if (int_rmaster_select[i]) begin
+ l1_m1_ar_accept[i] = 1'b1;
+ l1_mx_ar_done[i] = l1_m1_ar_done[i];
+
+ int_m1_arvalid[i] = int_arvalid[i];
+ int_arready[i] = int_m1_arready[i];
+
+ end else begin
+ l1_m0_ar_accept[i] = 1'b1;
+ l1_mx_ar_done[i] = l1_m0_ar_done[i];
+
+ int_m0_arvalid[i] = int_arvalid[i];
+ int_arready[i] = int_m0_arready[i];
+ end
+
+ // drop on Master 0 only
+ end else if (l1_ar_drop[i]) begin
+ l1_m0_ar_drop[i] = 1'b1;
+ l1_mx_ar_done[i] = l1_m0_ar_done[i];
+
+ int_m0_arvalid[i] = int_arvalid[i];
+ int_arready[i] = l1_m0_ar_done[i];
+
+ // save on both masters
+ end else if (l1_ar_save[i]) begin
+ // split save
+ l1_m0_ar_save[i] = ~l1_m0_ar_done_SP[i];
+ l1_m1_ar_save[i] = ~l1_m1_ar_done_SP[i];
+
+ // combine done
+ l1_mx_ar_done[i] = l1_m0_ar_done_SP[i] & l1_m1_ar_done_SP[i];
+
+ int_m0_arvalid[i] = int_arvalid[i];
+ int_m1_arvalid[i] = int_arvalid[i];
+ int_arready[i] = l1_mx_ar_done[i];
+ end
+ end
+
+ // signal back to handshake splitter
+ assign l1_ar_done[i] = l1_mx_ar_done[i];
+
+ always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG
+ if (Rst_RBI == 0) begin
+ l1_m0_ar_done_SP[i] <= 1'b0;
+ l1_m1_ar_done_SP[i] <= 1'b0;
+ end else if (l1_mx_ar_done[i]) begin
+ l1_m0_ar_done_SP[i] <= 1'b0;
+ l1_m1_ar_done_SP[i] <= 1'b0;
+ end else begin
+ l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i];
+ l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i];
+ end
+ end
+
+ /*
+ * When accepting L2 transactions, we must drop the corresponding transaction from the other
+ * master to make it available again for save requests from L1_DROP_SAVE.
+ */
+ always_comb begin : AR_L2_SPLIT
+
+ l2_m0_ar_accept[i] = 1'b0;
+ l2_m1_ar_accept[i] = 1'b0;
+ l2_m0_ar_drop[i] = 1'b0;
+ l2_m1_ar_drop[i] = 1'b0;
+
+ // de-assert request signals individually upon handshakes
+ if (l2_ar_accept[i]) begin
+ if (l2_master_select[i]) begin
+ l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i];
+ l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i];
+
+ end else begin
+ l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i];
+ l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i];
+
+ end
+ end else if (l2_ar_drop[i]) begin
+ l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
+ l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
+
+ end
+
+ // combine done
+ l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i];
+
+ l2_ar_done[i] = l2_mx_ar_done[i];
+ end
+
+ always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG
+ if (Rst_RBI == 0) begin
+ l2_m0_ar_done_SP[i] <= 1'b0;
+ l2_m1_ar_done_SP[i] <= 1'b0;
+ end else if (l2_mx_ar_done[i]) begin
+ l2_m0_ar_done_SP[i] <= 1'b0;
+ l2_m1_ar_done_SP[i] <= 1'b0;
+ end else begin
+ l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i];
+ l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i];
+ end
+ end
+
+ end // AX_SPLIT
+ endgenerate // AX_SPLIT
+
+ // }}}
+
+ // HANDSHAKE SPLITS {{{
+ // ██╗ ██╗███████╗ ███████╗██████╗ ██╗ ██╗████████╗
+ // ██║ ██║██╔════╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝
+ // ███████║███████╗ ███████╗██████╔╝██║ ██║ ██║
+ // ██╔══██║╚════██║ ╚════██║██╔═══╝ ██║ ██║ ██║
+ // ██║ ██║███████║ ███████║██║ ███████╗██║ ██║
+ // ╚═╝ ╚═╝╚══════╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝
+ //
+ /*
+ * We need to perform combined handshakes with multiple AXI modules
+ * upon transactions drops, accepts, saves etc. from two TLBs.
+ */
+ generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT
+
+ assign l1_xw_accept[i] = int_wtrans_accept[i] & ~aw_out_stall[i];
+ assign int_wtrans_sent[i] = l1_xw_done[i];
+
+ assign l1_ar_accept[i] = int_rtrans_accept[i];
+ assign int_rtrans_sent[i] = l1_ar_done[i];
+
+ /*
+ * L1 AW sender + W buffer handshake split
+ */
+ // forward
+ assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i];
+ assign l1_w_accept[i] = l1_xw_accept[i] & ~l1_w_done_SP[i];
+
+ assign l1_aw_save[i] = l1_xw_save[i] & ~l1_aw_done_SP[i];
+ assign l1_w_save[i] = l1_xw_save[i] & ~l1_w_done_SP[i];
+
+ assign l1_aw_drop[i] = l1_xw_drop[i] & ~l1_aw_done_SP[i];
+ assign l1_w_drop[i] = l1_xw_drop[i] & ~l1_w_done_SP[i];
+
+ // backward
+ assign l1_xw_done[i] = l1_aw_done_SP[i] & l1_w_done_SP[i];
+
+ always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT
+ if (Rst_RBI == 0) begin
+ l1_aw_done_SP[i] <= 1'b0;
+ l1_w_done_SP[i] <= 1'b0;
+ end else if (l1_xw_done[i]) begin
+ l1_aw_done_SP[i] <= 1'b0;
+ l1_w_done_SP[i] <= 1'b0;
+ end else begin
+ l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i];
+ l1_w_done_SP[i] <= l1_w_done_SP[i] | l1_w_done[i];
+ end
+ end
+
+ if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT
+
+ /*
+ * L1 AR sender + R sender handshake split
+ *
+ * AR and R do not need to be strictly in sync. We thus use separate handshakes.
+ * But the handshake signals for the R sender are multiplexed with the those for
+ * the L2. However, L2_ACCEPT_DROP_SAVE has always higher priority.
+ */
+ assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i];
+ assign l1_r_done[i] = l2_r_drop[i] ? 1'b0 : lx_r_done[i];
+ assign l2_r_done[i] = l2_r_drop[i] ? lx_r_done[i] : 1'b0;
+
+ /*
+ * L2 AW sender + W buffer handshake split
+ */
+ // forward
+ assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i];
+ assign l2_w_accept[i] = l2_xw_accept[i] & ~l2_w_done_SP[i];
+
+ assign l2_aw_drop[i] = l2_xw_drop[i] & ~l2_aw_done_SP[i];
+ assign l2_w_drop[i] = l2_xw_drop[i] & ~l2_w_done_SP[i];
+
+ // backward
+ assign l2_xw_done[i] = l2_aw_done_SP[i] & l2_w_done_SP[i];
+
+ always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT
+ if (Rst_RBI == 0) begin
+ l2_aw_done_SP[i] <= 1'b0;
+ l2_w_done_SP[i] <= 1'b0;
+ end else if (l2_xw_done[i]) begin
+ l2_aw_done_SP[i] <= 1'b0;
+ l2_w_done_SP[i] <= 1'b0;
+ end else begin
+ l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i];
+ l2_w_done_SP[i] <= l2_w_done_SP[i] | l2_w_done[i];
+ end
+ end
+
+ /*
+ * L2 AR + R sender handshake split
+ */
+ // forward
+ assign l2_ar_drop[i] = l2_xr_drop[i] & ~l2_ar_done_SP[i];
+ assign l2_r_drop[i] = l2_xr_drop[i] & ~l2_r_done_SP[i];
+
+ // backward - make sure to always clear L2_XR_HS_SPLIT
+ always_comb begin
+ if (l2_xr_drop[i]) begin
+ l2_xr_done[i] = l2_ar_done_SP[i] & l2_r_done_SP[i];
+ end else begin
+ l2_xr_done[i] = l2_ar_done_SP[i];
+ end
+ end
+
+ always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT
+ if (Rst_RBI == 0) begin
+ l2_ar_done_SP[i] <= 1'b0;
+ l2_r_done_SP[i] <= 1'b0;
+ end else if (l2_xr_done[i]) begin
+ l2_ar_done_SP[i] <= 1'b0;
+ l2_r_done_SP[i] <= 1'b0;
+ end else begin
+ l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i];
+ l2_r_done_SP[i] <= l2_r_done_SP[i] | l2_r_done[i];
+ end
+ end
+
+ end else begin // if (ENABLE_L2TLB[i] == 1)
+
+ assign lx_r_drop[i] = l1_r_drop[i];
+ assign l1_r_done[i] = lx_r_done[i];
+
+ assign l2_aw_accept[i] = 1'b0;
+ assign l2_w_accept[i] = 1'b0;
+ assign l2_aw_drop[i] = 1'b0;
+ assign l2_w_drop[i] = 1'b0;
+ assign l2_xw_done[i] = 1'b0;
+ assign l2_aw_done_SP[i] = 1'b0;
+ assign l2_w_done_SP[i] = 1'b0;
+
+ assign l2_ar_accept[i] = 1'b0;
+ assign l2_ar_drop[i] = 1'b0;
+ assign l2_r_drop[i] = 1'b0;
+ assign l2_xr_done[i] = 1'b0;
+ assign l2_r_done[i] = 1'b0;
+ assign l2_ar_done_SP[i] = 1'b0;
+ assign l2_r_done_SP[i] = 1'b0;
+
+ end // if (ENABLE_L2TLB[i] == 1)
+
+ end // HANDSHAKE_SPLIT
+ endgenerate // HANDSHAKE_SPLIT
+
+ // }}}
+
+ // L2 TLB {{{
+ // ██╗ ██████╗ ████████╗██╗ ██████╗
+ // ██║ ╚════██╗ ╚══██╔══╝██║ ██╔══██╗
+ // ██║ █████╔╝ ██║ ██║ ██████╔╝
+ // ██║ ██╔═══╝ ██║ ██║ ██╔══██╗
+ // ███████╗███████╗ ██║ ███████╗██████╔╝
+ // ╚══════╝╚══════╝ ╚═╝ ╚══════╝╚═════╝
+ //
+ /*
+ * l2_tlb
+ *
+ * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core).
+ *
+ * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy,
+ * the L1 is stalled untill the L2 is available again.
+ *
+ */
+ generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB
+ if (ENABLE_L2TLB[i] == 1) begin : L2_TLB
+
+ /*
+ * L1 output selector
+ */
+ assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0;
+ assign L1OutProt_D[i] = rab_prot[i];
+ assign L1OutMulti_D[i] = rab_multi[i];
+
+ /*
+ * L1 output control + L1_DROP_BUF, L2_IN_BUF management
+ *
+ * Forward the L1 drop request to AR/AW sender modules if
+ * 1. the transactions needs to be dropped (L1 multi, prot, prefetch), or
+ * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full.
+ *
+ * The AR/AW senders do not support more than 1 oustanding L1 miss. The push back towards
+ * the upstream is realized by not accepting the save request (saving the L1 transaction)
+ * in the senders as long as the L2 TLB is busy or has valid output. This ultimately
+ * blocks the L1 TLB.
+ *
+ * Together with the AW drop/save, we also perform the W drop/save as AW and W need to
+ * absolutely remain in order. In contrast, the R drop is performed
+ */
+ always_comb begin : L1_DROP_SAVE
+
+ l1_ar_drop[i] = 1'b0;
+ l1_ar_save[i] = 1'b0;
+ l1_xw_drop[i] = 1'b0;
+ l1_xw_save[i] = 1'b0;
+
+ l1_id_drop[i] = L1OutId_D[i];
+ l1_len_drop[i] = L1OutLen_D[i];
+ l1_prefetch_drop[i] = rab_prefetch[i];
+ l1_hit_drop[i] = 1'b1; // there are no drops for L1 misses
+
+ L1DropEn_S[i] = 1'b0;
+ L2InEn_S[i] = 1'b0;
+
+ if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin
+ // 1. Drop
+ l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i];
+ l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i];
+
+ // Store to L1_DROP_BUF upon handshake
+ L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) |
+ (l1_xw_drop[i] & l1_xw_done[i]);
+
+ end else if ( rab_miss[i] ) begin
+ // 2. Save - Make sure L2 is really available.
+ l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i];
+ l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i];
+
+ // Store to L2_IN_BUF upon handshake - triggers the L2 TLB
+ L2InEn_S[i] = (l1_ar_save[i] & l1_ar_done[i]) |
+ (l1_xw_save[i] & l1_xw_done[i]);
+ end
+ end
+
+ /*
+ * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control
+ *
+ * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs
+ * require the B response to be sent only after consuming/discarding the corresponding data
+ * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop
+ * request to the B sender is then sent by the W buffer autonomously.
+ *
+ * L1 AW/W drop requests are managed by L1_DROP_SAVE.
+ */
+ always_comb begin : L2_ACCEPT_DROP_SAVE
+
+ l2_ar_addr[i] = 'b0;
+ l2_aw_addr[i] = 'b0;
+ l2_ar_accept[i] = 1'b0;
+ l2_xr_drop[i] = 1'b0;
+ l2_xw_accept[i] = 1'b0;
+ l2_xw_drop[i] = 1'b0;
+
+ l1_r_drop[i] = 1'b0;
+
+ lx_id_drop[i] = 'b0;
+ lx_len_drop[i] = 'b0;
+ lx_prefetch_drop[i] = 1'b0;
+ lx_hit_drop[i] = 1'b0;
+
+ L1DropValid_SN[i] = L1DropValid_SP[i] | L1DropEn_S[i];
+ L2OutValid_SN[i] = L2OutValid_SP[i];
+ L2OutReady_S[i] = 1'b0;
+ L2OutEn_S[i] = 1'b0;
+
+ L2Miss_S[i] = 1'b0;
+ int_multi[i] = 1'b0;
+ int_prot[i] = 1'b0;
+
+ if (L2OutValid_SP[i] == 1'b0) begin
+
+ // Drop L1 from R senders
+ if (L1DropValid_SP[i] == 1'b1) begin
+
+ // Only perform the R sender drop here.
+ if (~L1DropRwType_DP[i]) begin
+
+ l1_r_drop[i] = 1'b1;
+ lx_id_drop[i] = L1DropId_DP[i];
+ lx_len_drop[i] = L1DropLen_DP[i];
+ lx_prefetch_drop[i] = L1DropPrefetch_S[i];
+ lx_hit_drop[i] = 1'b1; // there are no drops for L1 misses
+
+ // Invalidate L1_DROP_BUF upon handshake
+ if ( l1_r_drop[i] & l1_r_done[i] ) begin
+
+ L1DropValid_SN[i] = 1'b0;
+ int_prot[i] = L1DropProt_DP[i];
+ int_multi[i] = L1DropMulti_DP[i];
+ end
+
+ end else begin
+ // Invalidate L1_DROP_BUF
+ L1DropValid_SN[i] = 1'b0;
+ int_prot[i] = L1DropProt_DP[i];
+ int_multi[i] = L1DropMulti_DP[i];
+ end
+ end
+
+ end else begin // L2_OUT_BUF has valid data
+
+ if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin
+
+ l2_ar_addr[i] = L2OutAddr_DP[i];
+ l2_aw_addr[i] = L2OutAddr_DP[i];
+
+ l2_ar_accept[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
+ l2_xw_accept[i] = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
+
+ // Invalidate L2_OUT_BUF upon handshake
+ L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) |
+ (l2_xw_accept[i] & l2_xw_done[i]) );
+ end else begin
+
+ lx_id_drop[i] = L2OutId_DP[i];
+ lx_len_drop[i] = L2OutLen_DP[i];
+ lx_prefetch_drop[i] = L2OutPrefetch_S[i];
+ lx_hit_drop[i] = L2OutHit_SP[i];
+
+ // The l2_xr_drop will also perform the handshake with the R sender
+ l2_xr_drop[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
+ l2_xw_drop[i] = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
+
+ // Invalidate L1_DROP_BUF upon handshake
+ if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin
+
+ L2OutValid_SN[i] = 1'b0;
+ L2Miss_S[i] = ~L2OutHit_SP[i];
+ int_prot[i] = L2OutProt_SP[i];
+ int_multi[i] = L2OutMulti_SP[i];
+ end
+ end
+ end
+
+ // Only accept new L2 output after ongoing drops have finished.
+ if ( (l2_xr_drop[i] == l2_xr_done[i]) &
+ (l2_xw_drop[i] == l2_xw_done[i]) &
+ (l1_r_drop[i] == l1_r_done[i] ) ) begin
+ // Store to L2_OUT_BUF upon handshake with L2 TLB module
+ if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin
+ L2OutValid_SN[i] = 1'b1;
+ L2OutReady_S[i] = 1'b1;
+ L2OutEn_S[i] = 1'b1;
+ end
+ end
+ end
+
+ /*
+ * L1 drop buffer
+ *
+ * Used in case of multi, prot and prefetch hits in the L1 TLB.
+ */
+ always_ff @(posedge Clk_CI) begin : L1_DROP_BUF
+ if (Rst_RBI == 0) begin
+ L1DropProt_DP[i] <= 1'b0;
+ L1DropMulti_DP[i] <= 1'b0;
+ L1DropRwType_DP[i] <= 1'b0;
+ L1DropUser_DP[i] <= 'b0;
+ L1DropId_DP[i] <= 'b0;
+ L1DropLen_DP[i] <= 'b0;
+ L1DropAddr_DP[i] <= 'b0;
+ end else if (L1DropEn_S[i] == 1'b1) begin
+ L1DropProt_DP[i] <= L1OutProt_D[i] ;
+ L1DropMulti_DP[i] <= L1OutMulti_D[i] ;
+ L1DropRwType_DP[i] <= L1OutRwType_D[i];
+ L1DropUser_DP[i] <= L1OutUser_D[i] ;
+ L1DropId_DP[i] <= L1OutId_D[i] ;
+ L1DropLen_DP[i] <= L1OutLen_D[i] ;
+ L1DropAddr_DP[i] <= L1OutAddr_D[i] ;
+ end
+ end // always_ff @ (posedge Clk_CI)
+
+ /*
+ * L2 input buffer
+ *
+ * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
+ */
+ always_ff @(posedge Clk_CI) begin : L2_IN_BUF
+ if (Rst_RBI == 0) begin
+ L2InRwType_DP[i] <= 1'b0;
+ L2InUser_DP[i] <= 'b0;
+ L2InId_DP[i] <= 'b0;
+ L2InLen_DP[i] <= 'b0;
+ L2InAddr_DP[i] <= 'b0;
+ end else if (L2InEn_S[i] == 1'b1) begin
+ L2InRwType_DP[i] <= L1OutRwType_D[i];
+ L2InUser_DP[i] <= L1OutUser_D[i] ;
+ L2InId_DP[i] <= L1OutId_D[i] ;
+ L2InLen_DP[i] <= L1OutLen_D[i] ;
+ L2InAddr_DP[i] <= L1OutAddr_D[i] ;
+ end
+ end // always_ff @ (posedge Clk_CI)
+
+ l2_tlb
+ #(
+ .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
+ .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+ .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
+ .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
+ .N_SETS ( `RAB_L2_N_SETS ),
+ .N_OFFSETS ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS ),
+ .N_PAR_VA_RAMS ( `RAB_L2_N_PAR_VA_RAMS ),
+ .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) )
+ )
+ u_l2_tlb
+ (
+ .clk_i ( Clk_CI ),
+ .rst_ni ( Rst_RBI ),
+
+ // Config inputs
+ .we_i ( L2CfgWE_S[i] ),
+ .waddr_i ( L2CfgWAddr_D[i] ),
+ .wdata_i ( L2CfgWData_D[i] ),
+
+ // Request input
+ .start_i ( L2InEn_S[i] ),
+ .busy_o ( L2Busy_S[i] ),
+ .rw_type_i ( L2InRwType_DP[i] ),
+ .in_addr_i ( L2InAddr_DP[i] ),
+
+ // Response output
+ .out_ready_i ( L2OutReady_S[i] ),
+ .out_valid_o ( L2OutValid_S[i] ),
+ .hit_o ( L2OutHit_SN[i] ),
+ .miss_o ( L2OutMiss_SN[i] ),
+ .prot_o ( L2OutProt_SN[i] ),
+ .multi_o ( L2OutMulti_SN[i] ),
+ .cache_coherent_o ( L2OutCC_SN[i] ),
+ .out_addr_o ( L2OutAddr_DN[i] )
+ );
+
+ /*
+ * L2 output buffer
+ *
+ * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
+ */
+ always_ff @(posedge Clk_CI) begin : L2_OUT_BUF
+ if (Rst_RBI == 0) begin
+ L2OutRwType_DP[i] <= 1'b0;
+ L2OutUser_DP[i] <= 'b0;
+ L2OutLen_DP[i] <= 'b0;
+ L2OutId_DP[i] <= 'b0;
+ L2OutInAddr_DP[i] <= 'b0;
+
+ L2OutHit_SP[i] <= 1'b0;
+ L2OutMiss_SP[i] <= 1'b0;
+ L2OutProt_SP[i] <= 1'b0;
+ L2OutMulti_SP[i] <= 1'b0;
+ L2OutCC_SP[i] <= 1'b0;
+ L2OutAddr_DP[i] <= 'b0;
+ end else if (L2OutEn_S[i] == 1'b1) begin
+ L2OutRwType_DP[i] <= L2InRwType_DP[i];
+ L2OutUser_DP[i] <= L2InUser_DP[i] ;
+ L2OutLen_DP[i] <= L2InLen_DP[i] ;
+ L2OutId_DP[i] <= L2InId_DP[i] ;
+ L2OutInAddr_DP[i] <= L2InAddr_DP[i] ;
+
+ L2OutHit_SP[i] <= L2OutHit_SN[i] ;
+ L2OutMiss_SP[i] <= L2OutMiss_SN[i] ;
+ L2OutProt_SP[i] <= L2OutProt_SN[i] ;
+ L2OutMulti_SP[i] <= L2OutMulti_SN[i];
+ L2OutCC_SP[i] <= L2OutCC_SN[i] ;
+ L2OutAddr_DP[i] <= L2OutAddr_DN[i] ;
+ end
+ end // always_ff @ (posedge Clk_CI)
+
+ always_ff @(posedge Clk_CI) begin : BUF_VALID
+ if (Rst_RBI == 0) begin
+ L1DropValid_SP[i] = 1'b0;
+ L2OutValid_SP[i] = 1'b0;
+ end else begin
+ L1DropValid_SP[i] = L1DropValid_SN[i];
+ L2OutValid_SP[i] = L2OutValid_SN[i];
+ end
+ end
+
+ always_comb begin : BUF_TO_PREFETCH
+ // L1 Drop Buf
+ if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
+ L1DropPrefetch_S[i] = 1'b1;
+ else
+ L1DropPrefetch_S[i] = 1'b0;
+
+ // L2 Out Buf
+ if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
+ L2OutPrefetch_S[i] = 1'b1;
+ else
+ L2OutPrefetch_S[i] = 1'b0;
+ end
+
+ assign l2_cache_coherent[i] = L2OutCC_SP[i];
+ assign int_miss[i] = L2Miss_S[i];
+
+ end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1)
+
+ assign l1_ar_drop[i] = int_rtrans_drop[i];
+ assign l1_r_drop[i] = int_rtrans_drop[i];
+ assign l1_xw_drop[i] = int_wtrans_drop[i];
+
+ assign l1_ar_save[i] = 1'b0;
+ assign l1_xw_save[i] = 1'b0;
+ assign l2_xw_accept[i] = 1'b0;
+ assign l2_xr_drop[i] = 1'b0;
+ assign l2_xw_drop[i] = 1'b0;
+
+ assign l2_ar_addr[i] = 'b0;
+ assign l2_aw_addr[i] = 'b0;
+
+ assign l1_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] :
+ int_rtrans_drop[i] ? int_arid[i] :
+ '0;
+ assign l1_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] :
+ int_rtrans_drop[i] ? int_arlen[i] :
+ '0;
+ assign l1_prefetch_drop[i] = rab_prefetch[i];
+ assign l1_hit_drop[i] = ~rab_miss[i];
+
+ assign lx_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] :
+ int_rtrans_drop[i] ? int_arid[i] :
+ '0;
+ assign lx_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] :
+ int_rtrans_drop[i] ? int_arlen[i] :
+ '0;
+ assign lx_prefetch_drop[i] = rab_prefetch[i];
+ assign lx_hit_drop[i] = ~rab_miss[i];
+
+ assign l2_cache_coherent[i] = 1'b0;
+
+ assign int_miss[i] = rab_miss[i];
+ assign int_prot[i] = rab_prot[i];
+ assign int_multi[i] = rab_multi[i];
+
+ // unused signals
+ assign L2Miss_S[i] = 1'b0;
+
+ assign L1OutRwType_D[i] = 1'b0;
+ assign L1OutProt_D[i] = 1'b0;
+ assign L1OutMulti_D[i] = 1'b0;
+
+ assign L1DropRwType_DP[i] = 1'b0;
+ assign L1DropUser_DP[i] = 'b0;
+ assign L1DropId_DP[i] = 'b0;
+ assign L1DropLen_DP[i] = 'b0;
+ assign L1DropAddr_DP[i] = 'b0;
+ assign L1DropProt_DP[i] = 1'b0;
+ assign L1DropMulti_DP[i] = 1'b0;
+
+ assign L1DropEn_S[i] = 1'b0;
+ assign L1DropPrefetch_S[i] = 1'b0;
+ assign L1DropValid_SN[i] = 1'b0;
+ assign L1DropValid_SP[i] = 1'b0;
+
+ assign L2InRwType_DP[i] = 1'b0;
+ assign L2InUser_DP[i] = 'b0;
+ assign L2InId_DP[i] = 'b0;
+ assign L2InLen_DP[i] = 'b0;
+ assign L2InAddr_DP[i] = 'b0;
+
+ assign L2InEn_S[i] = 1'b0;
+
+ assign L2OutHit_SN[i] = 1'b0;
+ assign L2OutMiss_SN[i] = 1'b0;
+ assign L2OutProt_SN[i] = 1'b0;
+ assign L2OutMulti_SN[i] = 1'b0;
+ assign L2OutCC_SN[i] = 1'b0;
+ assign L2OutAddr_DN[i] = 'b0;
+
+ assign L2OutRwType_DP[i] = 1'b0;
+ assign L2OutUser_DP[i] = 'b0;
+ assign L2OutId_DP[i] = 'b0;
+ assign L2OutLen_DP[i] = 'b0;
+ assign L2OutInAddr_DP[i] = 'b0;
+ assign L2OutHit_SP[i] = 1'b0;
+ assign L2OutMiss_SP[i] = 1'b0;
+ assign L2OutProt_SP[i] = 1'b0;
+ assign L2OutMulti_SP[i] = 1'b0;
+ assign L2OutCC_SP[i] = 1'b0;
+ assign L2OutAddr_DP[i] = 'b0;
+
+ assign L2OutEn_S[i] = 1'b0;
+ assign L2OutPrefetch_S[i] = 1'b0;
+ assign L2Busy_S[i] = 1'b0;
+ assign L2OutValid_S[i] = 1'b0;
+ assign L2OutValid_SN[i] = 1'b0;
+ assign L2OutValid_SP[i] = 1'b0;
+ assign L2OutReady_S[i] = 1'b0;
+
+ end // !`ifdef ENABLE_L2TLB
+ end // for (i = 0; i < N_PORTS; i++)
+ endgenerate
+
+// }}}
+"""
+# endmodule
+#
+#
+# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class check_ram(Elaboratable):
+
+ def __init__(self):
+ self.clk_i = Signal() # input
+ self.rst_ni = Signal() # input
+ self.in_addr = Signal(ADDR_WIDTH) # input
+ self.rw_type = Signal() # input
+ self.ram_we = Signal() # input
+ self.port0_addr = Signal(1+ERROR p_expression_25) # input
+ self.port1_addr = Signal(1+ERROR p_expression_25) # input
+ self.ram_wdata = Signal(RAM_DATA_WIDTH) # input
+ self.output_sent = Signal() # input
+ self.output_valid = Signal() # input
+ self.offset_addr_d = Signal(OFFSET_WIDTH) # input
+ self.hit_addr = Signal(1+ERROR p_expression_25) # output
+ self.master = Signal() # output
+ self.hit = Signal() # output
+ self.multi_hit = Signal() # output
+ self.prot = Signal() # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# //`define MULTI_HIT_FULL_SET
+#
+# module check_ram
+# //#(
+# // parameter ADDR_WIDTH = 32,
+# // parameter RAM_DATA_WIDTH = 32,
+# // parameter PAGE_SIZE = 4096, // 4kB
+# // parameter SET_WIDTH = 5,
+# // parameter OFFSET_WIDTH = 4
+# // )
+# (
+# input logic clk_i,
+# input logic rst_ni,
+# input logic [ADDR_WIDTH-1:0] in_addr,
+# input logic rw_type, // 1 => write, 0=> read
+# input logic ram_we,
+# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr,
+# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr,
+# input logic [RAM_DATA_WIDTH-1:0] ram_wdata,
+# input logic output_sent,
+# input logic output_valid,
+# input logic [OFFSET_WIDTH-1:0] offset_addr_d,
+# output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr,
+# output logic master,
+# output logic hit,
+# output logic multi_hit,
+# output logic prot
+# );
+#
+""" #docstring_begin
+
+ localparam IGNORE_LSB = log2(PAGE_SIZE); // 12
+
+ logic [RAM_DATA_WIDTH-1:0] port0_data_o, port1_data_o; // RAM read data outputs
+ logic port0_hit, port1_hit; // Ram output matches in_addr
+
+ logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved;
+
+ // Hit FSM Signals
+ typedef enum logic {SEARCH, HIT} hit_state_t;
+ hit_state_t hit_SP; // Hit FSM state
+ hit_state_t hit_SN; // Hit FSM next state
+
+ // Multi Hit FSM signals
+`ifdef MULTI_HIT_FULL_SET
+ typedef enum logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t;
+ multi_state_t multi_SP; // Multi Hit FSM state
+ multi_state_t multi_SN; // Multi Hit FSM next state
+
+ logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved;
+ logic master_saved;
+`endif
+
+ //// --------------- Block RAM (Dual Port) -------------- ////
+
+ // The outputs of the BRAMs are only valid if in the previous cycle:
+ // 1. the inputs were valid, and
+ // 2. the BRAM was not written to.
+ // Otherwise, the outputs must be ignored which is controlled by the output_valid signal.
+ // This signal is driven by the uppler level L2 TLB module.
+ ram_tp_no_change #(
+ .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ),
+ .DATA_WIDTH( RAM_DATA_WIDTH )
+ )
+ ram_tp_no_change_0
+ (
+ .clk ( clk_i ),
+ .we ( ram_we ),
+ .addr0 ( port0_addr ),
+ .addr1 ( port1_addr ),
+ .d_i ( ram_wdata ),
+ .d0_o ( port0_data_o ),
+ .d1_o ( port1_data_o )
+ );
+
+ //// Check Ram Outputs
+ assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]);
+ assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]);
+ //// ----------------------------------------------------- /////
+
+ //// ------------------- Check if Hit ------------------------ ////
+ // FSM
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ hit_SP <= SEARCH;
+ end else begin
+ hit_SP <= hit_SN;
+ end
+ end
+
+ always_ff @(posedge clk_i, negedge rst_ni) begin
+ if (!rst_ni) begin
+ port0_addr_saved <= '0;
+ port1_addr_saved <= '0;
+ end else begin
+ port0_addr_saved <= port0_addr;
+ port1_addr_saved <= port1_addr;
+ end
+ end
+
+ always_comb begin
+ hit_SN = hit_SP;
+ hit = 1'b0;
+ hit_addr = 0;
+ master = 1'b0;
+ unique case(hit_SP)
+ SEARCH :
+ if (output_valid)
+ if (port0_hit || port1_hit) begin
+ hit_SN = HIT;
+ hit = 1'b1;
+ hit_addr = port0_hit ? {port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
+ port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
+ 0;
+ master = port0_hit ? port0_data_o[3] :
+ port1_hit ? port1_data_o[3] :
+ 1'b0;
+ end
+
+ HIT : begin
+`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later.
+ hit = 1'b1;
+ hit_addr = hit_addr_saved;
+ master = master_saved;
+`endif
+ if (output_sent)
+ hit_SN = SEARCH;
+ end
+
+ default : begin
+ hit_SN = SEARCH;
+ end
+ endcase // case (hit_SP)
+ end // always_comb begin
+
+ //// ------------------------------------------- ////
+
+ assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) :
+ output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) :
+ 1'b0;
+
+ //// ------------------- Multi ------------------- ////
+`ifdef MULTI_HIT_FULL_SET
+
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ hit_addr_saved <= 0;
+ master_saved <= 1'b0;
+ end else if (output_valid) begin
+ hit_addr_saved <= hit_addr;
+ master_saved <= master;
+ end
+ end
+
+ // FSM
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ multi_SP <= NO_HITS;
+ end else begin
+ multi_SP <= multi_SN;
+ end
+ end
+
+ always_comb begin
+ multi_SN = multi_SP;
+ multi_hit = 1'b0;
+ unique case(multi_SP)
+ NO_HITS :
+ if(output_valid && (port0_hit && port1_hit)) begin
+ multi_SN = MULTI_HIT;
+ multi_hit = 1'b1;
+ end else if(output_valid && (port0_hit || port1_hit))
+ multi_SN = ONE_HIT;
+
+ ONE_HIT :
+ if(output_valid && (port0_hit || port1_hit)) begin
+ multi_SN = MULTI_HIT;
+ multi_hit = 1'b1;
+ end else if (output_sent)
+ multi_SN = NO_HITS;
+
+ MULTI_HIT : begin
+ multi_hit = 1'b1;
+ if (output_sent)
+ multi_SN = NO_HITS;
+ end
+
+ endcase // case (multi_SP)
+ end // always_comb begin
+
+`else // !`ifdef MULTI_HIT_FULL_SET
+ assign multi_hit = output_valid && port0_hit && port1_hit;
+`endif // !`ifdef MULTI_HIT_FULL_SET
+ //// ------------------------------------------- ////
+"""
+# endmodule
+#
+#
--- /dev/null
+class CoreConfig:
+ def __init__(self):
+ self.N_SLICES = 16
+ self.N_REGS = 4*self.N_SLICES
+ self.ADDR_WIDTH_PHYS = 40
+ self.ADDR_WIDTH_VIRT = 32
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class fsm(Elaboratable):
+
+ def __init__(self):
+ self.Clk_CI = Signal() # input
+ self.Rst_RBI = Signal() # input
+ self.port1_addr_valid_i = Signal() # input
+ self.port2_addr_valid_i = Signal() # input
+ self.port1_sent_i = Signal() # input
+ self.port2_sent_i = Signal() # input
+ self.select_i = Signal() # input
+ self.no_hit_i = Signal() # input
+ self.multi_hit_i = Signal() # input
+ self.no_prot_i = Signal() # input
+ self.prefetch_i = Signal() # input
+ self.out_addr_i = Signal(AXI_M_ADDR_WIDTH) # input
+ self.cache_coherent_i = Signal() # input
+ self.port1_accept_o = Signal() # output
+ self.port1_drop_o = Signal() # output
+ self.port1_miss_o = Signal() # output
+ self.port2_accept_o = Signal() # output
+ self.port2_drop_o = Signal() # output
+ self.port2_miss_o = Signal() # output
+ self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output
+ self.cache_coherent_o = Signal() # output
+ self.miss_o = Signal() # output
+ self.multi_o = Signal() # output
+ self.prot_o = Signal() # output
+ self.prefetch_o = Signal() # output
+ self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input
+ self.in_id_i = Signal(AXI_ID_WIDTH) # input
+ self.in_len_i = Signal(8) # input
+ self.in_user_i = Signal(AXI_USER_WIDTH) # input
+ self.in_addr_o = Signal(AXI_S_ADDR_WIDTH) # output
+ self.in_id_o = Signal(AXI_ID_WIDTH) # output
+ self.in_len_o = Signal(8) # output
+ self.in_user_o = Signal(AXI_USER_WIDTH) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //`timescale 1ns / 1ps
+#
+# module fsm
+# #(
+# parameter AXI_M_ADDR_WIDTH = 40,
+# parameter AXI_S_ADDR_WIDTH = 32,
+# parameter AXI_ID_WIDTH = 8,
+# parameter AXI_USER_WIDTH = 6
+# )
+# (
+# input logic Clk_CI,
+# input logic Rst_RBI,
+#
+# input logic port1_addr_valid_i,
+# input logic port2_addr_valid_i,
+# input logic port1_sent_i,
+# input logic port2_sent_i,
+# input logic select_i,
+# input logic no_hit_i,
+# input logic multi_hit_i,
+# input logic no_prot_i,
+# input logic prefetch_i,
+# input logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i,
+# input logic cache_coherent_i,
+# output logic port1_accept_o,
+# output logic port1_drop_o,
+# output logic port1_miss_o,
+# output logic port2_accept_o,
+# output logic port2_drop_o,
+# output logic port2_miss_o,
+# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o,
+# output logic cache_coherent_o,
+# output logic miss_o,
+# output logic multi_o,
+# output logic prot_o,
+# output logic prefetch_o,
+# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
+# input logic [AXI_ID_WIDTH-1:0] in_id_i,
+# input logic [7:0] in_len_i,
+# input logic [AXI_USER_WIDTH-1:0] in_user_i,
+# output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o,
+# output logic [AXI_ID_WIDTH-1:0] in_id_o,
+# output logic [7:0] in_len_o,
+# output logic [AXI_USER_WIDTH-1:0] in_user_o
+# );
+#
+""" #docstring_begin
+
+ //-------------Internal Signals----------------------
+
+ typedef enum logic {IDLE, WAIT} state_t;
+ logic state_SP; // Present state
+ logic state_SN; // Next State
+
+ logic port1_accept_SN;
+ logic port1_drop_SN;
+ logic port1_miss_SN;
+ logic port2_accept_SN;
+ logic port2_drop_SN;
+ logic port2_miss_SN;
+ logic miss_SN;
+ logic multi_SN;
+ logic prot_SN;
+ logic prefetch_SN;
+ logic cache_coherent_SN;
+ logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN;
+
+ logic out_reg_en_S;
+
+ //----------FSM comb------------------------------
+
+ always_comb begin: FSM_COMBO
+ state_SN = state_SP;
+
+ port1_accept_SN = 1'b0;
+ port1_drop_SN = 1'b0;
+ port1_miss_SN = 1'b0;
+ port2_accept_SN = 1'b0;
+ port2_drop_SN = 1'b0;
+ port2_miss_SN = 1'b0;
+ miss_SN = 1'b0;
+ multi_SN = 1'b0;
+ prot_SN = 1'b0;
+ prefetch_SN = 1'b0;
+ cache_coherent_SN = 1'b0;
+ out_addr_DN = '0;
+
+ out_reg_en_S = 1'b0; // by default hold register output
+
+ unique case(state_SP)
+ IDLE :
+ if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin
+ out_reg_en_S = 1'b1;
+ state_SN = WAIT;
+
+ // Select inputs for output registers
+ if (port1_addr_valid_i & select_i) begin
+ port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+ port1_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+ port1_miss_SN = no_hit_i;
+ port2_accept_SN = 1'b0;
+ port2_drop_SN = 1'b0;
+ port2_miss_SN = 1'b0;
+ end else if (port2_addr_valid_i & ~select_i) begin
+ port1_accept_SN = 1'b0;
+ port1_drop_SN = 1'b0;
+ port1_miss_SN = 1'b0;
+ port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+ port2_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+ port2_miss_SN = no_hit_i;
+ end
+
+ miss_SN = port1_miss_SN | port2_miss_SN;
+ multi_SN = multi_hit_i;
+ prot_SN = ~no_prot_i;
+ prefetch_SN = ~no_hit_i & prefetch_i;
+
+ cache_coherent_SN = cache_coherent_i;
+ out_addr_DN = out_addr_i;
+ end
+
+ WAIT :
+ if ( port1_sent_i | port2_sent_i ) begin
+ out_reg_en_S = 1'b1; // "clear" the register
+ state_SN = IDLE;
+ end
+
+ default : begin
+ state_SN = IDLE;
+ end
+ endcase
+ end
+
+ //----------FSM seq-------------------------------
+
+ always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ
+ if (Rst_RBI == 1'b0)
+ state_SP <= IDLE;
+ else
+ state_SP <= state_SN;
+ end
+
+ //----------Output seq--------------------------
+
+ always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ
+ if (Rst_RBI == 1'b0) begin
+ port1_accept_o = 1'b0;
+ port1_drop_o = 1'b0;
+ port1_miss_o = 1'b0;
+ port2_accept_o = 1'b0;
+ port2_drop_o = 1'b0;
+ port2_miss_o = 1'b0;
+ miss_o = 1'b0;
+ multi_o = 1'b0;
+ prot_o = 1'b0;
+ prefetch_o = 1'b0;
+ cache_coherent_o = 1'b0;
+ out_addr_o = '0;
+ in_addr_o = '0;
+ in_id_o = '0;
+ in_len_o = '0;
+ in_user_o = '0;
+ end else if (out_reg_en_S == 1'b1) begin
+ port1_accept_o = port1_accept_SN;
+ port1_drop_o = port1_drop_SN;
+ port1_miss_o = port1_miss_SN;
+ port2_accept_o = port2_accept_SN;
+ port2_drop_o = port2_drop_SN;
+ port2_miss_o = port2_miss_SN;
+ miss_o = miss_SN;
+ multi_o = multi_SN;
+ prot_o = prot_SN;
+ prefetch_o = prefetch_SN;
+ cache_coherent_o = cache_coherent_SN;
+ out_addr_o = out_addr_DN;
+ in_addr_o = in_addr_i;
+ in_id_o = in_id_i;
+ in_len_o = in_len_i;
+ in_user_o = in_user_i;
+ end
+ end // block: OUTPUT_SEQ
+"""
+#
+# endmodule
+#
+#
--- /dev/null
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class l2_tlb(Elaboratable):
+
+ def __init__(self):
+ self.clk_i = Signal() # input
+ self.rst_ni = Signal() # input
+ self.we_i = Signal() # input
+ self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH) # input
+ self.wdata_i = Signal(AXI_LITE_DATA_WIDTH) # input
+ self.start_i = Signal() # input
+ self.busy_o = Signal() # output
+ self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input
+ self.rw_type_i = Signal() # input
+ self.out_ready_i = Signal() # input
+ self.out_valid_o = Signal() # output
+ self.hit_o = Signal() # output
+ self.miss_o = Signal() # output
+ self.prot_o = Signal() # output
+ self.multi_o = Signal() # output
+ self.cache_coherent_o = Signal() # output
+ self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# //`define MULTI_HIT_FULL_SET // Enable full multi hit detection. Always the entire set is searched.
+# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected.
+#
+# //`ifdef MULTI_HIT_FULL_SET
+# // `ifndef MULTI_HIT_CUR_CYCLE
+# // `define MULTI_HIT_CUR_CYCLE
+# // `endif
+# //`endif
+#
+# module l2_tlb
+# //#(
+# // parameter AXI_S_ADDR_WIDTH = 32,
+# // parameter AXI_M_ADDR_WIDTH = 40,
+# // parameter AXI_LITE_DATA_WIDTH = 64,
+# // parameter AXI_LITE_ADDR_WIDTH = 32,
+# // parameter N_SETS = 32,
+# // parameter N_OFFSETS = 4, //per port. There are 2 ports.
+# // parameter PAGE_SIZE = 4096, // 4kB
+# // parameter N_PAR_VA_RAMS = 4,
+# // parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH
+# // )
+# (
+# input logic clk_i,
+# input logic rst_ni,
+#
+# input logic we_i,
+# input logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i,
+# input logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i,
+#
+# input logic start_i,
+# output logic busy_o,
+# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
+# input logic rw_type_i, //1 => write, 0=> read
+#
+# input logic out_ready_i,
+# output logic out_valid_o,
+# output logic hit_o,
+# output logic miss_o,
+# output logic prot_o,
+# output logic multi_o,
+# output logic cache_coherent_o,
+# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o
+# );
+#
+""" #docstring_begin
+
+ localparam VA_RAM_DEPTH = N_SETS * N_OFFSETS * 2;
+ localparam PA_RAM_DEPTH = VA_RAM_DEPTH * N_PAR_VA_RAMS;
+ localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH);
+ localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH);
+ localparam SET_WIDTH = log2(N_SETS);
+ localparam OFFSET_WIDTH = log2(N_OFFSETS);
+ localparam LL_WIDTH = log2(N_PAR_VA_RAMS);
+ localparam IGNORE_LSB = log2(PAGE_SIZE);
+
+ localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4;
+ localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB;
+
+ logic [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent;
+ logic [N_PAR_VA_RAMS-1:0] ram_we;
+ logic last_search, last_search_next;
+ logic first_search, first_search_next;
+ logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr;
+ logic [N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr;
+ logic pa_ram_we;
+ logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr;
+ logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE;
+ logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr
+ logic [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data
+ logic pa_ram_store_data_SN, pa_ram_store_data_SP;
+ logic hit_top, prot_top, multi_hit_top, first_hit_top;
+ logic output_sent;
+ int hit_block_num;
+
+ logic searching, search_done;
+ logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr
+ logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr
+ logic [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d;
+ logic [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr;
+ logic [SET_WIDTH-1:0] set_num;
+
+ logic va_output_valid;
+ logic searching_q;
+
+ genvar z;
+
+ // Search FSM
+ typedef enum logic [1:0] {IDLE, SEARCH, DONE} search_state_t;
+ search_state_t search_SP; // Present state
+ search_state_t search_SN; // Next State
+
+ // Output FSM
+ typedef enum logic [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t;
+ out_state_t out_SP; // Present state
+ out_state_t out_SN; // Next State
+
+ logic miss_next;
+ logic hit_next;
+ logic prot_next;
+ logic multi_next;
+ logic cache_coherent_next;
+
+ // Generate the VA Block rams and their surrounding logic
+ generate
+ for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS
+ check_ram
+ #(
+ .ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
+ .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ),
+ .PAGE_SIZE ( PAGE_SIZE ),
+ .SET_WIDTH ( SET_WIDTH ),
+ .OFFSET_WIDTH ( OFFSET_WIDTH )
+ )
+ u_check_ram
+ (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .in_addr ( in_addr_i ),
+ .rw_type ( rw_type_i ),
+ .ram_we ( ram_we[z] ),
+ .port0_addr ( port0_addr ),
+ .port1_addr ( port1_addr ),
+ .ram_wdata ( wdata_i[VA_RAM_DATA_WIDTH-1:0] ),
+ .output_sent ( output_sent ),
+ .output_valid ( va_output_valid ),
+ .offset_addr_d ( offset_addr_d ),
+ .hit_addr ( hit_addr[z] ),
+ .master ( cache_coherent[z] ),
+ .hit ( hit[z] ),
+ .multi_hit ( multi_hit[z] ),
+ .prot ( prot[z] )
+ );
+ end // for (z = 0; z < N_PORTS; z++)
+ endgenerate
+
+ ////////////////// ---------------- Control and Address --------------- ////////////////////////
+ // FSM
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ search_SP <= IDLE;
+ end else begin
+ search_SP <= search_SN;
+ end
+ end
+
+ always_comb begin : SEARCH_FSM
+ search_SN = search_SP;
+ busy_o = 1'b0;
+ searching = 1'b0;
+ search_done = 1'b0;
+ last_search_next = 1'b0;
+ first_search_next = first_search;
+
+ unique case (search_SP)
+ IDLE : begin
+ if (start_i) begin
+ search_SN = SEARCH;
+ first_search_next = 1'b1;
+ end
+ end
+
+ SEARCH : begin
+ busy_o = 1'b1;
+
+ // detect last search cycle
+ if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) )
+ last_search_next = 1'b1;
+
+ // pause search during VA RAM reconfigration
+ if (|ram_we) begin
+ searching = 1'b0;
+ end else begin
+ searching = 1'b1;
+ first_search_next = 1'b0;
+ end
+
+ if (va_output_valid) begin
+ // stop search
+`ifdef MULTI_HIT_FULL_SET
+ if (last_search | prot_top | multi_hit_top) begin
+`else
+ if (last_search | prot_top | multi_hit_top | hit_top ) begin
+`endif
+ search_SN = DONE;
+ search_done = 1'b1;
+ end
+ end
+ end
+
+ DONE : begin
+ busy_o = 1'b1;
+ if (out_valid_o & out_ready_i)
+ search_SN = IDLE;
+ end
+
+ default : begin
+ search_SN = IDLE;
+ end
+ endcase // case (prot_SP)
+ end // always_comb begin
+
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ last_search <= 1'b0;
+ first_search <= 1'b0;
+ end else begin
+ last_search <= last_search_next;
+ first_search <= first_search_next;
+ end
+ end
+
+ /*
+ * VA RAM address generation
+ *
+ * The input address and set number, and thus the offset start address, are available in the
+ * cycle after the start signal. The buffered offset_addr becomes available one cycle later.
+ * During the first search cycle, we therefore directly use offset_addr_start for the lookup.
+ */
+ assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB];
+
+ assign port0_raddr[OFFSET_WIDTH] = 1'b0;
+ assign port1_addr [OFFSET_WIDTH] = 1'b1;
+
+ assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
+ assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
+
+ assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
+ assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
+
+ assign port0_addr = ram_we ? ram_waddr : port0_raddr;
+
+ // The outputs of the BRAMs are only valid if in the previous cycle:
+ // 1. the inputs were valid, and
+ // 2. the BRAMs were not written to.
+ // Otherwise, the outputs must be ignored.
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ searching_q <= 1'b0;
+ end else begin
+ searching_q <= searching;
+ end
+ end
+ assign va_output_valid = searching_q;
+
+ // Address offset for looking up the VA RAMs
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ offset_addr <= 0;
+ end else if (first_search) begin
+ offset_addr <= offset_start_addr + 1'b1;
+ end else if (searching) begin
+ offset_addr <= offset_addr + 1'b1;
+ end
+ end
+
+ // Delayed address offest for looking up the PA RAM upon a hit in the VA RAMs
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ offset_addr_d <= 0;
+ end else if (first_search) begin
+ offset_addr_d <= offset_start_addr;
+ end else if (searching) begin
+ offset_addr_d <= offset_addr_d + 1'b1;
+ end
+ end
+
+ // Store the offset addr for hit to reduce latency for next search.
+ generate
+ if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE
+`ifndef MULTI_HIT_FULL_SET
+ logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET.
+ logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_reg;
+
+ assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} };
+ assign offset_end_addr = hit_offset_addr[set_num]-1'b1;
+
+ // Register the hit addr
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ hit_addr_reg <= 0;
+ end else if (hit_top) begin
+ hit_addr_reg <= hit_addr[hit_block_num];
+ end
+ end
+
+ // Store hit addr for each set. The next search in the same set will start from the saved addr.
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ hit_offset_addr <= 0;
+ end else if (hit_o) begin
+ hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)];
+ end
+ end
+`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched.
+ assign offset_start_addr = 0;
+ assign offset_end_addr = {OFFSET_WIDTH{1'b1}};
+`endif
+ end else begin // if (HIT_OFFSET_STORE_WIDTH > 0)
+ assign offset_start_addr = 0;
+ assign offset_end_addr = {OFFSET_WIDTH{1'b1}};
+ end
+ endgenerate
+
+ assign prot_top = |prot;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // check for hit, multi hit
+ // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit.
+ // In case of a multi hit in the same VA RAM, Port 0 is given priority.
+ always_comb begin : HIT_CHECK
+ hit_top = |hit;
+ hit_block_num = 0;
+ first_hit_top = 1'b0;
+ multi_hit_top = 1'b0;
+ for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin
+ if (hit[i] == 1'b1) begin
+`ifdef MULTI_HIT_CUR_CYCLE
+ if (multi_hit[i] | first_hit_top ) begin
+ multi_hit_top = 1'b1;
+ end
+`endif
+ first_hit_top = 1'b1;
+ hit_block_num = i;
+ end
+ end // for (int i=0; i<N_PAR_VA_RAMS; i++)
+ end // always_comb begin
+
+ ///////////////////// ------------- Outputs ------------ //////////////////////////////////
+ //// FSM
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ out_SP <= OUT_IDLE;
+ pa_ram_store_data_SP <= 1'b0;
+ pa_port0_raddr_reg_SP <= 'b0;
+ end else begin
+ out_SP <= out_SN;
+ pa_ram_store_data_SP <= pa_ram_store_data_SN;
+ pa_port0_raddr_reg_SP <= pa_port0_raddr_reg_SN;
+ end
+ end
+
+ always_comb begin : OUTPUT_FSM
+ out_SN = out_SP;
+
+ miss_next = miss_o;
+ prot_next = prot_o;
+ multi_next = multi_o;
+ hit_next = hit_o;
+ cache_coherent_next = cache_coherent_o;
+ pa_port0_raddr_reg_SN = pa_port0_raddr_reg_SP;
+
+ pa_port0_raddr = 'b0;
+ pa_ram_store_data_SN = 1'b0;
+
+ out_valid_o = 1'b0;
+ output_sent = 1'b0;
+
+ unique case (out_SP)
+ OUT_IDLE : begin
+ hit_next = 1'b0;
+ miss_next = 1'b0;
+ prot_next = 1'b0;
+ multi_next = 1'b0;
+ cache_coherent_next = 1'b0;
+
+ // abort transaction
+ if ((search_done & ~hit_top) | prot_top | multi_hit_top) begin
+ out_SN = SEND_OUTPUT;
+
+ if (search_done & ~hit_top) begin
+ miss_next = 1'b1;
+ end
+ if (prot_top) begin
+ prot_next = 1'b1;
+ hit_next = 1'b1;
+ end
+ if (multi_hit_top) begin
+ multi_next = 1'b1;
+ hit_next = 1'b1;
+ end
+
+ // read PA RAM
+ end else if (search_done & hit_top) begin
+ hit_next = 1'b1;
+ cache_coherent_next = cache_coherent[hit_block_num];
+ pa_port0_raddr = (N_PAR_VA_RAMS * hit_addr[hit_block_num]) + hit_block_num;
+ pa_port0_raddr_reg_SN = pa_port0_raddr;
+
+ // read PA RAM now
+ if (~pa_ram_we) begin
+ out_SN = SEND_OUTPUT;
+ pa_ram_store_data_SN = 1'b1;
+
+ // read PA RAM after PA RAM reconfiguration
+ end else begin // pa_ram_we
+ out_SN = WAIT_ON_WRITE;
+
+ end
+ end
+ end
+
+ WAIT_ON_WRITE : begin
+ if ( ~pa_ram_we ) begin
+ out_SN = SEND_OUTPUT;
+ pa_port0_raddr = pa_port0_raddr_reg_SP;
+ pa_ram_store_data_SN = 1'b1;
+ end
+ end
+
+ SEND_OUTPUT : begin
+ out_valid_o = 1'b1;
+ if (out_ready_i) begin
+ out_SN = OUT_IDLE;
+ output_sent = 1'b1;
+ end
+ end
+
+ default : begin
+ out_SN = OUT_IDLE;
+ end
+
+ endcase // case (out_SP)
+ end // always_comb begin
+
+ //// Output signals
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ miss_o <= 1'b0;
+ prot_o <= 1'b0;
+ multi_o <= 1'b0;
+ hit_o <= 1'b0;
+ cache_coherent_o <= 1'b0;
+ end else begin
+ miss_o <= miss_next;
+ prot_o <= prot_next;
+ multi_o <= multi_next;
+ hit_o <= hit_next;
+ cache_coherent_o <= cache_coherent_next;
+ end
+ end
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+ ///////////////////// --------------- Physical Address -------------- ////////////////////////////
+
+ /// PA Block RAM
+ ram_tp_no_change #(
+ .ADDR_WIDTH( PA_RAM_ADDR_WIDTH ),
+ .DATA_WIDTH( PA_RAM_DATA_WIDTH )
+ )
+ pa_ram
+ (
+ .clk ( clk_i ),
+ .we ( pa_ram_we ),
+ .addr0 ( pa_port0_addr ),
+ .addr1 ( '0 ),
+ .d_i ( wdata_i[PA_RAM_DATA_WIDTH-1:0] ),
+ .d0_o ( pa_port0_data ),
+ .d1_o ( )
+ );
+
+ assign out_addr_o[IGNORE_LSB-1:0] = in_addr_i[IGNORE_LSB-1:0];
+ assign out_addr_o[AXI_M_ADDR_WIDTH-1:IGNORE_LSB] = pa_data;
+
+ always_ff @(posedge clk_i) begin
+ if (rst_ni == 0) begin
+ pa_port0_data_reg <= 0;
+ end else if (pa_ram_store_data_SP) begin
+ pa_port0_data_reg <= pa_port0_data;
+ end
+ end
+
+ assign pa_data = pa_ram_store_data_SP ? pa_port0_data : pa_port0_data_reg;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///// Write enable for all block rams
+generate if (LL_WIDTH != 0) begin
+ always_comb begin
+ var reg[LL_WIDTH:0] para;
+ var int para_int;
+ for (para = 0; para < N_PAR_VA_RAMS; para=para+1'b1) begin
+ para_int = int'(para);
+ ram_we[para_int] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0) && (waddr_i[LL_WIDTH-1:0] == para);
+ end
+ end
+end else begin
+ assign ram_we[0] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0);
+end
+
+endgenerate
+
+// Addresses are word, not byte addresses
+assign pa_ram_we = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b1); //waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] will be 0 for all VA writes and 1 for all PA writes
+assign ram_waddr = waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH-1:LL_WIDTH];
+assign pa_port0_waddr = waddr_i[PA_RAM_ADDR_WIDTH-1:0];
+assign pa_port0_addr = pa_ram_we ? pa_port0_waddr : pa_port0_raddr;
+
+"""
+# endmodule
+#
+# // vim: ts=3 sw=3 sts=3 et nosmartindent autoindent foldmethod=marker tw=100
+#
+#
--- /dev/null
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# //`define MY_ARRAY_SUM(MY_ARRAY,ARRAY_SIZE) ( (ARRAY_SIZE==1) ? MY_ARRAY[0] : (ARRAY_SIZE==2) ? MY_ARRAY[0] + MY_ARRAY[1] : (ARRAY_SIZE==3) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] : (ARRAY_SIZE==4) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] + MY_ARRAY[3] : 0 )
+#
+
+# module rab_core
+# #(
+# parameter N_PORTS = 3,
+# parameter N_L2_SETS = 32,
+# parameter N_L2_SET_ENTRIES = 32,
+# parameter AXI_DATA_WIDTH = 64,
+# parameter AXI_S_ADDR_WIDTH = 32,
+# parameter AXI_M_ADDR_WIDTH = 40,
+# parameter AXI_LITE_DATA_WIDTH = 64,
+# parameter AXI_LITE_ADDR_WIDTH = 32,
+# parameter AXI_ID_WIDTH = 8,
+# parameter AXI_USER_WIDTH = 6,
+# parameter MH_FIFO_DEPTH = 16
+# )
+# (
+# input logic Clk_CI,
+# input logic Rst_RBI,
+#
+# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi_awaddr,
+# input logic s_axi_awvalid,
+# output logic s_axi_awready,
+#
+# input logic [AXI_LITE_DATA_WIDTH-1:0] s_axi_wdata,
+# input logic [AXI_LITE_DATA_WIDTH/8-1:0] s_axi_wstrb,
+# input logic s_axi_wvalid,
+# output logic s_axi_wready,
+#
+# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi_araddr,
+# input logic s_axi_arvalid,
+# output logic s_axi_arready,
+#
+# input logic s_axi_rready,
+# output logic [AXI_LITE_DATA_WIDTH-1:0] s_axi_rdata,
+# output logic [1:0] s_axi_rresp,
+# output logic s_axi_rvalid,
+#
+# output logic [1:0] s_axi_bresp,
+# output logic s_axi_bvalid,
+# input logic s_axi_bready,
+#
+# output logic [N_PORTS-1:0] int_miss,
+# output logic [N_PORTS-1:0] int_prot,
+# output logic [N_PORTS-1:0] int_multi,
+# output logic [N_PORTS-1:0] int_prefetch,
+# output logic int_mhf_full,
+#
+# output logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_axaddr_o,
+# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_axid_o,
+# output logic [N_PORTS-1:0] [7:0] int_axlen_o,
+# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_axuser_o,
+#
+# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] port1_addr,
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] port1_id,
+# input logic [N_PORTS-1:0] [7:0] port1_len,
+# input logic [N_PORTS-1:0] [2:0] port1_size,
+# input logic [N_PORTS-1:0] port1_addr_valid,
+# input logic [N_PORTS-1:0] port1_type,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] port1_user,
+# input logic [N_PORTS-1:0] port1_sent,
+# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] port1_out_addr,
+# output logic [N_PORTS-1:0] port1_cache_coherent,
+# output logic [N_PORTS-1:0] port1_accept,
+# output logic [N_PORTS-1:0] port1_drop,
+# output logic [N_PORTS-1:0] port1_miss,
+#
+# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] port2_addr,
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] port2_id,
+# input logic [N_PORTS-1:0] [7:0] port2_len,
+# input logic [N_PORTS-1:0] [2:0] port2_size,
+# input logic [N_PORTS-1:0] port2_addr_valid,
+# input logic [N_PORTS-1:0] port2_type,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] port2_user,
+# input logic [N_PORTS-1:0] port2_sent,
+# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] port2_out_addr,
+# output logic [N_PORTS-1:0] port2_cache_coherent,
+# output logic [N_PORTS-1:0] port2_accept,
+# output logic [N_PORTS-1:0] port2_drop,
+# output logic [N_PORTS-1:0] port2_miss,
+#
+# input logic [N_PORTS-1:0] miss_l2_i,
+# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] miss_l2_addr_i,
+# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] miss_l2_id_i,
+# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] miss_l2_user_i,
+#
+# output logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] wdata_l2_o,
+# output logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] waddr_l2_o,
+# output logic [N_PORTS-1:0] wren_l2_o
+# );
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class rab_core(Elaboratable):
+
+ def __init__(self):
+ self.s_axi_awaddr = Signal(AXI_LITE_ADDR_WIDTH) # input
+ self.s_axi_awvalid = Signal() # input
+ self.s_axi_awready = Signal() # output
+ self.s_axi_wdata = Signal(AXI_LITE_DATA_WIDTH) # input
+ self.s_axi_wstrb = Signal(FIXME) # input
+ self.s_axi_wvalid = Signal() # input
+ self.s_axi_wready = Signal() # output
+ self.s_axi_araddr = Signal(AXI_LITE_ADDR_WIDTH) # input
+ self.s_axi_arvalid = Signal() # input
+ self.s_axi_arready = Signal() # output
+ self.s_axi_rready = Signal() # input
+ self.s_axi_rdata = Signal(AXI_LITE_DATA_WIDTH) # output
+ self.s_axi_rresp = Signal(2) # output
+ self.s_axi_rvalid = Signal() # output
+ self.s_axi_bresp = Signal(2) # output
+ self.s_axi_bvalid = Signal() # output
+ self.s_axi_bready = Signal() # input
+ self.int_miss = Signal(N_PORTS) # output
+ self.int_prot = Signal(N_PORTS) # output
+ self.int_multi = Signal(N_PORTS) # output
+ self.int_prefetch = Signal(N_PORTS) # output
+ self.int_mhf_full = Signal() # output
+ self.int_axaddr_o = Signal() # output
+ self.int_axid_o = Signal() # output
+ self.int_axlen_o = Signal() # output
+ self.int_axuser_o = Signal() # output
+ self.port1_addr = Signal() # input
+ self.port1_id = Signal() # input
+ self.port1_len = Signal() # input
+ self.port1_size = Signal() # input
+ self.port1_addr_valid = Signal(N_PORTS) # input
+ self.port1_type = Signal(N_PORTS) # input
+ self.port1_user = Signal() # input
+ self.port1_sent = Signal(N_PORTS) # input
+ self.port1_out_addr = Signal() # output
+ self.port1_cache_coherent = Signal(N_PORTS) # output
+ self.port1_accept = Signal(N_PORTS) # output
+ self.port1_drop = Signal(N_PORTS) # output
+ self.port1_miss = Signal(N_PORTS) # output
+ self.port2_addr = Signal() # input
+ self.port2_id = Signal() # input
+ self.port2_len = Signal() # input
+ self.port2_size = Signal() # input
+ self.port2_addr_valid = Signal(N_PORTS) # input
+ self.port2_type = Signal(N_PORTS) # input
+ self.port2_user = Signal() # input
+ self.port2_sent = Signal(N_PORTS) # input
+ self.port2_out_addr = Signal() # output
+ self.port2_cache_coherent = Signal(N_PORTS) # output
+ self.port2_accept = Signal(N_PORTS) # output
+ self.port2_drop = Signal(N_PORTS) # output
+ self.port2_miss = Signal(N_PORTS) # output
+ self.miss_l2_i = Signal(N_PORTS) # input
+ self.miss_l2_addr_i = Signal() # input
+ self.miss_l2_id_i = Signal() # input
+ self.miss_l2_user_i = Signal() # input
+ self.wdata_l2_o = Signal() # output
+ self.waddr_l2_o = Signal() # output
+ self.wren_l2_o = Signal(N_PORTS) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ return m
+
+
+"""
+
+
+ // ███████╗██╗ ██████╗ ███╗ ██╗ █████╗ ██╗ ███████╗
+ // ██╔════╝██║██╔════╝ ████╗ ██║██╔══██╗██║ ██╔════╝
+ // ███████╗██║██║ ███╗██╔██╗ ██║███████║██║ ███████╗
+ // ╚════██║██║██║ ██║██║╚██╗██║██╔══██║██║ ╚════██║
+ // ███████║██║╚██████╔╝██║ ╚████║██║ ██║███████╗███████║
+ // ╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚══════╝
+ // signals
+
+ localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
+
+ localparam integer N_SLICES[N_PORTS-1:0] = `N_SLICES_ARRAY;
+ localparam N_SLICES_TOT = `MY_ARRAY_SUM(N_SLICES,N_PORTS);
+ localparam N_SLICES_MAX = `N_SLICES_MAX;
+
+ localparam N_REGS = 4*N_SLICES_TOT + 4;
+ localparam AXI_SIZE_WIDTH = log2(AXI_DATA_WIDTH/8);
+
+ localparam PORT_ID_WIDTH = (N_PORTS < 2) ? 1 : log2(N_PORTS);
+ localparam MISS_META_WIDTH = PORT_ID_WIDTH + AXI_USER_WIDTH + AXI_ID_WIDTH;
+
+ logic [N_PORTS-1:0] [15:0] p1_burst_size;
+ logic [N_PORTS-1:0] [15:0] p2_burst_size;
+
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p1_align_addr;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p2_align_addr;
+
+ logic [N_PORTS-1:0] [AXI_SIZE_WIDTH-1:0] p1_mask;
+ logic [N_PORTS-1:0] [AXI_SIZE_WIDTH-1:0] p2_mask;
+
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p1_max_addr;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] p2_max_addr;
+
+ logic [N_PORTS-1:0] p1_prefetch;
+ logic [N_PORTS-1:0] p2_prefetch;
+
+ logic [N_PORTS-1:0] int_rw;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_addr_min;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_addr_max;
+ logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_id;
+ logic [N_PORTS-1:0] [7:0] int_len;
+ logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_user;
+
+ logic [N_PORTS-1:0] hit;
+ logic [N_PORTS-1:0] prot;
+ logic [N_PORTS-1:0] prefetch;
+
+ logic [N_PORTS-1:0] no_hit;
+ logic [N_PORTS-1:0] no_prot;
+
+ logic [N_PORTS-1:0] [N_SLICES_MAX-1:0] hit_slices;
+ logic [N_PORTS-1:0] [N_SLICES_MAX-1:0] prot_slices;
+
+ logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] out_addr;
+ logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] out_addr_reg;
+
+ logic [N_PORTS-1:0] cache_coherent;
+ logic [N_PORTS-1:0] cache_coherent_reg;
+
+ logic [N_PORTS-1:0] select;
+ reg [N_PORTS-1:0] curr_priority;
+
+ reg [N_PORTS-1:0] multi_hit;
+
+ logic [N_PORTS-1:0] miss_valid_mhf;
+ logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] miss_addr_mhf;
+ logic [N_PORTS-1:0] [MISS_META_WIDTH-1:0] miss_meta_mhf;
+
+ logic [N_REGS-1:0] [63:0] int_cfg_regs;
+ logic [N_PORTS-1:0] [4*N_SLICES_MAX-1:0] [63:0] int_cfg_regs_slices;
+
+ logic L1AllowMultiHit_S;
+
+ genvar z;
+
+ // █████╗ ███████╗███████╗██╗ ██████╗ ███╗ ██╗███╗ ███╗███████╗███╗ ██╗████████╗███████╗
+ // ██╔══██╗██╔════╝██╔════╝██║██╔════╝ ████╗ ██║████╗ ████║██╔════╝████╗ ██║╚══██╔══╝██╔════╝
+ // ███████║███████╗███████╗██║██║ ███╗██╔██╗ ██║██╔████╔██║█████╗ ██╔██╗ ██║ ██║ ███████╗
+ // ██╔══██║╚════██║╚════██║██║██║ ██║██║╚██╗██║██║╚██╔╝██║██╔══╝ ██║╚██╗██║ ██║ ╚════██║
+ // ██║ ██║███████║███████║██║╚██████╔╝██║ ╚████║██║ ╚═╝ ██║███████╗██║ ╚████║ ██║ ███████║
+ // ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝ ╚═╝ ╚══════╝
+ // assignments
+
+ always_comb
+ begin : PORT_SELECT
+ var integer idx;
+
+ for (idx=0; idx<N_PORTS; idx++) begin
+
+ // select = 1 -> port1 active
+ // select = 0 -> port2 active
+ select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx];
+
+ p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx];
+ p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx];
+
+ // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary
+ if (port1_size[idx] == 3'b001)
+ p1_mask[idx] = 3'b110;
+ else if (port1_size[idx] == 3'b010)
+ p1_mask[idx] = 3'b100;
+ else if (port1_size[idx] == 3'b011)
+ p1_mask[idx] = 3'b000;
+ else
+ p1_mask[idx] = 3'b111;
+
+ p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
+ p1_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx];
+
+ if (port2_size[idx] == 3'b001)
+ p2_mask[idx] = 3'b110;
+ else if (port2_size[idx] == 3'b010)
+ p2_mask[idx] = 3'b100;
+ else if (port2_size[idx] == 3'b011)
+ p2_mask[idx] = 3'b000;
+ else
+ p2_mask[idx] = 3'b111;
+
+ if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}})
+ p1_prefetch[idx] = 1'b1;
+ else
+ p1_prefetch[idx] = 1'b0;
+
+ if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}})
+ p2_prefetch[idx] = 1'b1;
+ else
+ p2_prefetch[idx] = 1'b0;
+
+ p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
+ p2_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx];
+
+ p1_max_addr[idx] = p1_align_addr[idx] + p1_burst_size[idx] - 1;
+ p2_max_addr[idx] = p2_align_addr[idx] + p2_burst_size[idx] - 1;
+
+ int_addr_min[idx] = select[idx] ? port1_addr[idx] : port2_addr[idx];
+ int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx];
+ int_rw[idx] = select[idx] ? port1_type[idx] : port2_type[idx];
+ int_id[idx] = select[idx] ? port1_id[idx] : port2_id[idx];
+ int_len[idx] = select[idx] ? port1_len[idx] : port2_len[idx];
+ int_user[idx] = select[idx] ? port1_user[idx] : port2_user[idx];
+ prefetch[idx] = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx];
+
+ hit [idx] = | hit_slices [idx];
+ prot[idx] = | prot_slices[idx];
+
+ no_hit [idx] = ~hit [idx];
+ no_prot[idx] = ~prot[idx];
+
+ port1_out_addr[idx] = out_addr_reg[idx];
+ port2_out_addr[idx] = out_addr_reg[idx];
+
+ port1_cache_coherent[idx] = cache_coherent_reg[idx];
+ port2_cache_coherent[idx] = cache_coherent_reg[idx];
+ end
+ end
+
+ always_comb
+ begin
+ var integer idx_port, idx_slice;
+ var integer reg_num;
+ reg_num=0;
+ for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin
+ for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin
+ int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num];
+ reg_num++;
+ end
+ // int_cfg_regs_slices[idx_port][N_SLICES_MAX:N_SLICES[idx_port]] will be dangling
+ // Fix to zero. Synthesis will remove these signals.
+ // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0;
+ end
+ end
+
+ always @(posedge Clk_CI or negedge Rst_RBI)
+ begin : PORT_PRIORITY
+ var integer idx;
+ if (Rst_RBI == 1'b0)
+ curr_priority = 'h0;
+ else begin
+ for (idx=0; idx<N_PORTS; idx++) begin
+ if (port1_accept[idx] || port1_drop[idx])
+ curr_priority[idx] = 1'b1;
+ else if (port2_accept[idx] || port2_drop[idx])
+ curr_priority[idx] = 1'b0;
+ end
+ end
+ end
+
+ // find port that misses
+ logic [PORT_ID_WIDTH-1:0] PortIdx_D; // index of the first missing port
+ var integer idx_miss;
+ always_comb begin : MHF_PORT_SELECT
+ PortIdx_D = 'b0;
+ for (idx_miss = 0; idx_miss < N_PORTS; idx_miss++) begin
+ if (miss_valid_mhf[idx_miss] == 1'b1) begin
+ PortIdx_D = idx_miss;
+ break;
+ end
+ end
+ end // always_comb begin
+
+ // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ██████╗███████╗ ██████╗
+ // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔════╝██╔════╝
+ // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ █████╗ ██║ ███╗
+ // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██╔══╝ ██║ ██║
+ // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ╚██████╗██║ ╚██████╔╝
+ // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝╚═╝ ╚═════╝
+ axi_rab_cfg
+ #(
+ .N_PORTS ( N_PORTS ),
+ .N_REGS ( N_REGS ),
+ .N_L2_SETS ( N_L2_SETS ),
+ .N_L2_SET_ENTRIES( N_L2_SET_ENTRIES ),
+ .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
+ .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH ),
+ .N_FLAGS ( 4 ),
+ .AXI_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
+ .AXI_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
+ .MISS_META_WIDTH ( MISS_META_WIDTH ),
+ .MH_FIFO_DEPTH ( MH_FIFO_DEPTH )
+ )
+ u_axi_rab_cfg
+ (
+ .Clk_CI ( Clk_CI ),
+ .Rst_RBI ( Rst_RBI ),
+ .s_axi_awaddr ( s_axi_awaddr ),
+ .s_axi_awvalid ( s_axi_awvalid ),
+ .s_axi_wdata ( s_axi_wdata ),
+ .s_axi_wstrb ( s_axi_wstrb ),
+ .s_axi_wvalid ( s_axi_wvalid ),
+ .s_axi_bready ( s_axi_bready ),
+ .s_axi_araddr ( s_axi_araddr ),
+ .s_axi_arvalid ( s_axi_arvalid ),
+ .s_axi_rready ( s_axi_rready ),
+ .s_axi_arready ( s_axi_arready ),
+ .s_axi_rdata ( s_axi_rdata ),
+ .s_axi_rresp ( s_axi_rresp ),
+ .s_axi_rvalid ( s_axi_rvalid ),
+ .s_axi_wready ( s_axi_wready ),
+ .s_axi_bresp ( s_axi_bresp ),
+ .s_axi_bvalid ( s_axi_bvalid ),
+ .s_axi_awready ( s_axi_awready ),
+ .L1Cfg_DO ( int_cfg_regs ),
+ .L1AllowMultiHit_SO ( L1AllowMultiHit_S ),
+ .MissAddr_DI ( miss_addr_mhf[PortIdx_D] ),
+ .MissMeta_DI ( miss_meta_mhf[PortIdx_D] ),
+ .Miss_SI ( miss_valid_mhf[PortIdx_D] ),
+ .MhFifoFull_SO ( int_mhf_full ),
+ .wdata_l2 ( wdata_l2_o ),
+ .waddr_l2 ( waddr_l2_o ),
+ .wren_l2 ( wren_l2_o )
+ );
+
+ generate for (z = 0; z < N_PORTS; z++) begin : MHF_TLB_SELECT
+ if (ENABLE_L2TLB[z] == 1) begin // L2 TLB is enabled
+ assign miss_valid_mhf[z] = miss_l2_i[z];
+ assign miss_addr_mhf[z] = miss_l2_addr_i[z];
+ assign miss_meta_mhf[z] = {miss_l2_user_i[z], PortIdx_D, miss_l2_id_i[z]};
+ end else begin// L2 TLB is disabled
+ assign miss_valid_mhf[z] = int_miss[z];
+ assign miss_addr_mhf[z] = int_addr_min[z];
+ assign miss_meta_mhf[z] = {int_user[z], PortIdx_D, int_id[z]};
+ end
+ end
+ endgenerate
+
+ // ███████╗██╗ ██╗ ██████╗███████╗ ████████╗ ██████╗ ██████╗
+ // ██╔════╝██║ ██║██╔════╝██╔════╝ ╚══██╔══╝██╔═══██╗██╔══██╗
+ // ███████╗██║ ██║██║ █████╗ ██║ ██║ ██║██████╔╝
+ // ╚════██║██║ ██║██║ ██╔══╝ ██║ ██║ ██║██╔═══╝
+ // ███████║███████╗██║╚██████╗███████╗ ██║ ╚██████╔╝██║
+ // ╚══════╝╚══════╝╚═╝ ╚═════╝╚══════╝ ╚═╝ ╚═════╝ ╚═╝
+ generate for (z = 0; z < N_PORTS; z++) begin : SLICE_TOP_GEN
+ slice_top
+ #(
+ .N_SLICES ( N_SLICES[z] ),
+ .N_REGS ( 4*N_SLICES[z] ),
+ .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
+ .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH )
+ )
+ u_slice_top
+ (
+ .int_cfg_regs ( int_cfg_regs_slices[z][4*N_SLICES[z]-1:0] ),
+ .int_rw ( int_rw[z] ),
+ .int_addr_min ( int_addr_min[z] ),
+ .int_addr_max ( int_addr_max[z] ),
+ .multi_hit_allow ( L1AllowMultiHit_S ),
+ .multi_hit ( multi_hit[z] ),
+ .prot ( prot_slices[z][N_SLICES[z]-1:0] ),
+ .hit ( hit_slices [z][N_SLICES[z]-1:0] ),
+ .cache_coherent ( cache_coherent[z] ),
+ .out_addr ( out_addr[z] )
+ );
+ // hit_slices [N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
+ // prot_slices[N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
+ // Fix to zero. Synthesis will remove these signals.
+ if ( N_SLICES[z] < N_SLICES_MAX ) begin
+ assign hit_slices [z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
+ assign prot_slices[z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
+ end
+ end // for (z = 0; z < N_PORTS; z++)
+ endgenerate
+
+ // ███████╗███████╗███╗ ███╗
+ // ██╔════╝██╔════╝████╗ ████║
+ // █████╗ ███████╗██╔████╔██║
+ // ██╔══╝ ╚════██║██║╚██╔╝██║
+ // ██║ ███████║██║ ╚═╝ ██║
+ // ╚═╝ ╚══════╝╚═╝ ╚═╝
+ //
+ generate for (z = 0; z < N_PORTS; z++) begin : FSM_GEN
+ fsm
+ #(
+ .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+ .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
+ .AXI_ID_WIDTH ( AXI_ID_WIDTH ),
+ .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+ )
+ u_fsm
+ (
+ .Clk_CI ( Clk_CI ),
+ .Rst_RBI ( Rst_RBI ),
+ .port1_addr_valid_i ( port1_addr_valid[z] ),
+ .port2_addr_valid_i ( port2_addr_valid[z] ),
+ .port1_sent_i ( port1_sent[z] ),
+ .port2_sent_i ( port2_sent[z] ),
+ .select_i ( select[z] ),
+ .no_hit_i ( no_hit[z] ),
+ .multi_hit_i ( multi_hit[z] ),
+ .no_prot_i ( no_prot[z] ),
+ .prefetch_i ( prefetch[z] ),
+ .out_addr_i ( out_addr[z] ),
+ .cache_coherent_i ( cache_coherent[z] ),
+ .port1_accept_o ( port1_accept[z] ),
+ .port1_drop_o ( port1_drop[z] ),
+ .port1_miss_o ( port1_miss[z] ),
+ .port2_accept_o ( port2_accept[z] ),
+ .port2_drop_o ( port2_drop[z] ),
+ .port2_miss_o ( port2_miss[z] ),
+ .out_addr_o ( out_addr_reg[z] ),
+ .cache_coherent_o ( cache_coherent_reg[z] ),
+ .miss_o ( int_miss[z] ),
+ .multi_o ( int_multi[z] ),
+ .prot_o ( int_prot[z] ),
+ .prefetch_o ( int_prefetch[z] ),
+ .in_addr_i ( int_addr_min[z] ),
+ .in_id_i ( int_id[z] ),
+ .in_len_i ( int_len[z] ),
+ .in_user_i ( int_user[z] ),
+ .in_addr_o ( int_axaddr_o[z] ),
+ .in_id_o ( int_axid_o[z] ),
+ .in_len_o ( int_axlen_o[z] ),
+ .in_user_o ( int_axuser_o[z] )
+ );
+ end
+ endgenerate
+
+"""
--- /dev/null
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module rab_slice
+# #(
+# parameter ADDR_WIDTH_PHYS = 40,
+# parameter ADDR_WIDTH_VIRT = 32
+# )
+# (
+# input logic [ADDR_WIDTH_VIRT-1:0] cfg_min,
+# input logic [ADDR_WIDTH_VIRT-1:0] cfg_max,
+# input logic [ADDR_WIDTH_PHYS-1:0] cfg_offset,
+# input logic cfg_wen,
+# input logic cfg_ren,
+# input logic cfg_en,
+# input logic in_trans_type,
+# input logic [ADDR_WIDTH_VIRT-1:0] in_addr_min,
+# input logic [ADDR_WIDTH_VIRT-1:0] in_addr_max,
+# output logic out_hit,
+# output logic out_prot,
+# output logic [ADDR_WIDTH_PHYS-1:0] out_addr
+# );
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class rab_slice(Elaboratable):
+
+ def __init__(self, params): # pass config object
+ # TODO parameters
+ self.params = params
+ self.cfg_min = Signal(params.ADDR_WIDTH_VIRT) # input
+ self.cfg_max = Signal(params.ADDR_WIDTH_VIRT) # input
+ self.cfg_offset = Signal(params.ADDR_WIDTH_PHYS) # input
+ self.cfg_wen = Signal() # input
+ self.cfg_ren = Signal() # input
+ self.cfg_en = Signal() # input
+ self.in_trans_type = Signal() # input
+ self.in_addr_min = Signal(params.ADDR_WIDTH_VIRT) # input
+ self.in_addr_max = Signal(params.ADDR_WIDTH_VIRT) # input
+ self.out_hit = Signal() # output
+ self.out_prot = Signal() # output
+ self.out_addr = Signal(params.ADDR_WIDTH_PHYS) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+ min_above_min = Signal()
+ min_below_max = Signal()
+ max_below_max = Signal()
+
+ # assign min_above_min = (in_addr_min >= cfg_min) ? 1'b1 : 1'b0;
+ # assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0;
+ # assign max_below_max = (in_addr_max <= cfg_max) ? 1'b1 : 1'b0;
+ # assign out_hit = cfg_en & min_above_min & min_below_max & max_below_max;
+ # assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren));
+ # assign out_addr = in_addr_min - cfg_min + cfg_offset;
+ m.d.comb += [
+ min_above_min.eq(self.in_addr_min >= self.cfg_min),
+ min_below_max.eq(self.in_addr_min <= self.cfg_max),
+ max_below_max.eq(self.in_addr_max <= self.cfg_max),
+ self.out_hit.eq(self.cfg_en & min_above_min &
+ min_below_max & max_below_max),
+ self.out_prot.eq(self.out_hit & (
+ (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))),
+ self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset)
+ ]
+
+ return m
--- /dev/null
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# /*
+# * ram_tp_no_change
+# *
+# * This code implements a parameterizable two-port memory. Port 0 can read and
+# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with
+# * Port 0 in "no change" mode, i.e., during a write, it retains the last read
+# * value on the output. Port 1 (read-only) is in "write first" mode. Still, it
+# * outputs the old data during the write cycle. Note: Port 1 outputs invalid
+# * data in the cycle after the write when reading the same address.
+# *
+# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
+# */
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen import Memory
+
+import math
+
+#
+# module ram_tp_no_change
+# #(
+ADDR_WIDTH = 10
+DATA_WIDTH = 36
+# )
+# (
+# input clk,
+# input we,
+# input [ADDR_WIDTH-1:0] addr0,
+# input [ADDR_WIDTH-1:0] addr1,
+# input [DATA_WIDTH-1:0] d_i,
+# output [DATA_WIDTH-1:0] d0_o,
+# output [DATA_WIDTH-1:0] d1_o
+# );
+
+
+class ram_tp_no_change(Elaboratable):
+
+ def __init__(self):
+ self.we = Signal() # input
+ self.addr0 = Signal(ADDR_WIDTH) # input
+ self.addr1 = Signal(ADDR_WIDTH) # input
+ self.d_i = Signal(DATA_WIDTH) # input
+ self.d0_o = Signal(DATA_WIDTH) # output
+ self.d1_o = Signal(DATA_WIDTH) # output
+
+ DEPTH = int(math.pow(2, ADDR_WIDTH))
+ self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
+ #
+ # localparam DEPTH = 2**ADDR_WIDTH;
+ #
+ # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
+ # reg [DATA_WIDTH-1:0] d0;
+ # reg [DATA_WIDTH-1:0] d1;
+ #
+ # always_ff @(posedge clk) begin
+ # if(we == 1'b1) begin
+ # ram[addr0] <= d_i;
+ # end else begin
+ # only change data if we==false
+ # d0 <= ram[addr0];
+ # end
+ # d1 <= ram[addr1];
+ # end
+ #
+ # assign d0_o = d0;
+ # assign d1_o = d1;
+ #
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
+ m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
+ m.submodules.write_ram = write_ram = self.ram.write_port()
+
+ # write port
+ m.d.comb += write_ram.en.eq(self.we)
+ m.d.comb += write_ram.addr.eq(self.addr0)
+ m.d.comb += write_ram.data.eq(self.d_i)
+
+ # read ports
+ m.d.comb += read_ram0.addr.eq(self.addr0)
+ m.d.comb += read_ram1.addr.eq(self.addr1)
+ with m.If(self.we == 0):
+ m.d.sync += self.d0_o.eq(read_ram0.data)
+ m.d.sync += self.d1_o.eq(read_ram1.data)
+
+ return m
--- /dev/null
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# /*
+# * ram_tp_write_first
+# *
+# * This code implements a parameterizable two-port memory. Port 0 can read and
+# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in
+# * "write first" mode, i.e., upon a read and write to the same address, the
+# * new value is read. Note: Port 1 outputs invalid data in the cycle after
+# * the write when reading the same address.
+# *
+# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
+# */
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen import Memory
+
+import math
+#
+# module ram_tp_write_first
+# #(
+ADDR_WIDTH = 10
+DATA_WIDTH = 36
+# )
+# (
+# input clk,
+# input we,
+# input [ADDR_WIDTH-1:0] addr0,
+# input [ADDR_WIDTH-1:0] addr1,
+# input [DATA_WIDTH-1:0] d_i,
+# output [DATA_WIDTH-1:0] d0_o,
+# output [DATA_WIDTH-1:0] d1_o
+# );
+
+
+class ram_tp_write_first(Elaboratable):
+
+ def __init__(self):
+ self.we = Signal() # input
+ self.addr0 = Signal(ADDR_WIDTH) # input
+ self.addr1 = Signal(ADDR_WIDTH) # input
+ self.d_i = Signal(DATA_WIDTH) # input
+ self.d0_o = Signal(DATA_WIDTH) # output
+ self.d1_o = Signal(DATA_WIDTH) # output
+
+ DEPTH = int(math.pow(2, ADDR_WIDTH))
+ self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
+
+ #
+ # localparam DEPTH = 2**ADDR_WIDTH;
+ #
+ # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
+ # reg [ADDR_WIDTH-1:0] raddr0;
+ # reg [ADDR_WIDTH-1:0] raddr1;
+ #
+ # always_ff @(posedge clk) begin
+ # if(we == 1'b1) begin
+ # ram[addr0] <= d_i;
+ # end
+ # raddr0 <= addr0;
+ # raddr1 <= addr1;
+ # end
+ #
+ # assign d0_o = ram[raddr0];
+ # assign d1_o = ram[raddr1];
+ #
+
+ def elaborate(self, platform=None):
+ m = Module()
+ m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
+ m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
+ m.submodules.write_ram = write_ram = self.ram.write_port()
+
+ # write port
+ m.d.comb += write_ram.en.eq(self.we)
+ m.d.comb += write_ram.addr.eq(self.addr0)
+ m.d.comb += write_ram.data.eq(self.d_i)
+
+ # read ports
+ m.d.comb += read_ram0.addr.eq(self.addr0)
+ m.d.comb += read_ram1.addr.eq(self.addr1)
+ m.d.sync += self.d0_o.eq(read_ram0.data)
+ m.d.sync += self.d1_o.eq(read_ram1.data)
+
+ return m
--- /dev/null
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License. You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+import rab_slice
+import coreconfig
+
+#
+# module slice_top
+# //#(
+# // parameter N_SLICES = 16,
+# // parameter N_REGS = 4*N_SLICES,
+# // parameter ADDR_WIDTH_PHYS = 40,
+# // parameter ADDR_WIDTH_VIRT = 32
+# // )
+# (
+# input logic [N_REGS-1:0] [63:0] int_cfg_regs,
+# input logic int_rw,
+# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_min,
+# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_max,
+# input logic multi_hit_allow,
+# output logic multi_hit,
+# output logic [N_SLICES-1:0] prot,
+# output logic [N_SLICES-1:0] hit,
+# output logic cache_coherent,
+# output logic [ADDR_WIDTH_PHYS-1:0] out_addr
+# );
+#
+
+
+class slice_top(Elaboratable):
+
+ def __init__(self):
+ # FIXME self.int_cfg_regs = Signal() # input
+ self.params = coreconfig.CoreConfig() # rename ?
+ self.int_rw = Signal() # input
+ self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT) # input
+ self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT) # input
+ self.multi_hit_allow = Signal() # input
+ self.multi_hit = Signal() # output
+ self.prot = Signal(self.params.N_SLICES) # output
+ self.hit = Signal(self.params.N_SLICES) # output
+ self.cache_coherent = Signal() # output
+ self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS) # output
+
+ def elaborate(self, platform=None):
+ m = Module()
+
+ first_hit = Signal()
+
+ for i in range(self.params.N_SLICES):
+ # TODO pass params / core config here
+ u_slice = rab_slice.rab_slice(self.params)
+ setattr(m.submodules, "u_slice%d" % i, u_slice)
+ # TODO set param and connect ports
+
+ # In case of a multi hit, the lowest slice with a hit is selected.
+ # TODO always_comb begin : HIT_CHECK
+ m.d.comb += [
+ first_hit.eq(0),
+ self.multi_hit.eq(0),
+ self.out_addr.eq(0),
+ self.cache_coherent.eq(0)]
+
+ for j in range(self.params.N_SLICES):
+ with m.If(self.hit[j] == 1):
+ with m.If(first_hit == 1):
+ with m.If(self.multi_hit_allow == 0):
+ m.d.comb += [self.multi_hit.eq(1)]
+ with m.Elif(first_hit == 1):
+ m.d.comb += [first_hit.eq(1)
+ # only output first slice that was hit
+ # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]),
+ # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]),
+ ]
+ return m
+
+ # TODO translate generate statement
+
+
+"""
+ logic [ADDR_WIDTH_PHYS*N_SLICES-1:0] slice_out_addr;
+
+ generate
+ for ( i=0; i<N_SLICES; i++ )
+ begin
+ rab_slice
+ #(
+ .ADDR_WIDTH_PHYS ( ADDR_WIDTH_PHYS ),
+ .ADDR_WIDTH_VIRT ( ADDR_WIDTH_VIRT )
+ )
+ u_slice
+ (
+ .cfg_min ( int_cfg_regs[4*i] [ADDR_WIDTH_VIRT-1:0] ),
+ .cfg_max ( int_cfg_regs[4*i+1][ADDR_WIDTH_VIRT-1:0] ),
+ .cfg_offset ( int_cfg_regs[4*i+2][ADDR_WIDTH_PHYS-1:0] ),
+ .cfg_wen ( int_cfg_regs[4*i+3][2] ),
+ .cfg_ren ( int_cfg_regs[4*i+3][1] ),
+ .cfg_en ( int_cfg_regs[4*i+3][0] ),
+ .in_trans_type ( int_rw ),
+ .in_addr_min ( int_addr_min ),
+ .in_addr_max ( int_addr_max ),
+ .out_addr ( slice_out_addr[ADDR_WIDTH_PHYS*i+ADDR_WIDTH_PHYS-1:ADDR_WIDTH_PHYS*i] ),
+ .out_prot ( prot[i] ),
+ .out_hit ( hit[i] )
+ );
+ end
+ endgenerate
+
+ // In case of a multi hit, the lowest slice with a hit is selected.
+ always_comb begin : HIT_CHECK
+ first_hit = 0;
+ multi_hit = 0;
+ out_addr = '0;
+ cache_coherent = 0;
+ for (j = 0; j < N_SLICES; j++) begin
+ if (hit[j] == 1'b1) begin
+ if (first_hit == 1'b1) begin
+ if (multi_hit_allow == 1'b0) begin
+ multi_hit = 1'b1;
+ end
+ end else begin
+ first_hit = 1'b1;
+ out_addr = slice_out_addr[ADDR_WIDTH_PHYS*j +: ADDR_WIDTH_PHYS];
+ cache_coherent = int_cfg_regs[4*j+3][3];
+ end
+ end
+ end
+ end
+"""
+
+# sv 2 migen: TODO add translate code for generate statements and for loops inside always_comb
--- /dev/null
+from ram_tp_write_first import ram_tp_write_first
+from nmigen.compat.sim import run_simulation
+import sys
+sys.path.append("../")
+
+
+def tbench(dut):
+ yield dut.we.eq(1)
+ for i in range(0, 255):
+ yield dut.addr0.eq(i)
+ yield dut.d_i.eq(i)
+ yield
+
+
+if __name__ == "__main__":
+ dut = ram_tp_write_first()
+ run_simulation(dut, tbench(dut), vcd_name="ram_tp_write_first.vcd")
+ print("ram_tp_write_first Unit Test Success")
--- /dev/null
+from nmigen.compat.sim import run_simulation
+import sys
+sys.path.append("../")
+# sys.path.append("../../../TestUtil")
+from slice_top import slice_top
+
+def tbench(dut):
+ yield
+
+
+if __name__ == "__main__":
+ dut = slice_top()
+ run_simulation(dut, tbench(dut), vcd_name="test_slice_top.vcd")
+ print("slice_top Unit Test Success")
--- /dev/null
+from soc.decoder.power_enums import (Function, Form, InternalOp,
+ In1Sel, In2Sel, In3Sel, OutSel,
+ RC, LdstLen, CryIn, get_csv,
+ single_bit_flags,
+ get_signal_name, default_values)
+import math
+
+
+class MemorySim:
+ def __init__(self, bytes_per_word=8):
+ self.mem = {}
+ self.bytes_per_word = bytes_per_word
+ self.word_log2 = math.ceil(math.log2(bytes_per_word))
+
+ def _get_shifter_mask(self, width, remainder):
+ shifter = ((self.bytes_per_word - width) - remainder) * \
+ 8 # bits per byte
+ mask = (1 << (width * 8)) - 1
+ return shifter, mask
+
+ # TODO: Implement ld/st of lesser width
+ def ld(self, address, width=8):
+ remainder = address & (self.bytes_per_word - 1)
+ address = address >> self.word_log2
+ assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
+ if address in self.mem:
+ val = self.mem[address]
+ else:
+ val = 0
+
+ if width != self.bytes_per_word:
+ shifter, mask = self._get_shifter_mask(width, remainder)
+ val = val & (mask << shifter)
+ val >>= shifter
+ print("Read {:x} from addr {:x}".format(val, address))
+ return val
+
+ def st(self, address, value, width=8):
+ remainder = address & (self.bytes_per_word - 1)
+ address = address >> self.word_log2
+ assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
+ print("Writing {:x} to addr {:x}".format(value, address))
+ if width != self.bytes_per_word:
+ if address in self.mem:
+ val = self.mem[address]
+ else:
+ val = 0
+ shifter, mask = self._get_shifter_mask(width, remainder)
+ val &= ~(mask << shifter)
+ val |= value << shifter
+ self.mem[address] = val
+ else:
+ self.mem[address] = value
+
+
+class RegFile:
+ def __init__(self):
+ self.regfile = [0] * 32
+ self.sprs = {}
+
+ def write_reg(self, regnum, value):
+ all1s = (1 << 64)-1 # 64 bits worth of 1s
+ value &= all1s
+ print("Writing {:x} to reg r{}".format(value, regnum))
+ self.regfile[regnum] = value
+
+ def read_reg(self, regnum):
+ val = self.regfile[regnum]
+ print("Read {:x} from reg r{}".format(val, regnum))
+ return val
+
+ def assert_gpr(self, gpr, val):
+ reg_val = self.read_reg(gpr)
+ msg = "reg r{} got {:x}, expecting {:x}".format(
+ gpr, reg_val, val)
+ assert reg_val == val, msg
+
+ def assert_gprs(self, gprs):
+ for k, v in list(gprs.items()):
+ self.assert_gpr(k, v)
+
+ def set_xer(self, result, operanda, operandb):
+ xer = 0
+ if result & 1 << 64:
+ xer |= XER.CA
+
+ self.xer = xer
+
+
+class InternalOpSimulator:
+ def __init__(self):
+ self.mem_sim = MemorySim()
+ self.regfile = RegFile()
+
+ def execute_alu_op(self, op1, op2, internal_op, carry=0):
+ print(internal_op)
+ if internal_op == InternalOp.OP_ADD.value:
+ return op1 + op2 + carry
+ elif internal_op == InternalOp.OP_AND.value:
+ return op1 & op2
+ elif internal_op == InternalOp.OP_OR.value:
+ return op1 | op2
+ elif internal_op == InternalOp.OP_MUL_L64.value:
+ return op1 * op2
+ else:
+ assert False, "Not implemented"
+
+ def update_cr0(self, result):
+ if result == 0:
+ self.cr0 = 0b001
+ elif result >> 63:
+ self.cr0 = 0b100
+ else:
+ self.cr0 = 0b010
+ print("update_cr0", self.cr0)
+
+ def alu_op(self, pdecode2):
+ all1s = (1 << 64)-1 # 64 bits worth of 1s
+ internal_op = yield pdecode2.dec.op.internal_op
+ operand1 = 0
+ operand2 = 0
+ result = 0
+ carry = 0
+ r1_ok = yield pdecode2.e.read_reg1.ok
+ r2_ok = yield pdecode2.e.read_reg2.ok
+ r3_ok = yield pdecode2.e.read_reg3.ok
+ imm_ok = yield pdecode2.e.imm_data.ok
+ if r1_ok:
+ r1_sel = yield pdecode2.e.read_reg1.data
+ operand1 = self.regfile.read_reg(r1_sel)
+ elif r3_ok:
+ r3_sel = yield pdecode2.e.read_reg3.data
+ operand1 = self.regfile.read_reg(r3_sel)
+ if r2_ok:
+ r2_sel = yield pdecode2.e.read_reg2.data
+ operand2 = self.regfile.read_reg(r2_sel)
+ if imm_ok:
+ operand2 = yield pdecode2.e.imm_data.data
+
+ inv_a = yield pdecode2.dec.op.inv_a
+ if inv_a:
+ operand1 = (~operand1) & all1s
+
+ cry_in = yield pdecode2.dec.op.cry_in
+ if cry_in == CryIn.ONE.value:
+ carry = 1
+ elif cry_in == CryIn.CA.value:
+ carry = self.carry_out
+
+ # TODO rc_sel = yield pdecode2.dec.op.rc_sel
+ result = self.execute_alu_op(operand1, operand2, internal_op,
+ carry=carry)
+
+ cry_out = yield pdecode2.dec.op.cry_out
+ rc = yield pdecode2.e.rc.data
+
+ if rc:
+ self.update_cr0(result)
+ if cry_out == 1:
+ self.carry_out = (result >> 64)
+ print("setting carry_out", self.carry_out)
+
+ ro_ok = yield pdecode2.e.write_reg.ok
+ if ro_ok:
+ ro_sel = yield pdecode2.e.write_reg.data
+ self.regfile.write_reg(ro_sel, result)
+
+ def mem_op(self, pdecode2):
+ internal_op = yield pdecode2.dec.op.internal_op
+ addr_reg = yield pdecode2.e.read_reg1.data
+ addr = self.regfile.read_reg(addr_reg)
+
+ imm_ok = yield pdecode2.e.imm_data.ok
+ r2_ok = yield pdecode2.e.read_reg2.ok
+ width = yield pdecode2.e.data_len
+ if imm_ok:
+ imm = yield pdecode2.e.imm_data.data
+ addr += imm
+ elif r2_ok:
+ r2_sel = yield pdecode2.e.read_reg2.data
+ addr += self.regfile.read_reg(r2_sel)
+ if internal_op == InternalOp.OP_STORE.value:
+ val_reg = yield pdecode2.e.read_reg3.data
+ val = self.regfile.read_reg(val_reg)
+ self.mem_sim.st(addr, val, width)
+ elif internal_op == InternalOp.OP_LOAD.value:
+ dest_reg = yield pdecode2.e.write_reg.data
+ val = self.mem_sim.ld(addr, width)
+ self.regfile.write_reg(dest_reg, val)
+
+ def execute_op(self, pdecode2):
+ function = yield pdecode2.dec.op.function_unit
+ if function == Function.ALU.value:
+ yield from self.alu_op(pdecode2)
+ elif function == Function.LDST.value:
+ yield from self.mem_op(pdecode2)