This adds an exception to holding o_valid low, when the ALU is idle.
If a write to the ALU just occurred, allow o_valid to become high, in
the same cycle.
*.il
**/*.gtkw
.eggs
-
+formal_test_temp
.vscode/*
build
gen
.noseids
nosetests.xml
+test-out
variables:
PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
GIT_SUBMODULE_STRATEGY: recursive
+ GIT_DEPTH: "500"
build:
stage: build
- apt-get -o dir::cache::archives="$(pwd)/apt-cache" update
- >-
apt-get -o dir::cache::archives="$(pwd)/apt-cache" -y install
- build-essential git python3-dev python3-pip
- python3-setuptools python3-wheel pkg-config tcl-dev
- libreadline-dev bison flex libffi-dev ccache python3-venv
- binutils-powerpc64-linux-gnu binutils-powerpc64le-linux-gnu
- autoconf gperf libgmp-dev libmpfr-dev libssl-dev curl
+ build-essential
+ git
+ python3-dev
+ python3-pip
+ python3-setuptools
+ python3-setuptools-scm
+ python3-wheel
+ pkg-config
+ tcl-dev
+ libreadline-dev
+ bison
+ flex
+ libffi-dev
+ ccache
+ python3-venv
+ binutils-powerpc64-linux-gnu
+ binutils-powerpc64le-linux-gnu
+ autoconf
+ gperf
+ libgmp-dev
+ libmpfr-dev
+ libssl-dev
+ curl
- export PATH="/usr/lib/ccache:$PATH"
- export CCACHE_BASEDIR="$PWD"
- export CCACHE_DIR="$PWD/ccache"
- ccache --show-stats || true
- curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
- source $HOME/.cargo/env
- after_script:
- - export CCACHE_DIR="$PWD/ccache"
- - ccache --show-stats
script:
- - python3 -m venv .env
+ - python3 -m venv --system-site-packages .env
- . .env/bin/activate
- - pip install nose
+ - pip install pytest-xdist==3.3.1 pytest==7.3.1
+
+ - git clone --depth 1 -b v0.1.1 https://github.com/cocotb/cocotb-bus.git cocotb-bus
+ - pushd cocotb-bus
+ - pip install . --no-deps
+ - popd
+
+ - git clone --depth 1 -b v1.5.2 https://github.com/cocotb/cocotb.git cocotb
+ - pushd cocotb
+ - pip install .
+ - popd
+
+ - git clone --depth 1 https://git.libre-soc.org/git/pytest-output-to-files.git pytest-output-to-files
+ - pushd pytest-output-to-files
+ - git rev-parse HEAD
+ - python3 setup.py develop
+ - popd
- - git clone --depth 1 https://github.com/SRI-CSL/yices2.git yices2
+ - git clone --depth 1 -b Yices-2.6.4 https://github.com/SRI-CSL/yices2.git yices2
- pushd yices2
- autoconf
- ./configure
- - make -j$(nproc) > /dev/null
+ - make -j$(nproc)
- make install
- popd
- - git clone --depth 1 https://github.com/YosysHQ/yosys.git yosys
+ - git clone --depth 1 -b yosys-0.17 https://github.com/YosysHQ/yosys.git yosys
- pushd yosys
- make config-gcc
- - make -j$(nproc) > /dev/null
+ - make -j$(nproc)
- make install
- popd
- yosys -V
- - git clone --depth 1 https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
+ - git clone https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
- pushd SymbiYosys
- - make install > /dev/null
+ - git checkout d10e472edf4ea9be3aa6347b264ba575fbea933a
+ - make install
- popd
- - git clone --depth 1 https://github.com/nmigen/nmigen.git nmigen
+ - git clone --depth 1 https://gitlab.com/nmigen/nmigen.git nmigen
- pushd nmigen
- - python setup.py develop
+ - git rev-parse HEAD
+ - python3 setup.py develop
+ - popd
+
+ - git clone --depth 1 https://git.libre-soc.org/git/mdis.git mdis
+ - pushd mdis
+ - git rev-parse HEAD
+ - python3 setup.py develop
- popd
- git clone --depth 1 https://git.libre-soc.org/git/nmutil.git nmutil
- pushd nmutil
- - python setup.py develop
+ - git rev-parse HEAD
+ - python3 setup.py develop
- popd
- git clone --depth 1 https://git.libre-soc.org/git/nmigen-soc.git nmigen-soc
- git clone --depth 1 https://git.libre-soc.org/git/openpower-isa.git openpower-isa
- pushd openpower-isa
- python3 setup.py develop
- - make -j$(nproc) svanalysis > /dev/null
- - make -j$(nproc) pyfnwriter > /dev/null 2>&1
- - make -j$(nproc) pywriter > /dev/null 2>&1
+ - if ! out="$(make 2>&1)"; then echo "$out"; exit 1; fi
- popd
- git clone --depth 1 https://git.libre-soc.org/git/c4m-jtag.git c4m-jtag
- popd
- IEEE754FPU_PATH="$(pwd)"/ieee754fpu
- - git clone --depth 1 --recursive https://github.com/billzorn/sfpy.git sfpy
+ - git clone --depth 1 --recursive -b v0.6.0 https://github.com/billzorn/sfpy.git sfpy
- pushd sfpy
+ - git apply "$IEEE754FPU_PATH"/sfpy.patch
- pushd berkeley-softfloat-3
- git apply "$IEEE754FPU_PATH"/berkeley-softfloat.patch
- popd
- git apply ../softposit_sfpy_build.patch
- git apply "$IEEE754FPU_PATH"/SoftPosit.patch
- popd
- - pip install --upgrade -r requirements.txt
+ - pip install -r requirements.txt
- make lib -j$(nproc)
- make cython -j$(nproc)
- make wheel -j$(nproc)
- - pip install dist/sfpy*.whl
+ - pip install --force-reinstall dist/sfpy*.whl
- popd
- python3 -m pip install 'maturin>=0.11,<0.12'
- popd
- python setup.py develop
- - nosetests -v --processes=-1 --process-timeout=120 -w src/
+ - SILENCELOG='!*,default' pytest -v --maxfail=20
--enable-xics --enable-sram4x4kblock --disable-svp64 \
src/soc/litex/florent/libresoc/libresoc.v
+# build microwatt "external core", note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat --enable-mmu \
+ external_core_top.v
+
+# build microwatt "external core" with fixed 64-bit width SVP64
+# note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core_svp64:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat-svp64 --enable-mmu \
+ external_core_top.v
+
+microwatt_external_core_spi:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+ --small-cache \
+ --enable-mmu \
+ --pc-reset 0x10000000 \
+ external_core_top.v
+
+# microwatt-compatible core with smaller cache size (quick. VERSA_ECP5. just)
+microwatt_external_core_bram:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+ --small-cache \
+ --enable-mmu \
+ --pc-reset 0xFF000000 \
+ external_core_top.v
+
+# microwatt-compatible core with larger cache size (experiment on arty)
+microwatt_external_core_bram_arty:
+ python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+ --enable-mmu \
+ --pc-reset 0xFF000000 \
+ external_core_top.v
+
# build the litex libresoc SoC without 4k SRAMs
ls180_verilog_build: ls180_verilog
make -C soc/soc/litex/florent ls180
'sphinx.ext.coverage',
'recommonmark',
#'symbolator_sphinx',
- 'sphinxcontrib_verilog_diagrams',
+ #'sphinxcontrib_verilog_diagrams', # XXX now spinxcontrib-hdl-diagrams
'sphinx_rtd_theme',
#'sphinx_tabs.tabs',
]
#!/bin/sh
cd pinmux
python2 src/pinmux_generator.py -v -s ls180 -o ls180
-python2 src/pinmux_generator.py -v -s ngi_router -o ngi_router
+# temporary - return to older version of pinmux
+#python2 src/pinmux_generator.py -v -s ngi_router -o ngi_router
-Subproject commit 20ca612b2600530ce901009b3d1b9ef0e05b7438
+Subproject commit 7cbf0e2a54448f549243cd602ebafd10de8d32f0
--- /dev/null
+[tool.pytest.ini_options]
+minversion = "6.0"
+python_classes = ""
+python_functions = ""
+testpaths = ["src/soc"]
+required_plugins = ["pytest-xdist>=1.0.0", "pytest-output-to-files>=0.1.0"]
+addopts = [
+ "-n",
+ "auto",
+ "--shorten-output-dir=test-out",
+]
version = '0.0.1'
+# the only reason this is added is because it's become a part of python 3.9.
+# the project standard is python 3.7 however in future that will be updated.
+# for now, cached_property is RELUCTANTLY added but a *copy* is added so
+# that the generation of HDL is not critically dependent on random crap
+# off the internet. you're spending USD 16 *MILLION* on masks, you better
+# be absolutely paranoid-level certain you know where every piece of the
+# chain creating the HDL comes from.
+cprop = "git+https://git.libre-soc.org/git/cached-property.git@1.5.2" \
+ "#egg=cached-property-1.5.2"
+
# using pip3 for ongoing development is a royal pain. seriously not
# recommended. therefore a number of these dependencies have been
# commented out. *they are still required* - they will need installing
# manually.
+# XXX UNDER NO CIRCUMSTANCES ADD ARBITRARY DEPENDENCIES HERE. XXX
+# as this is HDL, not software, every dependency added is
+# a serious maintenance and reproducible-build problem.
+# dropping USD 16 million on 7nm Mask Charges when the
+# HDL can be compromised - accidentally or deliberately -
+# by pip3 going out and randomly downloading complete
+# shite is not going to do anyone any favours.
+
+# TODO: make *all* of these be from libre-soc git repo only
+# (which means updating the nmigen-soc one to mirror gitlab)
+
install_requires = [
# 'sfpy', # needs manual patching
'libresoc-ieee754fpu', # uploaded (successfully, whew) to pip
'libresoc-openpower-isa', # uploaded (successfully, whew) to pip
# 'nmigen-soc', # install manually from git.libre-soc.org
+
+ # git url needed for having `pip3 install -e .` install from libre-soc git
+ "cached-property@"+cprop,
+]
+
+# git url needed for having `setup.py develop` install from libre-soc git
+dependency_links = [
+ cprop,
]
test_requires = [
long_description_content_type='text/markdown',
classifiers=[
"Topic :: Software Development",
- "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
+ "License :: OSI Approved :: " \
+ "GNU Lesser General Public License v3 or later (LGPLv3+)",
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
],
include_package_data=True,
zip_safe=False,
install_requires=install_requires,
+ dependency_links=dependency_links,
tests_require=test_requires,
test_suite='nose.collector',
)
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the external_core_top.v verilog module
+# which allows for faster development iteration (oh and microwatt or
+# other core to be dropped into a peripheral fabric)
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Const)
+from nmigen.cli import rtlil, verilog
+
+from soc.debug.dmi import DMIInterface
+from nmigen_soc.wishbone.bus import Interface
+import os
+
+__all__ = ["ExternalCore"]
+
+
+class ExternalCore(Elaboratable):
+ """External Core verilog wrapper for microwatt and libre-soc
+ (actually, anything prepared to map to the Signals defined below)
+ remember to call ExternalCore.add_verilog_source
+ """
+
+ def __init__(self, ibus=None, dbus=None, features=None, name=None):
+
+ # set up the icache wishbone bus
+ if features is None:
+ features = frozenset(("stall",))
+ if ibus is None:
+ ibus = Interface(addr_width=32,
+ data_width=64,
+ features=features,
+ granularity=8,
+ name="core_ibus")
+ if dbus is None:
+ dbus = Interface(addr_width=32,
+ data_width=64,
+ features=features,
+ granularity=8,
+ name="core_dbus")
+ self.dmi = DMIInterface(name="dmi")
+ self.ibus = ibus
+ self.dbus = dbus
+
+ assert len(self.ibus.dat_r) == 64, "bus width must be 64"
+ assert len(self.dbus.dat_r) == 64, "bus width must be 64"
+
+ # IRQ for data buffer receive/xmit
+ self.irq = Signal()
+
+ # debug monitoring signals
+ self.nia = Signal(64)
+ self.nia_req = Signal()
+ self.msr = Signal(64)
+ self.ldst_addr = Signal(64)
+ self.ldst_req = Signal()
+
+ # alternative reset and termination indicator
+ self.alt_reset = Signal()
+ self.terminated_o = Signal()
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['external_core_top.v',
+ ]:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+
+ # create definition of external core here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ ibus, dbus, dmi = self.ibus, self.dbus, self.dmi
+
+ # sigh, microwatt wishbone address is borked, it contains the 3 LSBs
+ ibus_adr = Signal(32)
+ dbus_adr = Signal(32)
+ m.d.comb += ibus.adr.eq(ibus_adr[3:])
+ m.d.comb += dbus.adr.eq(dbus_adr[3:])
+
+ kwargs = {
+ # clock/reset signals
+ 'i_clk': ClockSignal(),
+ 'i_rst': ResetSignal(),
+ # DMI interface
+ 'i_dmi_addr': dmi.addr_i,
+ 'i_dmi_req': dmi.req_i,
+ 'i_dmi_wr': dmi.we_i,
+ 'i_dmi_din': dmi.din,
+ 'o_dmi_dout': dmi.dout,
+ 'o_dmi_ack': dmi.ack_o,
+ # debug/monitor signals
+ 'o_nia': self.nia,
+ 'o_nia_req': self.nia_req,
+ 'o_msr_o': self.msr,
+ 'o_ldst_addr': self.ldst_addr,
+ 'o_ldst_req': self.ldst_req,
+ 'i_alt_reset': self.alt_reset,
+ 'o_terminated_out': self.terminated_o,
+ # wishbone instruction bus
+ 'o_wishbone_insn_out.adr': ibus_adr,
+ 'o_wishbone_insn_out.dat': ibus.dat_w,
+ 'o_wishbone_insn_out.sel': ibus.sel,
+ 'o_wishbone_insn_out.cyc': ibus.cyc,
+ 'o_wishbone_insn_out.stb': ibus.stb,
+ 'o_wishbone_insn_out.we': ibus.we,
+ 'i_wishbone_insn_in.dat': ibus.dat_r,
+ 'i_wishbone_insn_in.ack': ibus.ack,
+ 'i_wishbone_insn_in.stall': ibus.stall,
+ # wishbone data bus
+ 'o_wishbone_data_out.adr': dbus_adr,
+ 'o_wishbone_data_out.dat': dbus.dat_w,
+ 'o_wishbone_data_out.sel': dbus.sel,
+ 'o_wishbone_data_out.cyc': dbus.cyc,
+ 'o_wishbone_data_out.stb': dbus.stb,
+ 'o_wishbone_data_out.we': dbus.we,
+ 'i_wishbone_data_in.dat': dbus.dat_r,
+ 'i_wishbone_data_in.ack': dbus.ack,
+ 'i_wishbone_data_in.stall': dbus.stall,
+ # external interrupt request
+ 'i_ext_irq': self.irq,
+ }
+ core = Instance("external_core_top", **kwargs)
+ m.submodules['core_top'] = core
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ core = ExternalCore(name="core")
+ create_ilang(core, [
+ core.ibus.cyc, core.ibus.stb, core.ibus.ack,
+ core.ibus.dat_r, core.ibus.dat_w, core.ibus.adr,
+ core.ibus.we, core.ibus.sel, core.ibus.stall,
+ core.dbus.cyc, core.dbus.stb, core.dbus.ack,
+ core.dbus.dat_r, core.dbus.dat_w, core.dbus.adr,
+ core.dbus.we, core.dbus.sel,
+ core.irq, core.alt_reset, core.terminated_o,
+ core.msr, core.nia, core.nia_req,
+ core.ldst_addr, core.ldst_req,
+ core.dmi.addr_i, core.dmi.req_i, core.dmi.we_i,
+ core.dmi.din, core.dmi.dout, core.dmi.ack_o,
+ ], "core_0")
+
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog 10/100 MAC
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["EthMAC"]
+
+
+class EthMAC(Elaboratable):
+ """Ethernet MAC from opencores, nmigen wrapper.
+ remember to call EthMAC.add_verilog_source
+ """
+
+ def __init__(self, master_bus=None, slave_bus=None, name=None,
+ irq=None, pins=None):
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "eth_0"
+ self.granularity = 8
+ self.data_width = 32
+ self.dsize = log2_int(self.data_width//self.granularity)
+
+ # set up the wishbone busses
+ features = frozenset()
+ if master_bus is None:
+ master_bus = Interface(addr_width=30,
+ data_width=32,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_0" % self.idx)
+ if slave_bus is None:
+ slave_bus = Interface(addr_width=12,
+ data_width=32,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_1" % self.idx)
+ self.master_bus = master_bus
+ self.slave_bus = slave_bus
+ if irq is None:
+ irq = Signal()
+ self.irq = irq
+
+ slave_mmap = MemoryMap(addr_width=12+self.dsize,
+ data_width=self.granularity)
+
+ self.slave_bus.memory_map = slave_mmap
+
+ # RMII TX signals
+ self.mtx_clk = Signal()
+ self.mtxd = Signal(4)
+ self.mtxen = Signal()
+ self.mtxerr = Signal()
+
+ # RMII RX signals
+ self.mrx_clk = Signal()
+ self.mrxd = Signal(4)
+ self.mrxdv = Signal()
+ self.mrxerr = Signal()
+
+ # RMII common signals
+ self.mcoll = Signal()
+ self.mcrs = Signal()
+
+ # RMII management interface signals
+ self.mdc = Signal()
+ self.md_in = Signal()
+ self.md_out = Signal()
+ self.md_direction = Signal()
+
+ # pins resource
+ self.pins = pins
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['eth_clockgen.v', 'eth_cop.v', 'eth_crc.v',
+ 'eth_fifo.v', 'eth_maccontrol.v', 'ethmac_defines.v',
+ 'eth_macstatus.v', 'ethmac.v', 'eth_miim.v',
+ 'eth_outputcontrol.v', 'eth_random.v',
+ 'eth_receivecontrol.v', 'eth_registers.v',
+ 'eth_register.v', 'eth_rxaddrcheck.v',
+ 'eth_rxcounters.v', 'eth_rxethmac.v',
+ 'eth_rxstatem.v', 'eth_shiftreg.v',
+ 'eth_spram_256x32.v', 'eth_top.v',
+ 'eth_transmitcontrol.v', 'eth_txcounters.v',
+ 'eth_txethmac.v', 'eth_txstatem.v', 'eth_wishbone.v',
+ 'timescale.v']:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+ idx = self.idx
+
+ # Calculate arbiter bus address
+ wb_master_bus_adr = Signal(32)
+ # arbiter address is in words, ethernet master address is in bytes
+ comb += self.master_bus.adr.eq(wb_master_bus_adr >> 2)
+
+ # create definition of external verilog EthMAC code here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ ethmac = Instance("eth_top",
+ # Clock/reset (use DomainRenamer if needed)
+ i_wb_clk_i=ClockSignal(),
+ i_wb_rst_i=ResetSignal(),
+
+ # Master Wishbone bus signals
+ o_m_wb_adr_o=wb_master_bus_adr,
+ i_m_wb_dat_i=self.master_bus.dat_r,
+ o_m_wb_sel_o=self.master_bus.sel,
+ o_m_wb_dat_o=self.master_bus.dat_w,
+ o_m_wb_we_o=self.master_bus.we,
+ o_m_wb_stb_o=self.master_bus.stb,
+ o_m_wb_cyc_o=self.master_bus.cyc,
+ i_m_wb_ack_i=self.master_bus.ack,
+
+ # Slave Wishbone bus signals
+ i_wb_adr_i=self.slave_bus.adr,
+ i_wb_dat_i=self.slave_bus.dat_w,
+ i_wb_sel_i=self.slave_bus.sel,
+ o_wb_dat_o=self.slave_bus.dat_r,
+ i_wb_we_i=self.slave_bus.we,
+ i_wb_stb_i=self.slave_bus.stb,
+ i_wb_cyc_i=self.slave_bus.cyc,
+ o_wb_ack_o=self.slave_bus.ack,
+
+ o_int_o=self.irq,
+
+ # RMII TX
+ i_mtx_clk_pad_i=self.mtx_clk,
+ o_mtxd_pad_o=self.mtxd,
+ o_mtxen_pad_o=self.mtxen,
+ o_mtxerr_pad_o=self.mtxerr,
+
+ # RMII RX
+ i_mrx_clk_pad_i=self.mrx_clk,
+ i_mrxd_pad_i=self.mrxd,
+ i_mrxdv_pad_i=self.mrxdv,
+ i_mrxerr_pad_i=self.mrxerr,
+
+ # RMII common
+ i_mcoll_pad_i=self.mcoll,
+ i_mcrs_pad_i=self.mcrs,
+
+ # Management Interface
+ o_mdc_pad_o=self.mdc,
+ i_md_pad_i=self.md_in,
+ o_md_pad_o=self.md_out,
+ o_md_padoe_o=self.md_direction
+ );
+
+ m.submodules['ethmac_%d' % self.idx] = ethmac
+
+ if self.pins is not None:
+ comb += self.mtx_clk.eq(self.pins.mtx_clk.i)
+ comb += self.pins.mtxd.o.eq(self.mtxd)
+ comb += self.pins.mtxen.o.eq(self.mtxen)
+ comb += self.pins.mtxerr.o.eq(self.mtxerr)
+
+ comb += self.mrx_clk.eq(self.pins.mrx_clk.i)
+ comb += self.mrxd.eq(self.pins.mrxd.i)
+ comb += self.mrxdv.eq(self.pins.mrxdv.i)
+ comb += self.mrxerr.eq(self.pins.mrxerr.i)
+ comb += self.mcoll.eq(self.pins.mcoll.i)
+ comb += self.mcrs.eq(self.pins.mcrs.i)
+
+ comb += self.pins.mdc.o.eq(self.mdc)
+
+ comb += self.pins.md.o.eq(self.md_out)
+ comb += self.pins.md.oe.eq(self.md_direction)
+ comb += self.md_in.eq(self.pins.md.i)
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+if __name__ == "__main__":
+ ethmac = EthMAC(name="eth_0")
+ create_ilang(ethmac, [ethmac.master_bus.cyc, ethmac.master_bus.stb,
+ ethmac.master_bus.ack, ethmac.master_bus.dat_r,
+ ethmac.master_bus.dat_w, ethmac.master_bus.adr,
+ ethmac.master_bus.we, ethmac.master_bus.sel,
+ ethmac.slave_bus.cyc, ethmac.slave_bus.stb,
+ ethmac.slave_bus.ack,
+ ethmac.slave_bus.dat_r, ethmac.slave_bus.dat_w,
+ ethmac.slave_bus.adr,
+ ethmac.slave_bus.we, ethmac.slave_bus.sel,
+ ethmac.mtx_clk, ethmac.mtxd, ethmac.mtxen,
+ ethmac.mtxerr, ethmac.mrx_clk, ethmac.mrxd,
+ ethmac.mrxdv, ethmac.mrxerr, ethmac.mcoll,
+ ethmac.mcrs, ethmac.mdc, ethmac.md_in,
+ ethmac.md_out, ethmac.md_direction
+ ], "eth_0")
+
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Record)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["SDRAM", "SDRAMConfig"]
+
+ """
+ class MT48LC16M16(SDRModule):
+ # geometry
+ nbanks = 4
+ nrows = 8192
+ ncols = 512
+ # timings
+ technology_timings = _TechnologyTimings(tREFI=64e6/8192,
+ tWTR=(2, None),
+ tCCD=(1, None),
+ tRRD=(None, 15))
+ speedgrade_timings = {"default": _SpeedgradeTimings(tRP=20,
+ tRCD=20,
+ tWR=15,
+ tRFC=(None, 66),
+ tFAW=None,
+ tRAS=44)}
+ # for MT48LC16M16-75 part
+ comb += self.cfg.sdr_en.eq(1)
+ comb += self.cfg.sdr_mode_reg.eq(0x033)
+ comb += self.cfg.req_depth.eq(3) # max
+ comb += self.cfg.sdr_tras_d.eq(44) # Active to precharge delay
+ comb += self.cfg.sdr_trp_d.eq(20) # Precharge to active delay
+ comb += self.cfg.sdr_trcd_d.eq(20) # Active to R/W delay
+ comb += self.cfg.sdr_cas.eq(3) # CAS latency
+ comb += self.cfg.sdr_trcar_d.eq(66) # tRFC auto-refresh period
+ comb += self.cfg.sdr_twr_d.eq(15) # clock + 7.5ns
+ comb += self.cfg.sdr_rfsh.eq(0x100)
+ comb += self.cfg.sdr_rfmax.eq(6)
+ """
+
+
+class SDRAMConfig(Record):
+ def __init__(self, refresh_timer_sz, refresh_row_count, name=None):
+ super().__init__(name=name, layout=[
+ # configuration parameters, these need to match the SDRAM IC datasheet
+ ('req_depth', 2), # max request accepted
+ ('sdr_en', 1), # Enable SDRAM controller
+ ('sdr_mode_reg', 13),
+ ('sdr_tras_d', 4), # Active to precharge delay
+ ('sdr_trp_d', 4), # Precharge to active delay
+ ('sdr_trcd_d', 4), # Active to R/W delay
+ ('sdr_cas', 3), # SDRAM CAS Latency
+ ('sdr_trcar_d', 4), # Auto-refresh period
+ ('sdr_twr_d', 4), # Write recovery delay
+ ('sdr_rfsh', refresh_timer_sz),
+ ('sdr_rfmax', refresh_row_count)
+ ])
+
+
+class SDRAM(Elaboratable):
+ """SDRAM controller from opencores, nmigen wrapper. remember to call
+ SDRAM.add_verilog_source.
+
+ * the SDRAM IC will be accessible over the Wishbone Bus
+ * sdr_* signals must be wired to the IC
+ * cfg parameters must match those listed in the SDRAM IC's datasheet
+ """
+
+ def __init__(self, bus=None, features=None, name=None,
+ data_width=32, addr_width=26,
+ sdr_data_width=16,
+ cfg=None,
+ pins=None):
+ if name is not None:
+ name = "sdram"
+ self.data_width = data_width
+ self.sdr_data_width = sdr_data_width
+ self.addr_width = addr_width
+ self.refresh_timer_sz = 12
+ self.refresh_row_count = 3
+
+ # set up the wishbone bus
+ if features is None:
+ features = frozenset({'cti'})
+ if bus is None:
+ bus = Interface(addr_width=addr_width,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name)
+ self.bus = bus
+ assert len(self.bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ byte_width = sdr_data_width // 8 # for individual byte masks/enables
+
+ # SDRAM signals
+ self.sdram_clk = Signal() # sdram phy clock
+ self.sdram_resetn = Signal(reset_less=True) # sdram reset (low)
+ self.sdr_cs_n = Signal() # chip select
+ self.sdr_cke = Signal() # clock-enable
+ self.sdr_ras_n = Signal() # read-address strobe
+ self.sdr_cas_n = Signal() # cas
+ self.sdr_we_n = Signal() # write-enable
+ self.sdr_dqm = Signal(byte_width) # data mask
+ self.sdr_ba = Signal(2) # bank enable
+ self.sdr_addr = Signal(13) # sdram address, 13 bits
+ # these combine to create a bi-direction inout, sdr_dq
+ # note, each bit of sdr_den_n covers a *byte* of sdr_din/sdr_dout
+ self.sdr_den_n = Signal(byte_width)
+ self.sdr_din = Signal(data_width)
+ self.sdr_dout = Signal(data_width)
+
+ # configuration parameters, these need to match the SDRAM IC datasheet
+ self.sdr_init_done = Signal() # Indicate SDRAM init Done
+ if cfg is None:
+ cfg = SDRAMConfig(self.refresh_timer_sz,
+ self.refresh_row_count, name="sdr_cfg")
+
+ # config and pins resource
+ self.pins = pins
+ self.cfg = cfg
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in [ './core/sdrc_bank_ctl.v', './core/sdrc_bank_fsm.v',
+ './core/sdrc_bs_convert.v', './core/sdrc_core.v',
+ './core/sdrc_req_gen.v', './core/sdrc_xfr_ctl.v',
+ './core/sdrc_define.v',
+ './lib/async_fifo.v', './lib/sync_fifo.v',
+ './top/sdrc_top.v', './wb2sdrc/wb2sdrc.v',
+ ]:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+
+ # create definition of external verilog 16550 uart here, so that # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ bus = self.bus
+
+ params = {
+ # clock/reset (use DomainRenamer if needed)
+ 'i_wb_clk_i' : ClockSignal(),
+ 'i_wb_rst_i' : ResetSignal(),
+
+ # wishbone bus signals
+ 'i_wb_adr_i' : bus.adr,
+ 'i_wb_dat_i' : bus.dat_w,
+ 'i_wb_sel_i' : bus.sel,
+ 'o_wb_dat_o' : bus.dat_r,
+ 'i_wb_we_i' : bus.we,
+ 'i_wb_stb_i' : bus.stb,
+ 'i_wb_cyc_i' : bus.cyc,
+ 'o_wb_ack_o' : bus.ack,
+
+ # SDRAM signals
+ 'i_sdram_clk' : self.sdram_clk,
+ 'i_sdram_resetn' : self.sdram_resetn,
+ 'o_sdr_cs_n' : self.sdr_cs_n,
+ 'o_sdr_cke' : self.sdr_cke,
+ 'o_sdr_ras_n' : self.sdr_ras_n,
+ 'o_sdr_cas_n' : self.sdr_cas_n,
+ 'o_sdr_we_n' : self.sdr_we_n,
+ 'o_sdr_dqm' : self.sdr_dqm,
+ 'o_sdr_ba' : self.sdr_ba,
+ 'o_sdr_addr' : self.sdr_addr,
+ 'o_sdr_den_n' : self.sdr_den_n,
+ 'i_sdr_din' : self.sdr_din,
+ 'o_sdr_dout' : self.sdr_dout,
+
+ # configuration parameters (from the SDRAM IC datasheet)
+ 'o_sdr_init_done' : self.sdr_init_done ,
+ 'i_cfg_req_depth' : self.cfg.req_depth ,
+ 'i_cfg_sdr_en' : self.cfg.sdr_en ,
+ 'i_cfg_sdr_mode_reg' : self.cfg.sdr_mode_reg ,
+ 'i_cfg_sdr_tras_d' : self.cfg.sdr_tras_d ,
+ 'i_cfg_sdr_trp_d' : self.cfg.sdr_trp_d ,
+ 'i_cfg_sdr_trcd_d' : self.cfg.sdr_trcd_d ,
+ 'i_cfg_sdr_cas' : self.cfg.sdr_cas ,
+ 'i_cfg_sdr_trcar_d' : self.cfg.sdr_trcar_d ,
+ 'i_cfg_sdr_twr_d' : self.cfg.sdr_twr_d ,
+ 'i_cfg_sdr_rfsh' : self.cfg.sdr_rfsh ,
+ 'i_cfg_sdr_rfmax' : self.cfg.sdr_rfmax,
+
+ # verilog parameters
+ 'p_APP_AW' : self.addr_width, # Application Address Width
+ 'p_APP_DW' : self.data_width, # Application Data Width
+ 'p_APP_BW' : self.addr_width//8, # Application Byte Width
+ 'p_APP_RW' : 9, # Application Request Width
+ 'p_SDR_DW' : self.sdr_data_width, # SDR Data Width
+ 'p_SDR_BW' : self.sdr_data_width//8, # SDR Byte Width
+ 'p_dw' : self.data_width, # data width
+ 'p_tw' : 8, # tag id width
+ 'p_bl' : 9, # burst_length_width
+ }
+ m.submodules['sdrc_top'] = Instance("sdrc_top", **params)
+
+ return m
+
+ if self.pins is not None:
+ comb += self.pins.tx.eq(self.tx_o)
+ comb += self.rx_i.eq(self.pins.rx)
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ sdram = SDRAM(name="sdram", data_width=8)
+ create_ilang(sdram, [sdram.bus.cyc, sdram.bus.stb, sdram.bus.ack,
+ sdram.bus.dat_r, sdram.bus.dat_w, sdram.bus.adr,
+ sdram.bus.we, sdram.bus.sel,
+ sdram.sdram_clk, sdram.sdram_resetn,
+ sdram.sdr_cs_n, sdram.sdr_cke,
+ sdram.sdr_ras_n, sdram.sdr_cas_n, sdram.sdr_we_n,
+ sdram.sdr_dqm, sdram.sdr_ba, sdram.sdr_addr,
+ sdram.sdr_den_n, sdram.sdr_din, sdram.sdr_dout,
+ sdram.sdr_init_done, sdram.cfg.req_depth,
+ sdram.cfg.sdr_en, sdram.cfg.sdr_mode_reg,
+ sdram.cfg.sdr_tras_d, sdram.cfg.sdr_trp_d,
+ sdram.cfg.sdr_trcd_d, sdram.cfg.sdr_cas,
+ sdram.cfg.sdr_trcar_d, sdram.cfg.sdr_twr_d,
+ sdram.cfg.sdr_rfsh, sdram.cfg.sdr_rfmax,
+ ], "sdram")
+
data_width=self.memory.width,
granularity=granularity,
features=features,
- alignment=0,
+ #alignment=0,
name=None)
self.bus = bus
self.granularity = bus.granularity
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a System Console peripheral compatible with microwatt
+# https://github.com/antonblanchard/microwatt/blob/master/syscon.vhdl
+
+from nmigen import (Elaboratable, Cat, Module, Signal)
+from nmigen.cli import rtlil, verilog
+
+from lambdasoc.periph import Peripheral
+
+__all__ = ["MicrowattSYSCON"]
+
+
+class MicrowattSYSCON(Peripheral, Elaboratable):
+ """Microwatt-compatible (Sys)tem (Con)figuration module
+ """
+
+ def __init__(self, *, sys_clk_freq=100e6,
+ core_clk_freq=100e6,
+ mem_clk_freq=100e6,
+ spi_offset=None,
+ dram_addr=None,
+ has_uart=True,
+ uart_is_16550=True
+ ):
+ super().__init__(name="syscon")
+ self.sys_clk_freq = sys_clk_freq
+ self.core_clk_freq = core_clk_freq
+ self.mem_clk_freq = mem_clk_freq
+ self.has_uart = has_uart
+ self.spi_offset = spi_offset
+ self.dram_addr = dram_addr
+ self.uart_is_16550 = uart_is_16550
+
+ # System control ports
+ self.dram_at_0 = Signal()
+ self.core_reset = Signal()
+ self.soc_reset = Signal()
+
+ # set up a CSR Bank and associated bridge. has to be in this order
+ # (declare bank, declare bridge) for some unknown reason.
+ # (r)ead regs will have a r_stb and r_data Record entry
+ # (w)rite regs will have a w_stb and w_data Record entry
+ bank = self.csr_bank()
+ self._reg_sig_r = bank.csr(64, "r") # signature
+ self._reg_info_r = bank.csr(64, "r") # info
+ self._bram_info_r = bank.csr(64, "r") # bram info
+ self._dram_info_r = bank.csr(64, "r") # dram info
+ self._clk_info_r = bank.csr(64, "r") # nest clock frequency
+ self._ctrl_info_r = bank.csr(64, "rw") # control info
+ self._dram_init_r = bank.csr(64, "r") # dram initialisation info
+ self._spiflash_info_r = bank.csr(64, "r") # spi flash info
+ self._uart0_info_r = bank.csr(64, "r") # UART0 info (baud etc.)
+ self._uart1_info_r = bank.csr(64, "r") # UART1 info (baud etc.)
+ self._bram_bootaddr_r = bank.csr(64, "r") # BRAM boot address
+ self._core_clk_info_r = bank.csr(64, "r") # core clock frequency
+ self._mem_clk_info_r = bank.csr(64, "r") # memory clock frequency
+
+ # bridge the above-created CSRs over wishbone. ordering and size
+ # above mattered, the bridge automatically packs them together
+ # as memory-addressable "things" for us
+ self._bridge = self.bridge(data_width=32, granularity=8, alignment=3)
+ self.bus = self._bridge.bus
+
+ def elaborate(self, platform):
+ m = Module()
+ comb, sync = m.d.comb, m.d.comb
+ m.submodules.bridge = self._bridge
+
+ # enter data into the CSRs. r_data can be left live all the time,
+ # w_data obviously has to be set only when w_stb triggers.
+
+ # identifying signature
+ comb += self._reg_sig_r.r_data.eq(0xf00daa5500010001)
+
+ # nest clock rate (hz)
+ comb += self._clk_info_r.r_data.eq(int(self.sys_clk_freq)) # in hz
+
+ # core clock rate (hz)
+ comb += self._core_clk_info_r.r_data.eq(int(self.core_clk_freq)) # in hz
+
+ # memory clock rate (hz)
+ comb += self._mem_clk_info_r.r_data.eq(int(self.mem_clk_freq)) # in hz
+
+ # detect peripherals
+ has_spi = self.spi_offset is not None
+ has_dram = self.dram_addr is not None
+
+ # uart peripheral clock rate, currently assumed to be system clock
+ # 0 ..31 : UART clock freq (in HZ)
+ # 32 : UART is 16550 (otherwise pp)
+ comb += self._uart0_info_r.r_data[0:32].eq(int(self.sys_clk_freq))
+ comb += self._uart0_info_r.r_data[32].eq(1)
+
+ # Reg Info, defines what peripherals and characteristics are present
+ comb += self._reg_info_r.r_data[0].eq(self.has_uart) # has UART0
+ comb += self._reg_info_r.r_data[1].eq(has_dram) # has DDR DRAM
+ comb += self._reg_info_r.r_data[3].eq(has_spi) # has SPI Flash
+ comb += self._reg_info_r.r_data[5].eq(1) # Large SYSCON
+
+ # system control
+ sysctrl = Cat(self.dram_at_0, self.core_reset, self.soc_reset)
+ with m.If(self._ctrl_info_r.w_stb):
+ sync += sysctrl.eq(self._ctrl_info_r.w_data)
+ comb += self._ctrl_info_r.r_data.eq(sysctrl)
+
+ # SPI Flash Address
+ comb += self._spiflash_info_r.r_data.eq(self.spi_offset or 0)
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ from nmigen_soc import wishbone
+ class QuickDemo(Elaboratable):
+ def elaborate(self, platform):
+ m = Module()
+ arbiter = wishbone.Arbiter(addr_width=30, data_width=32,
+ granularity=8)
+ decoder = wishbone.Decoder(addr_width=30, data_width=32,
+ granularity=8)
+ m.submodules.syscon = syscon = MicrowattSYSCON()
+ m.submodules.decoder = decoder
+ m.submodules.arbiter = arbiter
+ decoder.add(syscon.bus, addr=0xc0000000)
+ m.d.comb += arbiter.bus.connect(decoder.bus)
+ return m
+ m = QuickDemo()
+ create_ilang(m, None, "syscondemo")
+
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog tercel module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["Tercel"]
+
+
+class Tercel(Elaboratable):
+ """Tercel SPI controller from Raptor Engineering, nmigen wrapper.
+ remember to call Tercel.add_verilog_source
+ """
+
+ def __init__(self, bus=None, cfg_bus=None, features=None, name=None,
+ data_width=32, spi_region_addr_width=28, pins=None,
+ clk_freq=None,
+ lattice_ecp5_usrmclk=False,
+ adr_offset=0): # address offset (bytes)
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "spi_0"
+ self.granularity = 8
+ self.data_width = data_width
+ self.dsize = log2_int(self.data_width//self.granularity)
+ self.adr_offset = adr_offset
+ self.lattice_ecp5_usrmclk = lattice_ecp5_usrmclk
+
+ # TODO, sort this out.
+ assert clk_freq is not None
+ clk_freq = round(clk_freq)
+ self.clk_freq = Const(clk_freq, 32) #clk_freq.bit_length())
+
+ # set up the wishbone busses
+ if features is None:
+ #features = frozenset({'err'}) # sigh
+ features = frozenset()
+ if bus is None:
+ bus = Interface(addr_width=spi_region_addr_width,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_0" % self.idx)
+ if cfg_bus is None:
+ cfg_bus = Interface(addr_width=6,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d_1" % self.idx)
+ self.bus = bus
+ assert len(self.bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+ self.cfg_bus = cfg_bus
+ assert len(self.cfg_bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ mmap = MemoryMap(addr_width=spi_region_addr_width+self.dsize,
+ data_width=self.granularity)
+ cfg_mmap = MemoryMap(addr_width=6+self.dsize,
+ data_width=self.granularity)
+
+ self.bus.memory_map = mmap
+ self.cfg_bus.memory_map = cfg_mmap
+
+ # QSPI signals
+ self.dq_out = Signal(4) # Data
+ self.dq_direction = Signal(4)
+ self.dq_in = Signal(4)
+ self.cs_n_out = Signal() # Slave select
+ self.spi_clk = Signal() # Clock
+ self.dbg_port = Signal(8) # debug info
+
+ # pins resource
+ self.pins = pins
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['wishbone_spi_master.v', 'phy.v']:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+ pins, bus, cfg_bus = self.pins, self.bus, self.cfg_bus
+
+ # Calculate SPI flash address
+ spi_bus_adr = Signal(30)
+ # wb address is in words, offset is in bytes
+ comb += spi_bus_adr.eq(bus.adr - (self.adr_offset >> 2))
+
+ # urrr.... byte-reverse the config bus and data bus read/write
+ cdat_w = Signal.like(cfg_bus.dat_w)
+ cdat_r = Signal.like(cfg_bus.dat_r)
+ dat_w = Signal.like(bus.dat_w)
+ dat_r = Signal.like(bus.dat_r)
+ comb += cdat_w.eq(byte_reverse(m, "rv_cdat_w", cfg_bus.dat_w, 4))
+ comb += cfg_bus.dat_r.eq(byte_reverse(m, "rv_cdat_r", cdat_r, 4))
+ comb += dat_w.eq(byte_reverse(m, "rv_dat_w", bus.dat_w, 4))
+ comb += bus.dat_r.eq(byte_reverse(m, "rv_dat_r", dat_r, 4))
+
+ # create definition of external verilog Tercel code here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ idx, bus = self.idx, self.bus
+ tercel = Instance("tercel_core",
+ # System parameters
+ i_sys_clk_freq = self.clk_freq,
+
+ # Clock/reset (use DomainRenamer if needed)
+ i_peripheral_clock=ClockSignal(),
+ i_peripheral_reset=ResetSignal(),
+
+ # SPI region Wishbone bus signals
+ i_wishbone_adr=spi_bus_adr,
+ i_wishbone_dat_w=dat_w,
+ i_wishbone_sel=bus.sel,
+ o_wishbone_dat_r=dat_r,
+ i_wishbone_we=bus.we,
+ i_wishbone_stb=bus.stb,
+ i_wishbone_cyc=bus.cyc,
+ o_wishbone_ack=bus.ack,
+ #o_wishbone_err=bus.err,
+
+ # Configuration region Wishbone bus signals
+ i_cfg_wishbone_adr=cfg_bus.adr,
+ i_cfg_wishbone_dat_w=cdat_w,
+ i_cfg_wishbone_sel=cfg_bus.sel,
+ o_cfg_wishbone_dat_r=cdat_r,
+ i_cfg_wishbone_we=cfg_bus.we,
+ i_cfg_wishbone_stb=cfg_bus.stb,
+ i_cfg_wishbone_cyc=cfg_bus.cyc,
+ o_cfg_wishbone_ack=cfg_bus.ack,
+ #o_cfg_wishbone_err=cfg_bus.err,
+
+ # QSPI signals
+ o_spi_d_out=self.dq_out,
+ o_spi_d_direction=self.dq_direction,
+ i_spi_d_in=self.dq_in,
+ o_spi_ss_n=self.cs_n_out,
+ o_spi_clock=self.spi_clk,
+
+ # debug port
+ o_debug_port=self.dbg_port
+ );
+
+ m.submodules['tercel_%d' % self.idx] = tercel
+
+ if pins is not None:
+ for i in range(4):
+ pad = getattr(pins, "dq%d" % i)
+ comb += pad.o.eq(self.dq_out[i])
+ comb += pad.oe.eq(self.dq_direction[i])
+ comb += self.dq_in[i].eq(pad.i)
+ # ECP5 needs special handling for the SPI clock, sigh.
+ if self.lattice_ecp5_usrmclk:
+ comb += pad.o_clk.eq(ClockSignal())
+ comb += pad.i_clk.eq(ClockSignal())
+ # XXX invert handled by SPIFlashResource
+ comb += pins.cs_n.eq(self.cs_n_out)
+ # ECP5 needs special handling for the SPI clock, sigh.
+ if self.lattice_ecp5_usrmclk:
+ m.submodules += Instance("USRMCLK",
+ i_USRMCLKI = self.spi_clk,
+ i_USRMCLKTS = 0
+ )
+ else:
+ comb += pins.clk.eq(self.spi_clk)
+
+ return m
+
+ def ports(self):
+ return [self.bus.cyc, self.bus.stb, self.bus.ack,
+ self.bus.dat_r, self.bus.dat_w, self.bus.adr,
+ self.bus.we, self.bus.sel,
+ self.cfg_bus.cyc, self.cfg_bus.stb,
+ self.cfg_bus.ack,
+ self.cfg_bus.dat_r, self.cfg_bus.dat_w,
+ self.cfg_bus.adr,
+ self.cfg_bus.we, self.cfg_bus.sel,
+ self.dq_out, self.dq_direction, self.dq_in,
+ self.cs_n_out, self.spi_clk
+ ]
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ tercel = Tercel(name="spi_0", data_width=32, clk_freq=100e6)
+ create_ilang(tercel, tercel.ports(), "spi_0")
+
"""
-def wb_write(bus, addr, data, sel=True):
+def wb_write(bus, addr, data, sel=0b1111):
# write wb
yield bus.we.eq(1)
yield bus.cyc.eq(1)
yield bus.stb.eq(1)
- yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ yield bus.sel.eq(sel)
yield bus.adr.eq(addr)
yield bus.dat_w.eq(data)
yield bus.dat_w.eq(0)
-def wb_read(bus, addr, sel=True):
+def wb_read(bus, addr, sel=0b1111):
# read wb
yield bus.cyc.eq(1)
yield bus.stb.eq(1)
yield bus.we.eq(0)
- yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+ yield bus.sel.eq(sel)
yield bus.adr.eq(addr)
# wait for ack to go high
--- /dev/null
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+import tempfile
+
+__all__ = ["UART16550"]
+
+
+class UART16550(Elaboratable):
+ """16550 UART from opencores, nmigen wrapper. remember to call
+ UART16550.add_verilog_source
+ """
+
+ def __init__(self, bus=None, features=None, name=None, data_width=32,
+ pins=None, irq=None):
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "uart_0"
+ self.data_width = data_width
+
+ # set up the wishbone bus
+ if features is None:
+ features = frozenset()
+ if bus is None:
+ bus = Interface(addr_width=5,
+ data_width=data_width,
+ features=features,
+ granularity=8,
+ name=name+"_wb_%d" % self.idx)
+ self.bus = bus
+ assert len(self.bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ # IRQ for data buffer receive/xmit
+ if irq is None:
+ irq = Signal()
+ self.irq = irq
+
+ # 9-pin UART signals (if anyone still remembers those...)
+ self.tx_o = Signal() # transmit
+ self.rx_i = Signal() # receive
+ self.rts_o = Signal() # ready to send
+ self.cts_i = Signal() # clear to send
+ self.dtr_o = Signal() # data terminal ready
+ self.dsr_i = Signal() # data send ready
+ self.ri_i = Signal() # can't even remember what this is!
+ self.dcd_i = Signal() # or this!
+
+ # pins resource
+ self.pins = pins
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # create a temp file containing "`define DATA_BUS_WIDTH_8"
+ t = tempfile.NamedTemporaryFile(delete=False, suffix=".v")
+ t.write("`define DATA_BUS_WIDTH_8\n".encode())
+ t.flush()
+ t.seek(0)
+ platform.add_file(t.name, t)
+
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['raminfr.v', 'uart_defines.v', 'uart_rfifo.v',
+ 'uart_top.v', 'timescale.v', 'uart_receiver.v',
+ 'uart_sync_flops.v', 'uart_transmitter.v',
+ 'uart_debug_if.v', 'uart_regs.v',
+ 'uart_tfifo.v', 'uart_wb.v'
+ ]:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+
+ # create definition of external verilog 16550 uart here, so that # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ idx, bus = self.idx, self.bus
+ uart = Instance("uart_top",
+ # clock/reset (use DomainRenamer if needed)
+ i_wb_clk_i=ClockSignal(),
+ i_wb_rst_i=ResetSignal(),
+ # wishbone bus signals
+ i_wb_adr_i=bus.adr,
+ i_wb_dat_i=bus.dat_w,
+ i_wb_sel_i=bus.sel,
+ o_wb_dat_o=bus.dat_r,
+ i_wb_we_i=bus.we,
+ i_wb_stb_i=bus.stb,
+ i_wb_cyc_i=bus.cyc,
+ o_wb_ack_o=bus.ack,
+ # interrupt line
+ o_int_o=self.irq,
+ # 9-pin RS232/UART signals
+ o_stx_pad_o=self.tx_o,
+ i_srx_pad_i=self.rx_i,
+ o_rts_pad_o=self.rts_o,
+ i_cts_pad_i=self.cts_i,
+ o_dtr_pad_o=self.dtr_o,
+ i_dsr_pad_i=self.dsr_i,
+ i_ri_pad_i=self.ri_i,
+ i_dcd_pad_i=self.dcd_i
+ );
+
+ m.submodules['uart16550_%d' % self.idx] = uart
+
+ if self.pins is not None:
+ comb += self.pins.tx.eq(self.tx_o)
+ comb += self.rx_i.eq(self.pins.rx)
+
+ return m
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ uart = UART16550(name="uart_0", data_width=8)
+ create_ilang(uart, [uart.bus.cyc, uart.bus.stb, uart.bus.ack,
+ uart.bus.dat_r, uart.bus.dat_w, uart.bus.adr,
+ uart.bus.we, uart.bus.sel,
+ uart.irq,
+ uart.tx_o, uart.rx_i, uart.rts_o, uart.cts_i,
+ uart.dtr_o, uart.dsr_i, uart.ri_i, uart.dcd_i
+ ], "uart_0")
+
--- /dev/null
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+#
+# Based partly on code from LibreSoC
+#
+# Modifications for the Libre-SOC Project funded by NLnet and NGI POINTER
+# under EU Grants 871528 and 957073, under the LGPLv3+ License
+#
+# this is a wrapper around the Verilog Wishbone Components wb_async_reg module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+ ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["WBAsyncBridge"]
+
+
+class WBAsyncBridge(Elaboratable):
+ """Verilog Wishbone Components wb_async_reg module, nmigen wrapper.
+ remember to call WBAsyncBridge.add_verilog_source
+ """
+
+ def __init__(self, master_bus=None, slave_bus=None, master_features=None,
+ slave_features=None, name=None,
+ address_width=30, data_width=32, granularity=8,
+ master_clock_domain=None, slave_clock_domain=None):
+ if name is not None:
+ # convention: give the name in the format "name_number"
+ self.idx = int(name.split("_")[-1])
+ else:
+ self.idx = 0
+ name = "wbasyncbridge_0"
+ self.address_width = address_width
+ self.data_width = data_width
+ self.granularity = granularity
+ self.dsize = log2_int(self.data_width//self.granularity)
+
+ # set up the clock domains
+ if master_clock_domain is None:
+ self.wb_mclk = ClockSignal()
+ self.wb_mrst = ResetSignal()
+ else:
+ self.wb_mclk = ClockSignal(master_clock_domain)
+ self.wb_mrst = ResetSignal(master_clock_domain)
+ if slave_clock_domain is None:
+ self.wb_sclk = ClockSignal()
+ self.wb_srst = ResetSignal()
+ else:
+ self.wb_sclk = ClockSignal(slave_clock_domain)
+ self.wb_srst = ResetSignal(slave_clock_domain)
+
+ # set up the wishbone busses
+ if master_features is None:
+ master_features = frozenset()
+ if slave_features is None:
+ slave_features = frozenset()
+ if master_bus is None:
+ master_bus = Interface(addr_width=self.address_width,
+ data_width=self.data_width,
+ features=master_features,
+ granularity=self.granularity,
+ name=name+"_wb_%d_master" % self.idx)
+ if slave_bus is None:
+ slave_bus = Interface(addr_width=self.address_width,
+ data_width=self.data_width,
+ features=slave_features,
+ granularity=self.granularity,
+ name=name+"_wb_%d_slave" % self.idx)
+ self.master_bus = master_bus
+ assert len(self.master_bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+ self.slave_bus = slave_bus
+ assert len(self.slave_bus.dat_r) == data_width, \
+ "bus width must be %d" % data_width
+
+ @classmethod
+ def add_verilog_source(cls, verilog_src_dir, platform):
+ # add each of the verilog sources, needed for when doing platform.build
+ for fname in ['wb_async_reg.v']:
+ # prepend the src directory to each filename, add its contents
+ fullname = os.path.join(verilog_src_dir, fname)
+ with open(fullname) as f:
+ platform.add_file(fullname, f)
+
+ def elaborate(self, platform):
+ m = Module()
+ comb = m.d.comb
+ master_bus, slave_bus = self.master_bus, self.slave_bus
+ slave_err = Signal()
+ slave_rty = Signal()
+
+ # create definition of external verilog bridge code here, so that
+ # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+ idx = self.idx
+ wb_async_bridge = Instance("wb_async_reg",
+ # Parameters
+ p_ADDR_WIDTH=self.address_width,
+ p_DATA_WIDTH=self.data_width,
+ # width of select is the data width
+ # *divided* by the data granularity.
+ # data_width=32-bit, data granularity=8-bit,
+ # select_width ==> 32/8 ==> 4
+ p_SELECT_WIDTH=self.data_width//self.granularity,
+
+ # Clocks/resets
+ i_wbm_clk=self.wb_mclk,
+ i_wbm_rst=self.wb_mrst,
+ i_wbs_clk=self.wb_sclk,
+ i_wbs_rst=self.wb_srst,
+
+ # Master Wishbone bus signals
+ i_wbm_adr_i=self.master_bus.adr,
+ i_wbm_dat_i=self.master_bus.dat_w,
+ o_wbm_dat_o=self.master_bus.dat_r,
+ i_wbm_we_i=self.master_bus.we,
+ i_wbm_sel_i=self.master_bus.sel,
+ i_wbm_stb_i=self.master_bus.stb,
+ i_wbm_cyc_i=self.master_bus.cyc,
+ o_wbm_ack_o=self.master_bus.ack,
+ #o_wbm_err=self.master_bus.err,
+ #o_wbm_rty_i=self.master_bus.rty,
+
+ # Slave Wishbone bus signals
+ o_wbs_adr_o=self.slave_bus.adr,
+ i_wbs_dat_i=self.slave_bus.dat_r,
+ o_wbs_dat_o=self.slave_bus.dat_w,
+ o_wbs_we_o=self.slave_bus.we,
+ o_wbs_sel_o=self.slave_bus.sel,
+ o_wbs_stb_o=self.slave_bus.stb,
+ o_wbs_cyc_o=self.slave_bus.cyc,
+ i_wbs_ack_i=self.slave_bus.ack,
+ i_wbs_err_i=slave_err,
+ i_wbs_rty_i=slave_rty
+ );
+
+ # Wire unused signals to 0
+ comb += slave_err.eq(0)
+ comb += slave_rty.eq(0)
+
+ m.submodules['wb_async_bridge_%d' % self.idx] = wb_async_bridge
+
+ return m
+
+ def ports(self):
+ return [self.master_bus.adr, self.master_bus.dat_w,
+ self.master_bus.dat_r,
+ self.master_bus.we, self.master_bus.sel,
+ self.master_bus.stb,
+ self.master_bus.cyc, self.master_bus.ack,
+ self.master_bus.err,
+ self.master_bus.rty,
+ self.slave_bus.adr, self.slave_bus.dat_w,
+ self.slave_bus.dat_r,
+ self.slave_bus.we, self.slave_bus.sel,
+ self.slave_bus.stb,
+ self.slave_bus.cyc, self.slave_bus.ack,
+ self.slave_bus.err,
+ self.slave_bus.rty
+ ]
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+ vl = verilog.convert(dut, name=test_name, ports=ports)
+ with open("%s.v" % test_name, "w") as f:
+ f.write(vl)
+
+
+if __name__ == "__main__":
+ wbasyncbridge = WBAsyncBridge(name="wbasyncbridge_0", address_width=30, data_width=32, granularity=8)
+ create_ilang(wbasyncbridge, wbasyncbridge.ports(), "wbasyncbridge_0")
shift_reg = Signal(dw_from)
counter = Signal(log2_int(ratio, False))
- counter_reset = Signal()
- counter_ce = Signal()
- with m.If(counter_reset):
- sync += counter.eq(0)
- with m.Elif(counter_ce):
- sync += counter.eq(counter + 1)
+ cur_counter = Signal(log2_int(ratio, False))
counter_done = Signal()
comb += counter_done.eq(counter == ratio-1)
+ comb += cur_counter.eq(counter)
+ skip = Signal()
# Main FSM
with m.FSM() as fsm:
with m.State("IDLE"):
- comb += counter_reset.eq(1)
+ sync += counter.eq(0)
sync += cached_data.eq(0)
with m.If(master.stb & master.cyc):
with m.If(master.we):
with m.State("WRITE"):
comb += write.eq(1)
- comb += slave.we.eq(1)
- comb += slave.cyc.eq(1)
with m.If(master.stb & master.cyc):
+ comb += skip.eq(slave.sel == 0)
+ comb += slave.we.eq(1)
+ comb += slave.cyc.eq(1)
comb += slave.stb.eq(1)
- with m.If(slave.ack):
- comb += counter_ce.eq(1)
+ with m.If(slave.ack | skip):
+ sync += counter.eq(counter + 1)
with m.If(counter_done):
comb += master.ack.eq(1)
m.next = "IDLE"
with m.State("READ"):
comb += read.eq(1)
- comb += slave.cyc.eq(1)
with m.If(master.stb & master.cyc):
+ comb += skip.eq(slave.sel == 0)
+ comb += slave.cyc.eq(1)
comb += slave.stb.eq(1)
- with m.If(slave.ack):
- comb += counter_ce.eq(1)
+ with m.If(slave.ack | skip):
+ comb += cur_counter.eq(counter + 1) # TODO use Picker
+ sync += counter.eq(cur_counter)
with m.If(counter_done):
comb += master.ack.eq(1)
comb += master.dat_r.eq(shift_reg)
comb += slave.cti.eq(7) # indicate end of burst
with m.Else():
comb += slave.cti.eq(2)
- comb += slave.adr.eq(Cat(counter, master.adr))
+ comb += slave.adr.eq(Cat(cur_counter, master.adr))
# write Datapath - select fragments of data, depending on "counter"
with m.Switch(counter):
# read Datapath - uses cached_data and master.dat_r as a shift-register.
# by the time "counter" is done (counter_done) this is complete
comb += shift_reg.eq(Cat(cached_data[dw_to:], slave.dat_r))
- with m.If(read & counter_ce):
+ with m.If(read & (slave.ack | skip)):
sync += cached_data.eq(shift_reg)
'bare_wb': BareFetchUnit,
#'test_cache_wb': TestCacheFetchUnit
}
+ self.pspec = pspec
+ if self.pspec.imem_ifacetype in ['mmu_cache_wb', 'test_mmu_cache_wb']:
+ # XXX BLECH! use pspec to transfer the I-Cache which is
+ # created down inside LoadStore1!
+ self.fu = icache = pspec.icache # ICache already FetchUnitInterface
+ # tell I-Cache to connect up to its FetchUnitInterface
+ icache.use_fetch_interface()
+ return
+
fukls = fudict[pspec.imem_ifacetype]
self.fu = fukls(pspec)
+ def wb_bus(self):
+ return self.fu.ibus
+
# path is relative to this filename, in the pinmux submodule
pinmux = os.getenv("PINMUX", "%s/../../../pinmux" % pth)
- fname = "%s/%s/litex_pinpads.json" % (pinmux, chipname)
+ fname = "%s/%s/fabric_pinpads.json" % (pinmux, chipname)
with open(fname) as f:
txt = f.read()
sys.setrecursionlimit(10**6)
-def read_from_addr(dut, addr):
+def read_from_addr(dut, addr, stall=True):
yield dut.a_pc_i.eq(addr)
yield dut.a_i_valid.eq(1)
yield dut.f_i_valid.eq(1)
- yield dut.a_stall_i.eq(1)
- yield
- yield dut.a_stall_i.eq(0)
+ if stall:
+ yield dut.a_stall_i.eq(1)
+ yield
+ yield dut.a_stall_i.eq(0)
yield
yield Settle()
while (yield dut.f_busy_o):
cnt = 0
while True:
addr_ok = yield port.addr_ok_o
- print("addrok", addr_ok,cnt,debug)
- if addr_ok:
+ exc_happened = yield port.exc_o.happened
+ print("addrok", addr_ok,cnt,debug,exc_happened)
+ if addr_ok or exc_happened:
break
yield
cnt += 1
yield
-def pi_st(port1, addr, data, datalen, msr_pr=0, is_dcbz=0):
+def pi_st(port1, addr, data, datalen, msr, is_dcbz=0):
# have to wait until not busy
yield from wait_busy(port1,debug="pi_st_A") # wait while busy
yield port1.is_dcbz_i.eq(is_dcbz) # reset dcbz too
yield port1.is_st_i.eq(1) # indicate ST
yield port1.data_len.eq(datalen) # ST length (1/2/4/8)
- yield port1.msr_pr.eq(msr_pr) # MSR PR bit (1==>virt, 0==>real)
+ yield port1.priv_mode.eq(~msr.pr)
+ yield port1.virt_mode.eq(msr.dr)
+ yield port1.mode_32bit.eq(~msr.sf)
yield port1.addr.data.eq(addr) # set address
yield port1.addr.ok.eq(1) # set ok
yield Settle()
+
+ # must check exception even before waiting for address.
+ # XXX TODO: wait_addr should check for exception
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ print("print fast ST exception happened")
+ yield # MUST wait for one clock cycle before de-asserting these
+ yield port1.is_st_i.eq(0) # end
+ yield port1.addr.ok.eq(0) # set !ok
+ yield port1.is_dcbz_i.eq(0) # reset dcbz too
+ return "fast", exc_info
+
yield from wait_addr(port1) # wait until addr ok
+ exc_info = yield from get_exception_info(port1.exc_o)
+ exc_happened = exc_info.happened
+ if exc_happened:
+ print("print fast ST exception happened")
+ yield # MUST wait for one clock cycle before de-asserting these
+ yield port1.is_st_i.eq(0) # end
+ yield port1.addr.ok.eq(0) # set !ok
+ yield port1.is_dcbz_i.eq(0) # reset dcbz too
+ return "fast", exc_info
+
+
# yield # not needed, just for checking
# yield # not needed, just for checking
# assert "ST" for one cycle (required by the API)
yield
yield port1.st.ok.eq(0)
exc_info = yield from get_exception_info(port1.exc_o)
- dar_o = yield port1.dar_o
exc_happened = exc_info.happened
if exc_happened:
print("print fast ST exception happened")
yield port1.is_st_i.eq(0) # end
yield port1.addr.ok.eq(0) # set !ok
yield port1.is_dcbz_i.eq(0) # reset dcbz too
- return "fast", exc_info, dar_o
+ return "fast", exc_info
yield from wait_busy(port1,debug="pi_st_E") # wait while busy
exc_info = yield from get_exception_info(port1.exc_o)
- dar_o = yield port1.dar_o
exc_happened = exc_info.happened
if exc_happened:
yield # needed if mmu/dache is used
yield port1.addr.ok.eq(0) # set !ok
yield port1.is_dcbz_i.eq(0) # reset dcbz too
yield # needed if mmu/dache is used
- return "slow", exc_info, dar_o
+ return "slow", exc_info
# can go straight to reset.
yield port1.is_st_i.eq(0) # end
yield port1.is_dcbz_i.eq(0) # reset dcbz too
yield # needed if mmu/dache is used
- return None, None, None
+ return None, None
def get_exception_info(exc_o):
attrs = []
# copy of pi_st removed
-def pi_ld(port1, addr, datalen, msr_pr=0):
+def pi_ld(port1, addr, datalen, msr):
# have to wait until not busy
yield from wait_busy(port1,debug="pi_ld_A") # wait while busy
# set up a LD on the port. address first:
yield port1.is_ld_i.eq(1) # indicate LD
yield port1.data_len.eq(datalen) # LD length (1/2/4/8)
- yield port1.msr_pr.eq(msr_pr) # MSR PR bit (1==>virt, 0==>real)
+ yield port1.priv_mode.eq(~msr.pr)
+ yield port1.virt_mode.eq(msr.dr)
+ yield port1.mode_32bit.eq(~msr.sf)
yield port1.addr.data.eq(addr) # set address
yield port1.addr.ok.eq(1) # set ok
yield Settle()
yield from wait_addr(port1) # wait until addr ok
exc_info = yield from get_exception_info(port1.exc_o)
- dar_o = yield port1.dar_o
exc_happened = exc_info.happened
if exc_happened:
print("print fast LD exception happened")
yield # MUST wait for one clock cycle before de-asserting these
yield port1.is_ld_i.eq(0) # end
yield port1.addr.ok.eq(0) # set !ok
- return None, "fast", exc_info, dar_o
+ return None, "fast", exc_info
yield
yield from wait_ldok(port1) # wait until ld ok
data = yield port1.ld.data
exc_info = yield from get_exception_info(port1.exc_o)
- dar_o = yield port1.dar_o
exc_happened = yield port1.exc_o.happened
exc_happened = exc_info.happened
yield port1.is_ld_i.eq(0) # end
yield port1.addr.ok.eq(0) # set !ok
if exc_happened:
- return None, "slow", exc_info, dar_o
+ return None, "slow", exc_info
yield from wait_busy(port1, debug="pi_ld_E") # wait while busy
exc_info = yield from get_exception_info(port1.exc_o)
- dar_o = yield port1.dar_o
exc_happened = exc_info.happened
if exc_happened:
- return None, "slow", exc_info, dar_o
+ return None, "slow", exc_info
- return data, None, None, None
+ return data, None, None
-def pi_ldst(arg, dut, msr_pr=0):
+def pi_ldst(arg, dut, msr):
# do two half-word stores at consecutive addresses, then two loads
addr1 = 0x04
data = 0xbeef
data2 = 0xf00f
#data = 0x4
- assert(yield from pi_st(dut, addr1, data, 2, msr_pr) is None)
- assert(yield from pi_st(dut, addr2, data2, 2, msr_pr) is None)
- result, exc = yield from pi_ld(dut, addr1, 2, msr_pr)
- result2, exc2 = yield from pi_ld(dut, addr2, 2, msr_pr)
+ assert(yield from pi_st(dut, addr1, data, 2, msr) is None)
+ assert(yield from pi_st(dut, addr2, data2, 2, msr) is None)
+ result, exc = yield from pi_ld(dut, addr1, 2, msr)
+ result2, exc2 = yield from pi_ld(dut, addr2, 2, msr)
assert(exc is None)
assert(exc2 is None)
arg.assertEqual(data, result, "data %x != %x" % (result, data))
# now load both in a 32-bit load to make sure they're really consecutive
data3 = data | (data2 << 16)
- result3, exc3 = yield from pi_ld(dut, addr1, 4, msr_pr)
+ result3, exc3 = yield from pi_ld(dut, addr1, 4, msr)
assert(exc3 is None)
arg.assertEqual(data3, result3, "data3 %x != %x" % (result3, data3))
dut = Module()
pspec = TestMemPspec(ldst_ifacetype=ifacetype,
imem_ifacetype='',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
cmpi = ConfigMemoryPortInterface(pspec)
vcd_name='test_pi_%s.vcd' % ifacetype)
+# FIXME: TypeError: pi_ldst() missing 1 required positional argument: 'msr'
+@unittest.skip('broken')
class TestPIMem(unittest.TestCase):
-
def test_pi_mem(self):
tst_config_pi(self, 'testpi')
--- /dev/null
+ls180_pins.py
from nmigen.utils import log2_int
from nmigen.cli import rtlil
from soc.config.state import CoreState
+from openpower.consts import FastRegsEnum
# DMI register addresses
class DBGCore:
- CTRL = 0b0000
- STAT = 0b0001
+ CTRL = 0b0000 # Control: start/stop/reset
+ STAT = 0b0001 # Status (read started/stopped/stopping)
NIA = 0b0010 # NIA register (read only for now)
MSR = 0b0011 # MSR (read only)
GSPR_IDX = 0b0100 # GSPR register index
CR = 0b1000 # CR (read only)
XER = 0b1001 # XER (read only) - note this is a TEMPORARY hack
SVSTATE = 0b1010 # SVSTATE register (read only for now)
+ STOPADDR = 0b1011 # Address at which the core automatically stops
# CTRL register (direct actions, write 1 to act, read back 0)
self.core_stopped_i = Signal()
self.state = CoreState("core_dbg")
- # GSPR register read port
- self.d_gpr = DbgReg("d_gpr")
-
- # CR register read port
- self.d_cr = DbgReg("d_cr")
-
- # XER register read port
- self.d_xer = DbgReg("d_xer")
+ self.d_gpr = DbgReg("d_gpr") # GSPR register read port
+ self.d_fast = DbgReg("d_fast") # GSPR register read port
+ self.d_cr = DbgReg("d_cr") # CR register read port
+ self.d_xer = DbgReg("d_xer") # XER register read port
# Core logging data
self.log_data_i = Signal(256)
self.log_read_data_o = Signal(64)
self.log_write_addr_o = Signal(32)
+ # address at which the processor stops automatically
+ # set to 0xffffffffffffffff by default (impossible to reach)
+ self.stop_addr_o = Signal(64, reset=-1)
+
# Misc
self.terminated_o = Signal()
m = Module()
comb, sync = m.d.comb, m.d.sync
dmi, d_gpr, d_cr, d_xer, = self.dmi, self.d_gpr, self.d_cr, self.d_xer
+ d_fast = self.d_fast
# DMI needs fixing... make a one clock pulse
dmi_req_i_1 = Signal()
do_icreset = Signal()
terminated = Signal()
do_gspr_rd = Signal()
+ # select either GPRs or FAST regs to read, based on GSPR_IDX
gspr_index = Signal.like(d_gpr.addr)
+ fast_index = Signal.like(d_gpr.addr)
+ gspr_en = Signal()
+ fast_en = Signal()
log_dmi_addr = Signal(32)
log_dmi_data = Signal(64)
LOG_INDEX_BITS = log2_int(self.LOG_LENGTH)
- # Single cycle register accesses on DMI except for GSPR data
+ # Single cycle register accesses on DMI except for registers
with m.Switch(dmi.addr_i):
with m.Case(DBGCore.GSPR_DATA):
- comb += dmi.ack_o.eq(d_gpr.ack)
- comb += d_gpr.req.eq(dmi.req_i)
+ with m.If(gspr_en): # GPR requested, acknowledge GPR
+ comb += dmi.ack_o.eq(d_gpr.ack)
+ comb += d_gpr.req.eq(dmi.req_i)
+ with m.If(fast_en): # FAST requested
+ comb += dmi.ack_o.eq(d_fast.ack)
+ comb += d_fast.req.eq(dmi.req_i)
with m.Case(DBGCore.CR):
comb += dmi.ack_o.eq(d_cr.ack)
comb += d_cr.req.eq(dmi.req_i)
comb += dmi.ack_o.eq(d_xer.ack)
comb += d_xer.req.eq(dmi.req_i)
with m.Default():
+ # everything else is immediate-acknowledgement (combinatorial)
comb += dmi.ack_o.eq(dmi.req_i)
# Status register read composition (DBUG_CORE_STAT_xxx)
# DMI read data mux
with m.Switch(dmi.addr_i):
- with m.Case( DBGCore.STAT):
+ with m.Case( DBGCore.STAT): # Status register
comb += dmi.dout.eq(stat_reg)
- with m.Case( DBGCore.NIA):
+ with m.Case( DBGCore.NIA): # NIA (PC)
comb += dmi.dout.eq(self.state.pc)
- with m.Case( DBGCore.MSR):
+ with m.Case( DBGCore.MSR): # MSR
comb += dmi.dout.eq(self.state.msr)
- with m.Case( DBGCore.SVSTATE):
+ with m.Case( DBGCore.SVSTATE): # SVSTATE
comb += dmi.dout.eq(self.state.svstate)
- with m.Case( DBGCore.GSPR_DATA):
- comb += dmi.dout.eq(d_gpr.data)
- with m.Case( DBGCore.LOG_ADDR):
+ with m.Case( DBGCore.GSPR_DATA): # GPR/FAST regs
+ with m.If(gspr_en):
+ comb += dmi.dout.eq(d_gpr.data) # GPR data selected
+ with m.If(fast_en):
+ comb += dmi.dout.eq(d_fast.data) # FAST reg read selected
+ with m.Case( DBGCore.LOG_ADDR): # Logging
comb += dmi.dout.eq(Cat(log_dmi_addr, self.log_write_addr_o))
with m.Case( DBGCore.LOG_DATA):
comb += dmi.dout.eq(log_dmi_data)
- with m.Case(DBGCore.CR):
+ with m.Case(DBGCore.CR): # CR
comb += dmi.dout.eq(d_cr.data)
- with m.Case(DBGCore.XER):
+ with m.Case(DBGCore.XER): # XER
comb += dmi.dout.eq(d_xer.data)
+ with m.Case(DBGCore.STOPADDR): # Halt PC
+ comb += dmi.dout.eq(self.stop_addr_o)
# DMI writes
# Reset the 1-cycle "do" signals
# GSPR address
with m.Elif(dmi.addr_i == DBGCore.GSPR_IDX):
- sync += gspr_index.eq(dmi.din)
+ sync += gspr_index.eq(0)
+ sync += fast_index.eq(0)
+ sync += gspr_en.eq(0)
+ sync += fast_en.eq(0)
+ with m.If(dmi.din <= 31):
+ sync += gspr_index.eq(dmi.din)
+ sync += gspr_en.eq(1)
+ # cover the FastRegs LR, CTR, SRR0, SRR1 etc.
+ # numbering is from microwatt
+ for x, i in FastRegsEnum.__dict__.items():
+ if not isinstance(i, int) or x == 'N_REGS':
+ continue
+ with m.If(dmi.din == 32+i):
+ sync += fast_index.eq(i)
+ sync += fast_en.eq(1)
# Log address
with m.Elif(dmi.addr_i == DBGCore.LOG_ADDR):
sync += log_dmi_addr.eq(dmi.din)
sync += do_dmi_log_rd.eq(1)
+
+ # set PC Halt address
+ with m.Elif(dmi.addr_i == DBGCore.STOPADDR):
+ sync += self.stop_addr_o.eq(dmi.din)
+
with m.Else():
# sync += Display("DMI read from " & to_string(dmi_addr))
pass
sync += terminated.eq(1)
comb += d_gpr.addr.eq(gspr_index)
+ comb += d_fast.addr.eq(fast_index)
# Core control signals generated by the debug module
- comb += self.core_stop_o.eq(stopping & ~do_step)
+ # Note: make stop and terminated synchronous, to help with timing
+ # however this *may* interfere with some of the DMI-based unit tests
+ # so has to be kept an eye on
+ sync += self.core_stop_o.eq((stopping & ~do_step) | self.terminate_i)
+ sync += self.terminated_o.eq(terminated | self.terminate_i)
comb += self.core_rst_o.eq(do_reset)
comb += self.icache_rst_o.eq(do_icreset)
- comb += self.terminated_o.eq(terminated)
# Logging RAM (none)
yield from self.d_gpr
yield from self.d_cr
yield from self.d_xer
+ yield from self.d_fast
yield self.log_data_i
yield self.log_read_addr_i
yield self.log_read_data_o
yield
yield dut.bus.tms.eq(0)
+def tms_data_getset(dut, tms, d_len, d_in=0, reverse=False):
+ if reverse:
+ # Reverse the for loop to transmit MSB-first
+ bit_range = range(d_len-1, -1, -1)
+ else:
+ bit_range = range(d_len)
-def tms_data_getset(dut, tms, d_len, d_in=0):
res = 0
yield dut.bus.tms.eq(tms)
- for i in range(d_len):
+ for i in bit_range:
tdi = 1 if (d_in & (1<<i)) else 0
yield dut.bus.tck.eq(1)
res |= (1<<i) if (yield dut.bus.tdo) else 0
yield from tms_state_set(dut, [1, 1, 0])
-def jtag_read_write_reg(dut, addr, d_len, d_in=0):
+def jtag_read_write_reg(dut, addr, d_len, d_in=0, reverse=False):
yield from jtag_set_run(dut)
yield from jtag_set_shift_ir(dut)
yield from tms_data_getset(dut, 0, dut._ir_width, addr)
yield from jtag_set_idle(dut)
yield from jtag_set_shift_dr(dut)
- result = yield from tms_data_getset(dut, 0, d_len, d_in)
+ result = yield from tms_data_getset(dut, 0, d_len, d_in, reverse)
yield from jtag_set_idle(dut)
return result
#####################
# input (and output) for logical initial stage (common input)
+
+
class ALUInputData(FUBaseData):
- regspec = [('INT', 'a', '0:63'), # RA
- ('INT', 'b', '0:63'), # RB/immediate
+ regspec = [('INT', 'a', '0:63'), # RA
+ ('INT', 'b', '0:63'), # RB/immediate
]
+
def __init__(self, pspec):
super().__init__(pspec, False)
class ALUOutputData(FUBaseData):
regspec = [('INT', 'o', '0:63'), # RT
]
+
def __init__(self, pspec):
super().__init__(pspec, True)
class ALUFunctionUnit(FunctionUnitBaseSingle):
-#class ALUFunctionUnit(FunctionUnitBaseMulti):
+ # class ALUFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.ALU
- def __init__(self, idx):
- super().__init__(ALUPipeSpec, ALU, 1)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(ALUPipeSpec, ALU, 1, parent_pspec)
class ALU(Elaboratable):
# TODO: replace with Memory at some point
-from nmigen import Elaboratable, Signal, Array, Module
+from nmigen import Elaboratable, Signal, Array, Module, Memory
from nmutil.util import Display
+
class CacheRam(Elaboratable):
def __init__(self, ROW_BITS=16, WIDTH = 64, TRACE=True, ADD_BUF=False,
ADD_BUF = self.ADD_BUF
SIZE = 2**ROW_BITS
- ram = Array(Signal(WIDTH) for i in range(SIZE))
+ # set up the Cache RAM Memory and create one read and one write port
+ # the read port is *not* transparent (does not pass write-thru-read)
#attribute ram_style of ram : signal is "block";
-
- rd_data0 = Signal(WIDTH)
-
+ ram = Memory(depth=SIZE, width=WIDTH,
+ attrs={'syn_ramstyle': "block_ram"})
+ m.submodules.rdport = rdport = ram.read_port(transparent=False)
+ m.submodules.wrport = wrport = ram.write_port(granularity=8)
+
with m.If(TRACE):
with m.If(self.wr_sel.bool()):
sync += Display( "write ramno %d a: %%x "
"sel: %%x dat: %%x" % self.ram_num,
self.wr_addr,
self.wr_sel, self.wr_data)
- for i in range(WIDTH//8):
- lbit = i * 8;
- mbit = lbit + 8;
- with m.If(self.wr_sel[i]):
- sync += ram[self.wr_addr][lbit:mbit].eq(self.wr_data[lbit:mbit])
- with m.If(self.rd_en):
- sync += rd_data0.eq(ram[self.rd_addr])
- if TRACE:
+
+ # read data output and a latched copy. behaves like microwatt cacheram
+ rd_data0 = Signal(WIDTH)
+ rd_data0l = Signal(WIDTH)
+
+ # delay on read address/en
+ rd_delay = Signal()
+ rd_delay_addr = Signal.like(self.rd_addr)
+ sync += rd_delay_addr.eq(self.rd_addr)
+ sync += rd_delay.eq(self.rd_en)
+
+ # write port
+ comb += wrport.addr.eq(self.wr_addr)
+ comb += wrport.en.eq(self.wr_sel)
+ comb += wrport.data.eq(self.wr_data)
+
+ # read port (include a latch on the output, for microwatt compatibility)
+ comb += rdport.addr.eq(self.rd_addr)
+ comb += rdport.en.eq(self.rd_en)
+ with m.If(rd_delay):
+ comb += rd_data0.eq(rdport.data)
+ sync += rd_data0l.eq(rd_data0) # preserve latched data
+ with m.Else():
+ comb += rd_data0.eq(rd_data0l) # output latched (last-read)
+
+ if TRACE:
+ with m.If(rd_delay):
sync += Display("read ramno %d a: %%x dat: %%x" % self.ram_num,
- self.rd_addr, ram[self.rd_addr])
+ rd_delay_addr, rd_data0)
pass
-
+ # extra delay requested?
if ADD_BUF:
sync += self.rd_data_o.eq(rd_data0)
else:
rw_domain = m.d.sync
else:
rw_domain = m.d.comb
+ # generate a pulse on system reset, to reset any latches, if needed
+ system_reset = Signal(reset=1)
+ m.d.sync += system_reset.eq(0)
+
# add the ALU to the MultiCompUnit only if it is a "real" ALU
# see AllFunctionUnits as to why: a FunctionUnitBaseMulti
# only has one "real" ALU but multiple pseudo front-ends,
# ALU only proceeds when all src are ready. rd_rel_o is delayed
# so combine it with go_rd_i. if all bits are set we're good
all_rd = Signal(reset_less=True)
- m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
+ m.d.comb += all_rd.eq(self.busy_o & # rok_l.q & # XXX LOOP
(((~self.rd.rel_o) | self.rd.go_i).all()))
# generate read-done pulse
all_rd_pulse = Signal(reset_less=True)
- m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd))
+ m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd)) # XXX LOOP
# create rising pulse from alu valid condition.
alu_done = self.cu.alu_done_o
m.d.comb += reset.eq(req_done | self.go_die_i)
m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
m.d.comb += reset_w.eq(self.wr.go_i | Repl(self.go_die_i, self.n_dst))
- m.d.comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
+ m.d.comb += reset_r.eq(self.rd.go_i | Repl(rst_r, self.n_src))
# read-done,wr-proceed latch
rw_domain += rok_l.s.eq(self.issue_i) # set up when issue starts
- rw_domain += rok_l.r.eq(self.alu.n.o_valid & self.busy_o) # ALU done
+ rw_domain += rok_l.r.eq(self.alu.n.o_valid & self.busy_o) # ALUdone LOOP
# wr-done, back-to-start latch
rw_domain += rst_l.s.eq(all_rd) # set when read-phase is fully done
# src operand latch (not using go_wr_i) ANDed with rdmask
rdmaskn = Signal(self.n_src)
latchregister(m, self.rdmaskn, rdmaskn, self.issue_i, name="rdmask_l")
- m.d.comb += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
+ m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
m.d.sync += src_l.r.eq(reset_r)
# dest operand latch (not using issue_i)
rw_domain += req_l.s.eq(alu_pulsem & self.wrmask)
- m.d.comb += req_l.r.eq(reset_w | prev_wr_go)
+ m.d.comb += req_l.r.eq(reset_w | prev_wr_go |
+ Repl(system_reset, self.n_dst))
# pass operation to the ALU (sync: plenty time to wait for src reads)
op = self.get_op()
m.submodules.alu_l = alu_l = SRLatch(False, name="alu")
m.d.comb += self.alu.n.i_ready.eq(alu_l.q)
m.d.sync += alu_l.r.eq(self.alu.n.o_valid & alu_l.q)
- m.d.comb += alu_l.s.eq(all_rd_pulse)
+ m.d.comb += alu_l.s.eq(all_rd_pulse) # XXX LOOP
# -----
# outputs
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl, C
from nmigen.hdl.rec import Record, Layout
from nmutil.latch import SRLatch, latchregister
TODO: use one module for the byte-reverse as it's quite expensive in gates
"""
- def __init__(self, pi=None, rwid=64, awid=48, opsubset=CompLDSTOpSubset,
+ def __init__(self, pi=None, rwid=64, awid=64, opsubset=CompLDSTOpSubset,
debugtest=False, name=None):
super().__init__(rwid)
self.awid = awid
# POWER-compliant LD/ST has index and update: *fixed* number of ports
self.n_src = n_src = 3 # RA, RB, RT/RS
- self.n_dst = n_dst = 2 # RA, RT/RS
+ self.n_dst = n_dst = 3 # RA, RT/RS, CR0
# set up array of src and dest signals
for i in range(n_src):
self.o_data = Data(self.data_wid, name="o") # Dest1 out: RT
self.addr_o = Data(self.data_wid, name="ea") # Addr out: Update => RA
+ self.cr_o = Data(4, name="cr0") # CR0 (for stdcx etc)
self.exc_o = cu.exc_o
self.done_o = cu.done_o
self.busy_o = cu.busy_o
#####################
# latches for the FSM.
- m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
+ m.submodules.opc_l = opc_l = SRLatch(sync=True, name="opc")
m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
m.submodules.alu_l = alu_l = SRLatch(sync=False, name="alu")
m.submodules.adr_l = adr_l = SRLatch(sync=False, name="adr")
m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
m.submodules.wri_l = wri_l = SRLatch(sync=False, name="wri")
m.submodules.upd_l = upd_l = SRLatch(sync=False, name="upd")
+ m.submodules.cr0_l = cr0_l = SRLatch(sync=False, name="cr0")
m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
m.submodules.lsd_l = lsd_l = SRLatch(sync=False, name="lsd") # done
op_is_st = Signal(reset_less=True)
op_is_dcbz = Signal(reset_less=True)
op_is_st_or_dcbz = Signal(reset_less=True)
+ op_is_atomic = Signal(reset_less=True)
# ALU/LD data output control
alu_valid = Signal(reset_less=True) # ALU operands are valid
rd_done = Signal(reset_less=True) # all *necessary* operands read
wr_reset = Signal(reset_less=True) # final reset condition
canceln = Signal(reset_less=True) # cancel (active low)
+ store_done = Signal(reset_less=True) # store has been actioned
# LD and ALU out
alu_o = Signal(self.data_wid, reset_less=True)
reset_o = Signal(reset_less=True) # reset opcode
reset_w = Signal(reset_less=True) # reset write
reset_u = Signal(reset_less=True) # reset update
+ reset_c = Signal(reset_less=True) # reset cr0
reset_a = Signal(reset_less=True) # reset adr latch
reset_i = Signal(reset_less=True) # issue|die (use a lot)
reset_r = Signal(self.n_src, reset_less=True) # reset src
comb += reset_o.eq(self.done_o | terminate) # opcode reset
comb += reset_w.eq(self.wr.go_i[0] | terminate) # write reg 1
comb += reset_u.eq(self.wr.go_i[1] | terminate) # update (reg 2)
+ comb += reset_c.eq(self.wr.go_i[2] | terminate) # cr0 (reg 3)
comb += reset_s.eq(self.go_st_i | terminate) # store reset
comb += reset_r.eq(self.rd.go_i | Repl(terminate, self.n_src))
comb += reset_a.eq(self.go_ad_i | terminate)
comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE) # ST
comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD) # LD
comb += op_is_dcbz.eq(oper_r.insn_type == MicrOp.OP_DCBZ) # DCBZ
+ comb += op_is_atomic.eq(oper_r.reserve) # atomic LR/SC
comb += op_is_st_or_dcbz.eq(op_is_st | op_is_dcbz)
# dcbz is special case of store
#uncomment if needed
# - alu_l : looks after add of src1/2/imm (EA)
# - adr_l : waits for add (EA)
# - upd_l : waits for adr and Regfile (port 2)
+ # - cr0_l : waits for Rc=1 and CR0 Regfile (port 3)
# - src_l[2] : ST
# - lod_l : waits for adr (EA) and for LD Data
# - wri_l : waits for LD Data and Regfile (port 1)
# opcode latch - inverted so that busy resets to 0
# note this MUST be sync so as to avoid a combinatorial loop
# between busy_o and issue_i on the reset latch (rst_l)
- sync += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book!
- sync += opc_l.r.eq(reset_o) # XXX NOTE: INVERTED FROM book!
+ comb += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book!
+ comb += opc_l.r.eq(reset_o) # XXX NOTE: INVERTED FROM book!
# src operand latch
sync += src_l.s.eq(Repl(issue_i, self.n_src) & ~self.rdmaskn)
#self.done_o | (self.pi.busy_o & op_is_update),
self.n_dst))
+ # CR0 operand latch (CR0 written to reg 3 if Rc=1)
+ op_is_rc1 = self.oper_i.rc.rc & self.oper_i.rc.ok
+ comb += cr0_l.s.eq(issue_i & op_is_rc1)
+ sync += cr0_l.r.eq(reset_c)
+
# update-mode operand latch (EA written to reg 2)
sync += upd_l.s.eq(reset_i)
sync += upd_l.r.eq(reset_u)
with m.If(self.done_o | terminate):
sync += oper_r.eq(0)
- # and for LD
+ # and for LD and store-done
ldd_r = Signal(self.data_wid, reset_less=True) # Dest register
latchregister(m, ldd_o, ldd_r, ld_ok, name="ldo_r")
+ # store actioned, communicate through CR0 (for atomic LR/SC)
+ latchregister(m, self.pi.store_done.data, store_done,
+ self.pi.store_done.ok,
+ name="std_r")
+
# and for each input from the incoming src operands
srl = []
for i in range(self.n_src):
comb += self.wr.rel_o[1].eq(upd_l.q & busy_o & op_is_update &
alu_valid & canceln)
+ # request write of CR0 result only in reserve and Rc=1
+ comb += self.wr.rel_o[2].eq(cr0_l.q & busy_o & op_is_atomic &
+ alu_valid & canceln)
+
# provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
comb += wr_any.eq(self.st.go_i | p_st_go |
- self.wr.go_i[0] | self.wr.go_i[1])
+ self.wr.go_i.bool())
comb += wr_reset.eq(rst_l.q & busy_o & canceln &
- ~(self.st.rel_o | self.wr.rel_o[0] |
- self.wr.rel_o[1]) &
+ ~(self.st.rel_o | self.wr.rel_o.bool()) &
(lod_l.qn | op_is_st_or_dcbz)
)
comb += self.done_o.eq(wr_reset & (~self.pi.busy_o | op_is_ld))
# put the LD-output register directly onto the output bus on a go_write
comb += self.o_data.data.eq(self.dest[0])
+ comb += self.o_data.ok.eq(self.wr.rel_o[0])
with m.If(self.wr.go_i[0]):
comb += self.dest[0].eq(ldd_r)
# "update" mode, put address out on 2nd go-write
comb += self.addr_o.data.eq(self.dest[1])
+ comb += self.addr_o.ok.eq(self.wr.rel_o[1])
with m.If(op_is_update & self.wr.go_i[1]):
comb += self.dest[1].eq(addr_r)
+ # fun-fun-fun, calculate CR0 when Rc=1 requested.
+ cr0 = self.dest[2]
+ comb += self.cr_o.data.eq(cr0)
+ comb += self.cr_o.ok.eq(self.wr.rel_o[2])
+ with m.If(cr0_l.q):
+ comb += cr0.eq(Cat(C(0, 1), store_done, C(0, 2)))
+
# need to look like MultiCompUnit: put wrmask out.
# XXX may need to make this enable only when write active
- comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update))
+ comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update, cr0_l.q))
###########################
# PortInterface connections
# connect to LD/ST PortInterface.
comb += pi.is_ld_i.eq(op_is_ld & busy_o) # decoded-LD
+ comb += pi.is_nc.eq(op_is_cix & busy_o) # cache-inhibited
comb += pi.is_st_i.eq(op_is_st_or_dcbz & busy_o) # decoded-ST
comb += pi.is_dcbz_i.eq(op_is_dcbz & busy_o) # decoded-DCBZ
+ comb += pi.reserve.eq(oper_r.reserve & busy_o) # atomic LR/SC
comb += pi.data_len.eq(oper_r.data_len) # data_len
# address: use sync to avoid long latency
sync += pi.addr.data.eq(addr_r) # EA from adder
sync += pi.addr.ok.eq(alu_ok & lsd_l.q) # "do address stuff" (once)
comb += self.exc_o.eq(pi.exc_o) # exception occurred
comb += addr_ok.eq(self.pi.addr_ok_o) # no exc, address fine
- # connect MSR.PR for priv/virt operation
- comb += pi.msr_pr.eq(oper_r.msr[MSR.PR])
- comb += Display("LDSTCompUnit: oper_r.msr %x pi.msr_pr=%x",
- oper_r.msr, oper_r.msr[MSR.PR])
+ # connect MSR.PR etc. for priv/virt operation
+ comb += pi.priv_mode.eq(~oper_r.msr[MSR.PR])
+ comb += pi.virt_mode.eq(oper_r.msr[MSR.DR])
+ comb += pi.mode_32bit.eq(~oper_r.msr[MSR.SF])
+ with m.If(self.issue_i): # display this only once
+ sync += Display("LDSTCompUnit: oper_r.msr %x pr=%x dr=%x sf=%x",
+ oper_r.msr,
+ oper_r.msr[MSR.PR],
+ oper_r.msr[MSR.DR],
+ oper_r.msr[MSR.SF])
# byte-reverse on LD
revnorev = Signal(64, reset_less=True)
comb += pi.st.data.eq(stdata_r)
with m.Else():
comb += pi.st.data.eq(op3)
+
# store - data goes in based on go_st
comb += pi.st.ok.eq(self.st.go_i) # go store signals st data valid
return self.o_data # LDSTOutputData.regspec o
if i == 1:
return self.addr_o # LDSTOutputData.regspec o1
+ if i == 2:
+ return self.cr_o # LDSTOutputData.regspec cr_a
# return self.dest[i]
def get_fu_out(self, i):
yield self.wr.rel_o
yield from self.o_data.ports()
yield from self.addr_o.ports()
+ yield from self.cr_o.ports()
yield self.load_mem_o
yield self.stwd_mem_o
units = {}
pspec = TestMemPspec(ldst_ifacetype='bare_wb',
imem_ifacetype='bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64,
units=units)
units = {}
pspec = TestMemPspec(ldst_ifacetype='bare_wb',
imem_ifacetype='bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64,
units=units)
+#!/usr/bin/env python3
+#
+# Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2020 Cole Poirier
+# Copyright (C) 2020,2021 Cesar Strauss
+# Copyright (C) 2021 Tobias Platen
+#
+# Original dcache.vhdl Copyright of its authors and licensed
+# by IBM under CC-BY 4.0
+# https://github.com/antonblanchard/microwatt
+#
+# Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
+# 871528 and 957073, under the LGPL-v3+ License
+
"""DCache
based on Anton Blanchard microwatt dcache.vhdl
* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
* https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+ (discussion about brams for ECP5)
"""
from enum import Enum, unique
-from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
+ Record, Memory)
from nmutil.util import Display
+from nmigen.lib.coding import Decoder
from copy import deepcopy
from random import randint, seed
+from nmigen_soc.wishbone.bus import Interface
+
from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
WBIOMasterOut, WBIOSlaveOut)
from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
# for test
from soc.bus.sram import SRAM
from nmutil.util import wrap
-
-# TODO: make these parameters of DCache at some point
-LINE_SIZE = 64 # Line size in bytes
-NUM_LINES = 16 # Number of lines in a set
-NUM_WAYS = 4 # Number of ways
-TLB_SET_SIZE = 64 # L1 DTLB entries per set
-TLB_NUM_WAYS = 2 # L1 DTLB number of sets
-TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
LOG_LENGTH = 0 # Non-zero to enable log data collection
-# BRAM organisation: We never access more than
-# -- WB_DATA_BITS at a time so to save
-# -- resources we make the array only that wide, and
-# -- use consecutive indices to make a cache "line"
-# --
-# -- ROW_SIZE is the width in bytes of the BRAM
-# -- (based on WB, so 64-bits)
-ROW_SIZE = WB_DATA_BITS // 8;
-
-# ROW_PER_LINE is the number of row (wishbone
-# transactions) in a line
-ROW_PER_LINE = LINE_SIZE // ROW_SIZE
-
-# BRAM_ROWS is the number of rows in BRAM needed
-# to represent the full dcache
-BRAM_ROWS = NUM_LINES * ROW_PER_LINE
-
-print ("ROW_SIZE", ROW_SIZE)
-print ("ROW_PER_LINE", ROW_PER_LINE)
-print ("BRAM_ROWS", BRAM_ROWS)
-print ("NUM_WAYS", NUM_WAYS)
-
-# Bit fields counts in the address
-
-# REAL_ADDR_BITS is the number of real address
-# bits that we store
-REAL_ADDR_BITS = 56
-
-# ROW_BITS is the number of bits to select a row
-ROW_BITS = log2_int(BRAM_ROWS)
-
-# ROW_LINE_BITS is the number of bits to select
-# a row within a line
-ROW_LINE_BITS = log2_int(ROW_PER_LINE)
-
-# LINE_OFF_BITS is the number of bits for
-# the offset in a cache line
-LINE_OFF_BITS = log2_int(LINE_SIZE)
-
-# ROW_OFF_BITS is the number of bits for
-# the offset in a row
-ROW_OFF_BITS = log2_int(ROW_SIZE)
-
-# INDEX_BITS is the number if bits to
-# select a cache line
-INDEX_BITS = log2_int(NUM_LINES)
-
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
-
-# TAG_BITS is the number of bits of
-# the tag part of the address
-TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
-
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS = log2_int(NUM_WAYS)
-
-# Example of layout for 32 lines of 64 bytes:
-layout = """\
- .. tag |index| line |
- .. | row | |
- .. | |---| | ROW_LINE_BITS (3)
- .. | |--- - --| LINE_OFF_BITS (6)
- .. | |- --| ROW_OFF_BITS (3)
- .. |----- ---| | ROW_BITS (8)
- .. |-----| | INDEX_BITS (5)
- .. --------| | TAG_BITS (45)
-"""
-print (layout)
-print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
- (TAG_BITS, INDEX_BITS, ROW_BITS,
- ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
-print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
-print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
-print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
-
-TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
-
-print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
-
-def CacheTagArray():
- return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
- for x in range(NUM_LINES))
-
-def CacheValidBitsArray():
- return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
- for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
- return Array(Signal(name="rows_valid%d" % x) \
- for x in range(ROW_PER_LINE))
-
-# L1 TLB
-TLB_SET_BITS = log2_int(TLB_SET_SIZE)
-TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
-TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
-TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
-TLB_PTE_BITS = 64
-TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
-
def ispow2(x):
return (1<<log2_int(x, False)) == x
-assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
-assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
- "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
- "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
- "geometry bits don't add up"
-assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
-assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
-
-
-def TLBValidBitsArray():
- return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
- for x in range(TLB_SET_SIZE))
-
-def TLBTagEAArray():
- return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
- for x in range (TLB_NUM_WAYS))
-
-def TLBTagsArray():
- return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
- for x in range (TLB_SET_SIZE))
-
-def TLBPtesArray():
- return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
- for x in range(TLB_SET_SIZE))
-
-def HitWaySet():
- return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
- for x in range(TLB_NUM_WAYS))
-
-# Cache RAM interface
-def CacheRamOut():
- return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
- for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
- return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
- for x in range(NUM_LINES))
-
-# TLB PLRU output interface
-def TLBPLRUOut():
- return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
- for x in range(TLB_SET_SIZE))
-
-# Helper functions to decode incoming requests
-#
-# Return the cache line index (tag index) for an address
-def get_index(addr):
- return addr[LINE_OFF_BITS:SET_SIZE_BITS]
-# Return the cache row index (data memory) for an address
-def get_row(addr):
- return addr[ROW_OFF_BITS:SET_SIZE_BITS]
+class DCacheConfig:
+ def __init__(self, LINE_SIZE = 64, # Line size in bytes
+ NUM_LINES = 64, # Number of lines in a set
+ NUM_WAYS = 2, # Number of ways
+ TLB_SET_SIZE = 64, # L1 DTLB entries per set
+ TLB_NUM_WAYS = 2, # L1 DTLB number of sets
+ TLB_LG_PGSZ = 12): # L1 DTLB log_2(page_size)
+ self.LINE_SIZE = LINE_SIZE
+ self.NUM_LINES = NUM_LINES
+ self.NUM_WAYS = NUM_WAYS
+ self.TLB_SET_SIZE = TLB_SET_SIZE
+ self.TLB_NUM_WAYS = TLB_NUM_WAYS
+ self.TLB_LG_PGSZ = TLB_LG_PGSZ
+
+ # BRAM organisation: We never access more than
+ # -- WB_DATA_BITS at a time so to save
+ # -- resources we make the array only that wide, and
+ # -- use consecutive indices to make a cache "line"
+ # --
+ # -- ROW_SIZE is the width in bytes of the BRAM
+ # -- (based on WB, so 64-bits)
+ self.ROW_SIZE = WB_DATA_BITS // 8;
+
+ # ROW_PER_LINE is the number of row (wishbone
+ # transactions) in a line
+ self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
+
+ # BRAM_ROWS is the number of rows in BRAM needed
+ # to represent the full dcache
+ self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
+
+ print ("ROW_SIZE", self.ROW_SIZE)
+ print ("ROW_PER_LINE", self.ROW_PER_LINE)
+ print ("BRAM_ROWS", self.BRAM_ROWS)
+ print ("NUM_WAYS", self.NUM_WAYS)
+
+ # Bit fields counts in the address
+
+ # REAL_ADDR_BITS is the number of real address
+ # bits that we store
+ self.REAL_ADDR_BITS = 56
+
+ # ROW_BITS is the number of bits to select a row
+ self.ROW_BITS = log2_int(self.BRAM_ROWS)
+
+ # ROW_LINE_BITS is the number of bits to select
+ # a row within a line
+ self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
+
+ # LINE_OFF_BITS is the number of bits for
+ # the offset in a cache line
+ self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
+
+ # ROW_OFF_BITS is the number of bits for
+ # the offset in a row
+ self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
+
+ # INDEX_BITS is the number if bits to
+ # select a cache line
+ self.INDEX_BITS = log2_int(self.NUM_LINES)
+
+ # SET_SIZE_BITS is the log base 2 of the set size
+ self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
+
+ # TAG_BITS is the number of bits of
+ # the tag part of the address
+ self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+
+ # TAG_WIDTH is the width in bits of each way of the tag RAM
+ self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+ # WAY_BITS is the number of bits to select a way
+ self.WAY_BITS = log2_int(self.NUM_WAYS)
+
+ # Example of layout for 32 lines of 64 bytes:
+ layout = f"""\
+ DCache Layout:
+ |.. -----------------------| REAL_ADDR_BITS ({self.REAL_ADDR_BITS})
+ .. |--------------| SET_SIZE_BITS ({self.SET_SIZE_BITS})
+ .. tag |index| line |
+ .. | row | |
+ .. | |---| | ROW_LINE_BITS ({self.ROW_LINE_BITS})
+ .. | |--- - --| LINE_OFF_BITS ({self.LINE_OFF_BITS})
+ .. | |- --| ROW_OFF_BITS ({self.ROW_OFF_BITS})
+ .. |----- ---| | ROW_BITS ({self.ROW_BITS})
+ .. |-----| | INDEX_BITS ({self.INDEX_BITS})
+ .. --------| | TAG_BITS ({self.TAG_BITS})
+ """
+ print (layout)
+ print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
+ (self.TAG_BITS, self.INDEX_BITS, self.ROW_BITS,
+ self.ROW_OFF_BITS, self.LINE_OFF_BITS, self.ROW_LINE_BITS))
+ print ("index @: %d-%d" % (self.LINE_OFF_BITS, self.SET_SIZE_BITS))
+ print ("row @: %d-%d" % (self.LINE_OFF_BITS, self.ROW_OFF_BITS))
+ print ("tag @: %d-%d width %d" % (self.SET_SIZE_BITS,
+ self.REAL_ADDR_BITS, self.TAG_WIDTH))
+
+ self.TAG_RAM_WIDTH = self.TAG_WIDTH * self.NUM_WAYS
+
+ print ("TAG_RAM_WIDTH", self.TAG_RAM_WIDTH)
+ print (" TAG_WIDTH", self.TAG_WIDTH)
+ print (" NUM_WAYS", self.NUM_WAYS)
+ print (" NUM_LINES", self.NUM_LINES)
+
+ # L1 TLB
+ self.TLB_SET_BITS = log2_int(self.TLB_SET_SIZE)
+ self.TLB_WAY_BITS = log2_int(self.TLB_NUM_WAYS)
+ self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TLB_SET_BITS)
+ self.TLB_TAG_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_EA_TAG_BITS
+ self.TLB_PTE_BITS = 64
+ self.TLB_PTE_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_PTE_BITS;
+
+ assert (self.LINE_SIZE % self.ROW_SIZE) == 0, \
+ "LINE_SIZE not multiple of ROW_SIZE"
+ assert ispow2(self.LINE_SIZE), "LINE_SIZE not power of 2"
+ assert ispow2(self.NUM_LINES), "NUM_LINES not power of 2"
+ assert ispow2(self.ROW_PER_LINE), "ROW_PER_LINE not power of 2"
+ assert self.ROW_BITS == \
+ (self.INDEX_BITS + self.ROW_LINE_BITS), \
+ "geometry bits don't add up"
+ assert (self.LINE_OFF_BITS == \
+ self.ROW_OFF_BITS + self.ROW_LINE_BITS), \
+ "geometry bits don't add up"
+ assert self.REAL_ADDR_BITS == \
+ (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS), \
+ "geometry bits don't add up"
+ assert self.REAL_ADDR_BITS == \
+ (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS), \
+ "geometry bits don't add up"
+ assert 64 == WB_DATA_BITS, \
+ "Can't yet handle wb width that isn't 64-bits"
+ assert self.SET_SIZE_BITS <= self.TLB_LG_PGSZ, \
+ "Set indexed by virtual address"
+
+ def CacheTagArray(self):
+ return Array(Signal(self.TAG_RAM_WIDTH, name="tag%d" % x) \
+ for x in range(self.NUM_LINES))
+
+ def CacheValidsArray(self):
+ return Array(Signal(self.NUM_WAYS, name="tag_valids%d" % x)
+ for x in range(self.NUM_LINES))
+
+ def RowPerLineValidArray(self):
+ return Array(Signal(name="rows_valid%d" % x) \
+ for x in range(self.ROW_PER_LINE))
+
+ def TLBHit(self, name):
+ return Record([('valid', 1),
+ ('way', self.TLB_WAY_BITS)], name=name)
+
+ def TLBTagEAArray(self):
+ return Array(Signal(self.TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
+ for x in range (self.TLB_NUM_WAYS))
+
+ def TLBRecord(self, name):
+ tlb_layout = [('valid', self.TLB_NUM_WAYS),
+ ('tag', self.TLB_TAG_WAY_BITS),
+ ('pte', self.TLB_PTE_WAY_BITS)
+ ]
+ return Record(tlb_layout, name=name)
+
+ def TLBValidArray(self):
+ return Array(Signal(self.TLB_NUM_WAYS, name="tlb_valid%d" % x)
+ for x in range(self.TLB_SET_SIZE))
+
+ def HitWaySet(self):
+ return Array(Signal(self.WAY_BITS, name="hitway_%d" % x) \
+ for x in range(self.TLB_NUM_WAYS))
+
+ # Cache RAM interface
+ def CacheRamOut(self):
+ return Array(Signal(self.WB_DATA_BITS, name="cache_out%d" % x) \
+ for x in range(self.NUM_WAYS))
+
+ # PLRU output interface
+ def PLRUOut(self):
+ return Array(Signal(self.WAY_BITS, name="plru_out%d" % x) \
+ for x in range(self.NUM_LINES))
+
+ # TLB PLRU output interface
+ def TLBPLRUOut(self):
+ return Array(Signal(self.TLB_WAY_BITS, name="tlbplru_out%d" % x) \
+ for x in range(self.TLB_SET_SIZE))
+
+ # Helper functions to decode incoming requests
+ #
+ # Return the cache line index (tag index) for an address
+ def get_index(self, addr):
+ return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+ # Return the cache row index (data memory) for an address
+ def get_row(self, addr):
+ return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
-# Return the index of a row within a line
-def get_row_of_line(row):
- return row[:ROW_BITS][:ROW_LINE_BITS]
+ # Return the index of a row within a line
+ def get_row_of_line(self, row):
+ return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
- return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
+ # Returns whether this is the last row of a line
+ def is_last_row_addr(self, addr, last):
+ return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
- return get_row_of_line(row) == last
+ # Returns whether this is the last row of a line
+ def is_last_row(self, row, last):
+ return self.get_row_of_line(row) == last
-# Return the next row in the current cache line. We use a
-# dedicated function in order to limit the size of the
-# generated adder to be only the bits within a cache line
-# (3 bits with default settings)
-def next_row(row):
- row_v = row[0:ROW_LINE_BITS] + 1
- return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
+ # Return the next row in the current cache line. We use a
+ # dedicated function in order to limit the size of the
+ # generated adder to be only the bits within a cache line
+ # (3 bits with default settings)
+ def next_row(self, row):
+ row_v = row[0:self.ROW_LINE_BITS] + 1
+ return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
-# Get the tag value from the address
-def get_tag(addr):
- return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
+ # Get the tag value from the address
+ def get_tag(self, addr):
+ return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
- return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
+ # Read a tag from a tag memory row
+ def read_tag(self, way, tagset):
+ return tagset.word_select(way, self.TAG_WIDTH)[:self.TAG_BITS]
-# Read a TLB tag from a TLB tag memory row
-def read_tlb_tag(way, tags):
- return tags.word_select(way, TLB_EA_TAG_BITS)
+ # Read a TLB tag from a TLB tag memory row
+ def read_tlb_tag(self, way, tags):
+ return tags.word_select(way, self.TLB_EA_TAG_BITS)
-# Write a TLB tag to a TLB tag memory row
-def write_tlb_tag(way, tags, tag):
- return read_tlb_tag(way, tags).eq(tag)
+ # Write a TLB tag to a TLB tag memory row
+ def write_tlb_tag(self, way, tags, tag):
+ return self.read_tlb_tag(way, tags).eq(tag)
-# Read a PTE from a TLB PTE memory row
-def read_tlb_pte(way, ptes):
- return ptes.word_select(way, TLB_PTE_BITS)
+ # Read a PTE from a TLB PTE memory row
+ def read_tlb_pte(self, way, ptes):
+ return ptes.word_select(way, self.TLB_PTE_BITS)
-def write_tlb_pte(way, ptes, newpte):
- return read_tlb_pte(way, ptes).eq(newpte)
+ def write_tlb_pte(self, way, ptes, newpte):
+ return self.read_tlb_pte(way, ptes).eq(newpte)
# Record for storing permission, attribute, etc. bits from a PTE
class MemAccessRequest(RecordObject):
- def __init__(self, name=None):
+ def __init__(self, cfg, name=None):
super().__init__(name=name)
self.op = Signal(Op)
self.valid = Signal()
self.dcbz = Signal()
- self.real_addr = Signal(REAL_ADDR_BITS)
+ self.real_addr = Signal(cfg.REAL_ADDR_BITS)
self.data = Signal(64)
self.byte_sel = Signal(8)
- self.hit_way = Signal(WAY_BITS)
+ self.hit_way = Signal(cfg.WAY_BITS)
self.same_tag = Signal()
self.mmu_req = Signal()
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
- def __init__(self, name=None):
+ def __init__(self, cfg, name=None):
super().__init__(name=name)
# Info about the request
self.full = Signal() # have uncompleted request
self.mmu_req = Signal() # request is from MMU
- self.req = MemAccessRequest(name="reqmem")
+ self.req = MemAccessRequest(cfg, name="reqmem")
# Cache hit state
- self.hit_way = Signal(WAY_BITS)
+ self.hit_way = Signal(cfg.WAY_BITS)
self.hit_load_valid = Signal()
- self.hit_index = Signal(INDEX_BITS)
+ self.hit_index = Signal(cfg.INDEX_BITS)
self.cache_hit = Signal()
# TLB hit state
- self.tlb_hit = Signal()
- self.tlb_hit_way = Signal(TLB_NUM_WAYS)
- self.tlb_hit_index = Signal(TLB_WAY_BITS)
+ self.tlb_hit = cfg.TLBHit("tlb_hit")
+ self.tlb_hit_index = Signal(cfg.TLB_SET_BITS)
# 2-stage data buffer for data forwarded from writes to reads
self.forward_data1 = Signal(64)
self.forward_data2 = Signal(64)
self.forward_sel1 = Signal(8)
self.forward_valid1 = Signal()
- self.forward_way1 = Signal(WAY_BITS)
- self.forward_row1 = Signal(ROW_BITS)
+ self.forward_way1 = Signal(cfg.WAY_BITS)
+ self.forward_row1 = Signal(cfg.ROW_BITS)
self.use_forward1 = Signal()
self.forward_sel = Signal(8)
self.write_tag = Signal()
self.slow_valid = Signal()
self.wb = WBMasterOut("wb")
- self.reload_tag = Signal(TAG_BITS)
- self.store_way = Signal(WAY_BITS)
- self.store_row = Signal(ROW_BITS)
- self.store_index = Signal(INDEX_BITS)
- self.end_row_ix = Signal(ROW_LINE_BITS)
- self.rows_valid = RowPerLineValidArray()
+ self.reload_tag = Signal(cfg.TAG_BITS)
+ self.store_way = Signal(cfg.WAY_BITS)
+ self.store_row = Signal(cfg.ROW_BITS)
+ self.store_index = Signal(cfg.INDEX_BITS)
+ self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
+ self.rows_valid = cfg.RowPerLineValidArray()
self.acks_pending = Signal(3)
self.inc_acks = Signal()
self.dec_acks = Signal()
# Reservation information
class Reservation(RecordObject):
- def __init__(self):
- super().__init__()
+ def __init__(self, cfg, name=None):
+ super().__init__(name=name)
self.valid = Signal()
- self.addr = Signal(64-LINE_OFF_BITS)
+ self.addr = Signal(64-cfg.LINE_OFF_BITS)
class DTLBUpdate(Elaboratable):
- def __init__(self):
+ def __init__(self, cfg):
+ self.cfg = cfg
self.tlbie = Signal()
self.tlbwe = Signal()
self.doall = Signal()
- self.updated = Signal()
- self.v_updated = Signal()
- self.tlb_hit = Signal()
- self.tlb_req_index = Signal(TLB_SET_BITS)
-
- self.tlb_hit_way = Signal(TLB_WAY_BITS)
- self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
- self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
- self.repl_way = Signal(TLB_WAY_BITS)
- self.eatag = Signal(TLB_EA_TAG_BITS)
- self.pte_data = Signal(TLB_PTE_BITS)
+ self.tlb_hit = cfg.TLBHit("tlb_hit")
+ self.tlb_req_index = Signal(cfg.TLB_SET_BITS)
- self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
+ self.repl_way = Signal(cfg.TLB_WAY_BITS)
+ self.eatag = Signal(cfg.TLB_EA_TAG_BITS)
+ self.pte_data = Signal(cfg.TLB_PTE_BITS)
- self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
- self.db_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
- self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+ # read from dtlb array
+ self.tlb_read = Signal()
+ self.tlb_read_index = Signal(cfg.TLB_SET_BITS)
+ self.tlb_way = cfg.TLBRecord("o_tlb_way")
def elaborate(self, platform):
m = Module()
comb = m.d.comb
sync = m.d.sync
-
- tagset = Signal(TLB_TAG_WAY_BITS)
- pteset = Signal(TLB_PTE_WAY_BITS)
-
- tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
- comb += db_out.eq(self.dv)
+ cfg = self.cfg
+
+ # there are 3 parts to this:
+ # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+ # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+ # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs. these cannot
+ # be a Memory because they can all be cleared (tlbie, doall), i mean,
+ # we _could_, in theory, by overriding the Reset Signal of the Memory,
+ # hmmm....
+
+ dtlb_valid = cfg.TLBValidArray()
+ tlb_req_index = self.tlb_req_index
+
+ print ("TLB_TAG_WAY_BITS", cfg.TLB_TAG_WAY_BITS)
+ print (" TLB_EA_TAG_BITS", cfg.TLB_EA_TAG_BITS)
+ print (" TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+ print ("TLB_PTE_WAY_BITS", cfg.TLB_PTE_WAY_BITS)
+ print (" TLB_PTE_BITS", cfg.TLB_PTE_BITS)
+ print (" TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+
+ # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+ tagway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_TAG_WAY_BITS,
+ attrs={'syn_ramstyle': "block_ram"})
+ m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+ m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+ granularity=cfg.TLB_EA_TAG_BITS)
+
+ pteway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_PTE_WAY_BITS,
+ attrs={'syn_ramstyle': "block_ram"})
+ m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+ m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+ granularity=cfg.TLB_PTE_BITS)
+
+ # commented out for now, can be put in if Memory.reset can be
+ # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+ #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+ #m.submodules.rd_valid = rd_valid = validm.read_port()
+ #m.submodules.wr_valid = wr_valid = validm.write_port(
+ #granularity=1)
+
+ # connect up read and write addresses to Valid/PTE/TAG SRAMs
+ m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+ m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+ #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+ m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+ m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+ #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+ updated = Signal()
+ v_updated = Signal()
+ tb_out = Signal(cfg.TLB_TAG_WAY_BITS) # tlb_way_tags_t
+ db_out = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
+ pb_out = Signal(cfg.TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+ dv = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
+
+ comb += dv.eq(dtlb_valid[tlb_req_index])
+ comb += db_out.eq(dv)
with m.If(self.tlbie & self.doall):
- pass # clear all back in parent
+ # clear all valid bits at once
+ # XXX hmmm, validm _could_ use Memory reset here...
+ for i in range(cfg.TLB_SET_SIZE):
+ sync += dtlb_valid[i].eq(0)
with m.Elif(self.tlbie):
- with m.If(self.tlb_hit):
- comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
- comb += self.v_updated.eq(1)
-
+ # invalidate just the hit_way
+ with m.If(self.tlb_hit.valid):
+ comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
+ comb += v_updated.eq(1)
with m.Elif(self.tlbwe):
-
- comb += tagset.eq(self.tlb_tag_way)
- comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
- comb += tb_out.eq(tagset)
-
- comb += pteset.eq(self.tlb_pte_way)
- comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
- comb += pb_out.eq(pteset)
-
+ # write to the requested tag and PTE
+ comb += cfg.write_tlb_tag(self.repl_way, tb_out, self.eatag)
+ comb += cfg.write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+ # set valid bit
comb += db_out.bit_select(self.repl_way, 1).eq(1)
- comb += self.updated.eq(1)
- comb += self.v_updated.eq(1)
+ comb += updated.eq(1)
+ comb += v_updated.eq(1)
+
+ # above, sometimes valid is requested to be updated but data not
+ # therefore split them out, here. note the granularity thing matches
+ # with the shift-up of the eatag/pte_data into the correct TLB way.
+ # thus is it not necessary to write the entire lot, just the portion
+ # being altered: hence writing the *old* copy of the row is not needed
+ with m.If(updated): # PTE and TAG to be written
+ comb += wr_pteway.data.eq(pb_out)
+ comb += wr_pteway.en.eq(1<<self.repl_way)
+ comb += wr_tagway.data.eq(tb_out)
+ comb += wr_tagway.en.eq(1<<self.repl_way)
+ with m.If(v_updated): # Valid to be written
+ sync += dtlb_valid[tlb_req_index].eq(db_out)
+ #comb += wr_valid.data.eq(db_out)
+ #comb += wr_valid.en.eq(1<<self.repl_way)
+
+ # select one TLB way, use a register here
+ r_delay = Signal()
+ sync += r_delay.eq(self.tlb_read)
+ # first deal with the valids, which are not in a Memory.
+ # tlb way valid is output on a 1 clock delay with sync,
+ # but have to explicitly deal with "forwarding" here
+ with m.If(self.tlb_read):
+ with m.If(v_updated): # write *and* read in same cycle: forward
+ sync += self.tlb_way.valid.eq(db_out)
+ with m.Else():
+ sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+ # now deal with the Memory-read case. the output must remain
+ # valid (stable) even when a read-request is not made, but stable
+ # on a one-clock delay, hence the register
+ r_tlb_way = cfg.TLBRecord("r_tlb_way")
+ with m.If(r_delay):
+ # on one clock delay, capture the contents of the read port(s)
+ comb += self.tlb_way.tag.eq(rd_tagway.data)
+ comb += self.tlb_way.pte.eq(rd_pteway.data)
+ sync += r_tlb_way.tag.eq(rd_tagway.data)
+ sync += r_tlb_way.pte.eq(rd_pteway.data)
+ with m.Else():
+ # ... so that the register can output it when no read is requested
+ # it's rather overkill but better to be safe than sorry
+ comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+ comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+ #comb += self.tlb_way.eq(r_tlb_way)
return m
class DCachePendingHit(Elaboratable):
- def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
+ def __init__(self, cfg, tlb_way,
cache_i_validdx, cache_tag_set,
- req_addr,
- hit_set):
+ req_addr):
self.go = Signal()
self.virt_mode = Signal()
self.is_hit = Signal()
- self.tlb_hit = Signal()
- self.hit_way = Signal(WAY_BITS)
+ self.tlb_hit = cfg.TLBHit("tlb_hit")
+ self.hit_way = Signal(cfg.WAY_BITS)
self.rel_match = Signal()
- self.req_index = Signal(INDEX_BITS)
- self.reload_tag = Signal(TAG_BITS)
+ self.req_index = Signal(cfg.INDEX_BITS)
+ self.reload_tag = Signal(cfg.TAG_BITS)
- self.tlb_hit_way = tlb_hit_way
- self.tlb_pte_way = tlb_pte_way
- self.tlb_valid_way = tlb_valid_way
+ self.tlb_way = tlb_way
self.cache_i_validdx = cache_i_validdx
self.cache_tag_set = cache_tag_set
self.req_addr = req_addr
- self.hit_set = hit_set
+ self.cfg = cfg
def elaborate(self, platform):
m = Module()
go = self.go
virt_mode = self.virt_mode
is_hit = self.is_hit
- tlb_pte_way = self.tlb_pte_way
- tlb_valid_way = self.tlb_valid_way
+ tlb_way = self.tlb_way
cache_i_validdx = self.cache_i_validdx
cache_tag_set = self.cache_tag_set
req_addr = self.req_addr
- tlb_hit_way = self.tlb_hit_way
tlb_hit = self.tlb_hit
- hit_set = self.hit_set
hit_way = self.hit_way
rel_match = self.rel_match
req_index = self.req_index
reload_tag = self.reload_tag
+ cfg = self.cfg
+ hit_set = Array(Signal(name="hit_set_%d" % i) \
+ for i in range(cfg.TLB_NUM_WAYS))
rel_matches = Array(Signal(name="rel_matches_%d" % i) \
- for i in range(TLB_NUM_WAYS))
- hit_way_set = HitWaySet()
+ for i in range(cfg.TLB_NUM_WAYS))
+ hit_way_set = cfg.HitWaySet()
# Test if pending request is a hit on any way
# In order to make timing in virtual mode,
# the TLB, and then decide later which match to use.
with m.If(virt_mode):
- for j in range(TLB_NUM_WAYS): # tlb_num_way_t
- s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
- s_hit = Signal()
- s_pte = Signal(TLB_PTE_BITS)
- s_ra = Signal(REAL_ADDR_BITS)
- comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
- comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
- s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
- comb += s_tag.eq(get_tag(s_ra))
-
- for i in range(NUM_WAYS): # way_t
+ for j in range(cfg.TLB_NUM_WAYS): # tlb_num_way_t
+ s_tag = Signal(cfg.TAG_BITS, name="s_tag%d" % j)
+ s_hit = Signal(name="s_hit%d" % j)
+ s_pte = Signal(cfg.TLB_PTE_BITS, name="s_pte%d" % j)
+ s_ra = Signal(cfg.REAL_ADDR_BITS, name="s_ra%d" % j)
+ # read the PTE, calc the Real Address, get tge tag
+ comb += s_pte.eq(cfg.read_tlb_pte(j, tlb_way.pte))
+ comb += s_ra.eq(Cat(req_addr[0:cfg.TLB_LG_PGSZ],
+ s_pte[cfg.TLB_LG_PGSZ:cfg.REAL_ADDR_BITS]))
+ comb += s_tag.eq(cfg.get_tag(s_ra))
+ # for each way check tge tag against the cache tag set
+ for i in range(cfg.NUM_WAYS): # way_t
is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
comb += is_tag_hit.eq(go & cache_i_validdx[i] &
- (read_tag(i, cache_tag_set) == s_tag)
- & tlb_valid_way[j])
+ (cfg.read_tag(i, cache_tag_set) == s_tag)
+ & (tlb_way.valid[j]))
with m.If(is_tag_hit):
comb += hit_way_set[j].eq(i)
comb += s_hit.eq(1)
comb += hit_set[j].eq(s_hit)
- with m.If(s_tag == reload_tag):
- comb += rel_matches[j].eq(1)
- with m.If(tlb_hit):
- comb += is_hit.eq(hit_set[tlb_hit_way])
- comb += hit_way.eq(hit_way_set[tlb_hit_way])
- comb += rel_match.eq(rel_matches[tlb_hit_way])
+ comb += rel_matches[j].eq(s_tag == reload_tag)
+ with m.If(tlb_hit.valid):
+ comb += is_hit.eq(hit_set[tlb_hit.way])
+ comb += hit_way.eq(hit_way_set[tlb_hit.way])
+ comb += rel_match.eq(rel_matches[tlb_hit.way])
with m.Else():
- s_tag = Signal(TAG_BITS)
- comb += s_tag.eq(get_tag(req_addr))
- for i in range(NUM_WAYS): # way_t
+ s_tag = Signal(cfg.TAG_BITS)
+ comb += s_tag.eq(cfg.get_tag(req_addr))
+ for i in range(cfg.NUM_WAYS): # way_t
is_tag_hit = Signal(name="is_tag_hit_%d" % i)
comb += is_tag_hit.eq(go & cache_i_validdx[i] &
- (read_tag(i, cache_tag_set) == s_tag))
+ (cfg.read_tag(i, cache_tag_set) == s_tag))
with m.If(is_tag_hit):
comb += hit_way.eq(i)
comb += is_hit.eq(1)
return m
-class DCache(Elaboratable):
+class DCache(Elaboratable, DCacheConfig):
"""Set associative dcache write-through
TODO (in no specific order):
at the end of line (this requires dealing with requests coming in
while not idle...)
"""
- def __init__(self):
+ def __init__(self, pspec=None):
self.d_in = LoadStore1ToDCacheType("d_in")
self.d_out = DCacheToLoadStore1Type("d_out")
self.m_out = DCacheToMMUType("m_out")
self.stall_out = Signal()
-
- self.wb_out = WBMasterOut("wb_out")
- self.wb_in = WBSlaveOut("wb_in")
+ self.any_stall_out = Signal()
+ self.dreq_when_stall = Signal()
+ self.mreq_when_stall = Signal()
+
+ # standard naming (wired to non-standard for compatibility)
+ self.bus = Interface(addr_width=32,
+ data_width=64,
+ granularity=8,
+ features={'stall'},
+ #alignment=0,
+ name="dcache")
self.log_out = Signal(20)
+ # test if small cache to be enabled
+ self.small_cache = (hasattr(pspec, "small_cache") and
+ (pspec.small_cache == True))
+ # test if microwatt compatibility is to be enabled
+ self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+ (pspec.microwatt_compat == True))
+ # test if fabric compatibility is to be enabled
+ self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+ (pspec.fabric_compat == True))
+
+ XLEN = pspec.XLEN
+ TLB_SET_SIZE = 8
+ TLB_NUM_WAYS = 2
+ NUM_LINES = 8
+ NUM_WAYS = 2
+
+ if self.small_cache:
+ # reduce way sizes and num lines to ridiculously small
+ TLB_SET_SIZE = 2
+ TLB_NUM_WAYS = 1
+ NUM_LINES = 2
+ NUM_WAYS = 1
+ if self.microwatt_compat or self.fabric_compat:
+ # reduce way sizes
+ NUM_WAYS = 1
+ TLB_NUM_WAYS = 1
+
+ super().__init__(TLB_SET_SIZE=TLB_SET_SIZE,
+ # XLEN=XLEN, # TODO
+ TLB_NUM_WAYS = TLB_NUM_WAYS,
+ NUM_LINES = NUM_LINES,
+ NUM_WAYS = NUM_WAYS
+ )
+
def stage_0(self, m, r0, r1, r0_full):
"""Latch the request in r0.req as long as we're not stalling
"""
comb += r.doall.eq(m_in.doall)
comb += r.tlbld.eq(m_in.tlbld)
comb += r.mmu_req.eq(1)
+ comb += r.d_valid.eq(1)
m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
m_in.addr, m_in.pte, r.req.load)
comb += r.doall.eq(0)
comb += r.tlbld.eq(0)
comb += r.mmu_req.eq(0)
+ comb += r.d_valid.eq(0)
+
+ sync += r0_full.eq(0)
with m.If((~r1.full & ~d_in.hold) | ~r0_full):
sync += r0.eq(r)
sync += r0_full.eq(r.req.valid)
+ with m.Elif(~r0.d_valid):
# Sample data the cycle after a request comes in from loadstore1.
# If another request has come in already then the data will get
# put directly into req.data below.
- with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
- ~r0.mmu_req):
- sync += r0.req.data.eq(d_in.data)
- sync += r0.d_valid.eq(1)
+ sync += r0.req.data.eq(d_in.data)
+ sync += r0.d_valid.eq(1)
with m.If(d_in.valid):
m.d.sync += Display(" DCACHE req cache "
"virt %d addr %x data %x ld %d",
r.req.virt_mode, r.req.addr,
r.req.data, r.req.load)
- def tlb_read(self, m, r0_stall, tlb_valid_way,
- tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
- dtlb_tags, dtlb_ptes):
+ def tlb_read(self, m, r0_stall, tlb_way):
"""TLB
Operates in the second cycle on the request latched in r0.req.
TLB updates write the entry at the end of the second cycle.
sync = m.d.sync
m_in, d_in = self.m_in, self.d_in
- index = Signal(TLB_SET_BITS)
- addrbits = Signal(TLB_SET_BITS)
+ addrbits = Signal(self.TLB_SET_BITS)
- amin = TLB_LG_PGSZ
- amax = TLB_LG_PGSZ + TLB_SET_BITS
+ amin = self.TLB_LG_PGSZ
+ amax = self.TLB_LG_PGSZ + self.TLB_SET_BITS
with m.If(m_in.valid):
comb += addrbits.eq(m_in.addr[amin : amax])
with m.Else():
comb += addrbits.eq(d_in.addr[amin : amax])
- comb += index.eq(addrbits)
# If we have any op and the previous op isn't finished,
# then keep the same output for next cycle.
- with m.If(~r0_stall):
- sync += tlb_valid_way.eq(dtlb_valid_bits[index])
- sync += tlb_tag_way.eq(dtlb_tags[index])
- sync += tlb_pte_way.eq(dtlb_ptes[index])
+ d = self.dtlb_update
+ comb += d.tlb_read_index.eq(addrbits)
+ comb += d.tlb_read.eq(~r0_stall)
+ comb += tlb_way.eq(d.tlb_way)
- def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+ def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
"""Generate TLB PLRUs
"""
comb = m.d.comb
sync = m.d.sync
- if TLB_NUM_WAYS == 0:
+ if self.TLB_NUM_WAYS == 0:
return
- for i in range(TLB_SET_SIZE):
- # TLB PLRU interface
- tlb_plru = PLRU(TLB_WAY_BITS)
- setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
- tlb_plru_acc_en = Signal()
- comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
- comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
- comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
- comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+ # suite of PLRUs with a selection and output mechanism
+ tlb_plrus = PLRUs("d_tlb", self.TLB_SET_SIZE, self.TLB_WAY_BITS)
+ m.submodules.tlb_plrus = tlb_plrus
+ comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+ comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+ comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+ comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+ comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
def tlb_search(self, m, tlb_req_index, r0, r0_valid,
- tlb_valid_way, tlb_tag_way, tlb_hit_way,
- tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
+ tlb_way,
+ pte, tlb_hit, valid_ra, perm_attr, ra):
comb = m.d.comb
- hitway = Signal(TLB_WAY_BITS)
+ hitway = Signal(self.TLB_WAY_BITS)
hit = Signal()
- eatag = Signal(TLB_EA_TAG_BITS)
+ eatag = Signal(self.TLB_EA_TAG_BITS)
- TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
- comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
- comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
+ self.TLB_LG_END = self.TLB_LG_PGSZ + self.TLB_SET_BITS
+ r0_req_addr = r0.req.addr[self.TLB_LG_PGSZ : self.TLB_LG_END]
+ comb += tlb_req_index.eq(r0_req_addr)
+ comb += eatag.eq(r0.req.addr[self.TLB_LG_END : 64 ])
- for i in range(TLB_NUM_WAYS):
+ for i in range(self.TLB_NUM_WAYS):
is_tag_hit = Signal(name="is_tag_hit%d" % i)
- tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
- comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
- comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
+ tlb_tag = Signal(self.TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
+ comb += tlb_tag.eq(self.read_tlb_tag(i, tlb_way.tag))
+ comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
with m.If(is_tag_hit):
comb += hitway.eq(i)
comb += hit.eq(1)
- comb += tlb_hit.eq(hit & r0_valid)
- comb += tlb_hit_way.eq(hitway)
+ comb += tlb_hit.valid.eq(hit & r0_valid)
+ comb += tlb_hit.way.eq(hitway)
- with m.If(tlb_hit):
- comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
- comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+ with m.If(tlb_hit.valid):
+ comb += pte.eq(self.read_tlb_pte(hitway, tlb_way.pte))
+ comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
with m.If(r0.req.virt_mode):
- comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
- r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
- pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
+ comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+ r0.req.addr[self.ROW_OFF_BITS:self.TLB_LG_PGSZ],
+ pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
comb += perm_attr.reference.eq(pte[8])
comb += perm_attr.changed.eq(pte[7])
comb += perm_attr.nocache.eq(pte[5])
comb += perm_attr.rd_perm.eq(pte[2])
comb += perm_attr.wr_perm.eq(pte[1])
with m.Else():
- comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
- r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
+ comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+ r0.req.addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS]))
comb += perm_attr.reference.eq(1)
comb += perm_attr.changed.eq(1)
comb += perm_attr.nocache.eq(0)
with m.If(valid_ra):
m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
- r0.req.virt_mode, tlb_hit, ra, pte)
+ r0.req.virt_mode, tlb_hit.valid, ra, pte)
m.d.sync += Display(" perm ref=%d", perm_attr.reference)
m.d.sync += Display(" perm chg=%d", perm_attr.changed)
m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)
- def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
- tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
- dtlb_tags, tlb_pte_way, dtlb_ptes):
-
- dtlb_valids = TLBValidBitsArray()
+ def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+ tlb_hit, tlb_plru_victim):
comb = m.d.comb
sync = m.d.sync
comb += tlbie.eq(r0_valid & r0.tlbie)
comb += tlbwe.eq(r0_valid & r0.tlbld)
- m.submodules.tlb_update = d = DTLBUpdate()
- with m.If(tlbie & r0.doall):
- # clear all valid bits at once
- for i in range(TLB_SET_SIZE):
- sync += dtlb_valid_bits[i].eq(0)
- with m.If(d.updated):
- sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
- sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
- with m.If(d.v_updated):
- sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
-
- comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+ d = self.dtlb_update
comb += d.tlbie.eq(tlbie)
comb += d.tlbwe.eq(tlbwe)
comb += d.doall.eq(r0.doall)
comb += d.tlb_hit.eq(tlb_hit)
- comb += d.tlb_hit_way.eq(tlb_hit_way)
- comb += d.tlb_tag_way.eq(tlb_tag_way)
- comb += d.tlb_pte_way.eq(tlb_pte_way)
comb += d.tlb_req_index.eq(tlb_req_index)
- with m.If(tlb_hit):
- comb += d.repl_way.eq(tlb_hit_way)
+ with m.If(tlb_hit.valid):
+ comb += d.repl_way.eq(tlb_hit.way)
with m.Else():
- comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
- comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
+ comb += d.repl_way.eq(tlb_plru_victim)
+ comb += d.eatag.eq(r0.req.addr[self.TLB_LG_PGSZ + self.TLB_SET_BITS:64])
comb += d.pte_data.eq(r0.req.data)
def maybe_plrus(self, m, r1, plru_victim):
comb = m.d.comb
sync = m.d.sync
- if TLB_NUM_WAYS == 0:
+ if self.TLB_NUM_WAYS == 0:
return
- for i in range(NUM_LINES):
- # PLRU interface
- plru = PLRU(WAY_BITS)
- setattr(m.submodules, "plru%d" % i, plru)
- plru_acc_en = Signal()
+ # suite of PLRUs with a selection and output mechanism
+ m.submodules.plrus = plrus = PLRUs("dtag", self.NUM_LINES,
+ self.WAY_BITS)
+ comb += plrus.way.eq(r1.hit_way)
+ comb += plrus.valid.eq(r1.cache_hit)
+ comb += plrus.index.eq(r1.hit_index)
+ comb += plrus.isel.eq(r1.store_index) # select victim
+ comb += plru_victim.eq(plrus.o_index) # selected victim
- comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
- comb += plru.acc_en.eq(plru_acc_en)
- comb += plru.acc_i.eq(r1.hit_way)
- comb += plru_victim[i].eq(plru.lru_o)
-
- def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
+ def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set):
"""Cache tag RAM read port
"""
comb = m.d.comb
sync = m.d.sync
+
m_in, d_in = self.m_in, self.d_in
- index = Signal(INDEX_BITS)
+ # synchronous tag read-port: NOT TRANSPARENT (cannot pass through
+ # write-to-a-read at the same time), seems to pass tests ok
+ m.submodules.rd_tag = rd_tag = self.tagmem.read_port(transparent=False)
+
+ index = Signal(self.INDEX_BITS)
with m.If(r0_stall):
comb += index.eq(req_index)
with m.Elif(m_in.valid):
- comb += index.eq(get_index(m_in.addr))
+ comb += index.eq(self.get_index(m_in.addr))
with m.Else():
- comb += index.eq(get_index(d_in.addr))
- sync += cache_tag_set.eq(cache_tags[index])
+ comb += index.eq(self.get_index(d_in.addr))
+ comb += rd_tag.addr.eq(index)
+ comb += cache_tag_set.eq(rd_tag.data) # read-port is a 1-clock delay
def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
r0_valid, r1, cache_valids, replace_way,
use_forward1_next, use_forward2_next,
req_hit_way, plru_victim, rc_ok, perm_attr,
valid_ra, perm_ok, access_ok, req_op, req_go,
- tlb_pte_way,
- tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+ tlb_hit, tlb_way, cache_tag_set,
cancel_store, req_same_tag, r0_stall, early_req_row):
"""Cache request parsing and hit detection
"""
m_in, d_in = self.m_in, self.d_in
is_hit = Signal()
- hit_way = Signal(WAY_BITS)
+ hit_way = Signal(self.WAY_BITS)
op = Signal(Op)
opsel = Signal(3)
go = Signal()
nc = Signal()
- hit_set = Array(Signal(name="hit_set_%d" % i) \
- for i in range(TLB_NUM_WAYS))
- cache_i_validdx = Signal(NUM_WAYS)
+ cache_i_validdx = Signal(self.NUM_WAYS)
# Extract line, row and tag from request
- comb += req_index.eq(get_index(r0.req.addr))
- comb += req_row.eq(get_row(r0.req.addr))
- comb += req_tag.eq(get_tag(ra))
+ comb += req_index.eq(self.get_index(r0.req.addr))
+ comb += req_row.eq(self.get_row(r0.req.addr))
+ comb += req_tag.eq(self.get_tag(ra))
if False: # display on comb is a bit... busy.
comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
comb += cache_i_validdx.eq(cache_valids[req_index])
- m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
- tlb_valid_way, tlb_hit_way,
- cache_i_validdx, cache_tag_set,
- r0.req.addr,
- hit_set)
-
+ m.submodules.dcache_pend = dc = DCachePendingHit(self, tlb_way,
+ cache_i_validdx, cache_tag_set,
+ r0.req.addr)
comb += dc.tlb_hit.eq(tlb_hit)
comb += dc.reload_tag.eq(r1.reload_tag)
comb += dc.virt_mode.eq(r0.req.virt_mode)
comb += dc.go.eq(go)
comb += dc.req_index.eq(req_index)
+
comb += is_hit.eq(dc.is_hit)
comb += hit_way.eq(dc.hit_way)
comb += req_same_tag.eq(dc.rel_match)
# For a store, consider this a hit even if the row isn't
# valid since it will be by the time we perform the store.
# For a load, check the appropriate row valid bit.
- rrow = Signal(ROW_LINE_BITS)
+ rrow = Signal(self.ROW_LINE_BITS)
comb += rrow.eq(req_row)
valid = r1.rows_valid[rrow]
comb += is_hit.eq((~r0.req.load) | valid)
comb += hit_way.eq(replace_way)
# Whether to use forwarded data for a load or not
- with m.If((get_row(r1.req.real_addr) == req_row) &
+ with m.If((self.get_row(r1.req.real_addr) == req_row) &
(r1.req.hit_way == hit_way)):
# Only need to consider r1.write_bram here, since if we
# are writing refill data here, then we don't have a
# The way to replace on a miss
with m.If(r1.write_tag):
- comb += replace_way.eq(plru_victim[r1.store_index])
+ comb += replace_way.eq(plru_victim)
with m.Else():
comb += replace_way.eq(r1.store_way)
(perm_attr.wr_perm |
(r0.req.load & perm_attr.rd_perm)))
comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
+
# Combine the request and cache hit status to decide what
# operation needs to be done
comb += nc.eq(r0.req.nc | perm_attr.nocache)
# row requested.
with m.If(~r0_stall):
with m.If(m_in.valid):
- comb += early_req_row.eq(get_row(m_in.addr))
+ comb += early_req_row.eq(self.get_row(m_in.addr))
with m.Else():
- comb += early_req_row.eq(get_row(d_in.addr))
+ comb += early_req_row.eq(self.get_row(d_in.addr))
with m.Else():
comb += early_req_row.eq(req_row)
with m.Else():
comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
with m.If((~reservation.valid) |
- (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
+ (r0.req.addr[self.LINE_OFF_BITS:64] !=
+ reservation.addr)):
comb += cancel_store.eq(1)
def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
reservation, r0):
-
comb = m.d.comb
sync = m.d.sync
sync += reservation.valid.eq(0)
with m.Elif(set_rsrv):
sync += reservation.valid.eq(1)
- sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
+ sync += reservation.addr.eq(r0.req.addr[self.LINE_OFF_BITS:64])
def writeback_control(self, m, r1, cache_out_row):
"""Return data for loads & completion control logic
dsel = data_fwd.word_select(i, 8)
comb += data_out.word_select(i, 8).eq(dsel)
+ # DCache output to LoadStore
comb += d_out.valid.eq(r1.ls_valid)
comb += d_out.data.eq(data_out)
comb += d_out.store_done.eq(~r1.stcx_fail)
account by using 1-cycle delayed signals for load hits.
"""
comb = m.d.comb
- wb_in = self.wb_in
+ bus = self.bus
+
+ # a Binary-to-Unary one-hots here. replace-way one-hot is gated
+ # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+ m.submodules.rams_replace_way_e = rwe = Decoder(self.NUM_WAYS)
+ comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+ ~r1.write_bram))
+ comb += rwe.i.eq(replace_way)
+
+ m.submodules.rams_hit_way_e = hwe = Decoder(self.NUM_WAYS)
+ comb += hwe.i.eq(r1.hit_way)
+
+ # this one is gated with write_bram, and replace_way_e can never be
+ # set at the same time. that means that do_write can OR the outputs
+ m.submodules.rams_hit_req_way_e = hre = Decoder(self.NUM_WAYS)
+ comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+ comb += hre.i.eq(r1.req.hit_way)
+
+ # common Signals
+ do_read = Signal()
+ wr_addr = Signal(self.ROW_BITS)
+ wr_data = Signal(WB_DATA_BITS)
+ wr_sel = Signal(self.ROW_SIZE)
+ rd_addr = Signal(self.ROW_BITS)
+
+ comb += do_read.eq(1) # always enable
+ comb += rd_addr.eq(early_req_row)
+
+ # Write mux:
+ #
+ # Defaults to wishbone read responses (cache refill)
+ #
+ # For timing, the mux on wr_data/sel/addr is not
+ # dependent on anything other than the current state.
- for i in range(NUM_WAYS):
- do_read = Signal(name="do_rd%d" % i)
- rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
+ with m.If(r1.write_bram):
+ # Write store data to BRAM. This happens one
+ # cycle after the store is in r0.
+ comb += wr_data.eq(r1.req.data)
+ comb += wr_sel.eq(r1.req.byte_sel)
+ comb += wr_addr.eq(self.get_row(r1.req.real_addr))
+
+ with m.Else():
+ # Otherwise, we might be doing a reload or a DCBZ
+ with m.If(r1.dcbz):
+ comb += wr_data.eq(0)
+ with m.Else():
+ comb += wr_data.eq(bus.dat_r)
+ comb += wr_addr.eq(r1.store_row)
+ comb += wr_sel.eq(~0) # all 1s
+
+ # set up Cache Rams
+ for i in range(self.NUM_WAYS):
do_write = Signal(name="do_wr%d" % i)
- wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
- wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
- wr_sel = Signal(ROW_SIZE)
- wr_sel_m = Signal(ROW_SIZE)
- _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+ wr_sel_m = Signal(self.ROW_SIZE, name="wr_sel_m_%d" % i)
+ d_out= Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
- way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
- setattr(m.submodules, "cacheram_%d" % i, way)
+ way = CacheRam(self.ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
+ m.submodules["cacheram_%d" % i] = way
comb += way.rd_en.eq(do_read)
comb += way.rd_addr.eq(rd_addr)
- comb += _d_out.eq(way.rd_data_o)
+ comb += d_out.eq(way.rd_data_o)
comb += way.wr_sel.eq(wr_sel_m)
comb += way.wr_addr.eq(wr_addr)
comb += way.wr_data.eq(wr_data)
# Cache hit reads
- comb += do_read.eq(1)
- comb += rd_addr.eq(early_req_row)
- with m.If(r1.hit_way == i):
- comb += cache_out_row.eq(_d_out)
-
- # Write mux:
- #
- # Defaults to wishbone read responses (cache refill)
- #
- # For timing, the mux on wr_data/sel/addr is not
- # dependent on anything other than the current state.
-
- with m.If(r1.write_bram):
- # Write store data to BRAM. This happens one
- # cycle after the store is in r0.
- comb += wr_data.eq(r1.req.data)
- comb += wr_sel.eq(r1.req.byte_sel)
- comb += wr_addr.eq(get_row(r1.req.real_addr))
-
- with m.If(i == r1.req.hit_way):
- comb += do_write.eq(1)
- with m.Else():
- # Otherwise, we might be doing a reload or a DCBZ
- with m.If(r1.dcbz):
- comb += wr_data.eq(0)
- with m.Else():
- comb += wr_data.eq(wb_in.dat)
- comb += wr_addr.eq(r1.store_row)
- comb += wr_sel.eq(~0) # all 1s
+ with m.If(hwe.o[i]):
+ comb += cache_out_row.eq(d_out)
- with m.If((r1.state == State.RELOAD_WAIT_ACK)
- & wb_in.ack & (replace_way == i)):
- comb += do_write.eq(1)
+ # these are mutually-exclusive via their Decoder-enablers
+ # (note: Decoder-enable is inverted)
+ comb += do_write.eq(hre.o[i] | rwe.o[i])
# Mask write selects with do_write since BRAM
# doesn't have a global write-enable
# It also handles error cases (TLB miss, cache paradox)
def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
req_hit_way, req_index, req_tag, access_ok,
- tlb_hit, tlb_hit_way, tlb_req_index):
-
+ tlb_hit, tlb_req_index):
comb = m.d.comb
sync = m.d.sync
sync += r1.hit_way.eq(req_hit_way)
sync += r1.hit_index.eq(req_index)
- with m.If(req_op == Op.OP_LOAD_HIT):
- sync += r1.hit_load_valid.eq(1)
- with m.Else():
- sync += r1.hit_load_valid.eq(0)
-
- with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
- sync += r1.cache_hit.eq(1)
- with m.Else():
- sync += r1.cache_hit.eq(0)
+ sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
+ sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
+ (req_op == Op.OP_STORE_HIT))
with m.If(req_op == Op.OP_BAD):
sync += Display("Signalling ld/st error "
sync += r1.ls_error.eq(~r0.mmu_req)
sync += r1.mmu_error.eq(r0.mmu_req)
sync += r1.cache_paradox.eq(access_ok)
-
with m.Else():
sync += r1.ls_error.eq(0)
sync += r1.mmu_error.eq(0)
sync += r1.cache_paradox.eq(0)
- with m.If(req_op == Op.OP_STCX_FAIL):
- sync += r1.stcx_fail.eq(1)
- with m.Else():
- sync += r1.stcx_fail.eq(0)
+ sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
# Record TLB hit information for updating TLB PLRU
sync += r1.tlb_hit.eq(tlb_hit)
- sync += r1.tlb_hit_way.eq(tlb_hit_way)
sync += r1.tlb_hit_index.eq(tlb_req_index)
# Memory accesses are handled by this state machine:
# All wishbone requests generation is done here.
# This machine operates at stage 1.
def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
- cache_valids, r0, replace_way,
+ r0, replace_way,
req_hit_way, req_same_tag,
- r0_valid, req_op, cache_tags, req_go, ra):
+ r0_valid, req_op, cache_valids, req_go, ra):
comb = m.d.comb
sync = m.d.sync
- wb_in = self.wb_in
+ bus = self.bus
d_in = self.d_in
- req = MemAccessRequest("mreq_ds")
+ m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+ granularity=self.TAG_WIDTH)
- req_row = Signal(ROW_BITS)
- req_idx = Signal(INDEX_BITS)
- req_tag = Signal(TAG_BITS)
- comb += req_idx.eq(get_index(req.real_addr))
- comb += req_row.eq(get_row(req.real_addr))
- comb += req_tag.eq(get_tag(req.real_addr))
+ req = MemAccessRequest(self, "mreq_ds")
+
+ r1_next_cycle = Signal()
+ req_row = Signal(self.ROW_BITS)
+ req_idx = Signal(self.INDEX_BITS)
+ req_tag = Signal(self.TAG_BITS)
+ comb += req_idx.eq(self.get_index(req.real_addr))
+ comb += req_row.eq(self.get_row(req.real_addr))
+ comb += req_tag.eq(self.get_tag(req.real_addr))
sync += r1.use_forward1.eq(use_forward1_next)
sync += r1.forward_sel.eq(0)
sync += r1.forward_data1.eq(r1.req.data)
sync += r1.forward_sel1.eq(r1.req.byte_sel)
sync += r1.forward_way1.eq(r1.req.hit_way)
- sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
+ sync += r1.forward_row1.eq(self.get_row(r1.req.real_addr))
sync += r1.forward_valid1.eq(1)
with m.Else():
with m.If(r1.dcbz):
sync += r1.forward_data1.eq(0)
with m.Else():
- sync += r1.forward_data1.eq(wb_in.dat)
+ sync += r1.forward_data1.eq(bus.dat_r)
sync += r1.forward_sel1.eq(~0) # all 1s
sync += r1.forward_way1.eq(replace_way)
sync += r1.forward_row1.eq(r1.store_row)
sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
- with m.If(~r0.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(r0.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
with m.If(r1.write_tag):
# Store new tag in selected way
- for i in range(NUM_WAYS):
- with m.If(i == replace_way):
- ct = Signal(TAG_RAM_WIDTH)
- comb += ct.eq(cache_tags[r1.store_index])
- """
-TODO: check this
-cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
- (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
- """
- comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
- sync += cache_tags[r1.store_index].eq(ct)
+ replace_way_onehot = Signal(self.NUM_WAYS)
+ comb += replace_way_onehot.eq(1<<replace_way)
+ ct = Signal(self.TAG_RAM_WIDTH)
+ comb += ct.eq(r1.reload_tag << (replace_way*self.TAG_WIDTH))
+ comb += wr_tag.en.eq(replace_way_onehot)
+ comb += wr_tag.addr.eq(r1.store_index)
+ comb += wr_tag.data.eq(ct)
+
sync += r1.store_way.eq(replace_way)
sync += r1.write_tag.eq(0)
| (req_op == Op.OP_STORE_HIT)):
sync += r1.req.eq(req)
sync += r1.full.eq(1)
+ # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
+ # destroy r1.req by overwriting r1.full back to zero
+ comb += r1_next_cycle.eq(1)
# Main state machine
with m.Switch(r1.state):
with m.Case(State.IDLE):
- sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
+ sync += r1.wb.adr.eq(req.real_addr[self.ROW_OFF_BITS:])
sync += r1.wb.sel.eq(req.byte_sel)
sync += r1.wb.dat.eq(req.data)
sync += r1.dcbz.eq(req.dcbz)
# for subsequent stores.
sync += r1.store_index.eq(req_idx)
sync += r1.store_row.eq(req_row)
- sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
+ sync += r1.end_row_ix.eq(self.get_row_of_line(req_row)-1)
sync += r1.reload_tag.eq(req_tag)
sync += r1.req.same_tag.eq(1)
with m.If(req.op == Op.OP_STORE_HIT):
sync += r1.store_way.eq(req.hit_way)
+ #with m.If(r1.dec_acks):
+ # sync += r1.acks_pending.eq(r1.acks_pending - 1)
+
# Reset per-row valid bits,
# ready for handling OP_LOAD_MISS
- for i in range(ROW_PER_LINE):
+ for i in range(self.ROW_PER_LINE):
sync += r1.rows_valid[i].eq(0)
with m.If(req_op != Op.OP_NONE):
sync += r1.state.eq(State.STORE_WAIT_ACK)
sync += r1.acks_pending.eq(1)
sync += r1.full.eq(0)
+ comb += r1_next_cycle.eq(0)
sync += r1.slow_valid.eq(1)
- with m.If(~req.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(req.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
with m.If(req.op == Op.OP_STORE_HIT):
sync += r1.write_bram.eq(1)
pass
with m.Case(State.RELOAD_WAIT_ACK):
- ld_stbs_done = Signal()
- # Requests are all sent if stb is 0
- comb += ld_stbs_done.eq(~r1.wb.stb)
# If we are still sending requests, was one accepted?
- with m.If((~wb_in.stall) & r1.wb.stb):
- # That was the last word? We are done sending.
- # Clear stb and set ld_stbs_done so we can handle an
- # eventual last ack on the same cycle.
+ with m.If((~bus.stall) & r1.wb.stb):
+ # That was the last word? We are done sending. Clear stb
# sigh - reconstruct wb adr with 3 extra 0s at front
- wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
- with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
+ wb_adr = Cat(Const(0, self.ROW_OFF_BITS), r1.wb.adr)
+ with m.If(self.is_last_row_addr(wb_adr, r1.end_row_ix)):
sync += r1.wb.stb.eq(0)
- comb += ld_stbs_done.eq(1)
# Calculate the next row address in the current cache line
- row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
+ rlen = self.LINE_OFF_BITS-self.ROW_OFF_BITS
+ row = Signal(rlen)
comb += row.eq(r1.wb.adr)
- sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
+ sync += r1.wb.adr[:rlen].eq(row+1)
# Incoming acks processing
- sync += r1.forward_valid1.eq(wb_in.ack)
- with m.If(wb_in.ack):
- srow = Signal(ROW_LINE_BITS)
+ sync += r1.forward_valid1.eq(bus.ack)
+ with m.If(bus.ack):
+ srow = Signal(self.ROW_LINE_BITS)
comb += srow.eq(r1.store_row)
sync += r1.rows_valid[srow].eq(1)
# Compare the whole address in case the
# request in r1.req is not the one that
# started this refill.
- with m.If(req.valid & r1.req.same_tag &
- ((r1.dcbz & r1.req.dcbz) |
- (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
- (r1.store_row == get_row(req.real_addr))):
- sync += r1.full.eq(0)
+ rowmatch = Signal()
+ lastrow = Signal()
+ comb += rowmatch.eq(r1.store_row ==
+ self.get_row(r1.req.real_addr))
+ comb += lastrow.eq(self.is_last_row(r1.store_row,
+ r1.end_row_ix))
+ with m.If(r1.full & r1.req.same_tag &
+ ((r1.dcbz & req.dcbz) |
+ (r1.req.op == Op.OP_LOAD_MISS)) & rowmatch):
+ sync += r1.full.eq(r1_next_cycle)
sync += r1.slow_valid.eq(1)
- with m.If(~r1.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(r1.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
sync += r1.forward_sel.eq(~0) # all 1s
sync += r1.use_forward1.eq(1)
# Check for completion
- with m.If(ld_stbs_done & is_last_row(r1.store_row,
- r1.end_row_ix)):
+ with m.If(lastrow):
# Complete wishbone cycle
sync += r1.wb.cyc.eq(0)
# Cache line is now valid
- cv = Signal(INDEX_BITS)
+ cv = Signal(self.INDEX_BITS)
comb += cv.eq(cache_valids[r1.store_index])
comb += cv.bit_select(r1.store_way, 1).eq(1)
sync += cache_valids[r1.store_index].eq(cv)
cv, r1.store_index, r1.store_way)
# Increment store row counter
- sync += r1.store_row.eq(next_row(r1.store_row))
+ sync += r1.store_row.eq(self.next_row(r1.store_row))
with m.Case(State.STORE_WAIT_ACK):
st_stbs_done = Signal()
- acks = Signal(3)
adjust_acks = Signal(3)
comb += st_stbs_done.eq(~r1.wb.stb)
- comb += acks.eq(r1.acks_pending)
with m.If(r1.inc_acks != r1.dec_acks):
with m.If(r1.inc_acks):
- comb += adjust_acks.eq(acks + 1)
+ comb += adjust_acks.eq(r1.acks_pending + 1)
with m.Else():
- comb += adjust_acks.eq(acks - 1)
+ comb += adjust_acks.eq(r1.acks_pending - 1)
with m.Else():
- comb += adjust_acks.eq(acks)
+ comb += adjust_acks.eq(r1.acks_pending)
sync += r1.acks_pending.eq(adjust_acks)
# Clear stb when slave accepted request
- with m.If(~wb_in.stall):
+ with m.If(~bus.stall):
# See if there is another store waiting
# to be done which is in the same real page.
+ # (this is when same_tsg is true)
with m.If(req.valid):
- _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
- sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
+ _ra = req.real_addr[self.ROW_OFF_BITS:
+ self.SET_SIZE_BITS]
+ alen = self.SET_SIZE_BITS-self.ROW_OFF_BITS
+ sync += r1.wb.adr[0:alen].eq(_ra)
sync += r1.wb.dat.eq(req.data)
sync += r1.wb.sel.eq(req.byte_sel)
with m.If((adjust_acks < 7) & req.same_tag &
- ((req.op == Op.OP_STORE_MISS)
- | (req.op == Op.OP_STORE_HIT))):
+ ((req.op == Op.OP_STORE_MISS) |
+ (req.op == Op.OP_STORE_HIT))):
sync += r1.wb.stb.eq(1)
comb += st_stbs_done.eq(0)
+ sync += r1.store_way.eq(req.hit_way)
+ sync += r1.store_row.eq(self.get_row(req.real_addr))
with m.If(req.op == Op.OP_STORE_HIT):
sync += r1.write_bram.eq(1)
- sync += r1.full.eq(0)
+ sync += r1.full.eq(r1_next_cycle)
sync += r1.slow_valid.eq(1)
# Store requests never come from the MMU
comb += st_stbs_done.eq(1)
# Got ack ? See if complete.
- with m.If(wb_in.ack):
+ sync += Display("got ack %d %d stbs %d adjust_acks %d",
+ bus.ack, bus.ack, st_stbs_done, adjust_acks)
+ with m.If(bus.ack):
with m.If(st_stbs_done & (adjust_acks == 1)):
sync += r1.state.eq(State.IDLE)
sync += r1.wb.cyc.eq(0)
with m.Case(State.NC_LOAD_WAIT_ACK):
# Clear stb when slave accepted request
- with m.If(~wb_in.stall):
+ with m.If(~bus.stall):
sync += r1.wb.stb.eq(0)
# Got ack ? complete.
- with m.If(wb_in.ack):
+ with m.If(bus.ack):
sync += r1.state.eq(State.IDLE)
- sync += r1.full.eq(0)
+ sync += r1.full.eq(r1_next_cycle)
sync += r1.slow_valid.eq(1)
- with m.If(~r1.mmu_req):
- sync += r1.ls_valid.eq(1)
- with m.Else():
+ with m.If(r1.mmu_req):
sync += r1.mmu_done.eq(1)
+ with m.Else():
+ sync += r1.ls_valid.eq(1)
sync += r1.forward_sel.eq(~0) # all 1s
sync += r1.use_forward1.eq(1)
sync += r1.wb.cyc.eq(0)
sync += r1.wb.stb.eq(0)
- def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
+ def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
sync = m.d.sync
- d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
+ d_out, bus, log_out = self.d_out, self.bus, self.log_out
- sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
+ sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
stall_out, req_op[:3], d_out.valid, d_out.error,
- r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
+ r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
r1.real_adr[3:6]))
def elaborate(self, platform):
m = Module()
- comb = m.d.comb
- d_in = self.d_in
+ comb, sync = m.d.comb, m.d.sync
+ m_in, d_in = self.m_in, self.d_in
# Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
- cache_tags = CacheTagArray()
- cache_tag_set = Signal(TAG_RAM_WIDTH)
- cache_valids = CacheValidBitsArray()
+ cache_valids = self.CacheValidsArray()
+ cache_tag_set = Signal(self.TAG_RAM_WIDTH)
- # TODO attribute ram_style : string;
- # TODO attribute ram_style of cache_tags : signal is "distributed";
+ self.tagmem = Memory(depth=self.NUM_LINES, width=self.TAG_RAM_WIDTH,
+ attrs={'syn_ramstyle': "block_ram"})
"""note: these are passed to nmigen.hdl.Memory as "attributes".
don't know how, just that they are.
"""
- dtlb_valid_bits = TLBValidBitsArray()
- dtlb_tags = TLBTagsArray()
- dtlb_ptes = TLBPtesArray()
# TODO attribute ram_style of
# dtlb_tags : signal is "distributed";
# TODO attribute ram_style of
r0 = RegStage0("r0")
r0_full = Signal()
- r1 = RegStage1("r1")
+ r1 = RegStage1(self, "r1")
- reservation = Reservation()
+ reservation = Reservation(self, "rsrv")
# Async signals on incoming request
- req_index = Signal(INDEX_BITS)
- req_row = Signal(ROW_BITS)
- req_hit_way = Signal(WAY_BITS)
- req_tag = Signal(TAG_BITS)
+ req_index = Signal(self.INDEX_BITS)
+ req_row = Signal(self.ROW_BITS)
+ req_hit_way = Signal(self.WAY_BITS)
+ req_tag = Signal(self.TAG_BITS)
req_op = Signal(Op)
req_data = Signal(64)
req_same_tag = Signal()
req_go = Signal()
- early_req_row = Signal(ROW_BITS)
+ early_req_row = Signal(self.ROW_BITS)
cancel_store = Signal()
set_rsrv = Signal()
cache_out_row = Signal(WB_DATA_BITS)
- plru_victim = PLRUOut()
- replace_way = Signal(WAY_BITS)
+ plru_victim = Signal(self.WAY_BITS)
+ replace_way = Signal(self.WAY_BITS)
# Wishbone read/write/cache write formatting signals
bus_sel = Signal(8)
# TLB signals
- tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
- tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
- tlb_valid_way = Signal(TLB_NUM_WAYS)
- tlb_req_index = Signal(TLB_SET_BITS)
- tlb_hit = Signal()
- tlb_hit_way = Signal(TLB_WAY_BITS)
- pte = Signal(TLB_PTE_BITS)
- ra = Signal(REAL_ADDR_BITS)
+ tlb_way = self.TLBRecord("tlb_way")
+ tlb_req_index = Signal(self.TLB_SET_BITS)
+ tlb_hit = self.TLBHit("tlb_hit")
+ pte = Signal(self.TLB_PTE_BITS)
+ ra = Signal(self.REAL_ADDR_BITS)
valid_ra = Signal()
perm_attr = PermAttr("dc_perms")
rc_ok = Signal()
perm_ok = Signal()
access_ok = Signal()
- tlb_plru_victim = TLBPLRUOut()
+ tlb_plru_victim = Signal(self.TLB_WAY_BITS)
# we don't yet handle collisions between loadstore1 requests
# and MMU requests
comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
comb += self.stall_out.eq(r0_stall)
-
- # Wire up wishbone request latch out of stage 1
- comb += self.wb_out.eq(r1.wb)
+ # debugging: detect if any stall ever requested, which is fine,
+ # but if a request comes in when stall requested, that's bad.
+ with m.If(r0_stall):
+ sync += self.any_stall_out.eq(1)
+ with m.If(d_in.valid):
+ sync += self.dreq_when_stall.eq(1)
+ with m.If(m_in.valid):
+ sync += self.mreq_when_stall.eq(1)
# deal with litex not doing wishbone pipeline mode
# XXX in wrong way. FIFOs are needed in the SRAM test
- # so that stb/ack match up
- comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
+ # so that stb/ack match up. same thing done in icache.py
+ if not self.microwatt_compat or self.fabric_compat:
+ comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
+ # Wire up wishbone request latch out of stage 1
+ comb += self.bus.we.eq(r1.wb.we)
+ comb += self.bus.adr.eq(r1.wb.adr)
+ comb += self.bus.sel.eq(r1.wb.sel)
+ comb += self.bus.stb.eq(r1.wb.stb)
+ comb += self.bus.dat_w.eq(r1.wb.dat)
+ comb += self.bus.cyc.eq(r1.wb.cyc)
+
+ # create submodule TLBUpdate
+ m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate(self)
# call sub-functions putting everything together, using shared
# signals established above
self.stage_0(m, r0, r1, r0_full)
- self.tlb_read(m, r0_stall, tlb_valid_way,
- tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
- dtlb_tags, dtlb_ptes)
+ self.tlb_read(m, r0_stall, tlb_way)
self.tlb_search(m, tlb_req_index, r0, r0_valid,
- tlb_valid_way, tlb_tag_way, tlb_hit_way,
- tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
- self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
- tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
- dtlb_tags, tlb_pte_way, dtlb_ptes)
+ tlb_way,
+ pte, tlb_hit, valid_ra, perm_attr, ra)
+ self.tlb_update(m, r0_valid, r0, tlb_req_index,
+ tlb_hit, tlb_plru_victim)
self.maybe_plrus(m, r1, plru_victim)
- self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
- self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
+ self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
+ self.cache_tag_read(m, r0_stall, req_index, cache_tag_set)
self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
r0_valid, r1, cache_valids, replace_way,
use_forward1_next, use_forward2_next,
req_hit_way, plru_victim, rc_ok, perm_attr,
valid_ra, perm_ok, access_ok, req_op, req_go,
- tlb_pte_way,
- tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+ tlb_hit, tlb_way, cache_tag_set,
cancel_store, req_same_tag, r0_stall, early_req_row)
self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
r0_valid, r0, reservation)
self.rams(m, r1, early_req_row, cache_out_row, replace_way)
self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
req_hit_way, req_index, req_tag, access_ok,
- tlb_hit, tlb_hit_way, tlb_req_index)
+ tlb_hit, tlb_req_index)
self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
- cache_valids, r0, replace_way,
+ r0, replace_way,
req_hit_way, req_same_tag,
- r0_valid, req_op, cache_tags, req_go, ra)
- #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
+ r0_valid, req_op, cache_valids, req_go, ra)
+ #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
return m
--- /dev/null
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet under EU Grant and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Formal proof of soc.experiment.compalu_multi.MultiCompUnit
+
+In short, MultiCompUnit:
+
+1) stores an opcode from Issue, when not "busy", and "issue" is pulsed
+2) signals "busy" high
+3) fetches its operand(s), if any (which are not masked or zero) from the
+Scoreboard (REL/GO protocol)
+4) starts the ALU (ready/valid protocol), as soon as all inputs are available
+5) captures result from ALU (again ready/valid)
+5) sends the result(s) back to the Scoreboard (again REL/GO)
+6) drops "busy"
+
+Note that, if the conditions are right, many of the above can occur together,
+on a single cycle.
+
+The formal proof involves ensuring that:
+1) the ALU gets the right opcode from Issue
+2) the ALU gets the right operands from the Scoreboard
+3) the Scoreboard receives the right result from the ALU
+4) no transactions are dropped or repeated
+
+This can be checked using holding registers and transaction counters.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=879 and
+https://bugs.libre-soc.org/show_bug.cgi?id=197
+"""
+
+import unittest
+
+from nmigen import Signal, Module
+from nmigen.hdl.ast import Cover, Const, Assume, Assert
+from nmutil.formaltest import FHDLTestCase
+from nmutil.singlepipe import ControlBase
+
+from soc.experiment.compalu_multi import MultiCompUnit
+from soc.fu.alu.alu_input_record import CompALUOpSubset
+
+
+# Formal model of a simple ALU, whose inputs and outputs are randomly
+# generated by the formal engine
+
+class ALUCtx:
+ def __init__(self):
+ self.op = CompALUOpSubset(name="op")
+
+
+class ALUInput:
+ def __init__(self):
+ self.a = Signal(16)
+ self.b = Signal(16)
+ self.ctx = ALUCtx()
+
+ def eq(self, i):
+ return [self.a.eq(i.a), self.b.eq(i.b)]
+
+
+class ALUOutput:
+ def __init__(self):
+ self.o1 = Signal(16)
+ self.o2 = Signal(16)
+
+ def eq(self, i):
+ return [self.o1.eq(i.o1), self.o2.eq(i.o2)]
+
+
+class ALU(ControlBase):
+ def __init__(self):
+ super().__init__(stage=self)
+ self.p.i_data, self.n.o_data = self.new_specs(None)
+ self.i, self.o = self.p.i_data, self.n.o_data
+
+ def setup(self, m, i):
+ pass
+
+ def ispec(self, name=None):
+ return ALUInput()
+
+ def ospec(self, name=None):
+ return ALUOutput()
+
+ def elaborate(self, platform):
+ m = super().elaborate(platform)
+ return m
+
+
+class CompALUMultiTestCase(FHDLTestCase):
+ def test_formal(self):
+ inspec = [('INT', 'a', '0:15'),
+ ('INT', 'b', '0:15')]
+ outspec = [('INT', 'o1', '0:15'),
+ ('INT', 'o2', '0:15')]
+ regspec = (inspec, outspec)
+ m = Module()
+ # Instantiate "random" ALU
+ alu = ALU()
+ m.submodules.dut = dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
+ # TODO Test shadow / die
+ m.d.comb += [dut.shadown_i.eq(1), dut.go_die_i.eq(0)]
+ # Don't issue while busy
+ issue = Signal()
+ m.d.comb += dut.issue_i.eq(issue & ~dut.busy_o)
+ # Avoid toggling go_i when rel_o is low (rel / go protocol)
+ rd_go = Signal(dut.n_src)
+ m.d.comb += dut.cu.rd.go_i.eq(rd_go & dut.cu.rd.rel_o)
+ wr_go = Signal(dut.n_dst)
+ m.d.comb += dut.cu.wr.go_i.eq(wr_go & dut.cu.wr.rel_o)
+ # Transaction counters
+ do_issue = Signal()
+ m.d.comb += do_issue.eq(dut.issue_i & ~dut.busy_o)
+ cnt_issue = Signal(4)
+ m.d.sync += cnt_issue.eq(cnt_issue + do_issue)
+ do_read = Signal(dut.n_src)
+ m.d.comb += do_read.eq(dut.cu.rd.rel_o & dut.cu.rd.go_i)
+ cnt_read = []
+ for i in range(dut.n_src):
+ cnt = Signal(4, name="cnt_read_%d" % i)
+ m.d.sync += cnt.eq(cnt + do_read[i])
+ cnt_read.append(cnt)
+ do_write = Signal(dut.n_dst)
+ m.d.comb += do_write.eq(dut.cu.wr.rel_o & dut.cu.wr.go_i)
+ cnt_write = []
+ for i in range(dut.n_dst):
+ cnt = Signal(4, name="cnt_write_%d" % i)
+ m.d.sync += cnt.eq(cnt + do_write[i])
+ cnt_write.append(cnt)
+ do_alu_write = Signal()
+ m.d.comb += do_alu_write.eq(alu.p.i_valid & alu.p.o_ready)
+ cnt_alu_write = Signal(4)
+ m.d.sync += cnt_alu_write.eq(cnt_alu_write + do_alu_write)
+ do_alu_read = Signal()
+ m.d.comb += do_alu_read.eq(alu.n.o_valid & alu.n.i_ready)
+ cnt_alu_read = Signal(4)
+ m.d.sync += cnt_alu_read.eq(cnt_alu_read + do_alu_read)
+ cnt_masked_read = []
+ do_masked_read = Signal(dut.n_src)
+ for i in range(dut.n_src):
+ cnt = Signal(4, name="cnt_masked_read_%d" % i)
+ if i == 0:
+ extra = dut.oper_i.zero_a
+ elif i == 1:
+ extra = dut.oper_i.imm_data.ok
+ else:
+ extra = Const(0, 1)
+ m.d.comb += do_masked_read[i].eq(do_issue &
+ (dut.rdmaskn[i] | extra))
+ m.d.sync += cnt.eq(cnt + do_masked_read[i])
+ cnt_masked_read.append(cnt)
+ # If the ALU is idle, do not assert valid
+ with m.If((cnt_alu_read == cnt_alu_write) & ~do_alu_write):
+ m.d.comb += Assume(~alu.n.o_valid)
+ # Keep ALU valid high, until read
+ last_alu_valid = Signal()
+ m.d.sync += last_alu_valid.eq(alu.n.o_valid & ~alu.n.i_ready)
+ with m.If(last_alu_valid):
+ m.d.comb += Assume(alu.n.o_valid)
+
+ # Invariant checks
+
+ # For every instruction issued, at any point in time,
+ # each operand was either:
+ # 1) Already read
+ # 2) Not read yet, but the read is pending (rel_o high)
+ # 3) Masked
+ for i in range(dut.n_src):
+ sum_read = Signal(4)
+ m.d.comb += sum_read.eq(
+ cnt_read[i] + cnt_masked_read[i] + dut.cu.rd.rel_o[i])
+ m.d.comb += Assert(sum_read == cnt_issue)
+
+ # For every instruction, either:
+ # 1) The ALU is executing the instruction
+ # 2) Otherwise, execution is pending (alu.p.i_valid is high)
+ # 3) Otherwise, it is waiting for operands
+ # (some dut.cu.rd.rel_o are still high)
+ # 4) ... unless all operands are masked, in which case there is a one
+ # cycle delay
+ all_masked = Signal()
+ m.d.sync += all_masked.eq(do_masked_read.all())
+ sum_alu_write = Signal(4)
+ m.d.comb += sum_alu_write.eq(
+ cnt_alu_write +
+ (dut.cu.rd.rel_o.any() | all_masked | alu.p.i_valid))
+ m.d.comb += Assert(sum_alu_write == cnt_issue)
+
+ # Ask the formal engine to give an example
+ m.d.comb += Cover((cnt_issue == 2)
+ & (cnt_read[0] == 1)
+ & (cnt_read[1] == 0)
+ & (cnt_write[0] == 1)
+ & (cnt_write[1] == 1)
+ & (cnt_alu_write == 1)
+ & (cnt_alu_read == 1)
+ & (cnt_masked_read[0] == 1)
+ & (cnt_masked_read[1] == 1))
+ with self.subTest("cover"):
+ self.assertFormal(m, mode="cover", depth=10)
+
+ # Check assertions
+ with self.subTest("bmc"):
+ self.assertFormal(m, mode="bmc", depth=10)
+
+
+if __name__ == "__main__":
+ unittest.main()
write TAG_BITS width which may not match full ram blocks and might
cause muxes to be inferred for "partial writes".
* Check if making the read size of PLRU a ROM helps utilization
+
+Links:
+
+* https://bugs.libre-soc.org/show_bug.cgi?id=485
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+ (discussion about brams for ECP5)
+
"""
from enum import (Enum, unique)
-from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
+from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
+ Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
+from nmigen.lib.coding import Decoder
from nmutil.util import Display
+from nmutil.latch import SRLatch
#from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
from soc.experiment.mem_types import (Fetch1ToICacheType,
ICacheToDecode1Type,
from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
WB_SEL_BITS, WBAddrType, WBDataType,
WBSelType, WBMasterOut, WBSlaveOut,
- WBMasterOutVector, WBSlaveOutVector,
- WBIOMasterOut, WBIOSlaveOut)
+ )
+
+from nmigen_soc.wishbone.bus import Interface
+from soc.minerva.units.fetch import FetchUnitInterface
+
# for test
from soc.bus.sram import SRAM
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle
+# from microwatt/utils.vhdl
+def ispow2(n):
+ return n != 0 and (n & (n - 1)) == 0
SIM = 0
-LINE_SIZE = 64
-# BRAM organisation: We never access more than wishbone_data_bits
-# at a time so to save resources we make the array only that wide,
-# and use consecutive indices for to make a cache "line"
-#
-# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
-ROW_SIZE = WB_DATA_BITS // 8
-# Number of lines in a set
-NUM_LINES = 16
-# Number of ways
-NUM_WAYS = 4
-# L1 ITLB number of entries (direct mapped)
-TLB_SIZE = 64
-# L1 ITLB log_2(page_size)
-TLB_LG_PGSZ = 12
-# Number of real address bits that we store
-REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0
-ROW_SIZE_BITS = ROW_SIZE * 8
-# ROW_PER_LINE is the number of row (wishbone) transactions in a line
-ROW_PER_LINE = LINE_SIZE // ROW_SIZE
-# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
-BRAM_ROWS = NUM_LINES * ROW_PER_LINE
-# INSN_PER_ROW is the number of 32bit instructions per BRAM row
-INSN_PER_ROW = ROW_SIZE_BITS // 32
-
-# Bit fields counts in the address
-#
-# INSN_BITS is the number of bits to select an instruction in a row
-INSN_BITS = log2_int(INSN_PER_ROW)
-# ROW_BITS is the number of bits to select a row
-ROW_BITS = log2_int(BRAM_ROWS)
-# ROW_LINE_BITS is the number of bits to select a row within a line
-ROW_LINE_BITS = log2_int(ROW_PER_LINE)
-# LINE_OFF_BITS is the number of bits for the offset in a cache line
-LINE_OFF_BITS = log2_int(LINE_SIZE)
-# ROW_OFF_BITS is the number of bits for the offset in a row
-ROW_OFF_BITS = log2_int(ROW_SIZE)
-# INDEX_BITS is the number of bits to select a cache line
-INDEX_BITS = log2_int(NUM_LINES)
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
-# TAG_BITS is the number of bits of the tag part of the address
-TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS = log2_int(NUM_WAYS)
-TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
-
-# L1 ITLB
-TLB_BITS = log2_int(TLB_SIZE)
-TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
-TLB_PTE_BITS = 64
-
-print("BRAM_ROWS =", BRAM_ROWS)
-print("INDEX_BITS =", INDEX_BITS)
-print("INSN_BITS =", INSN_BITS)
-print("INSN_PER_ROW =", INSN_PER_ROW)
-print("LINE_SIZE =", LINE_SIZE)
-print("LINE_OFF_BITS =", LINE_OFF_BITS)
-print("LOG_LENGTH =", LOG_LENGTH)
-print("NUM_LINES =", NUM_LINES)
-print("NUM_WAYS =", NUM_WAYS)
-print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
-print("ROW_BITS =", ROW_BITS)
-print("ROW_OFF_BITS =", ROW_OFF_BITS)
-print("ROW_LINE_BITS =", ROW_LINE_BITS)
-print("ROW_PER_LINE =", ROW_PER_LINE)
-print("ROW_SIZE =", ROW_SIZE)
-print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
-print("SET_SIZE_BITS =", SET_SIZE_BITS)
-print("SIM =", SIM)
-print("TAG_BITS =", TAG_BITS)
-print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
-print("TAG_BITS =", TAG_BITS)
-print("TLB_BITS =", TLB_BITS)
-print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
-print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
-print("TLB_PTE_BITS =", TLB_PTE_BITS)
-print("TLB_SIZE =", TLB_SIZE)
-print("WAY_BITS =", WAY_BITS)
-
-# from microwatt/utils.vhdl
-def ispow2(n):
- return n != 0 and (n & (n - 1)) == 0
-
-assert LINE_SIZE % ROW_SIZE == 0
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
-assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
- "geometry bits don't add up"
-assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
- "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
- "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
- "geometry bits don't add up"
-
-# Example of layout for 32 lines of 64 bytes:
-#
-# .. tag |index| line |
-# .. | row | |
-# .. | | | |00| zero (2)
-# .. | | |-| | INSN_BITS (1)
-# .. | |---| | ROW_LINE_BITS (3)
-# .. | |--- - --| LINE_OFF_BITS (6)
-# .. | |- --| ROW_OFF_BITS (3)
-# .. |----- ---| | ROW_BITS (8)
-# .. |-----| | INDEX_BITS (5)
-# .. --------| | TAG_BITS (53)
-
-# The cache data BRAM organized as described above for each way
-#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
-#
-# The cache tags LUTRAM has a row per set. Vivado is a pain and will
-# not handle a clean (commented) definition of the cache tags as a 3d
-# memory. For now, work around it by putting all the tags
-def CacheTagArray():
- return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
- for x in range(NUM_LINES))
-
-# The cache valid bits
-def CacheValidBitsArray():
- return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
- for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
- return Array(Signal(name="rows_valid_%d" %x) \
- for x in range(ROW_PER_LINE))
-
-
-# TODO to be passed to nigmen as ram attributes
-# attribute ram_style : string;
-# attribute ram_style of cache_tags : signal is "distributed";
-
-
-def TLBValidBitsArray():
- return Array(Signal(name="tlbvalid_%d" %x) \
- for x in range(TLB_SIZE))
-
-def TLBTagArray():
- return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
- for x in range(TLB_SIZE))
-
-def TLBPtesArray():
- return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
- for x in range(TLB_SIZE))
-
-# Cache RAM interface
-def CacheRamOut():
- return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
- for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
- return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
- for x in range(NUM_LINES))
-
-# Return the cache line index (tag index) for an address
-def get_index(addr):
- return addr[LINE_OFF_BITS:SET_SIZE_BITS]
-
-# Return the cache row index (data memory) for an address
-def get_row(addr):
- return addr[ROW_OFF_BITS:SET_SIZE_BITS]
-
-# Return the index of a row within a line
-def get_row_of_line(row):
- return row[:ROW_LINE_BITS]
-
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
- return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
-
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
- return get_row_of_line(row) == last
-
-# Return the next row in the current cache line. We use a dedicated
-# function in order to limit the size of the generated adder to be
-# only the bits within a cache line (3 bits with default settings)
-def next_row(row):
- row_v = row[0:ROW_LINE_BITS] + 1
- return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
-
-# Read the instruction word for the given address
-# in the current cache row
-def read_insn_word(addr, data):
- word = addr[2:INSN_BITS+2]
- return data.word_select(word, 32)
-
-# Get the tag value from the address
-def get_tag(addr):
- return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
-
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
- return tagset.word_select(way, TAG_BITS)
-
-# Write a tag to tag memory row
-def write_tag(way, tagset, tag):
- return read_tag(way, tagset).eq(tag)
-
-# Simple hash for direct-mapped TLB index
-def hash_ea(addr):
- hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
- TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
- ] ^ addr[
- TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
- ]
- return hsh
+class ICacheConfig:
+ def __init__(self, XLEN = 64,
+ LINE_SIZE = 64,
+ NUM_LINES = 64, # Number of lines in a set
+ NUM_WAYS = 2, # Number of ways
+ TLB_SIZE = 64, # L1 ITLB number of entries
+ TLB_LG_PGSZ = 12): # L1 ITLB log_2(page_size)
+ self.XLEN = XLEN
+ self.LINE_SIZE = LINE_SIZE
+ self.NUM_LINES = NUM_LINES
+ self.NUM_WAYS = NUM_WAYS
+ self.TLB_SIZE = TLB_SIZE
+ self.TLB_LG_PGSZ = TLB_LG_PGSZ
+
+ # BRAM organisation: We never access more than wishbone_data_bits
+ # at a time so to save resources we make the array only that wide,
+ # and use consecutive indices for to make a cache "line"
+ #
+ # self.ROW_SIZE is the width in bytes of the BRAM
+ # (based on WB, so 64-bits)
+ self.ROW_SIZE = WB_DATA_BITS // 8
+ # Number of real address bits that we store
+ self.REAL_ADDR_BITS = XLEN-8 # 56 for XLEN=64
+
+ self.ROW_SIZE_BITS = self.ROW_SIZE * 8
+ # ROW_PER_LINE is the number of row (wishbone) transactions in a line
+ self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
+ # BRAM_ROWS is the number of rows in BRAM
+ # needed to represent the full icache
+ self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
+ # INSN_PER_ROW is the number of 32bit instructions per BRAM row
+ self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32
+
+ # Bit fields counts in the address
+ #
+ # INSN_BITS is the number of bits to select an instruction in a row
+ self.INSN_BITS = log2_int(self.INSN_PER_ROW)
+ # ROW_BITS is the number of bits to select a row
+ self.ROW_BITS = log2_int(self.BRAM_ROWS)
+ # ROW_LINE_BITS is the number of bits to select a row within a line
+ self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
+ # LINE_OFF_BITS is the number of bits for the offset in a cache line
+ self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
+ # ROW_OFF_BITS is the number of bits for the offset in a row
+ self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
+ # INDEX_BITS is the number of bits to select a cache line
+ self.INDEX_BITS = log2_int(self.NUM_LINES)
+ # SET_SIZE_BITS is the log base 2 of the set size
+ self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
+ # TAG_BITS is the number of bits of the tag part of the address
+ self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+ # TAG_WIDTH is the width in bits of each way of the tag RAM
+ self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+ # WAY_BITS is the number of bits to select a way
+ self.WAY_BITS = log2_int(self.NUM_WAYS)
+ self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS
+
+ # L1 ITLB
+ self.TL_BITS = log2_int(self.TLB_SIZE)
+ self.TLB_EA_TAG_BITS = XLEN - (self.TLB_LG_PGSZ + self.TL_BITS)
+ self.TLB_PTE_BITS = XLEN
+
+ print("self.XLEN =", self.XLEN)
+ print("self.BRAM_ROWS =", self.BRAM_ROWS)
+ print("self.INDEX_BITS =", self.INDEX_BITS)
+ print("self.INSN_BITS =", self.INSN_BITS)
+ print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
+ print("self.LINE_SIZE =", self.LINE_SIZE)
+ print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
+ print("LOG_LENGTH =", LOG_LENGTH)
+ print("self.NUM_LINES =", self.NUM_LINES)
+ print("self.NUM_WAYS =", self.NUM_WAYS)
+ print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
+ print("self.ROW_BITS =", self.ROW_BITS)
+ print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
+ print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
+ print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
+ print("self.ROW_SIZE =", self.ROW_SIZE)
+ print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
+ print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
+ print("SIM =", SIM)
+ print("self.TAG_BITS =", self.TAG_BITS)
+ print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
+ print("self.TAG_BITS =", self.TAG_BITS)
+ print("self.TL_BITS =", self.TL_BITS)
+ print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
+ print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
+ print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
+ print("self.TLB_SIZE =", self.TLB_SIZE)
+ print("self.WAY_BITS =", self.WAY_BITS)
+ print()
+
+ assert self.LINE_SIZE % self.ROW_SIZE == 0
+ assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
+ assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
+ assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
+ assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
+ assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
+ "geometry bits don't add up"
+ assert (self.LINE_OFF_BITS ==
+ (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
+ "geometry bits don't add up"
+ assert (self.REAL_ADDR_BITS ==
+ (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
+ "geometry bits don't add up"
+ assert (self.REAL_ADDR_BITS ==
+ (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
+ "geometry bits don't add up"
+
+ # Example of layout for 32 lines of 64 bytes:
+ #
+ # .. tag |index| line |
+ # .. | row | |
+ # .. | | | |00| zero (2)
+ # .. | | |-| | self.INSN_BITS (1)
+ # .. | |---| | self.ROW_LINE_BITS (3)
+ # .. | |--- - --| self.LINE_OFF_BITS (6)
+ # .. | |- --| self.ROW_OFF_BITS (3)
+ # .. |----- ---| | self.ROW_BITS (8)
+ # .. |-----| | self.INDEX_BITS (5)
+ # .. --------| | self.TAG_BITS (53)
+
+ # The cache data BRAM organized as described above for each way
+ #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
+ #
+ def RowPerLineValidArray(self):
+ return Array(Signal(name="rows_valid_%d" %x) \
+ for x in range(self.ROW_PER_LINE))
+
+
+ # TODO to be passed to nigmen as ram attributes
+ # attribute ram_style : string;
+ # attribute ram_style of cache_tags : signal is "distributed";
+
+ def TLBRecord(self, name):
+ tlb_layout = [ ('tag', self.TLB_EA_TAG_BITS),
+ ('pte', self.TLB_PTE_BITS)
+ ]
+ return Record(tlb_layout, name=name)
+
+ def TLBArray(self):
+ return Array(self.TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))
+
+ # PLRU output interface
+ def PLRUOut(self):
+ return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
+ for x in range(self.NUM_LINES))
+
+ # Return the cache line index (tag index) for an address
+ def get_index(self, addr):
+ return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+ # Return the cache row index (data memory) for an address
+ def get_row(self, addr):
+ return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
+
+ # Return the index of a row within a line
+ def get_row_of_line(self, row):
+ return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
+
+ # Returns whether this is the last row of a line
+ def is_last_row_addr(self, addr, last):
+ return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
+
+ # Returns whether this is the last row of a line
+ def is_last_row(self, row, last):
+ return self.get_row_of_line(row) == last
+
+ # Return the next row in the current cache line. We use a dedicated
+ # function in order to limit the size of the generated adder to be
+ # only the bits within a cache line (3 bits with default settings)
+ def next_row(self, row):
+ row_v = row[0:self.ROW_LINE_BITS] + 1
+ return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
+
+ # Read the instruction word for the given address
+ # in the current cache row
+ def read_insn_word(self, addr, data):
+ word = addr[2:self.INSN_BITS+2]
+ return data.word_select(word, 32)
+
+ # Get the tag value from the address
+ def get_tag(self, addr):
+ return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
+
+ # Read a tag from a tag memory row
+ def read_tag(self, way, tagset):
+ return tagset.word_select(way, self.TAG_BITS)
+
+ # Write a tag to tag memory row
+ def write_tag(self, way, tagset, tag):
+ return self.read_tag(way, tagset).eq(tag)
+
+ # Simple hash for direct-mapped TLB index
+ def hash_ea(self, addr):
+ hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
+ addr[self.TLB_LG_PGSZ + self.TL_BITS:
+ self.TLB_LG_PGSZ + 2 * self.TL_BITS ] ^
+ addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
+ self.TLB_LG_PGSZ + 3 * self.TL_BITS])
+ return hsh
# Cache reload state machine
class RegInternal(RecordObject):
- def __init__(self):
+ def __init__(self, cfg):
super().__init__()
# Cache hit state (Latches for 1 cycle BRAM access)
- self.hit_way = Signal(NUM_WAYS)
+ self.hit_way = Signal(cfg.WAY_BITS)
self.hit_nia = Signal(64)
self.hit_smark = Signal()
self.hit_valid = Signal()
self.state = Signal(State, reset=State.IDLE)
self.wb = WBMasterOut("wb")
self.req_adr = Signal(64)
- self.store_way = Signal(NUM_WAYS)
- self.store_index = Signal(NUM_LINES)
- self.store_row = Signal(BRAM_ROWS)
- self.store_tag = Signal(TAG_BITS)
+ self.store_way = Signal(cfg.WAY_BITS)
+ self.store_index = Signal(cfg.INDEX_BITS)
+ self.store_row = Signal(cfg.ROW_BITS)
+ self.store_tag = Signal(cfg.TAG_BITS)
self.store_valid = Signal()
- self.end_row_ix = Signal(ROW_LINE_BITS)
- self.rows_valid = RowPerLineValidArray()
+ self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
+ self.rows_valid = cfg.RowPerLineValidArray()
# TLB miss state
self.fetch_failed = Signal()
-class ICache(Elaboratable):
+class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
"""64 bit direct mapped icache. All instructions are 4B aligned."""
- def __init__(self):
+ def __init__(self, pspec):
+ FetchUnitInterface.__init__(self, pspec)
self.i_in = Fetch1ToICacheType(name="i_in")
self.i_out = ICacheToDecode1Type(name="i_out")
self.flush_in = Signal()
self.inval_in = Signal()
- self.wb_out = WBMasterOut(name="wb_out")
- self.wb_in = WBSlaveOut(name="wb_in")
+ # standard naming (wired to non-standard for compatibility)
+ self.bus = Interface(addr_width=32,
+ data_width=64,
+ granularity=8,
+ features={'stall'},
+ #alignment=0,
+ name="icache_wb")
self.log_out = Signal(54)
+ # use FetchUnitInterface, helps keep some unit tests running
+ self.use_fetch_iface = False
+
+ # test if small cache to be enabled
+ self.small_cache = (hasattr(pspec, "small_cache") and
+ (pspec.small_cache == True))
+ # test if microwatt compatibility to be enabled
+ self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+ (pspec.microwatt_compat == True))
+ # test if fabric compatibility is to be enabled
+ self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+ (pspec.fabric_compat == True))
+
+ XLEN = pspec.XLEN
+ LINE_SIZE = 64
+ TLB_SIZE = 8
+ NUM_LINES = 8
+ NUM_WAYS = 2
+ if self.small_cache:
+ # reduce way sizes and num lines to ridiculously small
+ NUM_LINES = 2
+ NUM_WAYS = 1
+ TLB_SIZE = 2
+ if self.microwatt_compat or self.fabric_compat:
+ # reduce way sizes
+ NUM_WAYS = 1
+
+ ICacheConfig.__init__(self, LINE_SIZE=LINE_SIZE,
+ XLEN=XLEN,
+ NUM_LINES = NUM_LINES,
+ NUM_WAYS = NUM_WAYS,
+ TLB_SIZE=TLB_SIZE
+ )
+
+ def use_fetch_interface(self):
+ self.use_fetch_iface = True
# Generate a cache RAM for each way
def rams(self, m, r, cache_out_row, use_previous,
comb = m.d.comb
sync = m.d.sync
- wb_in, stall_in = self.wb_in, self.stall_in
+ bus, stall_in = self.bus, self.stall_in
+
+ # read condition (for every cache ram)
+ do_read = Signal()
+ comb += do_read.eq(~(stall_in | use_previous))
+
+ rd_addr = Signal(self.ROW_BITS)
+ wr_addr = Signal(self.ROW_BITS)
+ comb += rd_addr.eq(req_row)
+ comb += wr_addr.eq(r.store_row)
- for i in range(NUM_WAYS):
- do_read = Signal(name="do_rd_%d" % i)
+ # binary-to-unary converters: replace-way enabled by bus.ack,
+ # hit-way left permanently enabled
+ m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
+ m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
+ comb += re.i.eq(replace_way)
+ comb += re.n.eq(~bus.ack)
+ comb += he.i.eq(r.hit_way)
+
+ for i in range(self.NUM_WAYS):
do_write = Signal(name="do_wr_%d" % i)
- rd_addr = Signal(ROW_BITS)
- wr_addr = Signal(ROW_BITS)
- d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
- wr_sel = Signal(ROW_SIZE)
+ d_out = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
+ wr_sel = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)
- way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
- setattr(m.submodules, "cacheram_%d" % i, way)
+ way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
+ TRACE=True, ram_num=i)
+ m.submodules["cacheram_%d" % i] = way
comb += way.rd_en.eq(do_read)
comb += way.rd_addr.eq(rd_addr)
comb += d_out.eq(way.rd_data_o)
comb += way.wr_sel.eq(wr_sel)
comb += way.wr_addr.eq(wr_addr)
- comb += way.wr_data.eq(wb_in.dat)
+ comb += way.wr_data.eq(bus.dat_r)
- comb += do_read.eq(~(stall_in | use_previous))
- comb += do_write.eq(wb_in.ack & (replace_way == i))
+ comb += do_write.eq(re.o[i])
with m.If(do_write):
sync += Display("cache write adr: %x data: %lx",
wr_addr, way.wr_data)
- with m.If(r.hit_way == i):
+ with m.If(he.o[i]):
comb += cache_out_row.eq(d_out)
with m.If(do_read):
sync += Display("cache read adr: %x data: %x",
req_row, d_out)
- comb += rd_addr.eq(req_row)
- comb += wr_addr.eq(r.store_row)
- comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
+ comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))
# Generate PLRUs
def maybe_plrus(self, m, r, plru_victim):
comb = m.d.comb
- with m.If(NUM_WAYS > 1):
- for i in range(NUM_LINES):
- plru_acc_i = Signal(WAY_BITS)
- plru_acc_en = Signal()
- plru = PLRU(WAY_BITS)
- setattr(m.submodules, "plru_%d" % i, plru)
-
- comb += plru.acc_i.eq(plru_acc_i)
- comb += plru.acc_en.eq(plru_acc_en)
+ if self.NUM_WAYS == 0:
+ return
- # PLRU interface
- with m.If(get_index(r.hit_nia) == i):
- comb += plru.acc_en.eq(r.hit_valid)
- comb += plru.acc_i.eq(r.hit_way)
- comb += plru_victim[i].eq(plru.lru_o)
+ m.submodules.plrus = plru = PLRUs("itag", self.NUM_LINES,
+ self.WAY_BITS)
+ comb += plru.way.eq(r.hit_way)
+ comb += plru.valid.eq(r.hit_valid)
+ comb += plru.index.eq(self.get_index(r.hit_nia))
+ comb += plru.isel.eq(r.store_index) # select victim
+ comb += plru_victim.eq(plru.o_index) # selected victim
# TLB hit detection and real address generation
- def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
- real_addr, itlb_valid_bits, ra_valid, eaa_priv,
+ def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
+ real_addr, ra_valid, eaa_priv,
priv_fault, access_ok):
comb = m.d.comb
i_in = self.i_in
- pte = Signal(TLB_PTE_BITS)
- ttag = Signal(TLB_EA_TAG_BITS)
+ # use an *asynchronous* Memory read port here (combinatorial)
+ m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
+ tlb = self.TLBRecord("tlb_rdport")
+ pte, ttag = tlb.pte, tlb.tag
- comb += tlb_req_index.eq(hash_ea(i_in.nia))
- comb += pte.eq(itlb_ptes[tlb_req_index])
- comb += ttag.eq(itlb_tags[tlb_req_index])
+ comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
+ comb += rd_tlb.addr.eq(tlb_req_index)
+ comb += tlb.eq(rd_tlb.data)
with m.If(i_in.virt_mode):
- comb += real_addr.eq(Cat(
- i_in.nia[:TLB_LG_PGSZ],
- pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
- ))
+ comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
+ pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
- with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
- comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
+ with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
+ comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))
comb += eaa_priv.eq(pte[3])
with m.Else():
- comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
+ comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
comb += ra_valid.eq(1)
comb += eaa_priv.eq(1)
comb += access_ok.eq(ra_valid & ~priv_fault)
# iTLB update
- def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
+ def itlb_update(self, m, itlb, itlb_valid):
comb = m.d.comb
sync = m.d.sync
m_in = self.m_in
- wr_index = Signal(TLB_SIZE)
- comb += wr_index.eq(hash_ea(m_in.addr))
+ wr_index = Signal(self.TL_BITS)
+ wr_unary = Signal(self.TLB_SIZE)
+ comb += wr_index.eq(self.hash_ea(m_in.addr))
+ comb += wr_unary.eq(1<<wr_index)
+
+ m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
+ sync += itlb_valid.s.eq(0)
+ sync += itlb_valid.r.eq(0)
with m.If(m_in.tlbie & m_in.doall):
# Clear all valid bits
- for i in range(TLB_SIZE):
- sync += itlb_valid_bits[i].eq(0)
+ sync += itlb_valid.r.eq(-1)
with m.Elif(m_in.tlbie):
# Clear entry regardless of hit or miss
- sync += itlb_valid_bits[wr_index].eq(0)
+ sync += itlb_valid.r.eq(wr_unary)
with m.Elif(m_in.tlbld):
- sync += itlb_tags[wr_index].eq(
- m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
- )
- sync += itlb_ptes[wr_index].eq(m_in.pte)
- sync += itlb_valid_bits[wr_index].eq(1)
+ tlb = self.TLBRecord("tlb_wrport")
+ comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
+ comb += tlb.pte.eq(m_in.pte)
+ comb += wr_tlb.en.eq(1)
+ comb += wr_tlb.addr.eq(wr_index)
+ comb += wr_tlb.data.eq(tlb)
+ sync += itlb_valid.s.eq(wr_unary)
# Cache hit detection, output to fetch2 and other misc logic
def icache_comb(self, m, use_previous, r, req_index, req_row,
req_hit_way, req_tag, real_addr, req_laddr,
- cache_valid_bits, cache_tags, access_ok,
+ cache_valids, access_ok,
req_is_hit, req_is_miss, replace_way,
plru_victim, cache_out_row):
comb = m.d.comb
+ m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")
- i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
+ i_in, i_out, bus = self.i_in, self.i_out, self.bus
flush_in, stall_out = self.flush_in, self.stall_out
is_hit = Signal()
- hit_way = Signal(NUM_WAYS)
+ hit_way = Signal(self.WAY_BITS)
# i_in.sequential means that i_in.nia this cycle is 4 more than
# last cycle. If we read more than 32 bits at a time, had a
# cache hit last cycle, and we don't want the first 32-bit chunk
# then we can keep the data we read last cycle and just use that.
- with m.If(i_in.nia[2:INSN_BITS+2] != 0):
+ with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
comb += use_previous.eq(i_in.sequential & r.hit_valid)
# Extract line, row and tag from request
- comb += req_index.eq(get_index(i_in.nia))
- comb += req_row.eq(get_row(i_in.nia))
- comb += req_tag.eq(get_tag(real_addr))
+ comb += req_index.eq(self.get_index(i_in.nia))
+ comb += req_row.eq(self.get_row(i_in.nia))
+ comb += req_tag.eq(self.get_tag(real_addr))
# Calculate address of beginning of cache row, will be
# used for cache miss processing if needed
comb += req_laddr.eq(Cat(
- Const(0, ROW_OFF_BITS),
- real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
+ Const(0, self.ROW_OFF_BITS),
+ real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
))
# Test if pending request is a hit on any way
hitcond = Signal()
- comb += hitcond.eq((r.state == State.WAIT_ACK)
- & (req_index == r.store_index)
- & r.rows_valid[req_row % ROW_PER_LINE]
+ rowvalid = Signal()
+ comb += rowvalid.eq(r.rows_valid[req_row % self.ROW_PER_LINE])
+ comb += hitcond.eq((r.state == State.WAIT_ACK) &
+ (req_index == r.store_index) &
+ rowvalid
)
- with m.If(i_in.req):
- cvb = Signal(NUM_WAYS)
- ctag = Signal(TAG_RAM_WIDTH)
- comb += ctag.eq(cache_tags[req_index])
- comb += cvb.eq(cache_valid_bits[req_index])
- for i in range(NUM_WAYS):
- tagi = Signal(TAG_BITS, name="tag_i%d" % i)
- comb += tagi.eq(read_tag(i, ctag))
- hit_test = Signal(name="hit_test%d" % i)
- comb += hit_test.eq(i == r.store_way)
- with m.If((cvb[i] | (hitcond & hit_test))
- & (tagi == req_tag)):
- comb += hit_way.eq(i)
- comb += is_hit.eq(1)
+ # i_in.req asserts Decoder active
+ cvb = Signal(self.NUM_WAYS)
+ ctag = Signal(self.TAG_RAM_WIDTH)
+ comb += rd_tag.addr.eq(req_index)
+ comb += ctag.eq(rd_tag.data)
+ comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
+ m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
+ comb += se.i.eq(r.store_way)
+ comb += se.n.eq(~i_in.req)
+ for i in range(self.NUM_WAYS):
+ tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
+ hit_test = Signal(name="hit_test%d" % i)
+ is_tag_hit = Signal(name="is_tag_hit_%d" % i)
+ comb += tagi.eq(self.read_tag(i, ctag))
+ comb += hit_test.eq(se.o[i])
+ comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
+ (tagi == req_tag))
+ with m.If(is_tag_hit):
+ comb += hit_way.eq(i)
+ comb += is_hit.eq(1)
# Generate the "hit" and "miss" signals
# for the synchronous blocks
comb += req_is_hit.eq(is_hit)
comb += req_is_miss.eq(~is_hit)
- with m.Else():
- comb += req_is_hit.eq(0)
- comb += req_is_miss.eq(0)
-
comb += req_hit_way.eq(hit_way)
# The way to replace on a miss
with m.If(r.state == State.CLR_TAG):
- comb += replace_way.eq(plru_victim[r.store_index])
+ comb += replace_way.eq(plru_victim)
with m.Else():
comb += replace_way.eq(r.store_way)
# be output an entire row which I prefer not to do just yet
# as it would force fetch2 to know about some of the cache
# geometry information.
- comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
+ comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
comb += i_out.valid.eq(r.hit_valid)
comb += i_out.nia.eq(r.hit_nia)
comb += i_out.stop_mark.eq(r.hit_smark)
comb += stall_out.eq(~(is_hit & access_ok))
# Wishbone requests output (from the cache miss reload machine)
- comb += wb_out.eq(r.wb)
+ comb += bus.we.eq(r.wb.we)
+ comb += bus.adr.eq(r.wb.adr)
+ comb += bus.sel.eq(r.wb.sel)
+ comb += bus.stb.eq(r.wb.stb)
+ comb += bus.dat_w.eq(r.wb.dat)
+ comb += bus.cyc.eq(r.wb.cyc)
# Cache hit synchronous machine
def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
with m.If(req_is_hit):
sync += r.hit_way.eq(req_hit_way)
- sync += Display(
- "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " \
- "way:%x RA:%x", i_in.nia, i_in.virt_mode, \
- i_in.stop_mark, req_index, req_tag, \
- req_hit_way, real_addr
- )
-
-
+ sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
+ "way:%x RA:%x", i_in.nia, i_in.virt_mode,
+ i_in.stop_mark, req_index, req_tag,
+ req_hit_way, real_addr)
with m.If(~stall_in):
# Send stop marks and NIA down regardless of validity
i_in = self.i_in
# Reset per-row valid flags, only used in WAIT_ACK
- for i in range(ROW_PER_LINE):
+ for i in range(self.ROW_PER_LINE):
sync += r.rows_valid[i].eq(0)
# We need to read a cache line
"cache miss nia:%x IR:%x SM:%x idx:%x "
" way:%x tag:%x RA:%x", i_in.nia,
i_in.virt_mode, i_in.stop_mark, req_index,
- replace_way, req_tag, real_addr
- )
+ replace_way, req_tag, real_addr)
# Keep track of our index and way for subsequent stores
- st_row = Signal(BRAM_ROWS)
- comb += st_row.eq(get_row(req_laddr))
+ st_row = Signal(self.ROW_BITS)
+ comb += st_row.eq(self.get_row(req_laddr))
sync += r.store_index.eq(req_index)
sync += r.store_row.eq(st_row)
sync += r.store_tag.eq(req_tag)
sync += r.store_valid.eq(1)
- sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
+ sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)
# Prep for first wishbone read. We calculate the address
# of the start of the cache line and start the WB cycle.
sync += r.state.eq(State.CLR_TAG)
def icache_miss_clr_tag(self, m, r, replace_way,
- cache_valid_bits, req_index,
- tagset, cache_tags):
-
+ req_index,
+ cache_valids):
comb = m.d.comb
sync = m.d.sync
+ m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+ granularity=self.TAG_BITS)
# Get victim way from plru
sync += r.store_way.eq(replace_way)
+
# Force misses on that way while reloading that line
- cv = Signal(INDEX_BITS)
- comb += cv.eq(cache_valid_bits[req_index])
- comb += cv.bit_select(replace_way, 1).eq(0)
- sync += cache_valid_bits[req_index].eq(cv)
+ idx = req_index*self.NUM_WAYS + replace_way # 2D index, 1st dim: self.NUM_WAYS
+ comb += cache_valids.r.eq(1<<idx)
- for i in range(NUM_WAYS):
- with m.If(i == replace_way):
- comb += tagset.eq(cache_tags[r.store_index])
- comb += write_tag(i, tagset, r.store_tag)
- sync += cache_tags[r.store_index].eq(tagset)
+ # use write-port "granularity" to select the tag to write to
+ # TODO: the Memory should be multipled-up (by NUM_TAGS)
+ tagset = Signal(self.TAG_RAM_WIDTH)
+ comb += tagset.eq(r.store_tag << (replace_way*self.TAG_BITS))
+ comb += wr_tag.en.eq(1<<replace_way)
+ comb += wr_tag.addr.eq(r.store_index)
+ comb += wr_tag.data.eq(tagset)
sync += r.state.eq(State.WAIT_ACK)
def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
- stbs_done, cache_valid_bits):
+ cache_valids):
comb = m.d.comb
sync = m.d.sync
- wb_in = self.wb_in
-
- # Requests are all sent if stb is 0
- stbs_zero = Signal()
- comb += stbs_zero.eq(r.wb.stb == 0)
- comb += stbs_done.eq(stbs_zero)
+ bus = self.bus
# If we are still sending requests, was one accepted?
- with m.If(~wb_in.stall & ~stbs_zero):
- # That was the last word? We are done sending.
- # Clear stb and set stbs_done so we can handle
- # an eventual last ack on the same cycle.
- with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
- sync += Display(
- "IS_LAST_ROW_ADDR r.wb.addr:%x " \
- "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x " \
- "stbs_done:%x", r.wb.adr, r.end_row_ix,
- r.wb.stb, stbs_zero, stbs_done
- )
+ with m.If(~bus.stall & r.wb.stb):
+ # That was the last word? We are done sending. Clear stb
+ with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
+ sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
+ "r.end_row_ix:%x r.wb.stb:%x",
+ r.wb.adr, r.end_row_ix, r.wb.stb)
sync += r.wb.stb.eq(0)
- comb += stbs_done.eq(1)
# Calculate the next row address
- rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
- comb += rarange.eq(
- r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
- )
- sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
- rarange
- )
+ rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
+ comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
+ self.LINE_OFF_BITS] + 1)
+ sync += r.req_adr[self.ROW_OFF_BITS:self.LINE_OFF_BITS].eq(rarange)
sync += Display("RARANGE r.req_adr:%x rarange:%x "
- "stbs_zero:%x stbs_done:%x",
- r.req_adr, rarange, stbs_zero, stbs_done)
+ "r.wb.stb:%x",
+ r.req_adr, rarange, r.wb.stb)
# Incoming acks processing
- with m.If(wb_in.ack):
- sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
- "stbs_done:%x",
- wb_in.dat, stbs_zero, stbs_done)
+ with m.If(bus.ack):
+ sync += Display("WB_IN_ACK data:%x", bus.dat_r)
- sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
+ sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)
# Check for completion
- with m.If(stbs_done &
- is_last_row(r.store_row, r.end_row_ix)):
+ with m.If(self.is_last_row(r.store_row, r.end_row_ix)):
# Complete wishbone cycle
sync += r.wb.cyc.eq(0)
# be nice, clear addr
sync += r.req_adr.eq(0)
# Cache line is now valid
- cv = Signal(INDEX_BITS)
- comb += cv.eq(cache_valid_bits[r.store_index])
- comb += cv.bit_select(replace_way, 1).eq(
- r.store_valid & ~inval_in
- )
- sync += cache_valid_bits[r.store_index].eq(cv)
-
+ idx = r.store_index*self.NUM_WAYS + replace_way # 2D index again
+ valid = r.store_valid & ~inval_in
+ comb += cache_valids.s.eq(1<<idx)
sync += r.state.eq(State.IDLE)
- # not completed, move on to next request in row
- with m.Else():
- # Increment store row counter
- sync += r.store_row.eq(next_row(r.store_row))
-
+ # move on to next request in row
+ # Increment store row counter
+ sync += r.store_row.eq(self.next_row(r.store_row))
# Cache miss/reload synchronous machine
- def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
+ def icache_miss(self, m, r, req_is_miss,
req_index, req_laddr, req_tag, replace_way,
- cache_tags, access_ok, real_addr):
+ cache_valids, access_ok, real_addr):
comb = m.d.comb
sync = m.d.sync
- i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
+ i_in, bus, m_in = self.i_in, self.bus, self.m_in
stall_in, flush_in = self.stall_in, self.flush_in
inval_in = self.inval_in
- tagset = Signal(TAG_RAM_WIDTH)
- stbs_done = Signal()
-
comb += r.wb.sel.eq(-1)
comb += r.wb.adr.eq(r.req_adr[3:])
# Process cache invalidations
with m.If(inval_in):
- for i in range(NUM_LINES):
- sync += cache_valid_bits[i].eq(0)
+ comb += cache_valids.r.eq(-1)
sync += r.store_valid.eq(0)
# Main state machine
with m.Switch(r.state):
with m.Case(State.IDLE):
- self.icache_miss_idle(
- m, r, req_is_miss, req_laddr,
- req_index, req_tag, replace_way,
- real_addr
- )
+ self.icache_miss_idle(m, r, req_is_miss, req_laddr,
+ req_index, req_tag, replace_way,
+ real_addr)
with m.Case(State.CLR_TAG, State.WAIT_ACK):
with m.If(r.state == State.CLR_TAG):
- self.icache_miss_clr_tag(
- m, r, replace_way,
- cache_valid_bits, req_index,
- tagset, cache_tags
- )
-
- self.icache_miss_wait_ack(
- m, r, replace_way, inval_in,
- stbs_done, cache_valid_bits
- )
+ self.icache_miss_clr_tag(m, r, replace_way,
+ req_index,
+ cache_valids)
+
+ self.icache_miss_wait_ack(m, r, replace_way, inval_in,
+ cache_valids)
# TLB miss and protection fault processing
with m.If(flush_in | m_in.tlbld):
comb = m.d.comb
sync = m.d.sync
- wb_in, i_out = self.wb_in, self.i_out
+ bus, i_out = self.bus, self.i_out
log_out, stall_out = self.log_out, self.stall_out
# Output data to logger
for i in range(LOG_LENGTH):
log_data = Signal(54)
- lway = Signal(NUM_WAYS)
+ lway = Signal(self.WAY_BITS)
wstate = Signal()
sync += lway.eq(req_hit_way)
sync += log_data.eq(Cat(
ra_valid, access_ok, req_is_miss, req_is_hit,
lway, wstate, r.hit_nia[2:6], r.fetch_failed,
- stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
- r.real_addr[3:6], wb_in.ack, i_out.insn, i_out.valid
+ stall_out, bus.stall, r.wb.cyc, r.wb.stb,
+ r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
))
comb += log_out.eq(log_data)
m = Module()
comb = m.d.comb
- # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
- cache_tags = CacheTagArray()
- cache_valid_bits = CacheValidBitsArray()
+ # Cache-Ways "valid" indicators. this is a 2D Signal, by the
+ # number of ways and the number of lines.
+ vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINES,
+ name="cachevalids")
+ m.submodules.cache_valids = cache_valids = vec
+
+ # TLB Array
+ itlb = self.TLBArray()
+ vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
+ m.submodules.itlb_valids = itlb_valid = vec
- itlb_valid_bits = TLBValidBitsArray()
- itlb_tags = TLBTagArray()
- itlb_ptes = TLBPtesArray()
# TODO to be passed to nmigen as ram attributes
# attribute ram_style of itlb_tags : signal is "distributed";
# attribute ram_style of itlb_ptes : signal is "distributed";
# Privilege bit from PTE EAA field
eaa_priv = Signal()
- r = RegInternal()
+ r = RegInternal(self)
# Async signal on incoming request
- req_index = Signal(NUM_LINES)
- req_row = Signal(BRAM_ROWS)
- req_hit_way = Signal(NUM_WAYS)
- req_tag = Signal(TAG_BITS)
+ req_index = Signal(self.INDEX_BITS)
+ req_row = Signal(self.ROW_BITS)
+ req_hit_way = Signal(self.WAY_BITS)
+ req_tag = Signal(self.TAG_BITS)
req_is_hit = Signal()
req_is_miss = Signal()
req_laddr = Signal(64)
- tlb_req_index = Signal(TLB_SIZE)
- real_addr = Signal(REAL_ADDR_BITS)
+ tlb_req_index = Signal(self.TL_BITS)
+ real_addr = Signal(self.REAL_ADDR_BITS)
ra_valid = Signal()
priv_fault = Signal()
access_ok = Signal()
use_previous = Signal()
- cache_out_row = Signal(ROW_SIZE_BITS)
+ cache_out_row = Signal(self.ROW_SIZE_BITS)
+
+ plru_victim = Signal(self.WAY_BITS)
+ replace_way = Signal(self.WAY_BITS)
- plru_victim = PLRUOut()
- replace_way = Signal(NUM_WAYS)
+ self.tlbmem = Memory(depth=self.TLB_SIZE,
+ width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS,
+ #attrs={'syn_ramstyle': "block_ram"}
+ )
+ self.tagmem = Memory(depth=self.NUM_LINES,
+ width=self.TAG_RAM_WIDTH,
+ #attrs={'syn_ramstyle': "block_ram"}
+ )
# call sub-functions putting everything together,
# using shared signals established above
self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
self.maybe_plrus(m, r, plru_victim)
- self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
- itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
+ self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
+ ra_valid, eaa_priv, priv_fault,
access_ok)
- self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
+ self.itlb_update(m, itlb, itlb_valid)
self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
- req_tag, real_addr, req_laddr, cache_valid_bits,
- cache_tags, access_ok, req_is_hit, req_is_miss,
+ req_tag, real_addr, req_laddr,
+ cache_valids,
+ access_ok, req_is_hit, req_is_miss,
replace_way, plru_victim, cache_out_row)
self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
req_index, req_tag, real_addr)
- self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
- req_laddr, req_tag, replace_way, cache_tags,
+ self.icache_miss(m, r, req_is_miss, req_index,
+ req_laddr, req_tag, replace_way,
+ cache_valids,
access_ok, real_addr)
#self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
# req_is_miss, req_is_hit, lway, wstate, r)
+ # don't connect up to FetchUnitInterface so that some unit tests
+ # can continue to operate
+ if not self.use_fetch_iface:
+ return m
+
+ # connect to FetchUnitInterface. FetchUnitInterface is undocumented
+ # so needs checking and iterative revising
+ i_in, bus, i_out = self.i_in, self.bus, self.i_out
+ comb += i_in.req.eq(self.a_i_valid)
+ comb += i_in.nia.eq(self.a_pc_i)
+ comb += self.stall_in.eq(self.a_stall_i)
+ comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
+ comb += self.f_badaddr_o.eq(i_out.nia)
+ comb += self.f_instr_o.eq(i_out.insn)
+ comb += self.f_busy_o.eq(~i_out.valid) # probably
+
+ # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
+ ibus = self.ibus
+ comb += ibus.adr.eq(self.bus.adr)
+ comb += ibus.dat_w.eq(self.bus.dat_w)
+ comb += ibus.sel.eq(self.bus.sel)
+ comb += ibus.cyc.eq(self.bus.cyc)
+ comb += ibus.stb.eq(self.bus.stb)
+ comb += ibus.we.eq(self.bus.we)
+
+ comb += self.bus.dat_r.eq(ibus.dat_r)
+ comb += self.bus.ack.eq(ibus.ack)
+ if hasattr(ibus, "stall"):
+ comb += self.bus.stall.eq(ibus.stall)
+ else:
+ # fake-up the wishbone stall signal to comply with pipeline mode
+ # same thing is done in dcache.py
+ comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
return m
def icache_sim(dut):
- i_out = dut.i_in
- i_in = dut.i_out
+ i_in = dut.i_in
+ i_out = dut.i_out
m_out = dut.m_in
- yield i_in.valid.eq(0)
- yield i_out.priv_mode.eq(1)
- yield i_out.req.eq(0)
- yield i_out.nia.eq(0)
- yield i_out.stop_mark.eq(0)
+ yield i_in.priv_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(0)
+ yield i_in.stop_mark.eq(0)
yield m_out.tlbld.eq(0)
yield m_out.tlbie.eq(0)
yield m_out.addr.eq(0)
yield
yield
yield
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000004, 64))
- for i in range(30):
- yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000004, 64))
yield
- valid = yield i_in.valid
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ insn = yield i_out.insn
nia = yield i_out.nia
- insn = yield i_in.insn
- print(f"valid? {valid}")
- assert valid
assert insn == 0x00000001, \
"insn @%x=%x expected 00000001" % (nia, insn)
- yield i_out.req.eq(0)
+ yield i_in.req.eq(0)
yield
# hit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000008, 64))
yield
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
yield
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000008, 64))
- yield
- yield
- valid = yield i_in.valid
- nia = yield i_in.nia
- insn = yield i_in.insn
- assert valid
assert insn == 0x00000002, \
"insn @%x=%x expected 00000002" % (nia, insn)
- yield
# another miss
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000040, 64))
- for i in range(30):
- yield
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000040, 64))
yield
- valid = yield i_in.valid
- nia = yield i_out.nia
- insn = yield i_in.insn
- assert valid
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_in.nia
+ insn = yield i_out.insn
assert insn == 0x00000010, \
"insn @%x=%x expected 00000010" % (nia, insn)
- # test something that aliases
- yield i_out.req.eq(1)
- yield i_out.nia.eq(Const(0x0000000000000100, 64))
+ # test something that aliases (this only works because
+ # the unit test SRAM is a depth of 512)
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(Const(0x0000000000000100, 64))
yield
yield
- valid = yield i_in.valid
+ valid = yield i_out.valid
assert ~valid
for i in range(30):
yield
yield
- insn = yield i_in.insn
- valid = yield i_in.valid
- insn = yield i_in.insn
+ insn = yield i_out.insn
+ valid = yield i_out.valid
+ insn = yield i_out.insn
assert valid
assert insn == 0x00000040, \
"insn @%x=%x expected 00000040" % (nia, insn)
- yield i_out.req.eq(0)
-
+ yield i_in.req.eq(0)
def test_icache(mem):
- dut = ICache()
-
- memory = Memory(width=64, depth=512, init=mem)
- sram = SRAM(memory=memory, granularity=8)
-
- m = Module()
-
- m.submodules.icache = dut
- m.submodules.sram = sram
-
- m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
- m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
- m.d.comb += sram.bus.we.eq(dut.wb_out.we)
- m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
- m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
- m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
- m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
- m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
-
- # nmigen Simulation
- sim = Simulator(m)
- sim.add_clock(1e-6)
-
- sim.add_sync_process(wrap(icache_sim(dut)))
- with sim.write_vcd('test_icache.vcd'):
+ from soc.config.test.test_loadstore import TestMemPspec
+ pspec = TestMemPspec(addr_wid=32,
+ mask_wid=8,
+ reg_wid=64,
+ XLEN=32,
+ )
+ dut = ICache(pspec)
+
+ memory = Memory(width=64, depth=512, init=mem)
+ sram = SRAM(memory=memory, granularity=8)
+
+ m = Module()
+
+ m.submodules.icache = dut
+ m.submodules.sram = sram
+
+ m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+ m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+ m.d.comb += sram.bus.we.eq(dut.bus.we)
+ m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+ m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+ m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
+
+ m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+ m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(icache_sim(dut)))
+ with sim.write_vcd('test_icache.vcd'):
sim.run()
+
if __name__ == '__main__':
- dut = ICache()
+ from soc.config.test.test_loadstore import TestMemPspec
+ pspec = TestMemPspec(addr_wid=64,
+ mask_wid=8,
+ XLEN=32,
+ reg_wid=64,
+ )
+ dut = ICache(pspec)
vl = rtlil.convert(dut, ports=[])
with open("test_icache.il", "w") as f:
f.write(vl)
+ # set up memory every 32-bits with incrementing values 0 1 2 ...
mem = []
for i in range(512):
mem.append((i*2) | ((i*2+1)<<32))
test_icache(mem)
-
class L0CacheBuffer2(Elaboratable):
"""L0CacheBuffer2"""
- def __init__(self, n_units=8, regwid=64, addrwid=48):
+ def __init__(self, n_units=8, regwid=64, addrwid=64):
self.n_units = n_units
self.regwid = regwid
self.addrwid = addrwid
# connect the ports as modules
for i in range(self.n_units):
- d = LDSTSplitter(64, 48, 4, self.dports[i])
+ d = LDSTSplitter(64, 64, 4, self.dports[i])
setattr(m.submodules, "ldst_splitter%d" % i, d)
# state-machine latches TODO
by this class. That task is taken care of by LDSTCompUnit.
"""
- def __init__(self, n_units, pimem, regwid=64, addrwid=48):
+ def __init__(self, n_units, pimem, regwid=64, addrwid=64):
self.n_units = n_units
self.pimem = pimem
self.regwid = regwid
def test_l0_cache_test_bare_wb(self):
pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
dut = TstL0CacheBuffer(pspec)
def test_l0_cache_testpi(self):
pspec = TestMemPspec(ldst_ifacetype='testpi',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
dut = TstL0CacheBuffer(pspec)
DCacheToMMUType,
MMUToICacheType)
+# Radix Tree Page Directory Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1015-1016 section 6.7.10.1
+class RTPDE(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.nls = Signal(5) # Nextded Access Auth bits 59:63 LSB0 0:4
+ self.rs1 = Signal(3) # Reserved bits 56:58 LSB0 5:7
+ self.nlb = Signal(52) # Next Level Base bit 4:55 LSB0 8:59
+ self.rs2 = Signal(2) # Reserved bit 2:3 LSB0 60:61
+ self.leaf = Signal(1) # leaf bit 1 LSB0 62
+ self.valid = Signal(1) # valid bit 0 LSB0 63
+
+
+# Radix Tree Page Table Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1016 section 6.7.10.2
+class RTPTE(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.eaa = Signal(4) # Encoded Access Auth bits 60:63 LSB0 0:3
+ self.att = Signal(2) # Attributes bits 58:59 LSB0 4:5
+ self.rs1 = Signal(1) # Reserved bit 57 LSB0 6
+ self.c = Signal(1) # Change bit 56 LSB0 7
+ self.r = Signal(1) # Reference bit 55 LSB0 8
+ self.sw = Signal(3) # SW bits 1:3 bits 52:54 LSB0 9:11
+ self.rpn = Signal(45) # Real Page Number bits 7:51 LSB0 12:56
+ self.rs2 = Signal(4) # Reserved bit 3:6 LSB0 57-60
+ self.sw0 = Signal(1) # SW bit 0 bit 2 LSB0 61
+ self.leaf = Signal(1) # leaf bit 1 LSB0 62
+ self.valid = Signal(1) # valid bit 0 LSB0 63
+
+# and these... which of course are turned round to LSB0 order.
+# TODO: sigh. use botchify and put them in openpower.consts
+EAA_PRIV = 3 # bit 0 (in MSB0) set ==> problem-state banned (priv=1 only)
+EAA_RD = 2 # bit 1 (in MSB0) set ==> loads are permitted
+EAA_WR = 1 # bit 2 (in MSB0) set ==> load and stores permitted
+EAA_EXE = 0 # bit 3 (in MSB0) set ==> execute permitted
# for debugging
display_invalid = True
RADIX_FINISH = 9
+# Process Table Record - near-identical to Page Table Record (same format)
+# v3.0C Book III Section 6.7.6.2 p1004
+class PRTBL(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.rpds = Signal(5) # Root Page Directory Size 59:63 LSB0 0:4
+ self.rts2 = Signal(3) # Radix Tree Size part 2 56:58 LSB0 5:7
+ self.rpdb = Signal(52) # Root Page Directory Base 4:55 LSB0 8:59
+ self.rsv2 = Signal(1) # reserved 3 LSB0 60
+ self.rts1 = Signal(2) # Radix Tree Size part 1 1:2 LSB0 61:62
+ self.rsv1 = Signal(1) # reserved 0 LSB0 63
+
+
class RegStage(RecordObject):
def __init__(self, name=None):
super().__init__(name=name)
self.priv = Signal()
self.addr = Signal(64)
self.inval_all = Signal()
+
# config SPRs
self.prtbl = Signal(64)
self.pid = Signal(32)
+
# internal state
self.state = Signal(State) # resets to IDLE
self.done = Signal()
self.err = Signal()
+
+ # there are 4 quadrants (0-3): here we only support 2 (pt0 and pt3)
+ # these are bits 62-63 of any given address.
+ # except in segment_check, bit 62 is ignored
+ # Quadrant Select can be seen in v3.0C 6.7.10 p1015 book III figure 36
+ # and is further described in 6.7.11.3 p1019
self.pgtbl0 = Signal(64)
self.pt0_valid = Signal()
self.pgtbl3 = Signal(64)
self.pt3_valid = Signal()
+
self.shift = Signal(6)
self.mask_size = Signal(5)
self.pgbase = Signal(56)
self.rc_error = Signal()
+# Page Table Record - note that HR bit is treated as part of rts below
+# (near-identical to Process Table Record - same format)
+# v3.0C Book III Section 6.7.6.1 p1003
+class PGTBL(RecordObject):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.rpds = Signal(5) # Root Page Directory Size 59:63 LSB0 0:4
+ self.rts2 = Signal(3) # Radix Tree Size part 2 56:58 LSB0 5:7
+ self.rpdb = Signal(52) # Root Page Directory Base 4:55 LSB0 8:59
+ self.s = Signal(1) # Host Secure 3 LSB0 60
+ self.rts1 = Signal(2) # Radix Tree Size part 1 1:2 LSB0 61:62
+ self.hr = Signal(1) # Host Radix 0 LSB0 63
+
+
class MMU(Elaboratable):
"""Radix MMU
(i.e. there is no gRA -> hRA translation).
"""
def __init__(self):
- self.l_in = LoadStore1ToMMUType()
- self.l_out = MMUToLoadStore1Type()
- self.d_out = MMUToDCacheType()
- self.d_in = DCacheToMMUType()
- self.i_out = MMUToICacheType()
+ self.l_in = LoadStore1ToMMUType("l_in")
+ self.l_out = MMUToLoadStore1Type("l_out")
+ self.d_out = MMUToDCacheType("d_out")
+ self.d_in = DCacheToMMUType("d_in")
+ self.i_out = MMUToICacheType("i_out")
def radix_tree_idle(self, m, l_in, r, v):
+ """radix_tree_idle - the main decision-point. valid actions include:
+ * LDST incoming TLBIE request (invalidate TLB entry)
+ * LDST incoming RADIX walk request
+ * set either PRTBL or PID SPRs (which then fires a TLB invalidate)
+ """
comb = m.d.comb
sync = m.d.sync
pt_valid = Signal()
- pgtbl = Signal(64)
+ pgtbl = PGTBL("pgtbl")
rts = Signal(6)
- mbits = Signal(6)
+ mbits = Signal(6, name="mbits_idle")
- with m.If(~l_in.addr[63]):
- comb += pgtbl.eq(r.pgtbl0)
- comb += pt_valid.eq(r.pt0_valid)
- with m.Else():
+ with m.If(l_in.addr[63]): # quadrant 3
comb += pgtbl.eq(r.pgtbl3)
comb += pt_valid.eq(r.pt3_valid)
+ with m.Else():
+ comb += pgtbl.eq(r.pgtbl0)
+ comb += pt_valid.eq(r.pt0_valid)
# rts == radix tree size, number of address bits
- # being translated
- comb += rts.eq(Cat(pgtbl[5:8], pgtbl[61:63]))
+ # being translated. takes bits 5:7 and 61:62
+ comb += rts.eq(Cat(pgtbl.rts2, pgtbl.rts1, C(0)))
# mbits == number of address bits to index top
- # level of tree
- comb += mbits.eq(pgtbl[0:5])
+ # level of tree. takes bits 0:4
+ comb += mbits.eq(pgtbl.rpds)
# set v.shift to rts so that we can use finalmask
- # for the segment check
+ # for the segment check.
+ # note: rpdb (52 bits long) is truncated to 48 bits
comb += v.shift.eq(rts)
comb += v.mask_size.eq(mbits[0:5])
- comb += v.pgbase.eq(Cat(C(0, 8), pgtbl[8:56]))
+
+ # create the page base from root page directory base (48 bits with 8 0s)
+ comb += v.pgbase.eq(Cat(C(0, 8), pgtbl.rpdb[:48])) # bits 8:55
+
+ # request either TLB invalidate
+ # or start a RADIX walk
with m.If(l_in.valid):
comb += v.addr.eq(l_in.addr)
comb += v.store.eq(~(l_in.load | l_in.iside))
comb += v.priv.eq(l_in.priv)
- comb += Display("state %d l_in.valid addr %x iside %d store %d "
- "rts %x mbits %x pt_valid %d",
+ sync += Display("state %d l_in.valid addr %x iside %d store %d "
+ "rpdb %x rts %d mbits %d pt_valid %d",
v.state, v.addr, v.iside, v.store,
- rts, mbits, pt_valid)
+ pgtbl.rpdb, rts, mbits, pt_valid)
with m.If(l_in.tlbie):
# Invalidate all iTLB/dTLB entries for
# set v.shift so we can use finalmask
# for generating the process table
# entry address
- comb += v.shift.eq(r.prtbl[0:5])
+ prtbl = PRTBL("prtbl")
+ comb += prtbl.eq(r.prtbl)
+ comb += v.shift.eq(prtbl.rpds)
comb += v.state.eq(State.PROC_TBL_READ)
with m.Elif(mbits == 0):
with m.Else():
comb += v.state.eq(State.SEGMENT_CHECK)
+ # set either PID or PRTBL SPRs
+ # (then invalidate TLBs)
+
with m.If(l_in.mtspr):
# Move to PID needs to invalidate L1 TLBs
- # and cached pgtbl0 value. Move to PRTBL
- # does that plus invalidating the cached
+ # and cached pgtbl0 value.
+ # Move to PRTBL does that plus invalidating the cached
# pgtbl3 value as well.
with m.If(~l_in.sprn[9]):
comb += v.pid.eq(l_in.rs[0:32])
def proc_tbl_wait(self, m, v, r, data):
comb = m.d.comb
- with m.If(r.addr[63]):
- comb += v.pgtbl3.eq(data)
+ sync = m.d.sync
+ rts = Signal(6)
+ mbits = Signal(6, name="mbits_tbl_wait")
+ prtbl = PRTBL("prtblw")
+ comb += prtbl.eq(data)
+
+ with m.If(r.addr[63]): # top bit of quadrant selects pt3
+ comb += v.pgtbl3.eq(prtbl)
comb += v.pt3_valid.eq(1)
with m.Else():
- comb += v.pgtbl0.eq(data)
+ comb += v.pgtbl0.eq(prtbl)
comb += v.pt0_valid.eq(1)
- rts = Signal(6)
- mbits = Signal(6)
-
# rts == radix tree size, # address bits being translated
- comb += rts.eq(Cat(data[5:8], data[61:63]))
+ comb += rts.eq(Cat(prtbl.rts2, prtbl.rts1, C(0)))
# mbits == # address bits to index top level of tree
- comb += mbits.eq(data[0:5])
+ comb += mbits.eq(prtbl.rpds[0:5])
# set v.shift to rts so that we can use finalmask for the segment check
comb += v.shift.eq(rts)
comb += v.mask_size.eq(mbits[0:5])
- comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+
+ # create the page base from root page directory base (48 bits with 8 0s)
+ comb += v.pgbase.eq(Cat(C(0, 8), prtbl.rpdb[:48])) # bits 8:55
with m.If(mbits):
comb += v.state.eq(State.SEGMENT_CHECK)
+ sync += Display("PROC TBL %d data %x rts1 %x rts2 %x rts %d "
+ "rpdb %x mbits %d pgbase %x "
+ " pt0_valid %d, pt3_valid %d",
+ v.state, data, prtbl.rts1, prtbl.rts2, rts,
+ prtbl.rpdb, mbits, v.pgbase,
+ v.pt0_valid, v.pt3_valid)
with m.Else():
comb += v.state.eq(State.RADIX_FINISH)
comb += v.invalid.eq(1)
- if(display_invalid): m.d.sync += Display("MMUBUG: mbits is invalid")
+ if (display_invalid): m.d.sync += Display("MMU: mbits is invalid")
def radix_read_wait(self, m, v, r, d_in, data):
comb = m.d.comb
sync = m.d.sync
+ rpte = RTPTE(name="radix_rpte") # page-table (leaf) entry
+ rpde = RTPDE(name="radix_rpde") # page-directory (non-leaf) entry
+
perm_ok = Signal()
rc_ok = Signal()
- mbits = Signal(6)
- valid = Signal()
- leaf = Signal()
+ mbits = Signal(6, name="mbits_read_wait")
+ valid = rpte.valid
+ eaa = rpte.eaa
+ leaf = rpte.leaf
badtree = Signal()
- comb += Display("RDW %016x done %d "
+ sync += Display("RDW %016x done %d "
"perm %d rc %d mbits %d shf %d "
"valid %d leaf %d bad %d",
data, d_in.done, perm_ok, rc_ok,
mbits, r.shift, valid, leaf, badtree)
- # set pde
+ # set pde and interpret as Radix Tree Page Table Entry (leaf=1 case)
comb += v.pde.eq(data)
+ comb += rpte.eq(data)
+ comb += rpde.eq(data)
- # test valid bit
- comb += valid.eq(data[63]) # valid=data[63]
- comb += leaf.eq(data[62]) # valid=data[63]
-
- comb += v.pde.eq(data)
- # valid & leaf
with m.If(valid):
+ # valid & leaf: RADIX Page-Table Entry
with m.If(leaf):
# check permissions and RC bits
- with m.If(r.priv | ~data[3]):
- with m.If(~r.iside):
- comb += perm_ok.eq(data[1] | (data[2] & ~r.store))
- with m.Else():
+ with m.If(r.priv | ~eaa[EAA_PRIV]):
+ with m.If(r.iside): # instruction-side request
# no IAMR, so no KUEP support for now
# deny execute permission if cache inhibited
- comb += perm_ok.eq(data[0] & ~data[5])
+ comb += perm_ok.eq(eaa[EAA_EXE] & ~rpte.att[1])
+ with m.Else():
+ # Load/Store (read/write)
+ comb += perm_ok.eq(eaa[EAA_WR] |
+ (eaa[EAA_RD] & ~r.store))
+ comb += rc_ok.eq(rpte.r & (rpte.c | ~r.store))
- comb += rc_ok.eq(data[8] & (data[7] | ~r.store))
+ # permissions / rc ok, load TLB, otherwise report error
with m.If(perm_ok & rc_ok):
comb += v.state.eq(State.RADIX_LOAD_TLB)
+ sync += Display("RADIX LEAF data %x att %x eaa %x "
+ "R %d C %d "
+ "shift %d pgbase %x ",
+ data, rpte.att, eaa,
+ rpte.r, rpte.c,
+ v.shift, v.pgbase
+ )
with m.Else():
comb += v.state.eq(State.RADIX_FINISH)
comb += v.perm_err.eq(~perm_ok)
# permission error takes precedence over RC error
comb += v.rc_error.eq(perm_ok)
- # valid & !leaf
+ # valid & !leaf: RADIX Page-Directory Entry
with m.Else():
- comb += mbits.eq(data[0:5])
+ comb += mbits.eq(rpde.nls) # 5 bits NLS into 6-bit-long mbits
comb += badtree.eq((mbits < 5) |
(mbits > 16) |
(mbits > r.shift))
comb += v.badtree.eq(1)
with m.Else():
comb += v.shift.eq(r.shift - mbits)
- comb += v.mask_size.eq(mbits[0:5])
- comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+ comb += v.mask_size.eq(mbits)
+ # pagebase is first 48 bits of NLB, shifted up 1 byte
+ comb += v.pgbase.eq(Cat(C(0, 8), rpde.nlb[:48]))
comb += v.state.eq(State.RADIX_LOOKUP)
with m.Else():
# non-present PTE, generate a DSI
comb += v.state.eq(State.RADIX_FINISH)
comb += v.invalid.eq(1)
- if(display_invalid):
- sync += Display("MMUBUG: non-present PTE, generate a DSI")
+ if (display_invalid):
+ sync += Display("MMU: non-present PTE, generate a DSI")
def segment_check(self, m, v, r, data, finalmask):
+ """segment_check: checks validity of the request before doing a
+ RADIX lookup. reports either segment error or bad tree if not ok
+ """
comb = m.d.comb
- mbits = Signal(6)
+ mbits = Signal(6, name="mbits_check")
nonzero = Signal()
comb += mbits.eq(r.mask_size)
comb += v.shift.eq(r.shift + (31 - 12) - mbits)
comb += nonzero.eq((r.addr[31:62] & ~finalmask[0:31]).bool())
- with m.If((r.addr[63] ^ r.addr[62]) | nonzero):
+ with m.If((r.addr[63] != r.addr[62]) # pt3 == 0b11 and pt1 == 0b00
+ | nonzero):
comb += v.state.eq(State.RADIX_FINISH)
comb += v.segerror.eq(1)
with m.Elif((mbits < 5) | (mbits > 16) |
"%d badtree=%d", l_out.invalid, l_out.badtree)
with m.If(rin.state == State.RADIX_LOOKUP):
- sync += Display ("radix lookup shift=%d msize=%d",
- rin.shift, rin.mask_size)
+ sync += Display ("radix lookup shift=%x msize=%x",
+ rin.shift, mask)
with m.If(r.state == State.RADIX_LOOKUP):
- sync += Display(f"send load addr=%x addrsh=%d mask=%x",
+ sync += Display(f"send load addr=%x addrsh=%x mask=%x",
d_out.addr, addrsh, mask)
+
+ # update the internal register
sync += r.eq(rin)
def elaborate(self, platform):
self.rin = rin = RegStage("r_in")
r = RegStage("r")
+ # get access to prtbl and pid for debug / testing purposes ONLY
+ # (actually, not needed, because setup_regs() triggers mmu direct)
+ # self._prtbl = r.prtbl
+ # self._pid = r.pid
+
l_in = self.l_in
l_out = self.l_out
d_out = self.d_out
self.mmu_0(m, r, rin, l_in, l_out, d_out, addrsh, mask)
- v = RegStage()
+ v = RegStage("v")
dcreq = Signal()
tlb_load = Signal()
itlb_load = Signal()
comb += v.eq(r)
comb += v.valid.eq(0)
- comb += dcreq.eq(0)
comb += v.done.eq(0)
comb += v.err.eq(0)
comb += v.invalid.eq(0)
comb += v.segerror.eq(0)
comb += v.perm_err.eq(0)
comb += v.rc_error.eq(0)
- comb += tlb_load.eq(0)
- comb += itlb_load.eq(0)
- comb += tlbie_req.eq(0)
comb += v.inval_all.eq(0)
- comb += prtbl_rd.eq(0)
# Radix tree data structures in memory are
# big-endian, so we need to byte-swap them
# generate mask for extracting address fields for PTE addr generation
m.submodules.pte_mask = pte_mask = Mask(16-5)
+ pte_mask.mask.name = "pte_mask"
comb += pte_mask.shift.eq(r.mask_size - 5)
comb += mask.eq(Cat(C(0x1f, 5), pte_mask.mask))
# generate mask for extracting address bits to go in
# TLB entry in order to support pages > 4kB
m.submodules.tlb_mask = tlb_mask = Mask(44)
+ tlb_mask.mask.name = "tlb_mask"
comb += tlb_mask.shift.eq(r.shift)
comb += finalmask.eq(tlb_mask.mask)
+ # Shift address bits 61--12 right by 0--47 bits and
+ # supply the least significant 16 bits of the result.
+ comb += addrsh.eq(r.addr[12:62] >> r.shift)
+
with m.If(r.state != State.IDLE):
sync += Display("MMU state %d %016x", r.state, data)
+ sync += Display("addrsh %x r.shift %d r.addr[12:62] %x",
+ addrsh, r.shift, r.addr[12:62])
+
+ ##########
+ # Main FSM
+ ##########
with m.Switch(r.state):
with m.Case(State.IDLE):
sync += Display(" RADIX_FINISH")
comb += v.state.eq(State.IDLE)
+ # check and report either error or done.
with m.If((v.state == State.RADIX_FINISH) |
((v.state == State.RADIX_LOAD_TLB) & r.iside)):
comb += v.err.eq(v.invalid | v.badtree | v.segerror
| v.perm_err | v.rc_error)
comb += v.done.eq(~v.err)
- with m.If(~r.addr[63]):
+ # PID is only valid if MSB of address is zero, top 2 bits are Quadrant
+ with m.If(~r.addr[63]): # quadrant 0 (pt0)
comb += effpid.eq(r.pid)
+ # calculate Process Table Address
pr24 = Signal(24, reset_less=True)
- comb += pr24.eq(masked(r.prtbl[12:36], effpid[8:32], finalmask))
- comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, r.prtbl[36:56]))
+ prtbla = PRTBL("prtbla")
+ comb += prtbla.eq(r.prtbl)
+ rpdb = prtbla.rpdb
+ comb += pr24.eq(masked(rpdb[4:28], effpid[8:32], finalmask))
+ comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, rpdb[28:48]))
+ # calculate Page Table Address
pg16 = Signal(16, reset_less=True)
comb += pg16.eq(masked(r.pgbase[3:19], addrsh, mask))
comb += pgtb_adr.eq(Cat(C(0, 3), pg16, r.pgbase[19:56]))
+ # calculate Page Table Entry from Real Page Number (leaf=1, RTPTE)
+ rpte = RTPTE(name="rpte")
+ comb += rpte.eq(r.pde)
pd44 = Signal(44, reset_less=True)
- comb += pd44.eq(masked(r.pde[12:56], r.addr[12:56], finalmask))
+ comb += pd44.eq(masked(rpte.rpn, r.addr[12:56], finalmask))
comb += pte.eq(Cat(r.pde[0:12], pd44))
# update registers
comb += addr.eq(prtb_adr)
with m.Else():
comb += addr.eq(pgtb_adr)
+ sync += Display(f"pagetable pg16=%x addrsh %x mask %x pgbase=%x "
+ "pgbase[19:56]=%x",
+ pg16, addrsh, mask, r.pgbase, r.pgbase[19:56])
+ # connect to other interfaces: LDST, D-Cache, I-Cache
comb += l_out.done.eq(r.done)
comb += l_out.err.eq(r.err)
comb += l_out.invalid.eq(r.invalid)
mem = {0x0: 0x000000, # to get mtspr prtbl working
0x10000: # PARTITION_TABLE_2
- # PATB_GR=1 PRTB=0x1000 PRTS=0xb
- b(0x800000000100000b),
+ # HR=1 RTS1=0x2 PRTB=0x300 RTS2=0x5 PRTS=0xb
+ b(0xc0000000000030ad),
0x30000: # RADIX_ROOT_PTE
# V = 1 L = 0 NLB = 0x400 NLS = 9
# R = 1 C = 1 ATT = 0 EAA 0x7
b(0xc000000000000187),
- 0x1000000: # PROCESS_TABLE_3
+#
+# slightly different from radix_walk_example.txt: address in microwatt
+# has the top bit set to indicate hypervisor. here, Quadrant 3's
+# process table entry is put instead into Quadrant 0. the entry
+# PROCESS_TABLE_3 should, strictly speaking, be at 0x1000010
+
+# 0x1000000: # PROCESS_TABLE_3 (pt0_valid)
+# # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 12
+# b(0x40000000000300ac),
+
+ 0x1000000: # PROCESS_TABLE_3 (pt3_valid)
# RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
b(0x40000000000300ad),
}
+ # microwatt mmu.bin first part of test 2.
+ # PRTBL must be set to 0x12000, PID to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13920: 0x86810000000000c0, # leaf, supposed to be at 0x13920
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ 0x124000: 0x0000000badc0ffee, # memory to be looked up
+ }
+
+ # microwatt mmu.bin first part of test 4.
+ # PRTBL must be set to 0x12000, PID to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13858: 0x86a10000000000c0, # leaf node
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ }
+
+ # microwatt mmu.bin test 5.
+ # PRTBL must be set to 0x12000, PID to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13cf8: 0x86b10000000000c0, # leaf node
+ 0x13d00: 0x0000000000000000, # invalid leaf node
+ 0x10008: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ }
+
+ # microwatt mmu.bin test 12, instruction-side
+ # PRTBL must be set to 0x12000, PID to 1, iside to 1
+ mem = {
+ 0x0: 0x000000, # to get mtspr prtbl working
+ 0x13920: 0x01110000000000c0, # leaf node
+ 0x10008: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ }
+
while not stop:
while True: # wait for dc_valid
if stop:
return
dc_valid = yield (dut.d_out.valid)
+ tlbld = yield (dut.d_out.tlbld)
if dc_valid:
break
yield
addr = yield dut.d_out.addr
+ if tlbld:
+ pte = yield dut.d_out.pte
+ print (" DCACHE PTE %x -> %x" % (pte, addr))
+ yield dut.d_in.done.eq(1)
+ yield
+ yield dut.d_in.done.eq(0)
+ continue
+
if addr not in mem:
print (" DCACHE LOOKUP FAIL %x" % (addr))
stop = True
yield
yield dut.d_in.done.eq(0)
+
def mmu_wait(dut):
global stop
while not stop: # wait for dc_valid / err
+ d_valid = yield (dut.d_out.valid)
+ if d_valid:
+ tlbld = yield (dut.d_out.tlbld)
+ addr = yield (dut.d_out.addr)
+ print ("addr %x tlbld %d" % (addr, tlbld))
l_done = yield (dut.l_out.done)
l_err = yield (dut.l_out.err)
l_badtree = yield (dut.l_out.badtree)
yield dut.l_in.mtspr.eq(0) # captured by RegStage(s)
yield dut.l_in.load.eq(0) # can reset everything safely
+
def mmu_sim(dut):
global stop
+ # microwatt PRTBL = 0x12000, other test is 0x1000000
+ #prtbl = 0x100000
+ #pidr = 0x0
+ prtbl = 0x12000
+ pidr = 0x1
+
# MMU MTSPR set prtbl
yield dut.l_in.mtspr.eq(1)
yield dut.l_in.sprn[9].eq(1) # totally fake way to set SPR=prtbl
- yield dut.l_in.rs.eq(0x1000000) # set process table
+ yield dut.l_in.rs.eq(prtbl) # set process table
yield dut.l_in.valid.eq(1)
yield from mmu_wait(dut)
yield
prtbl = yield (dut.rin.prtbl)
print ("prtbl after MTSPR %x" % prtbl)
- assert prtbl == 0x1000000
+ assert prtbl == prtbl
+
+ if True: # microwatt test set PIDR
+ # MMU MTSPR set PIDR = 1
+ yield dut.l_in.mtspr.eq(1)
+ yield dut.l_in.sprn[9].eq(0) # totally fake way to set SPR=pidr
+ yield dut.l_in.rs.eq(pidr) # set process table
+ yield dut.l_in.valid.eq(1)
+ yield from mmu_wait(dut)
+ yield
+ yield dut.l_in.sprn.eq(0)
+ yield dut.l_in.rs.eq(0)
+ yield
#yield dut.rin.prtbl.eq(0x1000000) # manually set process table
#yield
+ #addr = 0x10000 # original test
+ #addr = 0x124108 # microwatt mmu.bin test 2
+ #addr = 0x10b0d8 # microwatt mmu.bin test 4
+ # these are a misalignment test. one load results in two actual
+ # lookups, one of which has a valid page table entry, the other
+ # does not. we currently do not support misaligned in Loadstore1
+ # therefore these tests fail with an align_intr (0x600) at 0x39fffd
+ addr = 0x39fffd # microwatt mmu.bin test 5
+ addr = 0x3a0000 # microwatt mmu.bin test 5
+
+ # microwatt mmu.bin test 12 is instruction-side
+ addr = 0x324000 # microwatt mmu.bin test 12
+ iside = 1
# MMU PTE request
- yield dut.l_in.load.eq(1)
+ yield dut.l_in.iside.eq(iside)
+ yield dut.l_in.load.eq(0)
yield dut.l_in.priv.eq(1)
- yield dut.l_in.addr.eq(0x10000)
+ yield dut.l_in.addr.eq(addr)
yield dut.l_in.valid.eq(1)
yield from mmu_wait(dut)
addr = yield dut.d_out.addr
pte = yield dut.d_out.pte
+ tlb_ld = yield dut.d_out.tlbld
l_done = yield (dut.l_out.done)
l_err = yield (dut.l_out.err)
l_badtree = yield (dut.l_out.badtree)
- print ("translated done %d err %d badtree %d addr %x pte %x" % \
- (l_done, l_err, l_badtree, addr, pte))
+ print ("translated done %d err %d badtree %d "
+ "addr %x pte %x tlb_ld %d" % \
+ (l_done, l_err, l_badtree, addr, pte, tlb_ld))
+
yield
yield dut.l_in.priv.eq(0)
yield dut.l_in.addr.eq(0)
busy_o/1 most likely to be x_busy_o
go_die_i/1 rst?
- addr.data/48 x_addr_i (x_addr_i[:4] goes into LenExpand)
+ addr.data/64 x_addr_i (x_addr_i[:4] goes into LenExpand)
addr.ok/1 probably x_i_valid & ~x_stall_i
addr_ok_o/1 no equivalent. *might* work using x_stall_i
class Pi2LSUI(PortInterfaceBase):
def __init__(self, name, lsui=None,
- data_wid=64, mask_wid=8, addr_wid=48):
+ data_wid=64, mask_wid=8, addr_wid=64):
print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
super().__init__(data_wid, addr_wid)
if lsui is None:
self.lsui_busy = Signal()
self.valid_l = SRLatch(False, name="valid")
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
print("pi2lsui TODO, implement is_dcbz")
m.d.comb += self.valid_l.s.eq(1)
m.d.comb += self.lsui.x_mask_i.eq(mask)
m.d.comb += self.lsui.x_addr_i.eq(addr)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.valid_l.s.eq(1)
m.d.comb += self.lsui.x_mask_i.eq(mask)
m.d.comb += self.lsui.x_addr_i.eq(addr)
class Pi2LSUI1(Elaboratable):
def __init__(self, name, pi=None, lsui=None,
- data_wid=64, mask_wid=8, addr_wid=48):
+ data_wid=64, mask_wid=8, addr_wid=64):
print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
self.addrbits = mask_wid
if pi is None:
from nmutil.latch import SRLatch, latchregister
from nmutil.util import rising_edge
from openpower.decoder.power_decoder2 import Data
+from openpower.decoder.power_enums import MSRSpec
from soc.scoreboard.addr_match import LenExpand
from soc.experiment.mem_types import LDSTException
busy_o is deasserted on the cycle AFTER st.ok is asserted.
"""
- def __init__(self, name=None, regwid=64, addrwid=48):
+ def __init__(self, name=None, regwid=64, addrwid=64):
self._regwid = regwid
self._addrwid = addrwid
RecordObject.__init__(self, name=name)
- # distinguish op type (ld/st)
+ # distinguish op type (ld/st/dcbz/nc)
self.is_ld_i = Signal(reset_less=True)
self.is_st_i = Signal(reset_less=True)
+ self.is_dcbz_i = Signal(reset_less=True) # cache-line zeroing
+ self.is_nc = Signal() # no cacheing
# LD/ST data length (TODO: other things may be needed)
self.data_len = Signal(4, reset_less=True)
+ # atomic reservation (LR/SC - ldarx / stdcx etc.)
+ self.reserve = Signal(reset_less=True)
+
# common signals
self.busy_o = Signal(reset_less=True) # do not use if busy
self.go_die_i = Signal(reset_less=True) # back to reset
# addr is valid (TLB, L1 etc.)
self.addr_ok_o = Signal(reset_less=True)
self.exc_o = LDSTException("exc")
- self.dar_o = Signal(64, reset_less=True)
# LD/ST
self.ld = Data(regwid, "ld_data_o") # ok to be set by L0 Cache/Buf
self.st = Data(regwid, "st_data_i") # ok to be set by CompUnit
+ self.store_done = Data(1, "store_done_o") # store has been actioned
- # additional "modes"
- self.is_nc = Signal() # no cacheing
- self.msr_pr = Signal() # 1==virtual, 0==privileged
- self.is_dcbz_i = Signal(reset_less=True)
-
- # mmu
- self.mmu_done = Signal() # keep for now
+ #only priv_mode = not msr_pr is used currently
+ # TODO: connect signals
+ self.virt_mode = Signal() # ctrl.msr(MSR_DR);
+ self.priv_mode = Signal() # not ctrl.msr(MSR_PR);
+ self.mode_32bit = Signal() # not ctrl.msr(MSR_SF);
# dcache
self.ldst_error = Signal()
self.is_nc.eq(inport.is_nc),
self.is_dcbz_i.eq(inport.is_dcbz_i),
self.data_len.eq(inport.data_len),
+ self.reserve.eq(inport.reserve),
self.go_die_i.eq(inport.go_die_i),
self.addr.data.eq(inport.addr.data),
self.addr.ok.eq(inport.addr.ok),
self.st.eq(inport.st),
- self.msr_pr.eq(inport.msr_pr),
+ self.virt_mode.eq(inport.virt_mode),
+ self.priv_mode.eq(inport.priv_mode),
+ self.mode_32bit.eq(inport.mode_32bit),
inport.ld.eq(self.ld),
inport.busy_o.eq(self.busy_o),
inport.addr_ok_o.eq(self.addr_ok_o),
inport.exc_o.eq(self.exc_o),
- inport.dar_o.eq(self.dar_o),
- inport.mmu_done.eq(self.mmu_done),
+ inport.store_done.eq(self.store_done),
inport.ldst_error.eq(self.ldst_error),
inport.cache_paradox.eq(self.cache_paradox)
]
def connect_port(self, inport):
return self.pi.connect_port(inport)
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz): pass
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr): pass
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc): pass
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc): pass
def set_wr_data(self, m, data, wen): pass
def get_rd_data(self, m): pass
pi = self.pi
comb += lds.eq(pi.is_ld_i) # ld-req signals
comb += sts.eq(pi.is_st_i) # st-req signals
- pr = pi.msr_pr # MSR problem state: PR=1 ==> virt, PR==0 ==> priv
+
+ # TODO: construct an MSRspec here and pass it over in
+ # self.set_rd_addr and set_wr_addr below rather than just pr
+ pr = ~pi.priv_mode
+ dr = pi.virt_mode
+ sf = ~pi.mode_32bit
+ msr = MSRSpec(pr=pr, dr=dr, sf=sf)
# detect busy "edge"
busy_delay = Signal()
misalign = Signal()
comb += misalign.eq(lenexp.lexp_o[8:].bool())
-
# activate mode: only on "edge"
comb += ld_active.s.eq(rising_edge(m, lds)) # activate LD mode
comb += st_active.s.eq(rising_edge(m, sts)) # activate ST mode
comb += lenexp.len_i.eq(pi.data_len)
comb += lenexp.addr_i.eq(lsbaddr)
with m.If(pi.addr.ok & adrok_l.qn):
- self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr)
+ self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign,
+ msr, pi.is_nc)
comb += pi.addr_ok_o.eq(1) # acknowledge addr ok
sync += adrok_l.s.eq(1) # and pull "ack" latch
comb += lenexp.len_i.eq(pi.data_len)
comb += lenexp.addr_i.eq(lsbaddr)
with m.If(pi.addr.ok):
- self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr,
- pi.is_dcbz_i)
+ self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, msr,
+ pi.is_dcbz_i, pi.is_nc)
with m.If(adrok_l.qn & self.pi.exc_o.happened==0):
comb += pi.addr_ok_o.eq(1) # acknowledge addr ok
sync += adrok_l.s.eq(1) # and pull "ack" latch
with m.If(st_active.q & pi.st.ok):
# shift data up before storing. lenexp *bit* version of mask is
# passed straight through as byte-level "write-enable" lines.
- stdata = Signal(self.regwid, reset_less=True)
+ stdata = Signal(self.regwid*2, reset_less=True)
comb += stdata.eq(pi.st.data << (lenexp.addr_i*8))
# TODO: replace with link to LoadStoreUnitInterface.x_store_data
# and also handle the ready/stall/busy protocol
comb += busy_l.r.eq(1)
# busy latch outputs to interface
- comb += pi.busy_o.eq(busy_l.q)
+ if hasattr(self, "external_busy"):
+ # when there is an extra (external) busy, include that here.
+ # this is used e.g. in LoadStore1 when an instruction fault
+ # is being processed (instr_fault) and stops Load/Store requests
+ # from being made until it's done
+ comb += pi.busy_o.eq(busy_l.q | self.external_busy(m))
+ else:
+ comb += pi.busy_o.eq(busy_l.q)
return m
# hard-code memory addressing width to 6 bits
self.mem = TestMemory(regwid, 5, granularity=regwid//8, init=False)
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
lsbaddr, msbaddr = self.splitaddr(addr)
m.d.comb += self.mem.wrport.addr.eq(msbaddr)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
lsbaddr, msbaddr = self.splitaddr(addr)
m.d.comb += self.mem.rdport.addr.eq(msbaddr)
# based on microwatt plru.vhdl
-from nmigen import Elaboratable, Signal, Array, Module, Mux, Const
+from nmigen import Elaboratable, Signal, Array, Module, Mux, Const, Cat
from nmigen.cli import rtlil
+from nmigen.lib.coding import Decoder
class PLRU(Elaboratable):
def ports(self):
return [self.acc_en, self.lru_o, self.acc_i]
+
+class PLRUs(Elaboratable):
+ def __init__(self, cachetype, n_plrus, n_bits):
+ self.cachetype = cachetype
+ self.n_plrus = n_plrus
+ self.n_bits = n_bits
+ self.valid = Signal()
+ self.way = Signal(n_bits)
+ self.index = Signal(n_plrus.bit_length())
+ self.isel = Signal(n_plrus.bit_length())
+ self.o_index = Signal(n_bits)
+
+ def elaborate(self, platform):
+ """Generate TLB PLRUs
+ """
+ m = Module()
+ comb = m.d.comb
+
+ if self.n_plrus == 0:
+ return m
+
+ # Binary-to-Unary one-hot, enabled by valid
+ m.submodules.te = te = Decoder(self.n_plrus)
+ comb += te.n.eq(~self.valid)
+ comb += te.i.eq(self.index)
+
+ out = Array(Signal(self.n_bits, name="plru_out%d" % x) \
+ for x in range(self.n_plrus))
+
+ for i in range(self.n_plrus):
+ # PLRU interface
+ name = "%s_plru_%d" % (self.cachetype, i)
+ m.submodules[name] = plru = PLRU(self.n_bits)
+
+ comb += plru.acc_en.eq(te.o[i])
+ comb += plru.acc_i.eq(self.way)
+ comb += out[i].eq(plru.lru_o)
+
+ # select output based on index
+ comb += self.o_index.eq(out[self.isel])
+
+ return m
+
+ def ports(self):
+ return [self.valid, self.way, self.index, self.isel, self.o_index]
+
+
if __name__ == '__main__':
dut = PLRU(2)
vl = rtlil.convert(dut, ports=dut.ports())
f.write(vl)
+ dut = PLRUs("testing", 4, 2)
+ vl = rtlil.convert(dut, ports=dut.ports())
+ with open("test_plrus.il", "w") as f:
+ f.write(vl)
+
+
RTS2 = 0x5
RPDS = 12
- PROCESS_TABLE_3 | PROCESS_TABLE_3 //Hypervisor Userspace
+0x1000010 : PROCESS_TABLE_3 | PROCESS_TABLE_3 //Hypervisor Userspace
0x40000000000300ad | 0x0
RTS1 = 0x2
RPDB = 0x300
0x40000: # RADIX_SECOND_LEVEL
# V = 1 L = 1 SW = 0 RPN = 0
- # R = 1 C = 1 ATT = 0 EAA 0x7
+ # R = 1 C = 1 ATT = 0 EAA 0x3
b(0xc000000000000183),
0x1000000: # PROCESS_TABLE_3
#0x10004: 0
}
+
+
+# executable permission is barred here (EAA=0x2)
+test2 = {
+ 0x10000: # PARTITION_TABLE_2
+ # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+ b(0x800000000100000b),
+
+ 0x30000: # RADIX_ROOT_PTE
+ # V = 1 L = 0 NLB = 0x400 NLS = 9
+ b(0x8000000000040009),
+
+ 0x40000: # RADIX_SECOND_LEVEL
+ # V = 1 L = 1 SW = 0 RPN = 0
+ # R = 1 C = 1 ATT = 0 EAA 0x2
+ b(0xc000000000000182),
+
+ 0x1000000: # PROCESS_TABLE_3
+ # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+ b(0x40000000000300ad),
+
+ #0x10004: 0
+
+}
+
+
+# microwatt mmu.bin first part of test 2. PRTBL must be set to 0x12000, PID to 1
+microwatt_test2 = {
+ 0x13920: 0x86810000000000c0, # leaf node
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ 0x8108: 0x0000000badc0ffee, # memory to be looked up
+ }
+
+microwatt_test4 = {
+ 0x13858: 0x86a10000000000c0, # leaf node
+ 0x10000: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+}
+
+# microwatt mmu.bin test 5: a misaligned read which crosses over to a TLB that
+# is not valid. must attempt a 64-bit read at address 0x39fffd to trigger
+
+microwatt_test5 = {
+ 0x13cf8: 0x86b10000000000c0, # leaf, covers up to 0x39ffff
+ 0x10008: 0x0930010000000080, # directory node
+ 0x12010: 0x0a00010000000000, # page table
+ 0x39fff8: 0x0123456badc0ffee, # to be looked up (should fail)
+ 0x400000: 0x0123456badc0ffee, # not page-mapped
+}
+
+# linux kernel 5.7 first MMU enable
+"""
+ rd @ 000bf803 di b000000000001033 sel ff 3.......
+ rd @ 000bf804 di 0 sel ff ........
+ rd @ 000bf805 di 0 sel ff ........
+ rd @ 000bf806 di 10000 sel ff ........
+ rd @ 000bf807 di c0000000005fc380 sel ff ........
+ rd @ 000bf800 di 80000000 sel ff ........
+ rd @ 000bf801 di c00000000059d400 sel ff ..Y.....
+ rd @ 000bf802 di c000000000000000 sel ff ........
+pc a588 insn 7c7a03a6 msr a000000000000003
+pc a58c insn 7c9b03a6 msr a000000000000003
+pc a590 insn 4c000024 msr a000000000000003
+pc a598 insn f82d0190 msr b000000000000033
+ rd @ 01c00000 di ad005c0000000040 sel ff ........
+ rd @ 01c00001 di 0 sel ff ........
+ rd @ 01c00002 di 0 sel ff ........
+ rd @ 01c00003 di 0 sel ff ........
+ rd @ 01c00004 di 0 sel ff ........
+ rd @ 01c00005 di 0 sel ff ........
+ rd @ 01c00006 di 0 sel ff ........
+ rd @ 01c00007 di 0 sel ff ........
+ rd @ 000b8000 di 9e0ff0f00000080 sel ff ........
+ rd @ 000b8001 di 0 sel ff ........
+ rd @ 000b8002 di 0 sel ff ........
+ rd @ 000b8003 di 0 sel ff ........
+ rd @ 000b8004 di 0 sel ff ........
+ rd @ 000b8005 di 0 sel ff ........
+ rd @ 000b8006 di 0 sel ff ........
+ rd @ 000b8007 di 0 sel ff ........
+ rd @ 01fffc00 di 9d0ff0f00000080 sel ff ........
+ rd @ 01fffc01 di 0 sel ff ........
+ rd @ 01fffc02 di 0 sel ff ........
+ rd @ 01fffc03 di 0 sel ff ........
+ rd @ 01fffc04 di 0 sel ff ........
+ rd @ 01fffc05 di 0 sel ff ........
+ rd @ 01fffc06 di 0 sel ff ........
+ rd @ 01fffc07 di 0 sel ff ........
+ rd @ 01fffa00 di 8f010000000000c0 sel ff ........
+ rd @ 01fffa01 di 8f012000000000c0 sel ff ........
+ rd @ 01fffa02 di 8f014000000000c0 sel ff ........
+ rd @ 01fffa03 di 8e016000000000c0 sel ff ........
+ rd @ 01fffa04 di 8e018000000000c0 sel ff ........
+ rd @ 01fffa05 di 8e01a000000000c0 sel ff ........
+ rd @ 01fffa06 di 8e01c000000000c0 sel ff ........
+ rd @ 01fffa07 di 8e01e000000000c0 sel ff ........
+"""
+
+microwatt_linux_5_7_boot = {
+ 0x000bf803<<3: 0xb000000000001033,
+ 0x000bf804<<3: 0x0,
+ 0x000bf805<<3: 0x0,
+ 0x000bf806<<3: 0x10000,
+ 0x000bf807<<3: 0xc0000000005fc380,
+ 0x000bf800<<3: 0x80000000,
+ 0x000bf801<<3: 0xc00000000059d400,
+ 0x000bf802<<3: 0xc000000000000000,
+ 0x01c00000<<3: 0xad005c0000000040,
+ 0x01c00001<<3: 0x0,
+ 0x01c00002<<3: 0x0,
+ 0x01c00003<<3: 0x0,
+ 0x01c00004<<3: 0x0,
+ 0x01c00005<<3: 0x0,
+ 0x01c00006<<3: 0x0,
+ 0x01c00007<<3: 0x0,
+ 0x000b8000<<3: 0x09e0ff0f00000080,
+ 0x000b8001<<3: 0x0,
+ 0x000b8002<<3: 0x0,
+ 0x000b8003<<3: 0x0,
+ 0x000b8004<<3: 0x0,
+ 0x000b8005<<3: 0x0,
+ 0x000b8006<<3: 0x0,
+ 0x000b8007<<3: 0x0,
+ 0x01fffc00<<3: 0x09d0ff0f00000080,
+ 0x01fffc01<<3: 0x0,
+ 0x01fffc02<<3: 0x0,
+ 0x01fffc03<<3: 0x0,
+ 0x01fffc04<<3: 0x0,
+ 0x01fffc05<<3: 0x0,
+ 0x01fffc06<<3: 0x0,
+ 0x01fffc07<<3: 0x0,
+ 0x01fffa00<<3: 0x8f010000000000c0,
+ 0x01fffa01<<3: 0x8f012000000000c0,
+ 0x01fffa02<<3: 0x8f014000000000c0,
+ 0x01fffa03<<3: 0x8e016000000000c0,
+ 0x01fffa04<<3: 0x8e018000000000c0,
+ 0x01fffa05<<3: 0x8e01a000000000c0,
+ 0x01fffa06<<3: 0x8e01c000000000c0,
+ 0x01fffa07<<3: 0x8e01e000000000c0,
+}
wrmask=[0, 1],
src_delays=[2, 0], dest_delays=[1, 0])
- # test combinatorial zero-delay operation
- # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
- # is zero-delay, and do a subtraction.
- # 5 - 2 = 3
- yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
- wrmask=[0, 1],
- src_delays=[0, 1], dest_delays=[2, 0])
# test all combinations of masked input ports
# NOP does not make any request nor response
yield from op.issue([5, 2], MicrOp.OP_NOP, [0, 0],
yield from op.issue([2, 0x80], MicrOp.OP_EXTSWSLI, [0xFF80, 0],
rdmaskn=[1, 0], wrmask=[0, 1],
src_delays=[1, 2], dest_delays=[1, 0])
+
+ # test combinatorial zero-delay operation
+ # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
+ # is zero-delay, and do a subtraction.
+ # 5 - 2 = 3
+ yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
+ wrmask=[0, 1],
+ src_delays=[0, 1], dest_delays=[2, 0])
+
# test with rc=1, so expect results on the CR output port
# 5 + 2 = 7
# 7 > 0 => CR = 0b100
'n_data_o[7:0]',
({'submodule': 'n'},
['n_o_valid', 'n_i_ready'])])]),
- ('debug', {'module': 'top'},
+ ('debug', {'module': 'bench'},
['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
write_gtkw(
"test_compunit_fsm1.gtkw",
"test_compunit_fsm1.vcd",
traces, style,
- module='top.cu'
+ module='bench.top.cu'
)
m = Module()
alu = Shifter(8)
"test_compunit_regspec3.vcd",
traces, style,
clk_period=1e-6,
- module='top.cu')
+ module='bench.top.cu')
inspec = [('INT', 'a', '0:15'),
('INT', 'b', '0:15'),
('next port', 'out', [
'alu_o[15:0]', 'o_valid', 'i_ready',
'alu_o_ok', 'alu_cr_ok'])]),
- ('debug', {'module': 'top'},
+ ('debug', {'module': 'bench'},
['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
write_gtkw("test_compunit_regspec1.gtkw",
"test_compunit_regspec1.vcd",
traces, style,
clk_period=1e-6,
- module='top.cu')
+ module='bench.top.cu')
inspec = [('INT', 'a', '0:15'),
('INT', 'b', '0:15')]
yield
+# FIXME: AttributeError: type object 'LDSTPipeSpec' has no attribute 'regspec'
+@unittest.skip('broken')
class TestLDSTCompUnit(unittest.TestCase):
def test_ldst_compunit(self):
from soc.config.loadstore import ConfigMemoryPortInterface
from soc.experiment.test import pagetables
-from soc.experiment.test.test_wishbone import wb_get
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+
########################################
assert(ld_data==data)
print("dzbz test passed")
- dut.stop = True # stop simulation
+ wbget.stop = True # stop simulation
########################################
class TestLDSTCompUnitMMU(LDSTCompUnit):
sim.add_clock(1e-6)
dut.mem = pagetables.test1
- dut.stop = False
+ wbget.stop = False
sim.add_sync_process(wrap(ldst_sim(dut)))
- sim.add_sync_process(wrap(wb_get(dut)))
+ sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
with sim.write_vcd('test_scoreboard_mmu.vcd'):
sim.run()
sim.add_clock(1e-6)
dut.mem = pagetables.test1
- dut.stop = False
+ wbget.stop = False
sim.add_sync_process(wrap(ldst_sim(dut)))
- sim.add_sync_process(wrap(wb_get(dut)))
+ sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
with sim.write_vcd('test_scoreboard_regspec_mmu.vcd'):
sim.run()
from soc.experiment.test import pagetables
from soc.experiment.test.test_wishbone import wb_get
-#new unit added to this test case
+# new unit added to this test case
from soc.fu.mmu.pipe_data import MMUPipeSpec
from soc.fu.mmu.fsm import FSMMMUStage
-#for sending instructions to the FSM
+# for sending instructions to the FSM
from openpower.consts import MSR
from openpower.decoder.power_fields import DecodeFields
from openpower.decoder.power_fieldsn import SignalBitRange
from openpower.decoder.power_decoder2 import decode_spr_num
from openpower.decoder.power_enums import MicrOp
+
def test_TLBIE(dut):
yield dut.fsm.p.i_data.ctx.op.eq(MicrOp.OP_TLBIE)
yield dut.fsm.p.valid_i.eq(1)
yield
yield Display("OP_TLBIE test done")
+
def ldst_sim(dut):
- yield dut.mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield dut.mmu.rin.prtbl.eq(0x1000000) # set process table
addr = 0x100e0
- data = 0xFF #just a single byte for this test
+ data = 0xFF # just a single byte for this test
#data = 0xf553b658ba7e1f51
yield from store(dut, addr, 0, data, 0)
yield
ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
- print(data,data_ok,ld_addr)
- assert(ld_data==data)
+ print(data, data_ok, ld_addr)
+ assert(ld_data == data)
yield
yield from test_TLBIE(dut)
-
"""
-- not testing dzbz here --
data = 0
print("dzbz test passed")
"""
- dut.stop = True # stop simulation
+ dut.stop = True # stop simulation
########################################
reg_wid=64,
units=units)
- dut = TestLDSTCompUnit(16,pspec)
+ dut = TestLDSTCompUnit(16, pspec)
vl = rtlil.convertMMUFSM(dut, ports=dut.ports())
with open("test_ldst_comp_mmu1.il", "w") as f:
f.write(vl)
run_simulation(dut, ldst_sim(dut), vcd_name='test_ldst_comp.vcd')
########################################
+
+
class TestLDSTCompUnitRegSpecMMUFSM(LDSTCompUnit):
def __init__(self, pspec):
self.mmu = MMU()
- pipe_spec = MMUPipeSpec(id_wid=2)
+ pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
self.fsm = FSMMMUStage(pipe_spec)
self.fsm.set_ldst_interface(ldst)
# link mmu and dcache together
dcache = self.l0.dcache
mmu = self.mmu
- m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
- m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+ m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+ m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
return m
+
def test_scoreboard_regspec_mmufsm():
m = Module()
dut.mem = pagetables.test1
dut.stop = False
- sim.add_sync_process(wrap(ldst_sim(dut))) # rename ?
+ sim.add_sync_process(wrap(ldst_sim(dut))) # rename ?
sim.add_sync_process(wrap(wb_get(dut)))
with sim.write_vcd('test_scoreboard_regspec_mmufsm.vcd'):
sim.run()
if __name__ == '__main__':
test_scoreboard_regspec_mmufsm()
- #only one test for now -- test_scoreboard_mmu()
+ # only one test for now -- test_scoreboard_mmu()
m.submodules.dcache = dut
m.submodules.sram = sram
- m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
- m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
- m.d.comb += sram.bus.we.eq(dut.wb_out.we)
- m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
- m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
- m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+ m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+ m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+ m.d.comb += sram.bus.we.eq(dut.bus.we)
+ m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+ m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+ m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
- m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
- m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+ m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+ m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
dcache_write_gtkw(test_name)
('d_out', [
'd_out_valid', 'd_out_data[63:0]'
]),
+ # XXX TODO, update to standard wishbone Signals (single "bus" Interface)
('wb_out', [
'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
m.submodules.dcache = dut
m.submodules.sram = sram
- m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
- m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
- m.d.comb += sram.bus.we.eq(dut.wb_out.we)
- m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
- m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
- m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
- m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
- m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+ m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+ m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+ m.d.comb += sram.bus.we.eq(dut.bus.we)
+ m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+ m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+ m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
+
+ m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+ m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
dcache_write_gtkw(test_name)
from soc.experiment.test import pagetables
from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
+wbget.stop = False
-stop = False
-
-def wb_get(wb, mem):
- """simulator process for getting memory load requests
- """
-
- global stop
- assert(stop==False)
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (wb.cyc)
- stb = yield (wb.stb)
- if cyc and stb:
- break
- yield
- addr = (yield wb.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- # read or write?
- we = (yield wb.we)
- if we:
- store = (yield wb.dat_w)
- sel = (yield wb.sel)
- data = mem.get(addr, 0)
- # note we assume 8-bit sel, here
- res = 0
- for i in range(8):
- mask = 0xff << (i*8)
- if sel & (1<<i):
- res |= store & mask
- else:
- res |= data & mask
- mem[addr] = res
- print (" DCACHE set %x mask %x data %x" % (addr, sel, res))
- else:
- data = mem.get(addr, 0)
- yield wb.dat_r.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
-
- yield wb.ack.eq(1)
- yield
- yield wb.ack.eq(0)
- yield
-
def setup_mmu():
- global stop
- stop = False
+ wbget.stop = False
pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
imem_ifacetype='',
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
# link mmu and dcache together
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
def _test_dcbz_addr_100e0(dut, mem):
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
addr = 0x100e0
data = 0xf553b658ba7e1f51
- yield from pi_st(pi, addr, data, 8, msr_pr=0)
+ msr = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+ yield from pi_st(pi, addr, data, 8, msr)
yield
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=0)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
assert ld_data == 0xf553b658ba7e1f51
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=0)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
assert ld_data == 0xf553b658ba7e1f51
print("do_dcbz ===============")
- yield from pi_st(pi, addr, data, 8, msr_pr=0, is_dcbz=1)
+ yield from pi_st(pi, addr, data, 8, msr, is_dcbz=1)
print("done_dcbz ===============")
yield
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=0)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
print("ld_data after dcbz")
print(ld_data)
assert ld_data == 0
yield
- stop = True
+ wbget.stop = True
def test_dcbz_addr_100e0():
super().__init__(regwid, addrwid)
self.ldst = LDSTSplitter(32, 48, 4)
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
m.d.comb += self.ldst.addr_i.eq(addr)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.ldst.addr_i.eq(addr)
def set_wr_data(self, m, data, wen):
from nmutil.mask import Mask, masked
from nmutil.util import Display
from random import randint, seed
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
if True:
from nmigen.back.pysim import Simulator, Delay, Settle
from soc.experiment.mmu import MMU
from nmigen.compat.sim import run_simulation
+from openpower.decoder.power_enums import MSRSpec
-stop = False
+msr_default = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+
+wbget.stop = False
def b(x): # byte-reverse function
return int.from_bytes(x.to_bytes(8, byteorder='little'),
# for cell in mem:
# f.write(str(hex(cell))+"="+str(hex(mem[cell]))+"\n")
-def wb_get(wb, mem):
- """simulator process for getting memory load requests
- """
-
- global stop
- assert(stop==False)
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (wb.cyc)
- stb = yield (wb.stb)
- if cyc and stb:
- break
- yield
- addr = (yield wb.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- # read or write?
- we = (yield wb.we)
- if we:
- store = (yield wb.dat_w)
- sel = (yield wb.sel)
- data = mem.get(addr, 0)
- # note we assume 8-bit sel, here
- res = 0
- for i in range(8):
- mask = 0xff << (i*8)
- if sel & (1<<i):
- res |= store & mask
- else:
- res |= data & mask
- mem[addr] = res
- print (" DCACHE set %x mask %x data %x" % (addr, sel, res))
- else:
- data = mem.get(addr, 0)
- yield wb.dat_r.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
-
- yield wb.ack.eq(1)
- yield
- yield wb.ack.eq(0)
- yield
-
def mmu_lookup(dut, addr):
mmu = dut.submodules.mmu
- global stop
print("pi_ld", hex(addr))
- data = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr_pr=1)
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr=msr_default)
print("pi_ld done, data", hex(data))
"""
# original test code kept for reference
- while not stop: # wait for dc_valid / err
+ while not wbget.stop: # wait for dc_valid / err
print("waiting for mmu")
l_done = yield (mmu.l_out.done)
l_err = yield (mmu.l_out.err)
def ldst_sim(dut):
mmu = dut.submodules.mmu
- global stop
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
data = yield from mmu_lookup(dut, addr+8)
assert data == 0xf001a5a5
- yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr_pr=1)
+ yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr=msr_default)
data = yield from mmu_lookup(dut, addr+4)
assert data == 0x10015a5a
yield
yield
- stop = True
+ wbget.stop = True
def setup_mmu():
- global stop
- stop = False
+ wbget.stop = False
pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
imem_ifacetype='',
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
# link mmu and dcache together
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
def ldst_sim_misalign(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
- data = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_pr=1)
- print ("misalign ld data", hex(data))
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_default)
+ print ("misalign ld data", data)
yield
- stop = True
+ wbget.stop = True
def test_misalign_mmu():
def ldst_sim_radixmiss(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(1<<40) # set process table
yield
- data = yield from pi_ld(dut.submodules.ldst.pi, 0x10000000, 8, msr_pr=1)
- print ("radixmiss ld data", hex(data))
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi,
+ 0x10000000, 8, msr=msr_default)
+ print ("radixmiss ld data", data)
yield
- stop = True
+ wbget.stop = True
def ldst_sim_dcache_regression(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
addr = 0x10000
- data = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr_pr=1)
- print ("=== dcache_regression ld data", hex(data))
+ data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr=msr_default)
+ print ("=== dcache_regression ld data", data)
assert(data == 0xdeadbeef01234567)
yield
- stop = True
+ wbget.stop = True
def ldst_sim_dcache_random(dut):
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
addr *= 8
addr += 0x10000
- yield from pi_st(pi, addr, data, 8, msr_pr=1)
+ yield from pi_st(pi, addr, data, 8, msr=msr_default)
yield
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
eq = (data==ld_data)
print ("dcache_random values", hex(addr), hex(data), hex(ld_data), eq)
assert(data==ld_data) ## investigate why this fails -- really seldom
yield
- stop = True
+ wbget.stop = True
def ldst_sim_dcache_first(dut): # this test is likely to fail
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
data = 0x8c5a3e460d71f0b4
# known to fail without bugfix in src/soc/fu/ldst/loadstore.py
- yield from pi_st(pi, addr, data, 8, msr_pr=1)
+ yield from pi_st(pi, addr, data, 8, msr=msr_default)
yield
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
print ("addr",addr)
print ("dcache_first ld data", hex(data), hex(ld_data))
assert(data==ld_data)
yield
- stop = True
+ wbget.stop = True
def test_radixmiss_mmu():
def ldst_sim_dcache_random2(dut, mem):
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
print("before_pi_st")
yield
- yield from pi_st(pi, addr, data, 8, msr_pr=1)
+ yield from pi_st(pi, addr, data, 8, msr=msr_default)
yield
for i in range(0,c2):
yield
print("== read: wb_get")
- ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
#dumpmem(mem,"/tmp/dumpmem"+str(c)+".txt")
#c += 1
assert(data==ld_data) ## investigate why this fails -- really seldom
yield
- stop = True
+ wbget.stop = True
def test_dcache_random2():
from soc.experiment.mmu import MMU
from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
+msr_default = MSRSpec(pr=0, dr=0, sf=1) # 64 bit by default
-stop = False
+
+wbget.stop = False
def b(x): # byte-reverse function
return int.from_bytes(x.to_bytes(8, byteorder='little'),
byteorder='big', signed=False)
-def wb_get(wb, mem):
- """simulator process for getting memory load requests
- """
-
- global stop
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (wb.cyc)
- stb = yield (wb.stb)
- if cyc and stb:
- break
- yield
- addr = (yield wb.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- # read or write?
- we = (yield wb.we)
- if we:
- store = (yield wb.dat_w)
- sel = (yield wb.sel)
- data = mem.get(addr, 0)
- # note we assume 8-bit sel, here
- res = 0
- for i in range(8):
- mask = 0xff << (i*8)
- if sel & (1<<i):
- res |= store & mask
- else:
- res |= data & mask
- mem[addr] = res
- print (" DCACHE set %x mask %x data %x" % (addr, sel, res))
- else:
- data = mem.get(addr, 0)
- yield wb.dat_r.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
-
- yield wb.ack.eq(1)
- yield
- yield wb.ack.eq(0)
- yield
-
def setup_mmu():
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
# link mmu and dcache together
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
def ldst_sim_misalign(dut):
mmu = dut.submodules.mmu
- global stop
- stop = False
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
- data = yield from pi_ld(dut.submodules.ldst.pi, 0x1000, 4, msr_pr=1)
+ # load 8 bytes at aligned address
+ align_addr = 0x1000
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ align_addr, 8, msr=msr_default)
+ print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+ assert data == 0xdeadbeef01234567
+
+ # load 4 bytes at aligned address
+ align_addr = 0x1004
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ align_addr, 4, msr=msr_default)
+ print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+ assert data == 0xdeadbeef
+
+ # load 8 bytes at *mis*-aligned address which is still within
+ # the page
+ misalign_addr = 0x1004
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ misalign_addr, 8, msr=msr_default)
+
+ print ("ldst_sim_misalign", hex(data), exctype, exc)
+ assert data == 0xf001a5a5deadbeef
+
+ # load 8 bytes at *mis*-aligned address which is still within
+ # the page
+ misalign_addr = 0x1006
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ misalign_addr, 8, msr=msr_default)
+
+ print ("ldst_sim_misalign", hex(data), exctype, exc)
+ assert data == 0xf00ff001a5a5dead
+ wbget.stop = True
+ return
+
+ # load 8 bytes at *mis*-aligned address which is NOT within
+ # the page - TODO - work this out
+ misalign_addr = 0x10000004
+ data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+ misalign_addr, 8, msr=msr_default)
+ print ("ldst_sim_misalign", data, exctype, exc)
+ yield
+ dar = yield dut.submodules.ldst.dar
+ print ("DAR", hex(dar))
+ assert dar == misalign_addr
+ # check exception bits
+ assert exc.happened
+ assert exc.alignment
+ assert not exc.segment_fault
+ assert not exc.instr_fault
+ assert not exc.invalid
+ assert not exc.perm_error
+ assert not exc.rc_error
+ assert not exc.badtree
+
+ wbget.stop = True
def test_misalign_mmu():
-from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal)
+from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal,
+ Const)
from nmigen.cli import main
from nmigen.cli import rtlil
from nmutil.mask import Mask, masked
from nmigen.sim import Simulator, Delay, Settle
from nmutil.util import wrap
-from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst, wait_busy
+from soc.config.test.test_pi2ls import (pi_ld, pi_st, pi_ldst, wait_busy,
+ get_exception_info)
#from soc.config.test.test_pi2ls import pi_st_debug
from soc.config.test.test_loadstore import TestMemPspec
from soc.config.loadstore import ConfigMemoryPortInterface
from nmigen.compat.sim import run_simulation
from random import random
+from openpower.test.wb_get import wb_get_classic
+from openpower.test import wb_get as wbget
+from openpower.exceptions import LDSTExceptionTuple
-stop = False
+from soc.config.test.test_fetch import read_from_addr
+from openpower.decoder.power_enums import MSRSpec
-def wb_get(wb, mem):
- """simulator process for getting memory load requests
- """
-
- global stop
- assert (stop==False)
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (wb.cyc)
- stb = yield (wb.stb)
- if cyc and stb:
- break
- yield
- addr = (yield wb.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- # read or write?
- we = (yield wb.we)
- if we:
- store = (yield wb.dat_w)
- sel = (yield wb.sel)
- data = mem.get(addr, 0)
- # note we assume 8-bit sel, here
- res = 0
- for i in range(8):
- mask = 0xff << (i*8)
- if sel & (1<<i):
- res |= store & mask
- else:
- res |= data & mask
- mem[addr] = res
- print (" DCACHE set %x mask %x data %x" % (addr, sel, res))
- else:
- data = mem.get(addr, 0)
- yield wb.dat_r.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
-
- yield wb.ack.eq(1)
- yield
- yield wb.ack.eq(0)
- yield
def setup_mmu():
- global stop
- stop = False
+ wbget.stop = False
pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
imem_ifacetype='',
m.submodules.ldst = ldst = cmpi.pi
m.submodules.mmu = mmu = MMU()
dcache = ldst.dcache
+ icache = ldst.icache
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
+ i_in, i_out = icache.i_in, icache.i_out # FetchToICache, ICacheToDecode
- # link mmu and dcache together
+ # link mmu, dcache and icache together
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+ m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
# link ldst and MMU together
comb += l_in.eq(ldst.m_out)
comb += ldst.m_in.eq(l_out)
+ # add a debug status Signal: use "msg.str = "blah"
+ # then toggle with yield msg.eq(0); yield msg.eq(1)
+ debug_status = Signal(8, decoder=lambda _ : debug_status.str)
+ m.debug_status = debug_status
+ debug_status.str = ''
+
return m, cmpi
+
+def icache_read(dut,addr,priv,virt):
+
+ icache = dut.submodules.ldst.icache
+ i_in = icache.i_in
+ i_out = icache.i_out
+
+ yield i_in.priv_mode.eq(priv)
+ yield i_in.virt_mode.eq(virt)
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield i_in.stop_mark.eq(0)
+
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+ yield
+ yield
+
+ return nia, insn, valid, failed
+
+
test_exceptions = True
test_dcbz = True
test_random = True
+
+def debug(dut, msg):
+ print ("set debug message", msg)
+ dut.debug_status.str = msg # set the message
+ yield dut.debug_status.eq(0) # trigger an update
+ yield dut.debug_status.eq(1)
+
+
+def _test_loadstore1_ifetch_iface(dut, mem):
+ """test_loadstore1_ifetch_iface
+
+ read in priv mode, non-virtual. tests the FetchUnitInterface
+
+ """
+
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ wbget.stop = False
+
+ print("=== test loadstore instruction (real) ===")
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ yield from debug(dut, "real mem instruction")
+ # set address to 0x8, update mem[0x8] to 01234 | 0x5678<<32
+ # (have to do 64-bit writes into the dictionary-memory-emulated-thing)
+ addr = 8
+ addr2 = 12
+ expected_insn2 = 0x5678
+ expected_insn = 0x1234
+ mem[addr] = expected_insn | expected_insn2<<32
+
+ yield i_in.priv_mode.eq(1)
+ insn = yield from read_from_addr(icache, addr, stall=False)
+
+ nia = yield i_out.nia # NO, must use FetchUnitInterface
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ print("=== test loadstore instruction (2nd, real) ===")
+ yield from debug(dut, "real mem 2nd (addr 0xc)")
+
+ insn2 = yield from read_from_addr(icache, addr2, stall=False)
+
+ nia = yield i_out.nia # NO, must use FetchUnitInterface
+ print ("fetched %x from addr2 %x" % (insn2, nia))
+ assert insn2 == expected_insn2
+
+ print("=== test loadstore instruction (done) ===")
+
+ yield from debug(dut, "test done")
+ yield
+ yield
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ wbget.stop = True
+
+
+def write_mem2(mem, addr, i1, i2):
+ mem[addr] = i1 | i2<<32
+
+
+#TODO: use fetch interface here
+def lookup_virt(dut,addr):
+ icache = dut.submodules.ldst.icache
+ i_in = icache.i_in
+ i_out = icache.i_out
+ yield i_in.priv_mode.eq(0)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.stop_mark.eq(0)
+
+ yield icache.a_i_valid.eq(1)
+ yield icache.a_pc_i.eq(addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield icache.a_i_valid.eq(0)
+
+ return valid,failed
+
+
+def mmu_lookup(dut,addr):
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ yield from debug(dut, "instr fault "+hex(addr))
+ yield ldst.priv_mode.eq(0)
+ yield ldst.instr_fault.eq(1)
+ yield ldst.maddr.eq(addr)
+ yield
+ yield ldst.instr_fault.eq(0)
+ while True:
+ done = yield (ldst.done)
+ exc_info = yield from get_exception_info(pi.exc_o)
+ if done or exc_info.happened:
+ break
+ yield
+ yield
+ assert exc_info.happened == 0 # assert just before doing the fault set zero
+ yield ldst.instr_fault.eq(0)
+ yield from debug(dut, "instr fault done "+hex(addr))
+ yield
+ yield
+ yield
+
+
+def _test_loadstore1_ifetch_multi(dut, mem):
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ assert wbget.stop == False
+
+ print ("set process table")
+ yield from debug(dut, "set prtble")
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ # fetch instructions from multiple addresses
+ # should cope with some addresses being invalid
+ real_addrs = [0,4,8,0,8,4,0,0,12]
+ write_mem2(mem,0,0xF0,0xF4)
+ write_mem2(mem,8,0xF8,0xFC)
+
+ yield i_in.priv_mode.eq(1)
+ for addr in real_addrs:
+ yield from debug(dut, "real_addr "+hex(addr))
+ insn = yield from read_from_addr(icache, addr, stall=False)
+ nia = yield i_out.nia # NO, must use FetchUnitInterface
+ print ("TEST_MULTI: fetched %x from addr %x == %x" % (insn, nia,addr))
+ assert insn==0xF0+addr
+
+ # now with virtual memory enabled
+ yield i_in.virt_mode.eq(1)
+
+ virt_addrs = [0x10200,0x10204,0x10208,0x10200,
+ 0x102008,0x10204,0x10200,0x10200,0x10200C]
+
+ write_mem2(mem,0x10200,0xF8,0xFC)
+
+ for addr in virt_addrs:
+ yield from debug(dut, "virt_addr "+hex(addr))
+
+ valid, failed = yield from lookup_virt(dut,addr)
+ yield
+ print("TEST_MULTI: failed=",failed) # this is reported wrong
+ if failed==1: # test one first
+ yield from mmu_lookup(dut,addr)
+ valid, failed = yield from lookup_virt(dut,addr)
+ assert(valid==1)
+
+ wbget.stop = True
+
+
+def _test_loadstore1_ifetch(dut, mem):
+ """test_loadstore1_ifetch
+
+ this is quite a complex multi-step test.
+
+ * first (just because, as a demo) read in priv mode, non-virtual.
+ just like in experiment/icache.py itself.
+
+ * second, using the (usual) PTE for these things (which came originally
+ from gem5-experimental experiment/radix_walk_example.txt) do a
+ virtual-memory read through the *instruction* cache.
+ this is expected to FAIL
+
+ * third: mess about with the MMU, setting "iside" (instruction-side),
+ requesting an MMU RADIX LOOKUP. this triggers an itlb_load
+ (instruction-cache TLB entry-insertion)
+
+ * fourth and finally: retry the read of the instruction through i-cache.
+ this is now expected to SUCCEED
+
+ a lot going on.
+ """
+
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ wbget.stop = False
+
+ print("=== test loadstore instruction (real) ===")
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ # first virtual memory test
+
+ print ("set process table")
+ yield from debug(dut, "set prtble")
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ yield from debug(dut, "real mem instruction")
+ # set address to zero, update mem[0] to 01234
+ addr = 8
+ expected_insn = 0x1234
+ mem[addr] = expected_insn
+
+ yield i_in.priv_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit -- this one is different here
+ ##nia, insn, valid, failed = yield from icache_read(dut,addr,0,0)
+ ##assert(valid==0)
+ ##assert(failed==1)
+
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield
+ valid = yield i_out.valid
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+ yield
+ yield
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ print("=== test loadstore instruction (virtual) ===")
+
+ # look up i-cache expecting it to fail
+
+ yield from debug(dut, "virtual instr req")
+ # set address to 0x10200, update mem[] to 5678
+ virt_addr = 0x10200
+ real_addr = virt_addr
+ expected_insn = 0x5678
+ mem[real_addr] = expected_insn
+
+ yield i_in.priv_mode.eq(0)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(virt_addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(virt_addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+
+ print ("failed?", "yes" if failed else "no")
+ assert failed == 1
+ yield
+ yield
+
+ print("=== test loadstore instruction (instruction fault) ===")
+
+ yield from debug(dut, "instr fault")
+
+ virt_addr = 0x10200
+
+ yield ldst.priv_mode.eq(0)
+ yield ldst.instr_fault.eq(1)
+ yield ldst.maddr.eq(virt_addr)
+ # still broken -- investigate
+ # msr = MSRSpec(pr=?, dr=?, sf=0)
+ # ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+ yield
+ yield ldst.instr_fault.eq(0)
+ while True:
+ done = yield (ldst.done)
+ exc_info = yield from get_exception_info(pi.exc_o)
+ if done or exc_info.happened:
+ break
+ yield
+ assert exc_info.happened == 0 # assert just before doing the fault set zero
+ yield ldst.instr_fault.eq(0)
+ yield
+ yield
+ yield
+
+ print("=== test loadstore instruction (try instruction again) ===")
+ yield from debug(dut, "instr virt retry")
+ # set address to 0x10200, update mem[] to 5678
+ virt_addr = 0x10200
+ real_addr = virt_addr
+ expected_insn = 0x5678
+
+ yield i_in.priv_mode.eq(0)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(virt_addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ """
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(virt_addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+ """
+
+ ## part 4
+ nia, insn, valid, failed = yield from icache_read(dut,virt_addr,0,1)
+
+ yield from debug(dut, "test done")
+ yield
+ yield
+
+ print ("failed?", "yes" if failed else "no")
+ assert failed == 0
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ wbget.stop = True
+
+
def _test_loadstore1_invalid(dut, mem):
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ wbget.stop = False
print("=== test invalid ===")
addr = 0
- ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ msr = MSRSpec(pr=1, dr=0, sf=0) # set problem-state
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
print("ld_data", ld_data, exctype, exc)
assert (exctype == "slow")
invalid = exc.invalid
print("=== test invalid done ===")
- stop = True
+ wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test2(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x12000) # set process table
+ yield mmu.rin.pid.eq(0x1) # set PID=1
+ yield
+
+ addr = 0x124108
+ msr = MSRSpec(pr=1, dr=1, sf=1)
+
+ print("=== alignment error (ld) ===")
+
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data after mmu.bin test2")
+ print(ld_data)
+ assert ld_data == 0x0000000badc0ffee
+ assert exctype is None
+
+ wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test5(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x12000) # set process table
+ yield mmu.rin.pid.eq(0x1) # set PID=1
+ yield
+
+ addr = 0x39fffd
+ msr = MSRSpec(pr=1, dr=1, sf=1)
+
+ print("=== page-fault alignment error (ld) ===")
+
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+ print("ld_data after mmu.bin test5")
+ print(ld_data)
+ print (exctype, exc)
+
+ wbget.stop = True
+
+
+def test_pi_ld_misalign(pi, addr, data_len, msr):
+ for i in range(0,data_len):
+ ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+ yield
+ assert exc is None # use "is None" not "== None"
+ print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+
+
+def test_pi_st_ld_misalign(pi, addr, data_len, msr):
+ data = 0x0102030405060708
+ for i in range(0, data_len):
+ exctype, exc = yield from pi_st(pi, addr+i, data, data_len, msr=msr)
+ print (exctype, exc)
+ assert exc is None # use "is None" not "== None"
+ ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+ yield
+ assert exc is None # use "is None" not "== None"
+ print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+ assert ld_data == data
+
+
+def _test_loadstore1_misalign(dut, mem):
+ mmu = dut.submodules.mmu
+ pi = dut.submodules.ldst.pi
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
+
+ yield mmu.rin.prtbl.eq(0x12000) # set process table
+ yield mmu.rin.pid.eq(0x1) # set PID=1
+ #yield
+
+ addr = 1
+ msr = MSRSpec(pr=0, dr=0, sf=1)
+
+ yield from test_pi_ld_misalign(pi,0,8,msr)
+
+ yield from test_pi_st_ld_misalign(pi,0,8,msr)
+
+ wbget.stop = True
def _test_loadstore1(dut, mem):
mmu = dut.submodules.mmu
pi = dut.submodules.ldst.pi
- global stop
- stop = False
+ ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
addr = 0x100e0
data = 0xf553b658ba7e1f51
+ msr = MSRSpec(pr=0, dr=0, sf=0)
if test_dcbz:
- yield from pi_st(pi, addr, data, 8, msr_pr=1)
+ yield from pi_st(pi, addr, data, 8, msr=msr)
yield
- ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
assert ld_data == 0xf553b658ba7e1f51
assert exctype is None
- ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
assert ld_data == 0xf553b658ba7e1f51
assert exctype is None
print("do_dcbz ===============")
- yield from pi_st(pi, addr, data, 8, msr_pr=1, is_dcbz=1)
+ yield from pi_st(pi, addr, data, 8, msr=msr, is_dcbz=1)
print("done_dcbz ===============")
yield
- ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
print("ld_data after dcbz")
print(ld_data)
assert ld_data == 0
if test_exceptions:
print("=== alignment error (ld) ===")
addr = 0xFF100e0FF
- ld_data, exctype, exc, dar = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
if exc:
alignment = exc.alignment
happened = exc.happened
+ yield # wait for dsr to update
+ dar = yield ldst.dar
else:
alignment = 0
happened = 0
+ dar = 0
assert (happened == 1)
assert (alignment == 1)
assert (dar == addr)
print("=== alignment error (st) ===")
addr = 0xFF100e0FF
- exctype, exc, dar_o = yield from pi_st(pi, addr,0, 8, msr_pr=1)
+ exctype, exc = yield from pi_st(pi, addr,0, 8, msr=msr)
if exc:
alignment = exc.alignment
happened = exc.happened
if True:
print("=== no alignment error (ld) ===")
addr = 0x100e0
- ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
print("ld_data", ld_data, exctype, exc)
if exc:
alignment = exc.alignment
for addr in addrs:
print("== RANDOM addr ==",hex(addr))
- ld_data, exctype, exc, dar_o = \
- yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
print("ld_data[RANDOM]",ld_data,exc,addr)
assert (exctype == None)
for addr in addrs:
print("== RANDOM addr ==",hex(addr))
- exc = yield from pi_st(pi, addr,0xFF*addr, 8, msr_pr=1)
+ exc = yield from pi_st(pi, addr,0xFF*addr, 8, msr=msr)
assert (exctype == None)
# readback written data and compare
for addr in addrs:
print("== RANDOM addr ==",hex(addr))
- ld_data, exctype, exc, dar_o = \
- yield from pi_ld(pi, addr, 8, msr_pr=1)
+ ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
print("ld_data[RANDOM_READBACK]",ld_data,exc,addr)
assert (exctype == None)
assert (ld_data == 0xFF*addr)
print("== RANDOM addr done ==")
- stop = True
+ wbget.stop = True
+
+
+def _test_loadstore1_ifetch_invalid(dut, mem):
+ mmu = dut.submodules.mmu
+ ldst = dut.submodules.ldst
+ pi = ldst.pi
+ icache = dut.submodules.ldst.icache
+ wbget.stop = False
+
+ print("=== test loadstore instruction (invalid) ===")
+
+ i_in = icache.i_in
+ i_out = icache.i_out
+ i_m_in = icache.m_in
+
+ # first virtual memory test
+
+ print ("set process table")
+ yield from debug(dut, "set prtbl")
+ yield mmu.rin.prtbl.eq(0x1000000) # set process table
+ yield
+
+ yield from debug(dut, "real mem instruction")
+ # set address to zero, update mem[0] to 01234
+ addr = 8
+ expected_insn = 0x1234
+ mem[addr] = expected_insn
+
+ yield i_in.priv_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(addr)
+ yield
+ valid = yield i_out.valid
+ nia = yield i_out.nia
+ while not valid:
+ yield
+ valid = yield i_out.valid
+ yield i_in.req.eq(0)
+
+ nia = yield i_out.nia
+ insn = yield i_out.insn
+
+ yield
+ yield
+
+ print ("fetched %x from addr %x" % (insn, nia))
+ assert insn == expected_insn
+
+ print("=== test loadstore instruction (virtual) ===")
+ yield from debug(dut, "virtual instr req")
+
+ # look up i-cache expecting it to fail
+
+ # set address to 0x10200, update mem[] to 5678
+ virt_addr = 0x10200
+ real_addr = virt_addr
+ expected_insn = 0x5678
+ mem[real_addr] = expected_insn
+
+ yield i_in.priv_mode.eq(1)
+ yield i_in.virt_mode.eq(1)
+ yield i_in.req.eq(0)
+ yield i_in.nia.eq(virt_addr)
+ yield i_in.stop_mark.eq(0)
+ yield i_m_in.tlbld.eq(0)
+ yield i_m_in.tlbie.eq(0)
+ yield i_m_in.addr.eq(0)
+ yield i_m_in.pte.eq(0)
+ yield
+ yield
+ yield
+
+ # miss, stalls for a bit
+ yield i_in.req.eq(1)
+ yield i_in.nia.eq(virt_addr)
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ while not valid and not failed:
+ yield
+ valid = yield i_out.valid
+ failed = yield i_out.fetch_failed
+ yield i_in.req.eq(0)
+
+ print ("failed?", "yes" if failed else "no")
+ assert failed == 1
+ yield
+ yield
+
+ print("=== test invalid loadstore instruction (instruction fault) ===")
+
+ yield from debug(dut, "instr fault (perm err expected)")
+ virt_addr = 0x10200
+
+ yield ldst.priv_mode.eq(0)
+ yield ldst.instr_fault.eq(1)
+ yield ldst.maddr.eq(virt_addr)
+ #ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+ yield
+ yield ldst.instr_fault.eq(0)
+ while True:
+ done = yield (ldst.done)
+ exc_info = yield from get_exception_info(pi.exc_o)
+ if done or exc_info.happened:
+ break
+ yield
+ assert exc_info.happened == 1 # different here as expected
+
+ # TODO: work out what kind of exception occurred and check it's
+ # the right one. we *expect* it to be a permissions error because
+ # the RPTE leaf node in pagetables.test2 is marked as "non-executable"
+ # but we also expect instr_fault to be set because it is an instruction
+ # (iside) lookup
+ print (" MMU lookup exception type?")
+ for fname in LDSTExceptionTuple._fields:
+ print (" fname %20s %d" % (fname, getattr(exc_info, fname)))
+
+ # ok now printed them out and visually inspected: check them with asserts
+ assert exc_info.instr_fault == 1 # instruction fault (yes!)
+ assert exc_info.perm_error == 1 # permissions (yes!)
+ assert exc_info.rc_error == 0
+ assert exc_info.alignment == 0
+ assert exc_info.invalid == 0
+ assert exc_info.segment_fault == 0
+ assert exc_info.rc_error == 0
+
+ yield from debug(dut, "test done")
+ yield ldst.instr_fault.eq(0)
+ yield
+ yield
+ yield
+
+ wbget.stop = True
+
+
+def test_loadstore1_ifetch_unit_iface():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.test1
+
+ # set this up before passing to Simulator (which calls elaborate)
+ icache = m.submodules.ldst.icache
+ icache.use_fetch_interface() # this is the function which converts
+ # to FetchUnitInterface. *including*
+ # rewiring the Wishbone Bus to ibus
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch_iface(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+ with sim.write_vcd('test_loadstore1_ifetch_iface.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
+
+def test_loadstore1_ifetch():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.test1
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ icache = m.submodules.ldst.icache
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+ with sim.write_vcd('test_loadstore1_ifetch.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
def test_loadstore1():
sim.add_clock(1e-6)
sim.add_sync_process(wrap(_test_loadstore1(m, mem)))
- sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
with sim.write_vcd('test_loadstore1.vcd'):
sim.run()
+
+def test_loadstore1_microwatt_mmu_bin_test2():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.microwatt_test2
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test2(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_microwatt_mmu_test2.vcd'):
+ sim.run()
+
+
+def test_loadstore1_microwatt_mmu_bin_test5():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.microwatt_test5
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test5(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_microwatt_mmu_test5.vcd'):
+ sim.run()
+
+
+def test_loadstore1_misalign():
+
+ m, cmpi = setup_mmu()
+
+ mem = pagetables.microwatt_test2
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ ###########1122334455667788
+ mem[0] = 0x0102030405060708
+ mem[8] = 0xffffffffffffffff
+
+ sim.add_sync_process(wrap(_test_loadstore1_misalign(m, mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ with sim.write_vcd('test_loadstore1_misalign.vcd'):
+ sim.run()
+ print ("mem", mem)
+
+
def test_loadstore1_invalid():
m, cmpi = setup_mmu()
sim.add_clock(1e-6)
sim.add_sync_process(wrap(_test_loadstore1_invalid(m, mem)))
- sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
with sim.write_vcd('test_loadstore1_invalid.vcd'):
sim.run()
+
+def test_loadstore1_ifetch_invalid():
+ m, cmpi = setup_mmu()
+
+ # this is a specially-arranged page table which has the permissions
+ # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+ mem = pagetables.test2
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ icache = m.submodules.ldst.icache
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch_invalid(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+ with sim.write_vcd('test_loadstore1_ifetch_invalid.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
+
+def test_loadstore1_ifetch_multi():
+ m, cmpi = setup_mmu()
+ wbget.stop = False
+
+ # this is a specially-arranged page table which has the permissions
+ # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+ mem = pagetables.test1
+
+ # set this up before passing to Simulator (which calls elaborate)
+ icache = m.submodules.ldst.icache
+ icache.use_fetch_interface() # this is the function which converts
+ # to FetchUnitInterface. *including*
+ # rewiring the Wishbone Bus to ibus
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(_test_loadstore1_ifetch_multi(m, mem)))
+ # add two wb_get_classic processes onto the *same* memory dictionary.
+ # this shouuuld work.... cross-fingers...
+ sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+ sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+ with sim.write_vcd('test_loadstore1_ifetch_multi.vcd',
+ traces=[m.debug_status]): # include extra debug
+ sim.run()
+
if __name__ == '__main__':
- test_loadstore1()
- test_loadstore1_invalid()
+ #test_loadstore1()
+ #test_loadstore1_microwatt_mmu_bin_test2()
+ #test_loadstore1_microwatt_mmu_bin_test5()
+ #test_loadstore1_invalid()
+ #test_loadstore1_ifetch() #FIXME
+ #test_loadstore1_ifetch_invalid()
+ #test_loadstore1_ifetch_unit_iface() # guess: should be working
+ #test_loadstore1_ifetch_multi()
+ test_loadstore1_misalign()
from soc.experiment.mmu import MMU
from soc.experiment.dcache import DCache
from soc.experiment.icache import ICache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
import random
-stop = False
-
-def set_stop(newval):
- global stop
- stop = newval
-
+wbget.stop = False
def b(x):
return int.from_bytes(x.to_bytes(8, byteorder='little'),
}
-def wb_get(c, mem, name):
- """simulator process for getting memory load requests
- """
-
- logfile = open("/tmp/wb_get.log","w")
-
- def log(msg):
- logfile.write(msg+"\n")
- print(msg)
-
- global stop
- while not stop:
- while True: # wait for dc_valid
- if stop:
- log("stop")
- return
- cyc = yield (c.wb_out.cyc)
- stb = yield (c.wb_out.stb)
- if cyc and stb:
- break
- yield
- addr = (yield c.wb_out.adr) << 3
- if addr not in mem:
- log("%s LOOKUP FAIL %x" % (name, addr))
- stop = True
- return
-
- yield
- data = mem[addr]
- yield c.wb_in.dat.eq(data)
- log("%s get %x data %x" % (name, addr, data))
- yield c.wb_in.ack.eq(1)
- yield
- yield c.wb_in.ack.eq(0)
- yield
-
-
def icache_sim(dut, mem):
i_out = dut.i_in
i_in = dut.i_out
m_out = dut.m_in
+ wbget.stop = False
+
for k,v in mem.items():
yield i_in.valid.eq(0)
yield i_out.priv_mode.eq(1)
yield i_out.req.eq(0)
yield
+ wbget.stop = True
def test_icache_il():
dut = ICache()
# read from "memory" process and corresponding wishbone "read" process
sim.add_sync_process(wrap(icache_sim(icache, mem)))
- sim.add_sync_process(wrap(wb_get(icache, mem, "ICACHE")))
+ sim.add_sync_process(wrap(wb_get(icache.bus, mem, "ICACHE")))
with sim.write_vcd('test_icache.vcd'):
sim.run()
def mmu_lookup(mmu, addr):
- global stop
yield mmu.l_in.load.eq(1)
yield mmu.l_in.priv.eq(1)
yield mmu.l_in.addr.eq(addr)
yield mmu.l_in.valid.eq(1)
- while not stop: # wait for dc_valid / err
+
+ print ("mmu lookup %x stopped" % addr, wbget.stop)
+ while not wbget.stop: # wait for dc_valid / err
+ print ("stopped", wbget.stop)
l_done = yield (mmu.l_out.done)
l_err = yield (mmu.l_out.err)
l_badtree = yield (mmu.l_out.badtree)
def mmu_sim(mmu):
- global stop
+ wbget.stop = False
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
phys_addr = yield from mmu_lookup(mmu, 0x10000)
assert phys_addr == 0x40000
+ yield
- stop = True
+ wbget.stop = True
def test_mmu():
sim.add_clock(1e-6)
sim.add_sync_process(wrap(mmu_sim(mmu)))
- sim.add_sync_process(wrap(wb_get(dcache, default_mem, "DCACHE")))
+ sim.add_sync_process(wrap(wb_get(dcache.bus,
+ default_mem, "DCACHE")))
with sim.write_vcd('test_mmu.vcd'):
sim.run()
from soc.experiment.mmu import MMU
from soc.experiment.dcache import DCache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
#more imports
# will take at least one week (10.10.2020)
# many unconnected signals
+def b(x):
+ return int.from_bytes(x.to_bytes(8, byteorder='little'),
+ byteorder='big', signed=False)
+
+mem = {0x10000: # PARTITION_TABLE_2
+ # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+ b(0x800000000100000b),
+
+ 0x30000: # RADIX_ROOT_PTE
+ # V = 1 L = 0 NLB = 0x400 NLS = 9
+ b(0x8000000000040009),
+
+ 0x40000: # RADIX_SECOND_LEVEL
+ # V = 1 L = 1 SW = 0 RPN = 0
+ # R = 1 C = 1 ATT = 0 EAA 0x7
+ b(0xc000000000000187),
+
+ 0x1000000: # PROCESS_TABLE_3
+ # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+ b(0x40000000000300ad),
+ }
+
class TestMicrowattMemoryPortInterface(PortInterfaceBase):
"""TestMicrowattMemoryPortInterface
self.mmu = mmu
self.dcache = dcache
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
m.d.comb += self.dcache.d_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.load.eq(0)
- m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+ m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
m.d.comb += self.mmu.l_in.valid.eq(1)
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.dcache.d_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.addr.eq(addr)
m.d.comb += self.mmu.l_in.load.eq(1)
- m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+ m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
m.d.comb += self.mmu.l_in.valid.eq(1)
def set_wr_data(self, m, data, wen):
yield from super().ports()
# TODO: memory ports
-stop = False
-
-
-def wb_get(dc):
- """simulator process for getting memory load requests
- """
-
- global stop
-
- def b(x):
- return int.from_bytes(x.to_bytes(8, byteorder='little'),
- byteorder='big', signed=False)
-
- mem = {0x10000: # PARTITION_TABLE_2
- # PATB_GR=1 PRTB=0x1000 PRTS=0xb
- b(0x800000000100000b),
-
- 0x30000: # RADIX_ROOT_PTE
- # V = 1 L = 0 NLB = 0x400 NLS = 9
- b(0x8000000000040009),
-
- 0x40000: # RADIX_SECOND_LEVEL
- # V = 1 L = 1 SW = 0 RPN = 0
- # R = 1 C = 1 ATT = 0 EAA 0x7
- b(0xc000000000000187),
-
- 0x1000000: # PROCESS_TABLE_3
- # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
- b(0x40000000000300ad),
- }
-
- while not stop:
- while True: # wait for dc_valid
- if stop:
- return
- cyc = yield (dc.wb_out.cyc)
- stb = yield (dc.wb_out.stb)
- if cyc and stb:
- break
- yield
- addr = (yield dc.wb_out.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- data = mem.get(addr, 0)
- yield dc.wb_in.dat.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
- yield dc.wb_in.ack.eq(1)
- yield
- yield dc.wb_in.ack.eq(0)
- yield
+wbget.stop = False
def mmu_lookup(dut, addr):
mmu = dut.mmu
- global stop
print("pi_ld")
yield from pi_ld(dut.pi, addr, 1)
def mmu_sim(dut):
mmu = dut.mmu
- global stop
yield mmu.rin.prtbl.eq(0x1000000) # set process table
yield
phys_addr = yield from mmu_lookup(dut, 0x10000)
assert phys_addr == 0x40000
- stop = True
+ wbget.stop = True
def test_mmu():
sim.add_clock(1e-6)
sim.add_sync_process(wrap(mmu_sim(dut)))
- sim.add_sync_process(wrap(wb_get(dcache)))
+ sim.add_sync_process(wrap(wb_get(dcache.bus, mem)))
with sim.write_vcd('test_mmu_pi.vcd'):
sim.run()
-def wb_get(dut):
- """simulator process for getting memory load requests
- """
- mem = dut.mem
- wb = dut.cmpi.wb_bus()
+from openpower.test.wb_get import wb_get
- while not dut.stop:
- while True: # wait for dc_valid
- if dut.stop:
- return
- cyc = yield (wb.cyc)
- stb = yield (wb.stb)
- if cyc and stb:
- break
- yield
- addr = (yield wb.adr) << 3
- if addr not in mem:
- print (" WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
- # read or write?
- we = (yield wb.we)
- if we:
- store = (yield wb.dat_w)
- sel = (yield wb.sel)
- data = mem.get(addr, 0)
- # note we assume 8-bit sel, here
- res = 0
- for i in range(8):
- mask = 0xff << (i*8)
- if sel & (1<<i):
- res |= store & mask
- else:
- res |= data & mask
- mem[addr] = res
- print (" DCACHE set %x mask %x data %x" % (addr, sel, res))
- else:
- data = mem.get(addr, 0)
- yield wb.dat_r.eq(data)
- print (" DCACHE get %x data %x" % (addr, data))
-
- yield wb.ack.eq(1)
- yield
- yield wb.ack.eq(0)
- yield
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+ pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
m.submodules.dut = dut = ALUInputStage(pspec)
a = Signal(64)
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
width = p.width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = ALUMainStage(pspec)
# convenience variables
a = dut.i.a
b = dut.i.b
ca_in = dut.i.xer_ca[0] # CA carry in
- ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+ ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
so_in = dut.i.xer_so # SO sticky overflow
ca_o = dut.o.xer_ca.data[0] # CA carry out
- ca32_o = dut.o.xer_ca.data[1] # CA32 carry out32
+ ca32_o = dut.o.xer_ca.data[1] # CA32 carry out32
ov_o = dut.o.xer_ov.data[0] # OV overflow
- ov32_o = dut.o.xer_ov.data[1] # OV32 overflow32
+ ov32_o = dut.o.xer_ov.data[1] # OV32 overflow32
o = dut.o.o.data
# setup random inputs
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = ALUOutputStage(pspec)
o = Signal(64)
return m
+
class GTCombinerTestCase(FHDLTestCase):
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
return ALUOutputData(self.pspec) # defines pipeline stage output format
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
comb += b_i.eq(b) # into trap pipeline
with m.Elif(is_32bit):
with m.If(op.is_signed):
- comb += a_i.eq(exts(a, 32, 64))
- comb += b_i.eq(exts(b, 32, 64))
+ comb += a_i.eq(exts(a, 32, XLEN))
+ comb += b_i.eq(exts(b, 32, XLEN))
with m.Else():
- comb += a_i.eq(extz(a, 32, 64))
- comb += b_i.eq(extz(b, 32, 64))
+ comb += a_i.eq(extz(a, 32, XLEN))
+ comb += b_i.eq(extz(b, 32, XLEN))
with m.Else():
comb += a_i.eq(a)
comb += b_i.eq(b)
#### CMP, CMPL v3.0B p85-86
with m.Case(MicrOp.OP_CMP):
- a_n = Signal(64) # temporary - inverted a
+ a_n = Signal(XLEN) # temporary - inverted a
tval = Signal(5)
a_lt = Signal()
carry_32 = Signal()
# this is supposed to be inverted (b-a, not a-b)
comb += a_n.eq(~a) # sigh a gets inverted
- comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
- comb += carry_64.eq(add_o[65])
+ if XLEN == 64:
+ comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
+ else:
+ comb += carry_32.eq(add_o[XLEN+1])
+ comb += carry_64.eq(add_o[XLEN+1])
comb += zerolo.eq(~((a_n[0:32] ^ b[0:32]).bool()))
- comb += zerohi.eq(~((a_n[32:64] ^ b[32:64]).bool()))
+ comb += zerohi.eq(~((a_n[32:XLEN] ^ b[32:XLEN]).bool()))
with m.If(zerolo & (is_32bit | zerohi)):
# values are equal
comb += tval[2].eq(1)
with m.Else():
- comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[63]))
- comb += msb_b.eq(Mux(is_32bit, b[31], b[63]))
+ comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[XLEN-1]))
+ comb += msb_b.eq(Mux(is_32bit, b[31], b[XLEN-1]))
C0 = Const(0, 1)
with m.If(msb_a != msb_b):
# Subtraction might overflow, but
# https://bugs.libre-soc.org/show_bug.cgi?id=319#c5
ca = Signal(2, reset_less=True)
comb += ca[0].eq(add_o[-1]) # XER.CA
- comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+ if XLEN == 64:
+ comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+ else:
+ comb += ca[1].eq(add_o[-1]) # XER.CA32
comb += cry_o.data.eq(ca)
comb += cry_o.ok.eq(1)
# 32-bit (ov[1]) and 64-bit (ov[0]) overflow
ov = Signal(2, reset_less=True)
comb += ov[0].eq(calc_ov(a_i[-1], b_i[-1], ca[0], add_o[-2]))
- comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1], add_o[32]))
+ if XLEN == 64:
+ comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1],
+ add_o[32]))
+ else:
+ comb += ov[1].eq(calc_ov(a_i[-1], b_i[-1], ca[0],
+ add_o[-2]))
comb += ov_o.data.eq(ov)
comb += ov_o.ok.eq(1)
with m.Case(MicrOp.OP_EXTS):
with m.If(op.data_len == 1):
- comb += o.data.eq(exts(a, 8, 64))
+ comb += o.data.eq(exts(a, 8, XLEN))
with m.If(op.data_len == 2):
- comb += o.data.eq(exts(a, 16, 64))
+ comb += o.data.eq(exts(a, 16, XLEN))
with m.If(op.data_len == 4):
- comb += o.data.eq(exts(a, 32, 64))
+ comb += o.data.eq(exts(a, 32, XLEN))
comb += o.ok.eq(1) # output register
###################
class ALUInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB/immediate
- ('XER', 'xer_so', '32'), # XER bit 32: SO
- ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b = self.ra, self.rb
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), # XER bit 32: SO
+ ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+
+
class ALUOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'),
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_ca', '34,45'), # bit0: ca, bit1: ca32
- ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
- ('XER', 'xer_so', '32')]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_ca', '34,45'), # bit0: ca, bit1: ca32
+ ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
+ ('XER', 'xer_so', '32')]
+
+
class ALUPipeSpec(CommonPipeSpec):
- regspec = (ALUInputData.regspec, ALUOutputData.regspec)
opsubsetkls = CompALUOpSubset
+ regspecklses = (ALUInputData, ALUOutputData)
from soc.fu.alu.output_stage import ALUOutputStage
-class ALUStagesOld(PipeModBaseChain):
+class ALUStages(PipeModBaseChain):
def get_chain(self):
inp = ALUInputStage(self.pspec)
main = ALUMainStage(self.pspec)
- return [inp, main, out]
-
-
-class ALUStageEnd(PipeModBaseChain):
- def get_chain(self):
out = ALUOutputStage(self.pspec)
- return [out]
+ return [inp, main, out]
-class ALUBasePipeOld(ControlBase):
+class ALUBasePipe(ControlBase):
def __init__(self, pspec):
ControlBase.__init__(self)
self.pspec = pspec
self.pipe1 = ALUStages(pspec)
- self.pipe2 = ALUStageEnd(pspec)
- self._eqs = self.connect([self.pipe1, self.pipe2])
+ self._eqs = self.connect([self.pipe1])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
m.submodules.pipe1 = self.pipe1
- m.submodules.pipe2 = self.pipe2
m.d.comb += self._eqs
return m
-
-class ALUStages(PipeModBaseChain):
+class ALUStages1(PipeModBaseChain):
def get_chain(self):
inp = ALUInputStage(self.pspec)
+ return [inp]
+
+class ALUStages2(PipeModBaseChain):
+ def get_chain(self):
main = ALUMainStage(self.pspec)
+ return [main]
+
+
+class ALUStages3(PipeModBaseChain):
+ def get_chain(self):
out = ALUOutputStage(self.pspec)
- return [inp, main, out]
+ return [out]
class ALUBasePipe(ControlBase):
def __init__(self, pspec):
ControlBase.__init__(self)
self.pspec = pspec
- self.pipe1 = ALUStages(pspec)
- self._eqs = self.connect([self.pipe1])
+ self.pipe1 = ALUStages1(pspec)
+ self.pipe2 = ALUStages2(pspec)
+ self.pipe3 = ALUStages3(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
- m.submodules.pipe1 = self.pipe1
+ m.submodules.logical_pipe1 = self.pipe1
+ m.submodules.logical_pipe2 = self.pipe2
+ m.submodules.logical_pipe3 = self.pipe3
m.d.comb += self._eqs
return m
+
class ALUIAllCases(ALUTestCase):
def case_ilang(self):
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
alu = ALUBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("alu_pipeline.il", "w") as f:
class TestRunner(unittest.TestCase):
- def execute(self, alu,instruction, pdecode2, test):
+ def execute(self, alu, instruction, pdecode2, test):
program = test.program
sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
test.mem, test.msr,
fn_unit = yield pdecode2.e.do.fn_unit
asmcode = yield pdecode2.e.asmcode
dec_asmcode = yield pdecode2.dec.op.asmcode
- print ("asmcode", asmcode, dec_asmcode)
+ print("asmcode", asmcode, dec_asmcode)
self.assertEqual(fn_unit, Function.ALU.value)
yield from set_alu_inputs(alu, pdecode2, sim)
yield Settle()
def test_it(self):
- test_data = ALUTestCase().test_data
+ test_data = ALUTestCase({'soc'}).test_data
m = Module()
comb = m.d.comb
instruction = Signal(32)
opkls = ALUPipeSpec.opsubsetkls
pdecode = create_pdecode()
- m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode, opkls, fn_name)
+ m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+ pdecode, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = ALUPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = ALUBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+ pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
m.submodules.dut = dut = ALUInputStage(pspec)
a = Signal(64)
return m
+
class GTCombinerTestCase(FHDLTestCase):
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = BranchPipeSpec(id_wid=2)
+ pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = BranchMainStage(pspec)
# convenience aliases
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
class BranchPipeSpec(CommonPipeSpec):
- regspec = (BranchInputData.regspec, BranchOutputData.regspec)
+ regspecklses = (BranchInputData, BranchOutputData)
opsubsetkls = CompBROpSubset
from nmutil.singlepipe import ControlBase
from nmutil.pipemodbase import PipeModBaseChain
from soc.fu.branch.main_stage import BranchMainStage
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.branch.pipe_data import BranchInputData
+from nmigen import Module
+
+# gives a 1-clock delay to stop combinatorial link between in and out
+class DummyBranchStage(PipeModBase):
+ def __init__(self, pspec): super().__init__(pspec, "dummy")
+ def ispec(self): return BranchInputData(self.pspec)
+ def ospec(self): return BranchInputData(self.pspec)
+
+ def elaborate(self, platform):
+ m = Module()
+ m.d.comb += self.o.eq(self.i) # pass-through output
+ return m
+
+class BranchDummyStages(PipeModBaseChain):
+ def get_chain(self):
+ dummy = DummyBranchStage(self.pspec)
+ return [dummy]
+
class BranchStages(PipeModBaseChain):
def get_chain(self):
def __init__(self, pspec):
ControlBase.__init__(self)
self.pspec = pspec
- self.pipe1 = BranchStages(pspec)
- self._eqs = self.connect([self.pipe1])
+ self.pipe1 = BranchDummyStages(pspec)
+ self.pipe2 = BranchStages(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
- m.submodules.pipe = self.pipe1
+ m.submodules.pipe1 = self.pipe1
+ m.submodules.pipe2 = self.pipe2
m.d.comb += self._eqs
return m
class BranchAllCases(BranchTestCase):
def case_ilang(self):
- pspec = BranchPipeSpec(id_wid=2)
+ pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
alu = BranchBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("branch_pipeline.il", "w") as f:
class TestRunner(unittest.TestCase):
def test_it(self):
- test_data = BranchAllCases().test_data
+ test_data = BranchTestCase().test_data
+ print ("test data", test_data)
m = Module()
comb = m.d.comb
instruction = Signal(32)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = BranchPipeSpec(id_wid=2)
+ pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.branch = branch = BranchBasePipe(pspec)
comb += branch.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
print(index)
ins, code = instructions[index]
- print("0x{:X}".format(ins & 0xffffffff))
+ print("insn 0x{:X}".format(ins & 0xffffffff))
print(code)
# ask the decoder to decode this binary data (endian'd)
super().__init__(pspec, "output")
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
op = self.i.ctx.op
# XXX ah. right. this needs to be done only if the *mode* is 32-bit
# (an MSR bit)
# see https://bugs.libre-soc.org/show_bug.cgi?id=424
- target = Signal(64, reset_less=True)
+ target = Signal(XLEN, reset_less=True)
#with m.If(op.is_32bit):
# comb += target.eq(o[:32])
#with m.Else():
to actually read (and write) the correct register number
"""
- def __init__(self, speckls, pipekls, idx):
+ def __init__(self, speckls, pipekls, idx, parent_pspec):
alu_name = "alu_%s%d" % (self.fnunit.name.lower(), idx)
- pspec = speckls(id_wid=2) # spec (NNNPipeSpec instance)
+ # spec (NNNPipeSpec instance)
+ pspec = speckls(id_wid=2, parent_pspec=parent_pspec)
opsubset = pspec.opsubsetkls # get the operand subset class
- regspec = pspec.regspec # get the regspec
+ rsk = pspec.regspecklses # get the regspec classes
+ regspec = []
+ for kls in rsk:
+ regspec.append(kls(pspec).regspec)
+ print ("regspecs", regspec)
alu = pipekls(pspec) # create actual NNNBasePipe
self.pspec = pspec
super().__init__(regspec, alu, opsubset, name=alu_name) # MultiCompUnit
ideal (it could be a lot neater) but works for now.
"""
- def __init__(self, speckls, pipekls, num_rows):
+ def __init__(self, speckls, pipekls, num_rows, parent_pspec):
id_wid = num_rows.bit_length()
- pspec = speckls(id_wid=id_wid) # spec (NNNPipeSpec instance)
- opsubset = pspec.opsubsetkls # get the operand subset class
- regspec = pspec.regspec # get the regspec
- alu = pipekls(pspec) # create actual NNNBasePipe
+
+ # spec (NNNPipeSpec instance)
+ pspec = speckls(id_wid=id_wid, parent_pspec=parent_pspec)
self.pspec = pspec
+ opsubset = pspec.opsubsetkls # get the operand subset class
+ rsk = pspec.regspecklses # get the regspec classes
+ regspec = []
+ for kls in rsk:
+ regspec.append(kls(pspec).regspec)
+ print ("regspecs", regspec)
+ alu = pipekls(pspec) # create actual NNNBasePipe
alu_name = self.fnunit.name.lower()
super().__init__(alu, num_rows, alu_name) # initialise fan-in/fan-out
self.cu = []
######################################################################
###### actual Function Units: these are "single" stage pipelines #####
-#class ALUFunctionUnit(FunctionUnitBaseSingle):
+# class ALUFunctionUnit(FunctionUnitBaseSingle):
+
+
class ALUFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.ALU
- def __init__(self, num_rses):
- super().__init__(ALUPipeSpec, ALUBasePipe, num_rses)
+ def __init__(self, num_rses, parent_pspec):
+ super().__init__(ALUPipeSpec, ALUBasePipe, num_rses, parent_pspec)
-#class LogicalFunctionUnit(FunctionUnitBaseSingle):
+# class LogicalFunctionUnit(FunctionUnitBaseSingle):
class LogicalFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.LOGICAL
- def __init__(self, idx):
- super().__init__(LogicalPipeSpec, LogicalBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(LogicalPipeSpec, LogicalBasePipe, idx, parent_pspec)
-#class CRFunctionUnit(FunctionUnitBaseSingle):
+# class CRFunctionUnit(FunctionUnitBaseSingle):
class CRFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.CR
- def __init__(self, idx):
- super().__init__(CRPipeSpec, CRBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(CRPipeSpec, CRBasePipe, idx, parent_pspec)
-#class BranchFunctionUnit(FunctionUnitBaseSingle):
+# class BranchFunctionUnit(FunctionUnitBaseSingle):
class BranchFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.BRANCH
- def __init__(self, idx):
- super().__init__(BranchPipeSpec, BranchBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(BranchPipeSpec, BranchBasePipe, idx, parent_pspec)
-#class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
+# class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
class ShiftRotFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.SHIFT_ROT
- def __init__(self, idx):
- super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx, parent_pspec)
class DivFSMFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.DIV
- def __init__(self, idx):
- super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx, parent_pspec)
class MMUFSMFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.MMU
- def __init__(self, idx):
- super().__init__(MMUPipeSpec, FSMMMUStage, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(MMUPipeSpec, FSMMMUStage, idx, parent_pspec)
+ self.exc_o = self.alu.exc_o # get at MMU exception
class DivPipeFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.DIV
- def __init__(self, idx):
- super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx, parent_pspec)
-#class MulFunctionUnit(FunctionUnitBaseSingle):
+# class MulFunctionUnit(FunctionUnitBaseSingle):
class MulFunctionUnit(FunctionUnitBaseMulti):
fnunit = Function.MUL
- def __init__(self, idx):
- super().__init__(MulPipeSpec, MulBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(MulPipeSpec, MulBasePipe, idx, parent_pspec)
class TrapFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.TRAP
- def __init__(self, idx):
- super().__init__(TrapPipeSpec, TrapBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(TrapPipeSpec, TrapBasePipe, idx, parent_pspec)
class SPRFunctionUnit(FunctionUnitBaseSingle):
fnunit = Function.SPR
- def __init__(self, idx):
- super().__init__(SPRPipeSpec, SPRBasePipe, idx)
+ def __init__(self, idx, parent_pspec):
+ super().__init__(SPRPipeSpec, SPRBasePipe, idx, parent_pspec)
# special-case: LD/ST conforms to the CompUnit API but is not a pipeline
class LDSTFunctionUnit(LDSTCompUnit):
fnunit = Function.LDST
- def __init__(self, pi, awid, idx):
+ def __init__(self, pi, awid, idx, parent_pspec):
alu_name = "ldst_%s%d" % (self.fnunit.name.lower(), idx)
- pspec = LDSTPipeSpec(id_wid=2) # spec (NNNPipeSpec instance)
+ # spec (NNNPipeSpec instance)
+ pspec = LDSTPipeSpec(id_wid=2, parent_pspec=parent_pspec)
opsubset = pspec.opsubsetkls # get the operand subset class
- regspec = pspec.regspec # get the regspec
+ rsk = pspec.regspecklses # get the regspec classes
+ regspec = []
+ for kls in rsk:
+ regspec.append(kls(pspec).regspec)
+ print ("regspecs", regspec)
self.opsubsetkls = opsubset
super().__init__(pi, regspec, awid, opsubset, name=alu_name)
for name, qty in units.items():
kls = alus[name]
if issubclass(kls, FunctionUnitBaseMulti):
- fu = kls(qty) # create just the one ALU but many "fronts"
- self.actual_alus[name] = fu # to be made a module of AllFUs
+ # create just the one ALU but many "fronts"
+ fu = kls(qty, parent_pspec=pspec)
+ self.actual_alus[name] = fu # to be made a module of AllFUs
for i in range(qty):
self.fus["%s%d" % (name, i)] = fu.cu[i]
else:
for i in range(qty):
- self.fus["%s%d" % (name, i)] = kls(i)
+ self.fus["%s%d" % (name, i)] = kls(i, parent_pspec=pspec)
# debug print for MMU ALU
if microwatt_mmu:
# if any PortInterfaces, we want LDST Units.
if pilist is None:
return
- print ("pilist", pilist)
+ print("pilist", pilist)
for i, pi in enumerate(pilist):
- self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i)
+ self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i, pspec)
# extract exceptions from any FunctionUnits for easy access
self.excs = {}
for name, alu in self.fus.items():
if hasattr(alu, "exc_o"):
- print ("FU exceptions", name, type(alu.exc_o), alu.exc_o)
+ print("FU exceptions", name, type(alu.exc_o), alu.exc_o)
self.excs[name] = alu.exc_o
def get_exc(self, name):
def tst_all_fus():
pspec = TestMemPspec(ldst_ifacetype='testpi',
imem_ifacetype='',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64)
dut = AllFunctionUnits(pspec)
self.funit = funit
self.bigendian = bigendian
- def execute(self, cu, l0, instruction, pdecode2, simdec2, test):
+ def execute(self, m, cu, l0, instruction, pdecode2, simdec2, test):
program = test.program
print("test", test.name, test.mem)
# set operand and get inputs
yield from set_operand(cu, pdecode2, sim)
# reset read-operand mask
- rdmask = get_rdflags(pdecode2.e, cu)
+ rdmask = get_rdflags(m, pdecode2.e, cu)
#print ("hardcoded rdmask", cu.rdflags(pdecode2.e))
#print ("decoder rdmask", rdmask)
yield cu.rdmaskn.eq(~rdmask)
m.d.comb += cu.ad.go_i.eq(cu.ad.rel_o) # link addr direct to rel
m.d.comb += cu.st.go_i.eq(cu.st.rel_o) # link store direct to rel
else:
- m.submodules.cu = cu = self.fukls(0)
+ m.submodules.cu = cu = self.fukls(0, parent_pspec=None)
l0 = None
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
for test in self.test_data:
print(test.name)
with self.subTest(test.name):
- yield from self.execute(cu, l0, instruction,
+ yield from self.execute(m, cu, l0, instruction,
pdecode2, simdec2,
test)
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = CRMainStage(pspec)
full_cr_in = Signal(32)
# into cr_a
comb += dut.i.cr_a.eq(cr_input_arr[bc])
-
# For OP_CROP, we need to input the corresponding CR
# registers for BA, BB, and BT
with m.Case(MicrOp.OP_CROP):
comb += Assert(o[4*i:4*i+4] == cr[4*i:4*i+4])
with m.Else():
comb += Assert(o[4*i:4*i+4] == 0)
- with m.Else(): # mfcrf
+ with m.Else(): # mfcrf
comb += Assert(o == cr)
comb += o_ok.eq(1)
with m.Case(MicrOp.OP_SETB):
with m.If(cr_arr[4*bfa]):
- comb += Assert(o == ((1<<64)-1))
+ comb += Assert(o == ((1 << 64)-1))
with m.Elif(cr_arr[4*bfa+1]):
comb += Assert(o == 1)
with m.Else():
def test_formal(self):
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
class CRPipeSpec(CommonPipeSpec):
- regspec = (CRInputData.regspec, CROutputData.regspec)
+ regspecklses = (CRInputData, CROutputData)
opsubsetkls = CompCROpSubset
class CRIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = CRPipeSpec(id_wid=2)
+ pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
alu = CRBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("cr_pipeline.il", "w") as f:
if whole_reg_ok:
full_cr = yield alu.n.o_data.full_cr.data & full_cr_mask
expected_cr = simulator.cr.value
- print("CR whole: expected %x, actual: %x mask: %x" % \
- (expected_cr, full_cr, full_cr_mask))
+ print("CR whole: expected %x, actual: %x mask: %x" %
+ (expected_cr, full_cr, full_cr_mask))
# HACK: only look at the bits that we expected to change
self.assertEqual(expected_cr & full_cr_mask, full_cr, code)
elif cr_en:
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = CRPipeSpec(id_wid=2)
+ pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.alu = alu = CRBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
--- /dev/null
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from collections import defaultdict
+import logging
+import math
+import enum
+from fractions import Fraction
+from types import FunctionType
+from functools import lru_cache
+from nmigen.hdl.ast import Signal, unsigned, signed, Const
+from nmigen.hdl.dsl import Module, Elaboratable
+from nmigen.hdl.mem import Memory
+from nmutil.clz import CLZ
+from nmutil.plain_data import plain_data, fields, replace
+
+try:
+ from functools import cached_property
+except ImportError:
+ from cached_property import cached_property
+
+# fix broken IDE type detection for cached_property
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+ from functools import cached_property
+
+
+_NOT_FOUND = object()
+
+
+def cache_on_self(func):
+ """like `functools.cached_property`, except for methods. unlike
+ `lru_cache` the cache is per-class instance rather than a global cache
+ per-method."""
+
+ assert isinstance(func, FunctionType), \
+ "non-plain methods are not supported"
+
+ cache_name = func.__name__ + "__cache"
+
+ def wrapper(self, *args, **kwargs):
+ # specifically access through `__dict__` to bypass frozen=True
+ cache = self.__dict__.get(cache_name, _NOT_FOUND)
+ if cache is _NOT_FOUND:
+ self.__dict__[cache_name] = cache = {}
+ key = (args, *kwargs.items())
+ retval = cache.get(key, _NOT_FOUND)
+ if retval is _NOT_FOUND:
+ retval = func(self, *args, **kwargs)
+ cache[key] = retval
+ return retval
+
+ wrapper.__doc__ = func.__doc__
+ return wrapper
+
+
+@enum.unique
+class RoundDir(enum.Enum):
+ DOWN = enum.auto()
+ UP = enum.auto()
+ NEAREST_TIES_UP = enum.auto()
+ ERROR_IF_INEXACT = enum.auto()
+
+
+@plain_data(frozen=True, eq=False, repr=False)
+class FixedPoint:
+ __slots__ = "bits", "frac_wid"
+
+ def __init__(self, bits, frac_wid):
+ self.bits = bits
+ self.frac_wid = frac_wid
+ assert isinstance(self.bits, int)
+ assert isinstance(self.frac_wid, int) and self.frac_wid >= 0
+
+ @staticmethod
+ def cast(value):
+ """convert `value` to a fixed-point number with enough fractional
+ bits to preserve its value."""
+ if isinstance(value, FixedPoint):
+ return value
+ if isinstance(value, int):
+ return FixedPoint(value, 0)
+ if isinstance(value, str):
+ value = value.strip()
+ neg = value.startswith("-")
+ if neg or value.startswith("+"):
+ value = value[1:]
+ if value.startswith(("0x", "0X")) and "." in value:
+ value = value[2:]
+ got_dot = False
+ bits = 0
+ frac_wid = 0
+ for digit in value:
+ if digit == "_":
+ continue
+ if got_dot:
+ if digit == ".":
+ raise ValueError("too many `.` in string")
+ frac_wid += 4
+ if digit == ".":
+ got_dot = True
+ continue
+ if not digit.isalnum():
+ raise ValueError("invalid hexadecimal digit")
+ bits <<= 4
+ bits |= int("0x" + digit, base=16)
+ else:
+ bits = int(value, base=0)
+ frac_wid = 0
+ if neg:
+ bits = -bits
+ return FixedPoint(bits, frac_wid)
+
+ if isinstance(value, float):
+ n, d = value.as_integer_ratio()
+ log2_d = d.bit_length() - 1
+ assert d == 1 << log2_d, ("d isn't a power of 2 -- won't ever "
+ "fail with float being IEEE 754")
+ return FixedPoint(n, log2_d)
+ raise TypeError("can't convert type to FixedPoint")
+
+ @staticmethod
+ def with_frac_wid(value, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+ """convert `value` to the nearest fixed-point number with `frac_wid`
+ fractional bits, rounding according to `round_dir`."""
+ assert isinstance(frac_wid, int) and frac_wid >= 0
+ assert isinstance(round_dir, RoundDir)
+ if isinstance(value, Fraction):
+ numerator = value.numerator
+ denominator = value.denominator
+ else:
+ value = FixedPoint.cast(value)
+ numerator = value.bits
+ denominator = 1 << value.frac_wid
+ if denominator < 0:
+ numerator = -numerator
+ denominator = -denominator
+ bits, remainder = divmod(numerator << frac_wid, denominator)
+ if round_dir == RoundDir.DOWN:
+ pass
+ elif round_dir == RoundDir.UP:
+ if remainder != 0:
+ bits += 1
+ elif round_dir == RoundDir.NEAREST_TIES_UP:
+ if remainder * 2 >= denominator:
+ bits += 1
+ elif round_dir == RoundDir.ERROR_IF_INEXACT:
+ if remainder != 0:
+ raise ValueError("inexact conversion")
+ else:
+ assert False, "unimplemented round_dir"
+ return FixedPoint(bits, frac_wid)
+
+ def to_frac_wid(self, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+ """convert to the nearest fixed-point number with `frac_wid`
+ fractional bits, rounding according to `round_dir`."""
+ return FixedPoint.with_frac_wid(self, frac_wid, round_dir)
+
+ def __float__(self):
+ # use truediv to get correct result even when bits
+ # and frac_wid are huge
+ return float(self.bits / (1 << self.frac_wid))
+
+ def as_fraction(self):
+ return Fraction(self.bits, 1 << self.frac_wid)
+
+ def cmp(self, rhs):
+ """compare self with rhs, returning a positive integer if self is
+ greater than rhs, zero if self is equal to rhs, and a negative integer
+ if self is less than rhs."""
+ rhs = FixedPoint.cast(rhs)
+ common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+ lhs = self.to_frac_wid(common_frac_wid)
+ rhs = rhs.to_frac_wid(common_frac_wid)
+ return lhs.bits - rhs.bits
+
+ def __eq__(self, rhs):
+ return self.cmp(rhs) == 0
+
+ def __ne__(self, rhs):
+ return self.cmp(rhs) != 0
+
+ def __gt__(self, rhs):
+ return self.cmp(rhs) > 0
+
+ def __lt__(self, rhs):
+ return self.cmp(rhs) < 0
+
+ def __ge__(self, rhs):
+ return self.cmp(rhs) >= 0
+
+ def __le__(self, rhs):
+ return self.cmp(rhs) <= 0
+
+ def fract(self):
+ """return the fractional part of `self`.
+ that is `self - math.floor(self)`.
+ """
+ fract_mask = (1 << self.frac_wid) - 1
+ return FixedPoint(self.bits & fract_mask, self.frac_wid)
+
+ def __str__(self):
+ if self < 0:
+ return "-" + str(-self)
+ digit_bits = 4
+ frac_digit_count = (self.frac_wid + digit_bits - 1) // digit_bits
+ fract = self.fract().to_frac_wid(frac_digit_count * digit_bits)
+ frac_str = hex(fract.bits)[2:].zfill(frac_digit_count)
+ return hex(math.floor(self)) + "." + frac_str
+
+ def __repr__(self):
+ return f"FixedPoint.with_frac_wid({str(self)!r}, {self.frac_wid})"
+
+ def __add__(self, rhs):
+ rhs = FixedPoint.cast(rhs)
+ common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+ lhs = self.to_frac_wid(common_frac_wid)
+ rhs = rhs.to_frac_wid(common_frac_wid)
+ return FixedPoint(lhs.bits + rhs.bits, common_frac_wid)
+
+ def __radd__(self, lhs):
+ # symmetric
+ return self.__add__(lhs)
+
+ def __neg__(self):
+ return FixedPoint(-self.bits, self.frac_wid)
+
+ def __sub__(self, rhs):
+ rhs = FixedPoint.cast(rhs)
+ common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+ lhs = self.to_frac_wid(common_frac_wid)
+ rhs = rhs.to_frac_wid(common_frac_wid)
+ return FixedPoint(lhs.bits - rhs.bits, common_frac_wid)
+
+ def __rsub__(self, lhs):
+ # a - b == -(b - a)
+ return -self.__sub__(lhs)
+
+ def __mul__(self, rhs):
+ rhs = FixedPoint.cast(rhs)
+ return FixedPoint(self.bits * rhs.bits, self.frac_wid + rhs.frac_wid)
+
+ def __rmul__(self, lhs):
+ # symmetric
+ return self.__mul__(lhs)
+
+ def __floor__(self):
+ return self.bits >> self.frac_wid
+
+ def div(self, rhs, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+ assert isinstance(frac_wid, int) and frac_wid >= 0
+ assert isinstance(round_dir, RoundDir)
+ rhs = FixedPoint.cast(rhs)
+ return FixedPoint.with_frac_wid(self.as_fraction()
+ / rhs.as_fraction(),
+ frac_wid, round_dir)
+
+ def sqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+ assert isinstance(round_dir, RoundDir)
+ if self < 0:
+ raise ValueError("can't compute sqrt of negative number")
+ if self == 0:
+ return self
+ retval = FixedPoint(0, self.frac_wid)
+ int_part_wid = self.bits.bit_length() - self.frac_wid
+ first_bit_index = -(-int_part_wid // 2) # division rounds up
+ last_bit_index = -self.frac_wid
+ for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+ trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+ self.frac_wid)
+ if trial * trial <= self:
+ retval = trial
+ if round_dir == RoundDir.DOWN:
+ pass
+ elif round_dir == RoundDir.UP:
+ if retval * retval < self:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.NEAREST_TIES_UP:
+ half_way = retval + FixedPoint(1, self.frac_wid + 1)
+ if half_way * half_way <= self:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.ERROR_IF_INEXACT:
+ if retval * retval != self:
+ raise ValueError("inexact sqrt")
+ else:
+ assert False, "unimplemented round_dir"
+ return retval
+
+ def rsqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+ """compute the reciprocal-sqrt of `self`"""
+ assert isinstance(round_dir, RoundDir)
+ if self < 0:
+ raise ValueError("can't compute rsqrt of negative number")
+ if self == 0:
+ raise ZeroDivisionError("can't compute rsqrt of zero")
+ retval = FixedPoint(0, self.frac_wid)
+ first_bit_index = -(-self.frac_wid // 2) # division rounds up
+ last_bit_index = -self.frac_wid
+ for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+ trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+ self.frac_wid)
+ if trial * trial * self <= 1:
+ retval = trial
+ if round_dir == RoundDir.DOWN:
+ pass
+ elif round_dir == RoundDir.UP:
+ if retval * retval * self < 1:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.NEAREST_TIES_UP:
+ half_way = retval + FixedPoint(1, self.frac_wid + 1)
+ if half_way * half_way * self <= 1:
+ retval += FixedPoint(1, self.frac_wid)
+ elif round_dir == RoundDir.ERROR_IF_INEXACT:
+ if retval * retval * self != 1:
+ raise ValueError("inexact rsqrt")
+ else:
+ assert False, "unimplemented round_dir"
+ return retval
+
+
+class ParamsNotAccurateEnough(Exception):
+ """raised when the parameters aren't accurate enough to have goldschmidt
+ division work."""
+
+
+def _assert_accuracy(condition, msg="not accurate enough"):
+ if condition:
+ return
+ raise ParamsNotAccurateEnough(msg)
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParamsBase:
+ """parameters for a Goldschmidt division algorithm, excluding derived
+ parameters.
+ """
+
+ __slots__ = ("io_width", "extra_precision", "table_addr_bits",
+ "table_data_bits", "iter_count")
+
+ def __init__(self, io_width, extra_precision, table_addr_bits,
+ table_data_bits, iter_count):
+ assert isinstance(io_width, int)
+ assert isinstance(extra_precision, int)
+ assert isinstance(table_addr_bits, int)
+ assert isinstance(table_data_bits, int)
+ assert isinstance(iter_count, int)
+ self.io_width = io_width
+ """bit-width of the input divisor and the result.
+ the input numerator is `2 * io_width`-bits wide.
+ """
+
+ self.extra_precision = extra_precision
+ """number of bits of additional precision used inside the algorithm."""
+
+ self.table_addr_bits = table_addr_bits
+ """the number of address bits used in the lookup-table."""
+
+ self.table_data_bits = table_data_bits
+ """the number of data bits used in the lookup-table."""
+
+ self.iter_count = iter_count
+ """the total number of iterations of the division algorithm's loop"""
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParams(GoldschmidtDivParamsBase):
+ """parameters for a Goldschmidt division algorithm.
+ Use `GoldschmidtDivParams.get` to find a efficient set of parameters.
+ """
+
+ __slots__ = "table", "ops"
+
+ def _shrink_bound(self, bound, round_dir):
+ """prevent fractions from having huge numerators/denominators by
+ rounding to a `FixedPoint` and converting back to a `Fraction`.
+
+ This is intended only for values used to compute bounds, and not for
+ values that end up in the hardware.
+ """
+ assert isinstance(bound, (Fraction, int))
+ assert round_dir is RoundDir.DOWN or round_dir is RoundDir.UP, \
+ "you shouldn't use that round_dir on bounds"
+ frac_wid = self.io_width * 4 + 100 # should be enough precision
+ fixed = FixedPoint.with_frac_wid(bound, frac_wid, round_dir)
+ return fixed.as_fraction()
+
+ def _shrink_min(self, min_bound):
+ """prevent fractions used as minimum bounds from having huge
+ numerators/denominators by rounding down to a `FixedPoint` and
+ converting back to a `Fraction`.
+
+ This is intended only for values used to compute bounds, and not for
+ values that end up in the hardware.
+ """
+ return self._shrink_bound(min_bound, RoundDir.DOWN)
+
+ def _shrink_max(self, max_bound):
+ """prevent fractions used as maximum bounds from having huge
+ numerators/denominators by rounding up to a `FixedPoint` and
+ converting back to a `Fraction`.
+
+ This is intended only for values used to compute bounds, and not for
+ values that end up in the hardware.
+ """
+ return self._shrink_bound(max_bound, RoundDir.UP)
+
+ @property
+ def table_addr_count(self):
+ """number of distinct addresses in the lookup-table."""
+ # used while computing self.table, so can't just do len(self.table)
+ return 1 << self.table_addr_bits
+
+ def table_input_exact_range(self, addr):
+ """return the range of inputs as `Fraction`s used for the table entry
+ with address `addr`."""
+ assert isinstance(addr, int)
+ assert 0 <= addr < self.table_addr_count
+ _assert_accuracy(self.io_width >= self.table_addr_bits)
+ addr_shift = self.io_width - self.table_addr_bits
+ min_numerator = (1 << self.io_width) + (addr << addr_shift)
+ denominator = 1 << self.io_width
+ values_per_table_entry = 1 << addr_shift
+ max_numerator = min_numerator + values_per_table_entry - 1
+ min_input = Fraction(min_numerator, denominator)
+ max_input = Fraction(max_numerator, denominator)
+ min_input = self._shrink_min(min_input)
+ max_input = self._shrink_max(max_input)
+ assert 1 <= min_input <= max_input < 2
+ return min_input, max_input
+
+ def table_value_exact_range(self, addr):
+ """return the range of values as `Fraction`s used for the table entry
+ with address `addr`."""
+ min_input, max_input = self.table_input_exact_range(addr)
+ # division swaps min/max
+ min_value = 1 / max_input
+ max_value = 1 / min_input
+ min_value = self._shrink_min(min_value)
+ max_value = self._shrink_max(max_value)
+ assert 0.5 < min_value <= max_value <= 1
+ return min_value, max_value
+
+ def table_exact_value(self, index):
+ min_value, max_value = self.table_value_exact_range(index)
+ # we round down
+ return min_value
+
+ def __init__(self, io_width, extra_precision, table_addr_bits,
+ table_data_bits, iter_count):
+ super().__init__(io_width=io_width,
+ extra_precision=extra_precision,
+ table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits,
+ iter_count=iter_count)
+ _assert_accuracy(self.io_width >= 1, "io_width out of range")
+ _assert_accuracy(self.extra_precision >= 0,
+ "extra_precision out of range")
+ _assert_accuracy(self.table_addr_bits >= 1,
+ "table_addr_bits out of range")
+ _assert_accuracy(self.table_data_bits >= 1,
+ "table_data_bits out of range")
+ _assert_accuracy(self.iter_count >= 1, "iter_count out of range")
+ table = []
+ for addr in range(1 << self.table_addr_bits):
+ table.append(FixedPoint.with_frac_wid(self.table_exact_value(addr),
+ self.table_data_bits,
+ RoundDir.DOWN))
+
+ self.table = tuple(table)
+ """ the lookup-table.
+ type: tuple[FixedPoint, ...]
+ """
+
+ self.ops = tuple(self.__make_ops())
+ "the operations needed to perform the goldschmidt division algorithm."
+
+ @property
+ def expanded_width(self):
+ """the total number of bits of precision used inside the algorithm."""
+ return self.io_width + self.extra_precision
+
+ @property
+ def n_d_f_int_wid(self):
+ """the number of bits in the integer part of `state.n`, `state.d`, and
+ `state.f` during the main iteration loop.
+ """
+ return 2
+
+ @property
+ def n_d_f_total_wid(self):
+ """the total number of bits (both integer and fraction bits) in
+ `state.n`, `state.d`, and `state.f` during the main iteration loop.
+ """
+ return self.n_d_f_int_wid + self.expanded_width
+
+ @cache_on_self
+ def max_neps(self, i):
+ """maximum value of `neps[i]`.
+ `neps[i]` is defined to be `n[i] * N_prime[i - 1] * F_prime[i - 1]`.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ return Fraction(1, 1 << self.expanded_width)
+
+ @cache_on_self
+ def max_deps(self, i):
+ """maximum value of `deps[i]`.
+ `deps[i]` is defined to be `d[i] * D_prime[i - 1] * F_prime[i - 1]`.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ return Fraction(1, 1 << self.expanded_width)
+
+ @cache_on_self
+ def max_feps(self, i):
+ """maximum value of `feps[i]`.
+ `feps[i]` is defined to be `f[i] * (2 - D_prime[i - 1])`.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ # zero, because the computation of `F_prime[i]` in
+ # `GoldschmidtDivOp.MulDByF.run(...)` is exact.
+ return Fraction(0)
+
+ @cached_property
+ def e0_range(self):
+ """minimum and maximum values of `e[0]`
+ (the relative error in `F_prime[-1]`)
+ """
+ min_e0 = Fraction(0)
+ max_e0 = Fraction(0)
+ for addr in range(self.table_addr_count):
+ # `F_prime[-1] = (1 - e[0]) / B`
+ # => `e[0] = 1 - B * F_prime[-1]`
+ min_b, max_b = self.table_input_exact_range(addr)
+ f_prime_m1 = self.table[addr].as_fraction()
+ assert min_b >= 0 and f_prime_m1 >= 0, \
+ "only positive quadrant of interval multiplication implemented"
+ min_product = min_b * f_prime_m1
+ max_product = max_b * f_prime_m1
+ # negation swaps min/max
+ cur_min_e0 = 1 - max_product
+ cur_max_e0 = 1 - min_product
+ min_e0 = min(min_e0, cur_min_e0)
+ max_e0 = max(max_e0, cur_max_e0)
+ min_e0 = self._shrink_min(min_e0)
+ max_e0 = self._shrink_max(max_e0)
+ return min_e0, max_e0
+
+ @cached_property
+ def min_e0(self):
+ """minimum value of `e[0]` (the relative error in `F_prime[-1]`)
+ """
+ min_e0, max_e0 = self.e0_range
+ return min_e0
+
+ @cached_property
+ def max_e0(self):
+ """maximum value of `e[0]` (the relative error in `F_prime[-1]`)
+ """
+ min_e0, max_e0 = self.e0_range
+ return max_e0
+
+ @cached_property
+ def max_abs_e0(self):
+ """maximum value of `abs(e[0])`."""
+ return max(abs(self.min_e0), abs(self.max_e0))
+
+ @cached_property
+ def min_abs_e0(self):
+ """minimum value of `abs(e[0])`."""
+ return Fraction(0)
+
+ @cache_on_self
+ def max_n(self, i):
+ """maximum value of `n[i]` (the relative error in `N_prime[i]`
+ relative to the previous iteration)
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # from Claim 10
+ # `n[0] = neps[0] / ((1 - e[0]) * (A / B))`
+ # `n[0] <= 2 * neps[0] / (1 - e[0])`
+
+ assert self.max_e0 < 1 and self.max_neps(0) >= 0, \
+ "only one quadrant of interval division implemented"
+ retval = 2 * self.max_neps(0) / (1 - self.max_e0)
+ elif i == 1:
+ # from Claim 10
+ # `n[1] <= neps[1] / ((1 - f[0]) * (1 - pi[0] - delta[0]))`
+ min_mpd = 1 - self.max_pi(0) - self.max_delta(0)
+ assert self.max_f(0) <= 1 and min_mpd >= 0, \
+ "only one quadrant of interval multiplication implemented"
+ prod = (1 - self.max_f(0)) * min_mpd
+ assert self.max_neps(1) >= 0 and prod > 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_neps(1) / prod
+ else:
+ # from Claim 6
+ # `0 <= n[i] <= 2 * max_neps[i] / (1 - pi[i - 1] - delta[i - 1])`
+ min_mpd = 1 - self.max_pi(i - 1) - self.max_delta(i - 1)
+ assert self.max_neps(i) >= 0 and min_mpd > 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_neps(i) / min_mpd
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_d(self, i):
+ """maximum value of `d[i]` (the relative error in `D_prime[i]`
+ relative to the previous iteration)
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # from Claim 10
+ # `d[0] = deps[0] / (1 - e[0])`
+
+ assert self.max_e0 < 1 and self.max_deps(0) >= 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_deps(0) / (1 - self.max_e0)
+ elif i == 1:
+ # from Claim 10
+ # `d[1] <= deps[1] / ((1 - f[0]) * (1 - delta[0] ** 2))`
+ assert self.max_f(0) <= 1 and self.max_delta(0) <= 1, \
+ "only one quadrant of interval multiplication implemented"
+ divisor = (1 - self.max_f(0)) * (1 - self.max_delta(0) ** 2)
+ assert self.max_deps(1) >= 0 and divisor > 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_deps(1) / divisor
+ else:
+ # from Claim 6
+ # `0 <= d[i] <= max_deps[i] / (1 - delta[i - 1])`
+ assert self.max_deps(i) >= 0 and self.max_delta(i - 1) < 1, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_deps(i) / (1 - self.max_delta(i - 1))
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_f(self, i):
+ """maximum value of `f[i]` (the relative error in `F_prime[i]`
+ relative to the previous iteration)
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # from Claim 10
+ # `f[0] = feps[0] / (1 - delta[0])`
+
+ assert self.max_delta(0) < 1 and self.max_feps(0) >= 0, \
+ "only one quadrant of interval division implemented"
+ retval = self.max_feps(0) / (1 - self.max_delta(0))
+ elif i == 1:
+ # from Claim 10
+ # `f[1] = feps[1]`
+ retval = self.max_feps(1)
+ else:
+ # from Claim 6
+ # `f[i] <= max_feps[i]`
+ retval = self.max_feps(i)
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_delta(self, i):
+ """ maximum value of `delta[i]`.
+ `delta[i]` is defined in Definition 4 of paper.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ if i == 0:
+ # `delta[0] = abs(e[0]) + 3 * d[0] / 2`
+ retval = self.max_abs_e0 + Fraction(3, 2) * self.max_d(0)
+ else:
+ # `delta[i] = delta[i - 1] ** 2 + f[i - 1]`
+ prev_max_delta = self.max_delta(i - 1)
+ assert prev_max_delta >= 0
+ retval = prev_max_delta ** 2 + self.max_f(i - 1)
+
+ # `delta[i]` has to be smaller than one otherwise errors would go off
+ # to infinity
+ _assert_accuracy(retval < 1)
+
+ return self._shrink_max(retval)
+
+ @cache_on_self
+ def max_pi(self, i):
+ """ maximum value of `pi[i]`.
+ `pi[i]` is defined right below Theorem 5 of paper.
+ """
+ assert isinstance(i, int) and 0 <= i < self.iter_count
+ # `pi[i] = 1 - (1 - n[i]) * prod`
+ # where `prod` is the product of,
+ # for `j` in `0 <= j < i`, `(1 - n[j]) / (1 + d[j])`
+ min_prod = Fraction(1)
+ for j in range(i):
+ max_n_j = self.max_n(j)
+ max_d_j = self.max_d(j)
+ assert max_n_j <= 1 and max_d_j > -1, \
+ "only one quadrant of interval division implemented"
+ min_prod *= (1 - max_n_j) / (1 + max_d_j)
+ max_n_i = self.max_n(i)
+ assert max_n_i <= 1 and min_prod >= 0, \
+ "only one quadrant of interval multiplication implemented"
+ retval = 1 - (1 - max_n_i) * min_prod
+ return self._shrink_max(retval)
+
+ @cached_property
+ def max_n_shift(self):
+ """ maximum value of `state.n_shift`.
+ """
+ # numerator must be less than `denominator << self.io_width`, so
+ # `n_shift` is at most `self.io_width`
+ return self.io_width
+
+ @cached_property
+ def n_hat(self):
+ """ maximum value of, for all `i`, `max_n(i)` and `max_d(i)`
+ """
+ n_hat = Fraction(0)
+ for i in range(self.iter_count):
+ n_hat = max(n_hat, self.max_n(i), self.max_d(i))
+ return self._shrink_max(n_hat)
+
+ def __make_ops(self):
+ """ Goldschmidt division algorithm.
+
+ based on:
+ Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+ A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+ https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+ yields: GoldschmidtDivOp
+ the operations needed to perform the division.
+ """
+ # establish assumptions of the paper's error analysis (section 3.1):
+
+ # 1. normalize so A (numerator) and B (denominator) are in [1, 2)
+ yield GoldschmidtDivOp.Normalize
+
+ # 2. ensure all relative errors from directed rounding are <= 1 / 4.
+ # the assumption is met by multipliers with > 4-bits precision
+ _assert_accuracy(self.expanded_width > 4)
+
+ # 3. require `abs(e[0]) + 3 * d[0] / 2 + f[0] < 1 / 2`.
+ _assert_accuracy(self.max_abs_e0 + 3 * self.max_d(0) / 2
+ + self.max_f(0) < Fraction(1, 2))
+
+ # 4. the initial approximation F'[-1] of 1/B is in [1/2, 1].
+ # (B is the denominator)
+
+ for addr in range(self.table_addr_count):
+ f_prime_m1 = self.table[addr]
+ _assert_accuracy(0.5 <= f_prime_m1 <= 1)
+
+ yield GoldschmidtDivOp.FEqTableLookup
+
+ # we use Setting I (section 4.1 of the paper):
+ # Require `n[i] <= n_hat` and `d[i] <= n_hat` and `f[i] = 0`:
+ # the conditions on n_hat are satisfied by construction.
+ for i in range(self.iter_count):
+ _assert_accuracy(self.max_f(i) == 0)
+ yield GoldschmidtDivOp.MulNByF
+ if i != self.iter_count - 1:
+ yield GoldschmidtDivOp.MulDByF
+ yield GoldschmidtDivOp.FEq2MinusD
+
+ # relative approximation error `p(N_prime[i])`:
+ # `p(N_prime[i]) = (A / B - N_prime[i]) / (A / B)`
+ # `0 <= p(N_prime[i])`
+ # `p(N_prime[i]) <= (2 * i) * n_hat \`
+ # ` + (abs(e[0]) + 3 * n_hat / 2) ** (2 ** i)`
+ i = self.iter_count - 1 # last used `i`
+ # compute power manually to prevent huge intermediate values
+ power = self._shrink_max(self.max_abs_e0 + 3 * self.n_hat / 2)
+ for _ in range(i):
+ power = self._shrink_max(power * power)
+
+ max_rel_error = (2 * i) * self.n_hat + power
+
+ min_a_over_b = Fraction(1, 2)
+ min_abs_error_for_correctness = min_a_over_b / (1 << self.max_n_shift)
+ min_rel_error_for_correctness = (min_abs_error_for_correctness
+ / min_a_over_b)
+
+ _assert_accuracy(
+ max_rel_error < min_rel_error_for_correctness,
+ f"not accurate enough: max_rel_error={max_rel_error}"
+ f" min_rel_error_for_correctness={min_rel_error_for_correctness}")
+
+ yield GoldschmidtDivOp.CalcResult
+
+ @cache_on_self
+ def default_cost_fn(self):
+ """ calculate the estimated cost on an arbitrary scale of implementing
+ goldschmidt division with the specified parameters. larger cost
+ values mean worse parameters.
+
+ This is the default cost function for `GoldschmidtDivParams.get`.
+
+ returns: float
+ """
+ rom_cells = self.table_data_bits << self.table_addr_bits
+ cost = float(rom_cells)
+ for op in self.ops:
+ if op == GoldschmidtDivOp.MulNByF \
+ or op == GoldschmidtDivOp.MulDByF:
+ mul_cost = self.expanded_width ** 2
+ mul_cost *= self.expanded_width.bit_length()
+ cost += mul_cost
+ cost += 5e7 * self.iter_count
+ return cost
+
+ @staticmethod
+ @lru_cache(maxsize=1 << 16)
+ def __cached_new(base_params):
+ assert isinstance(base_params, GoldschmidtDivParamsBase)
+ kwargs = {}
+ for field in fields(GoldschmidtDivParamsBase):
+ kwargs[field] = getattr(base_params, field)
+ try:
+ return GoldschmidtDivParams(**kwargs), None
+ except ParamsNotAccurateEnough as e:
+ return None, e
+
+ @staticmethod
+ def __raise(e): # type: (ParamsNotAccurateEnough) -> Any
+ raise e
+
+ @staticmethod
+ def cached_new(base_params, handle_error=__raise):
+ assert isinstance(base_params, GoldschmidtDivParamsBase)
+ params, error = GoldschmidtDivParams.__cached_new(base_params)
+ if error is None:
+ return params
+ else:
+ return handle_error(error)
+
+ @staticmethod
+ def get(io_width, cost_fn=default_cost_fn, max_table_addr_bits=12):
+ """ find efficient parameters for a goldschmidt division algorithm
+ with `params.io_width == io_width`.
+
+ arguments:
+ io_width: int
+ bit-width of the input divisor and the result.
+ the input numerator is `2 * io_width`-bits wide.
+ cost_fn: Callable[[GoldschmidtDivParams], float]
+ return the estimated cost on an arbitrary scale of implementing
+ goldschmidt division with the specified parameters. larger cost
+ values mean worse parameters.
+ max_table_addr_bits: int
+ maximum allowable value of `table_addr_bits`
+ """
+ assert isinstance(io_width, int) and io_width >= 1
+ assert callable(cost_fn)
+
+ last_error = None
+ last_error_params = None
+
+ def cached_new(base_params):
+ def handle_error(e):
+ nonlocal last_error, last_error_params
+ last_error = e
+ last_error_params = base_params
+ return None
+
+ retval = GoldschmidtDivParams.cached_new(base_params, handle_error)
+ if retval is None:
+ logging.debug(f"GoldschmidtDivParams.get: err: {base_params}")
+ else:
+ logging.debug(f"GoldschmidtDivParams.get: ok: {base_params}")
+ return retval
+
+ @lru_cache(maxsize=None)
+ def get_cost(base_params):
+ params = cached_new(base_params)
+ if params is None:
+ return math.inf
+ retval = cost_fn(params)
+ logging.debug(f"GoldschmidtDivParams.get: cost={retval}: {params}")
+ return retval
+
+ # start with parameters big enough to always work.
+ initial_extra_precision = io_width * 2 + 4
+ initial_params = GoldschmidtDivParamsBase(
+ io_width=io_width,
+ extra_precision=initial_extra_precision,
+ table_addr_bits=min(max_table_addr_bits, io_width),
+ table_data_bits=io_width + initial_extra_precision,
+ iter_count=1 + io_width.bit_length())
+
+ if cached_new(initial_params) is None:
+ raise ValueError(f"initial goldschmidt division algorithm "
+ f"parameters are invalid: {initial_params}"
+ ) from last_error
+
+ # find good initial `iter_count`
+ params = initial_params
+ for iter_count in range(1, initial_params.iter_count):
+ trial_params = replace(params, iter_count=iter_count)
+ if cached_new(trial_params) is not None:
+ params = trial_params
+ break
+
+ # now find `table_addr_bits`
+ cost = get_cost(params)
+ for table_addr_bits in range(1, max_table_addr_bits):
+ trial_params = replace(params, table_addr_bits=table_addr_bits)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ break
+
+ # check one higher `iter_count` to see if it has lower cost
+ for table_addr_bits in range(1, max_table_addr_bits + 1):
+ trial_params = replace(params,
+ table_addr_bits=table_addr_bits,
+ iter_count=params.iter_count + 1)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ break
+
+ # now shrink `table_data_bits`
+ while True:
+ trial_params = replace(params,
+ table_data_bits=params.table_data_bits - 1)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ else:
+ break
+
+ # and shrink `extra_precision`
+ while True:
+ trial_params = replace(params,
+ extra_precision=params.extra_precision - 1)
+ trial_cost = get_cost(trial_params)
+ if trial_cost < cost:
+ params = trial_params
+ cost = trial_cost
+ else:
+ break
+
+ retval = cached_new(params)
+ assert isinstance(retval, GoldschmidtDivParams)
+ return retval
+
+
+def clz(v, wid):
+ """count leading zeros -- handy for debugging."""
+ assert isinstance(wid, int)
+ assert isinstance(v, int) and 0 <= v < (1 << wid)
+ return (1 << wid).bit_length() - v.bit_length()
+
+
+@enum.unique
+class GoldschmidtDivOp(enum.Enum):
+ Normalize = "n, d, n_shift = normalize(n, d)"
+ FEqTableLookup = "f = table_lookup(d)"
+ MulNByF = "n *= f"
+ MulDByF = "d *= f"
+ FEq2MinusD = "f = 2 - d"
+ CalcResult = "result = unnormalize_and_round(n)"
+
+ def run(self, params, state):
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(state, GoldschmidtDivState)
+ expanded_width = params.expanded_width
+ table_addr_bits = params.table_addr_bits
+ if self == GoldschmidtDivOp.Normalize:
+ # normalize so 1 <= d < 2
+ # can easily be done with count-leading-zeros and left shift
+ while state.d < 1:
+ state.n = (state.n * 2).to_frac_wid(expanded_width)
+ state.d = (state.d * 2).to_frac_wid(expanded_width)
+
+ state.n_shift = 0
+ # normalize so 1 <= n < 2
+ while state.n >= 2:
+ state.n = (state.n * 0.5).to_frac_wid(expanded_width,
+ round_dir=RoundDir.DOWN)
+ state.n_shift += 1
+ elif self == GoldschmidtDivOp.FEqTableLookup:
+ # compute initial f by table lookup
+ d_m_1 = state.d - 1
+ d_m_1 = d_m_1.to_frac_wid(table_addr_bits, RoundDir.DOWN)
+ assert 0 <= d_m_1.bits < (1 << params.table_addr_bits)
+ state.f = params.table[d_m_1.bits]
+ state.f = state.f.to_frac_wid(expanded_width,
+ round_dir=RoundDir.DOWN)
+ elif self == GoldschmidtDivOp.MulNByF:
+ assert state.f is not None
+ n = state.n * state.f
+ state.n = n.to_frac_wid(expanded_width, round_dir=RoundDir.DOWN)
+ elif self == GoldschmidtDivOp.MulDByF:
+ assert state.f is not None
+ d = state.d * state.f
+ state.d = d.to_frac_wid(expanded_width, round_dir=RoundDir.UP)
+ elif self == GoldschmidtDivOp.FEq2MinusD:
+ state.f = (2 - state.d).to_frac_wid(expanded_width)
+ elif self == GoldschmidtDivOp.CalcResult:
+ assert state.n_shift is not None
+ # scale to correct value
+ n = state.n * (1 << state.n_shift)
+
+ state.quotient = math.floor(n)
+ state.remainder = state.orig_n - state.quotient * state.orig_d
+ if state.remainder >= state.orig_d:
+ state.quotient += 1
+ state.remainder -= state.orig_d
+ else:
+ assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+ def gen_hdl(self, params, state, sync_rom):
+ """generate the hdl for this operation.
+
+ arguments:
+ params: GoldschmidtDivParams
+ the goldschmidt division parameters.
+ state: GoldschmidtDivHDLState
+ the input/output state
+ sync_rom: bool
+ true if the rom should be read synchronously rather than
+ combinatorially, incurring an extra clock cycle of latency.
+ """
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(state, GoldschmidtDivHDLState)
+ m = state.m
+ if self == GoldschmidtDivOp.Normalize:
+ # normalize so 1 <= d < 2
+ assert state.d.width == params.io_width
+ assert state.n.width == 2 * params.io_width
+ d_leading_zeros = CLZ(params.io_width)
+ m.submodules.d_leading_zeros = d_leading_zeros
+ m.d.comb += d_leading_zeros.sig_in.eq(state.d)
+ d_shift_out = Signal.like(state.d)
+ m.d.comb += d_shift_out.eq(state.d << d_leading_zeros.lz)
+ d = Signal(params.n_d_f_total_wid)
+ m.d.comb += d.eq((d_shift_out << (1 + params.expanded_width))
+ >> state.d.width)
+
+ # normalize so 1 <= n < 2
+ n_leading_zeros = CLZ(2 * params.io_width)
+ m.submodules.n_leading_zeros = n_leading_zeros
+ m.d.comb += n_leading_zeros.sig_in.eq(state.n)
+ signed_zero = Const(0, signed(1)) # force subtraction to be signed
+ n_shift_s_v = (params.io_width + signed_zero + d_leading_zeros.lz
+ - n_leading_zeros.lz)
+ n_shift_s = Signal.like(n_shift_s_v)
+ n_shift_n_lz_out = Signal.like(state.n)
+ n_shift_d_lz_out = Signal.like(state.n << d_leading_zeros.lz)
+ m.d.comb += [
+ n_shift_s.eq(n_shift_s_v),
+ n_shift_d_lz_out.eq(state.n << d_leading_zeros.lz),
+ n_shift_n_lz_out.eq(state.n << n_leading_zeros.lz),
+ ]
+ state.n_shift = Signal(d_leading_zeros.lz.width)
+ n = Signal(params.n_d_f_total_wid)
+ with m.If(n_shift_s < 0):
+ m.d.comb += [
+ state.n_shift.eq(0),
+ n.eq((n_shift_d_lz_out << (1 + params.expanded_width))
+ >> state.d.width),
+ ]
+ with m.Else():
+ m.d.comb += [
+ state.n_shift.eq(n_shift_s),
+ n.eq((n_shift_n_lz_out << (1 + params.expanded_width))
+ >> state.n.width),
+ ]
+ state.n = n
+ state.d = d
+ elif self == GoldschmidtDivOp.FEqTableLookup:
+ assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+ # compute initial f by table lookup
+
+ # extra bit for table entries == 1.0
+ table_width = 1 + params.table_data_bits
+ table = Memory(width=table_width, depth=len(params.table),
+ init=[i.bits for i in params.table])
+ addr = state.d[:-params.n_d_f_int_wid][-params.table_addr_bits:]
+ if sync_rom:
+ table_read = table.read_port()
+ m.d.comb += table_read.addr.eq(addr)
+ state.insert_pipeline_register()
+ else:
+ table_read = table.read_port(domain="comb")
+ m.d.comb += table_read.addr.eq(addr)
+ m.submodules.table_read = table_read
+ state.f = Signal(params.n_d_f_int_wid + params.expanded_width)
+ data_shift = params.expanded_width - params.table_data_bits
+ m.d.comb += state.f.eq(table_read.data << data_shift)
+ elif self == GoldschmidtDivOp.MulNByF:
+ assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+ assert state.f is not None
+ assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+ n = Signal.like(state.n)
+ m.d.comb += n.eq((state.n * state.f) >> params.expanded_width)
+ state.n = n
+ elif self == GoldschmidtDivOp.MulDByF:
+ assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+ assert state.f is not None
+ assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+ d = Signal.like(state.d)
+ d_times_f = Signal.like(state.d * state.f)
+ m.d.comb += [
+ d_times_f.eq(state.d * state.f),
+ # round the multiplication up
+ d.eq((d_times_f >> params.expanded_width)
+ + (d_times_f[:params.expanded_width] != 0)),
+ ]
+ state.d = d
+ elif self == GoldschmidtDivOp.FEq2MinusD:
+ assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+ f = Signal.like(state.d)
+ m.d.comb += f.eq((2 << params.expanded_width) - state.d)
+ state.f = f
+ elif self == GoldschmidtDivOp.CalcResult:
+ assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+ assert state.n_shift is not None
+ # scale to correct value
+ n = state.n * (1 << state.n_shift)
+ q_approx = Signal(params.io_width)
+ # extra bit for if it's bigger than orig_d
+ r_approx = Signal(params.io_width + 1)
+ adjusted_r = Signal(signed(1 + params.io_width))
+ m.d.comb += [
+ q_approx.eq((state.n << state.n_shift)
+ >> params.expanded_width),
+ r_approx.eq(state.orig_n - q_approx * state.orig_d),
+ adjusted_r.eq(r_approx - state.orig_d),
+ ]
+ state.quotient = Signal(params.io_width)
+ state.remainder = Signal(params.io_width)
+
+ with m.If(adjusted_r >= 0):
+ m.d.comb += [
+ state.quotient.eq(q_approx + 1),
+ state.remainder.eq(adjusted_r),
+ ]
+ with m.Else():
+ m.d.comb += [
+ state.quotient.eq(q_approx),
+ state.remainder.eq(r_approx),
+ ]
+ else:
+ assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+
+@plain_data(repr=False)
+class GoldschmidtDivState:
+ __slots__ = ("orig_n", "orig_d", "n", "d",
+ "f", "quotient", "remainder", "n_shift")
+
+ def __init__(self, orig_n, orig_d, n, d,
+ f=None, quotient=None, remainder=None, n_shift=None):
+ assert isinstance(orig_n, int)
+ assert isinstance(orig_d, int)
+ assert isinstance(n, FixedPoint)
+ assert isinstance(d, FixedPoint)
+ assert f is None or isinstance(f, FixedPoint)
+ assert quotient is None or isinstance(quotient, int)
+ assert remainder is None or isinstance(remainder, int)
+ assert n_shift is None or isinstance(n_shift, int)
+ self.orig_n = orig_n
+ """original numerator"""
+
+ self.orig_d = orig_d
+ """original denominator"""
+
+ self.n = n
+ """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+ self.d = d
+ """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+ self.f = f
+ """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+ self.quotient = quotient
+ """final quotient"""
+
+ self.remainder = remainder
+ """final remainder"""
+
+ self.n_shift = n_shift
+ """amount the numerator needs to be left-shifted at the end of the
+ algorithm.
+ """
+
+ def __repr__(self):
+ fields_str = []
+ for field in fields(GoldschmidtDivState):
+ value = getattr(self, field)
+ if value is None:
+ continue
+ if isinstance(value, int) and field != "n_shift":
+ fields_str.append(f"{field}={hex(value)}")
+ else:
+ fields_str.append(f"{field}={value!r}")
+ return f"GoldschmidtDivState({', '.join(fields_str)})"
+
+
+def goldschmidt_div(n, d, params, trace=lambda state: None):
+ """ Goldschmidt division algorithm.
+
+ based on:
+ Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+ A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+ https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+ arguments:
+ n: int
+ numerator. a `2*width`-bit unsigned integer.
+ must be less than `d << width`, otherwise the quotient wouldn't
+ fit in `width` bits.
+ d: int
+ denominator. a `width`-bit unsigned integer. must not be zero.
+ width: int
+ the bit-width of the inputs/outputs. must be a positive integer.
+ trace: Function[[GoldschmidtDivState], None]
+ called with the initial state and the state after executing each
+ operation in `params.ops`.
+
+ returns: tuple[int, int]
+ the quotient and remainder. a tuple of two `width`-bit unsigned
+ integers.
+ """
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(d, int) and 0 < d < (1 << params.io_width)
+ assert isinstance(n, int) and 0 <= n < (d << params.io_width)
+
+ # this whole algorithm is done with fixed-point arithmetic where values
+ # have `width` fractional bits
+
+ state = GoldschmidtDivState(
+ orig_n=n,
+ orig_d=d,
+ n=FixedPoint(n, params.io_width),
+ d=FixedPoint(d, params.io_width),
+ )
+
+ trace(state)
+ for op in params.ops:
+ op.run(params, state)
+ trace(state)
+
+ assert state.quotient is not None
+ assert state.remainder is not None
+
+ return state.quotient, state.remainder
+
+
+@plain_data(eq=False)
+class GoldschmidtDivHDLState:
+ __slots__ = ("m", "orig_n", "orig_d", "n", "d",
+ "f", "quotient", "remainder", "n_shift")
+
+ __signal_name_prefix = "state_"
+
+ def __init__(self, m, orig_n, orig_d, n, d,
+ f=None, quotient=None, remainder=None, n_shift=None):
+ assert isinstance(m, Module)
+ assert isinstance(orig_n, Signal)
+ assert isinstance(orig_d, Signal)
+ assert isinstance(n, Signal)
+ assert isinstance(d, Signal)
+ assert f is None or isinstance(f, Signal)
+ assert quotient is None or isinstance(quotient, Signal)
+ assert remainder is None or isinstance(remainder, Signal)
+ assert n_shift is None or isinstance(n_shift, Signal)
+
+ self.m = m
+ """The HDL Module"""
+
+ self.orig_n = orig_n
+ """original numerator"""
+
+ self.orig_d = orig_d
+ """original denominator"""
+
+ self.n = n
+ """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+ self.d = d
+ """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+ self.f = f
+ """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+ self.quotient = quotient
+ """final quotient"""
+
+ self.remainder = remainder
+ """final remainder"""
+
+ self.n_shift = n_shift
+ """amount the numerator needs to be left-shifted at the end of the
+ algorithm.
+ """
+
+ # old_signals must be set last
+ self.old_signals = defaultdict(list)
+
+ def __setattr__(self, name, value):
+ assert isinstance(name, str)
+ if name.startswith("_"):
+ return super().__setattr__(name, value)
+ try:
+ old_signals = self.old_signals[name]
+ except AttributeError:
+ # haven't yet finished __post_init__
+ return super().__setattr__(name, value)
+ assert name != "m" and name != "old_signals", f"can't write to {name}"
+ assert isinstance(value, Signal)
+ value.name = f"{self.__signal_name_prefix}{name}_{len(old_signals)}"
+ old_signal = getattr(self, name, None)
+ if old_signal is not None:
+ assert isinstance(old_signal, Signal)
+ old_signals.append(old_signal)
+ return super().__setattr__(name, value)
+
+ def insert_pipeline_register(self):
+ old_prefix = self.__signal_name_prefix
+ try:
+ for field in fields(GoldschmidtDivHDLState):
+ if field.startswith("_") or field == "m":
+ continue
+ old_sig = getattr(self, field, None)
+ if old_sig is None:
+ continue
+ assert isinstance(old_sig, Signal)
+ new_sig = Signal.like(old_sig)
+ setattr(self, field, new_sig)
+ self.m.d.sync += new_sig.eq(old_sig)
+ finally:
+ self.__signal_name_prefix = old_prefix
+
+
+class GoldschmidtDivHDL(Elaboratable):
+ """ Goldschmidt division algorithm.
+
+ based on:
+ Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+ A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+ https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+ attributes:
+ params: GoldschmidtDivParams
+ the goldschmidt division algorithm parameters.
+ pipe_reg_indexes: list[int]
+ the operation indexes where pipeline registers should be inserted.
+ duplicate values mean multiple registers should be inserted for
+ that operation index -- this is useful to allow yosys to spread a
+ multiplication across those multiple pipeline stages.
+ sync_rom: bool
+ true if the rom should be read synchronously rather than
+ combinatorially, incurring an extra clock cycle of latency.
+ n: Signal(unsigned(2 * params.io_width))
+ input numerator. a `2 * params.io_width`-bit unsigned integer.
+ must be less than `d << params.io_width`, otherwise the quotient
+ wouldn't fit in `params.io_width` bits.
+ d: Signal(unsigned(params.io_width))
+ input denominator. a `params.io_width`-bit unsigned integer.
+ must not be zero.
+ q: Signal(unsigned(params.io_width))
+ output quotient. only valid when `n < (d << params.io_width)`.
+ r: Signal(unsigned(params.io_width))
+ output remainder. only valid when `n < (d << params.io_width)`.
+ trace: list[GoldschmidtDivHDLState]
+ list of the initial state and the state after executing each
+ operation in `params.ops`.
+ """
+
+ @property
+ def total_pipeline_registers(self):
+ """the total number of pipeline registers"""
+ return len(self.pipe_reg_indexes) + self.sync_rom
+
+ def __init__(self, params, pipe_reg_indexes=(), sync_rom=False):
+ assert isinstance(params, GoldschmidtDivParams)
+ assert isinstance(sync_rom, bool)
+ self.params = params
+ self.pipe_reg_indexes = sorted(int(i) for i in pipe_reg_indexes)
+ self.sync_rom = sync_rom
+ self.n = Signal(unsigned(2 * params.io_width))
+ self.d = Signal(unsigned(params.io_width))
+ self.q = Signal(unsigned(params.io_width))
+ self.r = Signal(unsigned(params.io_width))
+
+ # in constructor so we get trace without needing to call elaborate
+ state = GoldschmidtDivHDLState(
+ m=Module(),
+ orig_n=self.n,
+ orig_d=self.d,
+ n=self.n,
+ d=self.d)
+
+ self.trace = [replace(state)]
+
+ # copy and reverse
+ pipe_reg_indexes = list(reversed(self.pipe_reg_indexes))
+
+ for op_index, op in enumerate(self.params.ops):
+ while len(pipe_reg_indexes) > 0 \
+ and pipe_reg_indexes[-1] <= op_index:
+ pipe_reg_indexes.pop()
+ state.insert_pipeline_register()
+ op.gen_hdl(self.params, state, self.sync_rom)
+ self.trace.append(replace(state))
+
+ while len(pipe_reg_indexes) > 0:
+ pipe_reg_indexes.pop()
+ state.insert_pipeline_register()
+
+ state.m.d.comb += [
+ self.q.eq(state.quotient),
+ self.r.eq(state.remainder),
+ ]
+
+ def elaborate(self, platform):
+ return self.trace[0].m
+
+
+GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID = 2
+
+
+@lru_cache()
+def goldschmidt_sqrt_rsqrt_table(table_addr_bits, table_data_bits):
+ """Generate the look-up table needed for Goldschmidt's square-root and
+ reciprocal-square-root algorithm.
+
+ arguments:
+ table_addr_bits: int
+ the number of address bits for the look-up table.
+ table_data_bits: int
+ the number of data bits for the look-up table.
+ """
+ assert isinstance(table_addr_bits, int) and \
+ table_addr_bits >= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+ assert isinstance(table_data_bits, int) and table_data_bits >= 1
+ table = []
+ table_len = 1 << table_addr_bits
+ for addr in range(table_len):
+ if addr == 0:
+ value = FixedPoint(0, table_data_bits)
+ elif (addr << 2) < table_len:
+ value = None # table entries should be unused
+ else:
+ table_addr_frac_wid = table_addr_bits
+ table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+ max_input_value = FixedPoint(addr + 1, table_addr_bits - 2)
+ max_frac_wid = max(max_input_value.frac_wid, table_data_bits)
+ value = max_input_value.to_frac_wid(max_frac_wid)
+ value = value.rsqrt(RoundDir.DOWN)
+ value = value.to_frac_wid(table_data_bits, RoundDir.DOWN)
+ table.append(value)
+
+ # tuple for immutability
+ return tuple(table)
+
+# FIXME: add code to calculate error bounds and check that the algorithm will
+# actually work (like in the goldschmidt division algorithm).
+# FIXME: add code to calculate a good set of parameters based on the error
+# bounds checking.
+
+
+def goldschmidt_sqrt_rsqrt(radicand, io_width, frac_wid, extra_precision,
+ table_addr_bits, table_data_bits, iter_count):
+ """Goldschmidt's square-root and reciprocal-square-root algorithm.
+
+ uses algorithm based on second method at:
+ https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Goldschmidt%E2%80%99s_algorithm
+
+ arguments:
+ radicand: FixedPoint(frac_wid=frac_wid)
+ the input value to take the square-root and reciprocal-square-root of.
+ io_width: int
+ the number of bits in the input (`radicand`) and output values.
+ frac_wid: int
+ the number of fraction bits in the input (`radicand`) and output
+ values.
+ extra_precision: int
+ the number of bits of internal extra precision.
+ table_addr_bits: int
+ the number of address bits for the look-up table.
+ table_data_bits: int
+ the number of data bits for the look-up table.
+
+ returns: tuple[FixedPoint, FixedPoint]
+ the square-root and reciprocal-square-root, rounded down to the
+ nearest representable value. If `radicand == 0`, then the
+ reciprocal-square-root value returned is zero.
+ """
+ assert (isinstance(radicand, FixedPoint)
+ and radicand.frac_wid == frac_wid
+ and 0 <= radicand.bits < (1 << io_width))
+ assert isinstance(io_width, int) and io_width >= 1
+ assert isinstance(frac_wid, int) and 0 <= frac_wid < io_width
+ assert isinstance(extra_precision, int) and extra_precision >= io_width
+ assert isinstance(table_addr_bits, int) and table_addr_bits >= 1
+ assert isinstance(table_data_bits, int) and table_data_bits >= 1
+ assert isinstance(iter_count, int) and iter_count >= 0
+ expanded_frac_wid = frac_wid + extra_precision
+ s = radicand.to_frac_wid(expanded_frac_wid)
+ sqrt_rshift = extra_precision
+ rsqrt_rshift = extra_precision
+ while s != 0 and s < 1:
+ s = (s * 4).to_frac_wid(expanded_frac_wid)
+ sqrt_rshift += 1
+ rsqrt_rshift -= 1
+ while s >= 4:
+ s = s.div(4, expanded_frac_wid)
+ sqrt_rshift -= 1
+ rsqrt_rshift += 1
+ table = goldschmidt_sqrt_rsqrt_table(table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits)
+ # core goldschmidt sqrt/rsqrt algorithm:
+ # initial setup:
+ table_addr_frac_wid = table_addr_bits
+ table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+ addr = s.to_frac_wid(table_addr_frac_wid, RoundDir.DOWN)
+ assert 0 <= addr.bits < (1 << table_addr_bits), "table addr out of range"
+ f = table[addr.bits]
+ assert f is not None, "accessed invalid table entry"
+ # use with_frac_wid to fix IDE type deduction
+ f = FixedPoint.with_frac_wid(f, expanded_frac_wid, RoundDir.DOWN)
+ x = (s * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ h = (f * 0.5).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ for _ in range(iter_count):
+ # iteration step:
+ f = (1.5 - x * h).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ x = (x * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ h = (h * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+ r = 2 * h
+ # now `x` is approximately `sqrt(s)` and `r` is approximately `rsqrt(s)`
+
+ sqrt = FixedPoint(x.bits >> sqrt_rshift, frac_wid)
+ rsqrt = FixedPoint(r.bits >> rsqrt_rshift, frac_wid)
+
+ next_sqrt = FixedPoint(sqrt.bits + 1, frac_wid)
+ if next_sqrt * next_sqrt <= radicand:
+ sqrt = next_sqrt
+
+ next_rsqrt = FixedPoint(rsqrt.bits + 1, frac_wid)
+ if next_rsqrt * next_rsqrt * radicand <= 1 and radicand != 0:
+ rsqrt = next_rsqrt
+ return sqrt, rsqrt
--- /dev/null
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from nmutil.plain_data import fields, replace
+import math
+import unittest
+from nmutil.formaltest import FHDLTestCase
+from nmutil.sim_util import do_sim, hash_256
+from nmigen.sim import Tick, Delay
+from nmigen.hdl.ast import Signal
+from nmigen.hdl.dsl import Module
+from soc.fu.div.experiment.goldschmidt_div_sqrt import (
+ GoldschmidtDivHDL, GoldschmidtDivHDLState, GoldschmidtDivParams,
+ GoldschmidtDivState, ParamsNotAccurateEnough, goldschmidt_div,
+ FixedPoint, RoundDir, goldschmidt_sqrt_rsqrt)
+
+
+class TestFixedPoint(FHDLTestCase):
+ def test_str_roundtrip(self):
+ for frac_wid in range(8):
+ for bits in range(-1 << 9, 1 << 9):
+ with self.subTest(bits=hex(bits), frac_wid=frac_wid):
+ value = FixedPoint(bits, frac_wid)
+ round_trip_value = FixedPoint.cast(str(value))
+ self.assertEqual(value, round_trip_value)
+
+ @staticmethod
+ def trap(f):
+ try:
+ return f(), None
+ except (ValueError, ZeroDivisionError) as e:
+ return None, e.__class__.__name__
+
+ def test_sqrt(self):
+ for frac_wid in range(8):
+ for bits in range(1 << 9):
+ for round_dir in RoundDir:
+ radicand = FixedPoint(bits, frac_wid)
+ expected_f = math.sqrt(float(radicand))
+ expected = self.trap(lambda: FixedPoint.with_frac_wid(
+ expected_f, frac_wid, round_dir))
+ with self.subTest(radicand=repr(radicand),
+ round_dir=str(round_dir),
+ expected=repr(expected)):
+ result = self.trap(lambda: radicand.sqrt(round_dir))
+ self.assertEqual(result, expected)
+
+ def test_rsqrt(self):
+ for frac_wid in range(8):
+ for bits in range(1, 1 << 9):
+ for round_dir in RoundDir:
+ radicand = FixedPoint(bits, frac_wid)
+ expected_f = 1 / math.sqrt(float(radicand))
+ expected = self.trap(lambda: FixedPoint.with_frac_wid(
+ expected_f, frac_wid, round_dir))
+ with self.subTest(radicand=repr(radicand),
+ round_dir=str(round_dir),
+ expected=repr(expected)):
+ result = self.trap(lambda: radicand.rsqrt(round_dir))
+ self.assertEqual(result, expected)
+
+
+class TestGoldschmidtDiv(FHDLTestCase):
+ def test_case1(self):
+ with self.assertRaises(ParamsNotAccurateEnough):
+ GoldschmidtDivParams(io_width=3, extra_precision=2,
+ table_addr_bits=3, table_data_bits=5,
+ iter_count=2)
+
+ def test_case2(self):
+ with self.assertRaises(ParamsNotAccurateEnough):
+ GoldschmidtDivParams(io_width=4, extra_precision=1,
+ table_addr_bits=1, table_data_bits=5,
+ iter_count=1)
+
+ @staticmethod
+ def cases(io_width, cases=None):
+ assert isinstance(io_width, int) and io_width >= 1
+ if cases is not None:
+ for n, d in cases:
+ assert isinstance(d, int) \
+ and 0 < d < (1 << io_width), "invalid case"
+ assert isinstance(n, int) \
+ and 0 <= n < (d << io_width), "invalid case"
+ yield (n, d)
+ elif io_width > 6:
+ assert io_width * 2 <= 256, \
+ "can't generate big enough numbers for test cases"
+ for i in range(10000):
+ d = hash_256(f'd {i}') % (1 << io_width)
+ if d == 0:
+ d = 1
+ n = hash_256(f'n {i}') % (d << io_width)
+ yield (n, d)
+ else:
+ for d in range(1, 1 << io_width):
+ for n in range(d << io_width):
+ yield (n, d)
+
+ def tst(self, io_width, cases=None):
+ assert isinstance(io_width, int)
+ params = GoldschmidtDivParams.get(io_width)
+ with self.subTest(params=str(params)):
+ for n, d in self.cases(io_width, cases):
+ expected_q, expected_r = divmod(n, d)
+ with self.subTest(n=hex(n), d=hex(d),
+ expected_q=hex(expected_q),
+ expected_r=hex(expected_r)):
+ trace = []
+
+ def trace_fn(state):
+ assert isinstance(state, GoldschmidtDivState)
+ trace.append((replace(state)))
+ q, r = goldschmidt_div(n, d, params, trace=trace_fn)
+ with self.subTest(q=hex(q), r=hex(r), trace=repr(trace)):
+ self.assertEqual((q, r), (expected_q, expected_r))
+
+ def tst_sim(self, io_width, cases=None, pipe_reg_indexes=(),
+ sync_rom=False):
+ assert isinstance(io_width, int)
+ params = GoldschmidtDivParams.get(io_width)
+ m = Module()
+ dut = GoldschmidtDivHDL(params, pipe_reg_indexes=pipe_reg_indexes,
+ sync_rom=sync_rom)
+ m.submodules.dut = dut
+ # make sync domain get added
+ m.d.sync += Signal().eq(0)
+
+ def inputs_proc():
+ yield Tick()
+ for n, d in self.cases(io_width, cases):
+ yield dut.n.eq(n)
+ yield dut.d.eq(d)
+ yield Tick()
+
+ def check_interals(n, d):
+ # check internals only if dut is completely combinatorial
+ # so we don't have to figure out how to read values in
+ # previous clock cycles
+ if dut.total_pipeline_registers != 0:
+ return
+ ref_trace = []
+
+ def ref_trace_fn(state):
+ assert isinstance(state, GoldschmidtDivState)
+ ref_trace.append((replace(state)))
+ goldschmidt_div(n=n, d=d, params=params, trace=ref_trace_fn)
+ self.assertEqual(len(dut.trace), len(ref_trace))
+ for index, state in enumerate(dut.trace):
+ ref_state = ref_trace[index]
+ last_op = None if index == 0 else params.ops[index - 1]
+ with self.subTest(index=index, state=repr(state),
+ ref_state=repr(ref_state),
+ last_op=str(last_op)):
+ for field in fields(GoldschmidtDivHDLState):
+ sig = getattr(state, field)
+ if not isinstance(sig, Signal):
+ continue
+ ref_value = getattr(ref_state, field)
+ ref_value_str = repr(ref_value)
+ if isinstance(ref_value, int):
+ ref_value_str = hex(ref_value)
+ value = yield sig
+ with self.subTest(field_name=field,
+ sig=repr(sig),
+ sig_shape=repr(sig.shape()),
+ value=hex(value),
+ ref_value=ref_value_str):
+ if isinstance(ref_value, int):
+ self.assertEqual(value, ref_value)
+ else:
+ assert isinstance(ref_value, FixedPoint)
+ self.assertEqual(value, ref_value.bits)
+
+ def check_outputs():
+ yield Tick()
+ for _ in range(dut.total_pipeline_registers):
+ yield Tick()
+ for n, d in self.cases(io_width, cases):
+ yield Delay(0.1e-6)
+ expected_q, expected_r = divmod(n, d)
+ with self.subTest(n=hex(n), d=hex(d),
+ expected_q=hex(expected_q),
+ expected_r=hex(expected_r)):
+ q = yield dut.q
+ r = yield dut.r
+ with self.subTest(q=hex(q), r=hex(r)):
+ self.assertEqual((q, r), (expected_q, expected_r))
+ yield from check_interals(n, d)
+
+ yield Tick()
+
+ with self.subTest(params=str(params)):
+ with do_sim(self, m, (dut.n, dut.d, dut.q, dut.r)) as sim:
+ sim.add_clock(1e-6)
+ sim.add_process(inputs_proc)
+ sim.add_process(check_outputs)
+ sim.run()
+
+ def test_1_through_4(self):
+ for io_width in range(1, 4 + 1):
+ with self.subTest(io_width=io_width):
+ self.tst(io_width)
+
+ def test_5(self):
+ self.tst(5)
+
+ def test_6(self):
+ self.tst(6)
+
+ def test_8(self):
+ self.tst(8)
+
+ def test_16(self):
+ self.tst(16)
+
+ def test_32(self):
+ self.tst(32)
+
+ def test_64(self):
+ self.tst(64)
+
+ def test_sim_5(self):
+ self.tst_sim(5)
+
+ def test_sim_8(self):
+ self.tst_sim(8)
+
+ def test_sim_16(self):
+ self.tst_sim(16)
+
+ def test_sim_32(self):
+ self.tst_sim(32)
+
+ def test_sim_64(self):
+ self.tst_sim(64)
+
+ def tst_params(self, io_width):
+ assert isinstance(io_width, int)
+ params = GoldschmidtDivParams.get(io_width)
+ print()
+ print(params)
+
+ def test_params_1(self):
+ self.tst_params(1)
+
+ def test_params_2(self):
+ self.tst_params(2)
+
+ def test_params_3(self):
+ self.tst_params(3)
+
+ def test_params_4(self):
+ self.tst_params(4)
+
+ def test_params_5(self):
+ self.tst_params(5)
+
+ def test_params_6(self):
+ self.tst_params(6)
+
+ def test_params_7(self):
+ self.tst_params(7)
+
+ def test_params_8(self):
+ self.tst_params(8)
+
+ def test_params_9(self):
+ self.tst_params(9)
+
+ def test_params_10(self):
+ self.tst_params(10)
+
+ def test_params_11(self):
+ self.tst_params(11)
+
+ def test_params_12(self):
+ self.tst_params(12)
+
+ def test_params_13(self):
+ self.tst_params(13)
+
+ def test_params_14(self):
+ self.tst_params(14)
+
+ def test_params_15(self):
+ self.tst_params(15)
+
+ def test_params_16(self):
+ self.tst_params(16)
+
+ def test_params_17(self):
+ self.tst_params(17)
+
+ def test_params_18(self):
+ self.tst_params(18)
+
+ def test_params_19(self):
+ self.tst_params(19)
+
+ def test_params_20(self):
+ self.tst_params(20)
+
+ def test_params_21(self):
+ self.tst_params(21)
+
+ def test_params_22(self):
+ self.tst_params(22)
+
+ def test_params_23(self):
+ self.tst_params(23)
+
+ def test_params_24(self):
+ self.tst_params(24)
+
+ def test_params_25(self):
+ self.tst_params(25)
+
+ def test_params_26(self):
+ self.tst_params(26)
+
+ def test_params_27(self):
+ self.tst_params(27)
+
+ def test_params_28(self):
+ self.tst_params(28)
+
+ def test_params_29(self):
+ self.tst_params(29)
+
+ def test_params_30(self):
+ self.tst_params(30)
+
+ def test_params_31(self):
+ self.tst_params(31)
+
+ def test_params_32(self):
+ self.tst_params(32)
+
+ def test_params_33(self):
+ self.tst_params(33)
+
+ def test_params_34(self):
+ self.tst_params(34)
+
+ def test_params_35(self):
+ self.tst_params(35)
+
+ def test_params_36(self):
+ self.tst_params(36)
+
+ def test_params_37(self):
+ self.tst_params(37)
+
+ def test_params_38(self):
+ self.tst_params(38)
+
+ def test_params_39(self):
+ self.tst_params(39)
+
+ def test_params_40(self):
+ self.tst_params(40)
+
+ def test_params_41(self):
+ self.tst_params(41)
+
+ def test_params_42(self):
+ self.tst_params(42)
+
+ def test_params_43(self):
+ self.tst_params(43)
+
+ def test_params_44(self):
+ self.tst_params(44)
+
+ def test_params_45(self):
+ self.tst_params(45)
+
+ def test_params_46(self):
+ self.tst_params(46)
+
+ def test_params_47(self):
+ self.tst_params(47)
+
+ def test_params_48(self):
+ self.tst_params(48)
+
+ def test_params_49(self):
+ self.tst_params(49)
+
+ def test_params_50(self):
+ self.tst_params(50)
+
+ def test_params_51(self):
+ self.tst_params(51)
+
+ def test_params_52(self):
+ self.tst_params(52)
+
+ def test_params_53(self):
+ self.tst_params(53)
+
+ def test_params_54(self):
+ self.tst_params(54)
+
+ def test_params_55(self):
+ self.tst_params(55)
+
+ def test_params_56(self):
+ self.tst_params(56)
+
+ def test_params_57(self):
+ self.tst_params(57)
+
+ def test_params_58(self):
+ self.tst_params(58)
+
+ def test_params_59(self):
+ self.tst_params(59)
+
+ def test_params_60(self):
+ self.tst_params(60)
+
+ def test_params_61(self):
+ self.tst_params(61)
+
+ def test_params_62(self):
+ self.tst_params(62)
+
+ def test_params_63(self):
+ self.tst_params(63)
+
+ def test_params_64(self):
+ self.tst_params(64)
+
+
+class TestGoldschmidtSqrtRSqrt(FHDLTestCase):
+ def tst(self, io_width, frac_wid, extra_precision,
+ table_addr_bits, table_data_bits, iter_count):
+ assert isinstance(io_width, int)
+ assert isinstance(frac_wid, int)
+ assert isinstance(extra_precision, int)
+ assert isinstance(table_addr_bits, int)
+ assert isinstance(table_data_bits, int)
+ assert isinstance(iter_count, int)
+ with self.subTest(io_width=io_width, frac_wid=frac_wid,
+ extra_precision=extra_precision,
+ table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits,
+ iter_count=iter_count):
+ for bits in range(1 << io_width):
+ radicand = FixedPoint(bits, frac_wid)
+ expected_sqrt = radicand.sqrt(RoundDir.DOWN)
+ expected_rsqrt = FixedPoint(0, frac_wid)
+ if radicand > 0:
+ expected_rsqrt = radicand.rsqrt(RoundDir.DOWN)
+ with self.subTest(radicand=repr(radicand),
+ expected_sqrt=repr(expected_sqrt),
+ expected_rsqrt=repr(expected_rsqrt)):
+ sqrt, rsqrt = goldschmidt_sqrt_rsqrt(
+ radicand=radicand, io_width=io_width,
+ frac_wid=frac_wid,
+ extra_precision=extra_precision,
+ table_addr_bits=table_addr_bits,
+ table_data_bits=table_data_bits,
+ iter_count=iter_count)
+ with self.subTest(sqrt=repr(sqrt), rsqrt=repr(rsqrt)):
+ self.assertEqual((sqrt, rsqrt),
+ (expected_sqrt, expected_rsqrt))
+
+ def test1(self):
+ self.tst(io_width=16, frac_wid=8, extra_precision=20,
+ table_addr_bits=4, table_data_bits=28, iter_count=4)
+
+
+if __name__ == "__main__":
+ unittest.main()
class DivInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB/immediate
- ('XER', 'xer_so', '32'), ] # XER bit 32: SO
-
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b = self.ra, self.rb
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), ] # XER bit 32: SO
+
# output stage shared between div and mul: like ALUOutputData but no CA/32
class DivMulOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'),
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
- ('XER', 'xer_so', '32')]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
+ ('XER', 'xer_so', '32')]
+
class DivPipeKindConfigBase:
def __init__(self,
class DivPipeSpec(CommonPipeSpec):
- def __init__(self, id_wid, div_pipe_kind):
- super().__init__(id_wid=id_wid)
+ def __init__(self, id_wid, parent_pspec, div_pipe_kind):
+ super().__init__(id_wid=id_wid, parent_pspec=parent_pspec)
self.div_pipe_kind = div_pipe_kind
self.core_config = div_pipe_kind.config.core_config
- regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+ regspecklses = (DivInputData, DivMulOutputData)
opsubsetkls = CompLogicalOpSubset
class DivPipeSpecDivPipeCore(DivPipeSpec):
- def __init__(self, id_wid):
- super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.DivPipeCore)
+ def __init__(self, id_wid, parent_pspec):
+ super().__init__(id_wid=id_wid,
+ parent_pspec=parent_pspec,
+ div_pipe_kind=DivPipeKind.DivPipeCore)
class DivPipeSpecFSMDivCore(DivPipeSpec):
- def __init__(self, id_wid):
- super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.FSMDivCore)
+ def __init__(self, id_wid, parent_pspec):
+ super().__init__(id_wid=id_wid,
+ parent_pspec=parent_pspec,
+ div_pipe_kind=DivPipeKind.FSMDivCore)
class DivPipeSpecSimOnly(DivPipeSpec):
- def __init__(self, id_wid):
- super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.SimOnly)
+ def __init__(self, id_wid, parent_pspec):
+ super().__init__(id_wid=id_wid,
+ parent_pspec=parent_pspec,
+ div_pipe_kind=DivPipeKind.SimOnly)
class CoreBaseData(DivInputData):
class DivStagesStart(PipeModBaseChain):
def get_chain(self):
alu_input = DivMulInputStage(self.pspec)
+ return [alu_input]
+
+
+class DivStagesSetup(PipeModBaseChain):
+ def get_chain(self):
div_setup = DivSetupStage(self.pspec)
if isinstance(self.pspec.div_pipe_kind.config,
DivPipeKindConfigCombPipe):
core_setup = [DivCoreSetupStage(self.pspec)]
else:
core_setup = ()
- return [alu_input, div_setup, *core_setup]
+ return [div_setup, *core_setup]
class DivStagesMiddle(PipeModBaseChain):
else:
core_final = ()
div_out = DivOutputStage(self.pspec)
- alu_out = DivMulOutputStage(self.pspec)
self.div_out = div_out # debugging - bug #425
- return [*core_final, div_out, alu_out]
+ return [*core_final, div_out]
+
+
+class DivStagesFinalise(PipeModBaseChain):
+ def get_chain(self):
+ alu_out = DivMulOutputStage(self.pspec)
+ return [alu_out]
class DivBasePipe(ControlBase):
ControlBase.__init__(self)
self.pspec = pspec
self.pipe_start = DivStagesStart(pspec)
+ self.pipe_setup = DivStagesSetup(pspec)
self.pipe_middles = []
if isinstance(self.pspec.div_pipe_kind.config,
DivPipeKindConfigCombPipe):
self.pipe_middles.append(
self.pspec.div_pipe_kind.config.core_stage_class(pspec))
self.pipe_end = DivStagesEnd(pspec)
+ self.pipe_final = DivStagesFinalise(pspec)
self._eqs = self.connect([self.pipe_start,
+ self.pipe_setup,
*self.pipe_middles,
- self.pipe_end])
+ self.pipe_end,
+ self.pipe_final])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
m.submodules.pipe_start = self.pipe_start
+ m.submodules.pipe_setup = self.pipe_setup
for i in range(len(self.pipe_middles)):
name = f"pipe_middle_{i}"
setattr(m.submodules, name, self.pipe_middles[i])
m.submodules.pipe_end = self.pipe_end
+ m.submodules.pipe_final = self.pipe_final
m.d.comb += self._eqs
return m
return CoreInputData(self.pspec)
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
# convenience variables
# work out if a/b are negative (check 32-bit / signed)
comb += dividend_neg_o.eq(Mux(op.is_32bit,
- a[31], a[63]) & op.is_signed)
- comb += divisor_neg_o.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+ a[31], a[XLEN-1]) & op.is_signed)
+ comb += divisor_neg_o.eq(Mux(op.is_32bit,
+ b[31], b[XLEN-1]) & op.is_signed)
# negation of a 64-bit value produces the same lower 32-bit
# result as negation of just the lower 32-bits, so we don't
# need to do anything special before negating
- abs_dor = Signal(64, reset_less=True) # absolute of divisor
- abs_dend = Signal(64, reset_less=True) # absolute of dividend
+ abs_dor = Signal(XLEN, reset_less=True) # absolute of divisor
+ abs_dend = Signal(XLEN, reset_less=True) # absolute of dividend
comb += abs_dor.eq(Mux(divisor_neg_o, -b, b))
comb += abs_dend.eq(Mux(dividend_neg_o, -a, a))
with m.If(op.is_32bit):
comb += dividend_o.eq(abs_dend[0:32] << 32)
with m.Else():
- comb += dividend_o.eq(abs_dend[0:64] << 64)
+ comb += dividend_o.eq(abs_dend[0:XLEN] << XLEN)
###### sticky overflow and context, both pass-through #####
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = DivPipeSpec(
+ id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
m.submodules.alu = alu = DivBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
class TestPipeIlang(unittest.TestCase):
def write_ilang(self, div_pipe_kind):
- pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = DivPipeSpec(
+ id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
alu = DivBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open(f"div_pipeline_{div_pipe_kind.name}.il", "w") as f:
('is_signed', 1),
('data_len', 4),
('byte_reverse', 1),
+ ('reserve', 1), # atomic update
('sign_extend', 1),
('ldst_mode', LDSTMode),
('insn', 32),
from nmigen import (Elaboratable, Module, Signal, Shape, unsigned, Cat, Mux,
Record, Memory,
- Const)
+ Const, C)
from nmutil.iocontrol import RecordObject
from nmutil.util import rising_edge, Display
from enum import Enum, unique
from soc.experiment.dcache import DCache
+from soc.experiment.icache import ICache
from soc.experiment.pimem import PortInterfaceBase
from soc.experiment.mem_types import LoadStore1ToMMUType
from soc.experiment.mem_types import MMUToLoadStore1Type
IDLE = 0 # ready for instruction
ACK_WAIT = 1 # waiting for ack from dcache
MMU_LOOKUP = 2 # waiting for MMU to look up translation
- TLBIE_WAIT = 3 # waiting for MMU to finish doing a tlbie
+ #SECOND_REQ = 3 # second request for unaligned transfer
+
+@unique
+class Misalign(Enum):
+ ONEWORD = 0 # only one word needed, all good
+ NEED2WORDS = 1 # need to send/receive two words
+ WAITFIRST = 2 # waiting for the first word
+ WAITSECOND = 3 # waiting for the second word
# captures the LDSTRequest from the PortInterface, which "blips" most
self.load = Signal()
self.dcbz = Signal()
- self.addr = Signal(64)
+ self.raddr = Signal(64)
# self.store_data = Signal(64) # this is already sync (on a delay)
- self.byte_sel = Signal(8)
+ self.byte_sel = Signal(16)
self.nc = Signal() # non-cacheable access
self.virt_mode = Signal()
self.priv_mode = Signal()
+ self.mode_32bit = Signal() # XXX UNUSED AT PRESENT
+ self.alignstate = Signal(Misalign) # progress of alignment request
self.align_intr = Signal()
+ # atomic (LR/SC reservation)
+ self.reserve = Signal()
+ self.atomic = Signal()
+ self.atomic_last = Signal()
+
# glue logic for microwatt mmu and dcache
class LoadStore1(PortInterfaceBase):
addrwid = pspec.addr_wid
super().__init__(regwid, addrwid)
- self.dcache = DCache()
+ self.dcache = DCache(pspec)
+ self.icache = ICache(pspec)
# these names are from the perspective of here (LoadStore1)
self.d_out = self.dcache.d_in # in to dcache is out for LoadStore
self.d_in = self.dcache.d_out # out from dcache is in for LoadStore
- self.m_out = LoadStore1ToMMUType() # out *to* MMU
- self.m_in = MMUToLoadStore1Type() # in *from* MMU
+ self.i_out = self.icache.i_in # in to icache is out for LoadStore
+ self.i_in = self.icache.i_out # out from icache is in for LoadStore
+ self.m_out = LoadStore1ToMMUType("m_out") # out *to* MMU
+ self.m_in = MMUToLoadStore1Type("m_in") # in *from* MMU
self.req = LDSTRequest(name="ldst_req")
# TODO, convert dcache wb_in/wb_out to "standard" nmigen Wishbone bus
self.dbus = Record(make_wb_layout(pspec))
+ self.ibus = Record(make_wb_layout(pspec))
# for creating a single clock blip to DCache
self.d_valid = Signal()
self.load = Signal()
self.tlbie = Signal()
self.dcbz = Signal()
- self.addr = Signal(64)
- self.store_data = Signal(64)
- self.load_data = Signal(64)
- self.load_data_delay = Signal(64)
- self.byte_sel = Signal(8)
+ self.raddr = Signal(64)
+ self.maddr = Signal(64)
+ self.store_data = Signal(64) # first half (aligned)
+ self.store_data2 = Signal(64) # second half (misaligned)
+ self.load_data = Signal(128) # 128 to cope with misalignment
+ self.load_data_delay = Signal(128) # perform 2 LD/STs
+ self.byte_sel = Signal(16) # also for misaligned, 16-bit
+ self.alignstate = Signal(Misalign) # progress of alignment request
+ self.next_addr = Signal(64) # 2nd (aligned) read/write addr
#self.xerc : xer_common_t;
- #self.reserve = Signal()
- #self.atomic = Signal()
- #self.atomic_last = Signal()
#self.rc = Signal()
self.nc = Signal() # non-cacheable access
- self.virt_mode = Signal()
- self.priv_mode = Signal()
- self.state = Signal(State)
- self.instr_fault = Signal()
+ self.mode_32bit = Signal() # XXX UNUSED AT PRESENT
+ self.state = Signal(State)
+ self.instr_fault = Signal() # indicator to request i-cache MMU lookup
+ self.r_instr_fault = Signal() # accessed in external_busy
+ self.priv_mode = Signal() # only for instruction fetch (not LDST)
self.align_intr = Signal()
self.busy = Signal()
self.wait_dcache = Signal()
self.wait_mmu = Signal()
- #self.mode_32bit = Signal()
+ self.lrsc_misalign = Signal()
#self.intr_vec : integer range 0 to 16#fff#;
#self.nia = Signal(64)
#self.srr1 = Signal(16)
-
- def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+ # use these to set the dsisr or dar respectively
+ self.mmu_set_spr = Signal()
+ self.mmu_set_dsisr = Signal()
+ self.mmu_set_dar = Signal()
+ self.sprval_in = Signal(64)
+
+ # ONLY access these read-only, do NOT attempt to change
+ self.dsisr = Signal(32)
+ self.dar = Signal(64)
+
+ # when external_busy set, do not allow PortInterface to proceed
+ def external_busy(self, m):
+ return self.instr_fault | self.r_instr_fault
+
+ def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
+ m.d.comb += self.req.nc.eq(is_nc)
m.d.comb += self.req.load.eq(0) # store operation
m.d.comb += self.req.byte_sel.eq(mask)
- m.d.comb += self.req.addr.eq(addr)
- m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem ==> priv
- m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
- m.d.comb += self.req.align_intr.eq(misalign)
+ m.d.comb += self.req.raddr.eq(addr)
+ m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem ==> priv
+ m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+ m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
m.d.comb += self.req.dcbz.eq(is_dcbz)
+ with m.If(misalign):
+ m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+ m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
# m.d.comb += Display("set_wr_addr %i dcbz %i",addr,is_dcbz)
# option to disable the cache entirely for write
if self.disable_cache:
m.d.comb += self.req.nc.eq(1)
+
+ # dcbz cannot do no-cache
+ with m.If(is_dcbz & self.req.nc):
+ m.d.comb += self.req.align_intr.eq(1)
+
+ # hmm, rather than add yet another argument to set_wr_addr
+ # read direct from PortInterface
+ m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+ m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+ m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
return None
- def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+ def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
m.d.comb += self.d_valid.eq(1)
m.d.comb += self.req.load.eq(1) # load operation
m.d.comb += self.req.byte_sel.eq(mask)
- m.d.comb += self.req.align_intr.eq(misalign)
- m.d.comb += self.req.addr.eq(addr)
- m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem ==> priv
- m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
+ m.d.comb += self.req.raddr.eq(addr)
+ m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem ==> priv
+ m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+ m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
+ m.d.comb += self.req.nc.eq(is_nc)
# BAD HACK! disable cacheing on LD when address is 0xCxxx_xxxx
# this is for peripherals. same thing done in Microwatt loadstore1.vhdl
with m.If(addr[28:] == Const(0xc, 4)):
# option to disable the cache entirely for read
if self.disable_cache:
m.d.comb += self.req.nc.eq(1)
+ with m.If(misalign):
+ # need two reads: prepare next address in advance
+ m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+ m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
+
+ # hmm, rather than add yet another argument to set_rd_addr
+ # read direct from PortInterface
+ m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+ m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+ m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
return None #FIXME return value
def set_wr_data(self, m, data, wen):
# put data into comb which is picked up in main elaborate()
m.d.comb += self.d_w_valid.eq(1)
m.d.comb += self.store_data.eq(data)
- #m.d.sync += self.d_out.byte_sel.eq(wen) # this might not be needed
+ m.d.sync += self.store_data2.eq(data[64:128])
st_ok = self.done # TODO indicates write data is valid
+ m.d.comb += self.pi.store_done.data.eq(self.d_in.store_done)
+ m.d.comb += self.pi.store_done.ok.eq(1)
return st_ok
def get_rd_data(self, m):
# microwatt takes one more cycle before next operation can be issued
sync += self.done_delay.eq(self.done)
- sync += self.load_data_delay.eq(self.load_data)
+ #sync += self.load_data_delay[0:64].eq(self.load_data[0:64])
- # create dcache module
+ # create dcache and icache module
m.submodules.dcache = dcache = self.dcache
+ m.submodules.icache = icache = self.icache
# temp vars
d_out, d_in, dbus = self.d_out, self.d_in, self.dbus
+ i_out, i_in, ibus = self.i_out, self.i_in, self.ibus
m_out, m_in = self.m_out, self.m_in
exc = self.pi.exc_o
exception = exc.happened
mmureq = Signal()
- # copy of address, but gets over-ridden for OP_FETCH_FAILED
+ # copy of address, but gets over-ridden for instr_fault
maddr = Signal(64)
- m.d.comb += maddr.eq(self.addr)
+ m.d.comb += maddr.eq(self.raddr)
+
+ # check for LR/SC misalignment, used in set_rd/wr_addr above
+ comb += self.lrsc_misalign.eq(((self.pi.data_len[0:3]-1) &
+ self.req.raddr[0:3]).bool())
+ with m.If(self.lrsc_misalign & self.req.reserve):
+ m.d.comb += self.req.align_intr.eq(1)
# create a blip (single pulse) on valid read/write request
# this can be over-ridden in the FSM to get dcache to re-run
# a request when MMU_LOOKUP completes.
m.d.comb += self.d_validblip.eq(rising_edge(m, self.d_valid))
ldst_r = LDSTRequest("ldst_r")
- comb += Display("MMUTEST: LoadStore1 d_in.error=%i",d_in.error)
+ sync += Display("MMUTEST: LoadStore1 d_in.error=%i",d_in.error)
# fsm skeleton
with m.Switch(self.state):
with m.Case(State.IDLE):
- with m.If(self.d_validblip & ~exc.happened):
+ sync += self.load_data_delay.eq(0) # clear out
+ with m.If((self.d_validblip | self.instr_fault) &
+ ~exc.happened):
comb += self.busy.eq(1)
sync += self.state.eq(State.ACK_WAIT)
sync += ldst_r.eq(self.req) # copy of LDSTRequest on "blip"
-# sync += Display("validblip self.req.virt_mode=%i",
-# self.req.virt_mode)
+ # sync += Display("validblip self.req.virt_mode=%i",
+ # self.req.virt_mode)
+ with m.If(self.instr_fault):
+ comb += mmureq.eq(1)
+ sync += self.r_instr_fault.eq(1)
+ comb += maddr.eq(self.maddr)
+ sync += self.state.eq(State.MMU_LOOKUP)
+ with m.Else():
+ sync += self.r_instr_fault.eq(0)
+ # if the LD/ST requires two dwords, move to waiting
+ # for first word
+ with m.If(self.req.alignstate == Misalign.NEED2WORDS):
+ sync += ldst_r.alignstate.eq(Misalign.WAITFIRST)
with m.Else():
sync += ldst_r.eq(0)
# waiting for completion
with m.Case(State.ACK_WAIT):
- comb += Display("MMUTEST: ACK_WAIT")
+ sync += Display("MMUTEST: ACK_WAIT")
comb += self.busy.eq(~exc.happened)
with m.If(d_in.error):
sync += self.state.eq(State.IDLE)
sync += ldst_r.eq(0)
sync += Display("cache error -> update dsisr")
- #sync += self.dsisr[63 - 38].eq(~self.load)
+ sync += self.dsisr[63 - 38].eq(~ldst_r.load)
# XXX there is no architected bit for this
# (probably should be a machine check in fact)
- #sync += self.dsisr[63 - 35].eq(d_in.cache_paradox)
+ sync += self.dsisr[63 - 35].eq(d_in.cache_paradox)
+ sync += self.r_instr_fault.eq(0)
with m.Else():
# Look up the translation for TLB miss
comb += mmureq.eq(1)
sync += self.state.eq(State.MMU_LOOKUP)
with m.If(d_in.valid):
- m.d.comb += self.done.eq(~mmureq) # done if not doing MMU
with m.If(self.done):
- sync += Display("ACK_WAIT, done %x", self.addr)
- sync += self.state.eq(State.IDLE)
- sync += ldst_r.eq(0)
- with m.If(self.load):
- m.d.comb += self.load_data.eq(d_in.data)
+ sync += Display("ACK_WAIT, done %x", self.raddr)
+ with m.If(ldst_r.alignstate == Misalign.ONEWORD):
+ # done if there is only one dcache operation
+ sync += self.state.eq(State.IDLE)
+ sync += ldst_r.eq(0)
+ with m.If(ldst_r.load):
+ m.d.comb += self.load_data.eq(d_in.data)
+ sync += self.load_data_delay[0:64].eq(d_in.data)
+ m.d.comb += self.done.eq(~mmureq) # done if not MMU
+ with m.Elif(ldst_r.alignstate == Misalign.WAITFIRST):
+ # first LD done: load data, initiate 2nd request.
+ # leave in ACK_WAIT state
+ with m.If(ldst_r.load):
+ m.d.comb += self.load_data[0:63].eq(d_in.data)
+ sync += self.load_data_delay[0:64].eq(d_in.data)
+ with m.Else():
+ m.d.sync += d_out.data.eq(self.store_data2)
+ # mmm kinda cheating, make a 2nd blip.
+ # use an aligned version of the address
+ m.d.comb += self.d_validblip.eq(1)
+ comb += self.req.eq(ldst_r) # from copy of request
+ comb += self.req.raddr.eq(self.next_addr)
+ comb += self.req.byte_sel.eq(ldst_r.byte_sel[8:])
+ comb += self.req.alignstate.eq(Misalign.WAITSECOND)
+ sync += ldst_r.raddr.eq(self.next_addr)
+ sync += ldst_r.byte_sel.eq(ldst_r.byte_sel[8:])
+ sync += ldst_r.alignstate.eq(Misalign.WAITSECOND)
+ sync += Display(" second req %x", self.req.raddr)
+ with m.Elif(ldst_r.alignstate == Misalign.WAITSECOND):
+ sync += Display(" done second %x", d_in.data)
+ # done second load
+ sync += self.state.eq(State.IDLE)
+ sync += ldst_r.eq(0)
+ with m.If(ldst_r.load):
+ m.d.comb += self.load_data[64:128].eq(d_in.data)
+ sync += self.load_data_delay[64:128].eq(d_in.data)
+ m.d.comb += self.done.eq(~mmureq) # done if not MMU
# waiting here for the MMU TLB lookup to complete.
# either re-try the dcache lookup or throw MMU exception
with m.Case(State.MMU_LOOKUP):
- comb += self.busy.eq(1)
+ comb += self.busy.eq(~exception)
with m.If(m_in.done):
- with m.If(~self.instr_fault):
+ with m.If(~self.r_instr_fault):
sync += Display("MMU_LOOKUP, done %x -> %x",
- self.addr, d_out.addr)
+ self.raddr, d_out.addr)
# retry the request now that the MMU has
# installed a TLB entry, if not exception raised
m.d.comb += self.d_out.valid.eq(~exception)
sync += self.state.eq(State.ACK_WAIT)
- sync += ldst_r.eq(0)
with m.Else():
- sync += Display("MMU_LOOKUP, exception %x", self.addr)
- # instruction lookup fault: store address in DAR
- comb += exc.happened.eq(1) # reason = MMU_LOOKUP
- # mark dar as updated ?
- comb += self.pi.dar_o.eq(self.addr)
sync += self.state.eq(State.IDLE)
+ sync += self.r_instr_fault.eq(0)
+ comb += self.done.eq(1)
with m.If(m_in.err):
- # MMU RADIX exception thrown
+ # MMU RADIX exception thrown. XXX
+ # TODO: critical that the write here has to
+ # notify the MMU FSM of the change to dsisr
comb += exception.eq(1)
+ comb += self.done.eq(1)
sync += Display("MMU RADIX exception thrown")
- #sync += self.dsisr[63 - 33].eq(m_in.invalid)
- #sync += self.dsisr[63 - 36].eq(m_in.perm_error)
- #sync += self.dsisr[63 - 38].eq(self.load)
- #sync += self.dsisr[63 - 44].eq(m_in.badtree)
- #sync += self.dsisr[63 - 45].eq(m_in.rc_error)
+ sync += self.dsisr[63 - 33].eq(m_in.invalid)
+ sync += self.dsisr[63 - 36].eq(m_in.perm_error) # noexec
+ sync += self.dsisr[63 - 38].eq(~ldst_r.load)
+ sync += self.dsisr[63 - 44].eq(m_in.badtree)
+ sync += self.dsisr[63 - 45].eq(m_in.rc_error)
sync += self.state.eq(State.IDLE)
+ # exception thrown, clear out instruction fault state
+ sync += self.r_instr_fault.eq(0)
- with m.Case(State.TLBIE_WAIT):
- pass
+ # MMU FSM communicating a request to update DSISR or DAR (OP_MTSPR)
+ with m.If(self.mmu_set_spr):
+ with m.If(self.mmu_set_dsisr):
+ sync += self.dsisr.eq(self.sprval_in)
+ with m.If(self.mmu_set_dar):
+ sync += self.dar.eq(self.sprval_in)
- # alignment error: store address in DAR
+ # hmmm, alignment occurs in set_rd_addr/set_wr_addr, note exception
with m.If(self.align_intr):
- comb += exc.happened.eq(1) # reason = alignment
- sync += Display("alignment error: store addr in DAR %x", self.addr)
- comb += self.pi.dar_o.eq(self.addr)
+ comb += exc.happened.eq(1)
+ # check for updating DAR
+ with m.If(exception):
+ sync += Display("exception %x", self.raddr)
+ # alignment error: store address in DAR
+ with m.If(self.align_intr):
+ sync += Display("alignment error: addr in DAR %x", self.raddr)
+ sync += self.dar.eq(self.raddr)
+ with m.Elif(~self.r_instr_fault):
+ sync += Display("not instr fault, addr in DAR %x", self.raddr)
+ sync += self.dar.eq(self.raddr)
# when done or exception, return to idle state
with m.If(self.done | exception):
comb += self.align_intr.eq(self.req.align_intr)
comb += exc.invalid.eq(m_in.invalid)
comb += exc.alignment.eq(self.align_intr)
- comb += exc.instr_fault.eq(self.instr_fault)
+ comb += exc.instr_fault.eq(self.r_instr_fault)
# badtree, perm_error, rc_error, segment_fault
comb += exc.badtree.eq(m_in.badtree)
comb += exc.perm_error.eq(m_in.perm_error)
comb += exc.rc_error.eq(m_in.rc_error)
comb += exc.segment_fault.eq(m_in.segerr)
+ # conditions for 0x400 trap need these in SRR1
+ with m.If(exception & ~exc.alignment & exc.instr_fault):
+ comb += exc.srr1[14].eq(exc.invalid) # 47-33
+ comb += exc.srr1[12].eq(exc.perm_error) # 47-35
+ comb += exc.srr1[3].eq(exc.badtree) # 47-44
+ comb += exc.srr1[2].eq(exc.rc_error) # 47-45
# TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
- comb += dbus.adr.eq(dcache.wb_out.adr)
- comb += dbus.dat_w.eq(dcache.wb_out.dat)
- comb += dbus.sel.eq(dcache.wb_out.sel)
- comb += dbus.cyc.eq(dcache.wb_out.cyc)
- comb += dbus.stb.eq(dcache.wb_out.stb)
- comb += dbus.we.eq(dcache.wb_out.we)
-
- comb += dcache.wb_in.dat.eq(dbus.dat_r)
- comb += dcache.wb_in.ack.eq(dbus.ack)
+ comb += dbus.adr.eq(dcache.bus.adr)
+ comb += dbus.dat_w.eq(dcache.bus.dat_w)
+ comb += dbus.sel.eq(dcache.bus.sel)
+ comb += dbus.cyc.eq(dcache.bus.cyc)
+ comb += dbus.stb.eq(dcache.bus.stb)
+ comb += dbus.we.eq(dcache.bus.we)
+
+ comb += dcache.bus.dat_r.eq(dbus.dat_r)
+ comb += dcache.bus.ack.eq(dbus.ack)
if hasattr(dbus, "stall"):
- comb += dcache.wb_in.stall.eq(dbus.stall)
+ comb += dcache.bus.stall.eq(dbus.stall)
- # update out d data when flag set
+ # update out d data when flag set, for first half (second done in FSM)
with m.If(self.d_w_valid):
m.d.sync += d_out.data.eq(self.store_data)
#with m.Else():
m.d.comb += self.d_out.valid.eq(~exc.happened)
m.d.comb += d_out.load.eq(self.req.load)
m.d.comb += d_out.byte_sel.eq(self.req.byte_sel)
- m.d.comb += self.addr.eq(self.req.addr)
+ m.d.comb += self.raddr.eq(self.req.raddr)
m.d.comb += d_out.nc.eq(self.req.nc)
- # XXX driver conflict. ehn??
- # XXX m.d.comb += d_out.priv_mode.eq(self.req.priv_mode)
- # XXX m.d.comb += d_out.virt_mode.eq(self.req.virt_mode)
+ m.d.comb += d_out.priv_mode.eq(self.req.priv_mode)
+ m.d.comb += d_out.virt_mode.eq(self.req.virt_mode)
+ m.d.comb += d_out.reserve.eq(self.req.reserve)
+ m.d.comb += d_out.atomic.eq(self.req.atomic)
+ m.d.comb += d_out.atomic_last.eq(self.req.atomic_last)
#m.d.comb += Display("validblip dcbz=%i addr=%x",
#self.req.dcbz,self.req.addr)
m.d.comb += d_out.dcbz.eq(self.req.dcbz)
with m.Else():
m.d.comb += d_out.load.eq(ldst_r.load)
m.d.comb += d_out.byte_sel.eq(ldst_r.byte_sel)
- m.d.comb += self.addr.eq(ldst_r.addr)
+ m.d.comb += self.raddr.eq(ldst_r.raddr)
m.d.comb += d_out.nc.eq(ldst_r.nc)
- # XXX driver conflict. ehn??
- # XXX m.d.comb += d_out.priv_mode.eq(ldst_r.priv_mode)
- # XXX m.d.comb += d_out.virt_mode.eq(ldst_r.virt_mode)
+ m.d.comb += d_out.priv_mode.eq(ldst_r.priv_mode)
+ m.d.comb += d_out.virt_mode.eq(ldst_r.virt_mode)
+ m.d.comb += d_out.reserve.eq(ldst_r.reserve)
+ m.d.comb += d_out.atomic.eq(ldst_r.atomic)
+ m.d.comb += d_out.atomic_last.eq(ldst_r.atomic_last)
#m.d.comb += Display("no_validblip dcbz=%i addr=%x",
#ldst_r.dcbz,ldst_r.addr)
m.d.comb += d_out.dcbz.eq(ldst_r.dcbz)
-
- # XXX these should be possible to remove but for some reason
- # cannot be... yet. TODO, investigate
- m.d.comb += self.load_data.eq(d_in.data)
- m.d.comb += d_out.addr.eq(self.addr)
+ m.d.comb += d_out.addr.eq(self.raddr)
# Update outputs to MMU
m.d.comb += m_out.valid.eq(mmureq)
m.d.comb += m_out.iside.eq(self.instr_fault)
m.d.comb += m_out.load.eq(ldst_r.load)
- # m_out.priv <= r.priv_mode; TODO
+ with m.If(self.instr_fault):
+ m.d.comb += m_out.priv.eq(self.priv_mode)
+ with m.Else():
+ m.d.comb += m_out.priv.eq(ldst_r.priv_mode)
m.d.comb += m_out.tlbie.eq(self.tlbie)
# m_out.mtspr <= mmu_mtspr; # TODO
# m_out.sprn <= sprn; # TODO
# LDSTCompUnit is unusual in that it's non-standard to RegSpecAPI
regspec = [('INT', 'o', '0:63'), # RT
('INT', 'o1', '0:63'), # RA (effective address, update mode)
- # TODO, later ('CR', 'cr_a', '0:3'),
+ ('CR', 'cr_a', '0:3'),
# TODO, later ('XER', 'xer_so', '32')
]
def __init__(self, pspec):
class LDSTPipeSpec(CommonPipeSpec):
- regspec = (LDSTInputData.regspec, LDSTOutputData.regspec)
+ regspecklses = (LDSTInputData, LDSTOutputData)
opsubsetkls = CompLDSTOpSubset
def elaborate(self, platform):
m = Module()
perm = Signal(self.width, reset_less=True)
- rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}") for i in range(64)]
- for i in range(64):
- m.d.comb += rb64[i].eq(self.rb[63-i])
+ rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}")
+ for i in range(self.width)]
+ for i in range(self.width):
+ m.d.comb += rb64[i].eq(self.rb[self.width-1-i])
rb64 = Array(rb64)
- for i in range(8):
+ for i in range(self.width//8):
index = self.rs[8*i:8*i+8]
idx = Signal(8, name=f"idx_{i}", reset_less=True)
m.d.comb += idx.eq(index)
- with m.If(idx < 64):
+ with m.If(idx < self.width):
m.d.comb += perm[i].eq(rb64[idx])
m.d.comb += self.ra[0:8].eq(perm)
return m
recwidth += width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = ALUInputStage(pspec)
a = Signal(64)
dut.i.b.eq(b),
a.eq(AnyConst(64)),
b.eq(AnyConst(64))]
-
+
comb += dut.i.ctx.op.eq(rec)
# Assert that op gets copied from the input to output
module = Driver()
self.assertFormal(module, mode="bmc", depth=4)
self.assertFormal(module, mode="cover", depth=4)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
width = p.width
comb += p.eq(AnyConst(width))
- pspec = ALUPipeSpec(id_wid=2)
+ pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = LogicalMainStage(pspec)
# convenience variables
# setup random inputs
comb += [a.eq(AnyConst(64)),
b.eq(AnyConst(64)),
- #carry_in.eq(AnyConst(0b11)),
+ # carry_in.eq(AnyConst(0b11)),
]
comb += dut.i.ctx.op.eq(rec)
comb += a_signed_32.eq(a[0:32])
o_ok = Signal()
- comb += o_ok.eq(1) # will be set to zero if no op takes place
+ comb += o_ok.eq(1) # will be set to zero if no op takes place
# main assertion of arithmetic operations
with m.Switch(rec.insn_type):
comb += peo.eq(32)
with m.Else():
comb += peo.eq(pe32.o)
- with m.If(XO[-1]): # cnttzw
+ with m.If(XO[-1]): # cnttzw
comb += pe32.i.eq(a[0:32])
comb += Assert(o == peo)
- with m.Else(): # cntlzw
+ with m.Else(): # cntlzw
comb += pe32.i.eq(a[0:32][::-1])
comb += Assert(o == peo)
with m.Else():
comb += peo64.eq(64)
with m.Else():
comb += peo64.eq(pe64.o)
- with m.If(XO[-1]): # cnttzd
+ with m.If(XO[-1]): # cnttzd
comb += pe64.i.eq(a[0:64])
comb += Assert(o == peo64)
- with m.Else(): # cntlzd
+ with m.Else(): # cntlzd
comb += pe64.i.eq(a[0:64][::-1])
comb += Assert(o == peo64)
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
# to the output stage
# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+# Copyright (C) 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+
from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
from nmutil.pipemodbase import PipeModBase
from nmutil.clz import CLZ
return LogicalOutputData(self.pspec)
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
op, a, b, o = self.i.ctx.op, self.i.a, self.i.b, self.o.o
comb += o.ok.eq(1) # overridden if no op activates
- m.submodules.bpermd = bpermd = Bpermd(64)
- m.submodules.popcount = popcount = Popcount()
+ m.submodules.bpermd = bpermd = Bpermd(XLEN)
+ m.submodules.popcount = popcount = Popcount(XLEN)
##########################
# main switch for logic ops AND, OR and XOR, cmpb, parity, and popcount
par0 = Signal(reset_less=True)
par1 = Signal(reset_less=True)
comb += par0.eq(Cat(a[0], a[8], a[16], a[24]).xor())
- comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
+ if XLEN == 64:
+ comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
with m.If(op.data_len[3] == 1):
comb += o.data.eq(par0 ^ par1)
with m.Else():
comb += o[0].eq(par0)
- comb += o[32].eq(par1)
+ if XLEN == 64:
+ comb += o[32].eq(par1)
###################
###### cntlz v3.0B p99
count_right = Signal(reset_less=True)
comb += count_right.eq(XO[-1])
- cntz_i = Signal(64, reset_less=True)
+ cntz_i = Signal(XLEN, reset_less=True)
a32 = Signal(32, reset_less=True)
comb += a32.eq(a[0:32])
with m.Else():
comb += cntz_i.eq(Mux(count_right, a[::-1], a))
- m.submodules.clz = clz = CLZ(64)
+ m.submodules.clz = clz = CLZ(XLEN)
comb += clz.sig_in.eq(cntz_i)
comb += o.data.eq(Mux(op.is_32bit, clz.lz-32, clz.lz))
# input (and output) for logical initial stage (common input)
class LogicalInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB/immediate
- ('XER', 'xer_so', '32'), # bit0: so
- ]
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b = self.ra, self.rb
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), # bit0: so
+ ]
# input to logical final stage (common output)
class LogicalOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_so', '32'), # bit0: so
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_so', '32'), # bit0: so
+ ]
+
# output from logical final stage (common output) - note that XER.so
# is *not* included (the only reason it's in the input is because of CR0)
class LogicalOutputDataFinal(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ]
class LogicalPipeSpec(CommonPipeSpec):
- regspec = (LogicalInputData.regspec, LogicalOutputDataFinal.regspec)
+ regspecklses = (LogicalInputData, LogicalOutputDataFinal)
opsubsetkls = CompLogicalOpSubset
class LogicalStages1(PipeModBaseChain):
def get_chain(self):
inp = LogicalInputStage(self.pspec)
+ return [inp]
+
+class LogicalStages2(PipeModBaseChain):
+ def get_chain(self):
main = LogicalMainStage(self.pspec)
- return [inp, main]
+ return [main]
-class LogicalStages2(PipeModBaseChain):
+class LogicalStages3(PipeModBaseChain):
def get_chain(self):
out = LogicalOutputStage(self.pspec)
return [out]
self.pspec = pspec
self.pipe1 = LogicalStages1(pspec)
self.pipe2 = LogicalStages2(pspec)
- self._eqs = self.connect([self.pipe1, self.pipe2])
+ self.pipe3 = LogicalStages3(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
m.submodules.logical_pipe1 = self.pipe1
m.submodules.logical_pipe2 = self.pipe2
+ m.submodules.logical_pipe3 = self.pipe3
m.d.comb += self._eqs
return m
class Popcount(Elaboratable):
- def __init__(self):
- self.a = Signal(64, reset_less=True)
- self.b = Signal(64, reset_less=True)
+ def __init__(self, width=64):
+ self.width = width
+ self.a = Signal(width, reset_less=True)
+ self.b = Signal(width, reset_less=True)
self.data_len = Signal(4, reset_less=True) # data len up to... err.. 8?
- self.o = Signal(64, reset_less=True)
+ self.o = Signal(width, reset_less=True)
+ assert width in [32, 64], "only 32 or 64 bit supported for now"
def elaborate(self, platform):
m = Module()
# creating arrays big enough to store the sum, each time
pc = [a]
# QTY32 2-bit (to take 2x 1-bit sums) etc.
- work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+ work = [(16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+ if self.width == 64:
+ work = [(32, 2)] + work
for l, bw in work: # l=number of add-reductions, bw=bitwidth
pc.append(array_of(l, bw))
- pc8 = pc[3] # array of 8 8-bit counts (popcntb)
- pc32 = pc[5] # array of 2 32-bit counts (popcntw)
+ pc8 = pc[-4] # array of 8 8-bit counts (popcntb)
+ pc32 = pc[-2] # array of 2 32-bit counts (popcntw)
popcnt = pc[-1] # array of 1 64-bit count (popcntd)
# cascade-tree of adds
for idx, (l, bw) in enumerate(work):
# decode operation length (1-hot)
with m.If(data_len == 1):
# popcntb - pack 8x 4-bit answers into 8x 8-bit output fields
- for i in range(8):
+ for i in range(self.width//8):
comb += o[i*8:(i+1)*8].eq(pc8[i])
with m.Elif(data_len == 4):
- # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
- for i in range(2):
- comb += o[i*32:(i+1)*32].eq(pc32[i])
+ if self.width == 64:
+ # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
+ for i in range(2):
+ comb += o[i*32:(i+1)*32].eq(pc32[i])
+ else:
+ comb += o.eq(popcnt[0])
with m.Else():
# popcntd - put 1x 6-bit answer into 64-bit output
comb += o.eq(popcnt[0])
# and place it into i_data.b
inp = yield from get_cu_inputs(dec2, sim)
- print ("set alu inputs", inp)
+ print("set alu inputs", inp)
yield from ALUHelpers.set_int_ra(alu, dec2, inp)
yield from ALUHelpers.set_int_rb(alu, dec2, inp)
yield from ALUHelpers.set_xer_so(alu, dec2, inp)
class LogicalIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = LogicalPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
alu = LogicalBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("logical_pipeline.il", "w") as f:
f.write(vl)
-class TestRunner(FHDLTestCase):
- def __init__(self, test_data):
- super().__init__("run_all")
- self.test_data = test_data
+class TestRunner(unittest.TestCase):
- def execute(self, alu,instruction, pdecode2, test):
+ def execute(self, alu, instruction, pdecode2, test):
print(test.name)
program = test.program
self.subTest(test.name)
simulator, code)
yield Settle()
- def run_all(self):
+ def test_it(self):
+ test_data = LogicalIlangCase().test_data + \
+ LogicalTestCase({'soc'}).test_data
m = Module()
comb = m.d.comb
instruction = Signal(32)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pspec = LogicalPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = LogicalBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
sim.add_clock(1e-6)
def process():
- for test in self.test_data:
+ for test in test_data:
print(test.name)
program = test.program
with self.subTest(test.name):
if __name__ == "__main__":
- unittest.main(exit=False)
- suite = unittest.TestSuite()
- suite.addTest(TestRunner(LogicalIlangCase().test_data))
- suite.addTest(TestRunner(LogicalTestCase().test_data))
-
- runner = unittest.TextTestRunner()
- runner.run(suite)
+ unittest.main()
from soc.fu.ldst.loadstore import LoadStore1, TestSRAMLoadStore1
from nmutil.util import Display
+
class FSMMMUStage(ControlBase):
"""FSM MMU
# set up p/n data
self.p.i_data = MMUInputData(pspec)
self.n.o_data = MMUOutputData(pspec)
+ self.exc_o = self.n.o_data.exception # AllFunctionUnits needs this
self.mmu = MMU()
# incoming PortInterface
self.ldst = ldst
self.dcache = self.ldst.dcache
+ self.icache = self.ldst.icache
self.pi = self.ldst.pi
def elaborate(self, platform):
assert hasattr(self, "dcache"), "remember to call set_ldst_interface"
m = super().elaborate(platform)
comb, sync = m.d.comb, m.d.sync
- dcache = self.dcache
+ dcache, icache = self.dcache, self.icache
+ ldst = self.ldst # managed externally: do not add here
- # link mmu and dcache together
+ # link mmu, dcache and icache together
m.submodules.mmu = mmu = self.mmu
- ldst = self.ldst # managed externally: do not add here
m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+ m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
l_in, l_out = mmu.l_in, mmu.l_out
d_in, d_out = dcache.d_in, dcache.d_out
- wb_out, wb_in = dcache.wb_out, dcache.wb_in
# link ldst and MMU together
comb += l_in.eq(ldst.m_out)
comb += ldst.m_in.eq(l_out)
i_data, o_data = self.p.i_data, self.n.o_data
- a_i, b_i, o, spr1_o = i_data.ra, i_data.rb, o_data.o, o_data.spr1
op = i_data.ctx.op
+ cia_i = op.cia
msr_i = op.msr
- spr1_i = i_data.spr1
-
- # these are set / got here *ON BEHALF* of LoadStore1
- # XXX have to deal with this another way
- # dsisr, dar = ldst.dsisr, ldst.dar
+ a_i, b_i, spr1_i = i_data.ra, i_data.rb, i_data.spr1
+ o, exc_o, spr1_o = o_data.o, o_data.exception, o_data.spr1
# busy/done signals
- busy = Signal()
- done = Signal()
+ busy = Signal(name="mmu_fsm_busy")
+ done = Signal(name="mmu_fsm_done")
m.d.comb += self.n.o_valid.eq(busy & done)
m.d.comb += self.p.o_ready.eq(~busy)
spr = Signal(len(x_fields.SPR))
comb += spr.eq(decode_spr_num(x_fields.SPR))
- # based on MSR bits, set priv and virt mode. TODO: 32-bit mode
- comb += d_in.priv_mode.eq(~msr_i[MSR.PR])
- comb += d_in.virt_mode.eq(msr_i[MSR.DR])
- #comb += d_in.mode_32bit.eq(msr_i[MSR.SF]) # ?? err
-
# ok so we have to "pulse" the MMU (or dcache) rather than
# hold the valid hi permanently. guess what this does...
valid = Signal()
# WIP: properly implement MicrOp.OP_MTSPR and MicrOp.OP_MFSPR
with m.Switch(op.insn_type):
+
+ ##########
+ # OP_MTSPR
+ ##########
+
with m.Case(MicrOp.OP_MTSPR):
comb += Display("MMUTEST: OP_MTSPR: spr=%i", spr)
# despite redirection this FU **MUST** behave exactly
comb += self.debug0.eq(3)
#if matched update local cached value
#commented out because there is a driver conflict
- #with m.If(spr[0]):
- # sync += dsisr.eq(a_i[:32])
- #with m.Else():
- # sync += dar.eq(a_i)
+ comb += ldst.sprval_in.eq(a_i)
+ comb += ldst.mmu_set_spr.eq(1)
+ with m.If(spr[0]):
+ comb += ldst.mmu_set_dar.eq(1)
+ with m.Else():
+ comb += ldst.mmu_set_dsisr.eq(1)
comb += done.eq(1)
# pass it over to the MMU instead
with m.Else():
comb += l_in.rs.eq(a_i) # incoming operand (RS)
comb += done.eq(1) # FIXME l_out.done
+ ##########
+ # OP_MFSPR
+ ##########
+
with m.Case(MicrOp.OP_MFSPR):
comb += Display("MMUTEST: OP_MFSPR: spr=%i returns=%i",
spr, spr1_i)
- comb += o.data.eq(spr1_i)
+ # partial SPR number decoding perfectly fine
+ with m.If(spr[9] | spr[5]):
+ # identified as an MMU OP_MFSPR, contact the MMU.
+ # interestingly, the read is combinatorial: no need
+ # to set "valid", just set the SPR number
+ comb += l_in.sprn.eq(spr) # which SPR
+ comb += o.data.eq(l_out.sprval)
+ with m.Else():
+ # identified as DSISR or DAR. again: read the SPR
+ # directly, combinatorial access
+ with m.If(spr[0]):
+ comb += o.data.eq(ldst.dar)
+ with m.Else():
+ comb += o.data.eq(ldst.dsisr)
+
comb += o.ok.eq(1)
comb += done.eq(1)
+ ##########
+ # OP_TLBIE
+ ##########
+
with m.Case(MicrOp.OP_TLBIE):
comb += Display("MMUTEST: OP_TLBIE: insn_bits=%i", spr)
# pass TLBIE request to MMU (spec: v3.0B p1034)
comb += done.eq(l_out.done) # zzzz
comb += self.debug0.eq(2)
+ ##########
+ # OP_FETCH_FAILED
+ ##########
+
+ with m.Case(MicrOp.OP_FETCH_FAILED):
+ comb += Display("MMUTEST: OP_FETCH_FAILED: @%x", cia_i)
+ # trigger an instruction fetch failed MMU event.
+ # PowerDecoder2 drops svstate.pc into NIA for us
+ # really, this should be direct communication with the
+ # MMU, rather than going through LoadStore1. but, doing
+ # so allows for the opportunity to prevent LoadStore1
+ # from accepting any other LD/ST requests.
+ comb += valid.eq(1) # start "pulse"
+ comb += ldst.instr_fault.eq(blip)
+ comb += ldst.priv_mode.eq(~msr_i[MSR.PR])
+ comb += ldst.maddr.eq(cia_i)
+ # XXX should not access this!
+ comb += done.eq(ldst.done)
+ comb += self.debug0.eq(3)
+ # LDST unit contains exception data, which (messily)
+ # is copied over, here. not ideal but it will do for now
+ comb += exc_o.eq(ldst.pi.exc_o)
+
+ ############
+ # OP_ILLEGAL
+ ############
+
with m.Case(MicrOp.OP_ILLEGAL):
comb += self.illegal.eq(1)
layout = (('insn_type', MicrOp),
('fn_unit', Function),
('insn', 32),
- ('msr', 64), # TODO: a lot less bits. only need PR, DR, SF
+ ('cia', 64), # for instruction fault (MMU PTE lookup)
+ ('msr', 64), # ditto, to set priv_mode etc.
('zero_a', 1),
)
super().__init__(layout, name=name)
from soc.fu.pipe_data import FUBaseData
from soc.fu.mmu.mmu_input_record import CompMMUOpSubset
from soc.fu.alu.pipe_data import CommonPipeSpec
+from openpower.exceptions import LDSTException
class MMUInputData(FUBaseData):
('SPR', 'spr1', '0:63'), # MMU (slow)
]
def __init__(self, pspec):
- super().__init__(pspec, True)
+ super().__init__(pspec, True, LDSTException)
class MMUPipeSpec(CommonPipeSpec):
- regspec = (MMUInputData.regspec, MMUOutputData.regspec)
+ regspecklses = (MMUInputData, MMUOutputData)
opsubsetkls = CompMMUOpSubset
# libre-soc has own SPR unit
# other instructions here -> must be load/store
- def case_mmu_ldst(self):
+ def cse_dcbz(self):
lst = [
"dcbz 1,2",
+ ]
+
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x2
+ initial_regs[2] = 0x2020
+
+ self.add_case(Program(lst, bigendian),
+ initial_regs, initial_mem={})
+
+ def case_mmu_dar(self):
+ lst = [
+ "mfspr 1, 720", # DAR to reg 1
+ "mtspr 19, 3", # reg 3 to DAR
+ ]
+
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x2
+ initial_regs[3] = 0x5
+
+ initial_sprs = {'DAR': 0x87654321,
+ }
+ self.add_case(Program(lst, bigendian),
+ initial_regs, initial_sprs, initial_mem={})
+
+ def case_mmu_ldst(self):
+ lst = [
+ "dcbz 1,0",
"tlbie 0,0,0,0,0", # RB,RS,RIC,PRS,R
"mtspr 18, 1", # reg 1 to DSISR
"mtspr 19, 2", # reg 2 to DAR
- "mfspr 1, 18", # DSISR to reg 1
- "mfspr 2, 19", # DAR to reg 2
+ "mfspr 5, 18", # DSISR to reg 5
+ "mfspr 6, 19", # DAR to reg 6
"mtspr 48, 3", # set MMU PID
"mtspr 720, 4", # set MMU PRTBL
- "lhz 3, 0(1)" # load some data
+ "lhz 3, 0(1)", # load some data
+ "addi 7, 0, 1"
]
initial_regs = [0] * 32
- initial_regs[3] = 1
+ initial_regs[1] = 0x2
+ initial_regs[2] = 0x2020
+ initial_regs[3] = 5
initial_regs[4] = 0xDEADBEEF
- #initial_regs[1] = 0xDEADBEEF
- #FIXME initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
- initial_sprs = {}
+ initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321,
+ 'PIDR': 0xabcd, 'PRTBL': 0x0def}
self.add_case(Program(lst, bigendian),
- initial_regs, initial_sprs)
+ initial_regs, initial_sprs, initial_mem={})
if __name__ == "__main__":
+ mem = {}
unittest.main(exit=False)
suite = unittest.TestSuite()
- suite.addTest(TestRunner(MMUTestCase().test_data,microwatt_mmu=True))
+ suite.addTest(TestRunner(MMUTestCase().test_data,
+ microwatt_mmu=True,
+ svp64=False,
+ rom=mem))
runner = unittest.TextTestRunner()
runner.run(suite)
debughang = 2
+
class MMUTestCase(TestAccumulatorBase):
# MMU handles MTSPR, MFSPR, DCBZ and TLBIE.
# other instructions here -> must be load/store
def case_mfspr_after_invalid_load(self):
- lst = [ # TODO -- set SPR on both sinulator and port interface
- "mfspr 1, 18", # DSISR to reg 1
- "mfspr 2, 19", # DAR to reg 2
- # TODO -- verify returned sprvals
- ]
+ lst = [ # TODO -- set SPR on both sinulator and port interface
+ "mfspr 1, 18", # DSISR to reg 1
+ "mfspr 2, 19", # DAR to reg 2
+ # TODO -- verify returned sprvals
+ ]
initial_regs = [0] * 32
- #THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
+ # THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
initial_sprs = {}
self.add_case(Program(lst, bigendian),
initial_regs, initial_sprs)
- #def case_ilang(self):
- # pspec = SPRPipeSpec(id_wid=2)
+ # def case_ilang(self):
+ # pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
# alu = SPRBasePipe(pspec)
# vl = rtlil.convert(alu, ports=alu.ports())
# with open("trap_pipeline.il", "w") as f:
vld = yield fsm.n.o_valid
while not vld:
yield
- if debughang: print("not valid -- hang")
+ if debughang:
+ print("not valid -- hang")
vld = yield fsm.n.o_valid
- if debughang==2: vld=1
+ if debughang == 2:
+ vld = 1
yield
def run_all(self):
reg_wid=64)
m.submodules.core = core = NonProductionCore(pspec
- # XXX NO absolutely do not do this.
- # all options must go into the pspec
- #, microwatt_mmu=True
- )
+ # XXX NO absolutely do not do this.
+ # all options must go into the pspec
+ # , microwatt_mmu=True
+ )
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
traces=[]):
sim.run()
+
if __name__ == "__main__":
unittest.main(exit=False)
suite = unittest.TestSuite()
debughang = 1
+
def set_fsm_inputs(alu, dec2, sim):
# TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
# detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
# yield from ALUHelpers.set_spr_spr1(alu, dec2, inp)
overflow = None
- a=None
- b=None
+ a = None
+ b = None
# TODO
if 'xer_so' in inp:
print("xer_so::::::::::::::::::::::::::::::::::::::::::::::::")
so = inp['xer_so']
print(so)
overflow = pia.OverflowFlags(so=bool(so),
- ov=False,
- ov32=False)
+ ov=False,
+ ov32=False)
if 'ra' in inp:
a = inp['ra']
if 'rb' in inp:
def check_fsm_outputs(fsm, pdecode2, sim, code):
# check that MMUOutputData is correct
- return None #TODO
+ return None # TODO
+
+# incomplete test - connect fsm inputs first
+
-#incomplete test - connect fsm inputs first
class MMUIlangCase(TestAccumulatorBase):
- #def case_ilang(self):
- # pspec = SPRPipeSpec(id_wid=2)
+ # def case_ilang(self):
+ # pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
# alu = SPRBasePipe(pspec)
# vl = rtlil.convert(alu, ports=alu.ports())
# with open("trap_pipeline.il", "w") as f:
def __init__(self, test_data):
super().__init__("run_all")
self.test_data = test_data
- #hack here -- all unit tests are affected
+ # hack here -- all unit tests are affected
self.run_all()
def check_fsm_outputs(self, alu, dec2, sim, code, pia_res):
sim_o = {}
res = {}
- #MMUOutputData does not have xer
+ # MMUOutputData does not have xer
yield from ALUHelpers.get_cr_a(res, alu, dec2)
- #yield from ALUHelpers.get_xer_ov(res, alu, dec2)
+ # yield from ALUHelpers.get_xer_ov(res, alu, dec2)
yield from ALUHelpers.get_int_o(res, alu, dec2)
- #yield from ALUHelpers.get_xer_so(res, alu, dec2)
-
+ # yield from ALUHelpers.get_xer_so(res, alu, dec2)
print("res output", res)
yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
- #yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
- #yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
+ # yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
+ # yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
print("sim output", sim_o)
print("power-instruction-analyzer result:")
print(pia_res)
- #if pia_res is not None:
+ # if pia_res is not None:
# with self.subTest(check="pia", sim_o=sim_o, pia_res=str(pia_res)):
# pia_o = pia_res_to_output(pia_res)
# ALUHelpers.check_int_o(self, res, pia_o, code)
# #ALUHelpers.check_xer_so(self, res, pia_o, code)
with self.subTest(check="sim", sim_o=sim_o, pia_res=str(pia_res)):
- #ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
+ # ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
ALUHelpers.check_cr_a(self, res, sim_o, code)
#ALUHelpers.check_xer_ov(self, res, sim_o, code)
#ALUHelpers.check_xer_so(self, res, sim_o, code)
- #oe = yield dec2.e.do.oe.oe
- #oe_ok = yield dec2.e.do.oe.ok
+ # oe = yield dec2.e.do.oe.oe
+ # oe_ok = yield dec2.e.do.oe.ok
#print("oe, oe_ok", oe, oe_ok)
- #if not oe or not oe_ok:
+ # if not oe or not oe_ok:
# # if OE not enabled, XER SO and OV must not be activated
# so_ok = yield alu.n.o_data.xer_so.ok
# ov_ok = yield alu.n.o_data.xer_ov.ok
print("dec2 spr/fast in", fast_out, spr_out)
fn_unit = yield pdecode2.e.do.fn_unit
- #FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
+ # FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
pia_res = yield from set_fsm_inputs(fsm, pdecode2, sim)
yield
opname = code.split(' ')[0]
index = pc//4
print("pc after %08x" % (pc))
- vld = yield fsm.n.o_valid #fsm
+ vld = yield fsm.n.o_valid # fsm
while not vld:
yield
if debughang:
print("not valid -- hang")
return
vld = yield fsm.n.o_valid
- if debughang==2: vld=1
+ if debughang == 2:
+ vld = 1
yield
yield from self.check_fsm_outputs(fsm, pdecode2, sim, code, pia_res)
comb = m.d.comb
instruction = Signal(32)
- pspec = TestMemPspec(addr_wid=48,
+ pspec = TestMemPspec(addr_wid=64,
mask_wid=8,
reg_wid=64,
)
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pipe_spec = MMUPipeSpec(id_wid=2)
+ pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
ldst = LoadStore1(pspec)
fsm = FSMMMUStage(pipe_spec)
fsm.set_ldst_interface(ldst)
m.submodules.fsm = fsm
m.submodules.ldst = ldst
- #FIXME connect fsm inputs
+ # FIXME connect fsm inputs
comb += fsm.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
comb += fsm.p.i_valid.eq(1)
traces=[]):
sim.run()
+
if __name__ == "__main__":
unittest.main(exit=False)
suite = unittest.TestSuite()
# set up the mul stages. do not add them to m.submodules, this
# is handled by StageChain.setup().
- pspec = MulPipeSpec(id_wid=2)
+ pspec = MulPipeSpec(id_wid=2, parent_pspec=None)
pipe1 = MulMainStage1(pspec)
pipe2 = MulMainStage2(pspec)
pipe3 = MulMainStage3(pspec)
- class Dummy: pass
- dut = Dummy() # make a class into which dut.i and dut.o can be dropped
+ class Dummy:
+ pass
+ dut = Dummy() # make a class into which dut.i and dut.o can be dropped
dut.i = pipe1.ispec()
- chain = [pipe1, pipe2, pipe3] # chain of 3 mul stages
+ chain = [pipe1, pipe2, pipe3] # chain of 3 mul stages
- StageChain(chain).setup(m, dut.i) # input linked here, through chain
- dut.o = chain[-1].o # output is the last thing in the chain...
+ StageChain(chain).setup(m, dut.i) # input linked here, through chain
+ dut.o = chain[-1].o # output is the last thing in the chain...
# convenience variables
a = dut.i.ra
# setup random inputs
comb += [a.eq(AnyConst(64)),
b.eq(AnyConst(64)),
- ]
+ ]
comb += dut.i.ctx.op.eq(rec)
###### HI-32 #####
with m.Case(MicrOp.OP_MUL_H32):
- comb += Assume(rec.is_32bit) # OP_MUL_H32 is a 32-bit op
+ comb += Assume(rec.is_32bit) # OP_MUL_H32 is a 32-bit op
exp_prod = Signal(64)
expected_o = Signal.like(exp_prod)
# differ, we negate the product. This implies that
# the product is calculated from the absolute values
# of the inputs.
- prod = Signal.like(exp_prod) # intermediate product
+ prod = Signal.like(exp_prod) # intermediate product
comb += prod.eq(abs32_a * abs32_b)
comb += exp_prod.eq(Mux(ab32_sne, -prod, prod))
comb += expected_o.eq(Repl(exp_prod[32:64], 2))
# differ, we negate the product. This implies that
# the product is calculated from the absolute values
# of the inputs.
- prod = Signal.like(exp_prod) # intermediate product
+ prod = Signal.like(exp_prod) # intermediate product
comb += prod.eq(abs64_a * abs64_b)
comb += exp_prod.eq(Mux(ab64_sne, -prod, prod))
comb += Assert(o[0:64] == exp_prod[64:128])
module = Driver()
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
+
def test_ilang(self):
dut = Driver()
vl = rtlil.convert(dut, ports=[])
class MulOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:128'),
- ('XER', 'xer_so', '32')] # XER bit 32: SO
def __init__(self, pspec):
super().__init__(pspec, False) # still input style
self.data.append(self.neg_res)
self.data.append(self.neg_res32)
+ @property
+ def regspec(self):
+ return [('INT', 'o', "0:%d" % (self.pspec.XLEN*2)), # 2xXLEN
+ ('XER', 'xer_so', '32')] # XER bit 32: SO
+
class MulPipeSpec(CommonPipeSpec):
- regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+ regspecklses = (DivInputData, DivMulOutputData)
opsubsetkls = CompMULOpSubset
return MulIntermediateData(self.pspec) # pipeline stage output format
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
comb += is_32bit.eq(op.is_32bit)
# work out if a/b are negative (check 32-bit / signed)
- comb += sign_a.eq(Mux(op.is_32bit, a[31], a[63]) & op.is_signed)
- comb += sign_b.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+ comb += sign_a.eq(Mux(op.is_32bit, a[31], a[XLEN-1]) & op.is_signed)
+ comb += sign_b.eq(Mux(op.is_32bit, b[31], b[XLEN-1]) & op.is_signed)
comb += sign32_a.eq(a[31] & op.is_signed)
comb += sign32_b.eq(b[31] & op.is_signed)
# negation of a 64-bit value produces the same lower 32-bit
# result as negation of just the lower 32-bits, so we don't
# need to do anything special before negating
- abs_a = Signal(64, reset_less=True)
- abs_b = Signal(64, reset_less=True)
+ abs_a = Signal(XLEN, reset_less=True)
+ abs_b = Signal(XLEN, reset_less=True)
comb += abs_a.eq(Mux(sign_a, -a, a))
comb += abs_b.eq(Mux(sign_b, -b, b))
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = MulPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = MulBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
class TestPipeLong(MulTestHelper):
def test_mul_pipe_2_arg(self):
- self.run_all(MulTestCases2Arg().test_data, "mul_pipe_caller_long_2_arg",
- has_third_input=False)
+ self.run_all(MulTestCases2Arg({'soc'}).test_data,
+ "mul_pipe_caller_long_2_arg", has_third_input=False)
def helper_3_arg(self, subtest_index):
- self.run_all(MulTestCases3Arg(subtest_index).test_data,
+ self.run_all(MulTestCases3Arg(subtest_index, {'soc'}).test_data,
f"mul_pipe_caller_long_3_arg_{subtest_index}",
has_third_input=True)
class TestPipeIlang(unittest.TestCase):
def write_ilang(self):
- pspec = MulPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
alu = MulBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("mul_pipeline.il", "w") as f:
"""
def __init__(self, pspec, output, exc_kls=None):
- self.ctx = PipeContext(pspec) # context for ReservationStation usage
+ self.pspec = pspec
+ self.ctx = PipeContext(pspec) # context for ReservationStation usage
self.muxid = self.ctx.muxid
self.data = []
self.is_output = output
# take regspec and create data attributes (in or out)
# TODO: use widspec to create reduced bit mapping.
+ print (self.regspec)
for i, (regfile, regname, widspec) in enumerate(self.regspec):
wid = get_regspec_bitwidth([self.regspec], 0, i)
if output:
if hasattr(self, "exception"):
yield from self.exception.ports()
+ # convenience function to return 0:63 if XLEN=64, 0:31 if XLEN=32 etc.
+ @property
+ def intrange(self):
+ return "0:%d" % (self.pspec.XLEN-1)
+
def eq(self, i):
eqs = [self.ctx.eq(i.ctx)]
assert len(self.data) == len(i.data), \
- "length of %s mismatch against %s: %s %s" % \
- (repr(self), repr(i), repr(self.data), repr(i.data))
+ "length of %s mismatch against %s: %s %s" % \
+ (repr(self), repr(i), repr(self.data), repr(i.data))
for j in range(len(self.data)):
assert type(self.data[j]) == type(i.data[j]), \
- "type mismatch in FUBaseData %s %s" % \
- (repr(self.data[j]), repr(i.data[j]))
+ "type mismatch in FUBaseData %s %s" % \
+ (repr(self.data[j]), repr(i.data[j]))
eqs.append(self.data[j].eq(i.data[j]))
if hasattr(self, "exception"):
eqs.append(self.exception.eq(i.exception))
return eqs
def ports(self):
- return self.ctx.ports() # TODO: include self.data
+ return self.ctx.ports() # TODO: include self.data
# hmmm there has to be a better way than this
"""CommonPipeSpec: base class for all pipeline specifications
see README.md for explanation of members.
"""
- def __init__(self, id_wid):
+
+ def __init__(self, id_wid, parent_pspec):
self.pipekls = SimpleHandshakeRedir
self.id_wid = id_wid
self.opkls = lambda _: self.opsubsetkls()
- self.op_wid = get_rec_width(self.opkls(None)) # hmm..
+ self.op_wid = get_rec_width(self.opkls(None)) # hmm..
self.stage = None
+ self.parent_pspec = parent_pspec
+
+ # forward attributes from parent_pspec
+ def __getattr__(self, name):
+ return getattr(self.parent_pspec, name)
+
+
+def get_pspec_draft_bitmanip(pspec):
+ """ True if the draft bitmanip instructions are enabled in the provided
+ pspec. The instructions enabled by this are draft instructions -- they are
+ not official OpenPower instructions, they are intended to be eventually
+ submitted to the OpenPower ISA WG.
+
+ https://libre-soc.org/openpower/sv/bitmanip/
+ """
+ # use `is True` to account for Mock absurdities
+ return getattr(pspec, "draft_bitmanip", False) is True
class RegSpec:
def __init__(self, rwid, n_src=None, n_dst=None, name=None):
self._rwid = rwid
+ print ("RegSpec", rwid)
if isinstance(rwid, int):
# rwid: integer (covers all registers)
self._n_src, self._n_dst = n_src, n_dst
-# Proof of correctness for partitioned equal signal combiner
+# Proof of correctness for shift/rotate FU
# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
"""
Links:
* https://bugs.libre-soc.org/show_bug.cgi?id=340
+
+run tests with:
+pip install pytest
+pip install pytest-xdist
+pytest -n auto src/soc/fu/shift_rot/formal/proof_main_stage.py
+because that tells pytest to run the tests in parallel, it will take a few
+minutes instead of an hour.
"""
+import unittest
+import enum
from nmigen import (Module, Signal, Elaboratable, Mux, Cat, Repl,
- signed)
-from nmigen.asserts import Assert, AnyConst, Assume, Cover
+ signed, Const, unsigned)
+from nmigen.asserts import Assert, AnyConst, Assume
from nmutil.formaltest import FHDLTestCase
-from nmigen.cli import rtlil
+from nmutil.sim_util import do_sim
+from nmigen.sim import Delay
from soc.fu.shift_rot.main_stage import ShiftRotMainStage
-from soc.fu.shift_rot.rotator import right_mask, left_mask
from soc.fu.shift_rot.pipe_data import ShiftRotPipeSpec
-from soc.fu.shift_rot.sr_input_record import CompSROpSubset
from openpower.decoder.power_enums import MicrOp
-from openpower.consts import field
-import unittest
-from nmutil.extend import exts
+
+@enum.unique
+class TstOp(enum.Enum):
+ """ops we're testing, the idea is if we run a separate formal proof for
+ each instruction, we end up covering them all and each runs much faster,
+ also the formal proofs can be run in parallel."""
+ SHL = MicrOp.OP_SHL
+ SHR = MicrOp.OP_SHR
+ RLC32 = MicrOp.OP_RLC, 32
+ RLC64 = MicrOp.OP_RLC, 64
+ RLCL = MicrOp.OP_RLCL
+ RLCR = MicrOp.OP_RLCR
+ EXTSWSLI = MicrOp.OP_EXTSWSLI
+ TERNLOG = MicrOp.OP_TERNLOG
+ # grev removed -- leaving code for later use in grevlut
+ # GREV32 = MicrOp.OP_GREV, 32
+ # GREV64 = MicrOp.OP_GREV, 64
+
+ @property
+ def op(self):
+ if isinstance(self.value, tuple):
+ return self.value[0]
+ return self.value
+
+
+def eq_any_const(sig: Signal):
+ return sig.eq(AnyConst(sig.shape(), src_loc_at=1))
+
+
+class Mask(Elaboratable):
+ # copied from qemu's mask fn:
+ # https://gitlab.com/qemu-project/qemu/-/blob/477c3b934a47adf7de285863f59d6e4503dd1a6d/target/ppc/internal.h#L21
+ def __init__(self):
+ self.start = Signal(6)
+ self.end = Signal(6)
+ self.out = Signal(64)
+
+ def elaborate(self, platform):
+ m = Module()
+ max_val = Const(~0, unsigned(64))
+ max_bit = 63
+ with m.If(self.start == 0):
+ m.d.comb += self.out.eq(max_val << (max_bit - self.end))
+ with m.Elif(self.end == max_bit):
+ m.d.comb += self.out.eq(max_val >> self.start)
+ with m.Else():
+ ret = (max_val >> self.start) ^ ((max_val >> self.end) >> 1)
+ m.d.comb += self.out.eq(Mux(self.start > self.end, ~ret, ret))
+ return m
+
+
+class TstMask(unittest.TestCase):
+ def test_mask(self):
+ dut = Mask()
+
+ def case(start, end, expected):
+ with self.subTest(start=start, end=end):
+ yield dut.start.eq(start)
+ yield dut.end.eq(end)
+ yield Delay(1e-6)
+ out = yield dut.out
+ with self.subTest(out=hex(out), expected=hex(expected)):
+ self.assertEqual(expected, out)
+
+ def process():
+ for start in range(64):
+ for end in range(64):
+ expected = 0
+ if start > end:
+ for i in range(start, 64):
+ expected |= 1 << (63 - i)
+ for i in range(0, end + 1):
+ expected |= 1 << (63 - i)
+ else:
+ for i in range(start, end + 1):
+ expected |= 1 << (63 - i)
+ yield from case(start, end, expected)
+ with do_sim(self, dut, [dut.start, dut.end, dut.out]) as sim:
+ sim.add_process(process)
+ sim.run()
+
+
+def rotl64(v, amt):
+ v |= Const(0, 64) # convert to value at least 64-bits wide
+ amt |= Const(0, 6) # convert to value at least 6-bits wide
+ return (Cat(v[:64], v[:64]) >> (64 - amt[:6]))[:64]
+
+
+def rotl32(v, amt):
+ v |= Const(0, 32) # convert to value at least 32-bits wide
+ return rotl64(Cat(v[:32], v[:32]), amt)
# This defines a module to drive the device under test and assert
# properties about its outputs
class Driver(Elaboratable):
- def __init__(self):
- # inputs and outputs
- pass
+ def __init__(self, which):
+ assert isinstance(which, TstOp) or which is None
+ self.which = which
def elaborate(self, platform):
m = Module()
comb = m.d.comb
- rec = CompSROpSubset()
- # Setup random inputs for dut.op. do them explicitly so that
- # we can see which ones cause failures in the debug report
- #for p in rec.ports():
- # comb += p.eq(AnyConst(p.width))
- comb += rec.insn_type.eq(AnyConst(rec.insn_type.width))
- comb += rec.fn_unit.eq(AnyConst(rec.fn_unit.width))
- comb += rec.imm_data.imm.eq(AnyConst(rec.imm_data.imm.width))
- comb += rec.imm_data.imm_ok.eq(AnyConst(rec.imm_data.imm_ok.width))
- comb += rec.rc.rc.eq(AnyConst(rec.rc.rc.width))
- comb += rec.rc.rc_ok.eq(AnyConst(rec.rc.rc_ok.width))
- comb += rec.oe.oe.eq(AnyConst(rec.oe.oe.width))
- comb += rec.oe.oe_ok.eq(AnyConst(rec.oe.oe_ok.width))
- comb += rec.write_cr0.eq(AnyConst(rec.write_cr0.width))
- comb += rec.input_carry.eq(AnyConst(rec.input_carry.width))
- comb += rec.output_carry.eq(AnyConst(rec.output_carry.width))
- comb += rec.input_cr.eq(AnyConst(rec.input_cr.width))
- comb += rec.is_32bit.eq(AnyConst(rec.is_32bit.width))
- comb += rec.is_signed.eq(AnyConst(rec.is_signed.width))
- comb += rec.insn.eq(AnyConst(rec.insn.width))
-
-
- pspec = ShiftRotPipeSpec(id_wid=2)
+ pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=None)
+ pspec.draft_bitmanip = True
m.submodules.dut = dut = ShiftRotMainStage(pspec)
- # convenience variables
- rs = dut.i.rs # register to shift
- b = dut.i.rb # register containing amount to shift by
- ra = dut.i.a # source register if masking is to be done
- carry_in = dut.i.xer_ca[0]
- carry_in32 = dut.i.xer_ca[1]
- carry_out = dut.o.xer_ca
- o = dut.o.o.data
- print ("fields", rec.fields)
- itype = rec.insn_type
-
- # instruction fields
- m_fields = dut.fields.FormM
- md_fields = dut.fields.FormMD
-
- # setup random inputs
- comb += rs.eq(AnyConst(64))
- comb += ra.eq(AnyConst(64))
- comb += b.eq(AnyConst(64))
- comb += carry_in.eq(AnyConst(1))
- comb += carry_in32.eq(AnyConst(1))
-
- # copy operation
- comb += dut.i.ctx.op.eq(rec)
+ # Set inputs to formal variables
+ comb += [
+ eq_any_const(dut.i.ctx.op.insn_type),
+ eq_any_const(dut.i.ctx.op.fn_unit),
+ eq_any_const(dut.i.ctx.op.imm_data.data),
+ eq_any_const(dut.i.ctx.op.imm_data.ok),
+ eq_any_const(dut.i.ctx.op.rc.rc),
+ eq_any_const(dut.i.ctx.op.rc.ok),
+ eq_any_const(dut.i.ctx.op.oe.oe),
+ eq_any_const(dut.i.ctx.op.oe.ok),
+ eq_any_const(dut.i.ctx.op.write_cr0),
+ eq_any_const(dut.i.ctx.op.input_carry),
+ eq_any_const(dut.i.ctx.op.output_carry),
+ eq_any_const(dut.i.ctx.op.input_cr),
+ eq_any_const(dut.i.ctx.op.is_32bit),
+ eq_any_const(dut.i.ctx.op.is_signed),
+ eq_any_const(dut.i.ctx.op.insn),
+ eq_any_const(dut.i.xer_ca),
+ eq_any_const(dut.i.ra),
+ eq_any_const(dut.i.rb),
+ eq_any_const(dut.i.rc),
+ ]
# check that the operation (op) is passed through (and muxid)
comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
- # signed and signed/32 versions of input rs
- a_signed = Signal(signed(64))
- a_signed_32 = Signal(signed(32))
- comb += a_signed.eq(rs)
- comb += a_signed_32.eq(rs[0:32])
-
- # masks: start-left
- mb = Signal(7, reset_less=True)
- ml = Signal(64, reset_less=True)
-
- # clear left?
- with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCL)):
- with m.If(rec.is_32bit):
- comb += mb.eq(m_fields.MB)
- with m.Else():
- comb += mb.eq(md_fields.mb)
- with m.Else():
- with m.If(rec.is_32bit):
- comb += mb.eq(b[0:6])
- with m.Else():
- comb += mb.eq(b+32)
- comb += ml.eq(left_mask(m, mb))
-
- # masks: end-right
- me = Signal(7, reset_less=True)
- mr = Signal(64, reset_less=True)
-
- # clear right?
- with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCR)):
- with m.If(rec.is_32bit):
- comb += me.eq(m_fields.ME)
- with m.Else():
- comb += me.eq(md_fields.me)
- with m.Else():
- with m.If(rec.is_32bit):
- comb += me.eq(b[0:6])
- with m.Else():
- comb += me.eq(63-b)
- comb += mr.eq(right_mask(m, me))
-
- # must check Data.ok
- o_ok = Signal()
- comb += o_ok.eq(1)
-
- # main assertion of arithmetic operations
- with m.Switch(itype):
-
- # left-shift: 64/32-bit
- with m.Case(MicrOp.OP_SHL):
- comb += Assume(ra == 0)
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
- comb += Assert(o[32:64] == 0)
- with m.Else():
- comb += Assert(o == ((rs << b[0:7]) & ((1 << 64)-1)))
-
- # right-shift: 64/32-bit / signed
- with m.Case(MicrOp.OP_SHR):
- comb += Assume(ra == 0)
- with m.If(~rec.is_signed):
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == (rs[0:32] >> b[0:6]))
- comb += Assert(o[32:64] == 0)
- with m.Else():
- comb += Assert(o == (rs >> b[0:7]))
- with m.Else():
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == (a_signed_32 >> b[0:6]))
- comb += Assert(o[32:64] == Repl(rs[31], 32))
- with m.Else():
- comb += Assert(o == (a_signed >> b[0:7]))
-
- # extswsli: 32/64-bit moded
- with m.Case(MicrOp.OP_EXTSWSLI):
- comb += Assume(ra == 0)
- with m.If(rec.is_32bit):
- comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
- comb += Assert(o[32:64] == 0)
- with m.Else():
- # sign-extend to 64 bit
- a_s = Signal(64, reset_less=True)
- comb += a_s.eq(exts(rs, 32, 64))
- comb += Assert(o == ((a_s << b[0:7]) & ((1 << 64)-1)))
-
- # rlwinm, rlwnm, rlwimi
- # *CAN* these even be 64-bit capable? I don't think they are.
- with m.Case(MicrOp.OP_RLC):
- comb += Assume(ra == 0)
- comb += Assume(rec.is_32bit)
-
- # Duplicate some signals so that they're much easier to find
- # in gtkwave.
- # Pro-tip: when debugging, factor out expressions into
- # explicitly named
- # signals, and search using a unique grep-tag (RLC in my case).
- # After
- # debugging, resubstitute values to comply with surrounding
- # code norms.
-
- mrl = Signal(64, reset_less=True, name='MASK_FOR_RLC')
- with m.If(mb > me):
- comb += mrl.eq(ml | mr)
- with m.Else():
- comb += mrl.eq(ml & mr)
-
- ainp = Signal(64, reset_less=True, name='A_INP_FOR_RLC')
- comb += ainp.eq(field(rs, 32, 63))
-
- sh = Signal(6, reset_less=True, name='SH_FOR_RLC')
- comb += sh.eq(b[0:6])
-
- exp_shl = Signal(64, reset_less=True,
- name='A_SHIFTED_LEFT_BY_SH_FOR_RLC')
- comb += exp_shl.eq((ainp << sh) & 0xFFFFFFFF)
-
- exp_shr = Signal(64, reset_less=True,
- name='A_SHIFTED_RIGHT_FOR_RLC')
- comb += exp_shr.eq((ainp >> (32 - sh)) & 0xFFFFFFFF)
-
- exp_rot = Signal(64, reset_less=True,
- name='A_ROTATED_LEFT_FOR_RLC')
- comb += exp_rot.eq(exp_shl | exp_shr)
-
- exp_ol = Signal(32, reset_less=True, name='EXPECTED_OL_FOR_RLC')
- comb += exp_ol.eq(field((exp_rot & mrl) | (ainp & ~mrl),
- 32, 63))
-
- act_ol = Signal(32, reset_less=True, name='ACTUAL_OL_FOR_RLC')
- comb += act_ol.eq(field(o, 32, 63))
-
- # If I uncomment the following lines, I can confirm that all
- # 32-bit rotations work. If I uncomment only one of the
- # following lines, I can confirm that all 32-bit rotations
- # work. When I remove/recomment BOTH lines, however, the
- # assertion fails. Why??
-
-# comb += Assume(mr == 0xFFFFFFFF)
-# comb += Assume(ml == 0xFFFFFFFF)
- #with m.If(rec.is_32bit):
- # comb += Assert(act_ol == exp_ol)
- # comb += Assert(field(o, 0, 31) == 0)
-
- #TODO
- with m.Case(MicrOp.OP_RLCR):
- pass
- with m.Case(MicrOp.OP_RLCL):
- pass
- with m.Default():
- comb += o_ok.eq(0)
-
- # check that data ok was only enabled when op actioned
- comb += Assert(dut.o.o.ok == o_ok)
+ if self.which is None:
+ for i in TstOp:
+ comb += Assume(dut.i.ctx.op.insn_type != i.op)
+ comb += Assert(~dut.o.o.ok)
+ else:
+ # we're only checking a particular operation:
+ comb += Assume(dut.i.ctx.op.insn_type == self.which.op)
+ comb += Assert(dut.o.o.ok)
+
+ # dispatch to check fn for each op
+ getattr(self, f"_check_{self.which.name.lower()}")(m, dut)
return m
+ def _check_shl(self, m, dut):
+ m.d.comb += Assume(dut.i.ra == 0)
+ expected = Signal(64)
+ with m.If(dut.i.ctx.op.is_32bit):
+ m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:6])[:32])
+ with m.Else():
+ m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:7])[:64])
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_shr(self, m, dut):
+ m.d.comb += Assume(dut.i.ra == 0)
+ expected = Signal(64)
+ carry = Signal()
+ shift_in_s = Signal(signed(128))
+ shift_roundtrip = Signal(signed(128))
+ shift_in_u = Signal(128)
+ shift_amt = Signal(7)
+ with m.If(dut.i.ctx.op.is_32bit):
+ m.d.comb += [
+ shift_amt.eq(dut.i.rb[:6]),
+ shift_in_s.eq(dut.i.rs[:32].as_signed()),
+ shift_in_u.eq(dut.i.rs[:32]),
+ ]
+ with m.Else():
+ m.d.comb += [
+ shift_amt.eq(dut.i.rb[:7]),
+ shift_in_s.eq(dut.i.rs.as_signed()),
+ shift_in_u.eq(dut.i.rs),
+ ]
+
+ with m.If(dut.i.ctx.op.is_signed):
+ m.d.comb += [
+ expected.eq(shift_in_s >> shift_amt),
+ shift_roundtrip.eq((shift_in_s >> shift_amt) << shift_amt),
+ carry.eq((shift_in_s < 0) & (shift_roundtrip != shift_in_s)),
+ ]
+ with m.Else():
+ m.d.comb += [
+ expected.eq(shift_in_u >> shift_amt),
+ carry.eq(0),
+ ]
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == Repl(carry, 2))
+
+ def _check_rlc32(self, m, dut):
+ m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+ # rlwimi, rlwinm, and rlwnm
+
+ m.submodules.mask = mask = Mask()
+ expected = Signal(64)
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl32(dut.i.rs[:32], dut.i.rb[:5]))
+ m.d.comb += mask.start.eq(dut.fields.FormM.MB[:] + 32)
+ m.d.comb += mask.end.eq(dut.fields.FormM.ME[:] + 32)
+
+ # for rlwinm and rlwnm, ra is guaranteed to be 0, so that part of
+ # the expression turns into a no-op
+ m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_rlc64(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ # rldic and rldimi
+
+ # `rb` is always a 6-bit immediate
+ m.d.comb += Assume(dut.i.rb[6:] == 0)
+
+ m.submodules.mask = mask = Mask()
+ expected = Signal(64)
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+ mb = dut.fields.FormMD.mb[:]
+ m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+ m.d.comb += mask.end.eq(63 - dut.i.rb[:6])
+
+ # for rldic, ra is guaranteed to be 0, so that part of
+ # the expression turns into a no-op
+ m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_rlcl(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ # rldicl and rldcl
+
+ m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+ m.d.comb += Assume(dut.i.ra == 0)
+
+ m.submodules.mask = mask = Mask()
+ m.d.comb += mask.end.eq(63)
+ mb = dut.fields.FormMD.mb[:]
+ m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+ expected = Signal(64)
+ m.d.comb += expected.eq(rot & mask.out)
+
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_rlcr(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ # rldicr and rldcr
+
+ m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+ m.d.comb += Assume(dut.i.ra == 0)
+
+ m.submodules.mask = mask = Mask()
+ m.d.comb += mask.start.eq(0)
+ me = dut.fields.FormMD.me[:]
+ m.d.comb += mask.end.eq(Cat(me[1:6], me[0]))
+
+ rot = Signal(64)
+ m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+ expected = Signal(64)
+ m.d.comb += expected.eq(rot & mask.out)
+
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_extswsli(self, m, dut):
+ m.d.comb += Assume(dut.i.ra == 0)
+ m.d.comb += Assume(dut.i.rb[6:] == 0)
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit) # all instrs. are 64-bit
+ expected = Signal(64)
+ m.d.comb += expected.eq((dut.i.rs[0:32].as_signed() << dut.i.rb[:6]))
+ m.d.comb += Assert(dut.o.o.data == expected)
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ def _check_ternlog(self, m, dut):
+ lut = dut.fields.FormTLI.TLI[:]
+ for i in range(64):
+ idx = Cat(dut.i.rb[i], dut.i.ra[i], dut.i.rc[i])
+ for j in range(8):
+ with m.If(j == idx):
+ m.d.comb += Assert(dut.o.o.data[i] == lut[j])
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ # grev removed -- leaving code for later use in grevlut
+ def _check_grev32(self, m, dut):
+ m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+ # assert zero-extended
+ m.d.comb += Assert(dut.o.o.data[32:] == 0)
+ i = Signal(5)
+ m.d.comb += eq_any_const(i)
+ idx = dut.i.rb[0: 5] ^ i
+ m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+ # grev removed -- leaving code for later use in grevlut
+ def _check_grev64(self, m, dut):
+ m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+ i = Signal(6)
+ m.d.comb += eq_any_const(i)
+ idx = dut.i.rb[0: 6] ^ i
+ m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+ m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
class ALUTestCase(FHDLTestCase):
- def test_formal(self):
- module = Driver()
+ def run_it(self, which):
+ module = Driver(which)
self.assertFormal(module, mode="bmc", depth=2)
self.assertFormal(module, mode="cover", depth=2)
- def test_ilang(self):
- dut = Driver()
- vl = rtlil.convert(dut, ports=[])
- with open("main_stage.il", "w") as f:
- f.write(vl)
+
+ def test_none(self):
+ self.run_it(None)
+
+ def test_shl(self):
+ self.run_it(TstOp.SHL)
+
+ def test_shr(self):
+ self.run_it(TstOp.SHR)
+
+ def test_rlc32(self):
+ self.run_it(TstOp.RLC32)
+
+ def test_rlc64(self):
+ self.run_it(TstOp.RLC64)
+
+ def test_rlcl(self):
+ self.run_it(TstOp.RLCL)
+
+ def test_rlcr(self):
+ self.run_it(TstOp.RLCR)
+
+ def test_extswsli(self):
+ self.run_it(TstOp.EXTSWSLI)
+
+ def test_ternlog(self):
+ self.run_it(TstOp.TERNLOG)
+
+ @unittest.skip("grev removed -- leaving code for later use in grevlut")
+ def test_grev32(self):
+ self.run_it(TstOp.GREV32)
+
+ @unittest.skip("grev removed -- leaving code for later use in grevlut")
+ def test_grev64(self):
+ self.run_it(TstOp.GREV64)
+
+
+# check that all test cases are covered
+for i in TstOp:
+ assert callable(getattr(ALUTestCase, f"test_{i.name.lower()}"))
if __name__ == '__main__':
# output stage
from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
from nmutil.pipemodbase import PipeModBase
+from soc.fu.pipe_data import get_pspec_draft_bitmanip
from soc.fu.shift_rot.pipe_data import (ShiftRotOutputData,
ShiftRotInputData)
from nmutil.lut import BitwiseLut
class ShiftRotMainStage(PipeModBase):
def __init__(self, pspec):
super().__init__(pspec, "main")
+ self.draft_bitmanip = get_pspec_draft_bitmanip(pspec)
self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
self.fields.create_specs()
return ShiftRotOutputData(self.pspec)
def elaborate(self, platform):
+ XLEN = self.pspec.XLEN
m = Module()
comb = m.d.comb
op = self.i.ctx.op
o = self.o.o
- bitwise_lut = BitwiseLut(input_count=3, width=64)
- m.submodules.bitwise_lut = bitwise_lut
- comb += bitwise_lut.inputs[0].eq(self.i.rb)
- comb += bitwise_lut.inputs[1].eq(self.i.ra)
- comb += bitwise_lut.inputs[2].eq(self.i.rc)
+ bitwise_lut = None
+ if self.draft_bitmanip:
+ bitwise_lut = BitwiseLut(input_count=3, width=XLEN)
+ m.submodules.bitwise_lut = bitwise_lut
+ comb += bitwise_lut.inputs[0].eq(self.i.rb)
+ comb += bitwise_lut.inputs[1].eq(self.i.ra)
+ comb += bitwise_lut.inputs[2].eq(self.i.rc)
# NOTE: the sh field immediate is read in by PowerDecode2
# (actually DecodeRB), whereupon by way of rb "immediate" mode
comb += mb_extra.eq(md_fields['mb'][0:-1][0])
# set up microwatt rotator module
- m.submodules.rotator = rotator = Rotator()
+ m.submodules.rotator = rotator = Rotator(XLEN)
comb += [
rotator.me.eq(me),
rotator.mb.eq(mb),
comb += o.ok.eq(1) # defaults to enabled
+ # instruction rotate type
+ mode = Signal(4, reset_less=True)
+ comb += Cat(rotator.right_shift,
+ rotator.clear_left,
+ rotator.clear_right,
+ rotator.sign_ext_rs).eq(mode)
+
# outputs from the microwatt rotator module
comb += [o.data.eq(rotator.result_o),
self.o.xer_ca.data.eq(Repl(rotator.carry_out_o, 2))]
- # instruction rotate type
- mode = Signal(4, reset_less=True)
with m.Switch(op.insn_type):
with m.Case(MicrOp.OP_SHL):
comb += mode.eq(0b0000) # L-shift
comb += mode.eq(0b0100) # clear R
with m.Case(MicrOp.OP_EXTSWSLI):
comb += mode.eq(0b1000) # L-ext
- with m.Case(MicrOp.OP_TERNLOG):
- # TODO: this only works for ternaryi, change to get lut value
- # from register when we implement other variants
- comb += bitwise_lut.lut.eq(self.fields.FormTLI.TLI[:])
- comb += o.data.eq(bitwise_lut.output)
- comb += self.o.xer_ca.data.eq(0)
+ if self.draft_bitmanip:
+ with m.Case(MicrOp.OP_TERNLOG):
+ # TODO: this only works for ternlogi, change to get lut
+ # value from register when we implement other variants
+ comb += bitwise_lut.lut.eq(self.fields.FormTLI.TLI[:])
+ comb += o.data.eq(bitwise_lut.output)
+ comb += self.o.xer_ca.data.eq(0)
with m.Default():
comb += o.ok.eq(0) # otherwise disable
- comb += Cat(rotator.right_shift,
- rotator.clear_left,
- rotator.clear_right,
- rotator.sign_ext_rs).eq(mode)
-
###### sticky overflow and context, both pass-through #####
comb += self.o.xer_so.data.eq(self.i.xer_so)
class ShiftRotInputData(FUBaseData):
- regspec = [('INT', 'ra', '0:63'), # RA
- ('INT', 'rb', '0:63'), # RB
- ('INT', 'rc', '0:63'), # RS
- ('XER', 'xer_so', '32'), # XER bit 32: SO
- ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
def __init__(self, pspec):
super().__init__(pspec, False)
# convenience
self.a, self.b, self.rs = self.ra, self.rb, self.rc
+ @property
+ def regspec(self):
+ return [('INT', 'ra', self.intrange), # RA
+ ('INT', 'rb', self.intrange), # RB/immediate
+ ('INT', 'rc', self.intrange), # RB/immediate
+ ('XER', 'xer_so', '32'), # XER bit 32: SO
+ ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+
# input to shiftrot final stage (common output)
class ShiftRotOutputData(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_so', '32'), # bit0: so
- ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_so', '32'), # bit0: so
+ ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+ ]
+
# output from shiftrot final stage (common output) - note that XER.so
# is *not* included (the only reason it's in the input is because of CR0)
class ShiftRotOutputDataFinal(FUBaseData):
- regspec = [('INT', 'o', '0:63'), # RT
- ('CR', 'cr_a', '0:3'),
- ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
- ]
def __init__(self, pspec):
super().__init__(pspec, True)
# convenience
self.cr0 = self.cr_a
+ @property
+ def regspec(self):
+ return [('INT', 'o', self.intrange),
+ ('CR', 'cr_a', '0:3'),
+ ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+ ]
+
class ShiftRotPipeSpec(CommonPipeSpec):
- regspec = (ShiftRotInputData.regspec, ShiftRotOutputDataFinal.regspec)
+ regspecklses = (ShiftRotInputData, ShiftRotOutputDataFinal)
opsubsetkls = CompSROpSubset
from soc.fu.shift_rot.main_stage import ShiftRotMainStage
from soc.fu.shift_rot.output_stage import ShiftRotOutputStage
-class ShiftRotStages(PipeModBaseChain):
+class ShiftRotStart(PipeModBaseChain):
def get_chain(self):
inp = ShiftRotInputStage(self.pspec)
+ return [inp]
+
+class ShiftRotStage(PipeModBaseChain):
+ def get_chain(self):
main = ShiftRotMainStage(self.pspec)
- return [inp, main]
+ return [main]
class ShiftRotStageEnd(PipeModBaseChain):
def __init__(self, pspec):
ControlBase.__init__(self)
self.pspec = pspec
- self.pipe1 = ShiftRotStages(pspec)
- self.pipe2 = ShiftRotStageEnd(pspec)
- self._eqs = self.connect([self.pipe1, self.pipe2])
+ self.pipe1 = ShiftRotStart(pspec)
+ self.pipe2 = ShiftRotStage(pspec)
+ self.pipe3 = ShiftRotStageEnd(pspec)
+ self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
def elaborate(self, platform):
m = ControlBase.elaborate(self, platform)
m.submodules.pipe1 = self.pipe1
m.submodules.pipe2 = self.pipe2
+ m.submodules.pipe3 = self.pipe3
m.d.comb += self._eqs
return m
# note BE bit numbering
-def right_mask(m, mask_begin):
- ret = Signal(64, name="right_mask", reset_less=True)
- with m.If(mask_begin <= 64):
- m.d.comb += ret.eq((1 << (64-mask_begin)) - 1)
+def right_mask(m, mask_begin, width):
+ ret = Signal(width, name="right_mask", reset_less=True)
+ with m.If(mask_begin <= width):
+ m.d.comb += ret.eq((1 << (width-mask_begin)) - 1)
with m.Else():
m.d.comb += ret.eq(0)
return ret
-def left_mask(m, mask_end):
- ret = Signal(64, name="left_mask", reset_less=True)
- m.d.comb += ret.eq(~((1 << (63-mask_end)) - 1))
+def left_mask(m, mask_end, width):
+ ret = Signal(width, name="left_mask", reset_less=True)
+ m.d.comb += ret.eq(~((1 << (width-1-mask_end)) - 1))
return ret
* clear_right = 1 when insn_type is OP_RLC or OP_RLCR
"""
- def __init__(self):
+ def __init__(self, width):
+ self.width = width
# input
self.me = Signal(5, reset_less=True) # ME field
self.mb = Signal(5, reset_less=True) # MB field
# extra bit of mb in MD-form
self.mb_extra = Signal(1, reset_less=True)
- self.ra = Signal(64, reset_less=True) # RA
- self.rs = Signal(64, reset_less=True) # RS
+ self.ra = Signal(width, reset_less=True) # RA
+ self.rs = Signal(width, reset_less=True) # RS
self.shift = Signal(7, reset_less=True) # RB[0:7]
self.is_32bit = Signal(reset_less=True)
self.right_shift = Signal(reset_less=True)
self.clear_right = Signal(reset_less=True)
self.sign_ext_rs = Signal(reset_less=True)
# output
- self.result_o = Signal(64, reset_less=True)
+ self.result_o = Signal(width, reset_less=True)
self.carry_out_o = Signal(reset_less=True)
def elaborate(self, platform):
+ width = self.width
m = Module()
comb = m.d.comb
ra, rs = self.ra, self.rs
sh = Signal(7, reset_less=True)
mb = Signal(7, reset_less=True)
me = Signal(7, reset_less=True)
- mr = Signal(64, reset_less=True)
- ml = Signal(64, reset_less=True)
+ mr = Signal(width, reset_less=True)
+ ml = Signal(width, reset_less=True)
output_mode = Signal(2, reset_less=True)
hi32 = Signal(32, reset_less=True)
- repl32 = Signal(64, reset_less=True)
+ repl32 = Signal(width, reset_less=True)
# First replicate bottom 32 bits to both halves if 32-bit
with m.If(self.is_32bit):
# sign-extend bottom 32 bits
comb += hi32.eq(Repl(rs[31], 32))
with m.Else():
- comb += hi32.eq(rs[32:64])
+ if width == 64:
+ comb += hi32.eq(rs[32:64])
comb += repl32.eq(Cat(rs[0:32], hi32))
shift_signed = Signal(signed(6))
comb += rot_count.eq(self.shift[0:6])
# ROTL submodule
- m.submodules.rotl = rotl = ROTL(64)
+ m.submodules.rotl = rotl = ROTL(width)
comb += rotl.a.eq(repl32)
comb += rotl.b.eq(rot_count)
comb += rot.eq(rotl.o)
comb += me.eq(Cat(~sh[0:6], sh[6]))
# Calculate left and right masks
- m.submodules.right_mask = right_mask = Mask(64)
- with m.If(mb <= 64):
- comb += right_mask.shift.eq(64-mb)
+ m.submodules.right_mask = right_mask = Mask(width)
+ with m.If(mb <= width):
+ comb += right_mask.shift.eq(width-mb)
comb += mr.eq(right_mask.mask)
with m.Else():
comb += mr.eq(0)
#comb += mr.eq(right_mask(m, mb))
- m.submodules.left_mask = left_mask = Mask(64)
- comb += left_mask.shift.eq(63-me)
+ m.submodules.left_mask = left_mask = Mask(width)
+ comb += left_mask.shift.eq(width-1-me)
comb += ml.eq(~left_mask.mask)
#comb += ml.eq(left_mask(m, me))
# 10 for rldicl, sr[wd]
# 1z for sra[wd][i], z = 1 if rs is negative
with m.If((self.clear_left & ~self.clear_right) | self.right_shift):
- comb += output_mode.eq(Cat(self.arith & repl32[63], Const(1, 1)))
+ comb += output_mode.eq(Cat(self.arith &
+ repl32[width-1], Const(1, 1)))
with m.Else():
mbgt = self.clear_right & (mb[0:6] > me[0:6])
comb += output_mode.eq(Cat(mbgt, Const(0, 1)))
comb = m.d.comb
mr = Signal(64)
mb = Signal(6)
- comb += mr.eq(left_mask(m, mb))
+ comb += mr.eq(left_mask(m, mb, 64))
def loop():
for i in range(64):
from nmutil.formaltest import FHDLTestCase
from nmigen.cli import rtlil
from soc.fu.shift_rot.maskgen import MaskGen
-from openpower.decoder.helpers import MASK
+from openpower.decoder.helpers import ISACallerHelper
import random
import unittest
class MaskGenTestCase(FHDLTestCase):
def test_maskgen(self):
+ MASK = ISACallerHelper(64, FPSCR=None).MASK
m = Module()
comb = m.d.comb
m.submodules.dut = dut = MaskGen(64)
from nmutil.sim_tmp_alternative import Simulator, Settle
from openpower.test.shift_rot.shift_rot_cases import ShiftRotTestCase
+from openpower.test.bitmanip.bitmanip_cases import BitManipTestCase
def get_cu_inputs(dec2, sim):
class ShiftRotIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = ShiftRotPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+ pspec.draft_bitmanip = True
alu = ShiftRotBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("shift_rot_pipeline.il", "w") as f:
m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
pdecode = pdecode2.dec
- pspec = ShiftRotPipeSpec(id_wid=2)
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+ pspec.draft_bitmanip = True
m.submodules.alu = alu = ShiftRotBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
yield from ALUHelpers.get_xer_ca(res, alu, dec2)
yield from ALUHelpers.get_int_o(res, alu, dec2)
- print ("hw outputs", res)
+ print("hw outputs", res)
yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
yield from ALUHelpers.get_wr_sim_xer_ca(sim_o, sim, dec2)
- print ("sim outputs", sim_o)
+ print("sim outputs", sim_o)
ALUHelpers.check_cr_a(self, res, sim_o, "CR%d %s" % (cridx, code))
ALUHelpers.check_xer_ca(self, res, sim_o, code)
unittest.main(exit=False)
suite = unittest.TestSuite()
suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+ suite.addTest(TestRunner(BitManipTestCase().test_data))
suite.addTest(TestRunner(ShiftRotIlangCase().test_data))
runner = unittest.TextTestRunner()
from openpower.decoder.power_fieldsn import SignalBitRange
# use POWER numbering. sigh.
+
+
def xer_bit(name):
return 63-XER_bits[name]
width = p.width
comb += p.eq(AnyConst(width))
- pspec = SPRPipeSpec(id_wid=2)
+ pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = SPRMainStage(pspec)
# frequently used aliases
a = dut.i.a
ca_in = dut.i.xer_ca[0] # CA carry in
- ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+ ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
so_in = dut.i.xer_so # SO sticky overflow
ov_in = dut.i.xer_ov[0] # XER OV in
- ov32_in = dut.i.xer_ov[1] # XER OV32 in
+ ov32_in = dut.i.xer_ov[1] # XER OV32 in
o = dut.o.o
# setup random inputs
comb += dut.i.ctx.op.eq(rec)
# check that the operation (op) is passed through (and muxid)
- comb += Assert(dut.o.ctx.op == dut.i.ctx.op )
- comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid )
+ comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
+ comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
# MTSPR
fields = DecodeFields(SignalBitRange, [dut.i.ctx.op.insn])
super().__init__(pspec, "spr_main")
# test if regfiles are reduced
self.regreduce_en = (hasattr(pspec, "regreduce") and
- (pspec.regreduce == True))
+ (pspec.regreduce == True))
self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
self.fields.create_specs()
so_i, ov_i, ca_i = self.i.xer_so, self.i.xer_ov, self.i.xer_ca
so_o, ov_o, ca_o = self.o.xer_so, self.o.xer_ov, self.o.xer_ca
o, spr1_o, fast1_o = self.o.o, self.o.spr1, self.o.fast1
+ state1_i, state1_o = self.i.state1, self.o.state1
# take copy of D-Form TO field
x_fields = self.fields.FormXFX
#### MTSPR ####
with m.Case(MicrOp.OP_MTSPR):
with m.Switch(spr):
- # fast SPRs first
+ # State SPRs first, note that this triggers a regfile write
+ # which is monitored right the way down in TestIssuerBase.
+ with m.Case(SPR.DEC, SPR.TB):
+ comb += state1_o.data.eq(a_i)
+ comb += state1_o.ok.eq(1)
+
+ # Fast SPRs second: anything in FAST regs
with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
- SPR.SRR1, SPR.XER, SPR.DEC):
+ SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+ SPR.SPRG0_priv, SPR.SPRG1_priv,
+ SPR.SPRG2_priv, SPR.SPRG3,
+ SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
comb += fast1_o.data.eq(a_i)
comb += fast1_o.ok.eq(1)
# XER is constructed
with m.Case(MicrOp.OP_MFSPR):
comb += o.ok.eq(1)
with m.Switch(spr):
- # fast SPRs first
- with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0, SPR.SRR1,
- SPR.XER, SPR.DEC, SPR.TB):
+ # state SPRs first
+ with m.Case(SPR.DEC, SPR.TB):
+ comb += o.data.eq(state1_i)
+ # TBU is upper 32-bits of State Reg
+ with m.Case(SPR.TBU):
+ comb += o.data[0:32].eq(state1_i[32:64])
+
+ # fast SPRs second
+ with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
+ SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+ SPR.SPRG0_priv, SPR.SPRG1_priv,
+ SPR.SPRG2_priv, SPR.SPRG3,
+ SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
comb += o.data.eq(fast1_i)
with m.If(spr == SPR.XER):
# bits 0:31 and 35:43 are treated as reserved
# and return 0s when read using mfxer
comb += o[32:64].eq(0) # MBS0 bits 0-31
- comb += o[63-43:64-35].eq(0) # MSB0 bits 35-43
+ comb += o[63-43:64-35].eq(0) # MSB0 bits 35-43
# sticky
comb += o[63-XER_bits['SO']].eq(so_i)
# overflow
# carry
comb += o[63-XER_bits['CA']].eq(ca_i[0])
comb += o[63-XER_bits['CA32']].eq(ca_i[1])
- with m.Case(SPR.TBU):
- comb += o.data[0:32].eq(fast1_i[32:64])
-
# slow SPRs TODO
with m.Default():
comb += o.data.eq(spr1_i)
regspec = [('INT', 'ra', '0:63'), # RA
('SPR', 'spr1', '0:63'), # SPR (slow)
('FAST', 'fast1', '0:63'), # SPR (fast: LR, CTR etc)
+ ('STATE', 'state1', '0:63'), # SPR (DEC/TB)
('XER', 'xer_so', '32'), # XER bit 32: SO
('XER', 'xer_ov', '33,44'), # XER bit 34/45: CA/CA32
('XER', 'xer_ca', '34,45')] # bit0: ov, bit1: ov32
# convenience
self.a = self.ra
+# note that state1 gets a corresponding "state1" write port created
+# by core.py which is "monitored" by TestIssuerBase (hack-job, sigh).
+# when writes are spotted then the DEC/TB FSM resets and re-reads
+# DEC/TB.
class SPROutputData(FUBaseData):
regspec = [('INT', 'o', '0:63'), # RT
('SPR', 'spr1', '0:63'), # SPR (slow)
('FAST', 'fast1', '0:63'), # SPR (fast: LR, CTR etc)
+ ('STATE', 'state1', '0:63'), # SPR (DEC/TB)
('XER', 'xer_so', '32'), # XER bit 32: SO
('XER', 'xer_ov', '33,44'), # XER bit 34/45: CA/CA32
('XER', 'xer_ca', '34,45')] # bit0: ov, bit1: ov32
class SPRPipeSpec(CommonPipeSpec):
- regspec = (SPRInputData.regspec, SPROutputData.regspec)
+ regspecklses = (SPRInputData, SPROutputData)
opsubsetkls = CompSPROpSubset
class SPRIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = SPRPipeSpec(id_wid=2)
+ pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
alu = SPRBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("trap_pipeline.il", "w") as f:
m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
- pspec = SPRPipeSpec(id_wid=2)
+ pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.alu = alu = SPRBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
comb = m.d.comb
rec = CompTrapOpSubset()
- pspec = TrapPipeSpec(id_wid=2)
+ pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
m.submodules.dut = dut = TrapMainStage(pspec)
###################
with m.Case(MicrOp.OP_MTMSRD):
- msr_od = msr_o.data # another "shortener"
+ msr_od = msr_o.data # another "shortener"
with m.If(L == 0):
# if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
# MSR[48] <- (RS)[48] | (RS)[49]
# MSR[58] <- (RS)[58] | (RS)[49]
# MSR[59] <- (RS)[59] | (RS)[49]
- PR = field(rs, 49) # alias/copy of SRR1 PR field
+ PR = field(rs, 49) # alias/copy of SRR1 PR field
comb += [
Assert(field(msr_od, 48) == field(rs, 48) | PR),
Assert(field(msr_od, 58) == field(rs, 58) | PR),
# RFID. v3.0B p955
###################
with m.Case(MicrOp.OP_RFID):
- msr_od = msr_o.data # another "shortener"
+ msr_od = msr_o.data # another "shortener"
comb += [
Assert(msr_o.ok),
Assert(nia_o.ok),
# if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
# MSR[29:31] <- SRR1[29:31]
- with m.If((field(msr_i , 29, 31) != 0b010) |
+ with m.If((field(msr_i, 29, 31) != 0b010) |
(field(srr1_i, 29, 31) != 0b000)):
comb += Assert(F(msr_od, 29, 31) == F(srr1_i, 29, 31))
with m.Else():
# MSR[48] <- (RS)[48] | (RS)[49]
# MSR[58] <- (RS)[58] | (RS)[49]
# MSR[59] <- (RS)[59] | (RS)[49]
- PR = field(srr1_i, 49) # alias/copy of SRR1 PR field
+ PR = field(srr1_i, 49) # alias/copy of SRR1 PR field
comb += [
Assert(field(msr_od, 48) == field(srr1_i, 48) | PR),
Assert(field(msr_od, 58) == field(srr1_i, 58) | PR),
if __name__ == '__main__':
unittest.main()
-
def msr_copy(msr_o, msr_i, zero_me=True):
- """msr_copy
+ """msr_copy (also used to copy relevant bits into SRR1)
+
ISA says this:
Defined MSR bits are classified as either full func tion or partial
function. Full function MSR bits are saved in SRR1 or HSRR1 when
return l
-def msr_check_pr(m, msr):
+def msr_check_pr(m, d_in, msr):
"""msr_check_pr: checks "problem state"
"""
comb = m.d.comb
- with m.If(msr[MSR.PR]):
+ with m.If(d_in[MSR.PR]):
comb += msr[MSR.EE].eq(1) # set external interrupt bit
comb += msr[MSR.IR].eq(1) # set instruction relocation bit
comb += msr[MSR.DR].eq(1) # set data relocation bit
super().__init__(pspec, "main")
self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
self.fields.create_specs()
+ self.kaivb = Signal(64) # KAIVB SPR
+ self.state_reset = Signal() # raise high to reset KAIVB cache
def trap(self, m, trap_addr, return_addr):
"""trap. sets new PC, stores MSR and old PC in SRR1 and SRR0
op = self.i.ctx.op
msr_i = op.msr
svstate_i = op.svstate
+
+ exc = LDSTException("trapexc")
+ comb += exc.eq(op.ldst_exc)
+ srr1_i = exc.srr1 # new SRR1 bits come from exception
nia_o = self.o.nia
svsrr0_o, srr0_o, srr1_o = self.o.svsrr0, self.o.srr0, self.o.srr1
- # trap address
+ # trap address, including KAIVB override
comb += nia_o.data.eq(trap_addr)
+ comb += nia_o.data[13:].eq(self.kaivb[13:])
comb += nia_o.ok.eq(1)
# addr to begin from on return
comb += srr0_o.data.eq(return_addr)
comb += srr0_o.ok.eq(1)
- # take a copy of the current MSR into SRR1
- comb += msr_copy(srr1_o.data, msr_i) # old MSR
+ # take a copy of the current MSR into SRR1, but first copy old SRR1
+ # this preserves the bits of SRR1 that are not supposed to change:
+ # MSR.IR,DR,PMM,RI,LE (0-5) and MR,FP,ME,FE0 (11-14)
+ # i would suggest reading v3.0C p1063 Book III section 7.2.1 for
+ # advice but it's so obscure and indirect, that it's just easier
+ # to copy microwatt behaviour. see writeback.vhdl
+ # IMPORTANT: PowerDecoder2 needed to actually read SRR1 for
+ # it to have the contents *of* SRR1 to copy over!
+ comb += msr_copy(srr1_o.data, msr_i, False) # old MSR
+ comb += srr1_o.data[16:22].eq(srr1_i[0:6]) # IR,DR,PMM,RI,LE
+ comb += srr1_o.data[27:31].eq(srr1_i[11:15]) # MR,FP,ME,FE0
comb += srr1_o.ok.eq(1)
# take a copy of the current SVSTATE into SVSRR0
def elaborate(self, platform):
m = Module()
- comb = m.d.comb
+ comb, sync = m.d.comb, m.d.sync
op = self.i.ctx.op
# convenience variables
srr0_o, srr1_o, svsrr0_o = self.o.srr0, self.o.srr1, self.o.svsrr0
traptype, trapaddr = op.traptype, op.trapaddr
+ # hard reset of KAIVB
+ with m.If(self.state_reset):
+ sync += self.kaivb.eq(0)
+
# take copy of D-Form TO field
i_fields = self.fields.FormD
to = Signal(i_fields.TO[0:-1].shape())
# TODO: some #defines for the bits n stuff.
with m.Switch(op.insn_type):
+ ##############
+ # KAIVB https://bugs.libre-soc.org/show_bug.cgi?id=859
+
+ with m.Case(MicrOp.OP_MTSPR):
+ sync += self.kaivb.eq(a_i)
+
+ with m.Case(MicrOp.OP_MFSPR):
+ comb += o.data.eq(self.kaivb)
+ comb += o.ok.eq(1)
+
###############
# TDI/TWI/TD/TW. v3.0B p90-91
comb += srr1_o.data[PI.FP].eq(1)
with m.If(traptype & TT.ADDR):
comb += srr1_o.data[PI.ADR].eq(1)
- with m.If(traptype & TT.MEMEXC):
+ with m.If((traptype & TT.MEMEXC).bool() &
+ (trapaddr == 0x400)):
+ # Instruction Storage Interrupt (ISI - 0x400)
+ # v3.0C Book III Chap 7.5.5 p1085
# decode exception bits, store in SRR1
exc = LDSTException("trapexc")
comb += exc.eq(op.ldst_exc)
# MTMSR/D. v3.0B p TODO - move to MSR
with m.Case(MicrOp.OP_MTMSRD, MicrOp.OP_MTMSR):
- L = self.fields.FormX.L[0:-1] # X-Form field L
+ # L => bit 16 in LSB0, bit 15 in MSB0 order
+ L = self.fields.FormX.L1[0:1] # X-Form field L1
# start with copy of msr
- comb += msr_o.eq(msr_i)
+ comb += msr_o.data.eq(msr_i)
with m.If(L):
# just update RI..EE
comb += msr_o.data[MSR.RI].eq(a_i[MSR.RI])
# mtmsr - 32-bit, only room for bottom 32 LSB flags
for stt, end in [(1,12), (13, 32)]:
comb += msr_o.data[stt:end].eq(a_i[stt:end])
- msr_check_pr(m, msr_o.data)
+ # check problem state: if set, not permitted to set EE,IR,DR
+ msr_check_pr(m, a_i, msr_o.data)
# Per https://bugs.libre-soc.org/show_bug.cgi?id=325#c123,
# this actually *is* in the microwatt code now.
# hypervisor stuff. here: bits 3 (HV) and 51 (ME) were
# copied over by msr_copy but if HV was not set we need
# the *original* (msr_i) bits
- with m.If(~msr_i[MSR.HV]):
- comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
- comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
+ # XXX taking this out to see what happens when running
+ # linux-5.7 microwatt buildroot. microwatt does not
+ # implement HV, so this is unlikely to work. 0x900
+ # linux kernel exception handling tends to support this
+ # with m.If(~msr_i[MSR.HV]):
+ # comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
+ # comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
comb += msr_o.ok.eq(1)
# MSR was in srr1: copy it over, however *caveats below*
comb += msr_copy(msr_o.data, srr1_i, zero_me=False) # don't zero
- with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
- with m.If(field(msr_i, 3)): # HV
- comb += field(msr_o, 51).eq(field(srr1_i, 51)) # ME
- with m.Else():
- comb += field(msr_o, 51).eq(field(msr_i, 51)) # ME
-
- # check problem state
- msr_check_pr(m, msr_o.data)
+ if False: # XXX no - not doing hypervisor yet
+ with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
+ with m.If(field(msr_i, 3)): # HV
+ comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+ with m.Else():
+ comb += field(msr_o.data, 51).eq(field(msr_i, 51)) # ME
+ else:
+ # same as microwatt: treat MSR.ME rfid same as hrfid
+ comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+
+ # check problem state: if set, not permitted to set EE,IR,DR
+ msr_check_pr(m, srr1_i, msr_o.data)
# don't understand but it's in the spec. again: bits 32-34
# are copied from srr1_i and need *restoring* to msr_i
class TrapPipeSpec(CommonPipeSpec):
- regspec = (TrapInputData.regspec, TrapOutputData.regspec)
+ regspecklses = (TrapInputData, TrapOutputData)
opsubsetkls = CompTrapOpSubset
class TrapIlangCase(TestAccumulatorBase):
def case_ilang(self):
- pspec = TrapPipeSpec(id_wid=2)
+ pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
alu = TrapBasePipe(pspec)
vl = rtlil.convert(alu, ports=alu.ports())
with open("trap_pipeline.il", "w") as f:
class TestRunner(unittest.TestCase):
- def __init__(self, test_data):
- super().__init__("run_all")
- self.test_data = test_data
- def run_all(self):
+ def execute(self, alu, instruction, pdecode2, test):
+ program = test.program
+ sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
+ test.mem, test.msr,
+ bigendian=bigendian)
+ gen = program.generate_instructions()
+ instructions = list(zip(gen, program.assembly.splitlines()))
+
+ msr = sim.msr.value
+ pc = sim.pc.CIA.value
+ print("starting msr, pc %08x, %08x" % (msr, pc))
+ index = pc//4
+ while index < len(instructions):
+ ins, code = instructions[index]
+
+ print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
+ print(code)
+ if 'XER' in sim.spr:
+ so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
+ ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
+ ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
+ print("before: so/ov/32", so, ov, ov32)
+
+ # ask the decoder to decode this binary data (endian'd)
+ yield pdecode2.dec.bigendian.eq(bigendian) # l/big?
+ yield pdecode2.state.msr.eq(msr) # set MSR in pdecode2
+ yield pdecode2.state.pc.eq(pc) # set CIA in pdecode2
+ yield instruction.eq(ins) # raw binary instr.
+ yield Settle()
+ fn_unit = yield pdecode2.e.do.fn_unit
+ asmcode = yield pdecode2.e.asmcode
+ dec_asmcode = yield pdecode2.dec.op.asmcode
+ print("asmcode", asmcode, dec_asmcode)
+ self.assertEqual(fn_unit, Function.TRAP.value)
+ alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
+
+ # set valid for one cycle, propagate through pipeline...
+ yield alu.p.i_valid.eq(1)
+ yield
+ yield alu.p.i_valid.eq(0)
+
+ opname = code.split(' ')[0]
+ yield from sim.call(opname)
+ pc = sim.pc.CIA.value
+ index = pc//4
+ print("pc after %08x" % (pc))
+ msr = sim.msr.value
+ print("msr after %08x" % (msr))
+
+ vld = yield alu.n.o_valid
+ while not vld:
+ yield
+ vld = yield alu.n.o_valid
+ yield
+
+ yield from self.check_alu_outputs(alu, pdecode2, sim, code)
+ yield Settle()
+
+ def test_it(self):
+ test_data = TrapTestCase().test_data
m = Module()
comb = m.d.comb
instruction = Signal(32)
- pdecode = create_pdecode()
-
- m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
+ fn_name = "TRAP"
+ opkls = TrapPipeSpec.opsubsetkls
- pspec = TrapPipeSpec(id_wid=2)
+ pdecode = create_pdecode()
+ m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+ pdecode, opkls, fn_name)
+ pdecode = pdecode2.dec
+
+ class PPspec:
+ XLEN = 64
+ pps = PPspec()
+ pspec = TrapPipeSpec(id_wid=2, parent_pspec=pps)
m.submodules.alu = alu = TrapBasePipe(pspec)
comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
- comb += alu.p.i_valid.eq(1)
comb += alu.n.i_ready.eq(1)
comb += pdecode2.dec.raw_opcode_in.eq(instruction)
sim = Simulator(m)
sim.add_clock(1e-6)
def process():
- for test in self.test_data:
+ for test in test_data:
print(test.name)
program = test.program
with self.subTest(test.name):
- sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
- test.mem, test.msr,
- bigendian=bigendian)
- gen = program.generate_instructions()
- instructions = list(zip(gen, program.assembly.splitlines()))
-
- msr = sim.msr.value
- pc = sim.pc.CIA.value
- print("starting msr, pc %08x, %08x" % (msr, pc))
- index = pc//4
- while index < len(instructions):
- ins, code = instructions[index]
-
- print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
- print(code)
- if 'XER' in sim.spr:
- so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
- ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
- ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
- print("before: so/ov/32", so, ov, ov32)
-
- # ask the decoder to decode this binary data (endian'd)
- yield pdecode2.dec.bigendian.eq(bigendian) # l/big?
- yield pdecode2.state.msr.eq(msr) # set MSR in pdecode2
- yield pdecode2.state.pc.eq(pc) # set CIA in pdecode2
- yield instruction.eq(ins) # raw binary instr.
- yield Settle()
- fn_unit = yield pdecode2.e.do.fn_unit
- self.assertEqual(fn_unit, Function.TRAP.value)
- alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
- yield
- opname = code.split(' ')[0]
- yield from sim.call(opname)
- pc = sim.pc.CIA.value
- index = pc//4
- print("pc after %08x" % (pc))
- msr = sim.msr.value
- print("msr after %08x" % (msr))
-
- vld = yield alu.n.o_valid
- while not vld:
- yield
- vld = yield alu.n.o_valid
- yield
-
- yield from self.check_alu_outputs(alu, pdecode2,
- sim, code)
+ yield from self.execute(alu, instruction, pdecode2, test)
sim.add_sync_process(process)
with sim.write_vcd("alu_simulator.vcd", "simulator.gtkw",
def check_alu_outputs(self, alu, dec2, sim, code):
- rc = yield dec2.e.do.rc.data
- cridx_ok = yield dec2.e.write_cr.ok
- cridx = yield dec2.e.write_cr.data
-
- print("check extra output", repr(code), cridx_ok, cridx)
- if rc:
- self.assertEqual(cridx, 0, code)
-
sim_o = {}
res = {}
if __name__ == "__main__":
- unittest.main(exit=False)
- suite = unittest.TestSuite()
- suite.addTest(TestRunner(TrapTestCase().test_data))
- suite.addTest(TestRunner(TrapIlangCase().test_data))
-
- runner = unittest.TextTestRunner()
- runner.run(suite)
+ unittest.main()
('is_32bit', 1),
('traptype', TT.size), # see trap main_stage.py, PowerDecoder2
('trapaddr', 13),
- ('ldst_exc', len(LDSTException._exc_types)),
+ ('ldst_exc', LDSTException.length), # blech
]
super().__init__(layout, name=name)
# highest priority interrupt currently presented (which is allowed
# via XICS)
#
+# Bugreports:
+#
+# * https://bugs.libre-soc.org/show_bug.cgi?id=407
"""
from nmigen import Elaboratable, Module, Signal, Cat, Const, Record, Array, Mux
from nmutil.iocontrol import RecordObject
class XICS_ICP(Elaboratable):
- def __init__(self):
- class Spec: pass
- spec = Spec()
+ def __init__(self, spec=None):
+ if spec is None:
+ class Spec: pass
+ spec = Spec()
spec.addr_wid = 30
spec.mask_wid = 4
spec.reg_wid = 32
class XICS_ICS(Elaboratable):
- def __init__(self, SRC_NUM=16, PRIO_BITS=8):
+ def __init__(self, spec=None, SRC_NUM=16, PRIO_BITS=8):
self.SRC_NUM = SRC_NUM
self.PRIO_BITS = PRIO_BITS
self.pri_masked = (1<<self.PRIO_BITS)-1
- class Spec: pass
- spec = Spec()
+ if spec is None:
+ class Spec: pass
+ spec = Spec()
spec.addr_wid = 30
spec.mask_wid = 4
spec.reg_wid = 32
-Subproject commit b55917aafa6bbc9f16e1d97dc095e929c31aa81a
+Subproject commit 0f03df1546c8cf6ab91ef63b04713dca768a84c4
addr_wid, mask_wid, data_wid = spec.addr_wid, spec.mask_wid, spec.reg_wid
adr_lsbs = log2_int(mask_wid) # LSBs of addr covered by mask
badwid = spec.addr_wid-adr_lsbs # MSBs (not covered by mask)
+ # test if microwatt compatibility is to be enabled
+ microwatt_compat = (hasattr(spec, "microwatt_compat") and
+ (spec.microwatt_compat == True))
+ # test if fabric compatibility is to be enabled
+ fabric_compat = (hasattr(spec, "fabric_compat") and
+ (spec.fabric_compat == True))
res = [
("adr", badwid , DIR_FANOUT),
("we", 1, DIR_FANOUT),
("err", 1, DIR_FANIN)
]
+ # microwatt needs a stall signal (operates in pipeline mode)
+ if microwatt_compat or fabric_compat:
+ res.append(("stall", 1, DIR_FANIN))
if not cti:
return res
return res + [
def elaborate(self, platform):
m = Module()
- self.reg = reg = Signal(self.width, name="reg", reset=self.reset)
+ self.reg = reg = Signal(self.width, name="reg", reset=self.reset,
+ attrs={'syn_ramstyle': "block_ram"})
if self.synced:
domain = m.d.sync
and read-en signals (per port).
"""
- def __init__(self, width, depth, synced=True, fwd_bus_mode=True):
+ def __init__(self, width, depth, synced=True, fwd_bus_mode=True,
+ resets=None):
+ if resets is None:
+ resets = [0] * depth
self.synced = synced
self.width = width
self.depth = depth
self.regs = Array(Register(width, synced=synced,
- writethru=fwd_bus_mode) \
- for _ in range(self.depth))
+ writethru=fwd_bus_mode,
+ resetval=rst) \
+ for rst in resets)
self._rdports = []
self._wrports = []
self.fwd_bus_mode = fwd_bus_mode
self.synced = synced
self.width, self.depth = width, depth
- self.memory = Memory(width=width, depth=depth)
+ self.memory = Memory(width=width, depth=depth,
+ attrs={'syn_ramstyle': "block_ram"})
self._rdports = {}
self._wrports = {}
def elaborate(self, platform):
m = Module()
bsz = int(log(self.width) / log(2))
- regs = Array(Signal(self.width, name="reg") for _ in range(self.depth))
+ regs = Array(Signal(self.width, name="reg",
+ attrs={'syn_ramstyle': "block_ram"}) \
+ for _ in range(self.depth))
# read ports. has write-through detection (returns data written)
for rp in self._rdports:
(d_rd2)
"""
- def __init__(self, svp64_en=False, regreduce_en=False):
- super().__init__(64, StateRegsEnum.N_REGS)
+ def __init__(self, svp64_en=False, regreduce_en=False, resets=None):
+ super().__init__(64, StateRegsEnum.N_REGS, resets=resets)
wr_spec, rd_spec = self.get_port_specs()
create_ports(self, wr_spec, rd_spec)
def get_port_specs(self):
- w_port_spec = {'nia': "nia",
+ w_port_spec = { # these 3 allow writing state by Function Units
+ # strictly speaking this should not be allowed,
+ # the information should be passed back to Issuer
+ # to work out what to do
+ 'nia': "nia",
'msr': "msr",
'svstate': "svstate",
- 'sv': "sv", # writing SVSTATE (issuer)
- 'd_wr1': "d_wr1"} # writing PC (issuer)
- r_port_spec = {'cia': "cia", # reading PC (issuer)
+ 'issue': "issue", # writing DEC/TB
+ 'state1': "state1", # SPR pipeline
+ # these 3 allow writing state by Issuer
+ 'sv': "sv", # writing SVSTATE
+ 'd_wr1': "d_wr1", # writing PC
+ 'd_wr2': "d_wr2"} # writing MSR
+ r_port_spec = { # these are for reading state by Issuer but
+ # the FUs do not read them: they are passed in
+ # because of multi-issue / pipelining / etc.
+ # the state could be totally different and is
+ # only known *at* issue time, *by* the issuer
+ 'cia': "cia", # reading PC (issuer)
'msr': "msr", # reading MSR (issuer)
'sv': "sv", # reading SV (issuer)
+ # SPR and DEC/TB FSM
+ 'issue': "issue", # reading DEC/TB
+ 'state1': "state1", # SPR pipeline
}
return w_port_spec, r_port_spec
* Array-based unary-indexed (not binary-indexed)
* write-through capability (read on same cycle as write)
"""
- def __init__(self, svp64_en=False, regreduce_en=False):
- super().__init__(64, 32, fwd_bus_mode=False)
+ def __init__(self, svp64_en=False, regreduce_en=False, reg_wid=64):
+ super().__init__(reg_wid, 32, fwd_bus_mode=False)
self.svp64_en = svp64_en
self.regreduce_en = regreduce_en
wr_spec, rd_spec = self.get_port_specs()
class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
"""FastRegs
- FAST regfile - CTR, LR, TAR, SRR1, SRR2, XER, TB, DEC, SVSRR0
+ FAST regfile - CTR, LR, TAR, SRR1, SRR2, XER, SVSRR0
* QTY 6of 64-bit registers
* 3R2W
def get_port_specs(self):
w_port_spec = {'fast1': "dest1",
- 'issue': "issue", # writing DEC/TB
}
r_port_spec = {'fast1': "src1",
- 'issue': "issue", # reading DEC/TB
+ 'dmi': "dmi" # needed for Debug (DMI)
}
if not self.regreduce_en:
r_port_spec['fast2'] = "src2"
('fast', FastRegs),
('state', StateRegs),
('spr', SPRRegs),]
- def __init__(self, pspec, make_hazard_vecs=False):
+ def __init__(self, pspec, make_hazard_vecs=False,
+ state_resets=None): # state file reset values
# test is SVP64 is to be enabled
svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
regreduce_en = hasattr(pspec, "regreduce") and \
(pspec.regreduce == True)
+ # get Integer File register width
+ reg_wid = 64
+ if isinstance(pspec.XLEN, int):
+ reg_wid = pspec.XLEN
+
self.rf = {} # register file dict
# create regfiles here, Factory style
for (name, kls) in RegFiles.regkls:
- rf = self.rf[name] = kls(svp64_en, regreduce_en)
+ kwargs = {'svp64_en': svp64_en, 'regreduce_en': regreduce_en}
+ if name == 'state':
+ kwargs['resets'] = state_resets
+ if name == 'int':
+ kwargs['reg_wid'] = reg_wid
+ rf = self.rf[name] = kls(**kwargs)
# also add these as instances, self.state, self.fast, self.cr etc.
setattr(self, name, rf)
if __name__ == '__main__':
m = Module()
from soc.config.test.test_loadstore import TestMemPspec
- pspec = TestMemPspec()
+ pspec = TestMemPspec(regreduce_en=True,
+ XLEN=32) # integer reg width = 32
rf = RegFiles(pspec, make_hazard_vecs=True)
rf.elaborate_into(m, None)
vl = rtlil.convert(m)
--- /dev/null
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Wrapper around a single port (1R or 1W) SRAM, to make a multi-port regfile.
+
+This SRAM primitive has one cycle delay for reads, and, after a write,
+it reads the value just written. The goal is to use it to make at least an
+1W2R regfile.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=781 and
+https://bugs.libre-soc.org/show_bug.cgi?id=502
+"""
+
+import unittest
+
+from nmigen import Elaboratable, Module, Memory, Signal, Repl, Mux
+from nmigen.back import rtlil
+from nmigen.sim import Simulator
+from nmigen.asserts import Assert, Assume, Past, AnyConst
+
+from nmutil.formaltest import FHDLTestCase
+from nmutil.gtkw import write_gtkw
+
+
+class SinglePortSRAM(Elaboratable):
+ """
+ Model of a single port SRAM, which can be simulated, verified and/or
+ synthesized to an FPGA.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+
+ .. note:: The debug read port is meant only to assist in formal proofs!
+ """
+ def __init__(self, addr_width, data_width, we_width):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ # interface signals
+ self.d = Signal(data_width); """ write data"""
+ self.q = Signal(data_width); """read data"""
+ self.a = Signal(addr_width); """ read/write address"""
+ self.we = Signal(we_width); """write enable"""
+ # debug signals, only used in formal proofs
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # backing memory
+ depth = 1 << self.addr_width
+ gran = self.data_width // self.we_width
+ mem = Memory(width=self.data_width, depth=depth)
+ # create read and write ports
+ # By connecting the same address to both ports, they behave, in fact,
+ # as a single, "half-duplex" port.
+ # The transparent attribute means that, on a write, we read the new
+ # value, on the next cycle
+ # Note that nmigen memories have a one cycle delay, for reads,
+ # by default
+ m.submodules.rdport = rdport = mem.read_port(transparent=True)
+ m.submodules.wrport = wrport = mem.write_port(granularity=gran)
+ # duplicate the address to both ports
+ m.d.comb += wrport.addr.eq(self.a)
+ m.d.comb += rdport.addr.eq(self.a)
+ # write enable
+ m.d.comb += wrport.en.eq(self.we)
+ # read and write data
+ m.d.comb += wrport.data.eq(self.d)
+ m.d.comb += self.q.eq(rdport.data)
+
+ # the following is needed for induction, where an unreachable state
+ # (memory and holding register differ) is turned into an illegal one
+ if platform == "formal":
+ # the debug port is an asynchronous read port, allowing direct
+ # access to a given memory location by the formal engine
+ m.submodules.dbgport = dbgport = mem.read_port(domain="comb")
+ # first, get the value stored in our memory location,
+ # using its debug port
+ stored = Signal(self.data_width)
+ m.d.comb += dbgport.addr.eq(self.dbg_addr)
+ m.d.comb += stored.eq(dbgport.data)
+ # now, ensure that the value stored in memory is always in sync
+ # with the holding register
+ with m.If(self.dbg_wrote):
+ m.d.sync += Assert(self.dbg_data ==
+ stored.word_select(self.dbg_lane, gran))
+
+ return m
+
+ def ports(self):
+ return [
+ self.d,
+ self.a,
+ self.we,
+ self.q
+ ]
+
+
+def create_ilang(dut, ports, test_name):
+ vl = rtlil.convert(dut, name=test_name, ports=ports)
+ with open("%s.il" % test_name, "w") as f:
+ f.write(vl)
+
+
+class SinglePortSRAMTestCase(FHDLTestCase):
+ @staticmethod
+ def test_simple_rtlil():
+ """
+ Generate a simple SRAM. Try ``read_rtlil mem_simple.il; proc; show``
+ from a yosys prompt, to see the memory primitives, and
+ ``read_rtlil mem_simple.il; synth; show`` to see it implemented as
+ flip-flop RAM
+ """
+ dut = SinglePortSRAM(2, 4, 2)
+ create_ilang(dut, dut.ports(), "mem_simple")
+
+ @staticmethod
+ def test_blkram_rtlil():
+ """
+ Generates a bigger SRAM.
+ Try ``read_rtlil mem_blkram.il; synth_ecp5; show`` from a yosys
+ prompt, to see it implemented as block RAM
+ """
+ dut = SinglePortSRAM(10, 16, 2)
+ create_ilang(dut, dut.ports(), "mem_blkram")
+
+ def test_sram_model(self):
+ """
+ Simulate some read/write/modify operations on the SRAM model
+ """
+ dut = SinglePortSRAM(7, 32, 4)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ def process():
+ # 1) write 0x12_34_56_78 to address 0
+ yield dut.a.eq(0)
+ yield dut.d.eq(0x12_34_56_78)
+ yield dut.we.eq(0b1111)
+ yield
+ # 2) write 0x9A_BC_DE_F0 to address 1
+ yield dut.a.eq(1)
+ yield dut.d.eq(0x9A_BC_DE_F0)
+ yield dut.we.eq(0b1111)
+ yield
+ # ... and read value just written to address 0
+ self.assertEqual((yield dut.q), 0x12_34_56_78)
+ # 3) prepare to read from address 0
+ yield dut.d.eq(0)
+ yield dut.we.eq(0b0000)
+ yield dut.a.eq(0)
+ yield
+ # ... and read value just written to address 1
+ self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+ # 4) prepare to read from address 1
+ yield dut.a.eq(1)
+ yield
+ # ... and read value from address 0
+ self.assertEqual((yield dut.q), 0x12_34_56_78)
+ # 5) write 0x9A and 0xDE to bytes 1 and 3, leaving
+ # bytes 0 and 2 unchanged
+ yield dut.a.eq(0)
+ yield dut.d.eq(0x9A_FF_DE_FF)
+ yield dut.we.eq(0b1010)
+ yield
+ # ... and read value from address 1
+ self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+ # 6) nothing more to do
+ yield dut.d.eq(0)
+ yield dut.we.eq(0)
+ yield
+ # ... other than confirm that bytes 1 and 3 were modified
+ # correctly
+ self.assertEqual((yield dut.q), 0x9A_34_DE_78)
+
+ sim.add_sync_process(process)
+ traces = ['rdport.clk', 'a[6:0]', 'we[3:0]', 'd[31:0]', 'q[31:0]']
+ write_gtkw('test_sram_model.gtkw', 'test_sram_model.vcd',
+ traces, module='top')
+ sim_writer = sim.write_vcd('test_sram_model.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_model_sram_proof(self):
+ """
+ Formal proof of the single port SRAM model
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ m.submodules.dut = dut = SinglePortSRAM(7, 32, 4)
+ gran = len(dut.d) // len(dut.we) # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.a.shape())
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # holding data register
+ d_reg = Signal(gran)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # if our memory location and byte lane is being written
+ # ... capture the data in our holding register
+ with m.If((dut.a == a_const) & dut.we.bit_select(lane, 1)):
+ m.d.sync += d_reg.eq(dut.d.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ # if our memory location is being read
+ # ... and the holding register has valid data
+ # ... then its value must match the memory output, on the given lane
+ with m.If((Past(dut.a) == a_const) & wrote):
+ m.d.sync += Assert(d_reg == dut.q.word_select(lane, gran))
+
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=2)
+
+
+class PhasedDualPortRegfile(Elaboratable):
+ """
+ Builds, from a pair of 1RW blocks, a pseudo 1W/1R RAM, where the
+ read port works every cycle, but the write port is only available on
+ either even (1eW/1R) or odd (1oW/1R) cycles.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param write_phase: indicates on which phase the write port will
+ accept data
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False)
+
+ .. note:: The debug read port is meant only to assist in formal proofs!
+ """
+
+ def __init__(self, addr_width, data_width, we_width, write_phase,
+ transparent=False):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.write_phase = write_phase
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """write port address"""
+ self.wr_data_i = Signal(data_width); """write port data"""
+ self.wr_we_i = Signal(we_width); """write port enable"""
+ self.rd_addr_i = Signal(addr_width); """read port address"""
+ self.rd_data_o = Signal(data_width); """read port data"""
+ self.phase = Signal(); """even/odd cycle indicator"""
+ # debug signals, only used in formal proofs
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # granularity
+ # instantiate the two 1RW memory blocks
+ mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ m.submodules.mem1 = mem1
+ m.submodules.mem2 = mem2
+ # wire write port to first memory, and its output to the second
+ m.d.comb += mem1.d.eq(self.wr_data_i)
+ m.d.comb += mem2.d.eq(mem1.q)
+ # holding registers for the write port of the second memory
+ last_wr_addr = Signal(self.addr_width)
+ last_wr_we = Signal(self.we_width)
+ # do the read and write address coincide?
+ same_read_write = Signal()
+ with m.If(self.phase == self.write_phase):
+ # write phase, start a write on the first memory
+ m.d.comb += mem1.a.eq(self.wr_addr_i)
+ m.d.comb += mem1.we.eq(self.wr_we_i)
+ # save write address and write select for repeating the write
+ # on the second memory, later
+ m.d.sync += last_wr_we.eq(self.wr_we_i)
+ m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+ # start a read on the second memory
+ m.d.comb += mem2.a.eq(self.rd_addr_i)
+ # output previously read data from the first memory
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ if self.transparent:
+ # remember whether we are reading from the same location we are
+ # writing
+ m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+ with m.Else():
+ # read phase, write last written data on second memory
+ m.d.comb += mem2.a.eq(last_wr_addr)
+ m.d.comb += mem2.we.eq(last_wr_we)
+ # start a read on the first memory
+ m.d.comb += mem1.a.eq(self.rd_addr_i)
+ if self.transparent:
+ with m.If(same_read_write):
+ # when transparent, and read and write addresses coincide,
+ # output the data just written
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ with m.Else():
+ # otherwise, output previously read data
+ # from the second memory
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+ else:
+ # always output the read data from the second memory,
+ # if not transparent
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+
+ if platform == "formal":
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # pass the address and write lane under test to both memories
+ mem1.dbg_addr.eq(self.dbg_addr),
+ mem2.dbg_addr.eq(self.dbg_addr),
+ mem1.dbg_lane.eq(self.dbg_lane),
+ mem2.dbg_lane.eq(self.dbg_lane),
+ # the second memory copies its state from the first memory,
+ # after a cycle, so it has a one cycle delay
+ mem1.dbg_data.eq(self.dbg_data),
+ mem2.dbg_data.eq(Past(self.dbg_data)),
+ mem1.dbg_wrote.eq(self.dbg_wrote),
+ mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+ ]
+
+ return m
+
+ def ports(self):
+ return [
+ self.wr_addr_i,
+ self.wr_data_i,
+ self.wr_we_i,
+ self.rd_addr_i,
+ self.rd_data_o,
+ self.phase
+ ]
+
+
+class PhasedDualPortRegfileTestCase(FHDLTestCase):
+
+ def do_test_phased_dual_port_regfile(self, write_phase, transparent):
+ """
+ Simulate some read/write/modify operations on the phased write memory
+ """
+ dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, expected=None):
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+
+ # start a write, and set write phase
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+ yield dut.phase.eq(write_phase)
+
+ # disable writes, and start read phase
+ def skip_write():
+ yield dut.wr_addr_i.eq(0)
+ yield dut.wr_we_i.eq(0)
+ yield dut.wr_data_i.eq(0)
+ yield dut.phase.eq(~write_phase)
+
+ # writes a few values on the write port, and read them back
+ # ... reads can happen every cycle
+ # ... writes, only every two cycles.
+ # since reads have a one cycle delay, the expected value on
+ # each read refers to the last read performed, not the
+ # current one, which is in progress.
+ def process():
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42)
+ yield from skip_write()
+ yield
+ yield from read(0x42)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x12345678)
+ yield from skip_write()
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from write(0x43, 0b1001, 0xF0FFFF9A)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from skip_write()
+ yield
+ yield from read(0x43, 0x12345678)
+ yield from write(0x42, 0b0110, 0xFF5634FF)
+ yield
+ yield from read(0x42, 0xF0BCDE9A)
+ yield from skip_write()
+ yield
+ yield from read(0, 0xF0BCDE9A)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0, 0x12563478)
+ yield from skip_write()
+ yield
+ # try reading and writing to the same location, simultaneously
+ yield from read(0x42)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # ... and read again
+ yield from read(0x42)
+ yield from skip_write()
+ yield
+ if transparent:
+ # returns the value just written
+ yield from read(0, 0x12AA3466)
+ else:
+ # returns the old value
+ yield from read(0, 0x12563478)
+ yield from write(0, 0, 0)
+ yield
+ # after a cycle, always returns the new value
+ yield from read(0, 0x12AA3466)
+ yield from skip_write()
+
+ sim.add_sync_process(process)
+ debug_file = f'test_phased_dual_port_{write_phase}'
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]']
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_phased_dual_port_regfile(self):
+ """test both types (odd and even write ports) of phased write memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_phased_dual_port_regfile(0, False)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_phased_dual_port_regfile(1, False)
+ """test again, with a transparent read port"""
+ with self.subTest("writes happen on phase 0 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile(0, True)
+ with self.subTest("writes happen on phase 1 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile(1, True)
+
+ def do_test_phased_dual_port_regfile_proof(self, write_phase, transparent):
+ """
+ Formal proof of the pseudo 1W/1R regfile
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+ m.submodules.dut = dut
+ gran = dut.data_width // dut.we_width # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.addr_width)
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # drive alternating phases
+ m.d.comb += Assume(dut.phase != Past(dut.phase))
+ # holding data register
+ d_reg = Signal(gran)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # if our memory location and byte lane is being written,
+ # capture the data in our holding register
+ with m.If((dut.wr_addr_i == a_const)
+ & dut.wr_we_i.bit_select(lane, 1)
+ & (dut.phase == dut.write_phase)):
+ m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ # if our memory location is being read,
+ # and the holding register has valid data,
+ # then its value must match the memory output, on the given lane
+ with m.If(Past(dut.rd_addr_i) == a_const):
+ if transparent:
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ else:
+ # with a non-transparent read port, the read value depends
+ # on whether there is a simultaneous write, or not
+ with m.If((Past(dut.wr_addr_i) == a_const)
+ & Past(dut.phase) == dut.write_phase):
+ # simultaneous write -> check against last written value
+ with m.If(Past(wrote)):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(Past(d_reg) == rd_lane)
+ with m.Else():
+ # otherwise, check against current written value
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # address and mask under test
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ # state of our holding register
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=3)
+
+ def test_phased_dual_port_regfile_proof(self):
+ """test both types (odd and even write ports) of phased write memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_phased_dual_port_regfile_proof(0, False)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_phased_dual_port_regfile_proof(1, False)
+ # test again, with transparent read ports
+ with self.subTest("writes happen on phase 0 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile_proof(0, True)
+ with self.subTest("writes happen on phase 1 (transparent reads)"):
+ self.do_test_phased_dual_port_regfile_proof(1, True)
+
+
+class DualPortRegfile(Elaboratable):
+ """
+ Builds, from a pair of phased 1W/1R blocks, a true 1W/1R RAM, where both
+ read and write ports work every cycle.
+ It employs a Last Value Table, that tracks to which memory each address was
+ last written.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False)
+ """
+
+ def __init__(self, addr_width, data_width, we_width, transparent=True):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """write port address"""
+ self.wr_data_i = Signal(data_width); """write port data"""
+ self.wr_we_i = Signal(we_width); """write port enable"""
+ self.rd_addr_i = Signal(addr_width); """read port address"""
+ self.rd_data_o = Signal(data_width); """read port data"""
+ # debug signals, only used in formal proofs
+ # address and write lane under test
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ # upstream state, to keep in sync with ours
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+ self.dbg_wrote_phase = Signal(); """debug: the phase data was written"""
+ self.dbg_phase = Signal(); """debug: current phase"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # depth and granularity
+ depth = 1 << self.addr_width
+ gran = self.data_width // self.we_width
+ # instantiate the two phased 1R/1W memory blocks
+ mem0 = PhasedDualPortRegfile(
+ self.addr_width, self.data_width, self.we_width, 0,
+ self.transparent)
+ mem1 = PhasedDualPortRegfile(
+ self.addr_width, self.data_width, self.we_width, 1,
+ self.transparent)
+ m.submodules.mem0 = mem0
+ m.submodules.mem1 = mem1
+ # instantiate the backing memory (FFRAM or LUTRAM)
+ # for the Last Value Table
+ # it should have the same number and port types of the desired
+ # memory, but just one bit per write lane
+ lvt_mem = Memory(width=self.we_width, depth=depth)
+ lvt_wr = lvt_mem.write_port(granularity=1)
+ lvt_rd = lvt_mem.read_port(transparent=self.transparent)
+ if not self.transparent:
+ # for some reason, formal proofs don't recognize the default
+ # reset value for this signal
+ m.d.comb += lvt_rd.en.eq(1)
+ m.submodules.lvt_wr = lvt_wr
+ m.submodules.lvt_rd = lvt_rd
+ # generate and wire the phases for the phased memories
+ phase = Signal()
+ m.d.sync += phase.eq(~phase)
+ m.d.comb += [
+ mem0.phase.eq(phase),
+ mem1.phase.eq(phase),
+ ]
+ m.d.comb += [
+ # wire the write ports, directly
+ mem0.wr_addr_i.eq(self.wr_addr_i),
+ mem1.wr_addr_i.eq(self.wr_addr_i),
+ mem0.wr_we_i.eq(self.wr_we_i),
+ mem1.wr_we_i.eq(self.wr_we_i),
+ mem0.wr_data_i.eq(self.wr_data_i),
+ mem1.wr_data_i.eq(self.wr_data_i),
+ # also wire the read addresses
+ mem0.rd_addr_i.eq(self.rd_addr_i),
+ mem1.rd_addr_i.eq(self.rd_addr_i),
+ # wire read and write ports to the LVT
+ lvt_wr.addr.eq(self.wr_addr_i),
+ lvt_wr.en.eq(self.wr_we_i),
+ lvt_rd.addr.eq(self.rd_addr_i),
+ # the data for the LVT is the phase on which the value was
+ # written
+ lvt_wr.data.eq(Repl(phase, self.we_width)),
+ ]
+ for i in range(self.we_width):
+ # select the right memory to assign to the output read port,
+ # in this byte lane, according to the LVT contents
+ m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+ Mux(
+ lvt_rd.data[i],
+ mem1.rd_data_o.word_select(i, gran),
+ mem0.rd_data_o.word_select(i, gran)))
+
+ if platform == "formal":
+ # pass upstream state to the memories, so they can ensure that
+ # their state are in sync with upstream, for induction
+ m.d.comb += [
+ # address and write lane under test
+ mem0.dbg_addr.eq(self.dbg_addr),
+ mem1.dbg_addr.eq(self.dbg_addr),
+ mem0.dbg_lane.eq(self.dbg_lane),
+ mem1.dbg_lane.eq(self.dbg_lane),
+ # upstream state
+ mem0.dbg_data.eq(self.dbg_data),
+ mem1.dbg_data.eq(self.dbg_data),
+ # the memory, on which the write ends up, depends on which
+ # phase it was written
+ mem0.dbg_wrote.eq(self.dbg_wrote & ~self.dbg_wrote_phase),
+ mem1.dbg_wrote.eq(self.dbg_wrote & self.dbg_wrote_phase),
+ ]
+ # sync phase to upstream
+ m.d.comb += Assert(self.dbg_phase == phase)
+ # this debug port for the LVT is an asynchronous read port,
+ # allowing direct access to a given memory location
+ # by the formal engine
+ m.submodules.dbgport = dbgport = lvt_mem.read_port(domain='comb')
+ # first, get the value stored in our memory location,
+ stored = Signal(self.we_width)
+ m.d.comb += dbgport.addr.eq(self.dbg_addr)
+ m.d.comb += stored.eq(dbgport.data)
+ # now, ensure that the value stored in memory is always in sync
+ # with the expected value (which memory the value was written to)
+ with m.If(self.dbg_wrote):
+ m.d.comb += Assert(stored.bit_select(self.dbg_lane, 1)
+ == self.dbg_wrote_phase)
+ return m
+
+ def ports(self):
+ return [
+ self.wr_addr_i,
+ self.wr_data_i,
+ self.wr_we_i,
+ self.rd_addr_i,
+ self.rd_data_o
+ ]
+
+
+class DualPortRegfileTestCase(FHDLTestCase):
+
+ def do_test_dual_port_regfile(self, transparent):
+ """
+ Simulate some read/write/modify operations on the dual port register
+ file
+ """
+ dut = DualPortRegfile(7, 32, 4, transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ expected = None
+ last_expected = None
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, next_expected=None):
+ nonlocal expected, last_expected
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+ # account for the read latency
+ expected = last_expected
+ last_expected = next_expected
+
+ # start a write
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+
+ def process():
+ # write a pair of values, one for each memory
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x87654321)
+ yield
+ yield from read(0x42, 0x87654321)
+ yield from write(0x43, 0b1111, 0x0FEDCBA9)
+ yield
+ # skip a beat
+ yield from read(0x43, 0x0FEDCBA9)
+ yield from write(0, 0, 0)
+ yield
+ # write again, but now they switch memories
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from write(0, 0, 0)
+ yield
+ # test partial writes
+ yield from read(0)
+ yield from write(0x42, 0b1001, 0x78FFFF12)
+ yield
+ yield from read(0)
+ yield from write(0x43, 0b0110, 0xFFDEABFF)
+ yield
+ yield from read(0x42, 0x78345612)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0x43, 0x9ADEABF0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ if transparent:
+ # returns the value just written
+ yield from read(0x42, 0x78AA5666)
+ else:
+ # returns the old value
+ yield from read(0x42, 0x78345612)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # after a cycle, always returns the new value
+ yield from read(0x42, 0x78AA5666)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+
+ sim.add_sync_process(process)
+ debug_file = 'test_dual_port_regfile'
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ {'comment': 'write port'},
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ {'comment': 'read port'},
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+ {'comment': 'LVT write port'},
+ 'phase', 'lvt_mem_w_addr[6:0]', 'lvt_mem_w_en[3:0]',
+ 'lvt_mem_w_data[3:0]',
+ {'comment': 'LVT read port'},
+ 'lvt_mem_r_addr[6:0]', 'lvt_mem_r_data[3:0]',
+ {'comment': 'backing memory'},
+ 'mem0.rd_data_o[31:0]',
+ 'mem1.rd_data_o[31:0]',
+ ]
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_dual_port_regfile(self):
+ with self.subTest("non-transparent reads"):
+ self.do_test_dual_port_regfile(False)
+ with self.subTest("transparent reads"):
+ self.do_test_dual_port_regfile(True)
+
+ def do_test_dual_port_regfile_proof(self, transparent=True):
+ """
+ Formal proof of the 1W/1R regfile
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ dut = DualPortRegfile(7, 32, 4, transparent)
+ m.submodules.dut = dut
+ gran = dut.data_width // dut.we_width # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.addr_width)
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # holding data register
+ d_reg = Signal(gran)
+ # keep track of the phase, so we can remember which memory
+ # we wrote to
+ phase = Signal()
+ m.d.sync += phase.eq(~phase)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # ... and on which phase it was written
+ wrote_phase = Signal()
+ # if our memory location and byte lane is being written,
+ # capture the data in our holding register
+ with m.If((dut.wr_addr_i == a_const)
+ & dut.wr_we_i.bit_select(lane, 1)):
+ m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ m.d.sync += wrote_phase.eq(phase)
+ # if our memory location is being read,
+ # and the holding register has valid data,
+ # then its value must match the memory output, on the given lane
+ with m.If(Past(dut.rd_addr_i) == a_const):
+ if transparent:
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ else:
+ # with a non-transparent read port, the read value depends
+ # on whether there is a simultaneous write, or not
+ with m.If(Past(dut.wr_addr_i) == a_const):
+ # simultaneous write -> check against last written value
+ with m.If(wrote & Past(wrote)):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(Past(d_reg) == rd_lane)
+ with m.Else():
+ # otherwise, check against current written value
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+
+ m.d.comb += [
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ dut.dbg_wrote_phase.eq(wrote_phase),
+ dut.dbg_phase.eq(phase),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=3)
+
+ def test_dual_port_regfile_proof(self):
+ """
+ Formal check of 1W/1R regfile (transparent and not)
+ """
+ with self.subTest("transparent reads"):
+ self.do_test_dual_port_regfile_proof(True)
+ with self.subTest("non-transparent reads"):
+ self.do_test_dual_port_regfile_proof(False)
+
+
+class PhasedReadPhasedWriteFullReadSRAM(Elaboratable):
+ """
+ Builds, from three 1RW blocks, a pseudo 1W/2R SRAM, with:
+
+ * one full read port, which works every cycle,
+ * one write port, which is only available on either even or odd cycles,
+ * an extra transparent read port, available only on the same cycles as the
+ write port
+
+ This type of SRAM is useful for a XOR-based 6x1RW implementation of
+ a 1R/1W register file.
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param write_phase: indicates on which phase the write port will
+ accept data
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False) on the full
+ read port
+
+ .. note:: The debug read port is meant only to assist in formal proofs!
+ """
+
+ def __init__(self, addr_width, data_width, we_width, write_phase,
+ transparent=True):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.write_phase = write_phase
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """phased write port address"""
+ self.wr_data_i = Signal(data_width); """phased write port data"""
+ self.wr_we_i = Signal(we_width); """phased write port enable"""
+ self.rd_addr_i = Signal(addr_width); """full read port address"""
+ self.rd_data_o = Signal(data_width); """full read port data"""
+ self.rdp_addr_i = Signal(addr_width); """phased read port address"""
+ self.rdp_data_o = Signal(data_width); """phased read port data"""
+ self.phase = Signal(); """even/odd cycle indicator"""
+ # debug signals, only used in formal proofs
+ self.dbg_addr = Signal(addr_width); """debug: address under test"""
+ lanes = range(we_width)
+ self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+ gran = self.data_width // self.we_width
+ self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+ self.dbg_wrote = Signal(); """debug: data is valid"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # instantiate the 1RW memory blocks
+ mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ mem3 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+ m.submodules.mem1 = mem1
+ m.submodules.mem2 = mem2
+ m.submodules.mem3 = mem3
+ # wire input write data to first memory, and its output to the others
+ m.d.comb += [
+ mem1.d.eq(self.wr_data_i),
+ mem2.d.eq(mem1.q),
+ mem3.d.eq(mem1.q)
+ ]
+ # holding registers for the write port of the other memories
+ last_wr_addr = Signal(self.addr_width)
+ last_wr_we = Signal(self.we_width)
+ # do read and write addresses coincide?
+ same_read_write = Signal()
+ same_phased_read_write = Signal()
+ with m.If(self.phase == self.write_phase):
+ # write phase, start a write on the first memory
+ m.d.comb += mem1.a.eq(self.wr_addr_i)
+ m.d.comb += mem1.we.eq(self.wr_we_i)
+ # save write address and write select for repeating the write
+ # on the other memories, one cycle later
+ m.d.sync += last_wr_we.eq(self.wr_we_i)
+ m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+ # start a read on the other memories
+ m.d.comb += mem2.a.eq(self.rd_addr_i)
+ m.d.comb += mem3.a.eq(self.rdp_addr_i)
+ # output previously read data from the first memory
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ # remember whether we are reading from the same location as we
+ # are writing
+ m.d.sync += same_phased_read_write.eq(
+ self.rdp_addr_i == self.wr_addr_i)
+ if self.transparent:
+ m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+ with m.Else():
+ # read phase, write last written data on the other memories
+ m.d.comb += [
+ mem2.a.eq(last_wr_addr),
+ mem2.we.eq(last_wr_we),
+ mem3.a.eq(last_wr_addr),
+ mem3.we.eq(last_wr_we),
+ ]
+ # start a read on the first memory
+ m.d.comb += mem1.a.eq(self.rd_addr_i)
+ # output the read data from the second memory
+ if self.transparent:
+ with m.If(same_read_write):
+ # when transparent, and read and write addresses coincide,
+ # output the data just written
+ m.d.comb += self.rd_data_o.eq(mem1.q)
+ with m.Else():
+ # otherwise, output previously read data
+ # from the second memory
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+ else:
+ # always output the read data from the second memory,
+ # if not transparent
+ m.d.comb += self.rd_data_o.eq(mem2.q)
+ with m.If(same_phased_read_write):
+ # if read and write addresses coincide,
+ # output the data just written
+ m.d.comb += self.rdp_data_o.eq(mem1.q)
+ with m.Else():
+ # otherwise, output previously read data
+ # from the third memory
+ m.d.comb += self.rdp_data_o.eq(mem3.q)
+
+ if platform == "formal":
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # pass the address and write lane under test to both memories
+ mem1.dbg_addr.eq(self.dbg_addr),
+ mem2.dbg_addr.eq(self.dbg_addr),
+ mem3.dbg_addr.eq(self.dbg_addr),
+ mem1.dbg_lane.eq(self.dbg_lane),
+ mem2.dbg_lane.eq(self.dbg_lane),
+ mem3.dbg_lane.eq(self.dbg_lane),
+ # the other memories copy their state from the first memory,
+ # after a cycle, so they have a one cycle delay
+ mem1.dbg_data.eq(self.dbg_data),
+ mem2.dbg_data.eq(Past(self.dbg_data)),
+ mem3.dbg_data.eq(Past(self.dbg_data)),
+ mem1.dbg_wrote.eq(self.dbg_wrote),
+ mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+ mem3.dbg_wrote.eq(Past(self.dbg_wrote)),
+ ]
+
+ return m
+
+
+class PhasedReadPhasedWriteFullReadSRAMTestCase(FHDLTestCase):
+
+ def do_test_case(self, write_phase, transparent):
+ """
+ Simulate some read/write/modify operations
+ """
+ dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+ transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ expected = None
+ last_expected = None
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, next_expected=None):
+ nonlocal expected, last_expected
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+ # account for the read latency
+ expected = last_expected
+ last_expected = next_expected
+
+ expected2 = None
+
+ # same as above, but for the phased read port
+ def phased_read(rdp_addr_i, next_expected2=None):
+ nonlocal expected2
+ if expected2 is not None:
+ self.assertEqual((yield dut.rdp_data_o), expected2)
+ yield dut.rdp_addr_i.eq(rdp_addr_i)
+ # account for the read latency
+ expected2 = next_expected2
+
+ # start a write
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+ yield dut.phase.eq(write_phase)
+
+ # disable writes, and start read phase
+ def skip_write():
+ yield dut.wr_addr_i.eq(0)
+ yield dut.wr_we_i.eq(0)
+ yield dut.wr_data_i.eq(0)
+ yield dut.phase.eq(~write_phase)
+ # also skip reading from the phased read port
+ yield dut.rdp_addr_i.eq(0)
+
+ # writes a few values on the write port, and read them back
+ def process():
+ yield from read(0)
+ yield from phased_read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from skip_write()
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from phased_read(0x42, 0x12345678)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from skip_write()
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from phased_read(0x42, 0x12345678)
+ yield from write(0x43, 0b1001, 0xF0FFFF9A)
+ yield
+ yield from read(0x43, 0xF0BCDE9A)
+ yield from skip_write()
+ yield
+ yield from read(0x43, 0xF0BCDE9A)
+ yield from phased_read(0x43, 0xF0BCDE9A)
+ yield from write(0x42, 0b0110, 0xFF5634FF)
+ yield
+ yield from read(0x42, 0x12563478)
+ yield from skip_write()
+ yield
+ yield from read(0)
+ yield from phased_read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from skip_write()
+ yield
+ # try reading and writing at the same time
+ if transparent:
+ # transparent port, return the value just written
+ yield from read(0x42, 0x12AA3466)
+ else:
+ # ... otherwise, return the old value
+ yield from read(0x42, 0x12563478)
+ # transparent port, always return the value just written
+ yield from phased_read(0x42, 0x12AA3466)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # after a cycle, always returns the new value
+ yield from read(0x42, 0x12AA3466)
+ yield from skip_write()
+ yield
+ yield from read(0)
+ yield from phased_read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from skip_write()
+
+ sim.add_sync_process(process)
+ debug_file = 'test_phased_read_write_sram_' + str(write_phase)
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ {'comment': 'phased write port'},
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ {'comment': 'full read port'},
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+ {'comment': 'phased read port'},
+ 'rdp_addr_i[6:0]', 'rdp_data_o[31:0]']
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_case(self):
+ """test both types (odd and even write ports) of phased memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_case(0, True)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_case(1, True)
+ with self.subTest("writes happen on phase 0 (non-transparent reads)"):
+ self.do_test_case(0, False)
+ with self.subTest("writes happen on phase 1 (non-transparent reads)"):
+ self.do_test_case(1, False)
+
+ def do_test_formal(self, write_phase, transparent):
+ """
+ Formal proof of the pseudo 1W/2R regfile
+ """
+ m = Module()
+ # 128 x 32-bit, 8-bit granularity
+ dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+ transparent)
+ m.submodules.dut = dut
+ gran = dut.data_width // dut.we_width # granularity
+ # choose a single random memory location to test
+ a_const = AnyConst(dut.addr_width)
+ # choose a single byte lane to test
+ lane = AnyConst(range(dut.we_width))
+ # drive alternating phases
+ m.d.comb += Assume(dut.phase != Past(dut.phase))
+ # holding data register
+ d_reg = Signal(gran)
+ # for some reason, simulated formal memory is not zeroed at reset
+ # ... so, remember whether we wrote it, at least once.
+ wrote = Signal()
+ # if our memory location and byte lane is being written,
+ # capture the data in our holding register
+ with m.If((dut.wr_addr_i == a_const)
+ & dut.wr_we_i.bit_select(lane, 1)
+ & (dut.phase == dut.write_phase)):
+ m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+ m.d.sync += wrote.eq(1)
+ # if our memory location is being read,
+ # and the holding register has valid data,
+ # then its value must match the memory output, on the given lane
+ with m.If(Past(dut.rd_addr_i) == a_const):
+ if transparent:
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ else:
+ # with a non-transparent read port, the read value depends
+ # on whether there is a simultaneous write, or not
+ with m.If((Past(dut.wr_addr_i) == a_const)
+ & Past(dut.phase) == dut.write_phase):
+ # simultaneous write -> check against last written value
+ with m.If(Past(wrote)):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(Past(d_reg) == rd_lane)
+ with m.Else():
+ # otherwise, check against current written value
+ with m.If(wrote):
+ rd_lane = dut.rd_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rd_lane)
+ # same for the phased read port, except it's always transparent
+ # and the port works only on the write phase
+ with m.If((Past(dut.rdp_addr_i) == a_const) & wrote
+ & (Past(dut.phase) == dut.write_phase)):
+ rdp_lane = dut.rdp_data_o.word_select(lane, gran)
+ m.d.sync += Assert(d_reg == rdp_lane)
+
+ # pass our state to the device under test, so it can ensure that
+ # its state is in sync with ours, for induction
+ m.d.comb += [
+ # address and mask under test
+ dut.dbg_addr.eq(a_const),
+ dut.dbg_lane.eq(lane),
+ # state of our holding register
+ dut.dbg_data.eq(d_reg),
+ dut.dbg_wrote.eq(wrote),
+ ]
+
+ self.assertFormal(m, mode="prove", depth=3)
+
+ def test_formal(self):
+ """test both types (odd and even write ports) of phased write memory"""
+ with self.subTest("writes happen on phase 0"):
+ self.do_test_formal(0, False)
+ with self.subTest("writes happen on phase 1"):
+ self.do_test_formal(1, False)
+ # test again, with transparent read ports
+ with self.subTest("writes happen on phase 0 (transparent reads)"):
+ self.do_test_formal(0, True)
+ with self.subTest("writes happen on phase 1 (transparent reads)"):
+ self.do_test_formal(1, True)
+
+
+class DualPortXorRegfile(Elaboratable):
+ """
+ Builds, from a pair of phased 1W/2R blocks, a true 1W/1R RAM, where both
+ write and (non-transparent) read ports work every cycle.
+
+ It employs a XOR trick, as follows:
+
+ 1) Like before, there are two memories, each reading on every cycle, and
+ writing on alternate cycles
+ 2) Instead of a MUX, the read port is a direct XOR of the two memories.
+ 3) Writes happens in two cycles:
+
+ First, read the current value of the *other* memory, at the write
+ location.
+
+ Then, on *this* memory, write that read value, XORed with the desired
+ value.
+
+ This recovers the desired value when read:
+ (other XOR desired) XOR other = desired
+
+ :param addr_width: width of the address bus
+ :param data_width: width of the data bus
+ :param we_width: number of write enable lines
+ :param transparent: whether a simultaneous read and write returns the
+ new value (True) or the old value (False) on the full
+ read port
+ """
+
+ def __init__(self, addr_width, data_width, we_width, transparent):
+ self.addr_width = addr_width
+ self.data_width = data_width
+ self.we_width = we_width
+ self.transparent = transparent
+ # interface signals
+ self.wr_addr_i = Signal(addr_width); """write port address"""
+ self.wr_data_i = Signal(data_width); """write port data"""
+ self.wr_we_i = Signal(we_width); """write port enable"""
+ self.rd_addr_i = Signal(addr_width); """read port address"""
+ self.rd_data_o = Signal(data_width); """read port data"""
+
+ def elaborate(self, platform):
+ m = Module()
+ # instantiate the two phased 1W/2R memory blocks
+ mem0 = PhasedReadPhasedWriteFullReadSRAM(
+ self.addr_width, self.data_width, self.we_width, 0, True)
+ mem1 = PhasedReadPhasedWriteFullReadSRAM(
+ self.addr_width, self.data_width, self.we_width, 1, True)
+ m.submodules.mem0 = mem0
+ m.submodules.mem1 = mem1
+ # generate and wire the phases for the phased memories
+ phase = Signal()
+ m.d.sync += phase.eq(~phase)
+ m.d.comb += [
+ mem0.phase.eq(phase),
+ mem1.phase.eq(phase),
+ ]
+ # store the write information for the next cycle
+ last_addr = Signal(self.addr_width)
+ last_we = Signal(self.we_width)
+ last_data = Signal(self.data_width)
+ m.d.sync += [
+ last_addr.eq(self.wr_addr_i),
+ last_we.eq(self.wr_we_i),
+ last_data.eq(self.wr_data_i),
+ ]
+ # read path
+ # wire read address to memories, and XOR their output
+ xor_data = Signal(self.data_width)
+ m.d.comb += [
+ mem0.rd_addr_i.eq(self.rd_addr_i),
+ mem1.rd_addr_i.eq(self.rd_addr_i),
+ xor_data.eq(mem0.rd_data_o ^ mem1.rd_data_o),
+ ]
+ if self.transparent:
+ # do the read and write addresses coincide?
+ same_read_write = Signal()
+ m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+ gran = self.data_width // self.we_width
+ for i in range(self.we_width):
+ # when simultaneously reading and writing to the same location
+ # and write lane, bypass the memory, and output the write
+ # holding register instead
+ with m.If(same_read_write & last_we[i]):
+ m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+ last_data.word_select(i, gran))
+ # otherwise, output the xor data
+ with m.Else():
+ m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+ xor_data.word_select(i, gran))
+ # when not transparent, just output the memory contents (xor data)
+ else:
+ m.d.comb += self.rd_data_o.eq(xor_data)
+ # write path
+ # 1) read the memory location which is about to be written
+ m.d.comb += [
+ mem0.rdp_addr_i.eq(self.wr_addr_i),
+ mem1.rdp_addr_i.eq(self.wr_addr_i),
+ ]
+ # 2) write the XOR of the other memory data, and the desired value
+ m.d.comb += [
+ mem0.wr_addr_i.eq(last_addr),
+ mem1.wr_addr_i.eq(last_addr),
+ mem0.wr_we_i.eq(last_we),
+ mem1.wr_we_i.eq(last_we),
+ mem0.wr_data_i.eq(last_data ^ mem1.rdp_data_o),
+ mem1.wr_data_i.eq(last_data ^ mem0.rdp_data_o),
+ ]
+ return m
+
+
+class DualPortXorRegfileTestCase(FHDLTestCase):
+
+ def do_test_case(self, transparent):
+ """
+ Simulate some read/write/modify operations on the dual port register
+ file
+ """
+ dut = DualPortXorRegfile(7, 32, 4, transparent)
+ sim = Simulator(dut)
+ sim.add_clock(1e-6)
+
+ expected = None
+ last_expected = None
+
+ # compare read data with previously written data
+ # and start a new read
+ def read(rd_addr_i, next_expected=None):
+ nonlocal expected, last_expected
+ if expected is not None:
+ self.assertEqual((yield dut.rd_data_o), expected)
+ yield dut.rd_addr_i.eq(rd_addr_i)
+ # account for the read latency
+ expected = last_expected
+ last_expected = next_expected
+
+ # start a write
+ def write(wr_addr_i, wr_we_i, wr_data_i):
+ yield dut.wr_addr_i.eq(wr_addr_i)
+ yield dut.wr_we_i.eq(wr_we_i)
+ yield dut.wr_data_i.eq(wr_data_i)
+
+ def process():
+ # write a pair of values, one for each memory
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x87654321)
+ yield
+ yield from read(0x42, 0x87654321)
+ yield from write(0x43, 0b1111, 0x0FEDCBA9)
+ yield
+ # skip a beat
+ yield from read(0x43, 0x0FEDCBA9)
+ yield from write(0, 0, 0)
+ yield
+ # write again, but now they switch memories
+ yield from read(0)
+ yield from write(0x42, 0b1111, 0x12345678)
+ yield
+ yield from read(0x42, 0x12345678)
+ yield from write(0x43, 0b1111, 0x9ABCDEF0)
+ yield
+ yield from read(0x43, 0x9ABCDEF0)
+ yield from write(0, 0, 0)
+ yield
+ # test partial writes
+ yield from read(0)
+ yield from write(0x42, 0b1001, 0x78FFFF12)
+ yield
+ yield from read(0)
+ yield from write(0x43, 0b0110, 0xFFDEABFF)
+ yield
+ yield from read(0x42, 0x78345612)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0x43, 0x9ADEABF0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ # test simultaneous read and write
+ if transparent:
+ # transparent reads, returns the new value
+ yield from read(0x42, 0x78AA5666)
+ else:
+ # non-transparent read: returns the old value
+ yield from read(0x42, 0x78345612)
+ yield from write(0x42, 0b0101, 0x55AA9966)
+ yield
+ # after a cycle, returns the new value
+ yield from read(0x42, 0x78AA5666)
+ yield from write(0, 0, 0)
+ yield
+ # settle down
+ yield from read(0)
+ yield from write(0, 0, 0)
+ yield
+ yield from read(0)
+ yield from write(0, 0, 0)
+
+ sim.add_sync_process(process)
+ debug_file = 'test_dual_port_xor_regfile'
+ if transparent:
+ debug_file += '_transparent'
+ traces = ['clk', 'phase',
+ {'comment': 'write port'},
+ 'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+ {'comment': 'read port'},
+ 'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+ ]
+ write_gtkw(debug_file + '.gtkw',
+ debug_file + '.vcd',
+ traces, module='top', zoom=-22)
+ sim_writer = sim.write_vcd(debug_file + '.vcd')
+ with sim_writer:
+ sim.run()
+
+ def test_case(self):
+ with self.subTest("non-transparent reads"):
+ self.do_test_case(False)
+ with self.subTest("transparent reads"):
+ self.do_test_case(True)
+
+
+if __name__ == "__main__":
+ unittest.main()
# 3 bits, unary: return the port
if regfile == 'XER':
return port
- # 3 bits, unary: return the port
- if regfile == 'SVSTATE':
+ # 5 bits, unary: return the port
+ if regfile == 'STATE':
return port
# 9 bits (9 entries), might be unary already
if regfile == 'FAST':
# link LoadStore1 into MMU
mmu = self.fus.get_fu('mmu0')
+ ldst0 = self.fus.get_fu('ldst0')
print ("core pspec", pspec.ldst_ifacetype)
print ("core mmu", mmu)
if mmu is not None:
- print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
- mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+ lsi = l0.cmpi.lsmem.lsi # a LoadStore1 Interface object
+ print ("core lsmem.lsi", lsi)
+ mmu.alu.set_ldst_interface(lsi)
+ # urr store I-Cache in core so it is easier to get at
+ self.icache = lsi.icache
+
+ # alternative reset values for STATE regs. these probably shouldn't
+ # be set, here, instead have them done by Issuer. which they are.
+ # as well. because core.state overrides them. sigh.
+ self.msr_at_reset = 0x0
+ self.pc_at_reset = 0x0
+ if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int):
+ self.msr_at_reset = pspec.msr_reset
+ if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int):
+ self.pc_at_reset = pspec.pc_reset
+ state_resets = [self.pc_at_reset, # PC at reset
+ self.msr_at_reset, # MSR at reset
+ 0x0, # SVSTATE at reset
+ 0x0, # DEC at reset
+ 0x0] # TB at reset
# register files (yes plural)
- self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs)
+ self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs,
+ state_resets=state_resets)
# set up input and output: unusual requirement to set data directly
# (due to the way that the core is set up in a different domain,
svp64_en=self.svp64_en,
regreduce_en=self.regreduce_en)
self.des[funame] = self.decoders[funame].do
+ print ("create decoder subset", funame, opkls, self.des[funame])
# create per-Function Unit write-after-write hazard signals
# yes, really, this should have been added in ReservationStations
if "mmu0" in self.decoders:
self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
+ # allow pausing of the DEC/TB FSM back in Issuer, by spotting
+ # if there is an MTSPR instruction
+ self.pause_dec_tb = Signal()
+
# next 3 functions are Stage API Compliance
def setup(self, m, i):
pass
# is a waw hazard. decoder has to still
# be asserted in order to detect that, tho
comb += fu.oper_i.eq_from(do)
+ if funame == 'mmu0':
+ # URRR this is truly dreadful.
+ # OP_FETCH_FAILED is a "fake" op.
+ # no instruction creates it. OP_TRAP
+ # uses the *main* decoder: this is
+ # a *Satellite* decoder that reacts
+ # on *insn_in*... not fake ops. gaah.
+ main_op = self.ireg.e.do
+ with m.If(main_op.insn_type ==
+ MicrOp.OP_FETCH_FAILED):
+ comb += fu.oper_i.insn_type.eq(
+ MicrOp.OP_FETCH_FAILED)
+ comb += fu.oper_i.fn_unit.eq(
+ Function.MMU)
# issue when valid (and no write-hazard)
comb += fu.issue_i.eq(~self.waw_hazard)
# instruction ok, indicate ready
funame.lower().startswith('trap')):
with m.If(fu.busy_o):
comb += busy_o.eq(1)
+ # for SPR pipeline pause dec/tb FSM to avoid race condition
+ # TODO: really this should be much more sophisticated,
+ # spot MTSPR, spot that DEC/TB is what is to be updated.
+ # a job for PowerDecoder2, there
+ if funame.lower().startswith('spr'):
+ with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR
+ ):
+ comb += self.pause_dec_tb.eq(1)
# return both the function unit "enable" dict as well as the "busy".
# the "busy-or-issued" can be passed in to the Read/Write port
if __name__ == '__main__':
pspec = TestMemPspec(ldst_ifacetype='testpi',
imem_ifacetype='',
- addr_wid=48,
+ addr_wid=64,
allow_overlap=True,
mask_wid=8,
reg_wid=64)
def __init__(self):
self.pc = Signal(64)
+ self.msr = Signal(64)
def eq(self, i):
- return [self.pc.eq(i.pc),
+ return [self.pc.eq(i.pc), self.msr.eq(i.msr),
]
--- /dev/null
+"""simple core issuer
+
+not in any way intended for production use. this runs a FSM that:
+
+* reads the Program Counter from StateRegs
+* reads an instruction from a fixed-size Test Memory
+* issues it to the Simple Core
+* waits for it to complete
+* increments the PC
+* does it all over again
+
+the purpose of this module is to verify the functional correctness
+of the Function Units in the absolute simplest and clearest possible
+way, and to at provide something that can be further incrementally
+improved.
+"""
+
+from nmigen import (Elaboratable, Module, Signal,
+ Mux, Const, Repl, Cat)
+from nmigen.cli import rtlil
+from nmigen.cli import main
+import sys
+
+from nmutil.singlepipe import ControlBase
+from soc.simple.core_data import FetchOutput, FetchInput
+
+from openpower.consts import MSR
+from openpower.decoder.power_enums import MicrOp
+from openpower.state import CoreState
+from soc.regfile.regfiles import StateRegs
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.experiment.icache import ICache
+
+from nmutil.util import rising_edge
+
+from soc.simple.issuer import TestIssuerBase
+
+def get_insn(f_instr_o, pc):
+ if f_instr_o.width == 32:
+ return f_instr_o
+ else:
+ # 64-bit: bit 2 of pc decides which word to select
+ return f_instr_o.word_select(pc[2], 32)
+
+
+# Fetch Finite State Machine.
+# WARNING: there are currently DriverConflicts but it's actually working.
+# TODO, here: everything that is global in nature, information from the
+# main TestIssuerInternal, needs to move to either ispec() or ospec().
+# not only that: TestIssuerInternal.imem can entirely move into here
+# because imem is only ever accessed inside the FetchFSM.
+class FetchFSM(ControlBase):
+ def __init__(self, allow_overlap, imem, core_rst,
+ pdecode2, cur_state,
+ dbg, core, svstate, nia):
+ self.allow_overlap = allow_overlap
+ self.imem = imem
+ self.core_rst = core_rst
+ self.pdecode2 = pdecode2
+ self.cur_state = cur_state
+ self.dbg = dbg
+ self.core = core
+ self.svstate = svstate
+ self.nia = nia
+
+ # set up pipeline ControlBase and allocate i/o specs
+ # (unusual: normally done by the Pipeline API)
+ super().__init__(stage=self)
+ self.p.i_data, self.n.o_data = self.new_specs(None)
+ self.i, self.o = self.p.i_data, self.n.o_data
+
+ # next 3 functions are Stage API Compliance
+ def setup(self, m, i):
+ pass
+
+ def ispec(self):
+ return FetchInput()
+
+ def ospec(self):
+ return FetchOutput()
+
+ def elaborate(self, platform):
+ """fetch FSM
+
+ this FSM performs fetch of raw instruction data, partial-decodes
+ it 32-bit at a time to detect SVP64 prefixes, and will optionally
+ read a 2nd 32-bit quantity if that occurs.
+ """
+ m = super().elaborate(platform)
+
+ dbg = self.dbg
+ core = self.core
+ pc = self.i.pc
+ msr = self.i.msr
+ svstate = self.svstate
+ nia = self.nia
+ fetch_pc_o_ready = self.p.o_ready
+ fetch_pc_i_valid = self.p.i_valid
+ fetch_insn_o_valid = self.n.o_valid
+ fetch_insn_i_ready = self.n.i_ready
+
+ comb = m.d.comb
+ sync = m.d.sync
+ pdecode2 = self.pdecode2
+ cur_state = self.cur_state
+ dec_opcode_o = pdecode2.dec.raw_opcode_in # raw opcode
+
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ # set priv / virt mode on I-Cache, sigh
+ if isinstance(self.imem, ICache):
+ comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+ comb += self.imem.i_in.virt_mode.eq(msr[MSR.DR])
+
+ with m.FSM(name='fetch_fsm'):
+
+ # allow fetch to not run at startup due to I-Cache reset not
+ # having time to settle. power-on-reset holds dbg.core_stopped_i
+ with m.State("PRE_IDLE"):
+ with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o):
+ m.next = "IDLE"
+
+ # waiting (zzz)
+ with m.State("IDLE"):
+ with m.If(~dbg.stopping_o & ~fetch_failed):
+ comb += fetch_pc_o_ready.eq(1)
+ with m.If(fetch_pc_i_valid & ~fetch_failed):
+ # instruction allowed to go: start by reading the PC
+ # capture the PC and also drop it into Insn Memory
+ # we have joined a pair of combinatorial memory
+ # lookups together. this is Generally Bad.
+ comb += self.imem.a_pc_i.eq(pc)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ sync += cur_state.pc.eq(pc)
+ sync += cur_state.svstate.eq(svstate) # and svstate
+ sync += cur_state.msr.eq(msr) # and msr
+
+ m.next = "INSN_READ" # move to "wait for bus" phase
+
+ # dummy pause to find out why simulation is not keeping up
+ with m.State("INSN_READ"):
+ if self.allow_overlap:
+ stopping = dbg.stopping_o
+ else:
+ stopping = Const(0)
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "IDLE"
+ with m.Else():
+ with m.If(self.imem.f_busy_o & ~fetch_failed): # zzz...
+ # busy but not fetch failed: stay in wait-read
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ with m.Else():
+ # not busy (or fetch failed!): instruction fetched
+ # when fetch failed, the instruction gets ignored
+ # by the decoder
+ insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+ # not SVP64 - 32-bit only
+ sync += nia.eq(cur_state.pc + 4)
+ sync += dec_opcode_o.eq(insn)
+ m.next = "INSN_READY"
+
+ with m.State("INSN_READY"):
+ # hand over the instruction, to be decoded
+ comb += fetch_insn_o_valid.eq(1)
+ with m.If(fetch_insn_i_ready):
+ m.next = "IDLE"
+
+ # whatever was done above, over-ride it if core reset is held
+ with m.If(self.core_rst):
+ sync += nia.eq(0)
+
+ return m
+
+
+class TestIssuerInternalInOrder(TestIssuerBase):
+ """TestIssuer - reads instructions from TestMemory and issues them
+
+ efficiency and speed is not the main goal here: functional correctness
+ and code clarity is. optimisations (which almost 100% interfere with
+ easy understanding) come later.
+ """
+
+ def issue_fsm(self, m, core, nia,
+ dbg, core_rst,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready):
+ """issue FSM
+
+ decode / issue FSM. this interacts with the "fetch" FSM
+ through fetch_insn_ready/valid (incoming) and fetch_pc_ready/valid
+ (outgoing). also interacts with the "execute" FSM
+ through exec_insn_ready/valid (outgoing) and exec_pc_ready/valid
+ (incoming).
+ SVP64 RM prefixes have already been set up by the
+ "fetch" phase, so execute is fairly straightforward.
+ """
+
+ comb = m.d.comb
+ sync = m.d.sync
+ pdecode2 = self.pdecode2
+ cur_state = self.cur_state
+
+ # temporaries
+ dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+
+ # note if an exception happened. in a pipelined or OoO design
+ # this needs to be accompanied by "shadowing" (or stalling)
+ exc_happened = self.core.o.exc_happened
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ # set to fault in decoder
+ # update (highest priority) instruction fault
+ rising_fetch_failed = rising_edge(m, fetch_failed)
+ with m.If(rising_fetch_failed):
+ sync += pdecode2.instr_fault.eq(1)
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ with m.FSM(name="issue_fsm"):
+
+ # sync with the "fetch" phase which is reading the instruction
+ # at this point, there is no instruction running, that
+ # could inadvertently update the PC.
+ with m.State("ISSUE_START"):
+ # reset instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ # wait on "core stop" release, before next fetch
+ # need to do this here, in case we are in a VL==0 loop
+ with m.If(~dbg.core_stop_o & ~core_rst):
+ comb += fetch_pc_i_valid.eq(1) # tell fetch to start
+ with m.If(fetch_pc_o_ready): # fetch acknowledged us
+ m.next = "INSN_WAIT"
+ with m.Else():
+ # tell core it's stopped, and acknowledge debug handshake
+ comb += dbg.core_stopped_i.eq(1)
+
+ # wait for an instruction to arrive from Fetch
+ with m.State("INSN_WAIT"):
+ if self.allow_overlap:
+ stopping = dbg.stopping_o
+ else:
+ stopping = Const(0)
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "ISSUE_START"
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ with m.Else():
+ comb += fetch_insn_i_ready.eq(1)
+ with m.If(fetch_insn_o_valid):
+ # loop into ISSUE_START if it's a SVP64 instruction
+ # and VL == 0. this because VL==0 is a for-loop
+ # from 0 to 0 i.e. always, always a NOP.
+ m.next = "DECODE_SV" # skip predication
+
+ # after src/dst step have been updated, we are ready
+ # to decode the instruction
+ with m.State("DECODE_SV"):
+ # decode the instruction
+ with m.If(~fetch_failed):
+ sync += pdecode2.instr_fault.eq(0)
+ sync += core.i.e.eq(pdecode2.e)
+ sync += core.i.state.eq(cur_state)
+ sync += core.i.raw_insn_i.eq(dec_opcode_i)
+ sync += core.i.bigendian_i.eq(self.core_bigendian_i)
+ # after decoding, reset any previous exception condition,
+ # allowing it to be set again during the next execution
+ sync += pdecode2.ldst_exc.eq(0)
+
+ m.next = "INSN_EXECUTE" # move to "execute"
+
+ # handshake with execution FSM, move to "wait" once acknowledged
+ with m.State("INSN_EXECUTE"):
+ comb += exec_insn_i_valid.eq(1) # trigger execute
+ with m.If(exec_insn_o_ready): # execute acknowledged us
+ m.next = "EXECUTE_WAIT"
+
+ with m.State("EXECUTE_WAIT"):
+ # wait on "core stop" release, at instruction end
+ # need to do this here, in case we are in a VL>1 loop
+ with m.If(~dbg.core_stop_o & ~core_rst):
+ comb += exec_pc_i_ready.eq(1)
+ # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+ # the exception info needs to be blatted into
+ # pdecode.ldst_exc, and the instruction "re-run".
+ # when ldst_exc.happened is set, the PowerDecoder2
+ # reacts very differently: it re-writes the instruction
+ # with a "trap" (calls PowerDecoder2.trap()) which
+ # will *overwrite* whatever was requested and jump the
+ # PC to the exception address, as well as alter MSR.
+ # nothing else needs to be done other than to note
+ # the change of PC and MSR (and, later, SVSTATE)
+ with m.If(exc_happened):
+ mmu = core.fus.get_exc("mmu0")
+ ldst = core.fus.get_exc("ldst0")
+ if mmu is not None:
+ with m.If(fetch_failed):
+ # instruction fetch: exception is from MMU
+ # reset instr_fault (highest priority)
+ sync += pdecode2.ldst_exc.eq(mmu)
+ sync += pdecode2.instr_fault.eq(0)
+ if flush_needed:
+ # request icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ with m.If(~fetch_failed):
+ # otherwise assume it was a LDST exception
+ sync += pdecode2.ldst_exc.eq(ldst)
+
+ with m.If(exec_pc_o_valid):
+
+ # return directly to Decode if Execute generated an
+ # exception.
+ with m.If(pdecode2.ldst_exc.happened):
+ m.next = "DECODE_SV"
+
+ # if MSR, PC or SVSTATE were changed by the previous
+ # instruction, go directly back to Fetch, without
+ # updating either MSR PC or SVSTATE
+ with m.Elif(self.msr_changed | self.pc_changed |
+ self.sv_changed):
+ m.next = "ISSUE_START"
+
+ with m.Else():
+ # before going back to fetch, update the PC state
+ # register with the NIA.
+ # ok here we are not reading the branch unit.
+ # TODO: this just blithely overwrites whatever
+ # pipeline updated the PC
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.i_data.eq(nia)
+ m.next = "ISSUE_START"
+
+ with m.Else():
+ comb += dbg.core_stopped_i.eq(1)
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+
+ def execute_fsm(self, m, core,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready):
+ """execute FSM
+
+ execute FSM. this interacts with the "issue" FSM
+ through exec_insn_ready/valid (incoming) and exec_pc_ready/valid
+ (outgoing). SVP64 RM prefixes have already been set up by the
+ "issue" phase, so execute is fairly straightforward.
+ """
+
+ comb = m.d.comb
+ sync = m.d.sync
+ pdecode2 = self.pdecode2
+
+ # temporaries
+ core_busy_o = core.n.o_data.busy_o # core is busy
+ core_ivalid_i = core.p.i_valid # instruction is valid
+
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ else:
+ fetch_failed = Const(0, 1)
+
+ with m.FSM(name="exec_fsm"):
+
+ # waiting for instruction bus (stays there until not busy)
+ with m.State("INSN_START"):
+ comb += exec_insn_o_ready.eq(1)
+ with m.If(exec_insn_i_valid):
+ comb += core_ivalid_i.eq(1) # instruction is valid/issued
+ sync += self.sv_changed.eq(0)
+ sync += self.pc_changed.eq(0)
+ sync += self.msr_changed.eq(0)
+ with m.If(core.p.o_ready): # only move if accepted
+ m.next = "INSN_ACTIVE" # move to "wait completion"
+
+ # instruction started: must wait till it finishes
+ with m.State("INSN_ACTIVE"):
+ # note changes to MSR, PC and SVSTATE
+ # XXX oops, really must monitor *all* State Regfile write
+ # ports looking for changes!
+ with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+ sync += self.sv_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+ sync += self.msr_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+ sync += self.pc_changed.eq(1)
+ with m.If(~core_busy_o): # instruction done!
+ comb += exec_pc_o_valid.eq(1)
+ with m.If(exec_pc_i_ready):
+ # when finished, indicate "done".
+ # however, if there was an exception, the instruction
+ # is *not* yet done. this is an implementation
+ # detail: we choose to implement exceptions by
+ # taking the exception information from the LDST
+ # unit, putting that *back* into the PowerDecoder2,
+ # and *re-running the entire instruction*.
+ # if we erroneously indicate "done" here, it is as if
+ # there were *TWO* instructions:
+ # 1) the failed LDST 2) a TRAP.
+ with m.If(~pdecode2.ldst_exc.happened &
+ ~fetch_failed):
+ comb += self.insn_done.eq(1)
+ m.next = "INSN_START" # back to fetch
+
+ def elaborate(self, platform):
+ m = super().elaborate(platform)
+ # convenience
+ comb, sync = m.d.comb, m.d.sync
+ cur_state = self.cur_state
+ pdecode2 = self.pdecode2
+ dbg = self.dbg
+ core = self.core
+
+ # set up peripherals and core
+ core_rst = self.core_rst
+
+ # indicate to outside world if any FU is still executing
+ comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing
+
+ # address of the next instruction, in the absence of a branch
+ # depends on the instruction size
+ nia = Signal(64)
+
+ # connect up debug signals
+ with m.If(core.o.core_terminate_o):
+ comb += dbg.terminate_i.eq(1)
+
+ # there are *THREE^WFOUR-if-SVP64-enabled* FSMs, fetch (32/64-bit)
+ # issue, decode/execute, now joined by "Predicate fetch/calculate".
+ # these are the handshake signals between each
+
+ # fetch FSM can run as soon as the PC is valid
+ fetch_pc_i_valid = Signal() # Execute tells Fetch "start next read"
+ fetch_pc_o_ready = Signal() # Fetch Tells SVSTATE "proceed"
+
+ # fetch FSM hands over the instruction to be decoded / issued
+ fetch_insn_o_valid = Signal()
+ fetch_insn_i_ready = Signal()
+
+ # issue FSM delivers the instruction to the be executed
+ exec_insn_i_valid = Signal()
+ exec_insn_o_ready = Signal()
+
+ # execute FSM, hands over the PC/SVSTATE back to the issue FSM
+ exec_pc_o_valid = Signal()
+ exec_pc_i_ready = Signal()
+
+ # the FSMs here are perhaps unusual in that they detect conditions
+ # then "hold" information, combinatorially, for the core
+ # (as opposed to using sync - which would be on a clock's delay)
+ # this includes the actual opcode, valid flags and so on.
+
+ # Fetch, then predicate fetch, then Issue, then Execute.
+ # Issue is where the VL for-loop # lives. the ready/valid
+ # signalling is used to communicate between the four.
+
+ # set up Fetch FSM
+ fetch = FetchFSM(self.allow_overlap,
+ self.imem, core_rst, pdecode2, cur_state,
+ dbg, core,
+ dbg.state.svstate, # combinatorially same
+ nia)
+ m.submodules.fetch = fetch
+ # connect up in/out data to existing Signals
+ comb += fetch.p.i_data.pc.eq(dbg.state.pc) # combinatorially same
+ comb += fetch.p.i_data.msr.eq(dbg.state.msr) # combinatorially same
+ # and the ready/valid signalling
+ comb += fetch_pc_o_ready.eq(fetch.p.o_ready)
+ comb += fetch.p.i_valid.eq(fetch_pc_i_valid)
+ comb += fetch_insn_o_valid.eq(fetch.n.o_valid)
+ comb += fetch.n.i_ready.eq(fetch_insn_i_ready)
+
+ self.issue_fsm(m, core, nia,
+ dbg, core_rst,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready)
+
+ self.execute_fsm(m, core,
+ exec_insn_i_valid, exec_insn_o_ready,
+ exec_pc_o_valid, exec_pc_i_ready)
+
+ return m
+
+
+# XXX TODO: update this
+
+if __name__ == '__main__':
+ units = {'alu': 1, 'cr': 1, 'branch': 1, 'trap': 1, 'logical': 1,
+ 'spr': 1,
+ 'div': 1,
+ 'mul': 1,
+ 'shiftrot': 1
+ }
+ pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+ imem_ifacetype='bare_wb',
+ addr_wid=64,
+ mask_wid=8,
+ reg_wid=64,
+ units=units)
+ dut = TestIssuer(pspec)
+ vl = main(dut, ports=dut.ports(), name="test_issuer")
+
+ if len(sys.argv) == 1:
+ vl = rtlil.convert(dut, ports=dut.external_ports(), name="test_issuer")
+ with open("test_issuer.il", "w") as f:
+ f.write(vl)
from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
from openpower.decoder.decode2execute1 import Data
from openpower.decoder.power_enums import (MicrOp, SVP64PredInt, SVP64PredCR,
- SVP64PredMode)
+ SVP64PredMode)
from openpower.state import CoreState
-from openpower.consts import (CR, SVP64CROffs)
-from soc.experiment.testmem import TestMemory # test only for instructions
+from openpower.consts import (CR, SVP64CROffs, MSR)
+from soc.experiment.testmem import TestMemory # test only for instructions
from soc.regfile.regfiles import StateRegs, FastRegs
from soc.simple.core import NonProductionCore
from soc.config.test.test_loadstore import TestMemPspec
from soc.clock.select import ClockSelect
from soc.clock.dummypll import DummyPLL
from openpower.sv.svstate import SVSTATERec
-
+from soc.experiment.icache import ICache
from nmutil.util import rising_edge
+
def get_insn(f_instr_o, pc):
if f_instr_o.width == 32:
return f_instr_o
return f_instr_o.word_select(pc[2], 32)
# gets state input or reads from state regfile
-def state_get(m, core_rst, state_i, name, regfile, regnum):
+
+
+def state_get(m, res, core_rst, state_i, name, regfile, regnum):
comb = m.d.comb
sync = m.d.sync
- # read the PC
- res = Signal(64, reset_less=True, name=name)
+ # read the {insert state variable here}
res_ok_delay = Signal(name="%s_ok_delay" % name)
with m.If(~core_rst):
sync += res_ok_delay.eq(~state_i.ok)
# incoming override (start from pc_i)
comb += res.eq(state_i.data)
with m.Else():
- # otherwise read StateRegs regfile for PC...
- comb += regfile.ren.eq(1<<regnum)
+ # otherwise read StateRegs regfile for {insert state here}...
+ comb += regfile.ren.eq(1 << regnum)
# ... but on a 1-clock delay
with m.If(res_ok_delay):
comb += res.eq(regfile.o_data)
- return res
def get_predint(m, mask, name):
return idx, invert
-# Fetch Finite State Machine.
-# WARNING: there are currently DriverConflicts but it's actually working.
-# TODO, here: everything that is global in nature, information from the
-# main TestIssuerInternal, needs to move to either ispec() or ospec().
-# not only that: TestIssuerInternal.imem can entirely move into here
-# because imem is only ever accessed inside the FetchFSM.
-class FetchFSM(ControlBase):
- def __init__(self, allow_overlap, svp64_en, imem, core_rst,
- pdecode2, cur_state,
- dbg, core, svstate, nia, is_svp64_mode):
- self.allow_overlap = allow_overlap
- self.svp64_en = svp64_en
- self.imem = imem
- self.core_rst = core_rst
- self.pdecode2 = pdecode2
- self.cur_state = cur_state
- self.dbg = dbg
- self.core = core
- self.svstate = svstate
- self.nia = nia
- self.is_svp64_mode = is_svp64_mode
-
- # set up pipeline ControlBase and allocate i/o specs
- # (unusual: normally done by the Pipeline API)
- super().__init__(stage=self)
- self.p.i_data, self.n.o_data = self.new_specs(None)
- self.i, self.o = self.p.i_data, self.n.o_data
-
- # next 3 functions are Stage API Compliance
- def setup(self, m, i):
- pass
-
- def ispec(self):
- return FetchInput()
-
- def ospec(self):
- return FetchOutput()
-
- def elaborate(self, platform):
- """fetch FSM
-
- this FSM performs fetch of raw instruction data, partial-decodes
- it 32-bit at a time to detect SVP64 prefixes, and will optionally
- read a 2nd 32-bit quantity if that occurs.
- """
- m = super().elaborate(platform)
-
- dbg = self.dbg
- core = self.core,
- pc = self.i.pc
- svstate = self.svstate
- nia = self.nia
- is_svp64_mode = self.is_svp64_mode
- fetch_pc_o_ready = self.p.o_ready
- fetch_pc_i_valid = self.p.i_valid
- fetch_insn_o_valid = self.n.o_valid
- fetch_insn_i_ready = self.n.i_ready
-
- comb = m.d.comb
- sync = m.d.sync
- pdecode2 = self.pdecode2
- cur_state = self.cur_state
- dec_opcode_o = pdecode2.dec.raw_opcode_in # raw opcode
-
- msr_read = Signal(reset=1)
-
- # don't read msr every cycle
- staterf = self.core.regs.rf['state']
- state_r_msr = staterf.r_ports['msr'] # MSR rd
+class TestIssuerBase(Elaboratable):
+ """TestIssuerBase - common base class for Issuers
- comb += state_r_msr.ren.eq(0)
-
- with m.FSM(name='fetch_fsm'):
-
- # waiting (zzz)
- with m.State("IDLE"):
- with m.If(~dbg.stopping_o):
- comb += fetch_pc_o_ready.eq(1)
- with m.If(fetch_pc_i_valid):
- # instruction allowed to go: start by reading the PC
- # capture the PC and also drop it into Insn Memory
- # we have joined a pair of combinatorial memory
- # lookups together. this is Generally Bad.
- comb += self.imem.a_pc_i.eq(pc)
- comb += self.imem.a_i_valid.eq(1)
- comb += self.imem.f_i_valid.eq(1)
- sync += cur_state.pc.eq(pc)
- sync += cur_state.svstate.eq(svstate) # and svstate
-
- # initiate read of MSR. arrives one clock later
- comb += state_r_msr.ren.eq(1 << StateRegs.MSR)
- sync += msr_read.eq(0)
-
- m.next = "INSN_READ" # move to "wait for bus" phase
-
- # dummy pause to find out why simulation is not keeping up
- with m.State("INSN_READ"):
- if self.allow_overlap:
- stopping = dbg.stopping_o
- else:
- stopping = Const(0)
- with m.If(stopping):
- # stopping: jump back to idle
- m.next = "IDLE"
- with m.Else():
- # one cycle later, msr/sv read arrives. valid only once.
- with m.If(~msr_read):
- sync += msr_read.eq(1) # yeah don't read it again
- sync += cur_state.msr.eq(state_r_msr.o_data)
- with m.If(self.imem.f_busy_o): # zzz...
- # busy: stay in wait-read
- comb += self.imem.a_i_valid.eq(1)
- comb += self.imem.f_i_valid.eq(1)
- with m.Else():
- # not busy: instruction fetched
- insn = get_insn(self.imem.f_instr_o, cur_state.pc)
- if self.svp64_en:
- svp64 = self.svp64
- # decode the SVP64 prefix, if any
- comb += svp64.raw_opcode_in.eq(insn)
- comb += svp64.bigendian.eq(self.core_bigendian_i)
- # pass the decoded prefix (if any) to PowerDecoder2
- sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
- sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
- # remember whether this is a prefixed instruction,
- # so the FSM can readily loop when VL==0
- sync += is_svp64_mode.eq(svp64.is_svp64_mode)
- # calculate the address of the following instruction
- insn_size = Mux(svp64.is_svp64_mode, 8, 4)
- sync += nia.eq(cur_state.pc + insn_size)
- with m.If(~svp64.is_svp64_mode):
- # with no prefix, store the instruction
- # and hand it directly to the next FSM
- sync += dec_opcode_o.eq(insn)
- m.next = "INSN_READY"
- with m.Else():
- # fetch the rest of the instruction from memory
- comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
- comb += self.imem.a_i_valid.eq(1)
- comb += self.imem.f_i_valid.eq(1)
- m.next = "INSN_READ2"
- else:
- # not SVP64 - 32-bit only
- sync += nia.eq(cur_state.pc + 4)
- sync += dec_opcode_o.eq(insn)
- m.next = "INSN_READY"
-
- with m.State("INSN_READ2"):
- with m.If(self.imem.f_busy_o): # zzz...
- # busy: stay in wait-read
- comb += self.imem.a_i_valid.eq(1)
- comb += self.imem.f_i_valid.eq(1)
- with m.Else():
- # not busy: instruction fetched
- insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
- sync += dec_opcode_o.eq(insn)
- m.next = "INSN_READY"
- # TODO: probably can start looking at pdecode2.rm_dec
- # here or maybe even in INSN_READ state, if svp64_mode
- # detected, in order to trigger - and wait for - the
- # predicate reading.
- if self.svp64_en:
- pmode = pdecode2.rm_dec.predmode
- """
- if pmode != SVP64PredMode.ALWAYS.value:
- fire predicate loading FSM and wait before
- moving to INSN_READY
- else:
- sync += self.srcmask.eq(-1) # set to all 1s
- sync += self.dstmask.eq(-1) # set to all 1s
- m.next = "INSN_READY"
- """
-
- with m.State("INSN_READY"):
- # hand over the instruction, to be decoded
- comb += fetch_insn_o_valid.eq(1)
- with m.If(fetch_insn_i_ready):
- m.next = "IDLE"
+ takes care of power-on reset, peripherals, debug, DEC/TB,
+ and gets PC/MSR/SVSTATE from the State Regfile etc.
+ """
- # whatever was done above, over-ride it if core reset is held
- with m.If(self.core_rst):
- sync += nia.eq(0)
+ def __init__(self, pspec):
- return m
+ # test if microwatt compatibility is to be enabled
+ self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+ (pspec.microwatt_compat == True))
+ self.alt_reset = Signal(reset_less=True) # not connected yet (microwatt)
+ # test if fabric compatibility is to be enabled
+ self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+ (pspec.fabric_compat == True))
+ if self.microwatt_compat or self.fabric_compat:
-class TestIssuerInternal(Elaboratable):
- """TestIssuer - reads instructions from TestMemory and issues them
+ if hasattr(pspec, "microwatt_old"):
+ self.microwatt_old = pspec.microwatt_old
+ else:
+ self.microwatt_old = True # PLEASE DO NOT ALTER THIS
- efficiency and speed is not the main goal here: functional correctness
- and code clarity is. optimisations (which almost 100% interfere with
- easy understanding) come later.
- """
- def __init__(self, pspec):
+ if hasattr(pspec, "microwatt_debug"):
+ self.microwatt_debug = pspec.microwatt_debug
+ else:
+ self.microwatt_debug = True # set to False when using an FPGA
# test is SVP64 is to be enabled
self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
# and if regfiles are reduced
self.regreduce_en = (hasattr(pspec, "regreduce") and
- (pspec.regreduce == True))
+ (pspec.regreduce == True))
# and if overlap requested
self.allow_overlap = (hasattr(pspec, "allow_overlap") and
- (pspec.allow_overlap == True))
+ (pspec.allow_overlap == True))
+
+ # and get the core domain
+ self.core_domain = "coresync"
+ if (hasattr(pspec, "core_domain") and
+ isinstance(pspec.core_domain, str)):
+ self.core_domain = pspec.core_domain
# JTAG interface. add this right at the start because if it's
# added it *modifies* the pspec, by adding enable/disable signals
# for parts of the rest of the core
self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag'
- self.dbg_domain = "sync" # sigh "dbgsunc" too problematic
- #self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
+ #self.dbg_domain = "sync" # sigh "dbgsunc" too problematic
+ self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
if self.jtag_en:
- # XXX MUST keep this up-to-date with litex, and
+ # XXX MUST keep this up-to-date with fabric, and
# soc-cocotb-sim, and err.. all needs sorting out, argh
subset = ['uart',
'mtwi',
'eint', 'gpio', 'mspi0',
# 'mspi1', - disabled for now
# 'pwm', 'sd0', - disabled for now
- 'sdr']
+ 'sdr']
self.jtag = JTAG(get_pinspecs(subset=subset),
domain=self.dbg_domain)
# add signals to pspec to enable/disable icache and dcache
self.sram4k = []
for i in range(4):
self.sram4k.append(SPBlock512W64B8W(name="sram4k_%d" % i,
- #features={'err'}
+ # features={'err'}
))
# add interrupt controller?
self.xics_icp = XICS_ICP()
self.xics_ics = XICS_ICS()
self.int_level_i = self.xics_ics.int_level_i
+ else:
+ self.ext_irq = Signal()
# add GPIO peripheral?
self.gpio = hasattr(pspec, "gpio") and pspec.gpio == True
# main instruction core. suitable for prototyping / demo only
self.core = core = NonProductionCore(pspec)
- self.core_rst = ResetSignal("coresync")
+ self.core_rst = ResetSignal(self.core_domain)
# instruction decoder. goes into Trap Record
#pdecode = create_pdecode()
- self.cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
+ self.cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
self.pdecode2 = PowerDecode2(None, state=self.cur_state,
opkls=IssuerDecode2ToOperand,
svp64_en=self.svp64_en,
pdecode = self.pdecode2.dec
if self.svp64_en:
- self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix
+ self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix
+
+ self.update_svstate = Signal() # set this if updating svstate
+ self.new_svstate = new_svstate = SVSTATERec("new_svstate")
# Test Instruction memory
+ if hasattr(core, "icache"):
+ # XXX BLECH! use pspec to transfer the I-Cache to ConfigFetchUnit
+ # truly dreadful. needs a huge reorg.
+ pspec.icache = core.icache
self.imem = ConfigFetchUnit(pspec).fu
# DMI interface
self.dbg = CoreDebug()
+ self.dbg_rst_i = Signal(reset_less=True)
# instruction go/monitor
self.pc_o = Signal(64, reset_less=True)
- self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me"
- self.svstate_i = Data(64, "svstate_i") # ditto
- self.core_bigendian_i = Signal() # TODO: set based on MSR.LE
+ self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me"
+ self.msr_i = Data(64, "msr_i") # set "ok" to indicate "please change me"
+ self.svstate_i = Data(64, "svstate_i") # ditto
+ self.core_bigendian_i = Signal() # TODO: set based on MSR.LE
self.busy_o = Signal(reset_less=True)
self.memerr_o = Signal(reset_less=True)
# STATE regfile read /write ports for PC, MSR, SVSTATE
staterf = self.core.regs.rf['state']
- self.state_r_pc = staterf.r_ports['cia'] # PC rd
- self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr
- self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd
- self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr
+ self.state_r_msr = staterf.r_ports['msr'] # MSR rd
+ self.state_r_pc = staterf.r_ports['cia'] # PC rd
+ self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd
+
+ self.state_w_msr = staterf.w_ports['d_wr2'] # MSR wr
+ self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr
+ self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr
# DMI interface access
intrf = self.core.regs.rf['int']
+ fastrf = self.core.regs.rf['fast']
crrf = self.core.regs.rf['cr']
xerrf = self.core.regs.rf['xer']
- self.int_r = intrf.r_ports['dmi'] # INT read
- self.cr_r = crrf.r_ports['full_cr_dbg'] # CR read
- self.xer_r = xerrf.r_ports['full_xer'] # XER read
+ self.int_r = intrf.r_ports['dmi'] # INT DMI read
+ self.cr_r = crrf.r_ports['full_cr_dbg'] # CR DMI read
+ self.xer_r = xerrf.r_ports['full_xer'] # XER DMI read
+ self.fast_r = fastrf.r_ports['dmi'] # FAST DMI read
if self.svp64_en:
# for predication
- self.int_pred = intrf.r_ports['pred'] # INT predicate read
- self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
+ self.int_pred = intrf.r_ports['pred'] # INT predicate read
+ self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
# hack method of keeping an eye on whether branch/trap set the PC
self.state_nia = self.core.regs.rf['state'].w_ports['nia']
self.state_nia.wen.name = 'state_nia_wen'
+ # and whether SPR pipeline sets DEC or TB (fu/spr/main_stage.py)
+ self.state_spr = self.core.regs.rf['state'].w_ports['state1']
# pulse to synchronize the simulator at instruction end
self.insn_done = Signal()
# indicate any instruction still outstanding, in execution
self.any_busy = Signal()
- if self.svp64_en:
- # store copies of predicate masks
- self.srcmask = Signal(64)
- self.dstmask = Signal(64)
+ if self.svp64_en:
+ # store copies of predicate masks
+ self.srcmask = Signal(64)
+ self.dstmask = Signal(64)
+
+ # sigh, the wishbone addresses are not wishbone-compliant
+ # in old versions of microwatt, tplaten_3d_game is a new one
+ if self.microwatt_compat or self.fabric_compat:
+ self.ibus_adr = Signal(32, name='wishbone_insn_out.adr')
+ self.dbus_adr = Signal(32, name='wishbone_data_out.adr')
+
+ # add an output of the PC and instruction, and whether it was requested
+ # this is for verilator debug purposes
+ if self.microwatt_compat or self.fabric_compat:
+ self.nia = Signal(64)
+ self.msr_o = Signal(64)
+ self.nia_req = Signal(1)
+ self.insn = Signal(32)
+ self.ldst_req = Signal(1)
+ self.ldst_addr = Signal(1)
+
+ # for pausing dec/tb during an SPR pipeline event, this
+ # ensures that an SPR write (mtspr) to TB or DEC does not
+ # get overwritten by the DEC/TB FSM
+ self.pause_dec_tb = Signal()
+
+ def setup_peripherals(self, m):
+ comb, sync = m.d.comb, m.d.sync
+
+ # okaaaay so the debug module must be in coresync clock domain
+ # but NOT its reset signal. to cope with this, set every single
+ # submodule explicitly in coresync domain, debug and JTAG
+ # in their own one but using *external* reset.
+ csd = DomainRenamer(self.core_domain)
+ dbd = DomainRenamer(self.dbg_domain)
+
+ if self.microwatt_compat or self.fabric_compat:
+ m.submodules.core = core = self.core
+ else:
+ m.submodules.core = core = csd(self.core)
+
+ # this _so_ needs sorting out. ICache is added down inside
+ # LoadStore1 and is already a submodule of LoadStore1
+ if not isinstance(self.imem, ICache):
+ m.submodules.imem = imem = csd(self.imem)
+
+ # set up JTAG Debug Module (in correct domain)
+ m.submodules.dbg = dbg = dbd(self.dbg)
+ if self.jtag_en:
+ m.submodules.jtag = jtag = dbd(self.jtag)
+ # TODO: UART2GDB mux, here, from external pin
+ # see https://bugs.libre-soc.org/show_bug.cgi?id=499
+ sync += dbg.dmi.connect_to(jtag.dmi)
+
+ # fixup the clocks in microwatt-compat mode (but leave resets alone
+ # so that microwatt soc.vhdl can pull a reset on the core or DMI
+ # can do it, just like in TestIssuer)
+ if self.microwatt_compat or self.fabric_compat:
+ intclk = ClockSignal(self.core_domain)
+ dbgclk = ClockSignal(self.dbg_domain)
+ if self.core_domain != 'sync':
+ comb += intclk.eq(ClockSignal())
+ if self.dbg_domain != 'sync':
+ comb += dbgclk.eq(ClockSignal())
+
+ # if using old version of microwatt
+ # drop the first 3 bits of the incoming wishbone addresses
+ if self.microwatt_compat or self.fabric_compat:
+ ibus = self.imem.ibus
+ dbus = self.core.l0.cmpi.wb_bus()
+ if self.microwatt_old:
+ comb += self.ibus_adr.eq(Cat(Const(0, 3), ibus.adr))
+ comb += self.dbus_adr.eq(Cat(Const(0, 3), dbus.adr))
+ else:
+ comb += self.ibus_adr.eq(ibus.adr)
+ comb += self.dbus_adr.eq(dbus.adr)
+ if self.microwatt_debug:
+ # microwatt verilator debug purposes
+ pi = self.core.l0.cmpi.pi.pi
+ comb += self.ldst_req.eq(pi.addr_ok_o)
+ comb += self.ldst_addr.eq(pi.addr)
+
+ cur_state = self.cur_state
+
+ # 4x 4k SRAM blocks. these simply "exist", they get routed in fabric
+ if self.sram4x4k:
+ for i, sram in enumerate(self.sram4k):
+ m.submodules["sram4k_%d" % i] = csd(sram)
+ comb += sram.enable.eq(self.wb_sram_en)
+
+ # XICS interrupt handler
+ if self.xics:
+ m.submodules.xics_icp = icp = csd(self.xics_icp)
+ m.submodules.xics_ics = ics = csd(self.xics_ics)
+ comb += icp.ics_i.eq(ics.icp_o) # connect ICS to ICP
+ sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core
+ else:
+ sync += cur_state.eint.eq(self.ext_irq) # connect externally
+
+ # GPIO test peripheral
+ if self.gpio:
+ m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
+
+ # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
+ # XXX causes fabric ECP5 test to get wrong idea about input and output
+ # (but works with verilator sim *sigh*)
+ # if self.gpio and self.xics:
+ # comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
+
+ # instruction decoder
+ pdecode = create_pdecode()
+ m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
+ if self.svp64_en:
+ m.submodules.svp64 = svp64 = csd(self.svp64)
+
+ # clock delay power-on reset
+ cd_por = ClockDomain(reset_less=True)
+ cd_sync = ClockDomain()
+ m.domains += cd_por, cd_sync
+ core_sync = ClockDomain(self.core_domain)
+ if self.core_domain != "sync":
+ m.domains += core_sync
+ if self.dbg_domain != "sync":
+ dbg_sync = ClockDomain(self.dbg_domain)
+ m.domains += dbg_sync
+
+ # create a delay, but remember it is in the power-on-reset clock domain!
+ ti_rst = Signal(reset_less=True)
+ delay = Signal(range(4), reset=3)
+ stop_delay = Signal(range(16), reset=5)
+ with m.If(delay != 0):
+ m.d.por += delay.eq(delay - 1) # decrement... in POR domain!
+ with m.If(stop_delay != 0):
+ m.d.por += stop_delay.eq(stop_delay - 1) # likewise
+ comb += cd_por.clk.eq(ClockSignal())
+
+ # power-on reset delay
+ core_rst = ResetSignal(self.core_domain)
+ if self.core_domain != "sync":
+ comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
+ comb += core_rst.eq(ti_rst)
+ else:
+ with m.If(delay != 0 | dbg.core_rst_o):
+ comb += core_rst.eq(1)
+ with m.If(stop_delay != 0):
+ # run DMI core-stop as well but on an extra couple of cycles
+ comb += dbg.core_stopped_i.eq(1)
+
+ # connect external reset signal to DMI Reset
+ if self.dbg_domain != "sync":
+ dbg_rst = ResetSignal(self.dbg_domain)
+ comb += dbg_rst.eq(self.dbg_rst_i)
+
+ # busy/halted signals from core
+ core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o # core is busy
+ comb += self.busy_o.eq(core_busy_o)
+ comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
+
+ # temporary hack: says "go" immediately for both address gen and ST
+ # XXX: st.go_i is set to 1 cycle delay to reduce combinatorial chains
+ l0 = core.l0
+ ldst = core.fus.fus['ldst0']
+ st_go_edge = rising_edge(m, ldst.st.rel_o)
+ # link addr-go direct to rel
+ m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)
+ m.d.sync += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel
+
+ def do_dmi(self, m, dbg):
+ """deals with DMI debug requests
+
+ currently only provides read requests for the INT regfile, CR and XER
+ it will later also deal with *writing* to these regfiles.
+ """
+ comb = m.d.comb
+ sync = m.d.sync
+ dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
+ d_fast = dbg.d_fast
+ intrf = self.core.regs.rf['int']
+ fastrf = self.core.regs.rf['fast']
+
+ with m.If(d_reg.req): # request for regfile access being made
+ # TODO: error-check this
+ # XXX should this be combinatorial? sync better?
+ if intrf.unary:
+ comb += self.int_r.ren.eq(1 << d_reg.addr)
+ else:
+ comb += self.int_r.addr.eq(d_reg.addr)
+ comb += self.int_r.ren.eq(1)
+ d_reg_delay = Signal()
+ sync += d_reg_delay.eq(d_reg.req)
+ with m.If(d_reg_delay):
+ # data arrives one clock later
+ comb += d_reg.data.eq(self.int_r.o_data)
+ comb += d_reg.ack.eq(1)
+
+ # fast regfile
+ with m.If(d_fast.req): # request for regfile access being made
+ if fastrf.unary:
+ comb += self.fast_r.ren.eq(1 << d_fast.addr)
+ else:
+ comb += self.fast_r.addr.eq(d_fast.addr)
+ comb += self.fast_r.ren.eq(1)
+ d_fast_delay = Signal()
+ sync += d_fast_delay.eq(d_fast.req)
+ with m.If(d_fast_delay):
+ # data arrives one clock later
+ comb += d_fast.data.eq(self.fast_r.o_data)
+ comb += d_fast.ack.eq(1)
+
+ # sigh same thing for CR debug
+ with m.If(d_cr.req): # request for regfile access being made
+ comb += self.cr_r.ren.eq(0b11111111) # enable all
+ d_cr_delay = Signal()
+ sync += d_cr_delay.eq(d_cr.req)
+ with m.If(d_cr_delay):
+ # data arrives one clock later
+ comb += d_cr.data.eq(self.cr_r.o_data)
+ comb += d_cr.ack.eq(1)
+
+ # aaand XER...
+ with m.If(d_xer.req): # request for regfile access being made
+ comb += self.xer_r.ren.eq(0b111111) # enable all
+ d_xer_delay = Signal()
+ sync += d_xer_delay.eq(d_xer.req)
+ with m.If(d_xer_delay):
+ # data arrives one clock later
+ comb += d_xer.data.eq(self.xer_r.o_data)
+ comb += d_xer.ack.eq(1)
+
+ def tb_dec_fsm(self, m, spr_dec):
+ """tb_dec_fsm
+
+ this is a FSM for updating either dec or tb. it runs alternately
+ DEC, TB, DEC, TB. note that SPR pipeline could have written a new
+ value to DEC, however the regfile has "passthrough" on it so this
+ *should* be ok.
+
+ see v3.0B p1097-1099 for Timer Resource and p1065 and p1076
+ """
+
+ comb, sync = m.d.comb, m.d.sync
+ state_rf = self.core.regs.rf['state']
+ state_r_dectb = state_rf.r_ports['issue'] # DEC/TB
+ state_w_dectb = state_rf.w_ports['issue'] # DEC/TB
+
+
+ with m.FSM() as fsm:
+
+ # initiates read of current DEC
+ with m.State("DEC_READ"):
+ comb += state_r_dectb.ren.eq(1<<StateRegs.DEC)
+ with m.If(~self.pause_dec_tb):
+ m.next = "DEC_WRITE"
+
+ # waits for DEC read to arrive (1 cycle), updates with new value
+ # respects if dec/tb writing has been paused
+ with m.State("DEC_WRITE"):
+ with m.If(self.pause_dec_tb):
+ # if paused, return to reading
+ m.next = "DEC_READ"
+ with m.Else():
+ new_dec = Signal(64)
+ # TODO: MSR.LPCR 32-bit decrement mode
+ comb += new_dec.eq(state_r_dectb.o_data - 1)
+ comb += state_w_dectb.wen.eq(1<<StateRegs.DEC)
+ comb += state_w_dectb.i_data.eq(new_dec)
+ # copy to cur_state for decoder, for an interrupt
+ sync += spr_dec.eq(new_dec)
+ m.next = "TB_READ"
+
+ # initiates read of current TB
+ with m.State("TB_READ"):
+ comb += state_r_dectb.ren.eq(1<<StateRegs.TB)
+ with m.If(~self.pause_dec_tb):
+ m.next = "TB_WRITE"
+
+ # waits for read TB to arrive, initiates write of current TB
+ # respects if dec/tb writing has been paused
+ with m.State("TB_WRITE"):
+ with m.If(self.pause_dec_tb):
+ # if paused, return to reading
+ m.next = "TB_READ"
+ with m.Else():
+ new_tb = Signal(64)
+ comb += new_tb.eq(state_r_dectb.o_data + 1)
+ comb += state_w_dectb.wen.eq(1<<StateRegs.TB)
+ comb += state_w_dectb.i_data.eq(new_tb)
+ m.next = "DEC_READ"
+
+ return m
+
+ def elaborate(self, platform):
+ m = Module()
+ # convenience
+ comb, sync = m.d.comb, m.d.sync
+ cur_state = self.cur_state
+ pdecode2 = self.pdecode2
+ dbg = self.dbg
+
+ # set up peripherals and core
+ core_rst = self.core_rst
+ self.setup_peripherals(m)
+
+ # reset current state if core reset requested
+ with m.If(core_rst):
+ m.d.sync += self.cur_state.eq(0)
+ # and, sigh, set configured values, which are also done in regfile
+ # XXX ??? what the hell is the shift for??
+ m.d.sync += self.cur_state.pc.eq(self.core.pc_at_reset)
+ m.d.sync += self.cur_state.msr.eq(self.core.msr_at_reset)
+
+ # check halted condition: requested PC to execute matches DMI stop addr
+ # and immediately stop. address of 0xffff_ffff_ffff_ffff can never
+ # match
+ halted = Signal()
+ comb += halted.eq(dbg.stop_addr_o == dbg.state.pc)
+ with m.If(halted):
+ comb += dbg.core_stopped_i.eq(1)
+ comb += dbg.terminate_i.eq(1)
+
+ # PC and instruction from I-Memory
+ comb += self.pc_o.eq(cur_state.pc)
+ self.pc_changed = Signal() # note write to PC
+ self.msr_changed = Signal() # note write to MSR
+ self.sv_changed = Signal() # note write to SVSTATE
+
+ # read state either from incoming override or from regfile
+ state = CoreState("get") # current state (MSR/PC/SVSTATE)
+ state_get(m, state.msr, core_rst, self.msr_i,
+ "msr", # read MSR
+ self.state_r_msr, StateRegs.MSR)
+ state_get(m, state.pc, core_rst, self.pc_i,
+ "pc", # read PC
+ self.state_r_pc, StateRegs.PC)
+ state_get(m, state.svstate, core_rst, self.svstate_i,
+ "svstate", # read SVSTATE
+ self.state_r_sv, StateRegs.SVSTATE)
+
+ # don't write pc every cycle
+ comb += self.state_w_pc.wen.eq(0)
+ comb += self.state_w_pc.i_data.eq(0)
+
+ # connect up debug state. note "combinatorially same" below,
+ # this is a bit naff, passing state over in the dbg class, but
+ # because it is combinatorial it achieves the desired goal
+ comb += dbg.state.eq(state)
+
+ # this bit doesn't have to be in the FSM: connect up to read
+ # regfiles on demand from DMI
+ self.do_dmi(m, dbg)
+
+ # DEC and TB inc/dec FSM. copy of DEC is put into CoreState,
+ # (which uses that in PowerDecoder2 to raise 0x900 exception)
+ self.tb_dec_fsm(m, cur_state.dec)
+
+ # while stopped, allow updating the MSR, PC and SVSTATE.
+ # these are mainly for debugging purposes (including DMI/JTAG)
+ with m.If(dbg.core_stopped_i):
+ with m.If(self.pc_i.ok):
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.i_data.eq(self.pc_i.data)
+ sync += self.pc_changed.eq(1)
+ with m.If(self.msr_i.ok):
+ comb += self.state_w_msr.wen.eq(1 << StateRegs.MSR)
+ comb += self.state_w_msr.i_data.eq(self.msr_i.data)
+ sync += self.msr_changed.eq(1)
+ with m.If(self.svstate_i.ok | self.update_svstate):
+ with m.If(self.svstate_i.ok): # over-ride from external source
+ comb += self.new_svstate.eq(self.svstate_i.data)
+ comb += self.state_w_sv.wen.eq(1 << StateRegs.SVSTATE)
+ comb += self.state_w_sv.i_data.eq(self.new_svstate)
+ sync += self.sv_changed.eq(1)
+
+ # start renaming some of the ports to match microwatt
+ if self.microwatt_compat or self.fabric_compat:
+ self.core.o.core_terminate_o.name = "terminated_out"
+ # names of DMI interface
+ self.dbg.dmi.addr_i.name = 'dmi_addr'
+ self.dbg.dmi.din.name = 'dmi_din'
+ self.dbg.dmi.dout.name = 'dmi_dout'
+ self.dbg.dmi.req_i.name = 'dmi_req'
+ self.dbg.dmi.we_i.name = 'dmi_wr'
+ self.dbg.dmi.ack_o.name = 'dmi_ack'
+ # wishbone instruction bus
+ ibus = self.imem.ibus
+ if self.microwatt_compat:
+ ibus.adr.name = 'wishbone_insn_out.adr'
+ ibus.dat_w.name = 'wishbone_insn_out.dat'
+ ibus.sel.name = 'wishbone_insn_out.sel'
+ ibus.cyc.name = 'wishbone_insn_out.cyc'
+ ibus.stb.name = 'wishbone_insn_out.stb'
+ ibus.we.name = 'wishbone_insn_out.we'
+ ibus.dat_r.name = 'wishbone_insn_in.dat'
+ ibus.ack.name = 'wishbone_insn_in.ack'
+ ibus.stall.name = 'wishbone_insn_in.stall'
+ # wishbone data bus
+ dbus = self.core.l0.cmpi.wb_bus()
+ if self.microwatt_compat:
+ dbus.adr.name = 'wishbone_data_out.adr'
+ dbus.dat_w.name = 'wishbone_data_out.dat'
+ dbus.sel.name = 'wishbone_data_out.sel'
+ dbus.cyc.name = 'wishbone_data_out.cyc'
+ dbus.stb.name = 'wishbone_data_out.stb'
+ dbus.we.name = 'wishbone_data_out.we'
+ dbus.dat_r.name = 'wishbone_data_in.dat'
+ dbus.ack.name = 'wishbone_data_in.ack'
+ dbus.stall.name = 'wishbone_data_in.stall'
+
+ return m
+
+ def __iter__(self):
+ yield from self.pc_i.ports()
+ yield from self.msr_i.ports()
+ yield self.pc_o
+ yield self.memerr_o
+ yield from self.core.ports()
+ yield from self.imem.ports()
+ yield self.core_bigendian_i
+ yield self.busy_o
+
+ def ports(self):
+ return list(self)
+
+ def external_ports(self):
+ if self.microwatt_compat or self.fabric_compat:
+ if self.fabric_compat:
+ ports = [self.core.o.core_terminate_o,
+ self.alt_reset, # not connected yet
+ self.nia, self.insn, self.nia_req, self.msr_o,
+ self.ldst_req, self.ldst_addr,
+ ClockSignal(),
+ ResetSignal(),
+ ]
+ else:
+ ports = [self.core.o.core_terminate_o,
+ self.ext_irq,
+ self.alt_reset, # not connected yet
+ self.nia, self.insn, self.nia_req, self.msr_o,
+ self.ldst_req, self.ldst_addr,
+ ClockSignal(),
+ ResetSignal(),
+ ]
+ ports += list(self.dbg.dmi.ports())
+ # for dbus/ibus microwatt, exclude err btw and cti
+ for name, sig in self.imem.ibus.fields.items():
+ if name not in ['err', 'bte', 'cti', 'adr']:
+ ports.append(sig)
+ for name, sig in self.core.l0.cmpi.wb_bus().fields.items():
+ if name not in ['err', 'bte', 'cti', 'adr']:
+ ports.append(sig)
+ # microwatt non-compliant with wishbone
+ ports.append(self.ibus_adr)
+ ports.append(self.dbus_adr)
+
+ if self.microwatt_compat:
+ # Ignore the remaining ports in microwatt compat mode
+ return ports
+
+ ports = self.pc_i.ports()
+ ports = self.msr_i.ports()
+ ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
+ ]
+
+ if self.jtag_en:
+ ports += list(self.jtag.external_ports())
+ else:
+ # don't add DMI if JTAG is enabled
+ ports += list(self.dbg.dmi.ports())
+
+ ports += list(self.imem.ibus.fields.values())
+ ports += list(self.core.l0.cmpi.wb_bus().fields.values())
+
+ if self.sram4x4k:
+ for sram in self.sram4k:
+ ports += list(sram.bus.fields.values())
+
+ if self.xics:
+ ports += list(self.xics_icp.bus.fields.values())
+ ports += list(self.xics_ics.bus.fields.values())
+ ports.append(self.int_level_i)
+ else:
+ ports.append(self.ext_irq)
+
+ if self.gpio:
+ ports += list(self.simple_gpio.bus.fields.values())
+ ports.append(self.gpio_o)
+
+ return ports
+
+ def ports(self):
+ return list(self)
+
+
+class TestIssuerInternal(TestIssuerBase):
+ """TestIssuer - reads instructions from TestMemory and issues them
+
+ efficiency and speed is not the main goal here: functional correctness
+ and code clarity is. optimisations (which almost 100% interfere with
+ easy understanding) come later.
+ """
+
+ def fetch_fsm(self, m, dbg, core, core_rst, nia, is_svp64_mode,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready):
+ """fetch FSM
+
+ this FSM performs fetch of raw instruction data, partial-decodes
+ it 32-bit at a time to detect SVP64 prefixes, and will optionally
+ read a 2nd 32-bit quantity if that occurs.
+ """
+ comb = m.d.comb
+ sync = m.d.sync
+ pdecode2 = self.pdecode2
+ cur_state = self.cur_state
+ dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+ pc, msr, svstate = cur_state.pc, cur_state.msr, cur_state.svstate
+
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ # set priv / virt mode on I-Cache, sigh
+ if isinstance(self.imem, ICache):
+ comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+ comb += self.imem.i_in.virt_mode.eq(msr[MSR.IR]) # Instr. Redir (VM)
+
+ with m.FSM(name='fetch_fsm'):
+
+ # allow fetch to not run at startup due to I-Cache reset not
+ # having time to settle. power-on-reset holds dbg.core_stopped_i
+ with m.State("PRE_IDLE"):
+ with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o & ~core_rst):
+ m.next = "IDLE"
+
+ # waiting (zzz)
+ with m.State("IDLE"):
+ # fetch allowed if not failed and stopped but not stepping
+ # (see dmi.py for how core_stop_o is generated)
+ with m.If(~fetch_failed & ~dbg.core_stop_o):
+ comb += fetch_pc_o_ready.eq(1)
+ with m.If(fetch_pc_i_valid & ~pdecode2.instr_fault
+ & ~dbg.core_stop_o):
+ # instruction allowed to go: start by reading the PC
+ # capture the PC and also drop it into Insn Memory
+ # we have joined a pair of combinatorial memory
+ # lookups together. this is Generally Bad.
+ comb += self.imem.a_pc_i.eq(pc)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ m.next = "INSN_READ" # move to "wait for bus" phase
+
+ # dummy pause to find out why simulation is not keeping up
+ with m.State("INSN_READ"):
+ # when using "single-step" mode, checking dbg.stopping_o
+ # prevents progress. allow fetch to proceed once started
+ stopping = Const(0)
+ #if self.allow_overlap:
+ # stopping = dbg.stopping_o
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "IDLE"
+ with m.Else():
+ with m.If(self.imem.f_busy_o &
+ ~pdecode2.instr_fault): # zzz...
+ # busy but not fetch failed: stay in wait-read
+ comb += self.imem.a_pc_i.eq(pc)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ with m.Else():
+ # not busy (or fetch failed!): instruction fetched
+ # when fetch failed, the instruction gets ignored
+ # by the decoder
+ if hasattr(core, "icache"):
+ # blech, icache returns actual instruction
+ insn = self.imem.f_instr_o
+ else:
+ # but these return raw memory
+ insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+ if self.svp64_en:
+ svp64 = self.svp64
+ # decode the SVP64 prefix, if any
+ comb += svp64.raw_opcode_in.eq(insn)
+ comb += svp64.bigendian.eq(self.core_bigendian_i)
+ # pass the decoded prefix (if any) to PowerDecoder2
+ sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
+ sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
+ # remember whether this is a prefixed instruction,
+ # so the FSM can readily loop when VL==0
+ sync += is_svp64_mode.eq(svp64.is_svp64_mode)
+ # calculate the address of the following instruction
+ insn_size = Mux(svp64.is_svp64_mode, 8, 4)
+ sync += nia.eq(cur_state.pc + insn_size)
+ with m.If(~svp64.is_svp64_mode):
+ # with no prefix, store the instruction
+ # and hand it directly to the next FSM
+ sync += dec_opcode_i.eq(insn)
+ m.next = "INSN_READY"
+ with m.Else():
+ # fetch the rest of the instruction from memory
+ comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ m.next = "INSN_READ2"
+ else:
+ # not SVP64 - 32-bit only
+ sync += nia.eq(cur_state.pc + 4)
+ sync += dec_opcode_i.eq(insn)
+ if self.microwatt_compat or self.fabric_compat:
+ # for verilator debug purposes
+ comb += self.insn.eq(insn)
+ comb += self.nia.eq(cur_state.pc)
+ comb += self.msr_o.eq(cur_state.msr)
+ comb += self.nia_req.eq(1)
+ m.next = "INSN_READY"
+
+ with m.State("INSN_READ2"):
+ with m.If(self.imem.f_busy_o): # zzz...
+ # busy: stay in wait-read
+ comb += self.imem.a_i_valid.eq(1)
+ comb += self.imem.f_i_valid.eq(1)
+ with m.Else():
+ # not busy: instruction fetched
+ if hasattr(core, "icache"):
+ # blech, icache returns actual instruction
+ insn = self.imem.f_instr_o
+ else:
+ insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
+ sync += dec_opcode_i.eq(insn)
+ m.next = "INSN_READY"
+ # TODO: probably can start looking at pdecode2.rm_dec
+ # here or maybe even in INSN_READ state, if svp64_mode
+ # detected, in order to trigger - and wait for - the
+ # predicate reading.
+ if self.svp64_en:
+ pmode = pdecode2.rm_dec.predmode
+ """
+ if pmode != SVP64PredMode.ALWAYS.value:
+ fire predicate loading FSM and wait before
+ moving to INSN_READY
+ else:
+ sync += self.srcmask.eq(-1) # set to all 1s
+ sync += self.dstmask.eq(-1) # set to all 1s
+ m.next = "INSN_READY"
+ """
+
+ with m.State("INSN_READY"):
+ # hand over the instruction, to be decoded
+ comb += fetch_insn_o_valid.eq(1)
+ with m.If(fetch_insn_i_ready):
+ m.next = "IDLE"
+
def fetch_predicate_fsm(self, m,
pred_insn_i_valid, pred_insn_o_ready,
comb = m.d.comb
sync = m.d.sync
pdecode2 = self.pdecode2
- rm_dec = pdecode2.rm_dec # SVP64RMModeDecode
+ rm_dec = pdecode2.rm_dec # SVP64RMModeDecode
predmode = rm_dec.predmode
srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred
cr_pred, int_pred = self.cr_pred, self.int_pred # read regfiles
scr_bit = Signal()
dcr_bit = Signal()
comb += cr_field.eq(cr_pred.o_data)
- comb += scr_bit.eq(cr_field.bit_select(sidx, 1) ^ scrinvert)
- comb += dcr_bit.eq(cr_field.bit_select(didx, 1) ^ dcrinvert)
+ comb += scr_bit.eq(cr_field.bit_select(sidx, 1)
+ ^ scrinvert)
+ comb += dcr_bit.eq(cr_field.bit_select(didx, 1)
+ ^ dcrinvert)
# set the corresponding mask bit
bit_to_set = Signal.like(self.srcmask)
comb += bit_to_set.eq(1 << cur_cr_idx)
with m.If(pred_mask_i_ready):
m.next = "FETCH_PRED_IDLE"
- def issue_fsm(self, m, core, pc_changed, sv_changed, nia,
+ def issue_fsm(self, m, core, nia,
dbg, core_rst, is_svp64_mode,
fetch_pc_o_ready, fetch_pc_i_valid,
fetch_insn_o_valid, fetch_insn_i_ready,
sync = m.d.sync
pdecode2 = self.pdecode2
cur_state = self.cur_state
+ new_svstate = self.new_svstate
# temporaries
- dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+ dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
# for updating svstate (things like srcstep etc.)
- update_svstate = Signal() # set this (below) if updating
- new_svstate = SVSTATERec("new_svstate")
comb += new_svstate.eq(cur_state.svstate)
# precalculate srcstep+1 and dststep+1
# note if an exception happened. in a pipelined or OoO design
# this needs to be accompanied by "shadowing" (or stalling)
exc_happened = self.core.o.exc_happened
+ # also note instruction fetch failed
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ flush_needed = True
+ # set to fault in decoder
+ # update (highest priority) instruction fault
+ rising_fetch_failed = rising_edge(m, fetch_failed)
+ with m.If(rising_fetch_failed):
+ sync += pdecode2.instr_fault.eq(1)
+ else:
+ fetch_failed = Const(0, 1)
+ flush_needed = False
+
+ sync += fetch_pc_i_valid.eq(0)
with m.FSM(name="issue_fsm"):
+ with m.State("PRE_IDLE"):
+ with m.If(~dbg.core_stop_o & ~core_rst):
+ m.next = "ISSUE_START"
+
# sync with the "fetch" phase which is reading the instruction
# at this point, there is no instruction running, that
# could inadvertently update the PC.
with m.State("ISSUE_START"):
+ # reset instruction fault
+ sync += pdecode2.instr_fault.eq(0)
# wait on "core stop" release, before next fetch
# need to do this here, in case we are in a VL==0 loop
with m.If(~dbg.core_stop_o & ~core_rst):
- comb += fetch_pc_i_valid.eq(1) # tell fetch to start
+ sync += fetch_pc_i_valid.eq(1) # tell fetch to start
+ sync += cur_state.pc.eq(dbg.state.pc)
+ sync += cur_state.svstate.eq(dbg.state.svstate)
+ sync += cur_state.msr.eq(dbg.state.msr)
with m.If(fetch_pc_o_ready): # fetch acknowledged us
m.next = "INSN_WAIT"
with m.Else():
# tell core it's stopped, and acknowledge debug handshake
comb += dbg.core_stopped_i.eq(1)
- # while stopped, allow updating the PC and SVSTATE
- with m.If(self.pc_i.ok):
- comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.i_data.eq(self.pc_i.data)
- sync += pc_changed.eq(1)
+ # while stopped, allow updating SVSTATE
with m.If(self.svstate_i.ok):
comb += new_svstate.eq(self.svstate_i.data)
- comb += update_svstate.eq(1)
- sync += sv_changed.eq(1)
+ comb += self.update_svstate.eq(1)
+ sync += self.sv_changed.eq(1)
# wait for an instruction to arrive from Fetch
with m.State("INSN_WAIT"):
- if self.allow_overlap:
- stopping = dbg.stopping_o
- else:
- stopping = Const(0)
+ # when using "single-step" mode, checking dbg.stopping_o
+ # prevents progress. allow issue to proceed once started
+ stopping = Const(0)
+ #if self.allow_overlap:
+ # stopping = dbg.stopping_o
with m.If(stopping):
# stopping: jump back to idle
m.next = "ISSUE_START"
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
with m.Else():
comb += fetch_insn_i_ready.eq(1)
with m.If(fetch_insn_o_valid):
m.next = "MASK_WAIT"
with m.State("MASK_WAIT"):
- comb += pred_mask_i_ready.eq(1) # ready to receive the masks
- with m.If(pred_mask_o_valid): # predication masks are ready
+ comb += pred_mask_i_ready.eq(1) # ready to receive the masks
+ with m.If(pred_mask_o_valid): # predication masks are ready
m.next = "PRED_SKIP"
# skip zeros in predicate
comb += self.state_w_pc.i_data.eq(nia)
comb += new_svstate.srcstep.eq(0)
comb += new_svstate.dststep.eq(0)
- comb += update_svstate.eq(1)
+ comb += self.update_svstate.eq(1)
# synchronize with the simulator
comb += self.insn_done.eq(1)
# go back to Issue
# update new src/dst step
comb += new_svstate.srcstep.eq(skip_srcstep)
comb += new_svstate.dststep.eq(skip_dststep)
- comb += update_svstate.eq(1)
+ comb += self.update_svstate.eq(1)
# proceed to Decode
m.next = "DECODE_SV"
# to decode the instruction
with m.State("DECODE_SV"):
# decode the instruction
+ with m.If(~fetch_failed):
+ sync += pdecode2.instr_fault.eq(0)
sync += core.i.e.eq(pdecode2.e)
sync += core.i.state.eq(cur_state)
sync += core.i.raw_insn_i.eq(dec_opcode_i)
# handshake with execution FSM, move to "wait" once acknowledged
with m.State("INSN_EXECUTE"):
- comb += exec_insn_i_valid.eq(1) # trigger execute
- with m.If(exec_insn_o_ready): # execute acknowledged us
- m.next = "EXECUTE_WAIT"
+ # when using "single-step" mode, checking dbg.stopping_o
+ # prevents progress. allow execute to proceed once started
+ stopping = Const(0)
+ #if self.allow_overlap:
+ # stopping = dbg.stopping_o
+ with m.If(stopping):
+ # stopping: jump back to idle
+ m.next = "ISSUE_START"
+ if flush_needed:
+ # request the icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ # stop instruction fault
+ sync += pdecode2.instr_fault.eq(0)
+ with m.Else():
+ comb += exec_insn_i_valid.eq(1) # trigger execute
+ with m.If(exec_insn_o_ready): # execute acknowledged us
+ m.next = "EXECUTE_WAIT"
with m.State("EXECUTE_WAIT"):
- # wait on "core stop" release, at instruction end
- # need to do this here, in case we are in a VL>1 loop
- with m.If(~dbg.core_stop_o & ~core_rst):
- comb += exec_pc_i_ready.eq(1)
- # see https://bugs.libre-soc.org/show_bug.cgi?id=636
- # the exception info needs to be blatted into
- # pdecode.ldst_exc, and the instruction "re-run".
- # when ldst_exc.happened is set, the PowerDecoder2
- # reacts very differently: it re-writes the instruction
- # with a "trap" (calls PowerDecoder2.trap()) which
- # will *overwrite* whatever was requested and jump the
- # PC to the exception address, as well as alter MSR.
- # nothing else needs to be done other than to note
- # the change of PC and MSR (and, later, SVSTATE)
- with m.If(exc_happened):
- sync += pdecode2.ldst_exc.eq(core.fus.get_exc("ldst0"))
-
- with m.If(exec_pc_o_valid):
-
- # was this the last loop iteration?
- is_last = Signal()
- cur_vl = cur_state.svstate.vl
- comb += is_last.eq(next_srcstep == cur_vl)
-
- # return directly to Decode if Execute generated an
- # exception.
- with m.If(pdecode2.ldst_exc.happened):
- m.next = "DECODE_SV"
-
- # if either PC or SVSTATE were changed by the previous
- # instruction, go directly back to Fetch, without
- # updating either PC or SVSTATE
- with m.Elif(pc_changed | sv_changed):
- m.next = "ISSUE_START"
-
- # also return to Fetch, when no output was a vector
- # (regardless of SRCSTEP and VL), or when the last
- # instruction was really the last one of the VL loop
- with m.Elif((~pdecode2.loop_continue) | is_last):
- # before going back to fetch, update the PC state
- # register with the NIA.
- # ok here we are not reading the branch unit.
- # TODO: this just blithely overwrites whatever
- # pipeline updated the PC
- comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.i_data.eq(nia)
- # reset SRCSTEP before returning to Fetch
- if self.svp64_en:
- with m.If(pdecode2.loop_continue):
- comb += new_svstate.srcstep.eq(0)
- comb += new_svstate.dststep.eq(0)
- comb += update_svstate.eq(1)
- else:
+ comb += exec_pc_i_ready.eq(1)
+ # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+ # the exception info needs to be blatted into
+ # pdecode.ldst_exc, and the instruction "re-run".
+ # when ldst_exc.happened is set, the PowerDecoder2
+ # reacts very differently: it re-writes the instruction
+ # with a "trap" (calls PowerDecoder2.trap()) which
+ # will *overwrite* whatever was requested and jump the
+ # PC to the exception address, as well as alter MSR.
+ # nothing else needs to be done other than to note
+ # the change of PC and MSR (and, later, SVSTATE)
+ with m.If(exc_happened):
+ mmu = core.fus.get_exc("mmu0")
+ ldst = core.fus.get_exc("ldst0")
+ if mmu is not None:
+ with m.If(fetch_failed):
+ # instruction fetch: exception is from MMU
+ # reset instr_fault (highest priority)
+ sync += pdecode2.ldst_exc.eq(mmu)
+ sync += pdecode2.instr_fault.eq(0)
+ if flush_needed:
+ # request icache to stop asserting "failed"
+ comb += core.icache.flush_in.eq(1)
+ with m.If(~fetch_failed):
+ # otherwise assume it was a LDST exception
+ sync += pdecode2.ldst_exc.eq(ldst)
+
+ with m.If(exec_pc_o_valid):
+
+ # was this the last loop iteration?
+ is_last = Signal()
+ cur_vl = cur_state.svstate.vl
+ comb += is_last.eq(next_srcstep == cur_vl)
+
+ with m.If(pdecode2.instr_fault):
+ # reset instruction fault, try again
+ sync += pdecode2.instr_fault.eq(0)
+ m.next = "ISSUE_START"
+
+ # return directly to Decode if Execute generated an
+ # exception.
+ with m.Elif(pdecode2.ldst_exc.happened):
+ m.next = "DECODE_SV"
+
+ # if MSR, PC or SVSTATE were changed by the previous
+ # instruction, go directly back to Fetch, without
+ # updating either MSR PC or SVSTATE
+ with m.Elif(self.msr_changed | self.pc_changed |
+ self.sv_changed):
+ m.next = "ISSUE_START"
+
+ # also return to Fetch, when no output was a vector
+ # (regardless of SRCSTEP and VL), or when the last
+ # instruction was really the last one of the VL loop
+ with m.Elif((~pdecode2.loop_continue) | is_last):
+ # before going back to fetch, update the PC state
+ # register with the NIA.
+ # ok here we are not reading the branch unit.
+ # TODO: this just blithely overwrites whatever
+ # pipeline updated the PC
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.i_data.eq(nia)
+ # reset SRCSTEP before returning to Fetch
+ if self.svp64_en:
+ with m.If(pdecode2.loop_continue):
comb += new_svstate.srcstep.eq(0)
comb += new_svstate.dststep.eq(0)
- comb += update_svstate.eq(1)
- m.next = "ISSUE_START"
+ comb += self.update_svstate.eq(1)
+ else:
+ comb += new_svstate.srcstep.eq(0)
+ comb += new_svstate.dststep.eq(0)
+ comb += self.update_svstate.eq(1)
+ m.next = "ISSUE_START"
- # returning to Execute? then, first update SRCSTEP
- with m.Else():
- comb += new_svstate.srcstep.eq(next_srcstep)
- comb += new_svstate.dststep.eq(next_dststep)
- comb += update_svstate.eq(1)
- # return to mask skip loop
- m.next = "PRED_SKIP"
+ # returning to Execute? then, first update SRCSTEP
+ with m.Else():
+ comb += new_svstate.srcstep.eq(next_srcstep)
+ comb += new_svstate.dststep.eq(next_dststep)
+ comb += self.update_svstate.eq(1)
+ # return to mask skip loop
+ m.next = "PRED_SKIP"
- with m.Else():
- comb += dbg.core_stopped_i.eq(1)
- # while stopped, allow updating the PC and SVSTATE
- with m.If(self.pc_i.ok):
- comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
- comb += self.state_w_pc.i_data.eq(self.pc_i.data)
- sync += pc_changed.eq(1)
- with m.If(self.svstate_i.ok):
- comb += new_svstate.eq(self.svstate_i.data)
- comb += update_svstate.eq(1)
- sync += sv_changed.eq(1)
# check if svstate needs updating: if so, write it to State Regfile
- with m.If(update_svstate):
- comb += self.state_w_sv.wen.eq(1<<StateRegs.SVSTATE)
- comb += self.state_w_sv.i_data.eq(new_svstate)
- sync += cur_state.svstate.eq(new_svstate) # for next clock
+ with m.If(self.update_svstate):
+ sync += cur_state.svstate.eq(self.new_svstate) # for next clock
- def execute_fsm(self, m, core, pc_changed, sv_changed,
+ def execute_fsm(self, m, core,
exec_insn_i_valid, exec_insn_o_ready,
exec_pc_o_valid, exec_pc_i_ready):
"""execute FSM
comb = m.d.comb
sync = m.d.sync
+ dbg = self.dbg
pdecode2 = self.pdecode2
+ cur_state = self.cur_state
# temporaries
- core_busy_o = core.n.o_data.busy_o # core is busy
+ core_busy_o = core.n.o_data.busy_o # core is busy
core_ivalid_i = core.p.i_valid # instruction is valid
+ if hasattr(core, "icache"):
+ fetch_failed = core.icache.i_out.fetch_failed
+ else:
+ fetch_failed = Const(0, 1)
+
with m.FSM(name="exec_fsm"):
# waiting for instruction bus (stays there until not busy)
comb += exec_insn_o_ready.eq(1)
with m.If(exec_insn_i_valid):
comb += core_ivalid_i.eq(1) # instruction is valid/issued
- sync += sv_changed.eq(0)
- sync += pc_changed.eq(0)
- with m.If(core.p.o_ready): # only move if accepted
+ sync += self.sv_changed.eq(0)
+ sync += self.pc_changed.eq(0)
+ sync += self.msr_changed.eq(0)
+ with m.If(core.p.o_ready): # only move if accepted
m.next = "INSN_ACTIVE" # move to "wait completion"
# instruction started: must wait till it finishes
with m.State("INSN_ACTIVE"):
- # note changes to PC and SVSTATE
- with m.If(self.state_nia.wen & (1<<StateRegs.SVSTATE)):
- sync += sv_changed.eq(1)
- with m.If(self.state_nia.wen & (1<<StateRegs.PC)):
- sync += pc_changed.eq(1)
- with m.If(~core_busy_o): # instruction done!
+ # note changes to MSR, PC and SVSTATE
+ with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+ sync += self.sv_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+ sync += self.msr_changed.eq(1)
+ with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+ sync += self.pc_changed.eq(1)
+ # and note changes to DEC/TB, to be passed to DEC/TB FSM
+ with m.If(self.state_spr.wen & (1 << StateRegs.TB)):
+ comb += self.pause_dec_tb.eq(1)
+ # but also zero-out the cur_state DEC so that, on
+ # the next instruction, if it is "enable interrupt"
+ # the delay between the DEC/TB FSM reading and updating
+ # cur_state.dec doesn't trigger a spurious interrupt.
+ # the DEC/TB FSM will read the regfile and update to
+ # the correct value, so having cur_state.dec set to zero
+ # for a while is no big deal.
+ with m.If(self.state_spr.wen & (1 << StateRegs.DEC)):
+ comb += self.pause_dec_tb.eq(1)
+ sync += cur_state.dec.eq(0) # only needs top bit clear
+ with m.If(~core_busy_o): # instruction done!
comb += exec_pc_o_valid.eq(1)
with m.If(exec_pc_i_ready):
# when finished, indicate "done".
# if we erroneously indicate "done" here, it is as if
# there were *TWO* instructions:
# 1) the failed LDST 2) a TRAP.
- with m.If(~pdecode2.ldst_exc.happened):
+ with m.If(~pdecode2.ldst_exc.happened &
+ ~pdecode2.instr_fault):
comb += self.insn_done.eq(1)
m.next = "INSN_START" # back to fetch
-
- def setup_peripherals(self, m):
- comb, sync = m.d.comb, m.d.sync
-
- # okaaaay so the debug module must be in coresync clock domain
- # but NOT its reset signal. to cope with this, set every single
- # submodule explicitly in coresync domain, debug and JTAG
- # in their own one but using *external* reset.
- csd = DomainRenamer("coresync")
- dbd = DomainRenamer(self.dbg_domain)
-
- m.submodules.core = core = csd(self.core)
- m.submodules.imem = imem = csd(self.imem)
- m.submodules.dbg = dbg = dbd(self.dbg)
- if self.jtag_en:
- m.submodules.jtag = jtag = dbd(self.jtag)
- # TODO: UART2GDB mux, here, from external pin
- # see https://bugs.libre-soc.org/show_bug.cgi?id=499
- sync += dbg.dmi.connect_to(jtag.dmi)
-
- cur_state = self.cur_state
-
- # 4x 4k SRAM blocks. these simply "exist", they get routed in litex
- if self.sram4x4k:
- for i, sram in enumerate(self.sram4k):
- m.submodules["sram4k_%d" % i] = csd(sram)
- comb += sram.enable.eq(self.wb_sram_en)
-
- # XICS interrupt handler
- if self.xics:
- m.submodules.xics_icp = icp = csd(self.xics_icp)
- m.submodules.xics_ics = ics = csd(self.xics_ics)
- comb += icp.ics_i.eq(ics.icp_o) # connect ICS to ICP
- sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core
-
- # GPIO test peripheral
- if self.gpio:
- m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
-
- # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
- # XXX causes litex ECP5 test to get wrong idea about input and output
- # (but works with verilator sim *sigh*)
- #if self.gpio and self.xics:
- # comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
-
- # instruction decoder
- pdecode = create_pdecode()
- m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
- if self.svp64_en:
- m.submodules.svp64 = svp64 = csd(self.svp64)
-
- # convenience
- dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
- intrf = self.core.regs.rf['int']
-
- # clock delay power-on reset
- cd_por = ClockDomain(reset_less=True)
- cd_sync = ClockDomain()
- core_sync = ClockDomain("coresync")
- m.domains += cd_por, cd_sync, core_sync
- if self.dbg_domain != "sync":
- dbg_sync = ClockDomain(self.dbg_domain)
- m.domains += dbg_sync
-
- ti_rst = Signal(reset_less=True)
- delay = Signal(range(4), reset=3)
- with m.If(delay != 0):
- m.d.por += delay.eq(delay - 1)
- comb += cd_por.clk.eq(ClockSignal())
-
- # power-on reset delay
- core_rst = ResetSignal("coresync")
- comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
- comb += core_rst.eq(ti_rst)
-
- # debug clock is same as coresync, but reset is *main external*
- if self.dbg_domain != "sync":
- dbg_rst = ResetSignal(self.dbg_domain)
- comb += dbg_rst.eq(ResetSignal())
-
- # busy/halted signals from core
- core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o # core is busy
- comb += self.busy_o.eq(core_busy_o)
- comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
-
- # temporary hack: says "go" immediately for both address gen and ST
- l0 = core.l0
- ldst = core.fus.fus['ldst0']
- st_go_edge = rising_edge(m, ldst.st.rel_o)
- m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) # link addr-go direct to rel
- m.d.comb += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel
+ # terminate returns directly to INSN_START
+ with m.If(dbg.terminate_i):
+ # comb += self.insn_done.eq(1) - no because it's not
+ m.next = "INSN_START" # back to fetch
def elaborate(self, platform):
- m = Module()
+ m = super().elaborate(platform)
# convenience
comb, sync = m.d.comb, m.d.sync
cur_state = self.cur_state
# set up peripherals and core
core_rst = self.core_rst
- self.setup_peripherals(m)
-
- # reset current state if core reset requested
- with m.If(core_rst):
- m.d.sync += self.cur_state.eq(0)
-
- # PC and instruction from I-Memory
- comb += self.pc_o.eq(cur_state.pc)
- pc_changed = Signal() # note write to PC
- sv_changed = Signal() # note write to SVSTATE
# indicate to outside world if any FU is still executing
- comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing
-
- # read state either from incoming override or from regfile
- # TODO: really should be doing MSR in the same way
- pc = state_get(m, core_rst, self.pc_i,
- "pc", # read PC
- self.state_r_pc, StateRegs.PC)
- svstate = state_get(m, core_rst, self.svstate_i,
- "svstate", # read SVSTATE
- self.state_r_sv, StateRegs.SVSTATE)
-
- # don't write pc every cycle
- comb += self.state_w_pc.wen.eq(0)
- comb += self.state_w_pc.i_data.eq(0)
+ comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing
# address of the next instruction, in the absence of a branch
# depends on the instruction size
nia = Signal(64)
# connect up debug signals
- # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o)
- comb += dbg.terminate_i.eq(core.o.core_terminate_o)
- comb += dbg.state.pc.eq(pc)
- comb += dbg.state.svstate.eq(svstate)
- comb += dbg.state.msr.eq(cur_state.msr)
+ with m.If(core.o.core_terminate_o):
+ comb += dbg.terminate_i.eq(1)
# pass the prefix mode from Fetch to Issue, so the latter can loop
# on VL==0
# these are the handshake signals between each
# fetch FSM can run as soon as the PC is valid
- fetch_pc_i_valid = Signal() # Execute tells Fetch "start next read"
- fetch_pc_o_ready = Signal() # Fetch Tells SVSTATE "proceed"
+ fetch_pc_i_valid = Signal() # Execute tells Fetch "start next read"
+ fetch_pc_o_ready = Signal() # Fetch Tells SVSTATE "proceed"
# fetch FSM hands over the instruction to be decoded / issued
fetch_insn_o_valid = Signal()
# Issue is where the VL for-loop # lives. the ready/valid
# signalling is used to communicate between the four.
- # set up Fetch FSM
- fetch = FetchFSM(self.allow_overlap, self.svp64_en,
- self.imem, core_rst, pdecode2, cur_state,
- dbg, core, svstate, nia, is_svp64_mode)
- m.submodules.fetch = fetch
- # connect up in/out data to existing Signals
- comb += fetch.p.i_data.pc.eq(pc)
- # and the ready/valid signalling
- comb += fetch_pc_o_ready.eq(fetch.p.o_ready)
- comb += fetch.p.i_valid.eq(fetch_pc_i_valid)
- comb += fetch_insn_o_valid.eq(fetch.n.o_valid)
- comb += fetch.n.i_ready.eq(fetch_insn_i_ready)
-
- self.issue_fsm(m, core, pc_changed, sv_changed, nia,
+ self.fetch_fsm(m, dbg, core, core_rst, nia, is_svp64_mode,
+ fetch_pc_o_ready, fetch_pc_i_valid,
+ fetch_insn_o_valid, fetch_insn_i_ready)
+
+ self.issue_fsm(m, core, nia,
dbg, core_rst, is_svp64_mode,
fetch_pc_o_ready, fetch_pc_i_valid,
fetch_insn_o_valid, fetch_insn_i_ready,
pred_insn_i_valid, pred_insn_o_ready,
pred_mask_o_valid, pred_mask_i_ready)
- self.execute_fsm(m, core, pc_changed, sv_changed,
+ self.execute_fsm(m, core,
exec_insn_i_valid, exec_insn_o_ready,
exec_pc_o_valid, exec_pc_i_ready)
- # this bit doesn't have to be in the FSM: connect up to read
- # regfiles on demand from DMI
- self.do_dmi(m, dbg)
-
- # DEC and TB inc/dec FSM. copy of DEC is put into CoreState,
- # (which uses that in PowerDecoder2 to raise 0x900 exception)
- self.tb_dec_fsm(m, cur_state.dec)
-
- return m
-
- def do_dmi(self, m, dbg):
- """deals with DMI debug requests
-
- currently only provides read requests for the INT regfile, CR and XER
- it will later also deal with *writing* to these regfiles.
- """
- comb = m.d.comb
- sync = m.d.sync
- dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
- intrf = self.core.regs.rf['int']
-
- with m.If(d_reg.req): # request for regfile access being made
- # TODO: error-check this
- # XXX should this be combinatorial? sync better?
- if intrf.unary:
- comb += self.int_r.ren.eq(1<<d_reg.addr)
- else:
- comb += self.int_r.addr.eq(d_reg.addr)
- comb += self.int_r.ren.eq(1)
- d_reg_delay = Signal()
- sync += d_reg_delay.eq(d_reg.req)
- with m.If(d_reg_delay):
- # data arrives one clock later
- comb += d_reg.data.eq(self.int_r.o_data)
- comb += d_reg.ack.eq(1)
-
- # sigh same thing for CR debug
- with m.If(d_cr.req): # request for regfile access being made
- comb += self.cr_r.ren.eq(0b11111111) # enable all
- d_cr_delay = Signal()
- sync += d_cr_delay.eq(d_cr.req)
- with m.If(d_cr_delay):
- # data arrives one clock later
- comb += d_cr.data.eq(self.cr_r.o_data)
- comb += d_cr.ack.eq(1)
-
- # aaand XER...
- with m.If(d_xer.req): # request for regfile access being made
- comb += self.xer_r.ren.eq(0b111111) # enable all
- d_xer_delay = Signal()
- sync += d_xer_delay.eq(d_xer.req)
- with m.If(d_xer_delay):
- # data arrives one clock later
- comb += d_xer.data.eq(self.xer_r.o_data)
- comb += d_xer.ack.eq(1)
-
- def tb_dec_fsm(self, m, spr_dec):
- """tb_dec_fsm
-
- this is a FSM for updating either dec or tb. it runs alternately
- DEC, TB, DEC, TB. note that SPR pipeline could have written a new
- value to DEC, however the regfile has "passthrough" on it so this
- *should* be ok.
-
- see v3.0B p1097-1099 for Timeer Resource and p1065 and p1076
- """
-
- comb, sync = m.d.comb, m.d.sync
- fast_rf = self.core.regs.rf['fast']
- fast_r_dectb = fast_rf.r_ports['issue'] # DEC/TB
- fast_w_dectb = fast_rf.w_ports['issue'] # DEC/TB
-
- with m.FSM() as fsm:
-
- # initiates read of current DEC
- with m.State("DEC_READ"):
- comb += fast_r_dectb.addr.eq(FastRegs.DEC)
- comb += fast_r_dectb.ren.eq(1)
- m.next = "DEC_WRITE"
-
- # waits for DEC read to arrive (1 cycle), updates with new value
- with m.State("DEC_WRITE"):
- new_dec = Signal(64)
- # TODO: MSR.LPCR 32-bit decrement mode
- comb += new_dec.eq(fast_r_dectb.o_data - 1)
- comb += fast_w_dectb.addr.eq(FastRegs.DEC)
- comb += fast_w_dectb.wen.eq(1)
- comb += fast_w_dectb.i_data.eq(new_dec)
- sync += spr_dec.eq(new_dec) # copy into cur_state for decoder
- m.next = "TB_READ"
-
- # initiates read of current TB
- with m.State("TB_READ"):
- comb += fast_r_dectb.addr.eq(FastRegs.TB)
- comb += fast_r_dectb.ren.eq(1)
- m.next = "TB_WRITE"
-
- # waits for read TB to arrive, initiates write of current TB
- with m.State("TB_WRITE"):
- new_tb = Signal(64)
- comb += new_tb.eq(fast_r_dectb.o_data + 1)
- comb += fast_w_dectb.addr.eq(FastRegs.TB)
- comb += fast_w_dectb.wen.eq(1)
- comb += fast_w_dectb.i_data.eq(new_tb)
- m.next = "DEC_READ"
+ # whatever was done above, over-ride it if core reset is held.
+ # set NIA to pc_at_reset
+ with m.If(core_rst):
+ sync += nia.eq(self.core.pc_at_reset)
return m
- def __iter__(self):
- yield from self.pc_i.ports()
- yield self.pc_o
- yield self.memerr_o
- yield from self.core.ports()
- yield from self.imem.ports()
- yield self.core_bigendian_i
- yield self.busy_o
-
- def ports(self):
- return list(self)
-
- def external_ports(self):
- ports = self.pc_i.ports()
- ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
- ]
-
- if self.jtag_en:
- ports += list(self.jtag.external_ports())
- else:
- # don't add DMI if JTAG is enabled
- ports += list(self.dbg.dmi.ports())
-
- ports += list(self.imem.ibus.fields.values())
- ports += list(self.core.l0.cmpi.wb_bus().fields.values())
-
- if self.sram4x4k:
- for sram in self.sram4k:
- ports += list(sram.bus.fields.values())
-
- if self.xics:
- ports += list(self.xics_icp.bus.fields.values())
- ports += list(self.xics_ics.bus.fields.values())
- ports.append(self.int_level_i)
-
- if self.gpio:
- ports += list(self.simple_gpio.bus.fields.values())
- ports.append(self.gpio_o)
-
- return ports
-
- def ports(self):
- return list(self)
-
class TestIssuer(Elaboratable):
def __init__(self, pspec):
self.ti = TestIssuerInternal(pspec)
self.pll = DummyPLL(instance=True)
+ self.dbg_rst_i = Signal(reset_less=True)
+
# PLL direct clock or not
self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll
if self.pll_en:
self.pll_test_o = Signal(reset_less=True)
self.pll_vco_o = Signal(reset_less=True)
self.clk_sel_i = Signal(2, reset_less=True)
- self.ref_clk = ClockSignal() # can't rename it but that's ok
+ self.ref_clk = ClockSignal() # can't rename it but that's ok
self.pllclk_clk = ClockSignal("pllclk")
def elaborate(self, platform):
# internal clock is set to selector clock-out. has the side-effect of
# running TestIssuer at this speed (see DomainRenamer("intclk") above)
# debug clock runs at coresync internal clock
- cd_coresync = ClockDomain("coresync")
- #m.domains += cd_coresync
if self.ti.dbg_domain != 'sync':
cd_dbgsync = ClockDomain("dbgsync")
- #m.domains += cd_dbgsync
- intclk = ClockSignal("coresync")
+ intclk = ClockSignal(self.ti.core_domain)
dbgclk = ClockSignal(self.ti.dbg_domain)
# XXX BYPASS PLL XXX
# XXX BYPASS PLL XXX
# XXX BYPASS PLL XXX
if self.pll_en:
comb += intclk.eq(self.ref_clk)
+ assert self.ti.core_domain != 'sync', \
+ "cannot set core_domain to sync and use pll at the same time"
else:
- comb += intclk.eq(ClockSignal())
+ if self.ti.core_domain != 'sync':
+ comb += intclk.eq(ClockSignal())
if self.ti.dbg_domain != 'sync':
dbgclk = ClockSignal(self.ti.dbg_domain)
comb += dbgclk.eq(intclk)
+ comb += self.ti.dbg_rst_i.eq(self.dbg_rst_i)
return m
def ports(self):
return list(self.ti.ports()) + list(self.pll.ports()) + \
- [ClockSignal(), ResetSignal()]
+ [ClockSignal(), ResetSignal()]
def external_ports(self):
ports = self.ti.external_ports()
'div': 1,
'mul': 1,
'shiftrot': 1
- }
+ }
pspec = TestMemPspec(ldst_ifacetype='bare_wb',
imem_ifacetype='bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64,
units=units)
import argparse
from nmigen.cli import verilog
+from openpower.consts import MSR
from soc.config.test.test_loadstore import TestMemPspec
-from soc.simple.issuer import TestIssuer
+from soc.simple.issuer import TestIssuer, TestIssuerInternal
if __name__ == '__main__':
parser.add_argument("--disable-svp64", dest='svp64', action="store_false",
help="disable SVP64",
default=False)
+ parser.add_argument("--pc-reset", default="0",
+ help="Set PC at reset (default 0)")
+ parser.add_argument("--xlen", default=64, type=int,
+ help="Set register width [default 64]")
+ # create a module that's directly compatible as a drop-in replacement
+ # in microwatt.v
+ parser.add_argument("--microwatt-compat", dest='mwcompat',
+ action="store_true",
+ help="generate microwatt-compatible interface",
+ default=False)
+ parser.add_argument("--microwatt-compat-svp64", dest='mwcompatsvp64',
+ action="store_true",
+ help="generate microwatt-compatible interface + SVP64",
+ default=False)
+ parser.add_argument("--old-microwatt-compat", dest='old_mwcompat',
+ action="store_true",
+ help="generate old microwatt-compatible interface",
+ default=True)
+ parser.add_argument("--microwatt-debug", dest='mwdebug',
+ action="store_true",
+ help="generate old microwatt-compatible interface",
+ default=False)
+ # create a module with Fabric compatibility
+ parser.add_argument("--fabric-compat", dest='fabriccompat',
+ action="store_true",
+ help="generate Fabric-compatible interface",
+ default=False)
+ # small cache option
+ parser.add_argument("--small-cache", dest='smallcache',
+ action="store_true",
+ help="generate small caches",
+ default=False)
+
+ # allow overlaps in TestIssuer
+ parser.add_argument("--allow-overlap", dest='allow_overlap',
+ action="store_true",
+ help="allow overlap in TestIssuer",
+ default=False)
args = parser.parse_args()
+ # convenience: set some defaults
+ if args.mwcompat:
+ args.pll = False
+ args.debug = 'dmi'
+ args.core = True
+ args.xics = False
+ args.gpio = False
+ args.sram4x4kblock = False
+ args.svp64 = False
+
+ # Yes, this is duplicating mwcompat, but for the sake of simplicity
+ # adding support for svp64 like this
+ if args.mwcompatsvp64:
+ args.pll = False
+ args.debug = 'dmi'
+ args.core = True
+ args.xics = False
+ args.gpio = False
+ args.sram4x4kblock = False
+ args.svp64 = True
+ args.mwcompat = True # Ensures TestMemPspec gets the expected value
+
print(args)
units = {'alu': 1,
# decide which memory type to configure
if args.mmu:
ldst_ifacetype = 'mmu_cache_wb'
+ imem_ifacetype = 'mmu_cache_wb'
else:
ldst_ifacetype = 'bare_wb'
- imem_ifacetype = 'bare_wb'
+ imem_ifacetype = 'bare_wb'
+
+ # default MSR
+ msr_reset = (1<<MSR.LE) | (1<<MSR.SF) # 64-bit, little-endian default
+
+ # default PC
+ if args.pc_reset.startswith("0x"):
+ pc_reset = int(args.pc_reset, 16)
+ else:
+ pc_reset = int(args.pc_reset)
pspec = TestMemPspec(ldst_ifacetype=ldst_ifacetype,
imem_ifacetype=imem_ifacetype,
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
+ # pipeline and integer register file width
+ XLEN=args.xlen,
# must leave at 64
reg_wid=64,
# set to 32 for instruction-memory width=32
sram4x4kblock=args.enable_sram4x4kblock, # add SRAMs
debug=args.debug, # set to jtag or dmi
svp64=args.svp64, # enable SVP64
- mmu=args.mmu, # enable MMU
- units=units)
+ microwatt_mmu=args.mmu, # enable MMU
+ microwatt_compat=args.mwcompat, # microwatt compatible
+ microwatt_old=args.old_mwcompat, # old microwatt api
+ microwatt_debug=args.mwdebug, # microwatt debug signals
+ fabric_compat=args.fabriccompat, # fabric compatible (overlaps with microwatt compat)
+ small_cache=args.smallcache, # small cache/TLB sizes
+ allow_overlap=args.allow_overlap, # allow overlap
+ units=units,
+ msr_reset=msr_reset,
+ pc_reset=pc_reset)
+ #if args.mwcompat:
+ # pspec.core_domain = 'sync'
- print("mmu", pspec.__dict__["mmu"])
+ print("mmu", pspec.__dict__["microwatt_mmu"])
print("nocore", pspec.__dict__["nocore"])
print("regreduce", pspec.__dict__["regreduce"])
print("gpio", pspec.__dict__["gpio"])
print("use_pll", pspec.__dict__["use_pll"])
print("debug", pspec.__dict__["debug"])
print("SVP64", pspec.__dict__["svp64"])
+ print("XLEN", pspec.__dict__["XLEN"])
+ print("MSR@reset", hex(pspec.__dict__["msr_reset"]))
+ print("PC@reset", hex(pspec.__dict__["pc_reset"]))
+ print("Microwatt compatibility", pspec.__dict__["microwatt_compat"])
+ print("Old Microwatt compatibility", pspec.__dict__["microwatt_old"])
+ print("Microwatt debug", pspec.__dict__["microwatt_debug"])
+ print("Fabric compatibility", pspec.__dict__["fabric_compat"])
+ print("Small Cache/TLB", pspec.__dict__["small_cache"])
- dut = TestIssuer(pspec)
+ if args.mwcompat:
+ dut = TestIssuerInternal(pspec)
+ name = "external_core_top"
+ else:
+ dut = TestIssuer(pspec)
+ name = "test_issuer"
- vl = verilog.convert(dut, ports=dut.external_ports(), name="test_issuer")
+ vl = verilog.convert(dut, ports=dut.external_ports(), name=name)
with open(args.output_filename, "w") as f:
f.write(vl)
from soc.fu.ldst.test.test_pipe_caller import LDSTTestCase
from openpower.test.general.overlap_hazards import (HazardTestCase,
RandomHazardTestCase)
-from openpower.util import spr_to_fast_reg
+from openpower.util import spr_to_fast_reg, spr_to_state_reg
from openpower.consts import StateRegsEnum
# list of SPRs that are controlled and managed by the MMU
-mmu_sprs = ["PRTBL", "DSISR", "DAR", "PIDR"]
+mmu_sprs = ["PRTBL", "PIDR"]
+ldst_sprs = ["DAR", "DSISR"]
def set_mmu_spr(name, i, val, core): # important keep pep8 formatting
yield fsm.mmu.l_in.rs.eq(val)
yield
yield fsm.mmu.l_in.mtspr.eq(0)
- print("mmu_spr was updated")
+ while True:
+ done = yield fsm.mmu.l_out.done
+ if done:
+ break
+ yield
+ yield
+ print("mmu_spr %s %d was updated %x" % (name, i, val))
+
+
+def set_ldst_spr(name, i, val, core): # important keep pep8 formatting
+ ldst = core.fus.get_fu("mmu0").alu.ldst # awkward to get at but it works
+ yield ldst.sprval_in.eq(val)
+ yield ldst.mmu_set_spr.eq(1)
+ if name == 'DAR':
+ yield ldst.mmu_set_dar.eq(1)
+ yield
+ yield ldst.mmu_set_dar.eq(0)
+ else:
+ yield ldst.mmu_set_dsisr.eq(1)
+ yield
+ yield ldst.mmu_set_dsisr.eq(0)
+ yield ldst.mmu_set_spr.eq(0)
+ print("ldst_spr %s %d was updated %x" % (name, i, val))
def setup_regs(pdecode2, core, test):
# setting both fast and slow SPRs from test data
fregs = core.regs.fast
+ stateregs = core.regs.state
sregs = core.regs.spr
for sprname, val in test.sprs.items():
if isinstance(val, SelectableInt):
sprname = spr_dict[sprname].SPR
if sprname == 'XER':
continue
+ print ('set spr %s val %x' % (sprname, val))
+
fast = spr_to_fast_reg(sprname)
- if fast is None:
+ state = spr_to_state_reg(sprname)
+
+ if fast is None and state is None:
# match behaviour of SPRMap in power_decoder2.py
for i, x in enumerate(SPR):
if sprname == x.name:
- print("setting slow SPR %d (%s) to %x" %
- (i, sprname, val))
- if sprname not in mmu_sprs:
- yield sregs.memory._array[i].eq(val)
+ print("setting slow SPR %d (%s/%d) to %x" %
+ (i, sprname, x.value, val))
+ if sprname in mmu_sprs:
+ yield from set_mmu_spr(sprname, x.value, val, core)
+ elif sprname in ldst_sprs:
+ yield from set_ldst_spr(sprname, x.value, val, core)
else:
- yield from set_mmu_spr(sprname, i, val, core)
+ yield sregs.memory._array[i].eq(val)
+ elif state is not None:
+ print("setting state reg %d (%s) to %x" %
+ (state, sprname, val))
+ if stateregs.unary:
+ rval = stateregs.regs[state].reg
+ else:
+ rval = stateregs.memory._array[state]
+ yield rval.eq(val)
else:
print("setting fast reg %d (%s) to %x" %
(fast, sprname, val))
from openpower.test.branch.branch_cases import BranchTestCase
from soc.fu.spr.test.test_pipe_caller import SPRTestCase
from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.trap.trap_cases import TrapTestCase
from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
from openpower.simulator.test_helloworld_sim import HelloTestCases
if __name__ == "__main__":
svp64 = True
- if sys.argv[1] == 'nosvp64':
+ if len(sys.argv) > 1 and sys.argv[1] == 'nosvp64':
svp64 = False
del sys.argv[1]
allow_overlap = True
del sys.argv[1]
+ # use in-order issuer, instead of the original FSM based one
+ inorder = False
+ if len(sys.argv) >= 2 and sys.argv[1] == '--inorder':
+ inorder = True
+ del sys.argv[1]
+
# allow list of testing to be selected by command-line
- testing = sys.argv[1:]
- sys.argv = sys.argv[:1]
+ testing = []
+ for i in reversed(range(1, len(sys.argv))):
+ if not sys.argv[i].startswith('-'):
+ testing.append(sys.argv.pop(i))
if not testing:
testing = ['general', 'ldst', 'cr', 'shiftrot', 'shiftrot2',
'logical', 'alu',
'branch', 'div', 'mul', 'hazard']
- print ("SVP64 test mode enabled", svp64, "overlap",
- allow_overlap, "testing", testing)
+ print("SVP64 test mode enabled", svp64, "overlap",
+ allow_overlap, "in-order", inorder, "testing", testing)
unittest.main(exit=False)
suite = unittest.TestSuite()
'hazard': HazardTestCase().test_data,
'alu': ALUTestCase().test_data,
'branch': BranchTestCase().test_data,
+ 'trap': TrapTestCase().test_data,
'spr': SPRTestCase().test_data
- }
+ }
# walk through all tests, those requested get added
for tname, data in tests.items():
if tname in testing:
- suite.addTest(TestRunner(data, svp64=svp64,
+ suite.addTest(TestRunner(data, svp64=svp64, inorder=inorder,
allow_overlap=allow_overlap))
runner = unittest.TextTestRunner()
--- /dev/null
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator. it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+ def case_first_vm_enabled(self):
+ lst = [
+ "std 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[2] = 0xc0000000005fc190
+ initial_regs[6] = 0x0101
+
+ # memory same as microwatt test
+ initial_mem = pagetables.microwatt_linux_5_7_boot
+
+ # set virtual and non-privileged
+ # msr: 8000000000000011
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.LE # little-endian
+ initial_msr |= 1 << MSR.SF # 64-bit
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+ # set PRTBL to 0xe000000
+ initial_sprs = {720: 0xe000000, # PRTBL
+ 48: 1 # PIDR
+ }
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+
+ def case_first_vm_enabled_2(self):
+ lst = [
+ "std 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[2] = 0xc000000000598000
+ initial_regs[6] = 0x0101
+
+ # memory same as microwatt test
+ initial_mem = pagetables.microwatt_linux_5_7_boot
+
+ # set virtual and non-privileged
+ # msr: 8000000000000011
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.LE # little-endian
+ initial_msr |= 1 << MSR.SF # 64-bit
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+ # set PRTBL to 0xe000000
+ initial_sprs = {720: 0xe00000c, # PRTBL
+ 48: 1 # PIDR
+ }
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+ svp64 = True
+ if len(sys.argv) == 2:
+ if sys.argv[1] == 'nosvp64':
+ svp64 = False
+ sys.argv.pop()
+
+ print ("SVP64 test mode enabled", svp64)
+
+ unittest.main(exit=False)
+ suite = unittest.TestSuite()
+
+ # MMU/DCache integration tests
+ suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+ microwatt_mmu=True,
+ rom=pagetables.microwatt_linux_5_7_boot))
+
+ runner = unittest.TextTestRunner()
+ runner.run(suite)
--- /dev/null
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator. it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+ def case_virtual_ld_st(self):
+ lst = ["stb 10,0(2)",
+ "addi 10,0, -4",
+ "stb 10,0(5)",
+ "lhz 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[1] = 0x1000000 # hm, was going to do mtspr 720,1 with this
+ initial_regs[2] = 0x3456
+ initial_regs[3] = 0x4321
+ initial_regs[4] = 0x6543
+ initial_regs[5] = 0x3457
+ initial_regs[10] = 0xfe
+
+ # no pre-loaded memory here
+ initial_mem = {}
+
+ # set virtual and non-privileged
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ #initial_msr |= 1 << MSR.DR # set "virtual" state for data
+ initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+ initial_msr |= 1 << MSR.LE # set little-endian
+
+ # set PRTBL to 0x1000000
+ initial_sprs = {720: 0x1000000} # PRTBL
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+ def case_virtual_invalid_no_prtbl(self):
+ """virtual memory test but with no PRTBL set it is expected
+ to throw an "invalid" exception
+ """
+ lst = ["stb 10,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+
+ # set virtual and non-privileged
+ initial_msr = 1 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+ initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_msr=initial_msr,
+ stop_at_pc=0x400) # stop at this exception addr
+
+if __name__ == "__main__":
+ svp64 = True
+ if len(sys.argv) == 2:
+ if sys.argv[1] == 'nosvp64':
+ svp64 = False
+ sys.argv.pop()
+
+ print ("SVP64 test mode enabled", svp64)
+
+ unittest.main(exit=False)
+ suite = unittest.TestSuite()
+
+ # MMU/DCache integration tests
+ suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+ microwatt_mmu=True,
+ rom=pagetables.test1))
+
+ runner = unittest.TextTestRunner()
+ runner.run(suite)
--- /dev/null
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator. it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+ def case_microwatt_test_3_mmu_ld(self):
+ lst = [
+ "ld 6,0(2)",
+ ]
+
+ # set up regs
+ initial_regs = [0] * 32
+ initial_regs[2] = 0x124108
+
+ # memory same as microwatt test
+ initial_mem = pagetables.microwatt_test2
+
+ # set virtual and non-privileged
+ # msr: 8000000000000011
+ initial_msr = 0 << MSR.PR # must set "problem" state
+ initial_msr |= 1 << MSR.LE # little-endian
+ initial_msr |= 1 << MSR.SF # 64-bit
+ initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+ # set PRTBL to 0x12000
+ initial_sprs = {720: 0x12000, # PRTBL
+ 48: 1 # PIDR
+ }
+
+ print("MMUTEST: initial_msr=",initial_msr)
+ self.add_case(Program(lst, bigendian), initial_regs,
+ initial_mem=initial_mem,
+ initial_sprs=initial_sprs,
+ initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+ svp64 = True
+ if len(sys.argv) == 2:
+ if sys.argv[1] == 'nosvp64':
+ svp64 = False
+ sys.argv.pop()
+
+ print ("SVP64 test mode enabled", svp64)
+
+ unittest.main(exit=False)
+ suite = unittest.TestSuite()
+
+ # MMU/DCache integration tests
+ suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+ microwatt_mmu=True,
+ rom=pagetables.microwatt_test2))
+
+ runner = unittest.TextTestRunner()
+ runner.run(suite)
from soc.simple.test.test_runner import setup_i_memory
+from pathlib import Path
+
import sys
sys.setrecursionlimit(10**6)
with Program("1.bin", bigendian) as program:
self.run_tst_program(program)
+ @unittest.skipUnless(Path("hello_world.bin").exists(),
+ "missing hello_world.bin")
def test_binary(self):
with Program("hello_world.bin", bigendian) as program:
self.run_tst_program(program)
pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
imem_ifacetype='test_bare_wb',
- addr_wid=48,
+ addr_wid=64,
mask_wid=8,
reg_wid=64,
imem_test_depth=32768,
from nmigen import Module, Signal
from nmigen.hdl.xfrm import ResetInserter
from copy import copy
+from pprint import pprint
# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from openpower.endian import bigendian
from soc.simple.issuer import TestIssuerInternal
+from soc.simple.inorder import TestIssuerInternalInOrder
from soc.simple.test.test_core import (setup_regs, check_regs, check_mem,
wait_for_busy_clear,
check_sim_memory)
from soc.debug.dmi import DBGCore, DBGCtrl, DBGStat
from nmutil.util import wrap
-from soc.experiment.test.test_mmu_dcache import wb_get
from openpower.test.state import TestState, StateRunner
from openpower.test.runner import TestRunnerBase
-def setup_i_memory(imem, startaddr, instructions):
+def insert_into_rom(startaddr, instructions, rom):
+ print("insn before, init rom", len(instructions))
+ pprint(rom)
+
+ startaddr //= 4 # instructions are 32-bit
+
+ # 64 bit
+ mask = ((1 << 64)-1)
+ for ins in instructions:
+ if isinstance(ins, tuple):
+ insn, code = ins
+ else:
+ insn, code = ins, ''
+ insn = insn & 0xffffffff
+ msbs = (startaddr >> 1) & mask
+ lsb = 1 if (startaddr & 1) else 0
+ print ("insn", hex(insn), hex(msbs), hex(lsb))
+
+ val = rom.get(msbs<<3, 0)
+ if insn != 0:
+ print("before set", hex(4*startaddr),
+ hex(msbs), hex(val), hex(insn))
+ val = (val | (insn << (lsb*32)))
+ val = val & mask
+ rom[msbs<<3] = val
+ if insn != 0:
+ print("after set", hex(4*startaddr), hex(msbs), hex(val))
+ print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
+ startaddr += 1
+ startaddr = startaddr & mask
+
+ print ("after insn insert")
+ pprint(rom)
+
+
+def setup_i_memory(imem, startaddr, instructions, rom):
mem = imem
print("insn before, init mem", mem.depth, mem.width, mem,
len(instructions))
- for i in range(mem.depth):
- yield mem._array[i].eq(0)
- yield Settle()
+
+ if not rom:
+ # initialise mem array to zero
+ for i in range(mem.depth):
+ yield mem._array[i].eq(0)
+ yield Settle()
+
startaddr //= 4 # instructions are 32-bit
if mem.width == 32:
+ assert rom is None, "cannot do 32-bit from wb_get ROM yet"
mask = ((1 << 32)-1)
for ins in instructions:
if isinstance(ins, tuple):
insn, code = ins, ''
insn = insn & 0xffffffff
msbs = (startaddr >> 1) & mask
- val = yield mem._array[msbs]
+ lsb = 1 if (startaddr & 1) else 0
+
+ if rom: # must put the value into the wb_get area
+ val = rom[msbs<<1]
+ else:
+ val = yield mem._array[msbs]
if insn != 0:
print("before set", hex(4*startaddr),
hex(msbs), hex(val), hex(insn))
- lsb = 1 if (startaddr & 1) else 0
val = (val | (insn << (lsb*32)))
val = val & mask
- yield mem._array[msbs].eq(val)
- yield Settle()
+ if rom: # must put the value into the wb_get area
+ rom[msbs<<1] = val
+ else:
+ yield mem._array[msbs].eq(val)
+ yield Settle()
if insn != 0:
print("after set", hex(4*startaddr), hex(msbs), hex(val))
print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
"""HDLRunner: Implements methods for the setup, preparation, and
running of tests using nmigen HDL simulation.
"""
+
def __init__(self, dut, m, pspec):
super().__init__("hdl", HDLRunner)
self.dut = dut
+ self.pspec = pspec
self.pc_i = Signal(32)
self.svstate_i = Signal(64)
#hard_reset = Signal(reset_less=True)
- self.issuer = TestIssuerInternal(pspec)
+ if pspec.inorder:
+ self.issuer = TestIssuerInternalInOrder(pspec)
+ else:
+ self.issuer = TestIssuerInternal(pspec)
# use DMI RESET command instead, this does actually work though
- #issuer = ResetInserter({'coresync': hard_reset,
+ # issuer = ResetInserter({'coresync': hard_reset,
# 'sync': hard_reset})(issuer)
m.submodules.issuer = self.issuer
self.dmi = self.issuer.dbg.dmi
def prepare_for_test(self, test):
self.test = test
+ #print ("preparing for test name", test.name)
# set up bigendian (TODO: don't do this, use MSR)
yield self.issuer.core_bigendian_i.eq(bigendian)
yield
yield
yield
+ #print ("end of test preparation", test.name)
def setup_during_test(self):
- yield from set_dmi(self.dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+ # first run a manual hard-reset of the debug interface.
+ # core is counting down on a 3-clock delay at this point
+ yield self.issuer.dbg_rst_i.eq(1)
+ yield
+ yield self.issuer.dbg_rst_i.eq(0)
+
+ # now run a DMI-interface reset. because DMI is running
+ # in dbgsync domain its reset is *NOT* connected to
+ # core reset (hence the dbg_rst_i blip, above)
+ yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
yield
+ #print("test setup")
def run_test(self, instructions):
"""run_hdl_state - runs a TestIssuer nmigen HDL simulation
"""
- imem = self.issuer.imem._get_memory()
+ #print("starting test")
+
+ if self.dut.rom is None:
+ imem = self.issuer.imem._get_memory()
+ #print("got memory", imem)
+ else:
+ print("skipping memory get due to rom")
+ pprint(self.dut.rom)
core = self.issuer.core
dmi = self.issuer.dbg.dmi
pdecode2 = self.issuer.pdecode2
pc = 0 # start address
counter = 0 # test to pause/start
- yield from setup_i_memory(imem, pc, instructions)
- yield from setup_tst_memory(l0, self.test.mem)
+ # XXX for now, when ROM (run under wb_get) is detected,
+ # skip setup of memories. must be done a different way
+ if self.dut.rom is None:
+ yield from setup_i_memory(imem, pc, instructions, self.dut.rom)
+ yield from setup_tst_memory(l0, self.test.mem)
+ else:
+ insert_into_rom(pc, instructions, self.dut.default_mem)
+ print("about to setup regs")
yield from setup_regs(pdecode2, core, self.test)
+ #print("setup mem and regs done")
# set PC and SVSTATE
yield self.pc_i.eq(pc)
print("instructions", instructions)
+ # before starting the simulation, set the core stop address to be
+ # just after the last instruction. if a load of an instruction is
+ # requested at this address, the core is immediately put into "halt"
+ # XXX: keep an eye out for in-order problems
+ hard_stop_addr = self.test.stop_at_pc
+ if hard_stop_addr is None:
+ hard_stop_addr = len(instructions)*4
+ yield from set_dmi(dmi, DBGCore.STOPADDR, hard_stop_addr)
+
# run the loop of the instructions on the current test
index = (yield self.issuer.cur_state.pc) // 4
while index < len(instructions):
# start the core
yield
yield from set_dmi(dmi, DBGCore.CTRL,
- 1<<DBGCtrl.START)
- yield self.issuer.pc_i.ok.eq(0) # no change PC after this
- yield self.issuer.svstate_i.ok.eq(0) # ditto
+ 1 << DBGCtrl.START)
+ yield self.issuer.pc_i.ok.eq(0) # no change PC after this
+ yield self.issuer.svstate_i.ok.eq(0) # ditto
yield
yield
counter = counter + 1
# wait until executed
- while not (yield self.issuer.insn_done):
+ while not ((yield self.issuer.insn_done) or
+ (yield self.issuer.dbg.terminated_o)):
yield
# okaaay long story: in overlap mode, PC is updated one cycle
if index < len(instructions):
# Get HDL mem and state
state = yield from TestState("hdl", core, self.dut,
- code)
+ code)
hdl_states.append(state)
if index >= len(instructions):
- print ("index over, send dmi stop")
+ print("index over, send dmi stop")
# stop at end
- yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+ yield from set_dmi(dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
yield
yield
+ # hmm really should use DMI status check here but hey it's quick
+ while True:
+ stopped = yield self.issuer.dbg.core_stop_o
+ if stopped:
+ break
+ yield
+ break
terminated = yield self.issuer.dbg.terminated_o
print("terminated(2)", terminated)
if terminated:
break
- if self.dut.allow_overlap:
+ if self.dut.allow_overlap: # or not self.dut.rom: ??
# wait until all settled
# XXX really this should be in DMI, which should in turn
# use issuer.any_busy to not send back "stopped" signal
if self.dut.allow_overlap:
# get last state, at end of run
state = yield from TestState("hdl", core, self.dut,
- code)
+ code)
hdl_states.append(state)
return hdl_states
def end_test(self):
- yield from set_dmi(self.dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+ yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
yield
yield
xer = yield from get_dmi(self.dmi, DBGCore.XER)
print("after test %s XER value %x" % (self.test.name, xer))
+ # get MSR
+ msr = yield from get_dmi(self.dmi, DBGCore.MSR)
+ print("after test %s MSR value %x" % (self.test.name, msr))
+
# test of dmi reg get
for int_reg in range(32):
yield from set_dmi(self.dmi, DBGCore.GSPR_IDX, int_reg)
value = yield from get_dmi(self.dmi, DBGCore.GSPR_DATA)
print("after test %s reg %2d value %x" %
- (self.test.name, int_reg, value))
+ (self.test.name, int_reg, value))
# pull a reset
- yield from set_dmi(self.dmi, DBGCore.CTRL, 1<<DBGCtrl.RESET)
+ yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.RESET)
yield
class TestRunner(TestRunnerBase):
def __init__(self, tst_data, microwatt_mmu=False, rom=None,
- svp64=True, run_hdl=True, run_sim=True,
- allow_overlap=False):
+ svp64=True, inorder=False, run_hdl=True, run_sim=True,
+ allow_overlap=False):
if run_hdl:
run_hdl = HDLRunner
super().__init__(tst_data, microwatt_mmu=microwatt_mmu,
- rom=rom,
- svp64=svp64, run_hdl=run_hdl, run_sim=run_sim,
- allow_overlap=allow_overlap)
-
+ rom=rom, inorder=inorder,
+ svp64=svp64, run_hdl=run_hdl, run_sim=run_sim,
+ allow_overlap=allow_overlap)
super().__init__()
self.core = core
+ def get_fpregs(self):
+ if False:
+ yield
+ self.fpregs = []
+ for i in range(32):
+ self.fpregs.append(0)
+
def get_intregs(self):
self.intregs = []
for i in range(32):
log("class hdl pc", hex(self.pc))
def get_mem(self):
+ self.mem = {}
# get the underlying HDL-simulated memory from the L0CacheBuffer
+ if hasattr(self.core, "icache"):
+ # err temporarily ignore memory
+ return # XXX have to work out how to deal with wb_get
hdlmem = get_l0_mem(self.core.l0)
- self.mem = {}
for i in range(hdlmem.depth):
value = yield hdlmem._array[i] # should not really do this
self.mem[i*8] = value