From: Luke Kenneth Casson Leighton Date: Mon, 9 Mar 2020 13:42:31 +0000 (+0000) Subject: move all source directories to soc so that "import soc.scoreboard" etc is used X-Git-Tag: div_pipeline~1749 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2d1027c7357d92b8cae4c15f55ad97b8fe81707b;p=soc.git move all source directories to soc so that "import soc.scoreboard" etc is used --- diff --git a/src/TLB/.gitignore b/src/TLB/.gitignore deleted file mode 100644 index 3324664b..00000000 --- a/src/TLB/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.wpr -__pycache__ diff --git a/src/TLB/AddressEncoder.py b/src/TLB/AddressEncoder.py deleted file mode 100644 index 128f2c97..00000000 --- a/src/TLB/AddressEncoder.py +++ /dev/null @@ -1,75 +0,0 @@ -from nmigen import Module, Signal, Elaboratable -from nmigen.lib.coding import Encoder, PriorityEncoder - -class AddressEncoder(Elaboratable): - """Address Encoder - - The purpose of this module is to take in a vector and - encode the bits that are one hot into an address. This module - combines both nmigen's Encoder and PriorityEncoder and will state - whether the input line has a single bit hot, multiple bits hot, - or no bits hot. The output line will always have the lowest value - address output. - - Usage: - The output is valid when either single or multiple match is high. - Otherwise output is 0. - """ - def __init__(self, width): - """ Arguments: - * width: The desired length of the input vector - """ - # Internal - self.encoder = Encoder(width) - self.p_encoder = PriorityEncoder(width) - - # Input - self.i = Signal(width) - - # Output - self.single_match = Signal(1) - self.multiple_match = Signal(1) - self.o = Signal(max=width) - - def elaborate(self, platform=None): - m = Module() - - # Add internal submodules - m.submodules.encoder = self.encoder - m.submodules.p_encoder = self.p_encoder - - m.d.comb += [ - self.encoder.i.eq(self.i), - self.p_encoder.i.eq(self.i) - ] - - # Steps: - # 1. 
check if the input vector is non-zero - # 2. if non-zero, check if single match or multiple match - # 3. set output line to be lowest value address output - - # If the priority encoder recieves an input of 0 - # If n is 1 then the output is not valid - with m.If(self.p_encoder.n): - m.d.comb += [ - self.single_match.eq(0), - self.multiple_match.eq(0), - self.o.eq(0) - ] - # If the priority encoder recieves an input > 0 - with m.Else(): - # Multiple Match if encoder n is invalid - with m.If(self.encoder.n): - m.d.comb += [ - self.single_match.eq(0), - self.multiple_match.eq(1) - ] - # Single Match if encoder n is valid - with m.Else(): - m.d.comb += [ - self.single_match.eq(1), - self.multiple_match.eq(0) - ] - # Always set output based on priority encoder output - m.d.comb += self.o.eq(self.p_encoder.o) - return m diff --git a/src/TLB/Cam.py b/src/TLB/Cam.py deleted file mode 100644 index e7d901ff..00000000 --- a/src/TLB/Cam.py +++ /dev/null @@ -1,125 +0,0 @@ -from nmigen import Array, Cat, Module, Signal, Elaboratable -from nmigen.lib.coding import Decoder -from nmigen.cli import main #, verilog - -from .CamEntry import CamEntry -from .AddressEncoder import AddressEncoder - - -class Cam(Elaboratable): - """ Content Addressable Memory (CAM) - - The purpose of this module is to quickly look up whether an - entry exists given a data key. - This module will search for the given data in all internal entries - and output whether a single or multiple match was found. - If an single entry is found the address be returned and single_match - is set HIGH. If multiple entries are found the lowest address is - returned and multiple_match is set HIGH. If neither single_match or - multiple_match are HIGH this implies no match was found. To write - to the CAM set the address bus to the desired entry and set write_enable - HIGH. Entry managment should be performed one level above this block - as lookup is performed within. 
- - Notes: - The read and write operations take one clock cycle to complete. - Currently the read_warning line is present for interfacing but - is not necessary for this design. This module is capable of writing - in the first cycle, reading on the second, and output the correct - address on the third. - """ - - def __init__(self, data_size, cam_size): - """ Arguments: - * data_size: (bits) The bit size of the data - * cam_size: (number) The number of entries in the CAM - """ - - # Internal - self.cam_size = cam_size - self.encoder = AddressEncoder(cam_size) - self.decoder = Decoder(cam_size) - self.entry_array = Array(CamEntry(data_size) for x in range(cam_size)) - - # Input - self.enable = Signal(1) - self.write_enable = Signal(1) - self.data_in = Signal(data_size) # The data to be written - self.data_mask = Signal(data_size) # mask for ternary writes - self.address_in = Signal(max=cam_size) # address of CAM Entry to write - - # Output - self.read_warning = Signal(1) # High when a read interrupts a write - self.single_match = Signal(1) # High when there is only one match - self.multiple_match = Signal(1) # High when there at least two matches - self.match_address = Signal(max=cam_size) # The lowest address matched - - def elaborate(self, platform=None): - m = Module() - # AddressEncoder for match types and output address - m.submodules.AddressEncoder = self.encoder - # Decoder is used to select which entry will be written to - m.submodules.Decoder = self.decoder - # CamEntry Array Submodules - # Note these area added anonymously - entry_array = self.entry_array - m.submodules += entry_array - - # Decoder logic - m.d.comb += [ - self.decoder.i.eq(self.address_in), - self.decoder.n.eq(0) - ] - - encoder_vector = [] - with m.If(self.enable): - # Set the key value for every CamEntry - for index in range(self.cam_size): - - # Write Operation - with m.If(self.write_enable): - with m.If(self.decoder.o[index]): - m.d.comb += entry_array[index].command.eq(2) - with 
m.Else(): - m.d.comb += entry_array[index].command.eq(0) - - # Read Operation - with m.Else(): - m.d.comb += entry_array[index].command.eq(1) - - # Send data input to all entries - m.d.comb += entry_array[index].data_in.eq(self.data_in) - # Send all entry matches to encoder - ematch = entry_array[index].match - encoder_vector.append(ematch) - - # Give input to and accept output from encoder module - m.d.comb += [ - self.encoder.i.eq(Cat(*encoder_vector)), - self.single_match.eq(self.encoder.single_match), - self.multiple_match.eq(self.encoder.multiple_match), - self.match_address.eq(self.encoder.o) - ] - - # If the CAM is not enabled set all outputs to 0 - with m.Else(): - m.d.comb += [ - self.read_warning.eq(0), - self.single_match.eq(0), - self.multiple_match.eq(0), - self.match_address.eq(0) - ] - - return m - - def ports(self): - return [self.enable, self.write_enable, - self.data_in, self.data_mask, - self.read_warning, self.single_match, - self.multiple_match, self.match_address] - - -if __name__ == '__main__': - cam = Cam(4, 4) - main(cam, ports=cam.ports()) - diff --git a/src/TLB/CamEntry.py b/src/TLB/CamEntry.py deleted file mode 100644 index b1d93082..00000000 --- a/src/TLB/CamEntry.py +++ /dev/null @@ -1,46 +0,0 @@ -from nmigen import Module, Signal, Elaboratable - - -class CamEntry(Elaboratable): - """ Content Addressable Memory (CAM) Entry - - The purpose of this module is to represent an entry within a CAM. - This module when given a read command will compare the given data - and output whether a match was found or not. When given a write - command it will write the given data into internal registers. 
- """ - - def __init__(self, data_size): - """ Arguments: - * data_size: (bit count) The size of the data - """ - # Input - self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset - self.data_in = Signal(data_size) # Data input when writing - - # Output - self.match = Signal(1) # Result of the internal/input key comparison - self.data = Signal(data_size) - - def elaborate(self, platform=None): - m = Module() - with m.Switch(self.command): - with m.Case("00"): - m.d.sync += self.match.eq(0) - with m.Case("01"): - with m.If(self.data == self.data_in): - m.d.sync += self.match.eq(1) - with m.Else(): - m.d.sync += self.match.eq(0) - with m.Case("10"): - m.d.sync += [ - self.data.eq(self.data_in), - self.match.eq(0) - ] - with m.Case(): - m.d.sync += [ - self.match.eq(0), - self.data.eq(0) - ] - - return m diff --git a/src/TLB/LFSR.py b/src/TLB/LFSR.py deleted file mode 100644 index d8b606ec..00000000 --- a/src/TLB/LFSR.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: LGPL-2.1-or-later -# See Notices.txt for copyright information -from nmigen import Signal, Module, Const, Cat, Elaboratable -from nmigen.cli import verilog, rtlil - - -class LFSRPolynomial(set): - """ implements a polynomial for use in LFSR - """ - def __init__(self, exponents=()): - for e in exponents: - assert isinstance(e, int), TypeError("%s must be an int" % repr(e)) - assert (e >= 0), ValueError("%d must not be negative" % e) - set.__init__(self, set(exponents).union({0})) # must contain zero - - @property - def max_exponent(self): - return max(self) # derived from set, so this returns the max exponent - - @property - def exponents(self): - exponents = list(self) # get elements of set as a list - exponents.sort(reverse=True) - return exponents - - def __str__(self): - expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2) - retval = map(lambda i: expd[min(i,2)].format(i), self.exponents) - return " + ".join(retval) - - def __repr__(self): - return 
"LFSRPolynomial(%s)" % self.exponents - - -# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs # noqa -LFSR_POLY_2 = LFSRPolynomial([2, 1, 0]) -LFSR_POLY_3 = LFSRPolynomial([3, 2, 0]) -LFSR_POLY_4 = LFSRPolynomial([4, 3, 0]) -LFSR_POLY_5 = LFSRPolynomial([5, 3, 0]) -LFSR_POLY_6 = LFSRPolynomial([6, 5, 0]) -LFSR_POLY_7 = LFSRPolynomial([7, 6, 0]) -LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0]) -LFSR_POLY_9 = LFSRPolynomial([9, 5, 0]) -LFSR_POLY_10 = LFSRPolynomial([10, 7, 0]) -LFSR_POLY_11 = LFSRPolynomial([11, 9, 0]) -LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0]) -LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0]) -LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0]) -LFSR_POLY_15 = LFSRPolynomial([15, 14, 0]) -LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0]) -LFSR_POLY_17 = LFSRPolynomial([17, 14, 0]) -LFSR_POLY_18 = LFSRPolynomial([18, 11, 0]) -LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0]) -LFSR_POLY_20 = LFSRPolynomial([20, 17, 0]) -LFSR_POLY_21 = LFSRPolynomial([21, 19, 0]) -LFSR_POLY_22 = LFSRPolynomial([22, 21, 0]) -LFSR_POLY_23 = LFSRPolynomial([23, 18, 0]) -LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0]) - - -class LFSR(LFSRPolynomial, Elaboratable): - """ implements a Linear Feedback Shift Register - """ - def __init__(self, polynomial): - """ Inputs: - ------ - :polynomial: the polynomial to feedback on. may be a LFSRPolynomial - instance or an iterable of ints (list/tuple/generator) - :enable: enable (set LO to disable. NOTE: defaults to HI) - - Outputs: - ------- - :state: the LFSR state. bitwidth is taken from the polynomial - maximum exponent. 
- - Note: if an LFSRPolynomial is passed in as the input, because - LFSRPolynomial is derived from set() it's ok: - LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p) - """ - LFSRPolynomial.__init__(self, polynomial) - self.state = Signal(self.max_exponent, reset=1) - self.enable = Signal(reset=1) - - def elaborate(self, platform): - m = Module() - # do absolutely nothing if the polynomial is empty (always has a zero) - if self.max_exponent <= 1: - return m - - # create XOR-bunch, select bits from state based on exponent - feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain) - for exponent in self: - if exponent > 0: # don't have to skip, saves CPU cycles though - feedback ^= self.state[exponent - 1] - - # if enabled, shift-and-feedback - with m.If(self.enable): - # shift up lower bits by Cat'ing in a new bit zero (feedback) - newstate = Cat(feedback, self.state[:-1]) - m.d.sync += self.state.eq(newstate) - - return m - - -# example: Poly24 -if __name__ == '__main__': - p24 = rtlil.convert(LFSR(LFSR_POLY_24)) - with open("lfsr2_p24.il", "w") as f: - f.write(p24) diff --git a/src/TLB/LFSR.pyi b/src/TLB/LFSR.pyi deleted file mode 100644 index 64eb9115..00000000 --- a/src/TLB/LFSR.pyi +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: LGPL-2.1-or-later -# See Notices.txt for copyright information -from nmigen import Module -from typing import Iterable, Optional, Iterator, Any, Union -from typing_extensions import final - - -@final -class LFSRPolynomial(set): - def __init__(self, exponents: Iterable[int] = ()): - def elements() -> Iterable[int]: ... - @property - def exponents(self) -> list[int]: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - - -@final -class LFSR: - def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ... - @property - def width(self) -> int: ... - def elaborate(self, platform: Any) -> Module: ... 
diff --git a/src/TLB/Makefile b/src/TLB/Makefile deleted file mode 100644 index 1eb67acc..00000000 --- a/src/TLB/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -verilog: - python3 Cam.py generate -t v > Cam.v diff --git a/src/TLB/MemorySet.py b/src/TLB/MemorySet.py deleted file mode 100644 index ea61bdf5..00000000 --- a/src/TLB/MemorySet.py +++ /dev/null @@ -1,66 +0,0 @@ -from nmigen import Cat, Memory, Module, Signal, Elaboratable -from nmigen.cli import main -from nmigen.cli import verilog, rtlil - - -class MemorySet(Elaboratable): - def __init__(self, data_size, tag_size, set_count, active): - self.active = active - input_size = tag_size + data_size # Size of the input data - memory_width = input_size + 1 # The width of the cache memory - self.active = active - self.data_size = data_size - self.tag_size = tag_size - - # XXX TODO, use rd-enable and wr-enable? - self.mem = Memory(memory_width, set_count) - self.r = self.mem.read_port() - self.w = self.mem.write_port() - - # inputs (address) - self.cset = Signal(max=set_count) # The set to be checked - self.tag = Signal(tag_size) # The tag to find - self.data_i = Signal(data_size) # Incoming data - - # outputs - self.valid = Signal() - self.data_o = Signal(data_size) # Outgoing data (excludes tag) - - def elaborate(self, platform): - m = Module() - m.submodules.mem = self.mem - m.submodules.r = self.r - m.submodules.w = self.w - - # temporaries - active_bit = Signal() - tag_valid = Signal() - data_start = self.active + 1 - data_end = data_start + self.data_size - tag_start = data_end - tag_end = tag_start + self.tag_size - - # connect the read port address to the set/entry - read_port = self.r - m.d.comb += read_port.addr.eq(self.cset) - # Pull out active bit from data - data = read_port.data - m.d.comb += active_bit.eq(data[self.active]) - # Validate given tag vs stored tag - tag = data[tag_start:tag_end] - m.d.comb += tag_valid.eq(self.tag == tag) - # An entry is only valid if the tags match AND - # is marked as a valid 
entry - m.d.comb += self.valid.eq(tag_valid & active_bit) - - # output data: TODO, check rd-enable? - m.d.comb += self.data_o.eq(data[data_start:data_end]) - - # connect the write port addr to the set/entry (only if write enabled) - # (which is only done on a match, see SAC.write_entry below) - write_port = self.w - with m.If(write_port.en): - m.d.comb += write_port.addr.eq(self.cset) - m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag)) - - return m diff --git a/src/TLB/PermissionValidator.py b/src/TLB/PermissionValidator.py deleted file mode 100644 index 0107c0e9..00000000 --- a/src/TLB/PermissionValidator.py +++ /dev/null @@ -1,68 +0,0 @@ -from nmigen import Module, Signal, Elaboratable -from nmigen.cli import main - -from TLB.PteEntry import PteEntry - - -class PermissionValidator(Elaboratable): - """ The purpose of this Module is to check the Permissions of a given PTE - against the requested access permissions. - - This module will either validate (by setting the valid bit HIGH) - the request or find a permission fault and invalidate (by setting - the valid bit LOW) the request - """ - - def __init__(self, asid_size, pte_size): - """ Arguments: - * asid_size: (bit count) The size of the asid to be processed - * pte_size: (bit count) The size of the pte to be processed - - Return: - * valid HIGH when permissions are correct - """ - # Internal - self.pte_entry = PteEntry(asid_size, pte_size) - - # Input - self.data = Signal(asid_size + pte_size); - self.xwr = Signal(3) # Execute, Write, Read - self.super_mode = Signal(1) # Supervisor Mode - self.super_access = Signal(1) # Supervisor Access - self.asid = Signal(15) # Address Space IDentifier (ASID) - - # Output - self.valid = Signal(1) # Denotes if the permissions are correct - - def elaborate(self, platform=None): - m = Module() - - m.submodules.pte_entry = self.pte_entry - - m.d.comb += self.pte_entry.i.eq(self.data) - - # Check if the entry is valid - with m.If(self.pte_entry.v): - # ASID match or 
Global Permission - # Note that the MSB bound is exclusive - with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g): - # Check Execute, Write, Read (XWR) Permissions - with m.If(self.pte_entry.xwr == self.xwr): - # Supervisor Logic - with m.If(self.super_mode): - # Valid if entry is not in user mode or supervisor - # has Supervisor User Memory (SUM) access via the - # SUM bit in the sstatus register - m.d.comb += self.valid.eq((~self.pte_entry.u) \ - | self.super_access) - # User logic - with m.Else(): - # Valid if the entry is in user mode only - m.d.comb += self.valid.eq(self.pte_entry.u) - with m.Else(): - m.d.comb += self.valid.eq(0) - with m.Else(): - m.d.comb += self.valid.eq(0) - with m.Else(): - m.d.comb += self.valid.eq(0) - return m diff --git a/src/TLB/PteEntry.py b/src/TLB/PteEntry.py deleted file mode 100644 index 73ea9220..00000000 --- a/src/TLB/PteEntry.py +++ /dev/null @@ -1,67 +0,0 @@ -from nmigen import Module, Signal, Elaboratable -from nmigen.cli import main - - -class PteEntry(Elaboratable): - """ The purpose of this Module is to centralize the parsing of Page - Table Entries (PTE) into one module to prevent common mistakes - and duplication of code. The control bits are parsed out for - ease of use. - - This module parses according to the standard PTE given by the - Volume II: RISC-V Privileged Architectures V1.10 Pg 60. - The Address Space IDentifier (ASID) is appended to the MSB of the input - and is parsed out as such. - - An valid input Signal would be: - ASID PTE - Bits:[78-64][63-0] - - The output PTE value will include the control bits. 
- """ - def __init__(self, asid_size, pte_size): - """ Arguments: - * asid_size: (bit count) The size of the asid to be processed - * pte_size: (bit count) The size of the pte to be processed - - Return: - * d The Dirty bit from the PTE portion of i - * a The Accessed bit from the PTE portion of i - * g The Global bit from the PTE portion of i - * u The User Mode bit from the PTE portion of i - * xwr The Execute/Write/Read bit from the PTE portion of i - * v The Valid bit from the PTE portion of i - * asid The asid portion of i - * pte The pte portion of i - """ - # Internal - self.asid_start = pte_size - self.asid_end = pte_size + asid_size - - # Input - self.i = Signal(asid_size + pte_size) - - # Output - self.d = Signal(1) # Dirty bit (From pte) - self.a = Signal(1) # Accessed bit (From pte) - self.g = Signal(1) # Global Access (From pte) - self.u = Signal(1) # User Mode (From pte) - self.xwr = Signal(3) # Execute Read Write (From pte) - self.v = Signal(1) # Valid (From pte) - self.asid = Signal(asid_size) # Associated Address Space IDentifier - self.pte = Signal(pte_size) # Full Page Table Entry - - def elaborate(self, platform=None): - m = Module() - # Pull out all control bites from PTE - m.d.comb += [ - self.d.eq(self.i[7]), - self.a.eq(self.i[6]), - self.g.eq(self.i[5]), - self.u.eq(self.i[4]), - self.xwr.eq(self.i[1:4]), - self.v.eq(self.i[0]) - ] - m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end]) - m.d.comb += self.pte.eq(self.i[0:self.asid_start]) - return m diff --git a/src/TLB/SetAssociativeCache.py b/src/TLB/SetAssociativeCache.py deleted file mode 100644 index 70c075da..00000000 --- a/src/TLB/SetAssociativeCache.py +++ /dev/null @@ -1,272 +0,0 @@ -""" - -Online simulator of 4-way set-associative cache: -http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html - -Python simulator of a N-way set-associative cache: -https://github.com/vaskevich/CacheSim/blob/master/cachesim.py -""" - -from nmigen import Array, Cat, Memory, Module, 
Signal, Mux, Elaboratable -from nmigen.compat.genlib import fsm -from nmigen.cli import main -from nmigen.cli import verilog, rtlil - -from .AddressEncoder import AddressEncoder -from .MemorySet import MemorySet - -# TODO: use a LFSR that advances continuously and picking the bottom -# few bits from it to select which cache line to replace, instead of PLRU -# http://bugs.libre-riscv.org/show_bug.cgi?id=71 -from .ariane.plru import PLRU -from .LFSR import LFSR, LFSR_POLY_24 - -SA_NA = "00" # no action (none) -SA_RD = "01" # read -SA_WR = "10" # write - - -class SetAssociativeCache(Elaboratable): - """ Set Associative Cache Memory - - The purpose of this module is to generate a memory cache given the - constraints passed in. This will create a n-way set associative cache. - It is expected for the SV TLB that the VMA will provide the set number - while the ASID provides the tag (still to be decided). - - """ - def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False): - """ Arguments - * tag_size (bits): The bit count of the tag - * data_size (bits): The bit count of the data to be stored - * set_count (number): The number of sets/entries in the cache - * way_count (number): The number of slots a data can be stored - in one set - * lfsr: if set, use an LFSR for (pseudo-randomly) selecting - set/entry to write to. 
otherwise, use a PLRU - """ - # Internals - self.lfsr_mode = lfsr - self.way_count = way_count # The number of slots in one set - self.tag_size = tag_size # The bit count of the tag - self.data_size = data_size # The bit count of the data to be stored - - # set up Memory array - self.mem_array = Array() # memory array - for i in range(way_count): - ms = MemorySet(data_size, tag_size, set_count, active=0) - self.mem_array.append(ms) - - # Finds valid entries - self.encoder = AddressEncoder(way_count) - - # setup PLRU or LFSR - if lfsr: - # LFSR mode - self.lfsr = LFSR(LFSR_POLY_24) - else: - # PLRU mode - self.plru = PLRU(way_count) # One block to handle plru calculations - self.plru_array = Array() # PLRU data on each set - for i in range(set_count): - name="plru%d" % i - self.plru_array.append(Signal(self.plru.TLBSZ, name=name)) - - # Input - self.enable = Signal(1) # Whether the cache is enabled - self.command = Signal(2) # 00=None, 01=Read, 10=Write (see SA_XX) - self.cset = Signal(max=set_count) # The set to be checked - self.tag = Signal(tag_size) # The tag to find - self.data_i = Signal(data_size) # The input data - - # Output - self.ready = Signal(1) # 0 => Processing 1 => Ready for commands - self.hit = Signal(1) # Tag matched one way in the given set - self.multiple_hit = Signal(1) # Tag matched many ways in the given set - self.data_o = Signal(data_size) # The data linked to the matched tag - - def check_tags(self, m): - """ Validate the tags in the selected set. If one and only one - tag matches set its state to zero and increment all others - by one. We only advance to next state if a single hit is found. 
- """ - # Vector to store way valid results - # A zero denotes a way is invalid - valid_vector = [] - # Loop through memory to prep read/write ports and set valid_vector - for i in range(self.way_count): - valid_vector.append(self.mem_array[i].valid) - - # Pass encoder the valid vector - m.d.comb += self.encoder.i.eq(Cat(*valid_vector)) - - # Only one entry should be marked - # This is due to already verifying the tags - # matched and the valid bit is high - with m.If(self.hit): - m.next = "FINISHED_READ" - # Pull out data from the read port - data = self.mem_array[self.encoder.o].data_o - m.d.comb += self.data_o.eq(data) - if not self.lfsr_mode: - self.access_plru(m) - - # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k - with m.Elif(self.multiple_hit): - # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck - m.d.comb += self.data_o.eq(0) - - # No tag matches means no data - with m.Else(): - # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck - m.d.comb += self.data_o.eq(0) - - def access_plru(self, m): - """ An entry was accessed and the plru tree must now be updated - """ - # Pull out the set's entry being edited - plru_entry = self.plru_array[self.cset] - m.d.comb += [ - # Set the plru data to the current state - self.plru.plru_tree.eq(plru_entry), - # Set that the cache was accessed - self.plru.lu_access_i.eq(1) - ] - - def read(self, m): - """ Go through the read process of the cache. - This takes two cycles to complete. First it checks for a valid tag - and secondly it updates the LRU values. 
- """ - with m.FSM() as fsm_read: - with m.State("READY"): - m.d.comb += self.ready.eq(0) - # check_tags will set the state if the conditions are met - self.check_tags(m) - with m.State("FINISHED_READ"): - m.next = "READY" - m.d.comb += self.ready.eq(1) - if not self.lfsr_mode: - plru_tree_o = self.plru.plru_tree_o - m.d.sync += self.plru_array[self.cset].eq(plru_tree_o) - - def write_entry(self, m): - if not self.lfsr_mode: - m.d.comb += [# set cset (mem address) into PLRU - self.plru.plru_tree.eq(self.plru_array[self.cset]), - # and connect plru to encoder for write - self.encoder.i.eq(self.plru.replace_en_o) - ] - write_port = self.mem_array[self.encoder.o].w - else: - # use the LFSR to generate a random(ish) one of the mem array - lfsr_output = Signal(max=self.way_count) - lfsr_random = Signal(max=self.way_count) - m.d.comb += lfsr_output.eq(self.lfsr.state) # lose some bits - # address too big, limit to range of array - m.d.comb += lfsr_random.eq(Mux(lfsr_output > self.way_count, - lfsr_output - self.way_count, - lfsr_output)) - write_port = self.mem_array[lfsr_random].w - - # then if there is a match from the encoder, enable the selected write - with m.If(self.encoder.single_match): - m.d.comb += write_port.en.eq(1) - - def write(self, m): - """ Go through the write process of the cache. - This takes two cycles to complete. 
First it writes the entry, - and secondly it updates the PLRU (in plru mode) - """ - with m.FSM() as fsm_write: - with m.State("READY"): - m.d.comb += self.ready.eq(0) - self.write_entry(m) - m.next ="FINISHED_WRITE" - with m.State("FINISHED_WRITE"): - m.d.comb += self.ready.eq(1) - if not self.lfsr_mode: - plru_entry = self.plru_array[self.cset] - m.d.sync += plru_entry.eq(self.plru.plru_tree_o) - m.next = "READY" - - - def elaborate(self, platform=None): - m = Module() - - # ---- - # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array - # ---- - - m.submodules.AddressEncoder = self.encoder - if self.lfsr_mode: - m.submodules.LFSR = self.lfsr - else: - m.submodules.PLRU = self.plru - - for i, mem in enumerate(self.mem_array): - setattr(m.submodules, "mem%d" % i, mem) - - # ---- - # select mode: PLRU connect to encoder, LFSR do... something - # ---- - - if not self.lfsr_mode: - # Set what entry was hit - m.d.comb += self.plru.lu_hit.eq(self.encoder.o) - else: - # enable LFSR - m.d.comb += self.lfsr.enable.eq(self.enable) - - # ---- - # connect hit/multiple hit to encoder output - # ---- - - m.d.comb += [ - self.hit.eq(self.encoder.single_match), - self.multiple_hit.eq(self.encoder.multiple_match), - ] - - # ---- - # connect incoming data/tag/cset(addr) to mem_array - # ---- - - for mem in self.mem_array: - write_port = mem.w - m.d.comb += [mem.cset.eq(self.cset), - mem.tag.eq(self.tag), - mem.data_i.eq(self.data_i), - write_port.en.eq(0), # default: disable write - ] - # ---- - # Commands: READ/WRITE/TODO - # ---- - - with m.If(self.enable): - with m.Switch(self.command): - # Search all sets at a particular tag - with m.Case(SA_RD): - self.read(m) - with m.Case(SA_WR): - self.write(m) - # Maybe catch multiple tags write here? - # TODO - # TODO: invalidate/flush, flush-all? 
- - return m - - def ports(self): - return [self.enable, self.command, self.cset, self.tag, self.data_i, - self.ready, self.hit, self.multiple_hit, self.data_o] - - -if __name__ == '__main__': - sac = SetAssociativeCache(4, 8, 4, 6) - vl = rtlil.convert(sac, ports=sac.ports()) - with open("SetAssociativeCache.il", "w") as f: - f.write(vl) - - sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True) - vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports()) - with open("SetAssociativeCacheLFSR.il", "w") as f: - f.write(vl) diff --git a/src/TLB/TLB.py b/src/TLB/TLB.py deleted file mode 100644 index 98c9af72..00000000 --- a/src/TLB/TLB.py +++ /dev/null @@ -1,175 +0,0 @@ -""" TLB Module - - The expected form of the data is: - * Item (Bits) - * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0) -""" - -from nmigen import Memory, Module, Signal, Cat, Elaboratable -from nmigen.cli import main - -from .PermissionValidator import PermissionValidator -from .Cam import Cam - -class TLB(Elaboratable): - def __init__(self, asid_size, vma_size, pte_size, L1_size): - """ Arguments - * asid_size: Address Space IDentifier (ASID) typically 15 bits - * vma_size: Virtual Memory Address (VMA) typically 36 bits - * pte_size: Page Table Entry (PTE) typically 64 bits - - Notes: - These arguments should represent the largest possible size - defined by the MODE settings. 
See - Volume II: RISC-V Privileged Architectures V1.10 Page 57 - """ - - # Internal - self.state = 0 - # L1 Cache Modules - self.cam_L1 = Cam(vma_size, L1_size) - self.mem_L1 = Memory(asid_size + pte_size, L1_size) - - # Permission Validator - self.perm_validator = PermissionValidator(asid_size, pte_size) - - # Inputs - self.supermode = Signal(1) # Supervisor Mode - self.super_access = Signal(1) # Supervisor Access - self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2 - self.xwr = Signal(3) # Execute, Write, Read - self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64 - self.address_L1 = Signal(max=L1_size) - self.asid = Signal(asid_size) # Address Space IDentifier (ASID) - self.vma = Signal(vma_size) # Virtual Memory Address (VMA) - self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE) - - # Outputs - self.hit = Signal(1) # Denotes if the VMA had a mapped PTE - self.perm_valid = Signal(1) # Denotes if the permissions are correct - self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA - - def search(self, m, read_L1, write_L1): - """ searches the TLB - """ - m.d.comb += [ - write_L1.en.eq(0), - self.cam_L1.write_enable.eq(0), - self.cam_L1.data_in.eq(self.vma) - ] - # Match found in L1 CAM - match_found = Signal(reset_less=True) - m.d.comb += match_found.eq(self.cam_L1.single_match - | self.cam_L1.multiple_match) - with m.If(match_found): - # Memory shortcut variables - mem_address = self.cam_L1.match_address - # Memory Logic - m.d.comb += read_L1.addr.eq(mem_address) - # Permission Validator Logic - m.d.comb += [ - self.hit.eq(1), - # Set permission validator data to the correct - # register file data according to CAM match - # address - self.perm_validator.data.eq(read_L1.data), - # Execute, Read, Write - self.perm_validator.xwr.eq(self.xwr), - # Supervisor Mode - self.perm_validator.super_mode.eq(self.supermode), - # Supverisor Access - self.perm_validator.super_access.eq(self.super_access), - # Address Space 
IDentifier (ASID) - self.perm_validator.asid.eq(self.asid), - # Output result of permission validation - self.perm_valid.eq(self.perm_validator.valid) - ] - # Only output PTE if permissions are valid - with m.If(self.perm_validator.valid): - # XXX TODO - dummy for now - reg_data = Signal.like(self.pte_out) - m.d.comb += [ - self.pte_out.eq(reg_data) - ] - with m.Else(): - m.d.comb += [ - self.pte_out.eq(0) - ] - # Miss Logic - with m.Else(): - m.d.comb += [ - self.hit.eq(0), - self.perm_valid.eq(0), - self.pte_out.eq(0) - ] - - def write_l1(self, m, read_L1, write_L1): - """ writes to the L1 cache - """ - # Memory_L1 Logic - m.d.comb += [ - write_L1.en.eq(1), - write_L1.addr.eq(self.address_L1), - # The Cat places arguments from LSB -> MSB - write_L1.data.eq(Cat(self.pte_in, self.asid)) - ] - # CAM_L1 Logic - m.d.comb += [ - self.cam_L1.write_enable.eq(1), - self.cam_L1.data_in.eq(self.vma), #data_in is sent to all entries - # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected - - ] - - def elaborate(self, platform): - m = Module() - # Add submodules - # Submodules for L1 Cache - m.submodules.cam_L1 = self.cam_L1 - m.submodules.read_L1 = read_L1 = self.mem_L1.read_port() - m.submodules.write_L1 = write_L1 = self.mem_L1.write_port() - - # Permission Validator Submodule - m.submodules.perm_valididator = self.perm_validator - - # When MODE specifies translation - # TODO add in different bit length handling ie prefix 0s - tlb_enable = Signal(reset_less=True) - m.d.comb += tlb_enable.eq(self.mode != 0) - - with m.If(tlb_enable): - m.d.comb += [ - self.cam_L1.enable.eq(1) - ] - with m.Switch(self.command): - # Search - with m.Case("01"): - self.search(m, read_L1, write_L1) - - # Write L1 - # Expected that the miss will be handled in software - with m.Case("10"): - self.write_l1(m, read_L1, write_L1) - - # TODO - #with m.Case("11"): - - # When disabled - with m.Else(): - m.d.comb += [ - self.cam_L1.enable.eq(0), - # XXX TODO - self.reg_file.enable.eq(0), - 
self.hit.eq(0), - self.perm_valid.eq(0), # XXX TODO, check this - self.pte_out.eq(0) - ] - return m - - -if __name__ == '__main__': - tlb = TLB(15, 36, 64, 4) - main(tlb, ports=[ tlb.supermode, tlb.super_access, tlb.command, - tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid, - tlb.vma, tlb.pte_in, - tlb.hit, tlb.perm_valid, tlb.pte_out, - ] + tlb.cam_L1.ports()) diff --git a/src/TLB/__init__.py b/src/TLB/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/TLB/ariane/TreePLRU.cpp b/src/TLB/ariane/TreePLRU.cpp deleted file mode 100644 index 2f6aeea5..00000000 --- a/src/TLB/ariane/TreePLRU.cpp +++ /dev/null @@ -1,211 +0,0 @@ -#include -#include -#include - - -#define NWAY 4 -#define NLINE 256 -#define HIT 0 -#define MISS 1 -#define MS 1000 -/* -Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing -Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt -four-way set associative - three bits - each bit represents one branch point in a binary decision tree; let 1 - represent that the left side has been referenced more recently than the - right side, and 0 vice-versa - are all 4 lines valid? - / \ - yes no, use an invalid line - | - | - | - bit_0 == 0? state | replace ref to | next state - / \ ------+-------- -------+----------- - y n 00x | line_0 line_0 | 11_ - / \ 01x | line_1 line_1 | 10_ - bit_1 == 0? bit_2 == 0? 
1x0 | line_2 line_2 | 0_1 - / \ / \ 1x1 | line_3 line_3 | 0_0 - y n y n - / \ / \ ('x' means ('_' means unchanged) - line_0 line_1 line_2 line_3 don't care) - 8-way set associative - 7 = 1+2+4 bits -16-way set associative - 15 = 1+2+4+8 bits -32-way set associative - 31 = 1+2+4+8+16 bits -64-way set associative - 63 = 1+2+4+8+16+32 bits -*/ -using namespace std; -struct AddressField { - uint64_t wd_idx : 2;//Unused - uint64_t offset : 4;//Unused - uint64_t index : 8;//NLINE = 256 = 2^8 - uint64_t tag : 50; -}; - -union Address { - uint32_t* p; - AddressField fields; -}; - -struct Cell { - bool v; - uint64_t tag; - - Cell() : v(false), tag(0) {} - - bool isHit(uint64_t tag) { - return v && (tag == this->tag); - } - - void fetch(uint32_t* address) { - Address addr; - addr.p = address; - addr.fields.offset = 0; - addr.fields.wd_idx = 0; - tag = addr.fields.tag; - v = true; - } -}; - -ostream& operator<<(ostream & out, const Cell& cell) { - out << " v:" << cell.v << " tag:" << hex << cell.tag; - return out; -} - -struct Block { - Cell cell[NWAY]; - uint32_t state; - uint64_t *mask;//Mask the state to get accurate value for specified 1 bit. - uint64_t *value; - uint64_t *next_value; - - Block() : state(0) { - switch (NWAY) { - case 4: - mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101}; - value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101}; - next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000}; - break; - case 8: - mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001, - 0b1010001}; - value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000, - 0b1010001}; - next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000, - 0b0000001, 0b0000000}; - break; - //TODO - more NWAY goes here. 
- default: - std::cout << "Error definition NWAY = " << NWAY << std::endl; - } - } - - uint32_t *getByTag(uint64_t tag, uint32_t *pway) { - for (int i = 0; i < NWAY; ++i) { - if (cell[i].isHit(tag)) { - *pway = i; - return pway; - } - } - return NULL; - } - - void setLRU(uint32_t *address) { - int way = 0; - uint32_t st = state; - for (int i = 0; i < NWAY; ++i) { - if ((state & mask[i]) == value[i]) { - state ^= mask[i]; - way = i; - break; - } - } - cell[way].fetch(address); - cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl; - } - - uint32_t *get(uint32_t *address, uint32_t *pway) { - Address addr; - addr.p = address; - uint32_t *d = getByTag(addr.fields.tag, pway); - if (d != NULL) { - return &d[addr.fields.offset]; - } - return d; - } - - int set(uint32_t *address) { - uint32_t way = 0; - uint32_t *p = get(address, &way); - if (p != NULL) { - printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state); - state &= ~mask[way]; - printf("%X --> ", state); - state |= next_value[way]; - printf("%X\n", state); - // *p = *address; //skip since address is fake. 
- return HIT; - } else { - setLRU(address); - return MISS; - } - } -}; - -ostream& operator<<(ostream & out, const Block& block) { - out << "state:" << block.state << " "; - for (int i = 0; i cacheline refill) - self.miss_gnt_o = Signal(NR_PORTS) - self.active_serving_o = Signal(NR_PORTS) - - self.critical_word_o = Signal(64) - self.critical_word_valid_o = Signal() - output ariane_axi::req_t axi_data_o, - input ariane_axi::resp_t axi_data_i, - - self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \ - for i in range(NR_PORTS)) - self.mshr_addr_matches_o = Signal(NR_PORTS) - self.mshr_index_matches_o = Signal(NR_PORTS) - - # AMO - self.amo_req_i = AMOReq() - self.amo_resp_o = AMOResp() - # Port to SRAMs, for refill and eviction - self.req_o = Signal(DCACHE_SET_ASSOC) - self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array - self.data_o = CacheLine() - self.be_o = CLBE() - self.data_i = Array(CacheLine() \ - for i in range(DCACHE_SET_ASSOC)) - self.we_o = Signal() - - def elaborate(self, platform): - # Registers - mshr_t mshr_d, mshr_q; - logic [DCACHE_INDEX_WIDTH-1:0] cnt_d, cnt_q; - logic [DCACHE_SET_ASSOC-1:0] evict_way_d, evict_way_q; - # cache line to evict - cache_line_t evict_cl_d, evict_cl_q; - - logic serve_amo_d, serve_amo_q; - # Request from one FSM - miss_req_valid = Signal(self.NR_PORTS) - miss_req_bypass = Signal(self.NR_PORTS) - miss_req_addr = Array(Signal(name="miss_req_addr", 64) \ - for i in range(NR_PORTS)) - miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \ - for i in range(NR_PORTS)) - miss_req_we = Signal(self.NR_PORTS) - miss_req_be = Array(Signal(name="miss_req_be", 8) \ - for i in range(NR_PORTS)) - miss_req_size = Array(Signal(name="miss_req_size", 2) \ - for i in range(NR_PORTS)) - - # Cache Line Refill <-> AXI - req_fsm_miss_valid = Signal() - req_fsm_miss_addr = Signal(64) - req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH) - req_fsm_miss_we = Signal() - req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8) - 
ariane_axi::ad_req_t req_fsm_miss_req; - req_fsm_miss_size = Signal(2) - - gnt_miss_fsm = Signal() - valid_miss_fsm = Signal() - nmiss = DCACHE_LINE_WIDTH//64 - data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \ - for i in range(nmiss)) - - # Cache Management <-> LFSR - lfsr_enable = Signal() - lfsr_oh = Signal(DCACHE_SET_ASSOC) - lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1)) - # AMOs - ariane_pkg::amo_t amo_op; - amo_operand_a = Signal(64) - amo_operand_b = Signal(64) - amo_result_o = Signal(64) - - struct packed { - logic [63:3] address; - logic valid; - } reservation_d, reservation_q; - - # ------------------------------ - # Cache Management - # ------------------------------ - evict_way = Signal(DCACHE_SET_ASSOC) - valid_way = Signal(DCACHE_SET_ASSOC) - - for (i in range(DCACHE_SET_ASSOC): - comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty) - comb += valid_way[i].eq(data_i[i].valid) - - # ---------------------- - # Default Assignments - # ---------------------- - # to AXI refill - req_fsm_miss_req = ariane_axi::CACHE_LINE_REQ; - req_fsm_miss_size = Const(0b11, 2) - # core - serve_amo_d = serve_amo_q; - # -------------------------------- - # Flush and Miss operation - # -------------------------------- - state_d = state_q; - cnt_d = cnt_q; - evict_way_d = evict_way_q; - evict_cl_d = evict_cl_q; - mshr_d = mshr_q; - # communicate to the requester which unit we are currently serving - active_serving_o[mshr_q.id] = mshr_q.valid; - # AMOs - # silence the unit when not used - amo_op = amo_req_i.amo_op; - - reservation_d = reservation_q; - with m.FSM() as state_q: - - with m.Case("IDLE"): - # lowest priority are AMOs, wait until everything else - # is served before going for the AMOs - with m.If (amo_req_i.req & ~busy_i): - # 1. Flush the cache - with m.If(~serve_amo_q): - m.next = "FLUSH_REQ_STATUS" - serve_amo_d.eq(0b1 - cnt_d.eq(0 - # 2. 
Do the AMO - with m.Else(): - m.next = "AMO_LOAD" - serve_amo_d.eq(0b0 - - # check if we want to flush and can flush - # e.g.: we are not busy anymore - # TODO: Check that the busy flag is indeed needed - with m.If (flush_i & ~busy_i): - m.next = "FLUSH_REQ_STATUS" - cnt_d = 0 - - # check if one of the state machines missed - for i in range(NR_PORTS): - # here comes the refill portion of code - with m.If (miss_req_valid[i] & ~miss_req_bypass[i]): - m.next = "MISS" - # we are taking another request so don't - # take the AMO - serve_amo_d = 0b0; - # save to MSHR - wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH - comb += [ mshr_d.valid.eq(0b1), - mshr_d.we.eq(miss_req_we[i]), - mshr_d.id.eq(i), - mshr_d.addr.eq(miss_req_addr[i][0:wid]), - mshr_d.wdata.eq(miss_req_wdata[i]), - mshr_d.be.eq(miss_req_be[i]), - ] - break - - # ~> we missed on the cache - with m.Case("MISS"): - # 1. Check if there is an empty cache-line - # 2. If not -> evict one - comb += req_o.eq(1) - sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH] - m.next = "MISS_REPL" - comb += miss_o.eq(1) - - # ~> second miss cycle - with m.Case("MISS_REPL"): - # if all are valid we need to evict one, - # pseudo random from LFSR - with m.If(~(~valid_way).bool()): - comb += lfsr_enable.eq(0b1) - comb += evict_way_d.eq(lfsr_oh) - # do we need to write back the cache line? 
- with m.If(data_i[lfsr_bin].dirty): - state_d = WB_CACHELINE_MISS; - comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag) - comb += evict_cl_d.data.eq(data_i[lfsr_bin].data) - comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]) - # no - we can request a cache line now - with m.Else(): - m.next = "REQ_CACHELINE" - # we have at least one free way - with m.Else(): - # get victim cache-line by looking for the - # first non-valid bit - comb += evict_way_d.eq(get_victim_cl(~valid_way) - m.next = "REQ_CACHELINE" - - # ~> we can just load the cache-line, - # the way is store in evict_way_q - with m.Case("REQ_CACHELINE"): - comb += req_fsm_miss_valid .eq(1) - sync += req_fsm_miss_addr .eq(mshr_q.addr) - - with m.If (gnt_miss_fsm): - m.next = "SAVE_CACHELINE" - comb += miss_gnt_o[mshr_q.id].eq(1) - - # ~> replace the cacheline - with m.Case("SAVE_CACHELINE"): - # calculate cacheline offset - automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset; - sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6) - # we've got a valid response from refill unit - with m.If (valid_miss_fsm): - wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH - sync += addr_o .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]) - sync += req_o .eq(evict_way_q) - comb += we_o .eq(1) - comb += be_o .eq(1) - sync += be_o.vldrty .eq(evict_way_q) - sync += data_o.tag .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid] - comb += data_o.data .eq(data_miss_fsm) - comb += data_o.valid.eq(1) - comb += data_o.dirty.eq(0) - - # is this a write? 
- with m.If (mshr_q.we): - # Yes, so safe the updated data now - for i in range(8): - # check if we really want to write - # the corresponding byte - with m.If (mshr_q.be[i]): - sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i]; - # it's immediately dirty if we write - comb += data_o.dirty.eq(1) - - # reset MSHR - comb += mshr_d.valid.eq(0) - # go back to idle - m.next = 'IDLE' - - # ------------------------------ - # Write Back Operation - # ------------------------------ - # ~> evict a cache line from way saved in evict_way_q - with m.Case("WB_CACHELINE_FLUSH"): - with m.Case("WB_CACHELINE_MISS"): - - comb += req_fsm_miss_valid .eq(0b1) - sync += req_fsm_miss_addr .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}}; - comb += req_fsm_miss_be .eq(1) - comb += req_fsm_miss_we .eq(0b1) - sync += req_fsm_miss_wdata .eq(evict_cl_q.data; - - # we've got a grant --> this is timing critical, think about it - if (gnt_miss_fsm) begin - # write status array - sync += addr_o .eq(cnt_q) - comb += req_o .eq(0b1) - comb += we_o .eq(0b1) - comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 
0b0 : 0b1) - # invalidate - sync += be_o.vldrty.eq(evict_way_q) - # go back to handling the miss or flushing, - # depending on where we came from - with m.If(state_q == WB_CACHELINE_MISS): - m.next = "MISS" - with m.Else(): - m.next = "FLUSH_REQ_STATUS" - - # ------------------------------ - # Flushing & Initialization - # ------------------------------ - # ~> make another request to check the same - # cache-line if there are still some valid entries - with m.Case("FLUSH_REQ_STATUS"): - comb += req_o .eq(1) - sync += addr_o .eq(cnt_q) - m.next = "FLUSHING" - - with m.Case("FLUSHING"): - # this has priority - # at least one of the cache lines is dirty - with m.If(~evict_way): - # evict cache line, look for the first - # cache-line which is dirty - comb += evict_way_d.eq(get_victim_cl(evict_way)) - comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)]) - state_d = WB_CACHELINE_FLUSH; - # not dirty ~> increment and continue - with m.Else(): - # increment and re-request - sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET)) - m.next = "FLUSH_REQ_STATUS" - sync += addr_o .eq(cnt_q) - comb += req_o .eq(1) - comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 
1 : 0) - comb += we_o .eq(1) - # finished with flushing operation, go back to idle - with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \ - == DCACHE_NUM_WORDS-1): - # only acknowledge if the flush wasn't - # triggered by an atomic - sync += flush_ack_o.eq(~serve_amo_q) - m.next = "IDLE" - - # ~> only called after reset - with m.Case("INIT"): - # initialize status array - sync += addr_o.eq(cnt_q) - comb += req_o .eq(1) - comb += we_o .eq(1) - # only write the dirty array - comb += be_o.vldrty.eq(1) - sync += cnt_d .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET)) - # finished initialization - with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \ - == DCACHE_NUM_WORDS-1) - m.next = "IDLE" - - # ---------------------- - # AMOs - # ---------------------- - # TODO(zarubaf) Move this closer to memory - # ~> we are here because we need to do the AMO, - # the cache is clean at this point - # start by executing the load - with m.Case("AMO_LOAD"): - comb += req_fsm_miss_valid.eq(1) - # address is in operand a - comb += req_fsm_miss_addr.eq(amo_req_i.operand_a) - comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ) - comb += req_fsm_miss_size.eq(amo_req_i.size) - # the request has been granted - with m.If(gnt_miss_fsm): - m.next = "AMO_SAVE_LOAD" - # save the load value - with m.Case("AMO_SAVE_LOAD"): - with m.If (valid_miss_fsm): - # we are only concerned about the lower 64-bit - comb += mshr_d.wdata.eq(data_miss_fsm[0]) - m.next = "AMO_STORE" - # and do the store - with m.Case("AMO_STORE"): - load_data = Signal(64) - # re-align load data - comb += load_data.eq(data_align(amo_req_i.operand_a[:3], - mshr_q.wdata)) - # Sign-extend for word operation - with m.If (amo_req_i.size == 0b10): - comb += amo_operand_a.eq(sext32(load_data[:32])) - comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32])) - with m.Else(): - comb += amo_operand_a.eq(load_data) - comb += amo_operand_b.eq(amo_req_i.operand_b) - - # we do not need a store request for load reserved - # or a failing store 
conditional - # we can bail-out without making any further requests - with m.If ((amo_req_i.amo_op == AMO_LR) | \ - ((amo_req_i.amo_op == AMO_SC) & \ - ((reservation_q.valid & \ - (reservation_q.address != \ - amo_req_i.operand_a[3:64])) | \ - ~reservation_q.valid))): - comb += req_fsm_miss_valid.eq(0) - m.next = "IDLE" - comb += amo_resp_o.ack.eq(1) - # write-back the result - comb += amo_resp_o.result.eq(amo_operand_a) - # we know that the SC failed - with m.If (amo_req_i.amo_op == AMO_SC): - comb += amo_resp_o.result.eq(1) - # also clear the reservation - comb += reservation_d.valid.eq(0) - with m.Else(): - comb += req_fsm_miss_valid.eq(1) - - comb += req_fsm_miss_we .eq(1) - comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ) - comb += req_fsm_miss_size.eq(amo_req_i.size) - comb += req_fsm_miss_addr.eq(amo_req_i.operand_a) - - comb += req_fsm_miss_wdata.eq( - data_align(amo_req_i.operand_a[0:3], amo_result_o)) - comb += req_fsm_miss_be.eq( - be_gen(amo_req_i.operand_a[0:3], amo_req_i.size)) - - # place a reservation on the memory - with m.If (amo_req_i.amo_op == AMO_LR): - comb += reservation_d.address.eq(amo_req_i.operand_a[3:64]) - comb += reservation_d.valid.eq(1) - - # the request is valid or we didn't need to go for another store - with m.If (valid_miss_fsm): - m.next = "IDLE" - comb += amo_resp_o.ack.eq(1) - # write-back the result - comb += amo_resp_o.result.eq(amo_operand_a; - - if (amo_req_i.amo_op == AMO_SC) begin - comb += amo_resp_o.result.eq(0) - # An SC must fail if there is another SC - # (to any address) between the LR and the SC in - # program order (even to the same address). 
- # in any case destroy the reservation - comb += reservation_d.valid.eq(0) - - # check MSHR for aliasing - - comb += mshr_addr_matches_o .eq(0) - comb += mshr_index_matches_o.eq() - - for i in range(NR_PORTS): - # check mshr for potential matching of other units, - # exclude the unit currently being served - with m.If (mshr_q.valid & \ - (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \ - mshr_q.addr[DCACHE_BYTE_OFFSET:56])): - comb += mshr_addr_matches_o[i].eq(1) - - # same as previous, but checking only the index - with m.If (mshr_q.valid & \ - (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \ - mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])): - mshr_index_matches_o[i].eq(1) - - # -------------------- - # Sequential Process - # -------------------- - - """ - #pragma translate_off - `ifndef VERILATOR - # assert that cache only hits on one way - assert property ( - @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded"); - `endif - #pragma translate_on - """ - - # ---------------------- - # Bypass Arbiter - # ---------------------- - # Connection Arbiter <-> AXI - req_fsm_bypass_valid = Signal() - req_fsm_bypass_addr = Signal(64) - req_fsm_bypass_wdata = Signal(64) - req_fsm_bypass_we = Signal() - req_fsm_bypass_be = Signal(8) - req_fsm_bypass_size = Signal(2) - gnt_bypass_fsm = Signal() - valid_bypass_fsm = Signal() - data_bypass_fsm = Signal(64) - logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass; - logic [3:0] id_bypass_fsm; - logic [3:0] gnt_id_bypass_fsm; - - i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64) - comb += [ - # Master Side - ib.data_req_i .eq( miss_req_valid & miss_req_bypass ), - ib.address_i .eq( miss_req_addr ), - ib.data_wdata_i .eq( miss_req_wdata ), - ib.data_we_i .eq( miss_req_we ), - ib.data_be_i .eq( miss_req_be ), - ib.data_size_i .eq( miss_req_size ), - ib.data_gnt_o .eq( bypass_gnt_o ), - ib.data_rvalid_o .eq( bypass_valid_o ), - ib.data_rdata_o .eq( bypass_data_o ), - # Slave Sid - ib.id_i .eq( 
id_bypass_fsm[$clog2(NR_PORTS)-1:0] ), - ib.id_o .eq( id_fsm_bypass ), - ib.gnt_id_i .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0] ), - ib.address_o .eq( req_fsm_bypass_addr ), - ib.data_wdata_o .eq( req_fsm_bypass_wdata ), - ib.data_req_o .eq( req_fsm_bypass_valid ), - ib.data_we_o .eq( req_fsm_bypass_we ), - ib.data_be_o .eq( req_fsm_bypass_be ), - ib.data_size_o .eq( req_fsm_bypass_size ), - ib.data_gnt_i .eq( gnt_bypass_fsm ), - ib.data_rvalid_i .eq( valid_bypass_fsm ), - ib.data_rdata_i .eq( data_bypass_fsm ), - ] - - axi_adapter #( - .DATA_WIDTH ( 64 ), - .AXI_ID_WIDTH ( 4 ), - .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET ) - ) i_bypass_axi_adapter ( - .clk_i, - .rst_ni, - .req_i ( req_fsm_bypass_valid ), - .type_i ( ariane_axi::SINGLE_REQ ), - .gnt_o ( gnt_bypass_fsm ), - .addr_i ( req_fsm_bypass_addr ), - .we_i ( req_fsm_bypass_we ), - .wdata_i ( req_fsm_bypass_wdata ), - .be_i ( req_fsm_bypass_be ), - .size_i ( req_fsm_bypass_size ), - .id_i ( Cat(id_fsm_bypass, 0, 0) ), - .valid_o ( valid_bypass_fsm ), - .rdata_o ( data_bypass_fsm ), - .gnt_id_o ( gnt_id_bypass_fsm ), - .id_o ( id_bypass_fsm ), - .critical_word_o ( ), # not used for single requests - .critical_word_valid_o ( ), # not used for single requests - .axi_req_o ( axi_bypass_o ), - .axi_resp_i ( axi_bypass_i ) - ); - - # ---------------------- - # Cache Line AXI Refill - # ---------------------- - axi_adapter #( - .DATA_WIDTH ( DCACHE_LINE_WIDTH ), - .AXI_ID_WIDTH ( 4 ), - .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET ) - ) i_miss_axi_adapter ( - .clk_i, - .rst_ni, - .req_i ( req_fsm_miss_valid ), - .type_i ( req_fsm_miss_req ), - .gnt_o ( gnt_miss_fsm ), - .addr_i ( req_fsm_miss_addr ), - .we_i ( req_fsm_miss_we ), - .wdata_i ( req_fsm_miss_wdata ), - .be_i ( req_fsm_miss_be ), - .size_i ( req_fsm_miss_size ), - .id_i ( Const(0b1100, 4) ), - .gnt_id_o ( ), # open - .valid_o ( valid_miss_fsm ), - .rdata_o ( data_miss_fsm ), - .id_o ( ), - .critical_word_o, - .critical_word_valid_o, - .axi_req_o ( 
axi_data_o ), - .axi_resp_i ( axi_data_i ) - ); - - # ----------------- - # Replacement LFSR - # ----------------- - lfsr_8bit #(.WIDTH (DCACHE_SET_ASSOC)) i_lfsr ( - .en_i ( lfsr_enable ), - .refill_way_oh ( lfsr_oh ), - .refill_way_bin ( lfsr_bin ), - .* - ); - - # ----------------- - # AMO ALU - # ----------------- - amo_alu i_amo_alu ( - .amo_op_i ( amo_op ), - .amo_operand_a_i ( amo_operand_a ), - .amo_operand_b_i ( amo_operand_b ), - .amo_result_o ( amo_result_o ) - ); - - # ----------------- - # Struct Split - # ----------------- - - for i in range(NR_PORTS): - miss_req = MissReq() - comb += miss_req.eq(miss_req_i[i]); - comb += miss_req_valid [i] .eq(miss_req.valid) - comb += miss_req_bypass [i] .eq(miss_req.bypass) - comb += miss_req_addr [i] .eq(miss_req.addr) - comb += miss_req_wdata [i] .eq(miss_req.wdata) - comb += miss_req_we [i] .eq(miss_req.we) - comb += miss_req_be [i] .eq(miss_req.be) - comb += miss_req_size [i] .eq(miss_req.size) - - # -------------- - # AXI Arbiter - # --------------s - # - # Description: Arbitrates access to AXI refill/bypass - # -class AXIArbiter: - def __init__(self, NR_PORTS = 3, DATA_WIDTH = 64): - self.NR_PORTS = NR_PORTS - self.DATA_WIDTH = DATA_WIDTH - self.pwid = pwid = ceil(log(NR_PORTS) / log(2)) - rst_ni = ResetSignal() # Asynchronous reset active low - # master ports - self.data_req_i = Signal(NR_PORTS) - self.address_i = Array(Signal(name="address_i", 64) \ - for i in range(NR_PORTS)) - self.data_wdata_i = Array(Signal(name="data_wdata_i", 64) \ - for i in range(NR_PORTS)) - self.data_we_i = Signal(NR_PORTS) - self.data_be_i = Array(Signal(name="data_wdata_i", DATA_WIDTH/8) \ - for i in range(NR_PORTS)) - self.data_size_i = Array(Signal(name="data_size_i", 2) \ - for i in range(NR_PORTS)) - self.data_gnt_o = Signal(NR_PORTS) - self.data_rvalid_o = Signal(NR_PORTS) - self.data_rdata_o = Array(Signal(name="data_rdata_o", 64) \ - for i in range(NR_PORTS)) - - # slave port - self.id_i = Signal(pwid) - self.id_o = 
Signal(pwid) - self.gnt_id_i = Signal(pwid) - self.data_req_o = Signal() - self.address_o = Signal(64) - self.data_wdata_o = Signal(DATA_WIDTH) - self.data_we_o = Signal() - self.data_be_o = Signal(DATA_WIDTH/8) - self.data_size_o = Signal(2) - self.data_gnt_i = Signal() - self.data_rvalid_i = Signal() - self.data_rdata_i = Signal(DATA_WIDTH) - - def elaborate(self, platform): - #enum logic [1:0] { IDLE, REQ, SERVING } state_d, state_q; - - class Packet: - def __init__(self, pwid, DATA_WIDTH): - self.id = Signal(pwid) - self.address = Signal(64) - self.data = Signal(64) - self.size = Signal(2) - self.be = Signal(DATA_WIDTH/8) - self.we = Signal() - - request_index = Signal(self.pwid) - req_q = Packet(self.pwid, self.DATA_WIDTH) - req_d = Packet(self.pwid, self.DATA_WIDTH) - - # request register - sync += req_q.eq(req_d) - - # request port - comb += self.address_o .eq(req_q.address) - comb += self.data_wdata_o .eq(req_q.data) - comb += self.data_be_o .eq(req_q.be) - comb += self.data_size_o .eq(req_q.size) - comb += self.data_we_o .eq(req_q.we) - comb += self.id_o .eq(req_q.id) - comb += self.data_gnt_o .eq(0) - # read port - comb += self.data_rvalid_o .eq(0) - comb += self.data_rdata_o .eq(0) - comb += self.data_rdata_o[req_q.id].eq(data_rdata_i) - - m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS) - comb += pp.i.eq(self.data_req_i) # select one request (priority-based) - comb += request_index.eq(pp.o) - - with m.Switch("state") as s: - - with m.Case("IDLE"): - # wait for incoming requests (priority encoder data_req_i) - with m.If(~pp.n): # one output valid from encoder - comb += self.data_req_o .eq(self.data_req_i[i]) - comb += self.data_gnt_o[i].eq(self.data_req_i[i]) - # save the request - comb += req_d.address.eq(self.address_i[i]) - comb += req_d.id.eq(request_index) - comb += req_d.data.eq(self.data_wdata_i[i]) - comb += req_d.size.eq(self.data_size_i[i]) - comb += req_d.be.eq(self.data_be_i[i]) - comb += req_d.we.eq(self.data_we_i[i]) - m.next = 
"SERVING" - - comb += self.address_o .eq(self.address_i[request_index]) - comb += self.data_wdata_o .eq(self.data_wdata_i[request_index]) - comb += self.data_be_o .eq(self.data_be_i[request_index]) - comb += self.data_size_o .eq(self.data_size_i[request_index]) - comb += self.data_we_o .eq(self.data_we_i[request_index]) - comb += self.id_o .eq(request_index) - - with m.Case("SERVING"): - comb += self.data_req_o.eq(1) - with m.If (self.data_rvalid_i): - comb += self.data_rvalid_o[req_q.id].eq(1) - m.next = "IDLE" - - # ------------ - # Assertions - # ------------ - - """ -#pragma translate_off -`ifndef VERILATOR -# make sure that we eventually get an rvalid after we received a grant -assert property (@(posedge clk_i) data_gnt_i |-> ##[1:$] data_rvalid_i ) - else begin $error("There was a grant without a rvalid"); $stop(); end -# assert that there is no grant without a request -assert property (@(negedge clk_i) data_gnt_i |-> data_req_o) - else begin $error("There was a grant without a request."); $stop(); end -# assert that the address does not contain X when request is sent -assert property ( @(posedge clk_i) (data_req_o) |-> (!$isunknown(address_o)) ) - else begin $error("address contains X when request is set"); $stop(); end - -`endif -#pragma translate_on - """ - diff --git a/src/TLB/ariane/mmu.py b/src/TLB/ariane/mmu.py deleted file mode 100644 index a14862cd..00000000 --- a/src/TLB/ariane/mmu.py +++ /dev/null @@ -1,474 +0,0 @@ -""" -# Copyright 2018 ETH Zurich and University of Bologna. -# Copyright and related rights are licensed under the Solderpad Hardware -# License, Version 0.51 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# http:#solderpad.org/licenses/SHL-0.51. 
Unless required by applicable law -# or agreed to in writing, software, hardware and materials distributed under -# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Author: Florian Zaruba, ETH Zurich -# Date: 19/04/2017 -# Description: Memory Management Unit for Ariane, contains TLB and -# address translation unit. SV48 as defined in -# Volume II: RISC-V Privileged Architectures V1.10 Page 63 - -import ariane_pkg::*; -""" - -from nmigen import Const, Signal, Cat, Module, Mux -from nmigen.cli import verilog, rtlil - -from ptw import DCacheReqI, DCacheReqO, TLBUpdate, PTE, PTW -from tlb import TLB -from exceptcause import (INSTR_ACCESS_FAULT, INSTR_PAGE_FAULT, - LOAD_PAGE_FAULT, STORE_PAGE_FAULT) - -PRIV_LVL_M = Const(0b11, 2) -PRIV_LVL_S = Const(0b01, 2) -PRIV_LVL_U = Const(0b00, 2) - - -class RVException: - def __init__(self): - self.cause = Signal(64) # cause of exception - self.tval = Signal(64) # more info of causing exception - # (e.g.: instruction causing it), - # address of LD/ST fault - self.valid = Signal() - - def eq(self, inp): - res = [] - for (o, i) in zip(self.ports(), inp.ports()): - res.append(o.eq(i)) - return res - - def __iter__(self): - yield self.cause - yield self.tval - yield self.valid - - def ports(self): - return list(self) - - -class ICacheReqI: - def __init__(self): - self.fetch_valid = Signal() # address translation valid - self.fetch_paddr = Signal(64) # physical address in - self.fetch_exception = RVException() # exception occurred during fetch - - def __iter__(self): - yield self.fetch_valid - yield self.fetch_paddr - yield from self.fetch_exception - - def ports(self): - return list(self) - - -class ICacheReqO: - def __init__(self): - self.fetch_req = Signal() # address translation request - self.fetch_vaddr = Signal(64) # virtual address out - - def 
__iter__(self): - yield self.fetch_req - yield self.fetch_vaddr - - def ports(self): - return list(self) - - -class MMU: - def __init__(self, instr_tlb_entries = 4, - data_tlb_entries = 4, - asid_width = 1): - self.instr_tlb_entries = instr_tlb_entries - self.data_tlb_entries = data_tlb_entries - self.asid_width = asid_width - - self.flush_i = Signal() - self.enable_translation_i = Signal() - self.en_ld_st_translation_i = Signal() # enable VM translation for LD/ST - # IF interface - self.icache_areq_i = ICacheReqO() - self.icache_areq_o = ICacheReqI() - # LSU interface - # this is a more minimalistic interface because the actual addressing - # logic is handled in the LSU as we distinguish load and stores, - # what we do here is simple address translation - self.misaligned_ex_i = RVException() - self.lsu_req_i = Signal() # request address translation - self.lsu_vaddr_i = Signal(64) # virtual address in - self.lsu_is_store_i = Signal() # the translation is requested by a store - # if we need to walk the page table we can't grant in the same cycle - - # Cycle 0 - self.lsu_dtlb_hit_o = Signal() # sent in the same cycle as the request - # if translation hits in the DTLB - # Cycle 1 - self.lsu_valid_o = Signal() # translation is valid - self.lsu_paddr_o = Signal(64) # translated address - self.lsu_exception_o = RVException() # addr translate threw exception - - # General control signals - self.priv_lvl_i = Signal(2) - self.ld_st_priv_lvl_i = Signal(2) - self.sum_i = Signal() - self.mxr_i = Signal() - # input logic flag_mprv_i, - self.satp_ppn_i = Signal(44) - self.asid_i = Signal(self.asid_width) - self.flush_tlb_i = Signal() - # Performance counters - self.itlb_miss_o = Signal() - self.dtlb_miss_o = Signal() - # PTW memory interface - self.req_port_i = DCacheReqO() - self.req_port_o = DCacheReqI() - - def elaborate(self, platform): - m = Module() - - iaccess_err = Signal() # insufficient priv to access instr page - daccess_err = Signal() # insufficient priv to access 
data page - ptw_active = Signal() # PTW is currently walking a page table - walking_instr = Signal() # PTW is walking because of an ITLB miss - ptw_error = Signal() # PTW threw an exception - - update_vaddr = Signal(48) # guessed - uaddr64 = Cat(update_vaddr, Const(0, 25)) # extend to 64bit with zeros - update_ptw_itlb = TLBUpdate(self.asid_width) - update_ptw_dtlb = TLBUpdate(self.asid_width) - - itlb_lu_access = Signal() - itlb_content = PTE() - itlb_is_2M = Signal() - itlb_is_1G = Signal() - itlb_is_512G = Signal() - itlb_lu_hit = Signal() - - dtlb_lu_access = Signal() - dtlb_content = PTE() - dtlb_is_2M = Signal() - dtlb_is_1G = Signal() - dtlb_is_512G = Signal() - dtlb_lu_hit = Signal() - - # Assignments - m.d.comb += [itlb_lu_access.eq(self.icache_areq_i.fetch_req), - dtlb_lu_access.eq(self.lsu_req_i) - ] - - # ITLB - m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries, - self.asid_width) - m.d.comb += [i_tlb.flush_i.eq(self.flush_tlb_i), - i_tlb.update_i.eq(update_ptw_itlb), - i_tlb.lu_access_i.eq(itlb_lu_access), - i_tlb.lu_asid_i.eq(self.asid_i), - i_tlb.lu_vaddr_i.eq(self.icache_areq_i.fetch_vaddr), - itlb_content.eq(i_tlb.lu_content_o), - itlb_is_2M.eq(i_tlb.lu_is_2M_o), - itlb_is_1G.eq(i_tlb.lu_is_1G_o), - itlb_is_512G.eq(i_tlb.lu_is_512G_o), - itlb_lu_hit.eq(i_tlb.lu_hit_o), - ] - - # DTLB - m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries, - self.asid_width) - m.d.comb += [d_tlb.flush_i.eq(self.flush_tlb_i), - d_tlb.update_i.eq(update_ptw_dtlb), - d_tlb.lu_access_i.eq(dtlb_lu_access), - d_tlb.lu_asid_i.eq(self.asid_i), - d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i), - dtlb_content.eq(d_tlb.lu_content_o), - dtlb_is_2M.eq(d_tlb.lu_is_2M_o), - dtlb_is_1G.eq(d_tlb.lu_is_1G_o), - dtlb_is_512G.eq(d_tlb.lu_is_512G_o), - dtlb_lu_hit.eq(d_tlb.lu_hit_o), - ] - - # PTW - m.submodules.ptw = ptw = PTW(self.asid_width) - m.d.comb += [ptw_active.eq(ptw.ptw_active_o), - walking_instr.eq(ptw.walking_instr_o), - ptw_error.eq(ptw.ptw_error_o), - 
ptw.enable_translation_i.eq(self.enable_translation_i), - - update_vaddr.eq(ptw.update_vaddr_o), - update_ptw_itlb.eq(ptw.itlb_update_o), - update_ptw_dtlb.eq(ptw.dtlb_update_o), - - ptw.itlb_access_i.eq(itlb_lu_access), - ptw.itlb_hit_i.eq(itlb_lu_hit), - ptw.itlb_vaddr_i.eq(self.icache_areq_i.fetch_vaddr), - - ptw.dtlb_access_i.eq(dtlb_lu_access), - ptw.dtlb_hit_i.eq(dtlb_lu_hit), - ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i), - - ptw.req_port_i.eq(self.req_port_i), - self.req_port_o.eq(ptw.req_port_o), - ] - - # ila_1 i_ila_1 ( - # .clk(clk_i), # input wire clk - # .probe0({req_port_o.address_tag, req_port_o.address_index}), - # .probe1(req_port_o.data_req), # input wire [63:0] probe1 - # .probe2(req_port_i.data_gnt), # input wire [0:0] probe2 - # .probe3(req_port_i.data_rdata), # input wire [0:0] probe3 - # .probe4(req_port_i.data_rvalid), # input wire [0:0] probe4 - # .probe5(ptw_error), # input wire [1:0] probe5 - # .probe6(update_vaddr), # input wire [0:0] probe6 - # .probe7(update_ptw_itlb.valid), # input wire [0:0] probe7 - # .probe8(update_ptw_dtlb.valid), # input wire [0:0] probe8 - # .probe9(dtlb_lu_access), # input wire [0:0] probe9 - # .probe10(lsu_vaddr_i), # input wire [0:0] probe10 - # .probe11(dtlb_lu_hit), # input wire [0:0] probe11 - # .probe12(itlb_lu_access), # input wire [0:0] probe12 - # .probe13(icache_areq_i.fetch_vaddr), # input wire [0:0] probe13 - # .probe14(itlb_lu_hit) # input wire [0:0] probe13 - # ); - - #----------------------- - # Instruction Interface - #----------------------- - # The instruction interface is a simple request response interface - - # MMU disabled: just pass through - m.d.comb += [self.icache_areq_o.fetch_valid.eq( - self.icache_areq_i.fetch_req), - # play through in case we disabled address translation - self.icache_areq_o.fetch_paddr.eq( - self.icache_areq_i.fetch_vaddr) - ] - # two potential exception sources: - # 1. HPTW threw an exception -> signal with a page fault exception - # 2. 
We got an access error because of insufficient permissions -> - # throw an access exception - m.d.comb += self.icache_areq_o.fetch_exception.valid.eq(0) - # Check whether we are allowed to access this memory region - # from a fetch perspective - - # PLATEN TODO: use PermissionValidator instead [we like modules] - m.d.comb += iaccess_err.eq(self.icache_areq_i.fetch_req & \ - (((self.priv_lvl_i == PRIV_LVL_U) & \ - ~itlb_content.u) | \ - ((self.priv_lvl_i == PRIV_LVL_S) & \ - itlb_content.u))) - - # MMU enabled: address from TLB, request delayed until hit. - # Error when TLB hit and no access right or TLB hit and - # translated address not valid (e.g. AXI decode error), - # or when PTW performs walk due to ITLB miss and raises - # an error. - with m.If (self.enable_translation_i): - # we work with SV48, so if VM is enabled, check that - # all bits [47:38] are equal - with m.If (self.icache_areq_i.fetch_req & \ - ~(((~self.icache_areq_i.fetch_vaddr[47:64]) == 0) | \ - (self.icache_areq_i.fetch_vaddr[47:64]) == 0)): - fe = self.icache_areq_o.fetch_exception - m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT), - fe.tval.eq(self.icache_areq_i.fetch_vaddr), - fe.valid.eq(1) - ] - - m.d.comb += self.icache_areq_o.fetch_valid.eq(0) - - # 4K page - paddr = Signal.like(self.icache_areq_o.fetch_paddr) - paddr4k = Cat(self.icache_areq_i.fetch_vaddr[0:12], - itlb_content.ppn) - m.d.comb += paddr.eq(paddr4k) - # Mega page - with m.If(itlb_is_2M): - m.d.comb += paddr[12:21].eq( - self.icache_areq_i.fetch_vaddr[12:21]) - # Giga page - with m.If(itlb_is_1G): - m.d.comb += paddr[12:30].eq( - self.icache_areq_i.fetch_vaddr[12:30]) - m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr) - # Tera page - with m.If(itlb_is_512G): - m.d.comb += paddr[12:39].eq( - self.icache_areq_i.fetch_vaddr[12:39]) - m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr) - - # --------- - # ITLB Hit - # -------- - # if we hit the ITLB output the request signal immediately - with m.If(itlb_lu_hit): - m.d.comb += 
self.icache_areq_o.fetch_valid.eq( - self.icache_areq_i.fetch_req) - # we got an access error - with m.If (iaccess_err): - # throw a page fault - fe = self.icache_areq_o.fetch_exception - m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT), - fe.tval.eq(self.icache_areq_i.fetch_vaddr), - fe.valid.eq(1) - ] - # --------- - # ITLB Miss - # --------- - # watch out for exceptions happening during walking the page table - with m.Elif(ptw_active & walking_instr): - m.d.comb += self.icache_areq_o.fetch_valid.eq(ptw_error) - fe = self.icache_areq_o.fetch_exception - m.d.comb += [fe.cause.eq(INSTR_PAGE_FAULT), - fe.tval.eq(uaddr64), - fe.valid.eq(1) - ] - - #----------------------- - # Data Interface - #----------------------- - - lsu_vaddr = Signal(64) - dtlb_pte = PTE() - misaligned_ex = RVException() - lsu_req = Signal() - lsu_is_store = Signal() - dtlb_hit = Signal() - #dtlb_is_2M = Signal() - #dtlb_is_1G = Signal() - #dtlb_is_512 = Signal() - - # check if we need to do translation or if we are always - # ready (e.g.: we are not translating anything) - m.d.comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i, - dtlb_lu_hit, 1)) - - # The data interface is simpler and only consists of a - # request/response interface - m.d.comb += [ - # save request and DTLB response - lsu_vaddr.eq(self.lsu_vaddr_i), - lsu_req.eq(self.lsu_req_i), - misaligned_ex.eq(self.misaligned_ex_i), - dtlb_pte.eq(dtlb_content), - dtlb_hit.eq(dtlb_lu_hit), - lsu_is_store.eq(self.lsu_is_store_i), - #dtlb_is_2M.eq(dtlb_is_2M), - #dtlb_is_1G.eq(dtlb_is_1G), - ##dtlb_is_512.eq(self.dtlb_is_512G) #???? 
- ] - m.d.sync += [ - self.lsu_paddr_o.eq(lsu_vaddr), - self.lsu_valid_o.eq(lsu_req), - self.lsu_exception_o.eq(misaligned_ex), - ] - - sverr = Signal() - usrerr = Signal() - - m.d.comb += [ - # mute misaligned exceptions if there is no request - # otherwise they will throw accidental exceptions - misaligned_ex.valid.eq(self.misaligned_ex_i.valid & self.lsu_req_i), - - # SUM is not set and we are trying to access a user - # page in supervisor mode - sverr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_S & ~self.sum_i & \ - dtlb_pte.u), - # this is not a user page but we are in user mode and - # trying to access it - usrerr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_U & ~dtlb_pte.u), - - # Check if the User flag is set, then we may only - # access it in supervisor mode if SUM is enabled - daccess_err.eq(sverr | usrerr), - ] - - # translation is enabled and no misaligned exception occurred - with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid): - m.d.comb += lsu_req.eq(0) - # 4K page - paddr = Signal.like(lsu_vaddr) - paddr4k = Cat(lsu_vaddr[0:12], itlb_content.ppn) - m.d.comb += paddr.eq(paddr4k) - # Mega page - with m.If(dtlb_is_2M): - m.d.comb += paddr[12:21].eq(lsu_vaddr[12:21]) - # Giga page - with m.If(dtlb_is_1G): - m.d.comb += paddr[12:30].eq(lsu_vaddr[12:30]) - m.d.sync += self.lsu_paddr_o.eq(paddr) - # TODO platen tera_page - - # --------- - # DTLB Hit - # -------- - with m.If(dtlb_hit & lsu_req): - m.d.comb += lsu_req.eq(1) - # this is a store - with m.If (lsu_is_store): - # check if the page is write-able and - # we are not violating privileges - # also check if the dirty flag is set - with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d): - le = self.lsu_exception_o - m.d.sync += [le.cause.eq(STORE_PAGE_FAULT), - le.tval.eq(lsu_vaddr), - le.valid.eq(1) - ] - - # this is a load, check for sufficient access - # privileges - throw a page fault if necessary - with m.Elif(daccess_err): - le = self.lsu_exception_o - m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT), - 
le.tval.eq(lsu_vaddr), - le.valid.eq(1) - ] - # --------- - # DTLB Miss - # --------- - # watch out for exceptions - with m.Elif (ptw_active & ~walking_instr): - # page table walker threw an exception - with m.If (ptw_error): - # an error makes the translation valid - m.d.comb += lsu_req.eq(1) - # the page table walker can only throw page faults - with m.If (lsu_is_store): - le = self.lsu_exception_o - m.d.sync += [le.cause.eq(STORE_PAGE_FAULT), - le.tval.eq(uaddr64), - le.valid.eq(1) - ] - with m.Else(): - m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT), - le.tval.eq(uaddr64), - le.valid.eq(1) - ] - - return m - - def ports(self): - return [self.flush_i, self.enable_translation_i, - self.en_ld_st_translation_i, - self.lsu_req_i, - self.lsu_vaddr_i, self.lsu_is_store_i, self.lsu_dtlb_hit_o, - self.lsu_valid_o, self.lsu_paddr_o, - self.priv_lvl_i, self.ld_st_priv_lvl_i, self.sum_i, self.mxr_i, - self.satp_ppn_i, self.asid_i, self.flush_tlb_i, - self.itlb_miss_o, self.dtlb_miss_o] + \ - self.icache_areq_i.ports() + self.icache_areq_o.ports() + \ - self.req_port_i.ports() + self.req_port_o.ports() + \ - self.misaligned_ex_i.ports() + self.lsu_exception_o.ports() - -if __name__ == '__main__': - mmu = MMU() - vl = rtlil.convert(mmu, ports=mmu.ports()) - with open("test_mmu.il", "w") as f: - f.write(vl) - diff --git a/src/TLB/ariane/p_lru.txt b/src/TLB/ariane/p_lru.txt deleted file mode 100644 index 4bac7680..00000000 --- a/src/TLB/ariane/p_lru.txt +++ /dev/null @@ -1,51 +0,0 @@ -pseudo-LRU - -two-way set associative - one bit - - indicates which line of the two has been reference more recently - - -four-way set associative - three bits - - each bit represents one branch point in a binary decision tree; let 1 - represent that the left side has been referenced more recently than the - right side, and 0 vice-versa - - are all 4 lines valid? - / \ - yes no, use an invalid line - | - | - | - bit_0 == 0? 
state | replace ref to | next state - / \ ------+-------- -------+----------- - y n 00x | line_0 line_0 | 11_ - / \ 01x | line_1 line_1 | 10_ - bit_1 == 0? bit_2 == 0? 1x0 | line_2 line_2 | 0_1 - / \ / \ 1x1 | line_3 line_3 | 0_0 - y n y n - / \ / \ ('x' means ('_' means unchanged) - line_0 line_1 line_2 line_3 don't care) - - (see Figure 3-7, p. 3-18, in Intel Embedded Pentium Processor Family Dev. - Manual, 1998, http://www.intel.com/design/intarch/manuals/273204.htm) - - -note that there is a 6-bit encoding for true LRU for four-way set associative - - bit 0: bank[1] more recently used than bank[0] - bit 1: bank[2] more recently used than bank[0] - bit 2: bank[2] more recently used than bank[1] - bit 3: bank[3] more recently used than bank[0] - bit 4: bank[3] more recently used than bank[1] - bit 5: bank[3] more recently used than bank[2] - - this results in 24 valid bit patterns within the 64 possible bit patterns - (4! possible valid traces for bank references) - - e.g., a trace of 0 1 2 3, where 0 is LRU and 3 is MRU, is encoded as 111111 - - you can implement a state machine with a 256x6 ROM (6-bit state encoding - appended with a 2-bit bank reference input will yield a new 6-bit state), - and you can implement an LRU bank indicator with a 64x2 ROM - diff --git a/src/TLB/ariane/plru.py b/src/TLB/ariane/plru.py deleted file mode 100644 index a8db5c27..00000000 --- a/src/TLB/ariane/plru.py +++ /dev/null @@ -1,105 +0,0 @@ -from nmigen import Signal, Module, Cat, Const -from nmigen.hdl.ir import Elaboratable -from math import log2 - - -class PLRU(Elaboratable): - """ PLRU - Pseudo Least Recently Used Replacement - - PLRU-tree indexing: - lvl0 0 - / \ - / \ - lvl1 1 2 - / \ / \ - lvl2 3 4 5 6 - / \ /\/\ /\ - ... ... ... ... 
- """ - def __init__(self, entries): - self.entries = entries - self.lu_hit = Signal(entries) - self.replace_en_o = Signal(entries) - self.lu_access_i = Signal() - # Tree (bit per entry) - self.TLBSZ = 2*(self.entries-1) - self.plru_tree = Signal(self.TLBSZ) - self.plru_tree_o = Signal(self.TLBSZ) - - def elaborate(self, platform=None): - m = Module() - - # Just predefine which nodes will be set/cleared - # E.g. for a TLB with 8 entries, the for-loop is semantically - # equivalent to the following pseudo-code: - # unique case (1'b1) - # lu_hit[7]: plru_tree[0, 2, 6] = {1, 1, 1}; - # lu_hit[6]: plru_tree[0, 2, 6] = {1, 1, 0}; - # lu_hit[5]: plru_tree[0, 2, 5] = {1, 0, 1}; - # lu_hit[4]: plru_tree[0, 2, 5] = {1, 0, 0}; - # lu_hit[3]: plru_tree[0, 1, 4] = {0, 1, 1}; - # lu_hit[2]: plru_tree[0, 1, 4] = {0, 1, 0}; - # lu_hit[1]: plru_tree[0, 1, 3] = {0, 0, 1}; - # lu_hit[0]: plru_tree[0, 1, 3] = {0, 0, 0}; - # default: begin /* No hit */ end - # endcase - LOG_TLB = int(log2(self.entries)) - print(LOG_TLB) - for i in range(self.entries): - # we got a hit so update the pointer as it was least recently used - hit = Signal(reset_less=True) - m.d.comb += hit.eq(self.lu_hit[i] & self.lu_access_i) - with m.If(hit): - # Set the nodes to the values we would expect - for lvl in range(LOG_TLB): - idx_base = (1< MSB, lvl1 <=> MSB-1, ... - shift = LOG_TLB - lvl; - new_idx = Const(~((i >> (shift-1)) & 1), (1, False)) - plru_idx = idx_base + (i >> shift) - print ("plru", i, lvl, hex(idx_base), - plru_idx, shift, new_idx) - m.d.comb += self.plru_tree_o[plru_idx].eq(new_idx) - - # Decode tree to write enable signals - # Next for-loop basically creates the following logic for e.g. 
- # an 8 entry TLB (note: pseudo-code obviously): - # replace_en[7] = &plru_tree[ 6, 2, 0]; #plru_tree[0,2,6]=={1,1,1} - # replace_en[6] = &plru_tree[~6, 2, 0]; #plru_tree[0,2,6]=={1,1,0} - # replace_en[5] = &plru_tree[ 5,~2, 0]; #plru_tree[0,2,5]=={1,0,1} - # replace_en[4] = &plru_tree[~5,~2, 0]; #plru_tree[0,2,5]=={1,0,0} - # replace_en[3] = &plru_tree[ 4, 1,~0]; #plru_tree[0,1,4]=={0,1,1} - # replace_en[2] = &plru_tree[~4, 1,~0]; #plru_tree[0,1,4]=={0,1,0} - # replace_en[1] = &plru_tree[ 3,~1,~0]; #plru_tree[0,1,3]=={0,0,1} - # replace_en[0] = &plru_tree[~3,~1,~0]; #plru_tree[0,1,3]=={0,0,0} - # For each entry traverse the tree. If every tree-node matches - # the corresponding bit of the entry's index, this is - # the next entry to replace. - replace = [] - for i in range(self.entries): - en = [] - for lvl in range(LOG_TLB): - idx_base = (1< MSB, lvl1 <=> MSB-1, ... - shift = LOG_TLB - lvl; - new_idx = (i >> (shift-1)) & 1; - plru_idx = idx_base + (i>>shift) - plru = Signal(reset_less=True, - name="plru-%d-%d-%d" % (i, lvl, plru_idx)) - m.d.comb += plru.eq(self.plru_tree[plru_idx]) - # en &= plru_tree_q[idx_base + (i>>shift)] == new_idx; - if new_idx: - en.append(~plru) # yes inverted (using bool()) - else: - en.append(plru) # yes inverted (using bool()) - print ("plru", i, en) - # boolean logic manipulation: - # plru0 & plru1 & plru2 == ~(~plru0 | ~plru1 | ~plru2) - replace.append(~Cat(*en).bool()) - m.d.comb += self.replace_en_o.eq(Cat(*replace)) - - return m - - def ports(self): - return [self.entries, self.lu_hit, self.replace_en_o, - self.lu_access_i, self.plru_tree, self.plru_tree_o] diff --git a/src/TLB/ariane/ptw.py b/src/TLB/ariane/ptw.py deleted file mode 100644 index 4046c711..00000000 --- a/src/TLB/ariane/ptw.py +++ /dev/null @@ -1,556 +0,0 @@ -""" -# Copyright 2018 ETH Zurich and University of Bologna. 
-# Copyright and related rights are licensed under the Solderpad Hardware -# License, Version 0.51 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# or agreed to in writing, software, hardware and materials distributed under -# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Author: David Schaffenrath, TU Graz -# Author: Florian Zaruba, ETH Zurich -# Date: 24.4.2017 -# Description: Hardware-PTW - -/* verilator lint_off WIDTH */ -import ariane_pkg::*; - -see linux kernel source: - -* "arch/riscv/include/asm/page.h" -* "arch/riscv/include/asm/mmu_context.h" -* "arch/riscv/Kconfig" (CONFIG_PAGE_OFFSET) - -""" - -from nmigen import Const, Signal, Cat, Module, Elaboratable -from nmigen.hdl.ast import ArrayProxy -from nmigen.cli import verilog, rtlil -from math import log2 - - -DCACHE_SET_ASSOC = 8 -CONFIG_L1D_SIZE = 32*1024 -DCACHE_INDEX_WIDTH = int(log2(CONFIG_L1D_SIZE / DCACHE_SET_ASSOC)) -DCACHE_TAG_WIDTH = 56 - DCACHE_INDEX_WIDTH - -ASID_WIDTH = 8 - - -class DCacheReqI: - def __init__(self): - self.address_index = Signal(DCACHE_INDEX_WIDTH) - self.address_tag = Signal(DCACHE_TAG_WIDTH) - self.data_wdata = Signal(64) - self.data_req = Signal() - self.data_we = Signal() - self.data_be = Signal(8) - self.data_size = Signal(2) - self.kill_req = Signal() - self.tag_valid = Signal() - - def eq(self, inp): - res = [] - for (o, i) in zip(self.ports(), inp.ports()): - res.append(o.eq(i)) - return res - - def ports(self): - return [self.address_index, self.address_tag, - self.data_wdata, self.data_req, - self.data_we, self.data_be, self.data_size, - self.kill_req, self.tag_valid, - ] - -class DCacheReqO: - def __init__(self): - self.data_gnt = 
Signal() - self.data_rvalid = Signal() - self.data_rdata = Signal(64) # actually in PTE object format - - def eq(self, inp): - res = [] - for (o, i) in zip(self.ports(), inp.ports()): - res.append(o.eq(i)) - return res - - def ports(self): - return [self.data_gnt, self.data_rvalid, self.data_rdata] - - -class PTE: #(RecordObject): - def __init__(self): - self.v = Signal() - self.r = Signal() - self.w = Signal() - self.x = Signal() - self.u = Signal() - self.g = Signal() - self.a = Signal() - self.d = Signal() - self.rsw = Signal(2) - self.ppn = Signal(44) - self.reserved = Signal(10) - - def flatten(self): - return Cat(*self.ports()) - - def eq(self, x): - if isinstance(x, ArrayProxy): - res = [] - for o in self.ports(): - i = getattr(x, o.name) - res.append(i) - x = Cat(*res) - else: - x = x.flatten() - return self.flatten().eq(x) - - def __iter__(self): - """ order is critical so that flatten creates LSB to MSB - """ - yield self.v - yield self.r - yield self.w - yield self.x - yield self.u - yield self.g - yield self.a - yield self.d - yield self.rsw - yield self.ppn - yield self.reserved - - def ports(self): - return list(self) - - -class TLBUpdate: - def __init__(self, asid_width): - self.valid = Signal() # valid flag - self.is_2M = Signal() - self.is_1G = Signal() - self.is_512G = Signal() - self.vpn = Signal(36) - self.asid = Signal(asid_width) - self.content = PTE() - - def flatten(self): - return Cat(*self.ports()) - - def eq(self, x): - return self.flatten().eq(x.flatten()) - - def ports(self): - return [self.valid, self.is_2M, self.is_1G, self.vpn, self.asid] + \ - self.content.ports() - - -# SV48 defines four levels of page tables -LVL1 = Const(0, 2) # defined to 0 so that ptw_lvl default-resets to LVL1 -LVL2 = Const(1, 2) -LVL3 = Const(2, 2) -LVL4 = Const(3, 2) - - -class PTW(Elaboratable): - def __init__(self, asid_width=8): - self.asid_width = asid_width - - self.flush_i = Signal() # flush everything, we need to do this because - # actually 
everything we do is speculative at this stage - # e.g.: there could be a CSR instruction that changes everything - self.ptw_active_o = Signal(reset=1) # active if not IDLE - self.walking_instr_o = Signal() # set when walking for TLB - self.ptw_error_o = Signal() # set when an error occurred - self.enable_translation_i = Signal() # CSRs indicate to enable SV48 - self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st - - self.lsu_is_store_i = Signal() # translation triggered by store - # PTW memory interface - self.req_port_i = DCacheReqO() - self.req_port_o = DCacheReqI() - - # to TLBs, update logic - self.itlb_update_o = TLBUpdate(asid_width) - self.dtlb_update_o = TLBUpdate(asid_width) - - self.update_vaddr_o = Signal(48) - - self.asid_i = Signal(self.asid_width) - # from TLBs - # did we miss? - self.itlb_access_i = Signal() - self.itlb_hit_i = Signal() - self.itlb_vaddr_i = Signal(64) - - self.dtlb_access_i = Signal() - self.dtlb_hit_i = Signal() - self.dtlb_vaddr_i = Signal(64) - # from CSR file - self.satp_ppn_i = Signal(44) # ppn from satp - self.mxr_i = Signal() - # Performance counters - self.itlb_miss_o = Signal() - self.dtlb_miss_o = Signal() - - def ports(self): - return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o, - ] - return [ - self.enable_translation_i, self.en_ld_st_translation_i, - self.lsu_is_store_i, self.req_port_i, self.req_port_o, - self.update_vaddr_o, - self.asid_i, - self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i, - self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i, - self.satp_ppn_i, self.mxr_i, - self.itlb_miss_o, self.dtlb_miss_o - ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports() - - def elaborate(self, platform): - m = Module() - - # input registers - data_rvalid = Signal() - data_rdata = Signal(64) - - # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata - # is spec'd in 64-bit binary-format: better to spec as Record? 
- pte = PTE() - m.d.comb += pte.flatten().eq(data_rdata) - - # SV48 defines four levels of page tables - ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above) - ptw_lvl1 = Signal() - ptw_lvl2 = Signal() - ptw_lvl3 = Signal() - ptw_lvl4 = Signal() - m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1), - ptw_lvl2.eq(ptw_lvl == LVL2), - ptw_lvl3.eq(ptw_lvl == LVL3), - ptw_lvl4.eq(ptw_lvl == LVL4) - ] - - # is this an instruction page table walk? - is_instr_ptw = Signal() - global_mapping = Signal() - # latched tag signal - tag_valid = Signal() - # register the ASID - tlb_update_asid = Signal(self.asid_width) - # register VPN we need to walk, SV48 defines a 48 bit virtual addr - vaddr = Signal(64) - # 4 byte aligned physical pointer - ptw_pptr = Signal(56) - - end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH - m.d.sync += [ - # Assignments - self.update_vaddr_o.eq(vaddr), - - self.walking_instr_o.eq(is_instr_ptw), - # directly output the correct physical address - self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]), - self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]), - # we are never going to kill this request - self.req_port_o.kill_req.eq(0), # XXX assign comb? - # we are never going to write with the HPTW - self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb? 
- # ----------- - # TLB Update - # ----------- - self.itlb_update_o.vpn.eq(vaddr[12:48]), - self.dtlb_update_o.vpn.eq(vaddr[12:48]), - # update the correct page table level - self.itlb_update_o.is_2M.eq(ptw_lvl3), - self.itlb_update_o.is_1G.eq(ptw_lvl2), - self.itlb_update_o.is_512G.eq(ptw_lvl1), - self.dtlb_update_o.is_2M.eq(ptw_lvl3), - self.dtlb_update_o.is_1G.eq(ptw_lvl2), - self.dtlb_update_o.is_512G.eq(ptw_lvl1), - - # output the correct ASID - self.itlb_update_o.asid.eq(tlb_update_asid), - self.dtlb_update_o.asid.eq(tlb_update_asid), - # set the global mapping bit - self.itlb_update_o.content.eq(pte), - self.itlb_update_o.content.g.eq(global_mapping), - self.dtlb_update_o.content.eq(pte), - self.dtlb_update_o.content.g.eq(global_mapping), - - self.req_port_o.tag_valid.eq(tag_valid), - ] - - #------------------- - # Page table walker #needs update - #------------------- - # A virtual address va is translated into a physical address pa as - # follows: - # 1. Let a be sptbr.ppn × PAGESIZE, and let i = LEVELS-1. (For Sv48, - # PAGESIZE=2^12 and LEVELS=4.) - # 2. Let pte be the value of the PTE at address a+va.vpn[i]×PTESIZE. - # (For Sv32, PTESIZE=4.) - # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an - # access exception. - # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to - # step 5. Otherwise, this PTE is a pointer to the next level of - # the page table. - # Let i=i-1. If i < 0, stop and raise an access exception. - # Otherwise, let a = pte.ppn × PAGESIZE and go to step 2. - # 5. A leaf PTE has been found. Determine if the requested memory - # access is allowed by the pte.r, pte.w, and pte.x bits. If not, - # stop and raise an access exception. Otherwise, the translation is - # successful. Set pte.a to 1, and, if the memory access is a - # store, set pte.d to 1. - # The translated physical address is given as follows: - # - pa.pgoff = va.pgoff. 
- # - If i > 0, then this is a superpage translation and - # pa.ppn[i-1:0] = va.vpn[i-1:0]. - # - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i]. - # 6. If i > 0 and pa.ppn[i − 1 : 0] != 0, this is a misaligned - # superpage stop and raise a page-fault exception. - - m.d.sync += tag_valid.eq(0) - - # default assignments - m.d.comb += [ - # PTW memory interface - self.req_port_o.data_req.eq(0), - self.req_port_o.data_be.eq(Const(0xFF, 8)), - self.req_port_o.data_size.eq(Const(0b11, 2)), - self.req_port_o.data_we.eq(0), - self.ptw_error_o.eq(0), - self.itlb_update_o.valid.eq(0), - self.dtlb_update_o.valid.eq(0), - - self.itlb_miss_o.eq(0), - self.dtlb_miss_o.eq(0), - ] - - # ------------ - # State Machine - # ------------ - - with m.FSM() as fsm: - - with m.State("IDLE"): - self.idle(m, is_instr_ptw, ptw_lvl, global_mapping, - ptw_pptr, vaddr, tlb_update_asid) - - with m.State("WAIT_GRANT"): - self.grant(m, tag_valid, data_rvalid) - - with m.State("PTE_LOOKUP"): - # we wait for the valid signal - with m.If(data_rvalid): - self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4, - data_rvalid, global_mapping, - is_instr_ptw, ptw_pptr) - - # Propagate error to MMU/LSU - with m.State("PROPAGATE_ERROR"): - m.next = "IDLE" - m.d.comb += self.ptw_error_o.eq(1) - - # wait for the rvalid before going back to IDLE - with m.State("WAIT_RVALID"): - with m.If(data_rvalid): - m.next = "IDLE" - - m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata), - data_rvalid.eq(self.req_port_i.data_rvalid) - ] - - return m - - def set_grant_state(self, m): - # should we have flushed before we got an rvalid, - # wait for it until going back to IDLE - with m.If(self.flush_i): - with m.If (self.req_port_i.data_gnt): - m.next = "WAIT_RVALID" - with m.Else(): - m.next = "IDLE" - with m.Else(): - m.next = "WAIT_GRANT" - - def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping, - ptw_pptr, vaddr, tlb_update_asid): - # by default we start with the top-most page table - m.d.sync += 
[is_instr_ptw.eq(0), - ptw_lvl.eq(LVL1), - global_mapping.eq(0), - self.ptw_active_o.eq(0), # deactive (IDLE) - ] - # work out itlb/dtlb miss - m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \ - self.itlb_access_i & \ - ~self.itlb_hit_i & \ - ~self.dtlb_access_i) - m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \ - self.dtlb_access_i & \ - ~self.dtlb_hit_i) - # we got an ITLB miss? - with m.If(self.itlb_miss_o): - pptr = Cat(Const(0, 3), self.itlb_vaddr_i[30:48], - self.satp_ppn_i) - m.d.sync += [ptw_pptr.eq(pptr), - is_instr_ptw.eq(1), - vaddr.eq(self.itlb_vaddr_i), - tlb_update_asid.eq(self.asid_i), - ] - self.set_grant_state(m) - - # we got a DTLB miss? - with m.Elif(self.dtlb_miss_o): - pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:48], - self.satp_ppn_i) - m.d.sync += [ptw_pptr.eq(pptr), - vaddr.eq(self.dtlb_vaddr_i), - tlb_update_asid.eq(self.asid_i), - ] - self.set_grant_state(m) - - def grant(self, m, tag_valid, data_rvalid): - # we've got a data WAIT_GRANT so tell the - # cache that the tag is valid - - # send a request out - m.d.comb += self.req_port_o.data_req.eq(1) - # wait for the WAIT_GRANT - with m.If(self.req_port_i.data_gnt): - # send the tag valid signal one cycle later - m.d.sync += tag_valid.eq(1) - # should we have flushed before we got an rvalid, - # wait for it until going back to IDLE - with m.If(self.flush_i): - with m.If (~data_rvalid): - m.next = "WAIT_RVALID" - with m.Else(): - m.next = "IDLE" - with m.Else(): - m.next = "PTE_LOOKUP" - - def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4, - data_rvalid, global_mapping, - is_instr_ptw, ptw_pptr): - # temporaries - pte_rx = Signal(reset_less=True) - pte_exe = Signal(reset_less=True) - pte_inv = Signal(reset_less=True) - pte_a = Signal(reset_less=True) - st_wd = Signal(reset_less=True) - m.d.comb += [pte_rx.eq(pte.r | pte.x), - pte_exe.eq(~pte.x | ~pte.a), - pte_inv.eq(~pte.v | (~pte.r & pte.w)), - pte_a.eq(pte.a & (pte.r | (pte.x & 
self.mxr_i))), - st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))] - - l1err = Signal(reset_less=True) - l2err = Signal(reset_less=True) - l3err = Signal(reset_less=True) - m.d.comb += [l3err.eq((ptw_lvl3) & pte.ppn[0:9] != Const(0,0)), - l2err.eq((ptw_lvl2) & pte.ppn[0:18] != Const(0, 18)), - l1err.eq((ptw_lvl1) & pte.ppn[0:27] != Const(0, 27))] - - # check if the global mapping bit is set - with m.If (pte.g): - m.d.sync += global_mapping.eq(1) - - m.next = "IDLE" - - # ------------- - # Invalid PTE - # ------------- - # If pte.v = 0, or if pte.r = 0 and pte.w = 1, - # stop and raise a page-fault exception. - with m.If (pte_inv): - m.next = "PROPAGATE_ERROR" - - # ----------- - # Valid PTE - # ----------- - - # it is a valid PTE - # if pte.r = 1 or pte.x = 1 it is a valid PTE - with m.Elif (pte_rx): - # Valid translation found (either 1G, 2M or 4K) - with m.If(is_instr_ptw): - # ------------ - # Update ITLB - # ------------ - # If page not executable, we can directly raise error. - # This doesn't put a useless entry into the TLB. - # The same idea applies to the access flag since we let - # the access flag be managed by SW. - with m.If (pte_exe): - m.next = "IDLE" - with m.Else(): - m.d.comb += self.itlb_update_o.valid.eq(1) - - with m.Else(): - # ------------ - # Update DTLB - # ------------ - # Check if the access flag has been set, otherwise - # throw page-fault and let software handle those bits. - # If page not readable (there are no write-only pages) - # directly raise an error. This doesn't put a useless - # entry into the TLB. 
- with m.If(pte_a): - m.d.comb += self.dtlb_update_o.valid.eq(1) - with m.Else(): - m.next = "PROPAGATE_ERROR" - # Request is a store: perform additional checks - # If the request was a store and the page not - # write-able, raise an error - # the same applies if the dirty flag is not set - with m.If (st_wd): - m.d.comb += self.dtlb_update_o.valid.eq(0) - m.next = "PROPAGATE_ERROR" - - # check if the ppn is correctly aligned: Case (6) - with m.If(l1err | l2err | l3err): - m.next = "PROPAGATE_ERROR" - m.d.comb += [self.dtlb_update_o.valid.eq(0), - self.itlb_update_o.valid.eq(0)] - - # this is a pointer to the next TLB level - with m.Else(): - # pointer to next level of page table - with m.If (ptw_lvl1): - # we are in the second level now - pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn) - m.d.sync += [ptw_pptr.eq(pptr), - ptw_lvl.eq(LVL2) - ] - with m.If(ptw_lvl2): - # here we received a pointer to the third level - pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn) - m.d.sync += [ptw_pptr.eq(pptr), - ptw_lvl.eq(LVL3) - ] - with m.If(ptw_lvl3): #guess: shift page levels by one - # here we received a pointer to the fourth level - # the last one is near the page offset - pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn) - m.d.sync += [ptw_pptr.eq(pptr), - ptw_lvl.eq(LVL4) - ] - self.set_grant_state(m) - - with m.If (ptw_lvl4): - # Should already be the last level - # page table => Error - m.d.sync += ptw_lvl.eq(LVL4) - m.next = "PROPAGATE_ERROR" - - -if __name__ == '__main__': - ptw = PTW() - vl = rtlil.convert(ptw, ports=ptw.ports()) - with open("test_ptw.il", "w") as f: - f.write(vl) diff --git a/src/TLB/ariane/test/test_plru.py b/src/TLB/ariane/test/test_plru.py deleted file mode 100644 index 68dcfa58..00000000 --- a/src/TLB/ariane/test/test_plru.py +++ /dev/null @@ -1,15 +0,0 @@ -import sys -sys.path.append("../src") -sys.path.append("../../../TestUtil") - -from TLB.ariane.plru import PLRU - -from nmigen.compat.sim import run_simulation 
- -def tbench(dut): - yield - -if __name__ == "__main__": - dut = PLRU(4) - run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd") - print("PLRU Unit Test Success") diff --git a/src/TLB/ariane/test/test_ptw.py b/src/TLB/ariane/test/test_ptw.py deleted file mode 100644 index b5deb28b..00000000 --- a/src/TLB/ariane/test/test_ptw.py +++ /dev/null @@ -1,130 +0,0 @@ -import sys -sys.path.append("../src") -sys.path.append("../../../TestUtil") - -from nmigen.compat.sim import run_simulation - -from TLB.ariane.ptw import PTW, PTE - -# unit was changed, test needs to be changed - -def tbench(dut): - - addr = 0x8000000 - - #pte = PTE() - #yield pte.v.eq(1) - #yield pte.r.eq(1) - - yield dut.req_port_i.data_gnt.eq(1) - yield dut.req_port_i.data_rvalid.eq(1) - yield dut.req_port_i.data_rdata.eq(0x43)#pte.flatten()) - - # data lookup - yield dut.en_ld_st_translation_i.eq(1) - yield dut.asid_i.eq(1) - - yield dut.dtlb_access_i.eq(1) - yield dut.dtlb_hit_i.eq(0) - yield dut.dtlb_vaddr_i.eq(0x400000000) - - yield - yield - yield - - yield dut.dtlb_access_i.eq(1) - yield dut.dtlb_hit_i.eq(0) - yield dut.dtlb_vaddr_i.eq(0x200000) - - yield - yield - yield - - yield dut.req_port_i.data_gnt.eq(0) - yield dut.dtlb_access_i.eq(1) - yield dut.dtlb_hit_i.eq(0) - yield dut.dtlb_vaddr_i.eq(0x400000011) - - yield - yield dut.req_port_i.data_gnt.eq(1) - yield - yield - - # data lookup, PTW levels 1-2-3 - addr = 0x4000000 - yield dut.dtlb_vaddr_i.eq(addr) - yield dut.mxr_i.eq(0x1) - yield dut.req_port_i.data_gnt.eq(1) - yield dut.req_port_i.data_rvalid.eq(1) - yield dut.req_port_i.data_rdata.eq(0x41 | (addr>>12)<<10)#pte.flatten()) - - yield dut.en_ld_st_translation_i.eq(1) - yield dut.asid_i.eq(1) - - yield dut.dtlb_access_i.eq(1) - yield dut.dtlb_hit_i.eq(0) - yield dut.dtlb_vaddr_i.eq(addr) - - yield - yield - yield - yield - yield - yield - yield - yield - - yield dut.req_port_i.data_gnt.eq(0) - yield dut.dtlb_access_i.eq(1) - yield dut.dtlb_hit_i.eq(0) - yield 
dut.dtlb_vaddr_i.eq(0x400000011) - - yield - yield dut.req_port_i.data_gnt.eq(1) - yield - yield - yield - yield - - - # instruction lookup - yield dut.en_ld_st_translation_i.eq(0) - yield dut.enable_translation_i.eq(1) - yield dut.asid_i.eq(1) - - yield dut.itlb_access_i.eq(1) - yield dut.itlb_hit_i.eq(0) - yield dut.itlb_vaddr_i.eq(0x800000) - - yield - yield - yield - - yield dut.itlb_access_i.eq(1) - yield dut.itlb_hit_i.eq(0) - yield dut.itlb_vaddr_i.eq(0x200000) - - yield - yield - yield - - yield dut.req_port_i.data_gnt.eq(0) - yield dut.itlb_access_i.eq(1) - yield dut.itlb_hit_i.eq(0) - yield dut.itlb_vaddr_i.eq(0x800011) - - yield - yield dut.req_port_i.data_gnt.eq(1) - yield - yield - - yield - - -def test_ptw(): - dut = PTW() - run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd") - print("PTW Unit Test Success") - -if __name__ == "__main__": - test_ptw() diff --git a/src/TLB/ariane/test/test_tlb.py b/src/TLB/ariane/test/test_tlb.py deleted file mode 100644 index b94438ff..00000000 --- a/src/TLB/ariane/test/test_tlb.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -sys.path.append("../src") -sys.path.append("../../../TestUtil") - -from nmigen.compat.sim import run_simulation - -from TLB.ariane.tlb import TLB - -def set_vaddr(addr): - yield dut.lu_vaddr_i.eq(addr) - yield dut.update_i.vpn.eq(addr>>12) - - -def tbench(dut): - yield dut.lu_access_i.eq(1) - yield dut.lu_asid_i.eq(1) - yield dut.update_i.valid.eq(1) - yield dut.update_i.is_1G.eq(0) - yield dut.update_i.is_2M.eq(0) - yield dut.update_i.asid.eq(1) - yield dut.update_i.content.ppn.eq(0) - yield dut.update_i.content.rsw.eq(0) - yield dut.update_i.content.r.eq(1) - - yield - - addr = 0x80000 - yield from set_vaddr(addr) - yield - - addr = 0x90001 - yield from set_vaddr(addr) - yield - - addr = 0x28000000 - yield from set_vaddr(addr) - yield - - addr = 0x28000001 - yield from set_vaddr(addr) - - addr = 0x28000001 - yield from set_vaddr(addr) - yield - - addr = 0x1000040000 - yield from 
set_vaddr(addr) - yield - - addr = 0x1000040001 - yield from set_vaddr(addr) - yield - - yield dut.update_i.is_1G.eq(1) - addr = 0x2040000 - yield from set_vaddr(addr) - yield - - yield dut.update_i.is_1G.eq(1) - addr = 0x2040001 - yield from set_vaddr(addr) - yield - - yield - - -if __name__ == "__main__": - dut = TLB() - run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd") - print("TLB Unit Test Success") diff --git a/src/TLB/ariane/test/test_tlb_content.py b/src/TLB/ariane/test/test_tlb_content.py deleted file mode 100644 index 145ded7d..00000000 --- a/src/TLB/ariane/test/test_tlb_content.py +++ /dev/null @@ -1,63 +0,0 @@ -import sys -sys.path.append("../src") -sys.path.append("../../../TestUtil") - -from nmigen.compat.sim import run_simulation - -from TLB.ariane.tlb_content import TLBContent -from TestUtil.test_helper import assert_op, assert_eq - -def update(dut,a,t,g,m): - yield dut.replace_en_i.eq(1) - yield dut.update_i.valid.eq(1) - yield dut.update_i.is_512G.eq(t) - yield dut.update_i.is_1G.eq(g) - yield dut.update_i.is_2M.eq(m) - yield dut.update_i.vpn.eq(a) - yield - yield - -def check_hit(dut,hit,pagesize): - hit_d = yield dut.lu_hit_o - assert_eq("hit", hit_d, hit) - - if(hit): - if(pagesize=="t"): - hitp = yield dut.lu_is_512G_o - assert_eq("lu_is_512G_o", hitp, 1) - elif(pagesize=="g"): - hitp = yield dut.lu_is_1G_o - assert_eq("lu_is_1G_o", hitp, 1) - elif(pagesize=="m"): - hitp = yield dut.lu_is_2M_o - assert_eq("lu_is_2M_o", hitp, 1) - -def addr(a,b,c,d): - return a | b << 9 | c << 18 | d << 27 - -def tbench(dut): - yield dut.vpn0.eq(0x0A) - yield dut.vpn1.eq(0x0B) - yield dut.vpn2.eq(0x0C) - yield dut.vpn3.eq(0x0D) - yield from update(dut,addr(0xFF,0xFF,0xFF,0x0D),1,0,0) - yield from check_hit(dut,1,"t") - - yield from update(dut,addr(0xFF,0xFF,0x0C,0x0D),0,1,0) - yield from check_hit(dut,1,"g") - - yield from update(dut,addr(0xFF,0x0B,0x0C,0x0D),0,0,1) - yield from check_hit(dut,1,"m") - - yield from 
update(dut,addr(0x0A,0x0B,0x0C,0x0D),0,0,0) - yield from check_hit(dut,1,"") - - yield from update(dut,addr(0xAA,0xBB,0xCC,0xDD),0,0,0) - yield from check_hit(dut,0,"miss") - - -if __name__ == "__main__": - dut = TLBContent(4,4) - # - run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd") - print("TLBContent Unit Test Success") diff --git a/src/TLB/ariane/tlb.py b/src/TLB/ariane/tlb.py deleted file mode 100644 index cf4af57a..00000000 --- a/src/TLB/ariane/tlb.py +++ /dev/null @@ -1,175 +0,0 @@ -""" -# Copyright 2018 ETH Zurich and University of Bologna. -# Copyright and related rights are licensed under the Solderpad Hardware -# License, Version 0.51 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# or agreed to in writing, software, hardware and materials distributed under -# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
-# -# Author: David Schaffenrath, TU Graz -# Author: Florian Zaruba, ETH Zurich -# Date: 21.4.2017 -# Description: Translation Lookaside Buffer, SV48 -# fully set-associative - -Implementation in c++: -https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp - -Text description: -https://people.cs.clemson.edu/~mark/464/p_lru.txt - -Online simulator: -http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html -""" -from math import log2 -from nmigen import Signal, Module, Cat, Const, Array, Elaboratable -from nmigen.cli import verilog, rtlil -from nmigen.lib.coding import Encoder - -from TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH -from TLB.ariane.plru import PLRU -from TLB.ariane.tlb_content import TLBContent - -TLB_ENTRIES = 8 - -class TLB(Elaboratable): - def __init__(self, tlb_entries=8, asid_width=8): - self.tlb_entries = tlb_entries - self.asid_width = asid_width - - self.flush_i = Signal() # Flush signal - # Lookup signals - self.lu_access_i = Signal() - self.lu_asid_i = Signal(self.asid_width) - self.lu_vaddr_i = Signal(64) - self.lu_content_o = PTE() - self.lu_is_2M_o = Signal() - self.lu_is_1G_o = Signal() - self.lu_is_512G_o = Signal() - self.lu_hit_o = Signal() - # Update TLB - self.pte_width = len(self.lu_content_o.flatten()) - self.update_i = TLBUpdate(asid_width) - - def elaborate(self, platform): - m = Module() - - vpn3 = Signal(9) #FIXME unused signal - vpn2 = Signal(9) - vpn1 = Signal(9) - vpn0 = Signal(9) - - #------------- - # Translation - #------------- - - # SV48 defines four levels of page tables - m.d.comb += [ vpn0.eq(self.lu_vaddr_i[12:21]), - vpn1.eq(self.lu_vaddr_i[21:30]), - vpn2.eq(self.lu_vaddr_i[30:39]), - vpn3.eq(self.lu_vaddr_i[39:48]), ### FIXME - ] - - tc = [] - for i in range(self.tlb_entries): - tlc = TLBContent(self.pte_width, self.asid_width) - setattr(m.submodules, "tc%d" % i, tlc) - tc.append(tlc) - # connect inputs - tlc.update_i = self.update_i # saves a lot of graphviz links - m.d.comb += 
[tlc.vpn0.eq(vpn0), - tlc.vpn1.eq(vpn1), - tlc.vpn2.eq(vpn2), - # TODO 4th - tlc.flush_i.eq(self.flush_i), - #tlc.update_i.eq(self.update_i), - tlc.lu_asid_i.eq(self.lu_asid_i)] - tc = Array(tc) - - #-------------- - # Select hit - #-------------- - - # use Encoder to select hit index - # XXX TODO: assert that there's only one valid entry (one lu_hit) - hitsel = Encoder(self.tlb_entries) - m.submodules.hitsel = hitsel - - hits = [] - for i in range(self.tlb_entries): - hits.append(tc[i].lu_hit_o) - m.d.comb += hitsel.i.eq(Cat(*hits)) # (goes into plru as well) - idx = hitsel.o - - active = Signal(reset_less=True) - m.d.comb += active.eq(~hitsel.n) - with m.If(active): - # active hit, send selected as output - m.d.comb += [ self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o), - self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o), - self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o), - self.lu_hit_o.eq(1), - self.lu_content_o.flatten().eq(tc[idx].lu_content_o), - ] - - #-------------- - # PLRU. - #-------------- - - p = PLRU(self.tlb_entries) - plru_tree = Signal(p.TLBSZ) - m.submodules.plru = p - - # connect PLRU inputs/outputs - # XXX TODO: assert that there's only one valid entry (one replace_en) - en = [] - for i in range(self.tlb_entries): - en.append(tc[i].replace_en_i) - m.d.comb += [Cat(*en).eq(p.replace_en_o), # output from PLRU into tags - p.lu_hit.eq(hitsel.i), - p.lu_access_i.eq(self.lu_access_i), - p.plru_tree.eq(plru_tree)] - m.d.sync += plru_tree.eq(p.plru_tree_o) - - #-------------- - # Sanity checks - #-------------- - - assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \ - "TLB size must be a multiple of 2 and greater than 1" - assert (self.asid_width >= 1), \ - "ASID width must be at least 1" - - return m - - """ - # Just for checking - function int countSetBits(logic[self.tlb_entries-1:0] vector); - automatic int count = 0; - foreach (vector[idx]) begin - count += vector[idx]; - end - return count; - endfunction - - assert property (@(posedge 
clk_i)(countSetBits(lu_hit) <= 1)) - else $error("More then one hit in TLB!"); $stop(); end - assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1)) - else $error("More then one TLB entry selected for next replace!"); - """ - - def ports(self): - return [self.flush_i, self.lu_access_i, - self.lu_asid_i, self.lu_vaddr_i, - self.lu_is_2M_o, self.lu_1G_o, self.lu_is_512G_o, self.lu_hit_o - ] + self.lu_content_o.ports() + self.update_i.ports() - -if __name__ == '__main__': - tlb = TLB() - vl = rtlil.convert(tlb, ports=tlb.ports()) - with open("test_tlb.il", "w") as f: - f.write(vl) - diff --git a/src/TLB/ariane/tlb_content.py b/src/TLB/ariane/tlb_content.py deleted file mode 100644 index 3384c885..00000000 --- a/src/TLB/ariane/tlb_content.py +++ /dev/null @@ -1,145 +0,0 @@ -from nmigen import Signal, Module, Cat, Const, Elaboratable - -from TLB.ariane.ptw import TLBUpdate, PTE - - -class TLBEntry: - def __init__(self, asid_width): - self.asid = Signal(asid_width,name="ent_asid") - # SV48 defines four levels of page tables - self.vpn0 = Signal(9,name="ent_vpn0") - self.vpn1 = Signal(9,name="ent_vpn1") - self.vpn2 = Signal(9,name="ent_vpn2") - self.vpn3 = Signal(9,name="ent_vpn3") - self.is_2M = Signal(name="ent_is_2M") - self.is_1G = Signal(name="ent_is_1G") - self.is_512G = Signal(name="ent_is_512G") - self.valid = Signal(name="ent_valid") - - def flatten(self): - return Cat(*self.ports()) - - def eq(self, x): - return self.flatten().eq(x.flatten()) - - def ports(self): - return [self.asid, self.vpn0, self.vpn1, self.vpn2, - self.is_2M, self.is_1G, self.valid] - - -class TLBContent(Elaboratable): - def __init__(self, pte_width, asid_width): - self.asid_width = asid_width - self.pte_width = pte_width - self.flush_i = Signal() # Flush signal - # Update TLB - self.update_i = TLBUpdate(asid_width) - self.vpn3 = Signal(9) - self.vpn2 = Signal(9) - self.vpn1 = Signal(9) - self.vpn0 = Signal(9) - self.replace_en_i = Signal() # replace the following entry, - # set 
by replacement strategy - # Lookup signals - self.lu_asid_i = Signal(asid_width) - self.lu_content_o = Signal(pte_width) - self.lu_is_512G_o = Signal() - self.lu_is_2M_o = Signal() - self.lu_is_1G_o = Signal() - self.lu_hit_o = Signal() - - def elaborate(self, platform): - m = Module() - - tags = TLBEntry(self.asid_width) - - - content = Signal(self.pte_width) - - m.d.comb += [self.lu_hit_o.eq(0), - self.lu_is_512G_o.eq(0), - self.lu_is_2M_o.eq(0), - self.lu_is_1G_o.eq(0)] - - # temporaries for lookup - asid_ok = Signal(reset_less=True) - # tags_ok = Signal(reset_less=True) - - vpn3_ok = Signal(reset_less=True) - vpn2_ok = Signal(reset_less=True) - vpn1_ok = Signal(reset_less=True) - vpn0_ok = Signal(reset_less=True) - - #tags_2M = Signal(reset_less=True) - vpn0_or_2M = Signal(reset_less=True) - - m.d.comb += [ - #compare asid and vpn* - asid_ok.eq(tags.asid == self.lu_asid_i), - vpn3_ok.eq(tags.vpn3 == self.vpn3), - vpn2_ok.eq(tags.vpn2 == self.vpn2), - vpn1_ok.eq(tags.vpn1 == self.vpn1), - vpn0_ok.eq(tags.vpn0 == self.vpn0), - vpn0_or_2M.eq(tags.is_2M | vpn0_ok) - ] - - - with m.If(asid_ok & tags.valid): - # first level, only vpn3 needs to match - with m.If (tags.is_512G & vpn3_ok): - m.d.comb += [ self.lu_content_o.eq(content), - self.lu_is_512G_o.eq(1), - self.lu_hit_o.eq(1), - ] - # second level , second level vpn2 and vpn3 need to match - with m.Elif (tags.is_1G & vpn2_ok & vpn3_ok): - m.d.comb += [ self.lu_content_o.eq(content), - self.lu_is_1G_o.eq(1), - self.lu_hit_o.eq(1), - ] - # not a giga page hit nor a tera page hit so check further - with m.Elif(vpn1_ok): - # this could be a 2 mega page hit or a 4 kB hit - # output accordingly - with m.If(vpn0_or_2M): - m.d.comb += [ self.lu_content_o.eq(content), - self.lu_is_2M_o.eq(tags.is_2M), - self.lu_hit_o.eq(1), - ] - # ------------------ - # Update or Flush - # ------------------ - - # temporaries - replace_valid = Signal(reset_less=True) - m.d.comb += replace_valid.eq(self.update_i.valid & 
self.replace_en_i) - - # flush - with m.If (self.flush_i): - # invalidate (flush) conditions: all if zero or just this ASID - with m.If (self.lu_asid_i == Const(0, self.asid_width) | - (self.lu_asid_i == tags.asid)): - m.d.sync += tags.valid.eq(0) - - # normal replacement - with m.Elif(replace_valid): - m.d.sync += [ # update tag array - tags.asid.eq(self.update_i.asid), - tags.vpn3.eq(self.update_i.vpn[27:36]), - tags.vpn2.eq(self.update_i.vpn[18:27]), - tags.vpn1.eq(self.update_i.vpn[9:18]), - tags.vpn0.eq(self.update_i.vpn[0:9]), - tags.is_512G.eq(self.update_i.is_512G), - tags.is_1G.eq(self.update_i.is_1G), - tags.is_2M.eq(self.update_i.is_2M), - tags.valid.eq(1), - # and content as well - content.eq(self.update_i.content.flatten()) - ] - return m - - def ports(self): - return [self.flush_i, - self.lu_asid_i, - self.lu_is_2M_o, self.lu_is_1G_o,self.lu_is_512G_o, self.lu_hit_o, - ] + self.update_i.content.ports() + self.update_i.ports() diff --git a/src/TLB/test/__init__.py b/src/TLB/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/TLB/test/test_LFSR2.py b/src/TLB/test/test_LFSR2.py deleted file mode 100644 index c05f55b7..00000000 --- a/src/TLB/test/test_LFSR2.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: LGPL-2.1-or-later -# See Notices.txt for copyright information -from TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3 - -from nmigen.back.pysim import Simulator, Delay, Tick -import unittest - - -class TestLFSR(unittest.TestCase): - def test_poly(self): - v = LFSRPolynomial() - self.assertEqual(repr(v), "LFSRPolynomial([0])") - self.assertEqual(str(v), "1") - v = LFSRPolynomial([1]) - self.assertEqual(repr(v), "LFSRPolynomial([1, 0])") - self.assertEqual(str(v), "x + 1") - v = LFSRPolynomial([0, 1]) - self.assertEqual(repr(v), "LFSRPolynomial([1, 0])") - self.assertEqual(str(v), "x + 1") - v = LFSRPolynomial([1, 2]) - self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])") - self.assertEqual(str(v), "x^2 + x + 
1") - v = LFSRPolynomial([2]) - self.assertEqual(repr(v), "LFSRPolynomial([2, 0])") - self.assertEqual(str(v), "x^2 + 1") - self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1") - - def test_lfsr_3(self): - module = LFSR(LFSR_POLY_3) - traces = [module.state, module.enable] - with Simulator(module, - vcd_file=open("Waveforms/test_LFSR2.vcd", "w"), - gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"), - traces=traces) as sim: - sim.add_clock(1e-6, 0.25e-6) - delay = Delay(1e-7) - - def async_process(): - yield module.enable.eq(0) - yield Tick() - self.assertEqual((yield module.state), 0x1) - yield Tick() - self.assertEqual((yield module.state), 0x1) - yield module.enable.eq(1) - yield Tick() - yield delay - self.assertEqual((yield module.state), 0x2) - yield Tick() - yield delay - self.assertEqual((yield module.state), 0x5) - yield Tick() - yield delay - self.assertEqual((yield module.state), 0x3) - yield Tick() - yield delay - self.assertEqual((yield module.state), 0x7) - yield Tick() - yield delay - self.assertEqual((yield module.state), 0x6) - yield Tick() - yield delay - self.assertEqual((yield module.state), 0x4) - yield Tick() - yield delay - self.assertEqual((yield module.state), 0x1) - yield Tick() - - sim.add_process(async_process) - sim.run() - diff --git a/src/TLB/test/test_address_encoder.py b/src/TLB/test/test_address_encoder.py deleted file mode 100644 index 0aad35b4..00000000 --- a/src/TLB/test/test_address_encoder.py +++ /dev/null @@ -1,105 +0,0 @@ -from nmigen.compat.sim import run_simulation -from TLB.AddressEncoder import AddressEncoder -from TestUtil.test_helper import assert_eq, assert_ne, assert_op - - -# This function allows for the easy setting of values to the AddressEncoder -# Arguments: -# dut: The AddressEncoder being tested -# i (Input): The array of single bits to be written -def set_encoder(dut, i): - yield dut.i.eq(i) - yield - -# Checks the single match of the AddressEncoder -# Arguments: -# dut: The AddressEncoder being tested -# sm 
(Single Match): The expected match result -# op (Operation): (0 => ==), (1 => !=) -def check_single_match(dut, sm, op): - out_sm = yield dut.single_match - assert_op("Single Match", out_sm, sm, op) - -# Checks the multiple match of the AddressEncoder -# Arguments: -# dut: The AddressEncoder being tested -# mm (Multiple Match): The expected match result -# op (Operation): (0 => ==), (1 => !=) -def check_multiple_match(dut, mm, op): - out_mm = yield dut.multiple_match - assert_op("Multiple Match", out_mm, mm, op) - -# Checks the output of the AddressEncoder -# Arguments: -# dut: The AddressEncoder being tested -# o (Output): The expected output -# op (Operation): (0 => ==), (1 => !=) -def check_output(dut, o, op): - out_o = yield dut.o - assert_op("Output", out_o, o, op) - -# Checks the state of the AddressEncoder -# Arguments: -# dut: The AddressEncoder being tested -# sm (Single Match): The expected match result -# mm (Multiple Match): The expected match result -# o (Output): The expected output -# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) -# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) -# o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) -def check_all(dut, sm, mm, o, sm_op, mm_op, o_op): - yield from check_single_match(dut, sm, sm_op) - yield from check_multiple_match(dut, mm, mm_op) - yield from check_output(dut, o, o_op) - -def tbench(dut): - # Check invalid input - in_val = 0b000 - single_match = 0 - multiple_match = 0 - output = 0 - yield from set_encoder(dut, in_val) - yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) - - # Check single bit - in_val = 0b001 - single_match = 1 - multiple_match = 0 - output = 0 - yield from set_encoder(dut, in_val) - yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) - - # Check another single bit - in_val = 0b100 - single_match = 1 - multiple_match = 0 - output = 2 - yield from set_encoder(dut, 
in_val) - yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) - - # Check multiple match - # We expected the lowest bit to be returned which is address 0 - in_val = 0b101 - single_match = 0 - multiple_match = 1 - output = 0 - yield from set_encoder(dut, in_val) - yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) - - # Check another multiple match - # We expected the lowest bit to be returned which is address 1 - in_val = 0b110 - single_match = 0 - multiple_match = 1 - output = 1 - yield from set_encoder(dut, in_val) - yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) - -def test_addr(): - dut = AddressEncoder(4) - run_simulation(dut, tbench(dut), - vcd_name="Waveforms/test_address_encoder.vcd") - print("AddressEncoder Unit Test Success") - -if __name__ == "__main__": - test_addr() diff --git a/src/TLB/test/test_cam.py b/src/TLB/test/test_cam.py deleted file mode 100644 index f11c48ad..00000000 --- a/src/TLB/test/test_cam.py +++ /dev/null @@ -1,206 +0,0 @@ -from nmigen.compat.sim import run_simulation - -from TLB.Cam import Cam - -from TestUtil.test_helper import assert_eq, assert_ne, assert_op - -# This function allows for the easy setting of values to the Cam -# Arguments: -# dut: The Cam being tested -# e (Enable): Whether the block is going to be enabled -# we (Write Enable): Whether the Cam will write on the next cycle -# a (Address): Where the data will be written if write enable is high -# d (Data): Either what we are looking for or will write to the address -def set_cam(dut, e, we, a, d): - yield dut.enable.eq(e) - yield dut.write_enable.eq(we) - yield dut.address_in.eq(a) - yield dut.data_in.eq(d) - yield - -# Checks the multiple match of the Cam -# Arguments: -# dut: The Cam being tested -# mm (Multiple Match): The expected match result -# op (Operation): (0 => ==), (1 => !=) -def check_multiple_match(dut, mm, op): - out_mm = yield dut.multiple_match - assert_op("Multiple Match", out_mm, 
mm, op) - -# Checks the single match of the Cam -# Arguments: -# dut: The Cam being tested -# sm (Single Match): The expected match result -# op (Operation): (0 => ==), (1 => !=) -def check_single_match(dut, sm, op): - out_sm = yield dut.single_match - assert_op("Single Match", out_sm, sm, op) - -# Checks the address output of the Cam -# Arguments: -# dut: The Cam being tested -# ma (Match Address): The expected match result -# op (Operation): (0 => ==), (1 => !=) -def check_match_address(dut, ma, op): - out_ma = yield dut.match_address - assert_op("Match Address", out_ma, ma, op) - -# Checks the state of the Cam -# Arguments: -# dut: The Cam being tested -# sm (Single Match): The expected match result -# mm (Multiple Match): The expected match result -# ma: (Match Address): The expected address output -# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) -# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) -# ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=) -def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op): - yield from check_multiple_match(dut, mm, mm_op) - yield from check_single_match(dut, sm, sm_op) - yield from check_match_address(dut, ma, ma_op) - -def tbench(dut): - # NA - enable = 0 - write_enable = 0 - address = 0 - data = 0 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_single_match(dut, single_match, 0) - - # Read Miss Multiple - # Note that the default starting entry data bits are all 0 - enable = 1 - write_enable = 0 - address = 0 - data = 0 - multiple_match = 1 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_multiple_match(dut, multiple_match, 0) - - # Read Miss - # Note that the default starting entry data bits are all 0 - enable = 1 - write_enable = 0 - address = 0 - data = 1 - multiple_match = 0 - single_match = 0 - yield from set_cam(dut, enable, 
write_enable, address, data) - yield - yield from check_single_match(dut, single_match, 0) - - # Write Entry 0 - enable = 1 - write_enable = 1 - address = 0 - data = 4 - multiple_match = 0 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_single_match(dut, single_match, 0) - - # Read Hit Entry 0 - enable = 1 - write_enable = 0 - address = 0 - data = 4 - multiple_match = 0 - single_match = 1 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0) - - # Search Hit - enable = 1 - write_enable = 0 - address = 0 - data = 4 - multiple_match = 0 - single_match = 1 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0) - - # Search Miss - enable = 1 - write_enable = 0 - address = 0 - data = 5 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_single_match(dut, single_match, 0) - - # Multiple Match test - # Write Entry 1 - enable = 1 - write_enable = 1 - address = 1 - data = 5 - multiple_match = 0 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_single_match(dut, single_match, 0) - - # Write Entry 2 - # Same data as Entry 1 - enable = 1 - write_enable = 1 - address = 2 - data = 5 - multiple_match = 0 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_single_match(dut, single_match, 0) - - # Read Hit Data 5 - enable = 1 - write_enable = 0 - address = 1 - data = 5 - multiple_match = 1 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_all(dut, multiple_match, single_match, address,0,0,0) - - # Verify read_warning is not caused - # Write Entry 0 - enable = 1 - write_enable = 1 - address = 0 - data = 7 - 
multiple_match = 0 - single_match = 0 - yield from set_cam(dut, enable, write_enable, address, data) - # Note there is no yield we immediately attempt to read in the next cycle - - # Read Hit Data 7 - enable = 1 - write_enable = 0 - address = 0 - data = 7 - multiple_match = 0 - single_match = 1 - yield from set_cam(dut, enable, write_enable, address, data) - yield - yield from check_single_match(dut, single_match, 0) - - yield - - -def test_cam(): - dut = Cam(4, 4) - run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd") - print("Cam Unit Test Success") - -if __name__ == "__main__": - test_cam() diff --git a/src/TLB/test/test_cam_entry.py b/src/TLB/test/test_cam_entry.py deleted file mode 100644 index 43b699d2..00000000 --- a/src/TLB/test/test_cam_entry.py +++ /dev/null @@ -1,110 +0,0 @@ -from nmigen.compat.sim import run_simulation - -from TestUtil.test_helper import assert_eq, assert_ne, assert_op -from TLB.CamEntry import CamEntry - -# This function allows for the easy setting of values to the Cam Entry -# Arguments: -# dut: The CamEntry being tested -# c (command): NA (0), Read (1), Write (2), Reserve (3) -# d (data): The data to be set -def set_cam_entry(dut, c, d): - # Write desired values - yield dut.command.eq(c) - yield dut.data_in.eq(d) - yield - # Reset all lines - yield dut.command.eq(0) - yield dut.data_in.eq(0) - yield - -# Checks the data state of the CAM entry -# Arguments: -# dut: The CamEntry being tested -# d (Data): The expected data -# op (Operation): (0 => ==), (1 => !=) -def check_data(dut, d, op): - out_d = yield dut.data - assert_op("Data", out_d, d, op) - -# Checks the match state of the CAM entry -# Arguments: -# dut: The CamEntry being tested -# m (Match): The expected match -# op (Operation): (0 => ==), (1 => !=) -def check_match(dut, m, op): - out_m = yield dut.match - assert_op("Match", out_m, m, op) - -# Checks the state of the CAM entry -# Arguments: -# dut: The CamEntry being tested -# d (data): The expected data -# m 
(match): The expected match -# d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=) -# m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) -def check_all(dut, d, m, d_op, m_op): - yield from check_data(dut, d, d_op) - yield from check_match(dut, m, m_op) - -# This tbench goes through the paces of testing the CamEntry module -# It is done by writing and then reading various combinations of key/data pairs -# and reading the results with varying keys to verify the resulting stored -# data is correct. -def tbench(dut): - # Check write - command = 2 - data = 1 - match = 0 - yield from set_cam_entry(dut, command, data) - yield from check_all(dut, data, match, 0, 0) - - # Check read miss - command = 1 - data = 2 - match = 0 - yield from set_cam_entry(dut, command, data) - yield from check_all(dut, data, match, 1, 0) - - # Check read hit - command = 1 - data = 1 - match = 1 - yield from set_cam_entry(dut, command, data) - yield from check_all(dut, data, match, 0, 0) - - # Check overwrite - command = 2 - data = 5 - match = 0 - yield from set_cam_entry(dut, command, data) - yield - yield from check_all(dut, data, match, 0, 0) - - # Check read hit - command = 1 - data = 5 - match = 1 - yield from set_cam_entry(dut, command, data) - yield from check_all(dut, data, match, 0, 0) - - # Check reset - command = 3 - data = 0 - match = 0 - yield from set_cam_entry(dut, command, data) - yield from check_all(dut, data, match, 0, 0) - - # Extra clock cycle for waveform - yield - - -def test_camentry(): - dut = CamEntry(4) - run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd") - print("CamEntry Unit Test Success") - - -if __name__ == "__main__": - test_camentry() - diff --git a/src/TLB/test/test_permission_validator.py b/src/TLB/test/test_permission_validator.py deleted file mode 100644 index 81873d79..00000000 --- a/src/TLB/test/test_permission_validator.py +++ /dev/null @@ -1,146 +0,0 @@ -from nmigen.compat.sim import 
run_simulation - -from TLB.PermissionValidator import PermissionValidator - -from TestUtil.test_helper import assert_op - - -def set_validator(dut, d, xwr, sm, sa, asid): - yield dut.data.eq(d) - yield dut.xwr.eq(xwr) - yield dut.super_mode.eq(sm) - yield dut.super_access.eq(sa) - yield dut.asid.eq(asid) - yield - -def check_valid(dut, v, op): - out_v = yield dut.valid - assert_op("Valid", out_v, v, op) - -def tbench(dut): - # 80 bits represented. Ignore the MSB as it will be truncated - # ASID is bits first 4 hex values (bits 64 - 78) - - # Test user mode entry valid - # Global Bit matching ASID - # Ensure that user mode and valid is enabled! - data = 0x7FFF0000000000000031 - # Ignore MSB it will be truncated - asid = 0x7FFF - super_mode = 0 - super_access = 0 - xwr = 0 - valid = 1 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test user mode entry valid - # Global Bit nonmatching ASID - # Ensure that user mode and valid is enabled! - data = 0x7FFF0000000000000031 - # Ignore MSB it will be truncated - asid = 0x7FF6 - super_mode = 0 - super_access = 0 - xwr = 0 - valid = 1 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test user mode entry invalid - # Global Bit nonmatching ASID - # Ensure that user mode and valid is enabled! - data = 0x7FFF0000000000000021 - # Ignore MSB it will be truncated - asid = 0x7FF6 - super_mode = 0 - super_access = 0 - xwr = 0 - valid = 0 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test user mode entry valid - # Ensure that user mode and valid is enabled! 
- data = 0x7FFF0000000000000011 - # Ignore MSB it will be truncated - asid = 0x7FFF - super_mode = 0 - super_access = 0 - xwr = 0 - valid = 1 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test user mode entry invalid - # Ensure that user mode and valid is enabled! - data = 0x7FFF0000000000000011 - # Ignore MSB it will be truncated - asid = 0x7FF6 - super_mode = 0 - super_access = 0 - xwr = 0 - valid = 0 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test supervisor mode entry valid - # The entry is NOT in user mode - # Ensure that user mode and valid is enabled! - data = 0x7FFF0000000000000001 - # Ignore MSB it will be truncated - asid = 0x7FFF - super_mode = 1 - super_access = 0 - xwr = 0 - valid = 1 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test supervisor mode entry invalid - # The entry is in user mode - # Ensure that user mode and valid is enabled! - data = 0x7FFF0000000000000011 - # Ignore MSB it will be truncated - asid = 0x7FFF - super_mode = 1 - super_access = 0 - xwr = 0 - valid = 0 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test supervisor mode entry valid - # The entry is NOT in user mode with access - # Ensure that user mode and valid is enabled! - data = 0x7FFF0000000000000001 - # Ignore MSB it will be truncated - asid = 0x7FFF - super_mode = 1 - super_access = 1 - xwr = 0 - valid = 1 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - # Test supervisor mode entry valid - # The entry is in user mode with access - # Ensure that user mode and valid is enabled! 
- data = 0x7FFF0000000000000011 - # Ignore MSB it will be truncated - asid = 0x7FFF - super_mode = 1 - super_access = 1 - xwr = 0 - valid = 1 - yield from set_validator(dut, data, xwr, super_mode, super_access, asid) - yield from check_valid(dut, valid, 0) - - -def test_permv(): - dut = PermissionValidator(15, 64); - run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_permission_validator.vcd") - print("PermissionValidator Unit Test Success") - -if __name__ == "__main__": - test_permv() diff --git a/src/TLB/test/test_pte_entry.py b/src/TLB/test/test_pte_entry.py deleted file mode 100644 index 5c0c34dc..00000000 --- a/src/TLB/test/test_pte_entry.py +++ /dev/null @@ -1,102 +0,0 @@ -from nmigen.compat.sim import run_simulation - -from TLB.PteEntry import PteEntry - -from TestUtil.test_helper import assert_op - -def set_entry(dut, i): - yield dut.i.eq(i) - yield - -def check_dirty(dut, d, op): - out_d = yield dut.d - assert_op("Dirty", out_d, d, op) - -def check_accessed(dut, a, op): - out_a = yield dut.a - assert_op("Accessed", out_a, a, op) - -def check_global(dut, o, op): - out = yield dut.g - assert_op("Global", out, o, op) - -def check_user(dut, o, op): - out = yield dut.u - assert_op("User Mode", out, o, op) - -def check_xwr(dut, o, op): - out = yield dut.xwr - assert_op("XWR", out, o, op) - -def check_asid(dut, o, op): - out = yield dut.asid - assert_op("ASID", out, o, op) - -def check_pte(dut, o, op): - out = yield dut.pte - assert_op("ASID", out, o, op) - -def check_valid(dut, v, op): - out_v = yield dut.v - assert_op("Valid", out_v, v, op) - -def check_all(dut, d, a, g, u, xwr, v, asid, pte): - yield from check_dirty(dut, d, 0) - yield from check_accessed(dut, a, 0) - yield from check_global(dut, g, 0) - yield from check_user(dut, u, 0) - yield from check_xwr(dut, xwr, 0) - yield from check_asid(dut, asid, 0) - yield from check_pte(dut, pte, 0) - yield from check_valid(dut, v, 0) - -def tbench(dut): - # 80 bits represented. 
Ignore the MSB as it will be truncated - # ASID is bits first 4 hex values (bits 64 - 78) - - i = 0x7FFF0000000000000031 - dirty = 0 - access = 0 - glob = 1 - user = 1 - xwr = 0 - valid = 1 - asid = 0x7FFF - pte = 0x0000000000000031 - yield from set_entry(dut, i) - yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte) - - i = 0x0FFF00000000000000FF - dirty = 1 - access = 1 - glob = 1 - user = 1 - xwr = 7 - valid = 1 - asid = 0x0FFF - pte = 0x00000000000000FF - yield from set_entry(dut, i) - yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte) - - i = 0x0721000000001100001F - dirty = 0 - access = 0 - glob = 0 - user = 1 - xwr = 7 - valid = 1 - asid = 0x0721 - pte = 0x000000001100001F - yield from set_entry(dut, i) - yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte) - - yield - - -def test_pteentry(): - dut = PteEntry(15, 64); - run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd") - print("PteEntry Unit Test Success") - -if __name__ == "__main__": - test_pteentry() diff --git a/src/TLB/test/test_set_associative_cache.py b/src/TLB/test/test_set_associative_cache.py deleted file mode 100644 index 0641b556..00000000 --- a/src/TLB/test/test_set_associative_cache.py +++ /dev/null @@ -1,38 +0,0 @@ -from nmigen.compat.sim import run_simulation - -from TLB.SetAssociativeCache import SetAssociativeCache - -from TestUtil.test_helper import assert_eq, assert_ne, assert_op - -def set_sac(dut, e, c, s, t, d): - yield dut.enable.eq(e) - yield dut.command.eq(c) - yield dut.cset.eq(s) - yield dut.tag.eq(t) - yield dut.data_i.eq(d) - yield - -def tbench(dut): - enable = 1 - command = 2 - cset = 1 - tag = 2 - data = 3 - yield from set_sac(dut, enable, command, cset, tag, data) - yield - - enable = 1 - command = 2 - cset = 1 - tag = 5 - data = 8 - yield from set_sac(dut, enable, command, cset, tag, data) - yield - -def test_assoc_cache(): - dut = SetAssociativeCache(4, 4, 4, 4) - 
run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_set_associative_cache.vcd") - print("Set Associative Cache Unit Test Success") - -if __name__ == "__main__": - test_assoc_cache() diff --git a/src/TLB/test/test_tlb.py b/src/TLB/test/test_tlb.py deleted file mode 100644 index e9cc9d69..00000000 --- a/src/TLB/test/test_tlb.py +++ /dev/null @@ -1,80 +0,0 @@ -#import tracemalloc -#tracemalloc.start() - -from nmigen.compat.sim import run_simulation - -from TLB.TLB import TLB - -from TestUtil.test_helper import assert_op, assert_eq - -#self.supermode = Signal(1) # Supervisor Mode -#self.super_access = Signal(1) # Supervisor Access -#self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2 -#self.xwr = Signal(3) # Execute, Write, Read -#self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64 -#self.address_L1 = Signal(max=L1_size) -#self.asid = Signal(asid_size) # Address Space IDentifier (ASID) -#self.vma = Signal(vma_size) # Virtual Memory Address (VMA) -#self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE) -# -#self.hit = Signal(1) # Denotes if the VMA had a mapped PTE -#self.perm_valid = Signal(1) # Denotes if the permissions are correct -#self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA - -COMMAND_READ=1 -COMMAND_WRITE_L1=2 - -# Checks the data state of the CAM entry -# Arguments: -# dut: The CamEntry being tested -# d (Data): The expected data -# op (Operation): (0 => ==), (1 => !=) -def check_hit(dut, d): - hit_d = yield dut.hit - #assert_eq("hit", hit_d, d) - -def test_command(dut,cmd,xwr,cycles): - yield dut.command.eq(cmd) - yield dut.xwr.eq(xwr) - for i in range(0,cycles): - yield - -def test_write_L1(dut,vma,address_L1,asid,pte_in): - yield dut.address_L1.eq(address_L1) - yield dut.asid.eq(asid) - yield dut.vma.eq(vma) - yield dut.pte_in.eq(pte_in) - yield from test_command(dut,COMMAND_WRITE_L1,7,2) - -def test_search(dut,vma,found): - yield dut.vma.eq(vma) - yield from 
test_command(dut,COMMAND_READ,7,1) - yield from check_hit(dut,found) - -def zero(dut): - yield dut.supermode.eq(0) - yield dut.super_access.eq(0) - yield dut.mode.eq(0) - yield dut.address_L1.eq(0) - yield dut.asid.eq(0) - yield dut.vma.eq(0) - yield dut.pte_in.eq(0) - -def tbench(dut): - yield from zero(dut) - yield dut.mode.eq(0xF) # enable TLB - #test hit - yield from test_write_L1(dut,0xFEEDFACE,0,0xFFFF,0xF0F0) - yield from test_search(dut,0xFEEDFACE,1) - yield from test_search(dut,0xFACEFEED,0) - - - - -def test_tlb(): - dut = TLB(15,36,64,8) - run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd") - print("TLB Unit Test Success") - -if __name__ == "__main__": - test_tlb() diff --git a/src/TestUtil/test_helper.py b/src/TestUtil/test_helper.py deleted file mode 100644 index c42990d6..00000000 --- a/src/TestUtil/test_helper.py +++ /dev/null @@ -1,30 +0,0 @@ -def assert_op(pre, o, e, op): - """ Verifies the given values given the particular operand - Arguments: - p (Prefix): Appended to the front of the assert statement - e (Expected): The expected value - o (Output): The output result - op (Operation): (0 => ==), (1 => !=) - """ - if op == 0: - assert_eq(pre, o, e) - else: - assert_ne(pre, o, e) - -def assert_eq(p, o, e): - """ Verifies the given values are equal - Arguments: - p (Prefix): Appended to the front of the assert statement - e (Expected): The expected value - o (Output): The output result - """ - assert o == e, p + " Output " + str(o) + " Expected " + str(e) - -def assert_ne(p, o, e): - """ Verifies the given values are not equal - Arguments: - p (Prefix): Appended to the front of the assert statement - e (Expected): The expected value - o (Output): The output result - """ - assert o != e, p + " Output " + str(o) + " Not Expecting " + str(e) diff --git a/src/decoder/.gitignore b/src/decoder/.gitignore deleted file mode 100644 index afed0735..00000000 --- a/src/decoder/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.csv diff --git 
a/src/decoder/power_decoder.py b/src/decoder/power_decoder.py deleted file mode 100644 index 5b5e7103..00000000 --- a/src/decoder/power_decoder.py +++ /dev/null @@ -1,275 +0,0 @@ -"""Cascading Power ISA Decoder - -This module uses CSV tables in a hierarchical/peer cascading fashion, -to create a multi-level instruction decoder by recognising appropriate -patterns. The output is a flattened (1-level) series of fields suitable -for a simple RISC engine. - -This is based on Anton Blanchard's excellent microwatt work: -https://github.com/antonblanchard/microwatt/blob/master/decode1.vhdl - -The basic principle is that the python code does the heavy lifting -(reading the CSV files, constructing the hierarchy), creating the HDL -AST with for-loops generating switch-case statements. - -PowerDecoder takes a *list* of CSV files with an associated bit-range -that it is requested to match against the "opcode" row of the CSV file. -This pattern can be either an integer, a binary number, *or* a wildcard -nmigen Case pattern of the form "001--1-100". - -Subdecoders are *additional* cases with further decoding. The "pattern" -argument is specified as one of the Case statements (a peer of the opcode -row in the CSV file), and thus further fields of the opcode may be decoded -giving increasing levels of detail. - -Top Level: - - [ (extra.csv: bit-fields entire 32-bit range - opcode -> matches - 000000---------------01000000000 -> ILLEGAL instruction - 01100000000000000000000000000000 -> SIM_CONFIG instruction - ................................ -> - ), - (major.csv: first 6 bits ONLY - opcode -> matches - 001100 -> ALU,OP_ADD (add) - 001101 -> ALU,OP_ADD (another type of add) - ...... -> ... - ...... -> ... - subdecoders: - 001011 this must match *MAJOR*.CSV - [ (minor_19.csv: bits 21 through 30 inclusive: - opcode -> matches - 0b0000000000 -> ALU,OP_MCRF - ............ -> .... 
- ), - (minor_19_00000.csv: bits 21 through 25 inclusive: - opcode -> matches - 0b00010 -> ALU,add_pcis - ) - ] - ), - ] - -""" - -from nmigen import Module, Elaboratable, Signal -from nmigen.cli import rtlil -from power_enums import (Function, Form, InternalOp, In1Sel, In2Sel, In3Sel, - OutSel, RC, LdstLen, CryIn, get_csv, single_bit_flags, - get_signal_name, default_values) -from collections import namedtuple -from power_fields import DecodeFields -from power_fieldsn import SigDecode, SignalBitRange - -Subdecoder = namedtuple("Subdecoder", ["pattern", "opcodes", "opint", - "bitsel", "suffix", "subdecoders"]) - - -class PowerOp: - """PowerOp: spec for execution. op type (ADD etc.) reg specs etc. - """ - - def __init__(self): - self.function_unit = Signal(Function, reset_less=True) - self.internal_op = Signal(InternalOp, reset_less=True) - self.form = Signal(Form, reset_less=True) - self.in1_sel = Signal(In1Sel, reset_less=True) - self.in2_sel = Signal(In2Sel, reset_less=True) - self.in3_sel = Signal(In3Sel, reset_less=True) - self.out_sel = Signal(OutSel, reset_less=True) - self.ldst_len = Signal(LdstLen, reset_less=True) - self.rc_sel = Signal(RC, reset_less=True) - self.cry_in = Signal(CryIn, reset_less=True) - for bit in single_bit_flags: - name = get_signal_name(bit) - setattr(self, name, Signal(reset_less=True, name=name)) - - def _eq(self, row=None): - if row is None: - row = default_values - res = [self.function_unit.eq(Function[row['unit']]), - self.form.eq(Form[row['form']]), - self.internal_op.eq(InternalOp[row['internal op']]), - self.in1_sel.eq(In1Sel[row['in1']]), - self.in2_sel.eq(In2Sel[row['in2']]), - self.in3_sel.eq(In3Sel[row['in3']]), - self.out_sel.eq(OutSel[row['out']]), - self.ldst_len.eq(LdstLen[row['ldst len']]), - self.rc_sel.eq(RC[row['rc']]), - self.cry_in.eq(CryIn[row['cry in']]), - ] - for bit in single_bit_flags: - sig = getattr(self, get_signal_name(bit)) - res.append(sig.eq(int(row.get(bit, 0)))) - return res - - def eq(self, 
otherop): - res = [self.function_unit.eq(otherop.function_unit), - self.form.eq(otherop.form), - self.internal_op.eq(otherop.internal_op), - self.in1_sel.eq(otherop.in1_sel), - self.in2_sel.eq(otherop.in2_sel), - self.in3_sel.eq(otherop.in3_sel), - self.out_sel.eq(otherop.out_sel), - self.rc_sel.eq(otherop.rc_sel), - self.ldst_len.eq(otherop.ldst_len), - self.cry_in.eq(otherop.cry_in)] - for bit in single_bit_flags: - sig = getattr(self, get_signal_name(bit)) - res.append(sig.eq(getattr(otherop, get_signal_name(bit)))) - return res - - def ports(self): - regular = [self.function_unit, - self.in1_sel, - self.in2_sel, - self.in3_sel, - self.out_sel, - self.ldst_len, - self.rc_sel, - self.internal_op, - self.form] - single_bit_ports = [getattr(self, get_signal_name(x)) - for x in single_bit_flags] - return regular + single_bit_ports - - -class PowerDecoder(Elaboratable): - """PowerDecoder - decodes an incoming opcode into the type of operation - """ - - def __init__(self, width, dec): - if not isinstance(dec, list): - dec = [dec] - self.dec = dec - self.opcode_in = Signal(width, reset_less=True) - - self.op = PowerOp() - for d in dec: - if d.suffix is not None and d.suffix >= width: - d.suffix = None - self.width = width - - def suffix_mask(self, d): - return ((1 << d.suffix) - 1) - - def divide_opcodes(self, d): - divided = {} - mask = self.suffix_mask(d) - print("mask", hex(mask)) - for row in d.opcodes: - opcode = row['opcode'] - if d.opint and '-' not in opcode: - opcode = int(opcode, 0) - key = opcode & mask - opcode = opcode >> d.suffix - if key not in divided: - divided[key] = [] - r = row.copy() - r['opcode'] = opcode - divided[key].append(r) - return divided - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # note: default opcode is "illegal" as this is a combinatorial block - - # go through the list of CSV decoders first - for d in self.dec: - opcode_switch = Signal(d.bitsel[1] - d.bitsel[0], - reset_less=True) - comb += 
opcode_switch.eq(self.opcode_in[d.bitsel[0]:d.bitsel[1]]) - if d.suffix: - opcodes = self.divide_opcodes(d) - opc_in = Signal(d.suffix, reset_less=True) - comb += opc_in.eq(opcode_switch[:d.suffix]) - with m.Switch(opc_in): - for key, row in opcodes.items(): - bitsel = (d.suffix+d.bitsel[0], d.bitsel[1]) - sd = Subdecoder(pattern=None, opcodes=row, - bitsel=bitsel, suffix=None, - opint=False, subdecoders=[]) - subdecoder = PowerDecoder(width=32, dec=sd) - setattr(m.submodules, "dec_sub%d" % key, subdecoder) - comb += subdecoder.opcode_in.eq(self.opcode_in) - with m.Case(key): - comb += self.op.eq(subdecoder.op) - else: - # TODO: arguments, here (all of them) need to be a list. - # a for-loop around the *list* of decoder args. - with m.Switch(opcode_switch): - self.handle_subdecoders(m, d) - for row in d.opcodes: - opcode = row['opcode'] - if d.opint and '-' not in opcode: - opcode = int(opcode, 0) - if not row['unit']: - continue - with m.Case(opcode): - comb += self.op._eq(row) - return m - - def handle_subdecoders(self, m, d): - for dec in d.subdecoders: - subdecoder = PowerDecoder(self.width, dec) - if isinstance(dec, list): # XXX HACK: take first pattern - dec = dec[0] - setattr(m.submodules, "dec%d" % dec.pattern, subdecoder) - m.d.comb += subdecoder.opcode_in.eq(self.opcode_in) - with m.Case(dec.pattern): - m.d.comb += self.op.eq(subdecoder.op) - - def ports(self): - return [self.opcode_in] + self.op.ports() - - -class TopPowerDecoder(PowerDecoder, DecodeFields): - - def __init__(self, width, dec): - PowerDecoder.__init__(self, width, dec) - DecodeFields.__init__(self, SignalBitRange, [self.opcode_in]) - self.create_specs() - - -def create_pdecode(): - - # minor 19 has extra patterns - m19 = [] - m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19.csv"), - opint=True, bitsel=(1, 11), suffix=None, subdecoders=[])) - m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19_00000.csv"), - opint=True, bitsel=(1, 6), suffix=None, subdecoders=[])) - - # 
minor opcodes. - pminor = [ - m19, - Subdecoder(pattern=30, opcodes=get_csv("minor_30.csv"), - opint=True, bitsel=(1, 6), suffix=None, subdecoders=[]), - Subdecoder(pattern=31, opcodes=get_csv("minor_31.csv"), - opint=True, bitsel=(1, 11), suffix=0b00101, subdecoders=[]), - Subdecoder(pattern=58, opcodes=get_csv("minor_58.csv"), - opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]), - Subdecoder(pattern=62, opcodes=get_csv("minor_62.csv"), - opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]), - ] - - # top level: extra merged with major - dec = [] - opcodes = get_csv("major.csv") - dec.append(Subdecoder(pattern=None, opint=True, opcodes=opcodes, - bitsel=(26, 32), suffix=None, subdecoders=pminor)) - opcodes = get_csv("extra.csv") - dec.append(Subdecoder(pattern=None, opint=False, opcodes=opcodes, - bitsel=(0, 32), suffix=None, subdecoders=[])) - - return TopPowerDecoder(32, dec) - - -if __name__ == '__main__': - pdecode = create_pdecode() - vl = rtlil.convert(pdecode, ports=pdecode.ports()) - with open("decoder.il", "w") as f: - f.write(vl) diff --git a/src/decoder/power_decoder2.py b/src/decoder/power_decoder2.py deleted file mode 100644 index 1b7435a0..00000000 --- a/src/decoder/power_decoder2.py +++ /dev/null @@ -1,429 +0,0 @@ -"""Power ISA Decoder second stage - -based on Anton Blanchard microwatt decode2.vhdl - -""" -from nmigen import Module, Elaboratable, Signal, Mux, Const -from nmigen.cli import rtlil - -from power_decoder import create_pdecode -from power_enums import (InternalOp, CryIn, Function, LdstLen, - In1Sel, In2Sel, In3Sel, OutSel, SPR, RC) - - -class DecodeA(Elaboratable): - """DecodeA from instruction - - decodes register RA, whether immediate-zero, implicit and - explicit CSRs - """ - - def __init__(self, dec): - self.dec = dec - self.sel_in = Signal(In1Sel, reset_less=True) - self.insn_in = Signal(32, reset_less=True) - self.reg_out = Data(5, name="reg_a") - self.immz_out = Signal(reset_less=True) - self.spr_out = Data(10, "spr_a") - 
- def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # select Register A field - with m.If((self.sel_in == In1Sel.RA) | - ((self.sel_in == In1Sel.RA_OR_ZERO) & - (self.reg_out.data != Const(0, 5)))): - comb += self.reg_out.data.eq(self.dec.RA[0:-1]) - comb += self.reg_out.ok.eq(1) - - # zero immediate requested - with m.If((self.sel_in == In1Sel.RA_OR_ZERO) & - (self.reg_out.data == Const(0, 5))): - comb += self.immz_out.eq(1) - - # decode SPR1 based on instruction type - op = self.dec.op - # BC or BCREG: potential implicit register (CTR) - with m.If((op.internal_op == InternalOp.OP_BC) | - (op.internal_op == InternalOp.OP_BCREG)): - with m.If(~self.dec.BO[2]): # 3.0B p38 BO2=0, use CTR reg - comb += self.spr_out.data.eq(SPR.CTR) # constant: CTR - comb += self.spr_out.ok.eq(1) - # MFSPR or MTSPR: move-from / move-to SPRs - with m.If((op.internal_op == InternalOp.OP_MFSPR) | - (op.internal_op == InternalOp.OP_MTSPR)): - comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # SPR field, XFX - comb += self.spr_out.ok.eq(1) - - return m - -class Data: - - def __init__(self, width, name): - - self.data = Signal(width, name=name, reset_less=True) - self.ok = Signal(name="%s_ok" % name, reset_less=True) - - def eq(self, rhs): - return [self.data.eq(rhs.data), - self.ok.eq(rhs.ok)] - - def ports(self): - return [self.data, self.ok] - - -class DecodeB(Elaboratable): - """DecodeB from instruction - - decodes register RB, different forms of immediate (signed, unsigned), - and implicit SPRs - """ - - def __init__(self, dec): - self.dec = dec - self.sel_in = Signal(In2Sel, reset_less=True) - self.insn_in = Signal(32, reset_less=True) - self.reg_out = Data(5, "reg_b") - self.imm_out = Data(64, "imm_b") - self.spr_out = Data(10, "spr_b") - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # select Register B field - with m.Switch(self.sel_in): - with m.Case(In2Sel.RB): - comb += self.reg_out.data.eq(self.dec.RB[0:-1]) - comb += 
self.reg_out.ok.eq(1) - with m.Case(In2Sel.CONST_UI): - comb += self.imm_out.data.eq(self.dec.UI[0:-1]) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_SI): # TODO: sign-extend here? - comb += self.imm_out.data.eq(self.dec.SI[0:-1]) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_UI_HI): - comb += self.imm_out.data.eq(self.dec.UI[0:-1]<<4) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_SI_HI): # TODO: sign-extend here? - comb += self.imm_out.data.eq(self.dec.SI[0:-1]<<4) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_LI): - comb += self.imm_out.data.eq(self.dec.LI[0:-1]<<2) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_BD): - comb += self.imm_out.data.eq(self.dec.BD[0:-1]<<2) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_DS): - comb += self.imm_out.data.eq(self.dec.DS[0:-1]<<2) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_M1): - comb += self.imm_out.data.eq(~Const(0, 64)) # all 1s - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_SH): - comb += self.imm_out.data.eq(self.dec.sh[0:-1]) - comb += self.imm_out.ok.eq(1) - with m.Case(In2Sel.CONST_SH32): - comb += self.imm_out.data.eq(self.dec.SH32[0:-1]) - comb += self.imm_out.ok.eq(1) - - # decode SPR2 based on instruction type - op = self.dec.op - # BCREG implicitly uses CTR or LR for 2nd reg - with m.If(op.internal_op == InternalOp.OP_BCREG): - with m.If(self.dec.FormXL.XO[9]): # 3.0B p38 top bit of XO - comb += self.spr_out.data.eq(SPR.CTR) - with m.Else(): - comb += self.spr_out.data.eq(SPR.LR) - comb += self.spr_out.ok.eq(1) - - return m - - -class DecodeC(Elaboratable): - """DecodeC from instruction - - decodes register RC - """ - - def __init__(self, dec): - self.dec = dec - self.sel_in = Signal(In3Sel, reset_less=True) - self.insn_in = Signal(32, reset_less=True) - self.reg_out = Data(5, "reg_c") - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # select Register C field - with m.If(self.sel_in 
== In3Sel.RS): - comb += self.reg_out.data.eq(self.dec.RS[0:-1]) - comb += self.reg_out.ok.eq(1) - - return m - - -class DecodeOut(Elaboratable): - """DecodeOut from instruction - - decodes output register RA, RT or SPR - """ - - def __init__(self, dec): - self.dec = dec - self.sel_in = Signal(OutSel, reset_less=True) - self.insn_in = Signal(32, reset_less=True) - self.reg_out = Data(5, "reg_o") - self.spr_out = Data(10, "spr_o") - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # select Register out field - with m.Switch(self.sel_in): - with m.Case(OutSel.RT): - comb += self.reg_out.data.eq(self.dec.RT[0:-1]) - comb += self.reg_out.ok.eq(1) - with m.Case(OutSel.RA): - comb += self.reg_out.data.eq(self.dec.RA[0:-1]) - comb += self.reg_out.ok.eq(1) - with m.Case(OutSel.SPR): - comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # from XFX - comb += self.spr_out.ok.eq(1) - - return m - - -class DecodeRC(Elaboratable): - """DecodeRc from instruction - - decodes Record bit Rc - """ - def __init__(self, dec): - self.dec = dec - self.sel_in = Signal(RC, reset_less=True) - self.insn_in = Signal(32, reset_less=True) - self.rc_out = Data(1, "rc") - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # select Record bit out field - with m.Switch(self.sel_in): - with m.Case(RC.RC): - comb += self.rc_out.data.eq(self.dec.Rc[0:-1]) - comb += self.rc_out.ok.eq(1) - with m.Case(RC.ONE): - comb += self.rc_out.data.eq(1) - comb += self.rc_out.ok.eq(1) - with m.Case(RC.NONE): - comb += self.rc_out.data.eq(0) - comb += self.rc_out.ok.eq(1) - - return m - - -class DecodeOE(Elaboratable): - """DecodeOE from instruction - - decodes OE field: uses RC decode detection which might not be good - - -- For now, use "rc" in the decode table to decide whether oe exists. - -- This is not entirely correct architecturally: For mulhd and - -- mulhdu, the OE field is reserved. 
It remains to be seen what an - -- actual POWER9 does if we set it on those instructions, for now we - -- test that further down when assigning to the multiplier oe input. - """ - def __init__(self, dec): - self.dec = dec - self.sel_in = Signal(RC, reset_less=True) - self.insn_in = Signal(32, reset_less=True) - self.oe_out = Data(1, "oe") - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # select OE bit out field - with m.Switch(self.sel_in): - with m.Case(RC.RC): - comb += self.oe_out.data.eq(self.dec.OE[0:-1]) - comb += self.oe_out.ok.eq(1) - - return m - - -class XerBits: - def __init__(self): - self.ca = Signal(reset_less=True) - self.ca32 = Signal(reset_less=True) - self.ov = Signal(reset_less=True) - self.ov32 = Signal(reset_less=True) - self.so = Signal(reset_less=True) - - def ports(self): - return [self.ca, self.ca32, self.ov, self.ov32, self.so, ] - - -class Decode2ToExecute1Type: - - def __init__(self): - - self.valid = Signal(reset_less=True) - self.insn_type = Signal(InternalOp, reset_less=True) - self.nia = Signal(64, reset_less=True) - self.write_reg = Data(5, name="rego") - self.read_reg1 = Data(5, name="reg1") - self.read_reg2 = Data(5, name="reg2") - self.read_reg3 = Data(5, name="reg3") - self.imm_data = Data(64, name="imm") - self.write_spr = Data(10, name="spro") - self.read_spr1 = Data(10, name="spr1") - self.read_spr2 = Data(10, name="spr2") - #self.read_data1 = Signal(64, reset_less=True) - #self.read_data2 = Signal(64, reset_less=True) - #self.read_data3 = Signal(64, reset_less=True) - #self.cr = Signal(32, reset_less=True) # NO: this is from the CR SPR - #self.xerc = XerBits() # NO: this is from the XER SPR - self.lk = Signal(reset_less=True) - self.rc = Data(1, "rc") - self.oe = Data(1, "oe") - self.invert_a = Signal(reset_less=True) - self.invert_out = Signal(reset_less=True) - self.input_carry = Signal(CryIn, reset_less=True) - self.output_carry = Signal(reset_less=True) - self.input_cr = Signal(reset_less=True) - 
self.output_cr = Signal(reset_less=True) - self.is_32bit = Signal(reset_less=True) - self.is_signed = Signal(reset_less=True) - self.insn = Signal(32, reset_less=True) - self.data_len = Signal(4, reset_less=True) # bytes - self.byte_reverse = Signal(reset_less=True) - self.sign_extend = Signal(reset_less=True)# do we need this? - self.update = Signal(reset_less=True) # is this an update instruction? - - def ports(self): - return [self.valid, self.insn_type, self.nia, - #self.read_data1, self.read_data2, self.read_data3, - #self.cr, - self.lk, - self.invert_a, self.invert_out, - self.input_carry, self.output_carry, - self.input_cr, self.output_cr, - self.is_32bit, self.is_signed, - self.insn, - self.data_len, self.byte_reverse , self.sign_extend , - self.update] + \ - self.oe.ports() + \ - self.rc.ports() + \ - self.write_spr.ports() + \ - self.read_spr1.ports() + \ - self.read_spr2.ports() + \ - self.write_reg.ports() + \ - self.read_reg1.ports() + \ - self.read_reg2.ports() + \ - self.read_reg3.ports() + \ - self.imm_data.ports() - # + self.xerc.ports() - -class PowerDecode2(Elaboratable): - - def __init__(self, dec): - - self.dec = dec - self.e = Decode2ToExecute1Type() - - def ports(self): - return self.dec.ports() + self.e.ports() - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - - # set up submodule decoders - m.submodules.dec = self.dec - m.submodules.dec_a = dec_a = DecodeA(self.dec) - m.submodules.dec_b = dec_b = DecodeB(self.dec) - m.submodules.dec_c = dec_c = DecodeC(self.dec) - m.submodules.dec_o = dec_o = DecodeOut(self.dec) - m.submodules.dec_rc = dec_rc = DecodeRC(self.dec) - m.submodules.dec_oe = dec_oe = DecodeOE(self.dec) - - # copy instruction through... 
- for i in [self.e.insn, dec_a.insn_in, dec_b.insn_in, - dec_c.insn_in, dec_o.insn_in, dec_rc.insn_in, - dec_oe.insn_in]: - comb += i.eq(self.dec.opcode_in) - - # ...and subdecoders' input fields - comb += dec_a.sel_in.eq(self.dec.op.in1_sel) - comb += dec_b.sel_in.eq(self.dec.op.in2_sel) - comb += dec_c.sel_in.eq(self.dec.op.in3_sel) - comb += dec_o.sel_in.eq(self.dec.op.out_sel) - comb += dec_rc.sel_in.eq(self.dec.op.rc_sel) - comb += dec_oe.sel_in.eq(self.dec.op.rc_sel) # XXX should be OE sel - - # decode LD/ST length - with m.Switch(self.dec.op.ldst_len): - with m.Case(LdstLen.is1B): - comb += self.e.data_len.eq(1) - with m.Case(LdstLen.is2B): - comb += self.e.data_len.eq(2) - with m.Case(LdstLen.is4B): - comb += self.e.data_len.eq(4) - with m.Case(LdstLen.is8B): - comb += self.e.data_len.eq(8) - - #comb += self.e.nia.eq(self.dec.nia) # XXX TODO - itype = Mux(self.dec.op.function_unit == Function.NONE, - InternalOp.OP_ILLEGAL, - self.dec.op.internal_op) - comb += self.e.insn_type.eq(itype) - - # registers a, b, c and out - comb += self.e.read_reg1.eq(dec_a.reg_out) - comb += self.e.read_reg2.eq(dec_b.reg_out) - comb += self.e.read_reg3.eq(dec_c.reg_out) - comb += self.e.write_reg.eq(dec_o.reg_out) - comb += self.e.imm_data.eq(dec_b.imm_out) - - # rc and oe out - comb += self.e.rc.eq(dec_rc.rc_out) - comb += self.e.oe.eq(dec_oe.oe_out) - - # SPRs out - comb += self.e.read_spr1.eq(dec_a.spr_out) - comb += self.e.read_spr2.eq(dec_b.spr_out) - comb += self.e.write_spr.eq(dec_o.spr_out) - - # decoded/selected instruction flags - comb += self.e.invert_a.eq(self.dec.op.inv_a) - comb += self.e.invert_out.eq(self.dec.op.inv_out) - comb += self.e.input_carry.eq(self.dec.op.cry_in) - comb += self.e.output_carry.eq(self.dec.op.cry_out) - comb += self.e.is_32bit.eq(self.dec.op.is_32b) - comb += self.e.is_signed.eq(self.dec.op.sgn) - with m.If(self.dec.op.lk): - comb += self.e.lk.eq(self.dec.LK[0:-1]) # XXX TODO: accessor - - comb += self.e.byte_reverse.eq(self.dec.op.br) - 
comb += self.e.sign_extend.eq(self.dec.op.sgn_ext) - comb += self.e.update.eq(self.dec.op.upd) - - comb += self.e.input_cr.eq(self.dec.op.cr_in) - comb += self.e.output_cr.eq(self.dec.op.cr_out) - - return m - - -if __name__ == '__main__': - pdecode = create_pdecode() - dec2 = PowerDecode2(pdecode) - vl = rtlil.convert(dec2, ports=dec2.ports() + pdecode.ports()) - with open("dec2.il", "w") as f: - f.write(vl) - diff --git a/src/decoder/power_enums.py b/src/decoder/power_enums.py deleted file mode 100644 index dcf5cad2..00000000 --- a/src/decoder/power_enums.py +++ /dev/null @@ -1,229 +0,0 @@ -from enum import Enum, unique -import csv -import os -import requests - - -def get_csv(name): - file_dir = os.path.dirname(os.path.realpath(__file__)) - file_path = os.path.join(file_dir, name) - if not os.path.isfile(file_path): - url = 'https://libre-riscv.org/openpower/isatables/' + name - r = requests.get(url, allow_redirects=True) - with open(file_path, 'w') as outfile: - outfile.write(r.content.decode("utf-8")) - with open(file_path, 'r') as csvfile: - reader = csv.DictReader(csvfile) - return list(reader) - - -# names of the fields in the tables that don't correspond to an enum -single_bit_flags = ['CR in', 'CR out', 'inv A', 'inv out', - 'cry out', 'BR', 'sgn ext', 'upd', 'rsrv', '32b', - 'sgn', 'lk', 'sgl pipe'] - -# default values for fields in the table -default_values = {'unit': "NONE", 'internal op': "OP_ILLEGAL", - 'in1': "RA", 'in2': 'NONE', 'in3': 'NONE', 'out': 'NONE', - 'ldst len': 'NONE', - 'rc' : 'NONE', 'cry in' : 'ZERO', 'form': 'NONE'} - -def get_signal_name(name): - if name[0].isdigit(): - name = "is_" + name - return name.lower().replace(' ', '_') - - -@unique -class Function(Enum): - NONE = 0 - ALU = 1 - LDST = 2 - - -@unique -class Form(Enum): - NONE = 0 - I = 1 - B = 2 - SC = 3 - D = 4 - DS = 5 - DQ = 6 - DX = 7 - X = 8 - XL = 9 - XFX = 10 - XFL = 11 - XX1 = 12 - XX2 = 13 - XX3 = 14 - XX4 = 15 - XS = 16 - XO = 17 - A = 18 - M = 19 - MD = 20 - MDS = 
21 - VA = 22 - VC = 23 - VX = 24 - EVX = 25 - EVS = 26 - Z22 = 27 - Z23 = 28 - - - -@unique -class InternalOp(Enum): - OP_ILLEGAL = 0 - OP_NOP = 1 - OP_ADD = 2 - OP_ADDPCIS = 3 - OP_AND = 4 - OP_ATTN = 5 - OP_B = 6 - OP_BC = 7 - OP_BCREG = 8 - OP_BPERM = 9 - OP_CMP = 10 - OP_CMPB = 11 - OP_CMPEQB = 12 - OP_CMPRB = 13 - OP_CNTZ = 14 - OP_CRAND = 15 - OP_CRANDC = 16 - OP_CREQV = 17 - OP_CRNAND = 18 - OP_CRNOR = 19 - OP_CROR = 20 - OP_CRORC = 21 - OP_CRXOR = 22 - OP_DARN = 23 - OP_DCBF = 24 - OP_DCBST = 25 - OP_DCBT = 26 - OP_DCBTST = 27 - OP_DCBZ = 28 - OP_DIV = 29 - OP_DIVE = 30 - OP_EXTS = 31 - OP_EXTSWSLI = 32 - OP_ICBI = 33 - OP_ICBT = 34 - OP_ISEL = 35 - OP_ISYNC = 36 - OP_LOAD = 37 - OP_STORE = 38 - OP_MADDHD = 39 - OP_MADDHDU = 40 - OP_MADDLD = 41 - OP_MCRF = 42 - OP_MCRXR = 43 - OP_MCRXRX = 44 - OP_MFCR = 45 - OP_MFSPR = 46 - OP_MOD = 47 - OP_MTCRF = 48 - OP_MTSPR = 49 - OP_MUL_L64 = 50 - OP_MUL_H64 = 51 - OP_MUL_H32 = 52 - OP_OR = 53 - OP_POPCNT = 54 - OP_PRTY = 55 - OP_RLC = 56 - OP_RLCL = 57 - OP_RLCR = 58 - OP_SETB = 59 - OP_SHL = 60 - OP_SHR = 61 - OP_SYNC = 62 - OP_TD = 63 - OP_TDI = 64 - OP_TW = 65 - OP_TWI = 66 - OP_XOR = 67 - OP_SIM_CONFIG = 68 - - -@unique -class In1Sel(Enum): - RA = 0 - RA_OR_ZERO = 1 - NONE = 2 - SPR = 3 - - -@unique -class In2Sel(Enum): - NONE = 0 - RB = 1 - CONST_UI = 2 - CONST_SI = 3 - CONST_UI_HI = 4 - CONST_SI_HI = 5 - CONST_LI = 6 - CONST_BD = 7 - CONST_DS = 8 - CONST_M1 = 9 - CONST_SH = 10 - CONST_SH32 = 11 - SPR = 12 - - -@unique -class In3Sel(Enum): - NONE = 0 - RS = 1 - - -@unique -class OutSel(Enum): - NONE = 0 - RT = 1 - RA = 2 - SPR = 3 - - -@unique -class LdstLen(Enum): - NONE = 0 - is1B = 1 - is2B = 2 - is4B = 3 - is8B = 4 - - -@unique -class RC(Enum): - NONE = 0 - ONE = 1 - RC = 2 - - -@unique -class CryIn(Enum): - ZERO = 0 - ONE = 1 - CA = 2 - -@unique -class SPR(Enum): - XER = 1 - LR = 8 - CTR = 9 - TB = 268 - SRR0 = 26 - SRR1 = 27 - HSRR0 = 314 - HSRR1 = 315 - SPRG0 = 272 - SPRG1 = 273 - SPRG2 = 274 - SPRG3 = 
275 - SPRG3U = 259 - HSPRG0 = 304 - HSPRG1 = 305 - diff --git a/src/decoder/power_fields.py b/src/decoder/power_fields.py deleted file mode 100644 index 3457331e..00000000 --- a/src/decoder/power_fields.py +++ /dev/null @@ -1,242 +0,0 @@ -from collections import OrderedDict, namedtuple - - -class BitRange(OrderedDict): - """BitRange: remaps from straight indices (0,1,2..) to bit numbers - """ - def __getitem__(self, subscript): - if isinstance(subscript, slice): - return list(self)[subscript] - else: - return self[subscript] - -def decode_instructions(form): - res = {} - accum = [] - for l in form: - if l.strip().startswith("Formats"): - l = l.strip().split(":")[-1] - l = l.replace(" ", "") - l = l.split(",") - for fmt in l: - if fmt not in res: - res[fmt] = [accum[0]] - else: - res[fmt].append(accum[0]) - accum = [] - else: - accum.append(l.strip()) - return res - -def decode_form_header(hdr): - res = {} - count = 0 - hdr = hdr.strip() - print (hdr.split('|')) - for f in hdr.split("|"): - if not f: - continue - if f[0].isdigit(): - idx = int(f.strip().split(' ')[0]) - res[count] = idx - count += len(f) + 1 - return res - -def find_unique(d, key): - if key not in d: - return key - idx = 1 - while "%s_%d" % (key, idx) in d: - idx += 1 - return "%s_%d" % (key, idx) - - -def decode_line(header, line): - line = line.strip() - res = {} - count = 0 - print ("line", line) - prev_fieldname = None - for f in line.split("|"): - if not f: - continue - end = count + len(f) + 1 - fieldname = f.strip() - if not fieldname or fieldname.startswith('/'): - if prev_fieldname is not None: - res[prev_fieldname] = (res[prev_fieldname], header[count]) - prev_fieldname = None - count = end - continue - bitstart = header[count] - if prev_fieldname is not None: - res[prev_fieldname] = (res[prev_fieldname], bitstart) - res[fieldname] = bitstart - count = end - prev_fieldname = fieldname - res[prev_fieldname] = (bitstart, 32) - return res - - -def decode_form(form): - header = 
decode_form_header(form[0]) - res = [] - print ("header", header) - for line in form[1:]: - dec = decode_line(header, line) - if dec: - res.append(dec) - fields = {} - falternate = {} - for l in res: - for k, (start,end) in l.items(): - if k in fields: - if (start, end) == fields[k]: - continue # already in and matching for this Form - if k in falternate: - alternate = "%s_%d" % (k, falternate[k]) - if (start, end) == fields[alternate]: - continue - falternate[k] = fidx = falternate.get(k, 0) + 1 - fields["%s_%d" % (k, fidx)] = (start, end) - else: - fields[k] = (start, end) - return fields - - -class DecodeFields: - - def __init__(self, bitkls=BitRange, bitargs=(), fname="fields.txt"): - self.bitkls = bitkls - self.bitargs = bitargs - self.fname = fname - - def create_specs(self): - self.forms, self.instrs = self.decode_fields() - self.form_names = forms = self.instrs.keys() - for form in forms: - fields = self.instrs[form] - fk = fields.keys() - Fields = namedtuple("Fields", fk) - instr = Fields(**fields) - setattr(self, "Form%s" % form, instr) - # now add in some commonly-used fields (should be done automatically) - # note that these should only be ones which are the same on all Forms - # note: these are from microwatt insn_helpers.vhdl - self.RS = self.FormX.RS - self.RT = self.FormX.RT - self.RA = self.FormX.RA - self.RB = self.FormX.RB - self.SI = self.FormD.SI - self.UI = self.FormD.UI - self.L = self.FormD.L - self.SH32 = self.FormM.SH - self.sh = self.FormMD.sh - self.MB32 = self.FormM.MB - self.ME32 = self.FormM.ME - self.LI = self.FormI.LI - self.LK = self.FormI.LK - self.AA = self.FormB.AA - self.Rc = self.FormX.Rc - self.OE = self.FormXO.Rc - self.BD = self.FormB.BD - self.BF = self.FormX.BF - self.CR = self.FormXL.XO # used by further mcrf decoding - self.BB = self.FormXL.BB - self.BA = self.FormXL.BA - self.BT = self.FormXL.BT - self.FXM = self.FormXFX.FXM - self.BO = self.FormXL.BO - self.BI = self.FormXL.BI - self.BH = self.FormXL.BH - self.D = 
self.FormD.D - self.DS = self.FormDS.DS - self.TO = self.FormX.TO - self.BC = self.FormA.BC - self.SH = self.FormX.SH - self.ME = self.FormM.ME - self.MB = self.FormM.MB - self.SPR = self.FormXFX.SPR - - def decode_fields(self): - with open(self.fname) as f: - txt = f.readlines() - forms = {} - reading_data = False - for l in txt: - print ("line", l) - l = l.strip() - if len(l) == 0: - continue - if reading_data: - if l[0] == '#': - reading_data = False - else: - forms[heading].append(l) - if not reading_data: - assert l[0] == '#' - heading = l[1:].strip() - #if heading.startswith('1.6.28'): # skip instr fields for now - #break - heading = heading.split(' ')[-1] - print ("heading", heading) - reading_data = True - forms[heading] = [] - - res = {} - inst = {} - - for hdr, form in forms.items(): - print ("heading", hdr) - if heading == 'Fields': - i = decode_instructions(form) - for form, field in i.items(): - inst[form] = self.decode_instruction_fields(field) - #else: - # res[hdr] = decode_form(form) - return res, inst - - def decode_instruction_fields(self, fields): - res = {} - for field in fields: - f, spec = field.strip().split(" ") - d = self.bitkls(*self.bitargs) - idx = 0 - for s in spec[1:-1].split(","): - s = s.split(':') - if len(s) == 1: - d[idx] = int(s[0]) - idx += 1 - else: - start = int(s[0]) - end = int(s[1]) - while start <= end: - d[idx] = start - idx += 1 - start += 1 - f = f.replace(",", "_") - unique = find_unique(res, f) - res[unique] = d - - return res - -if __name__ == '__main__': - dec = DecodeFields() - dec.create_specs() - forms, instrs = dec.forms, dec.instrs - for hdr, form in forms.items(): - print () - print (hdr) - for k, v in form.items(): - #print ("line", l) - #for k, v in l.items(): - print ("%s: %d-%d" % (k, v[0], v[1])) - for form, field in instrs.items(): - print () - print (form) - for f, vals in field.items(): - print (" ", f, vals) - print (dec.FormX) - print (dec.FormX.A) - print (dir(dec.FormX)) - print (dec.FormX._fields) 
diff --git a/src/decoder/power_fieldsn.py b/src/decoder/power_fieldsn.py deleted file mode 100644 index e603bbd3..00000000 --- a/src/decoder/power_fieldsn.py +++ /dev/null @@ -1,74 +0,0 @@ -from collections import OrderedDict -from power_fields import DecodeFields, BitRange -from nmigen import Module, Elaboratable, Signal, Cat -from nmigen.cli import rtlil - - -class SignalBitRange(BitRange): - def __init__(self, signal): - BitRange.__init__(self) - self.signal = signal - - def __getitem__(self, subs): - # *sigh* field numberings are bit-inverted. PowerISA 3.0B section 1.3.2 - width = self.signal.shape()[0] - print (dir(self)) - print (self.items()) - if isinstance(subs, slice): - res = [] - print (subs) - start, stop, step = subs.start, subs.stop, subs.step - if step is None: - step = 1 - if start is None: - start = 0 - if stop is None: - stop = -1 - if start < 0: - start = len(self) - start - 1 - if stop < 0: - stop = len(self) - stop - 1 - print ("range", start, stop, step) - for t in range(start, stop, step): - k = OrderedDict.__getitem__(self, t) - print ("t", t, k) - res.append(self.signal[width-k-1]) - return Cat(*res) - else: - k = OrderedDict.__getitem__(self, subs) - return self.signal[width-k-1] - - print ("translated", subs, translated) - - -class SigDecode(Elaboratable): - - def __init__(self, width): - self.opcode_in = Signal(width, reset_less=False) - self.df = DecodeFields(SignalBitRange, [self.opcode_in]) - self.df.create_specs() - self.x_s = Signal(len(self.df.FormX.S), reset_less=True) - self.x_sh = Signal(len(self.df.FormX.SH), reset_less=True) - self.dq_xs_s = Signal(len(self.df.FormDQ.SX_S), reset_less=True) - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - comb += self.x_s.eq(self.df.FormX.S[0]) - comb += self.x_sh.eq(self.df.FormX.SH[0:-1]) - comb += self.dq_xs_s.eq(self.df.FormDQ.SX_S[0:-1]) - return m - - def ports(self): - return [self.opcode_in, self.x_s, self.x_sh] - -def create_sigdecode(): - s = SigDecode(32) - 
return s - -if __name__ == '__main__': - sigdecode = create_sigdecode() - vl = rtlil.convert(sigdecode, ports=sigdecode.ports()) - with open("decoder.il", "w") as f: - f.write(vl) - diff --git a/src/decoder/test/test_power_decoder.py b/src/decoder/test/test_power_decoder.py deleted file mode 100644 index f64f4b96..00000000 --- a/src/decoder/test/test_power_decoder.py +++ /dev/null @@ -1,130 +0,0 @@ -from nmigen import Module, Signal -from nmigen.back.pysim import Simulator, Delay -from nmigen.test.utils import FHDLTestCase -from nmigen.cli import rtlil -import sys -import os -import unittest -sys.path.append("../") -from power_decoder import (PowerDecoder, pdecode) -from power_enums import (Function, InternalOp, In1Sel, In2Sel, In3Sel, - OutSel, RC, LdstLen, CryIn, single_bit_flags, - get_signal_name, get_csv) - - -class DecoderTestCase(FHDLTestCase): - - def run_tst(self, bitsel, csvname, minor=None, suffix=None, opint=True): - m = Module() - comb = m.d.comb - opcode = Signal(32) - function_unit = Signal(Function) - internal_op = Signal(InternalOp) - in1_sel = Signal(In1Sel) - in2_sel = Signal(In2Sel) - in3_sel = Signal(In3Sel) - out_sel = Signal(OutSel) - rc_sel = Signal(RC) - ldst_len = Signal(LdstLen) - cry_in = Signal(CryIn) - - # opcodes = get_csv(csvname) - # m.submodules.dut = dut = PowerDecoder(32, opcodes, bitsel=bitsel, - # opint=opint, suffix=suffix) - m.submodules.dut = dut = pdecode - comb += [dut.opcode_in.eq(opcode), - function_unit.eq(dut.op.function_unit), - in1_sel.eq(dut.op.in1_sel), - in2_sel.eq(dut.op.in2_sel), - in3_sel.eq(dut.op.in3_sel), - out_sel.eq(dut.op.out_sel), - rc_sel.eq(dut.op.rc_sel), - ldst_len.eq(dut.op.ldst_len), - cry_in.eq(dut.op.cry_in), - internal_op.eq(dut.op.internal_op)] - - sim = Simulator(m) - opcodes = get_csv(csvname) - - def process(): - for row in opcodes: - if not row['unit']: - continue - op = row['opcode'] - if not opint: # HACK: convert 001---10 to 0b00100010 - op = "0b" + op.replace('-', '0') - print ("opint", 
opint, row['opcode'], op) - print(row) - yield opcode.eq(0) - yield opcode[bitsel[0]:bitsel[1]].eq(int(op, 0)) - if minor: - print(minor) - minorbits = minor[1] - yield opcode[minorbits[0]:minorbits[1]].eq(minor[0]) - yield Delay(1e-6) - signals = [(function_unit, Function, 'unit'), - (internal_op, InternalOp, 'internal op'), - (in1_sel, In1Sel, 'in1'), - (in2_sel, In2Sel, 'in2'), - (in3_sel, In3Sel, 'in3'), - (out_sel, OutSel, 'out'), - (rc_sel, RC, 'rc'), - (cry_in, CryIn, 'cry in'), - (ldst_len, LdstLen, 'ldst len')] - for sig, enm, name in signals: - result = yield sig - expected = enm[row[name]] - msg = f"{sig.name} == {enm(result)}, expected: {expected}" - self.assertEqual(enm(result), expected, msg) - for bit in single_bit_flags: - sig = getattr(dut.op, get_signal_name(bit)) - result = yield sig - expected = int(row[bit]) - msg = f"{sig.name} == {result}, expected: {expected}" - self.assertEqual(expected, result, msg) - sim.add_process(process) - prefix = os.path.splitext(csvname)[0] - with sim.write_vcd("%s.vcd" % prefix, "%s.gtkw" % prefix, traces=[ - opcode, function_unit, internal_op, - in1_sel, in2_sel]): - sim.run() - - def generate_ilang(self): - vl = rtlil.convert(pdecode, ports=pdecode.ports()) - with open("decoder.il", "w") as f: - f.write(vl) - - def test_major(self): - self.run_tst((26, 32), "major.csv") - self.generate_ilang() - - def test_minor_19(self): - self.run_tst((1, 11), "minor_19.csv", minor=(19, (26, 32)), - suffix=(0, 5)) - - # def test_minor_19_00000(self): - # self.run_tst((1, 11), "minor_19_00000.csv") - - def test_minor_30(self): - self.run_tst((1, 5), "minor_30.csv", minor=(30, (26, 32))) - - def test_minor_31(self): - self.run_tst((1, 11), "minor_31.csv", minor=(31, (26, 32))) - - def test_minor_58(self): - self.run_tst((0, 2), "minor_58.csv", minor=(58, (26, 32))) - - def test_minor_62(self): - self.run_tst((0, 2), "minor_62.csv", minor=(62, (26, 32))) - - - # #def test_minor_31_prefix(self): - # # self.run_tst(10, 
"minor_31.csv", suffix=(5, 10)) - - # def test_extra(self): - # self.run_tst(32, "extra.csv", opint=False) - # self.generate_ilang(32, "extra.csv", opint=False) - - -if __name__ == "__main__": - unittest.main() diff --git a/src/experiment/alu_hier.py b/src/experiment/alu_hier.py deleted file mode 100644 index 9659059c..00000000 --- a/src/experiment/alu_hier.py +++ /dev/null @@ -1,239 +0,0 @@ -from nmigen import Elaboratable, Signal, Module, Const, Mux -from nmigen.cli import main -from nmigen.cli import verilog, rtlil - -import operator - - -class Adder(Elaboratable): - def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.eq(self.a + self.b) - return m - - -class Subtractor(Elaboratable): - def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.eq(self.a - self.b) - return m - - -class Multiplier(Elaboratable): - def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.eq(self.a * self.b) - return m - - -class Shifter(Elaboratable): - def __init__(self, width): - self.width = width - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - - def elaborate(self, platform): - m = Module() - btrunc = Signal(self.width) - m.d.comb += btrunc.eq(self.b & Const((1<> btrunc) - return m - - -class ALU(Elaboratable): - def __init__(self, width): - self.p_valid_i = Signal() - self.p_ready_o = Signal() - self.n_ready_i = Signal() - self.n_valid_o = Signal() - self.counter = Signal(4) - self.op = Signal(2) - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - self.width = width - - def elaborate(self, platform): - m = Module() - add = Adder(self.width) - sub = Subtractor(self.width) - mul = 
Multiplier(self.width) - shf = Shifter(self.width) - - m.submodules.add = add - m.submodules.sub = sub - m.submodules.mul = mul - m.submodules.shf = shf - for mod in [add, sub, mul, shf]: - m.d.comb += [ - mod.a.eq(self.a), - mod.b.eq(self.b), - ] - go_now = Signal(reset_less=True) # testing no-delay ALU - - with m.If(self.p_valid_i): - # input is valid. next check, if we already said "ready" or not - with m.If(~self.p_ready_o): - # we didn't say "ready" yet, so say so and initialise - m.d.sync += self.p_ready_o.eq(1) - - # as this is a "fake" pipeline, just grab the output right now - with m.Switch(self.op): - for i, mod in enumerate([add, sub, mul, shf]): - with m.Case(i): - m.d.sync += self.o.eq(mod.o) - with m.If(self.op == 2): # MUL, to take 5 instructions - m.d.sync += self.counter.eq(5) - with m.Elif(self.op == 3): # SHIFT to take 7 - m.d.sync += self.counter.eq(7) - with m.Elif(self.op == 1): # SUB to take 1, straight away - m.d.sync += self.counter.eq(1) - m.d.comb += go_now.eq(1) - with m.Else(): # ADD to take 2 - m.d.sync += self.counter.eq(2) - with m.Else(): - # input says no longer valid, so drop ready as well. - # a "proper" ALU would have had to sync in the opcode and a/b ops - m.d.sync += self.p_ready_o.eq(0) - - # ok so the counter's running: when it gets to 1, fire the output - with m.If((self.counter == 1) | go_now): - # set the output as valid if the recipient is ready for it - m.d.sync += self.n_valid_o.eq(1) - with m.If(self.n_ready_i & self.n_valid_o): - m.d.sync += self.n_valid_o.eq(0) - # recipient said it was ready: reset back to known-good. 
- m.d.sync += self.counter.eq(0) # reset the counter - m.d.sync += self.o.eq(0) # clear the output for tidiness sake - - # countdown to 1 (transition from 1 to 0 only on acknowledgement) - with m.If(self.counter > 1): - m.d.sync += self.counter.eq(self.counter - 1) - - return m - - def __iter__(self): - yield self.op - yield self.a - yield self.b - yield self.o - - def ports(self): - return list(self) - - -class BranchOp(Elaboratable): - def __init__(self, width, op): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - self.op = op - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0)) - return m - - -class BranchALU(Elaboratable): - def __init__(self, width): - self.p_valid_i = Signal() - self.p_ready_o = Signal() - self.n_ready_i = Signal() - self.n_valid_o = Signal() - self.counter = Signal(4) - self.op = Signal(2) - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - self.width = width - - def elaborate(self, platform): - m = Module() - bgt = BranchOp(self.width, operator.gt) - blt = BranchOp(self.width, operator.lt) - beq = BranchOp(self.width, operator.eq) - bne = BranchOp(self.width, operator.ne) - - m.submodules.bgt = bgt - m.submodules.blt = blt - m.submodules.beq = beq - m.submodules.bne = bne - for mod in [bgt, blt, beq, bne]: - m.d.comb += [ - mod.a.eq(self.a), - mod.b.eq(self.b), - ] - - go_now = Signal(reset_less=True) # testing no-delay ALU - with m.If(self.p_valid_i): - # input is valid. 
next check, if we already said "ready" or not - with m.If(~self.p_ready_o): - # we didn't say "ready" yet, so say so and initialise - m.d.sync += self.p_ready_o.eq(1) - - # as this is a "fake" pipeline, just grab the output right now - with m.Switch(self.op): - for i, mod in enumerate([bgt, blt, beq, bne]): - with m.Case(i): - m.d.sync += self.o.eq(mod.o) - m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake) - #m.d.comb += go_now.eq(1) - with m.Else(): - # input says no longer valid, so drop ready as well. - # a "proper" ALU would have had to sync in the opcode and a/b ops - m.d.sync += self.p_ready_o.eq(0) - - # ok so the counter's running: when it gets to 1, fire the output - with m.If((self.counter == 1) | go_now): - # set the output as valid if the recipient is ready for it - m.d.sync += self.n_valid_o.eq(1) - with m.If(self.n_ready_i & self.n_valid_o): - m.d.sync += self.n_valid_o.eq(0) - # recipient said it was ready: reset back to known-good. - m.d.sync += self.counter.eq(0) # reset the counter - m.d.sync += self.o.eq(0) # clear the output for tidiness sake - - # countdown to 1 (transition from 1 to 0 only on acknowledgement) - with m.If(self.counter > 1): - m.d.sync += self.counter.eq(self.counter - 1) - - return m - - def __iter__(self): - yield self.op - yield self.a - yield self.b - yield self.o - - def ports(self): - return list(self) - - -if __name__ == "__main__": - alu = ALU(width=16) - vl = rtlil.convert(alu, ports=alu.ports()) - with open("test_alu.il", "w") as f: - f.write(vl) - - alu = BranchALU(width=16) - vl = rtlil.convert(alu, ports=alu.ports()) - with open("test_branch_alu.il", "w") as f: - f.write(vl) - diff --git a/src/experiment/compalu.py b/src/experiment/compalu.py deleted file mode 100644 index 7da6b5cf..00000000 --- a/src/experiment/compalu.py +++ /dev/null @@ -1,207 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Mux, Elaboratable - -from 
nmutil.latch import SRLatch, latchregister - -""" Computation Unit (aka "ALU Manager"). - - This module runs a "revolving door" set of three latches, based on - * Issue - * Go_Read - * Go_Write - where one of them cannot be set on any given cycle. - (Note however that opc_l has been inverted (and qn used), due to SRLatch - default reset state being "0" rather than "1") - - * When issue is first raised, a busy signal is sent out. - The src1 and src2 registers and the operand can be latched in - at this point - - * Read request is set, which is acknowledged through the Scoreboard - to the priority picker, which generates (one and only one) Go_Read - at a time. One of those will (eventually) be this Computation Unit. - - * Once Go_Read is set, the src1/src2/operand latch door shuts (locking - src1/src2/operand in place), and the ALU is told to proceed. - - * As this is currently a "demo" unit, a countdown timer is activated - to simulate an ALU "pipeline", which activates "write request release", - and the ALU's output is captured into a temporary register. - - * Write request release will go through a similar process as Read request, - resulting (eventually) in Go_Write being asserted. - - * When Go_Write is asserted, two things happen: (1) the data in the temp - register is placed combinatorially onto the output, and (2) the - req_l latch is cleared, busy is dropped, and the Comp Unit is back - through its revolving door to do another task. 
- - Notes on oper_i: - - * bits[0:2] are for the ALU, add=0, sub=1, shift=2, mul=3 - * bit[2] are the immediate (bit[2]=1 == immediate mode) -""" - -class ComputationUnitNoDelay(Elaboratable): - def __init__(self, rwid, opwid, alu): - self.opwid = opwid - self.rwid = rwid - self.alu = alu - - self.counter = Signal(4) - self.go_rd_i = Signal(reset_less=True) # go read in - self.go_wr_i = Signal(reset_less=True) # go write in - self.issue_i = Signal(reset_less=True) # fn issue in - self.shadown_i = Signal(reset=1) # shadow function, defaults to ON - self.go_die_i = Signal() # go die (reset) - - self.oper_i = Signal(opwid, reset_less=True) # opcode in - self.imm_i = Signal(rwid, reset_less=True) # immediate in - self.src1_i = Signal(rwid, reset_less=True) # oper1 in - self.src2_i = Signal(rwid, reset_less=True) # oper2 in - - self.busy_o = Signal(reset_less=True) # fn busy out - self.data_o = Signal(rwid, reset_less=True) # Dest out - self.rd_rel_o = Signal(reset_less=True) # release src1/src2 request - self.req_rel_o = Signal(reset_less=True) # release request out (valid_o) - - def elaborate(self, platform): - m = Module() - m.submodules.alu = self.alu - m.submodules.src_l = src_l = SRLatch(sync=False) - m.submodules.opc_l = opc_l = SRLatch(sync=False) - m.submodules.req_l = req_l = SRLatch(sync=False) - - # shadow/go_die - reset_w = Signal(reset_less=True) - reset_r = Signal(reset_less=True) - m.d.comb += reset_w.eq(self.go_wr_i | self.go_die_i) - m.d.comb += reset_r.eq(self.go_rd_i | self.go_die_i) - - # This is fascinating and very important to observe that this - # is in effect a "3-way revolving door". At no time may all 3 - # latches be set at the same time. - - # opcode latch (not using go_rd_i) - inverted so that busy resets to 0 - m.d.sync += opc_l.s.eq(self.issue_i) # XXX NOTE: INVERTED FROM book! - m.d.sync += opc_l.r.eq(reset_w) # XXX NOTE: INVERTED FROM book! 
- - # src operand latch (not using go_wr_i) - m.d.sync += src_l.s.eq(self.issue_i) - m.d.sync += src_l.r.eq(reset_r) - - # dest operand latch (not using issue_i) - m.d.sync += req_l.s.eq(self.go_rd_i) - m.d.sync += req_l.r.eq(reset_w) - - - # create a latch/register for the operand - oper_r = Signal(self.opwid+1, reset_less=True) # opcode reg - latchregister(m, self.oper_i, oper_r, self.issue_i) - - # and one for the output from the ALU - data_r = Signal(self.rwid, reset_less=True) # Dest register - latchregister(m, self.alu.o, data_r, req_l.q) - - # get the top 2 bits for the ALU - m.d.comb += self.alu.op.eq(oper_r[0:2]) - - # 3rd bit is whether this is an immediate or not - op_is_imm = Signal(reset_less=True) - m.d.comb += op_is_imm.eq(oper_r[2]) - - # select immediate if opcode says so. however also change the latch - # to trigger *from* the opcode latch instead. - src2_or_imm = Signal(self.rwid, reset_less=True) - src_sel = Signal(reset_less=True) - m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q)) - m.d.comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i)) - - # create a latch/register for src1/src2 - latchregister(m, self.src1_i, self.alu.a, src_l.q) - latchregister(m, src2_or_imm, self.alu.b, src_sel) - - # ----- - # outputs - # ----- - - # all request signals gated by busy_o. prevents picker problems - busy_o = self.busy_o - m.d.comb += busy_o.eq(opc_l.q) # busy out - m.d.comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel - - # on a go_read, tell the ALU we're accepting data. - # NOTE: this spells TROUBLE if the ALU isn't ready! - # go_read is only valid for one clock! - with m.If(self.go_rd_i): # src operands ready, GO! - with m.If(~self.alu.p_ready_o): # no ACK yet - m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid - - # only proceed if ALU says its output is valid - with m.If(self.alu.n_valid_o): - # when ALU ready, write req release out. 
waits for shadow - m.d.comb += self.req_rel_o.eq(req_l.q & busy_o & self.shadown_i) - # when output latch is ready, and ALU says ready, accept ALU output - with m.If(self.req_rel_o): - m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it" - - # output the data from the latch on go_write - with m.If(self.go_wr_i): - m.d.comb += self.data_o.eq(data_r) - - return m - - def __iter__(self): - yield self.go_rd_i - yield self.go_wr_i - yield self.issue_i - yield self.shadown_i - yield self.go_die_i - yield self.oper_i - yield self.imm_i - yield self.src1_i - yield self.src2_i - yield self.busy_o - yield self.rd_rel_o - yield self.req_rel_o - yield self.data_o - - def ports(self): - return list(self) - - -def scoreboard_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_read_i.eq(1) - yield - yield dut.go_read_i.eq(0) - yield - yield dut.go_write_i.eq(1) - yield - yield dut.go_write_i.eq(0) - yield - -def test_scoreboard(): - from alu_hier import ALU - alu = ALU(16) - dut = ComputationUnitNoDelay(16, 8, alu) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_compalu.il", "w") as f: - f.write(vl) - - run_simulation(dut, scoreboard_sim(dut), vcd_name='test_compalu.vcd') - -if __name__ == '__main__': - test_scoreboard() diff --git a/src/experiment/compldst.py b/src/experiment/compldst.py deleted file mode 100644 index 77ad39dd..00000000 --- a/src/experiment/compldst.py +++ /dev/null @@ -1,288 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Mux, Cat, Elaboratable - -from nmutil.latch import SRLatch, latchregister - -""" LOAD / STORE Computation Unit. 
Also capable of doing ADD and ADD immediate - - This module runs a "revolving door" set of four latches, based on - * Issue - * Go_Read - * Go_Addr - * Go_Write *OR* Go_Store - - (Note that opc_l has been inverted (and qn used), due to SRLatch - default reset state being "0" rather than "1") -""" - -# internal opcodes. hypothetically this could do more combinations. -# meanings: -# * bit 0: 0 = ADD , 1 = SUB -# * bit 1: 0 = src1, 1 = IMM -# * bit 2: 1 = LD -# * bit 3: 1 = ST -LDST_OP_ADDI = 0b0000 # plain ADD (src1 + src2) -LDST_OP_SUBI = 0b0001 # plain SUB (src1 - src2) -LDST_OP_ADD = 0b0010 # immed ADD (imm + src1) -LDST_OP_SUB = 0b0011 # immed SUB (imm - src1) -LDST_OP_ST = 0b0110 # immed ADD plus LD op. ADD result is address -LDST_OP_LD = 0b1010 # immed ADD plus ST op. ADD result is address - - -class LDSTCompUnit(Elaboratable): - """ LOAD / STORE / ADD / SUB Computation Unit - - Inputs - ------ - - * :rwid: register width - * :alu: an ALU module - * :mem: a Memory Module (read-write capable) - - Control Signals (In) - -------------------- - - * :issue_i: LD/ST is being "issued". 
- * :isalu_i: ADD/SUB is being "issued" (aka issue_alu_i) - * :shadown_i: Inverted-shadow is being held (stops STORE *and* WRITE) - * :go_rd_i: read is being actioned (latches in src regs) - * :go_ad_i: address is being actioned (triggers actual mem LD) - * :go_st_i: store is being actioned (triggers actual mem STORE) - * :go_die_i: resets the unit back to "wait for issue" - """ - def __init__(self, rwid, opwid, alu, mem): - self.opwid = opwid - self.rwid = rwid - self.alu = alu - self.mem = mem - - self.counter = Signal(4) - self.go_rd_i = Signal(reset_less=True) # go read in - self.go_ad_i = Signal(reset_less=True) # go address in - self.go_wr_i = Signal(reset_less=True) # go write in - self.go_st_i = Signal(reset_less=True) # go store in - self.issue_i = Signal(reset_less=True) # fn issue in - self.isalu_i = Signal(reset_less=True) # fn issue as ALU in - self.shadown_i = Signal(reset=1) # shadow function, defaults to ON - self.go_die_i = Signal() # go die (reset) - - self.oper_i = Signal(opwid, reset_less=True) # opcode in - self.imm_i = Signal(rwid, reset_less=True) # immediate in - self.src1_i = Signal(rwid, reset_less=True) # oper1 in - self.src2_i = Signal(rwid, reset_less=True) # oper2 in - - self.busy_o = Signal(reset_less=True) # fn busy out - self.rd_rel_o = Signal(reset_less=True) # request src1/src2 - self.adr_rel_o = Signal(reset_less=True) # request address (from mem) - self.sto_rel_o = Signal(reset_less=True) # request store (to mem) - self.req_rel_o = Signal(reset_less=True) # request write (result) - self.data_o = Signal(rwid, reset_less=True) # Dest out (LD or ALU) - self.addr_o = Signal(rwid, reset_less=True) # Address out (LD or ST) - - # hmm... TODO... 
move these to outside of LDSTCompUnit - self.load_mem_o = Signal(reset_less=True) # activate memory LOAD - self.stwd_mem_o = Signal(reset_less=True) # activate memory STORE - self.ld_o = Signal(reset_less=True) # operation is a LD - self.st_o = Signal(reset_less=True) # operation is a ST - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - sync = m.d.sync - - m.submodules.alu = self.alu - m.submodules.src_l = src_l = SRLatch(sync=False) - m.submodules.opc_l = opc_l = SRLatch(sync=False) - m.submodules.adr_l = adr_l = SRLatch(sync=False) - m.submodules.req_l = req_l = SRLatch(sync=False) - m.submodules.sto_l = sto_l = SRLatch(sync=False) - - # shadow/go_die - reset_b = Signal(reset_less=True) - reset_w = Signal(reset_less=True) - reset_a = Signal(reset_less=True) - reset_s = Signal(reset_less=True) - reset_r = Signal(reset_less=True) - comb += reset_b.eq(self.go_st_i | self.go_wr_i | self.go_die_i) - comb += reset_w.eq(self.go_wr_i | self.go_die_i) - comb += reset_s.eq(self.go_st_i | self.go_die_i) - comb += reset_r.eq(self.go_rd_i | self.go_die_i) - # this one is slightly different, issue_alu_i selects go_wr_i) - a_sel = Mux(self.isalu_i, self.go_wr_i, self.go_ad_i) - comb += reset_a.eq(a_sel| self.go_die_i) - - # opcode decode - op_alu = Signal(reset_less=True) - op_is_ld = Signal(reset_less=True) - op_is_st = Signal(reset_less=True) - op_ldst = Signal(reset_less=True) - op_is_imm = Signal(reset_less=True) - - # select immediate or src2 reg to add - src2_or_imm = Signal(self.rwid, reset_less=True) - src_sel = Signal(reset_less=True) - - # issue can be either issue_i or issue_alu_i (isalu_i) - issue_i = Signal(reset_less=True) - comb += issue_i.eq(self.issue_i | self.isalu_i) - - # Ripple-down the latches, each one set cancels the previous. - # NOTE: use sync to stop combinatorial loops. - - # opcode latch - inverted so that busy resets to 0 - sync += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book! 
- sync += opc_l.r.eq(reset_b) # XXX NOTE: INVERTED FROM book! - - # src operand latch - sync += src_l.s.eq(issue_i) - sync += src_l.r.eq(reset_r) - - # addr latch - sync += adr_l.s.eq(self.go_rd_i) - sync += adr_l.r.eq(reset_a) - - # dest operand latch - sync += req_l.s.eq(self.go_ad_i) - sync += req_l.r.eq(reset_w) - - # store latch - sync += sto_l.s.eq(self.go_ad_i) - sync += sto_l.r.eq(reset_s) - - # outputs: busy and release signals - busy_o = self.busy_o - comb += self.busy_o.eq(opc_l.q) # busy out - comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel - comb += self.sto_rel_o.eq(sto_l.q & busy_o & self.shadown_i & op_is_st) - - # request release enabled based on if op is a LD/ST or a plain ALU - # if op is an ADD/SUB or a LD, req_rel activates. - wr_q = Signal(reset_less=True) - comb += wr_q.eq(req_l.q & (~op_ldst | op_is_ld)) - - alulatch = Signal(reset_less=True) - comb += alulatch.eq((op_ldst & self.adr_rel_o) | \ - (~op_ldst & self.req_rel_o)) - - # only proceed if ALU says its output is valid - with m.If(self.alu.n_valid_o): - - # write req release out. waits until shadow is dropped. - comb += self.req_rel_o.eq(wr_q & busy_o & self.shadown_i) - # address release only happens on LD/ST, and is shadowed. - comb += self.adr_rel_o.eq(adr_l.q & op_ldst & busy_o & \ - self.shadown_i) - # when output latch is ready, and ALU says ready, accept ALU output - with m.If(self.req_rel_o): - m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it" - - # select immediate if opcode says so. however also change the latch - # to trigger *from* the opcode latch instead. 
- comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q)) - comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i)) - - # create a latch/register for src1/src2 (include immediate select) - latchregister(m, self.src1_i, self.alu.a, src_l.q) - latchregister(m, src2_or_imm, self.alu.b, src_sel) - - # create a latch/register for the operand - oper_r = Signal(self.opwid, reset_less=True) # Dest register - latchregister(m, self.oper_i, oper_r, self.issue_i) - alu_op = Cat(op_alu, 0, op_is_imm) # using alu_hier, here. - comb += self.alu.op.eq(alu_op) - - # and one for the output from the ALU - data_r = Signal(self.rwid, reset_less=True) # Dest register - latchregister(m, self.alu.o, data_r, alulatch) - - # decode bits of operand (latched) - comb += op_alu.eq(oper_r[0]) - comb += op_is_imm.eq(oper_r[1]) - comb += op_is_ld.eq(oper_r[2]) - comb += op_is_st.eq(oper_r[3]) - comb += op_ldst.eq(op_is_ld | op_is_st) - comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i) - comb += self.stwd_mem_o.eq(op_is_st & self.go_st_i) - comb += self.ld_o.eq(op_is_ld) - comb += self.st_o.eq(op_is_st) - - # on a go_read, tell the ALU we're accepting data. - # NOTE: this spells TROUBLE if the ALU isn't ready! - # go_read is only valid for one clock! - with m.If(self.go_rd_i): # src operands ready, GO! 
- with m.If(~self.alu.p_ready_o): # no ACK yet - m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid - - # put the register directly onto the output bus on a go_write - with m.If(self.go_wr_i): - comb += self.data_o.eq(data_r) - - # put the register directly onto the address bus - with m.If(self.go_ad_i): - comb += self.addr_o.eq(data_r) - - return m - - def __iter__(self): - yield self.go_rd_i - yield self.go_ad_i - yield self.go_wr_i - yield self.go_st_i - yield self.issue_i - yield self.isalu_i - yield self.shadown_i - yield self.go_die_i - yield self.oper_i - yield self.imm_i - yield self.src1_i - yield self.src2_i - yield self.busy_o - yield self.rd_rel_o - yield self.adr_rel_o - yield self.sto_rel_o - yield self.req_rel_o - yield self.data_o - yield self.load_mem_o - yield self.stwd_mem_o - - def ports(self): - return list(self) - - -def scoreboard_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_read_i.eq(1) - yield - yield dut.go_read_i.eq(0) - yield - yield dut.go_write_i.eq(1) - yield - yield dut.go_write_i.eq(0) - yield - - -def test_scoreboard(): - from alu_hier import ALU - alu = ALU(16) - mem = alu # fake - dut = LDSTCompUnit(16, 4, alu, mem) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_ldst_comp.il", "w") as f: - f.write(vl) - - run_simulation(dut, scoreboard_sim(dut), vcd_name='test_ldst_comp.vcd') - -if __name__ == '__main__': - test_scoreboard() diff --git a/src/experiment/cscore.py b/src/experiment/cscore.py deleted file mode 100644 index 18b71c80..00000000 --- a/src/experiment/cscore.py +++ /dev/null @@ -1,435 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Const, Signal, Array, Cat, Elaboratable - -from regfile.regfile import RegFileArray, treereduce -from scoreboard.fn_unit 
import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit -from scoreboard.fu_fu_matrix import FUFUDepMatrix -from scoreboard.fu_reg_matrix import FURegDepMatrix -from scoreboard.global_pending import GlobalPending -from scoreboard.group_picker import GroupPicker -from scoreboard.issue_unit import IntFPIssueUnit, RegDecode - -from compalu import ComputationUnitNoDelay - -from alu_hier import ALU -from nmutil.latch import SRLatch - -from random import randint - - -class Scoreboard(Elaboratable): - def __init__(self, rwid, n_regs): - """ Inputs: - - * :rwid: bit width of register file(s) - both FP and INT - * :n_regs: depth of register file(s) - number of FP and INT regs - """ - self.rwid = rwid - self.n_regs = n_regs - - # Register Files - self.intregs = RegFileArray(rwid, n_regs) - self.fpregs = RegFileArray(rwid, n_regs) - - # inputs - self.int_store_i = Signal(reset_less=True) # instruction is a store - self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in - self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in - self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in - - self.issue_o = Signal(reset_less=True) # instruction was accepted - - def elaborate(self, platform): - m = Module() - - m.submodules.intregs = self.intregs - m.submodules.fpregs = self.fpregs - - # register ports - int_dest = self.intregs.write_port("dest") - int_src1 = self.intregs.read_port("src1") - int_src2 = self.intregs.read_port("src2") - - fp_dest = self.fpregs.write_port("dest") - fp_src1 = self.fpregs.read_port("src1") - fp_src2 = self.fpregs.read_port("src2") - - # Int ALUs - add = ALU(self.rwid) - sub = ALU(self.rwid) - m.submodules.comp1 = comp1 = ComputationUnitNoDelay(self.rwid, 1, add) - m.submodules.comp2 = comp2 = ComputationUnitNoDelay(self.rwid, 1, sub) - int_alus = [comp1, comp2] - - m.d.comb += comp1.oper_i.eq(Const(0)) # temporary/experiment: op=add - m.d.comb += comp2.oper_i.eq(Const(1)) # temporary/experiment: op=sub - - # Int FUs - if_l = [] 
- int_src1_pend_v = [] - int_src2_pend_v = [] - int_rd_pend_v = [] - int_wr_pend_v = [] - for i, a in enumerate(int_alus): - # set up Integer Function Unit, add to module (and python list) - fu = IntFnUnit(self.n_regs, shadow_wid=0) - setattr(m.submodules, "intfu%d" % i, fu) - if_l.append(fu) - # collate the read/write pending vectors (to go into global pending) - int_src1_pend_v.append(fu.src1_pend_o) - int_src2_pend_v.append(fu.src2_pend_o) - int_rd_pend_v.append(fu.int_rd_pend_o) - int_wr_pend_v.append(fu.int_wr_pend_o) - int_fus = Array(if_l) - - # Count of number of FUs - n_int_fus = len(if_l) - n_fp_fus = 0 # for now - - n_fus = n_int_fus + n_fp_fus # plus FP FUs - - # XXX replaced by array of FUs? *FnUnit - # # Integer FU-FU Dep Matrix - # m.submodules.intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus) - # Integer FU-Reg Dep Matrix - # intregdeps = FURegDepMatrix(self.n_regs, n_int_fus) - # m.submodules.intregdeps = intregdeps - - # Integer Priority Picker 1: Adder + Subtractor - intpick1 = GroupPicker(2) # picks between add and sub - m.submodules.intpick1 = intpick1 - - # Global Pending Vectors (INT and FP) - # NOTE: number of vectors is NOT same as number of FUs. 
- g_int_src1_pend_v = GlobalPending(self.n_regs, int_src1_pend_v) - g_int_src2_pend_v = GlobalPending(self.n_regs, int_src2_pend_v) - g_int_rd_pend_v = GlobalPending(self.n_regs, int_rd_pend_v, True) - g_int_wr_pend_v = GlobalPending(self.n_regs, int_wr_pend_v, True) - m.submodules.g_int_src1_pend_v = g_int_src1_pend_v - m.submodules.g_int_src2_pend_v = g_int_src2_pend_v - m.submodules.g_int_rd_pend_v = g_int_rd_pend_v - m.submodules.g_int_wr_pend_v = g_int_wr_pend_v - - # INT/FP Issue Unit - regdecode = RegDecode(self.n_regs) - m.submodules.regdecode = regdecode - issueunit = IntFPIssueUnit(self.n_regs, n_int_fus, n_fp_fus) - m.submodules.issueunit = issueunit - - # FU-FU Dependency Matrices - intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus) - m.submodules.intfudeps = intfudeps - - #--------- - # ok start wiring things together... - # "now hear de word of de looord... dem bones dem bones dem dryy bones" - # https://www.youtube.com/watch?v=pYb8Wm6-QfA - #--------- - - #--------- - # Issue Unit is where it starts. 
set up some in/outs for this module - #--------- - m.d.comb += [issueunit.i.store_i.eq(self.int_store_i), - regdecode.dest_i.eq(self.int_dest_i), - regdecode.src1_i.eq(self.int_src1_i), - regdecode.src2_i.eq(self.int_src2_i), - regdecode.enable_i.eq(1), - self.issue_o.eq(issueunit.issue_o), - issueunit.i.dest_i.eq(regdecode.dest_o), - ] - self.int_insn_i = issueunit.i.insn_i # enabled by instruction decode - - # connect global rd/wr pending vectors - m.d.comb += issueunit.i.g_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o) - # TODO: issueunit.f (FP) - - # and int function issue / busy arrays, and dest/src1/src2 - fn_issue_l = [] - fn_busy_l = [] - for i, fu in enumerate(if_l): - fn_issue_l.append(fu.issue_i) - fn_busy_l.append(fu.busy_o) - m.d.sync += fu.issue_i.eq(issueunit.i.fn_issue_o[i]) - m.d.sync += fu.dest_i.eq(self.int_dest_i) - m.d.sync += fu.src1_i.eq(self.int_src1_i) - m.d.sync += fu.src2_i.eq(self.int_src2_i) - # XXX sync, so as to stop a simulation infinite loop - m.d.comb += issueunit.i.busy_i[i].eq(fu.busy_o) - - #--------- - # connect Function Units - #--------- - - # Group Picker... done manually for now. 
TODO: cat array of pick sigs - m.d.comb += if_l[0].go_rd_i.eq(intpick1.go_rd_o[0]) # add rd - m.d.comb += if_l[0].go_wr_i.eq(intpick1.go_wr_o[0]) # add wr - - m.d.comb += if_l[1].go_rd_i.eq(intpick1.go_rd_o[1]) # subtract rd - m.d.comb += if_l[1].go_wr_i.eq(intpick1.go_wr_o[1]) # subtract wr - - # create read-pending FU-FU vectors - intfu_rd_pend_v = Signal(n_int_fus, reset_less = True) - intfu_wr_pend_v = Signal(n_int_fus, reset_less = True) - for i in range(n_int_fus): - #m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_rd_pend_o.bool()) - #m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_wr_pend_o.bool()) - m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_readable_o) - m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_writable_o) - - # Connect INT Fn Unit global wr/rd pending - for fu in if_l: - m.d.comb += fu.g_int_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o) - m.d.comb += fu.g_int_rd_pend_i.eq(g_int_rd_pend_v.g_pend_o) - - # Connect FU-FU Matrix, NOTE: FN Units readable/writable considered - # to be unit "read-pending / write-pending" - m.d.comb += intfudeps.rd_pend_i.eq(intfu_rd_pend_v) - m.d.comb += intfudeps.wr_pend_i.eq(intfu_wr_pend_v) - m.d.comb += intfudeps.issue_i.eq(issueunit.i.fn_issue_o) - for i in range(n_int_fus): - m.d.comb += intfudeps.go_rd_i[i].eq(intpick1.go_rd_o[i]) - m.d.comb += intfudeps.go_wr_i[i].eq(intpick1.go_wr_o[i]) - - # Connect Picker (note connection to FU-FU) - #--------- - readable_o = intfudeps.readable_o - writable_o = intfudeps.writable_o - m.d.comb += intpick1.rd_rel_i[0].eq(int_alus[0].rd_rel_o) - m.d.comb += intpick1.rd_rel_i[1].eq(int_alus[1].rd_rel_o) - m.d.comb += intpick1.req_rel_i[0].eq(int_alus[0].req_rel_o) - m.d.comb += intpick1.req_rel_i[1].eq(int_alus[1].req_rel_o) - m.d.comb += intpick1.readable_i[0].eq(readable_o[0]) # add rd - m.d.comb += intpick1.writable_i[0].eq(writable_o[0]) # add wr - m.d.comb += intpick1.readable_i[1].eq(readable_o[1]) # sub rd - m.d.comb += intpick1.writable_i[1].eq(writable_o[1]) # sub wr - - #--------- 
- # Connect Register File(s) - #--------- - #with m.If(if_l[0].go_wr_i | if_l[1].go_wr_i): - m.d.sync += int_dest.wen.eq(g_int_wr_pend_v.g_pend_o) - #with m.If(intpick1.go_rd_o): - #with m.If(if_l[0].go_rd_i | if_l[1].go_rd_i): - m.d.sync += int_src1.ren.eq(g_int_src1_pend_v.g_pend_o) - m.d.sync += int_src2.ren.eq(g_int_src2_pend_v.g_pend_o) - - # merge (OR) all integer FU / ALU outputs to a single value - # bit of a hack: treereduce needs a list with an item named "dest_o" - dest_o = treereduce(int_alus) - m.d.sync += int_dest.data_i.eq(dest_o) - - # connect ALUs - for i, alu in enumerate(int_alus): - m.d.comb += alu.go_rd_i.eq(intpick1.go_rd_o[i]) - m.d.comb += alu.go_wr_i.eq(intpick1.go_wr_o[i]) - m.d.comb += alu.issue_i.eq(fn_issue_l[i]) - #m.d.comb += fn_busy_l[i].eq(alu.busy_o) # XXX ignore, use fnissue - m.d.comb += alu.src1_i.eq(int_src1.data_o) - m.d.comb += alu.src2_i.eq(int_src2.data_o) - m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o) # pipe out ready - - return m - - - def __iter__(self): - yield from self.intregs - yield from self.fpregs - yield self.int_store_i - yield self.int_dest_i - yield self.int_src1_i - yield self.int_src2_i - yield self.issue_o - #yield from self.int_src1 - #yield from self.int_dest - #yield from self.int_src1 - #yield from self.int_src2 - #yield from self.fp_dest - #yield from self.fp_src1 - #yield from self.fp_src2 - - def ports(self): - return list(self) - -IADD = 0 -ISUB = 1 - -class RegSim: - def __init__(self, rwidth, nregs): - self.rwidth = rwidth - self.regs = [0] * nregs - - def op(self, op, src1, src2, dest): - src1 = self.regs[src1] - src2 = self.regs[src2] - if op == IADD: - val = (src1 + src2) & ((1<<(self.rwidth))-1) - elif op == ISUB: - val = (src1 - src2) & ((1<<(self.rwidth))-1) - self.regs[dest] = val - - def setval(self, dest, val): - self.regs[dest] = val - - def dump(self, dut): - for i, val in enumerate(self.regs): - reg = yield dut.intregs.regs[i].reg - okstr = "OK" if reg == val else "!ok" - print("reg 
%d expected %x received %x %s" % (i, val, reg, okstr)) - - def check(self, dut): - for i, val in enumerate(self.regs): - reg = yield dut.intregs.regs[i].reg - if reg != val: - print("reg %d expected %x received %x\n" % (i, val, reg)) - yield from self.dump(dut) - assert False - -def int_instr(dut, alusim, op, src1, src2, dest): - for i in range(len(dut.int_insn_i)): - yield dut.int_insn_i[i].eq(0) - yield dut.int_dest_i.eq(dest) - yield dut.int_src1_i.eq(src1) - yield dut.int_src2_i.eq(src2) - yield dut.int_insn_i[op].eq(1) - alusim.op(op, src1, src2, dest) - - -def print_reg(dut, rnums): - rs = [] - for rnum in rnums: - reg = yield dut.intregs.regs[rnum].reg - rs.append("%x" % reg) - rnums = map(str, rnums) - print ("reg %s: %s" % (','.join(rnums), ','.join(rs))) - - -def scoreboard_sim(dut, alusim): - yield dut.int_store_i.eq(0) - - for i in range(1, dut.n_regs): - yield dut.intregs.regs[i].reg.eq(i) - alusim.setval(i, i) - - if False: - yield from int_instr(dut, alusim, IADD, 4, 3, 5) - yield from print_reg(dut, [3,4,5]) - yield - yield from int_instr(dut, alusim, IADD, 5, 2, 5) - yield from print_reg(dut, [3,4,5]) - yield - yield from int_instr(dut, alusim, ISUB, 5, 1, 3) - yield from print_reg(dut, [3,4,5]) - yield - for i in range(len(dut.int_insn_i)): - yield dut.int_insn_i[i].eq(0) - yield from print_reg(dut, [3,4,5]) - yield - yield from print_reg(dut, [3,4,5]) - yield - yield from print_reg(dut, [3,4,5]) - yield - - yield from alusim.check(dut) - - for i in range(2): - src1 = randint(1, dut.n_regs-1) - src2 = randint(1, dut.n_regs-1) - while True: - dest = randint(1, dut.n_regs-1) - break - if dest not in [src1, src2]: - break - op = randint(0, 1) - if False: - if i % 2 == 0: - src1 = 6 - src2 = 6 - dest = 1 - else: - src1 = 1 - src2 = 7 - dest = 2 - #src1 = 2 - #src2 = 3 - #dest = 2 - - op = i - - if True: - if i == 0: - src1 = 2 - src2 = 3 - dest = 3 - else: - src1 = 5 - src2 = 3 - dest = 4 - - #op = (i+1) % 2 - op = i - - print ("random %d: %d %d %d 
%d\n" % (i, op, src1, src2, dest)) - yield from int_instr(dut, alusim, op, src1, src2, dest) - yield from print_reg(dut, [3,4,5]) - while True: - yield - issue_o = yield dut.issue_o - if issue_o: - yield from print_reg(dut, [3,4,5]) - for i in range(len(dut.int_insn_i)): - yield dut.int_insn_i[i].eq(0) - break - print ("busy",) - yield from print_reg(dut, [3,4,5]) - yield - yield - yield - - - yield - yield from print_reg(dut, [3,4,5]) - yield - yield from print_reg(dut, [3,4,5]) - yield - yield from print_reg(dut, [3,4,5]) - yield - yield from print_reg(dut, [3,4,5]) - yield - yield - yield - yield - yield - yield - yield - yield - yield - yield from alusim.check(dut) - yield from alusim.dump(dut) - - -def explore_groups(dut): - from nmigen.hdl.ir import Fragment - from nmigen.hdl.xfrm import LHSGroupAnalyzer - - fragment = dut.elaborate(platform=None) - fr = Fragment.get(fragment, platform=None) - - groups = LHSGroupAnalyzer()(fragment._statements) - - print (groups) - - -def test_scoreboard(): - dut = Scoreboard(16, 8) - alusim = RegSim(16, 8) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_scoreboard.il", "w") as f: - f.write(vl) - - run_simulation(dut, scoreboard_sim(dut, alusim), - vcd_name='test_scoreboard.vcd') - - -if __name__ == '__main__': - test_scoreboard() diff --git a/src/experiment/score6600.py b/src/experiment/score6600.py deleted file mode 100644 index 209bc99c..00000000 --- a/src/experiment/score6600.py +++ /dev/null @@ -1,1296 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen.hdl.ast import unsigned -from nmigen import Module, Const, Signal, Array, Cat, Elaboratable, Memory - -from regfile.regfile import RegFileArray, treereduce -from scoreboard.fu_fu_matrix import FUFUDepMatrix -from scoreboard.fu_reg_matrix import FURegDepMatrix -from scoreboard.global_pending import GlobalPending -from scoreboard.group_picker import GroupPicker -from scoreboard.issue_unit import 
IssueUnitGroup, IssueUnitArray, RegDecode -from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord -from scoreboard.instruction_q import Instruction, InstructionQ -from scoreboard.memfu import MemFunctionUnits - -from compalu import ComputationUnitNoDelay -from compldst import LDSTCompUnit - -from alu_hier import ALU, BranchALU -from nmutil.latch import SRLatch -from nmutil.nmoperator import eq - -from random import randint, seed -from copy import deepcopy -from math import log - - -class TestMemory(Elaboratable): - def __init__(self, regwid, addrw): - self.ddepth = 1 # regwid //8 - depth = (1<>self.ddepth] - - def st(self, addr, data): - self.mem[addr>>self.ddepth] = data & ((1< Mem FUs - comb += memfus.addr_en_i.eq(cul.adr_rel_o) # Match enable on adr rel - comb += memfus.addr_rs_i.eq(reset_b) # reset same as LDSTCompUnit - - # LD/STs have to accumulate prior LD/STs (TODO: multi-issue as well, - # in a transitive fashion). This cycle activates based on LDSTCompUnit - # issue_i. multi-issue gets a bit more complex but not a lot. - prior_ldsts = Signal(cul.n_units, reset_less=True) - sync += prior_ldsts.eq(memfus.g_int_ld_pend_o | memfus.g_int_st_pend_o) - with m.If(self.ls_oper_i[2]): # LD bit of operand - comb += memfus.ld_i.eq(cul.issue_i | prior_ldsts) - with m.If(self.ls_oper_i[3]): # ST bit of operand - comb += memfus.st_i.eq(cul.issue_i | prior_ldsts) - - # TODO: adr_rel_o needs to go into L1 Cache. for now, - # just immediately activate go_adr - comb += cul.go_ad_i.eq(cul.adr_rel_o) - - # connect up address data - comb += memfus.addrs_i[0].eq(cul.units[0].addr_o) - comb += memfus.addrs_i[1].eq(cul.units[1].addr_o) - - # connect loadable / storable to go_ld/go_st. - # XXX should only be done when the memory ld/st has actually happened! 
- go_st_i = Signal(cul.n_units, reset_less=True) - go_ld_i = Signal(cul.n_units, reset_less=True) - comb += go_ld_i.eq(memfus.loadable_o & memfus.addr_nomatch_o &\ - cul.req_rel_o & cul.ld_o) - comb += go_st_i.eq(memfus.storable_o & memfus.addr_nomatch_o &\ - cul.sto_rel_o & cul.st_o) - comb += memfus.go_ld_i.eq(go_ld_i) - comb += memfus.go_st_i.eq(go_st_i) - #comb += cul.go_wr_i.eq(go_ld_i) - comb += cul.go_st_i.eq(go_st_i) - - #comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) - #comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) - #comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus]) - - #--------- - # merge shadow matrices outputs - #--------- - - # these are explained in ShadowMatrix docstring, and are to be - # connected to the FUReg and FUFU Matrices, to get them to reset - anydie = Signal(n_intfus, reset_less=True) - allshadown = Signal(n_intfus, reset_less=True) - shreset = Signal(n_intfus, reset_less=True) - comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o) - comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o) - comb += shreset.eq(bspec.match_g_o | bspec.match_f_o) - - #--------- - # connect fu-fu matrix - #--------- - - # Group Picker... done manually for now. 
- go_rd_o = intpick1.go_rd_o - go_wr_o = intpick1.go_wr_o - go_rd_i = intfus.go_rd_i - go_wr_i = intfus.go_wr_i - go_die_i = intfus.go_die_i - # NOTE: connect to the shadowed versions so that they can "die" (reset) - comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd - comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr - comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die - - # Connect Picker - #--------- - comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus]) - comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus]) - int_rd_o = intfus.readable_o - int_wr_o = intfus.writable_o - comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus]) - comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus]) - - #--------- - # Shadow Matrix - #--------- - - comb += shadows.issue_i.eq(fn_issue_o) - #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus]) - comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus]) - #--------- - # NOTE; this setup is for the instruction order preservation... - - # connect shadows / go_dies to Computation Units - comb += cu.shadown_i[0:n_intfus].eq(allshadown) - comb += cu.go_die_i[0:n_intfus].eq(anydie) - - # ok connect first n_int_fu shadows to busy lines, to create an - # instruction-order linked-list-like arrangement, using a bit-matrix - # (instead of e.g. a ring buffer). - - # when written, the shadow can be cancelled (and was good) - for i in range(n_intfus): - comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus]) - - # *previous* instruction shadows *current* instruction, and, obviously, - # if the previous is completed (!busy) don't cast the shadow! - comb += prev_shadow.eq(~fn_issue_o & cu.busy_o) - for i in range(n_intfus): - comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow) - - #--------- - # ... and this is for branch speculation. 
it uses the extra bit - # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1) - # only needs to set shadow_i, s_fail_i and s_good_i - - # issue captures shadow_i (if enabled) - comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus]) - - bactive = Signal(reset_less=True) - comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i) - - # instruction being issued (fn_issue_o) has a shadow cast by the branch - with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)): - comb += bshadow.issue_i.eq(fn_issue_o) - for i in range(n_intfus): - with m.If(fn_issue_o & (Const(1<> (src2 & maxbits) - elif op == IBGT: - val = int(src1 > src2) - elif op == IBLT: - val = int(src1 < src2) - elif op == IBEQ: - val = int(src1 == src2) - elif op == IBNE: - val = int(src1 != src2) - else: - return 0 # LD/ST TODO - val &= maxbits - self.setval(dest, val) - return val - - def setval(self, dest, val): - print ("sim setval", dest, hex(val)) - self.regs[dest] = val - - def dump(self, dut): - for i, val in enumerate(self.regs): - reg = yield dut.intregs.regs[i].reg - okstr = "OK" if reg == val else "!ok" - print("reg %d expected %x received %x %s" % (i, val, reg, okstr)) - - def check(self, dut): - for i, val in enumerate(self.regs): - reg = yield dut.intregs.regs[i].reg - if reg != val: - print("reg %d expected %x received %x\n" % (i, val, reg)) - yield from self.dump(dut) - assert False - -def instr_q(dut, op, op_imm, imm, src1, src2, dest, - branch_success, branch_fail): - instrs = [{'oper_i': op, 'dest_i': dest, 'imm_i': imm, 'opim_i': op_imm, - 'src1_i': src1, 'src2_i': src2}] - - sendlen = 1 - for idx in range(sendlen): - yield from eq(dut.data_i[idx], instrs[idx]) - di = yield dut.data_i[idx] - print ("senddata %d %x" % (idx, di)) - yield dut.p_add_i.eq(sendlen) - yield - o_p_ready = yield dut.p_ready_o - while not o_p_ready: - yield - o_p_ready = yield dut.p_ready_o - - yield dut.p_add_i.eq(0) - - -def int_instr(dut, op, imm, src1, src2, dest, branch_success, 
branch_fail): - yield from disable_issue(dut) - yield dut.int_dest_i.eq(dest) - yield dut.int_src1_i.eq(src1) - yield dut.int_src2_i.eq(src2) - if (op & (0x3<<2)) != 0: # branch - yield dut.brissue.insn_i.eq(1) - yield dut.br_oper_i.eq(Const(op & 0x3, 2)) - yield dut.br_imm_i.eq(imm) - dut_issue = dut.brissue - else: - yield dut.aluissue.insn_i.eq(1) - yield dut.alu_oper_i.eq(Const(op & 0x3, 2)) - yield dut.alu_imm_i.eq(imm) - dut_issue = dut.aluissue - yield dut.reg_enable_i.eq(1) - - # these indicate that the instruction is to be made shadow-dependent on - # (either) branch success or branch fail - yield dut.branch_fail_i.eq(branch_fail) - yield dut.branch_succ_i.eq(branch_success) - - yield - yield from wait_for_issue(dut, dut_issue) - - -def print_reg(dut, rnums): - rs = [] - for rnum in rnums: - reg = yield dut.intregs.regs[rnum].reg - rs.append("%x" % reg) - rnums = map(str, rnums) - print ("reg %s: %s" % (','.join(rnums), ','.join(rs))) - - -def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3): - insts = [] - for i in range(n_ops): - src1 = randint(1, dut.n_regs-1) - src2 = randint(1, dut.n_regs-1) - imm = randint(1, (1<= 4 - if is_branch: - branch_ok, branch_fail = dest - dest = src2 - # ok zip up the branch success / fail instructions and - # drop them into the queue, one marked "to have branch success" - # the other to be marked shadow branch "fail". 
- # one out of each of these will be cancelled - for ok, fl in zip(branch_ok, branch_fail): - if ok: - instrs.append((ok[0], ok[1], ok[2], ok[3], (1, 0))) - if fl: - instrs.append((fl[0], fl[1], fl[2], fl[3], (0, 1))) - print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \ - (i, src1, src2, dest, op, shadow_on, shadow_off)) - yield from int_instr(dut, op, src1, src2, dest, - shadow_on, shadow_off) - - # wait for all instructions to stop before checking - yield - yield from wait_for_busy_clear(dut) - - i = -1 - while siminsts: - instr = siminsts.pop(0) - if instr is None: - continue - (src1, src2, dest, op, (shadow_on, shadow_off)) = instr - i += 1 - is_branch = op >= 4 - if is_branch: - branch_ok, branch_fail = dest - dest = src2 - print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \ - (i, src1, src2, dest, op, shadow_on, shadow_off)) - branch_res = alusim.op(op, src1, src2, dest) - if is_branch: - if branch_res: - siminsts += branch_ok - else: - siminsts += branch_fail - - # check status - yield from alusim.check(dut) - yield from alusim.dump(dut) - - -def scoreboard_sim(dut, alusim): - - seed(0) - - for i in range(1): - - # set random values in the registers - for i in range(1, dut.n_regs): - val = randint(0, (1< 1) || (block_forwarding == 1'b1); - - generate - if (ENABLE_L2TLB == 1) begin : HUM_BUFFER - - axi_buffer_rab_bram - #( - .DATA_WIDTH ( BUFFER_WIDTH ), - .BUFFER_DEPTH ( HUM_BUFFER_DEPTH ) - ) - u_hum_buf - ( - .clk ( axi4_aclk ), - .rstn ( axi4_arstn ), - // Push - .data_in ( {axi4_wuser, axi4_wstrb, axi4_wdata, axi4_wlast} ), - .valid_in ( hum_buf_valid_in ), - .ready_out ( hum_buf_ready_out ), - // Pop - .data_out ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ), - .valid_out ( hum_buf_valid_out ), - .ready_in ( hum_buf_ready_in ), - // Clear - .almost_full ( hum_buf_almost_full ), - .underfull ( hum_buf_underfull ), - .drop_req ( hum_buf_drop_req_SP ), - .drop_len ( hum_buf_drop_len_SP ) - ); - - axi_buffer_rab - #( - .DATA_WIDTH ( 
2+AXI_ID_WIDTH+8+3 ), - .BUFFER_DEPTH ( L2_FIFO_DEPTH ) - ) - u_l2_fifo - ( - .clk ( axi4_aclk ), - .rstn ( axi4_arstn ), - // Push - .data_in ( {l2_prefetch_i, l2_hit_i, l2_id_i, l2_len_i, l2_master_i, l2_accept_i, l2_drop_i} ), - .valid_in ( l2_fifo_valid_in ), - .ready_out ( l2_fifo_ready_out ), - // Pop - .data_out ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur} ), - .valid_out ( l2_fifo_valid_out ), - .ready_in ( l2_fifo_ready_in ) - ); - - // Push upon receiving new result from TLB. - assign l2_req = l2_accept_i | l2_drop_i; - assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out; - - assign wlast_in = axi4_wlast & hum_buf_valid_in & hum_buf_ready_out; - assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in; - - always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin - if (axi4_arstn == 0) begin - fifo_select_SP <= 1'b0; - hum_buf_drop_len_SP <= 'b0; - hum_buf_drop_req_SP <= 1'b0; - hum_buf_SP <= STORE; - n_wlast_SP <= 'b0; - end else begin - fifo_select_SP <= fifo_select_SN; - hum_buf_drop_len_SP <= hum_buf_drop_len_SN; - hum_buf_drop_req_SP <= hum_buf_drop_req_SN; - hum_buf_SP <= hum_buf_SN; - n_wlast_SP <= n_wlast_SN; - end - end - - always_comb begin - n_wlast_SN = n_wlast_SP; - if (hum_buf_drop_req_SP) begin // Happens exactly once per burst to be dropped. 
- n_wlast_SN -= 1; - end - if (wlast_in) begin - n_wlast_SN += 1; - end - if (wlast_out) begin - n_wlast_SN -= 1; - end - end - - always_comb begin : HUM_BUFFER_FSM - hum_buf_SN = hum_buf_SP; - - m_axi4_wlast = 1'b0; - m_axi4_wdata = 'b0; - m_axi4_wstrb = 'b0; - m_axi4_wuser = 'b0; - - m_axi4_wvalid = 1'b0; - axi4_wready = 1'b0; - - hum_buf_valid_in = 1'b0; - hum_buf_ready_in = 1'b0; - - hum_buf_drop_req_SN = hum_buf_drop_req_SP; - hum_buf_drop_len_SN = hum_buf_drop_len_SP; - master_select_o = 1'b0; - - w_done = 1'b0; // read from FIFO without handshake with B sender - b_drop_o = 1'b0; // send data from FIFO to B sender (with handshake) - fifo_select = 1'b0; - - fifo_select_SN = fifo_select_SP; - stop_store = 1'b0; - - block_forwarding = 1'b0; - - unique case (hum_buf_SP) - - STORE : begin - // Simply store the data in the buffer. - hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out; - axi4_wready = hum_buf_ready_out; - - // We have got a full burst in the HUM buffer, thus stop storing. - if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin - hum_buf_SN = WAIT_L1_BYPASS_YES; - - // The buffer is full, thus wait for decision. - end else if (~hum_buf_ready_out) begin - hum_buf_SN = WAIT_L1_BYPASS_NO; - end - - // Avoid the forwarding of L1 hits until we know whether we can bypass. - if (l1_fifo_valid_out & l1_save_cur) begin - block_forwarding = 1'b1; - end - end - - WAIT_L1_BYPASS_YES : begin - // Wait for orders from L1 TLB. - if (l1_fifo_valid_out) begin - - // L1 hit - forward data from buffer - if (l1_accept_cur) begin - m_axi4_wlast = hum_buf_wlast; - m_axi4_wdata = hum_buf_wdata; - m_axi4_wstrb = hum_buf_wstrb; - m_axi4_wuser = hum_buf_wuser; - - m_axi4_wvalid = hum_buf_valid_out; - hum_buf_ready_in = m_axi4_wready; - - master_select_o = l1_master_cur; - - // Detect last data beat. 
- if (wlast_out) begin - fifo_select = 1'b0; - w_done = 1'b1; - hum_buf_SN = STORE; - end - - // L1 miss - wait for L2 - end else if (l1_save_cur) begin - fifo_select = 1'b0; - w_done = 1'b1; - hum_buf_SN = WAIT_L2_BYPASS_YES; - - // L1 prefetch, prot, multi - drop data - end else if (l1_drop_cur) begin - fifo_select_SN = 1'b0; // L1 - hum_buf_drop_req_SN = 1'b1; - hum_buf_drop_len_SN = l1_len_cur; - hum_buf_SN = FLUSH; - end - end - end - - WAIT_L2_BYPASS_YES : begin - // Wait for orders from L2 TLB. - if (l2_fifo_valid_out) begin - - // L2 hit - forward data from buffer - if (l2_accept_cur) begin - m_axi4_wlast = hum_buf_wlast; - m_axi4_wdata = hum_buf_wdata; - m_axi4_wstrb = hum_buf_wstrb; - m_axi4_wuser = hum_buf_wuser; - - m_axi4_wvalid = hum_buf_valid_out; - hum_buf_ready_in = m_axi4_wready; - - master_select_o = l2_master_cur; - - // Detect last data beat. - if (wlast_out) begin - fifo_select = 1'b1; - w_done = 1'b1; - hum_buf_SN = STORE; - end - - // L2 miss/prefetch hit - end else if (l2_drop_cur) begin - fifo_select_SN = 1'b1; // L2 - hum_buf_drop_req_SN = 1'b1; - hum_buf_drop_len_SN = l2_len_cur; - hum_buf_SN = FLUSH; - end - - // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions. - end else if (l1_fifo_valid_out) begin - - // L1 hit - if (l1_accept_cur) begin - hum_buf_SN = BYPASS; - - // L1 prefetch/prot/multi - end else if (l1_drop_cur) begin - hum_buf_SN = DISCARD; - end - end - end - - FLUSH : begin - // Clear HUM buffer flush request. - hum_buf_drop_req_SN = 1'b0; - - // perform handshake with B sender - fifo_select = fifo_select_SP; - b_drop_o = 1'b1; - if (b_done_i) begin - hum_buf_SN = STORE; - end - end - - BYPASS : begin - // Forward one full transaction from input buffer. 
- m_axi4_wlast = axi4_wlast; - m_axi4_wdata = axi4_wdata; - m_axi4_wstrb = axi4_wstrb; - m_axi4_wuser = axi4_wuser; - - m_axi4_wvalid = axi4_wvalid; - axi4_wready = m_axi4_wready; - - master_select_o = l1_master_cur; - - // We have got a full transaction. - if (axi4_wlast & axi4_wready & axi4_wvalid) begin - fifo_select = 1'b0; - w_done = 1'b1; - hum_buf_SN = WAIT_L2_BYPASS_YES; - end - end - - DISCARD : begin - // Discard one full transaction from input buffer. - axi4_wready = 1'b1; - - // We have got a full transaction. - if (axi4_wlast & axi4_wready & axi4_wvalid) begin - // Try to perform handshake with B sender. - fifo_select = 1'b0; - b_drop_o = 1'b1; - // We cannot wait here due to axi4_wready. - if (b_done_i) begin - hum_buf_SN = WAIT_L2_BYPASS_YES; - end else begin - hum_buf_SN = DISCARD_FINISH; - end - end - end - - DISCARD_FINISH : begin - // Perform handshake with B sender. - fifo_select = 1'b0; - b_drop_o = 1'b1; - if (b_done_i) begin - hum_buf_SN = WAIT_L2_BYPASS_YES; - end - end - - WAIT_L1_BYPASS_NO : begin - // Do not allow the forwarding of L1 hits. - block_forwarding = 1'b1; - - // Wait for orders from L1 TLB. - if (l1_fifo_valid_out) begin - - // L1 hit - forward data from/through HUM buffer and refill the buffer - if (l1_accept_cur) begin - // Forward data from HUM buffer. - m_axi4_wlast = hum_buf_wlast; - m_axi4_wdata = hum_buf_wdata; - m_axi4_wstrb = hum_buf_wstrb; - m_axi4_wuser = hum_buf_wuser; - - m_axi4_wvalid = hum_buf_valid_out; - hum_buf_ready_in = m_axi4_wready; - - master_select_o = l1_master_cur; - - // Refill the HUM buffer. Stop when buffer full. - stop_store = ~hum_buf_ready_out; - hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ; - axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out; - - // Detect last data beat. 
- if (wlast_out) begin - fifo_select = 1'b0; - w_done = 1'b1; - if (~hum_buf_ready_out | hum_buf_almost_full) begin - hum_buf_SN = WAIT_L1_BYPASS_NO; - end else begin - hum_buf_SN = STORE; - end - end - - // Allow the forwarding of L1 hits. - block_forwarding = 1'b0; - - // L1 miss - wait for L2 - end else if (l1_save_cur) begin - fifo_select = 1'b0; - w_done = 1'b1; - hum_buf_SN = WAIT_L2_BYPASS_NO; - - // L1 prefetch, prot, multi - drop data - end else if (l1_drop_cur) begin - fifo_select_SN = 1'b0; // L1 - hum_buf_drop_req_SN = 1'b1; - hum_buf_drop_len_SN = l1_len_cur; - hum_buf_SN = FLUSH; - - // Allow the forwarding of L1 hits. - block_forwarding = 1'b0; - end - end - end - - WAIT_L2_BYPASS_NO : begin - // Do not allow the forwarding of L1 hits. - block_forwarding = 1'b1; - - // Wait for orders from L2 TLB. - if (l2_fifo_valid_out) begin - - // L2 hit - forward first part from HUM buffer, rest from input buffer - if (l2_accept_cur) begin - // Forward data from HUM buffer. - m_axi4_wlast = hum_buf_wlast; - m_axi4_wdata = hum_buf_wdata; - m_axi4_wstrb = hum_buf_wstrb; - m_axi4_wuser = hum_buf_wuser; - - m_axi4_wvalid = hum_buf_valid_out; - hum_buf_ready_in = m_axi4_wready; - - master_select_o = l2_master_cur; - - // Refill the HUM buffer. Stop when buffer full. - stop_store = ~hum_buf_ready_out; - hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ; - axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out; - - // Detect last data beat. - if (wlast_out) begin - fifo_select = 1'b1; - w_done = 1'b1; - if (~hum_buf_ready_out | hum_buf_almost_full) begin - hum_buf_SN = WAIT_L1_BYPASS_NO; - end else begin - hum_buf_SN = STORE; - end - end - - // Allow the forwarding of L1 hits. - block_forwarding = 1'b0; - - // L2 miss/prefetch hit - drop data - end else if (l2_drop_cur) begin - fifo_select_SN = 1'b1; // L2 - hum_buf_drop_req_SN = 1'b1; - hum_buf_drop_len_SN = l2_len_cur; - hum_buf_SN = FLUSH; - - // Allow the forwarding of L1 hits. 
- block_forwarding = 1'b0; - end - end - end - - - default: begin - hum_buf_SN = STORE; - end - - endcase // hum_buf_SP - end // HUM_BUFFER_FSM - - assign b_drop_set = 1'b0; - - end else begin // HUM_BUFFER - - // register to perform the handshake with B sender - always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin - if (axi4_arstn == 0) begin - b_drop_o <= 1'b0; - end else if (b_done_i) begin - b_drop_o <= 1'b0; - end else if (b_drop_set) begin - b_drop_o <= 1'b1;; - end - end - - always_comb begin : OUTPUT_CTRL - - fifo_select = 1'b0; - w_done = 1'b0; - b_drop_set = 1'b0; - - m_axi4_wlast = 1'b0; - m_axi4_wdata = 'b0; - m_axi4_wstrb = 'b0; - m_axi4_wuser = 'b0; - - m_axi4_wvalid = 1'b0; - axi4_wready = 1'b0; - - if (l1_fifo_valid_out) begin - // forward data - if (l1_accept_cur) begin - m_axi4_wlast = axi4_wlast; - m_axi4_wdata = axi4_wdata; - m_axi4_wstrb = axi4_wstrb; - m_axi4_wuser = axi4_wuser; - - m_axi4_wvalid = axi4_wvalid; - axi4_wready = m_axi4_wready; - - // Simply pop from FIFO upon last data beat. - w_done = axi4_wlast & axi4_wvalid & axi4_wready; - - // discard entire burst - end else if (b_drop_o == 1'b0) begin - axi4_wready = 1'b1; - - // Simply pop from FIFO upon last data beat. Perform handshake with B sender. 
- if (axi4_wlast & axi4_wvalid & axi4_wready) - b_drop_set = 1'b1; - end - end - - end // OUTPUT_CTRL - - assign master_select_o = l1_master_cur; - assign l2_fifo_ready_out = 1'b1; - assign block_forwarding = 1'b0; - - // unused signals - assign hum_buf_ready_out = 1'b0; - assign hum_buf_valid_in = 1'b0; - assign hum_buf_ready_in = 1'b0; - assign hum_buf_valid_out = 1'b0; - assign hum_buf_wdata = 'b0; - assign hum_buf_wstrb = 'b0; - assign hum_buf_wlast = 1'b0; - assign hum_buf_wuser = 'b0; - assign hum_buf_drop_len_SN = 'b0; - assign hum_buf_drop_req_SN = 1'b0; - assign hum_buf_almost_full = 1'b0; - - assign l2_fifo_valid_in = 1'b0; - assign l2_fifo_valid_out = 1'b0; - assign l2_prefetch_cur = 1'b0; - assign l2_hit_cur = 1'b0; - assign l2_id_cur = 'b0; - assign l2_len_cur = 'b0; - assign l2_master_cur = 1'b0; - assign l2_accept_cur = 1'b0; - assign l2_drop_cur = 1'b0; - - assign l2_req = 1'b0; - - assign fifo_select_SN = 1'b0; - assign fifo_select_SP = 1'b0; - - assign stop_store = 1'b0; - assign n_wlast_SP = 'b0; - assign wlast_in = 1'b0; - assign wlast_out = 1'b0; - - end // HUM_BUFFER - - endgenerate -""" diff --git a/src/iommu/axi_rab/axi4_w_sender.py b/src/iommu/axi_rab/axi4_w_sender.py deleted file mode 100644 index 9916334f..00000000 --- a/src/iommu/axi_rab/axi4_w_sender.py +++ /dev/null @@ -1,78 +0,0 @@ -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable - - -class axi4_w_sender(Elaboratable): - - def __init__(self): - self.axi4_aclk = Signal() # input - self.axi4_arstn = Signal() # input - self.s_axi4_wdata = Signal() # input - self.s_axi4_wvalid = Signal() # input - self.s_axi4_wready = Signal() # output - self.s_axi4_wstrb = Signal() # input - self.s_axi4_wlast = Signal() # input - self.s_axi4_wuser = Signal() # input - self.m_axi4_wdata = Signal() # output - self.m_axi4_wvalid = Signal() # output - self.m_axi4_wready = Signal() # input - self.m_axi4_wstrb = Signal() # output - self.m_axi4_wlast = 
Signal() # output - self.m_axi4_wuser = Signal() # output - - def elaborate(self, platform=None): - m = Module() - m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata) - m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb) - m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast) - m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser) - m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid) - m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready) - return m - -# // Copyright 2018 ETH Zurich and University of Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. 
-# -# module axi4_w_sender -# #( -# parameter AXI_DATA_WIDTH = 32, -# parameter AXI_USER_WIDTH = 2 -# ) -# ( -# input axi4_aclk, -# input axi4_arstn, -# -# input [AXI_DATA_WIDTH-1:0] s_axi4_wdata, -# input s_axi4_wvalid, -# output s_axi4_wready, -# input [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb, -# input s_axi4_wlast, -# input [AXI_USER_WIDTH-1:0] s_axi4_wuser, -# -# output [AXI_DATA_WIDTH-1:0] m_axi4_wdata, -# output m_axi4_wvalid, -# input m_axi4_wready, -# output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb, -# output m_axi4_wlast, -# output [AXI_USER_WIDTH-1:0] m_axi4_wuser -# ); -# -# assign m_axi4_wdata = s_axi4_wdata; -# assign m_axi4_wstrb = s_axi4_wstrb; -# assign m_axi4_wlast = s_axi4_wlast; -# assign m_axi4_wuser = s_axi4_wuser; -# -# assign m_axi4_wvalid = s_axi4_wvalid; -# assign s_axi4_wready = m_axi4_wready; -# -# endmodule -# -# diff --git a/src/iommu/axi_rab/axi_buffer_rab.py b/src/iommu/axi_rab/axi_buffer_rab.py deleted file mode 100644 index b4d99299..00000000 --- a/src/iommu/axi_rab/axi_buffer_rab.py +++ /dev/null @@ -1,151 +0,0 @@ -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable - - -class axi_buffer_rab(Elaboratable): - - def __init__(self): - self.clk = Signal() # input - self.rstn = Signal() # input - self.data_out = Signal(DATA_WIDTH) # output - self.valid_out = Signal() # output - self.ready_in = Signal() # input - self.valid_in = Signal() # input - self.data_in = Signal(DATA_WIDTH) # input - self.ready_out = Signal() # output - - def elaborate(self, platform=None): - m = Module() - m.d.comb += self.full.eq(self.None) - m.d.comb += self.data_out.eq(self.None) - m.d.comb += self.valid_out.eq(self.None) - m.d.comb += self.ready_out.eq(self.None) - return m - -# // Copyright 2018 ETH Zurich and University of Bologna. 
-# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# //import CfMath::log2; -# -# module axi_buffer_rab -# //#( -# // parameter DATA_WIDTH, -# // parameter BUFFER_DEPTH -# //) -# ( -# input logic clk, -# input logic rstn, -# -# // Downstream port -# output logic [DATA_WIDTH-1:0] data_out, -# output logic valid_out, -# input logic ready_in, -# -# // Upstream port -# input logic valid_in, -# input logic [DATA_WIDTH-1:0] data_in, -# output logic ready_out -# ); -# -# localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH); -# -# // Internal data structures -# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in; // location to which we last wrote -# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out; // location from which we last sent -# reg [LOG_BUFFER_DEPTH : 0] elements; // number of elements in the buffer -# reg [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0]; -# -# wire full; -# -# integer loop1; -# -# assign full = (elements == BUFFER_DEPTH); -# -# always @(posedge clk or negedge rstn) -# begin: elements_sequential -# if (rstn == 1'b0) -# elements <= 0; -# else -# begin -# // ------------------ -# // Are we filling up? 
-# // ------------------ -# // One out, none in -# if (ready_in && valid_out && (!valid_in || full)) -# elements <= elements - 1; -# // None out, one in -# else if ((!valid_out || !ready_in) && valid_in && !full) -# elements <= elements + 1; -# // Else, either one out and one in, or none out and none in - stays unchanged -# end -# end -# -# always @(posedge clk or negedge rstn) -# begin: buffers_sequential -# if (rstn == 1'b0) -# begin -# for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1) -# buffer[loop1] <= 0; -# end -# else -# begin -# // Update the memory -# if (valid_in && !full) -# buffer[pointer_in] <= data_in; -# end -# end -# -# always @(posedge clk or negedge rstn) -# begin: sequential -# if (rstn == 1'b0) -# begin -# pointer_out <= 0; -# pointer_in <= 0; -# end -# else -# begin -# // ------------------------------------ -# // Check what to do with the input side -# // ------------------------------------ -# // We have some input, increase by 1 the input pointer -# if (valid_in && !full) -# begin -# if (pointer_in == $unsigned(BUFFER_DEPTH - 1)) -# pointer_in <= 0; -# else -# pointer_in <= pointer_in + 1; -# end -# // Else we don't have any input, the input pointer stays the same -# -# // ------------------------------------- -# // Check what to do with the output side -# // ------------------------------------- -# // We had pushed one flit out, we can try to go for the next one -# if (ready_in && valid_out) -# begin -# if (pointer_out == $unsigned(BUFFER_DEPTH - 1)) -# pointer_out <= 0; -# else -# pointer_out <= pointer_out + 1; -# end -# // Else stay on the same output location -# end -# end -# -# // Update output ports -# assign data_out = buffer[pointer_out]; -# assign valid_out = (elements != 0); -# -# assign ready_out = ~full; -# -# endmodule -# -# diff --git a/src/iommu/axi_rab/axi_buffer_rab_bram.py b/src/iommu/axi_rab/axi_buffer_rab_bram.py deleted file mode 100644 index 349b314e..00000000 --- a/src/iommu/axi_rab/axi_buffer_rab_bram.py +++ 
/dev/null @@ -1,209 +0,0 @@ -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable - - -class axi_buffer_rab_bram(Elaboratable): - - def __init__(self): - self.clk = Signal() # input - self.rstn = Signal() # input - self.data_out = Signal(DATA_WIDTH) # output - self.valid_out = Signal() # output - self.ready_in = Signal() # input - self.valid_in = Signal() # input - self.data_in = Signal(DATA_WIDTH) # input - self.ready_out = Signal() # output - self.almost_full = Signal() # output - self.underfull = Signal() # output - self.drop_req = Signal() # input - self.drop_len = Signal(8) # input - - def elaborate(self, platform=None): - m = Module() - return m - - -# // Copyright 2018 ETH Zurich and University of Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# ////import CfMath::log2; -# -# module axi_buffer_rab_bram -# //#( -# // parameter DATA_WIDTH, -# // parameter BUFFER_DEPTH -# // ) -# ( -# input logic clk, -# input logic rstn, -# -# // Downstream port -# output logic [DATA_WIDTH-1:0] data_out, -# output logic valid_out, -# input logic ready_in, -# -# // Upstream port -# input logic valid_in, -# input logic [DATA_WIDTH-1:0] data_in, -# output logic ready_out, -# -# // Status and drop control -# output logic almost_full, -# output logic underfull, -# input logic drop_req, -# // Number of items to drop. 
As for AXI lengths, counting starts at zero, i.e., `drop_len == 0` -# // and `drop_req` means drop one item. -# input logic [7:0] drop_len -# ); -# -""" #docstring_begin - // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior. - // To still push and pop simultaneously if the buffer is full, we internally increase the - // buffer depth by 1. - localparam ACT_BUFFER_DEPTH = BUFFER_DEPTH+1; - localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1); - - /** - * Internal data structures - */ - // Location to which we last wrote - logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d, ptr_in_q; - // Location from which we last sent - logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d, ptr_out_q; - // Required for fall-through behavior on the first word - logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram; - // Number of elements in the buffer. Can be negative if elements that have been dropped have not - // yet been written. - logic signed [ACT_LOG_BUFFER_DEPTH:0] n_elems_d, n_elems_q; - - logic [DATA_WIDTH-1:0] data_out_bram, data_out_q; - logic valid_out_q; - - logic full; - - assign almost_full = (n_elems_q == BUFFER_DEPTH-1); - assign full = (n_elems_q == BUFFER_DEPTH); - - always_ff @(posedge clk, negedge rstn) begin - if (~rstn) begin - n_elems_q <= '0; - ptr_in_q <= '0; - ptr_out_q <= '0; - end else begin - n_elems_q <= n_elems_d; - ptr_in_q <= ptr_in_d; - ptr_out_q <= ptr_out_d; - end - end - - // Update the number of elements. - always_comb begin - n_elems_d = n_elems_q; - if (drop_req) begin - n_elems_d -= (drop_len + 1); - end - if (valid_in && ready_out) begin - n_elems_d += 1; - end - if (valid_out && ready_in) begin - n_elems_d -= 1; - end - end - - // Update the output pointer. 
- always_comb begin - ptr_out_d = ptr_out_q; - if (drop_req) begin - if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin - ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q); - end else begin - ptr_out_d += (drop_len + 1); - end - end - if (valid_out && ready_in) begin - if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin - ptr_out_d = '0; - end else begin - ptr_out_d += 1; - end - end - end - - // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for - // first-word fall-through FIFO behavior. - //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1); - assign ptr_out_bram = ptr_out_d; - - // Update the input pointer. - always_comb begin - ptr_in_d = ptr_in_q; - if (valid_in && ready_out) begin - if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin - ptr_in_d = '0; - end else begin - ptr_in_d += 1; - end - end - end - - // Update output ports. - assign valid_out = (n_elems_q > $signed(0)); - assign underfull = (n_elems_q < $signed(0)); - assign ready_out = ~full; - - ram_tp_write_first #( - .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ), - .DATA_WIDTH ( DATA_WIDTH ) - ) - ram_tp_write_first_0 - ( - .clk ( clk ), - .we ( valid_in & ~full ), - .addr0 ( ptr_in_q ), - .addr1 ( ptr_out_bram ), - .d_i ( data_in ), - .d0_o ( ), - .d1_o ( data_out_bram ) - ); - - // When reading from/writing two the same address on both ports ("Write-Read Collision"), - // the data on the read port is invalid (during the write cycle). In this implementation, - // this can happen only when the buffer is empty. Thus, we forward the data from an - // register in this case. 
- always @(posedge clk) begin - if (rstn == 1'b0) begin - data_out_q <= 'b0; - end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin - data_out_q <= data_in; - end - end - - always @(posedge clk) begin - if (rstn == 1'b0) begin - valid_out_q <= 'b0; - end else begin - valid_out_q <= valid_out; - end - end - - // Drive output data - always_comb begin - if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO - data_out = data_out_q; - end else begin - data_out = data_out_bram; - end - end - -""" -# endmodule -# -# diff --git a/src/iommu/axi_rab/axi_rab_cfg.py b/src/iommu/axi_rab/axi_rab_cfg.py deleted file mode 100644 index 43843b95..00000000 --- a/src/iommu/axi_rab/axi_rab_cfg.py +++ /dev/null @@ -1,707 +0,0 @@ -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable - - -class axi_rab_cfg(Elaboratable): - - def __init__(self): - self.Clk_CI = Signal() # input - self.Rst_RBI = Signal() # input - self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH) # input - self.s_axi_awvalid = Signal() # input - self.s_axi_awready = Signal() # output - self.s_axi_wdata = Signal() # input - self.s_axi_wstrb = Signal(1+ERROR p_expression_25) # input - self.s_axi_wvalid = Signal() # input - self.s_axi_wready = Signal() # output - self.s_axi_bresp = Signal(2) # output - self.s_axi_bvalid = Signal() # output - self.s_axi_bready = Signal() # input - self.s_axi_araddr = Signal(AXI_ADDR_WIDTH) # input - self.s_axi_arvalid = Signal() # input - self.s_axi_arready = Signal() # output - self.s_axi_rdata = Signal(AXI_DATA_WIDTH) # output - self.s_axi_rresp = Signal(2) # output - self.s_axi_rvalid = Signal() # output - self.s_axi_rready = Signal() # input - self.L1Cfg_DO = Signal() # output - self.L1AllowMultiHit_SO = Signal() # output - self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT) # input - self.MissMeta_DI = Signal(MISS_META_WIDTH) # input - self.Miss_SI = Signal() # input - self.MhFifoFull_SO = Signal() # output - 
self.wdata_l2 = Signal() # output - self.waddr_l2 = Signal() # output - self.wren_l2 = Signal(N_PORTS) # output - - def elaborate(self, platform=None): - m = Module() - return m - - -# // Copyright 2018 ETH Zurich and University of Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# // --=========================================================================-- -# // -# // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ██████╗███████╗ ██████╗ -# // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔════╝██╔════╝ -# // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ █████╗ ██║ ███╗ -# // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██╔══╝ ██║ ██║ -# // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ╚██████╗██║ ╚██████╔╝ -# // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝╚═╝ ╚═════╝ -# // -# // -# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch -# // -# // Purpose : AXI4-Lite configuration and miss handling interface for RAB -# // -# // --=========================================================================-- -# -# //import CfMath::log2; -# -# module axi_rab_cfg -# #( -# parameter N_PORTS = 3, -# parameter N_REGS = 196, -# parameter N_L2_SETS = 32, -# parameter N_L2_SET_ENTRIES= 32, -# parameter ADDR_WIDTH_PHYS = 40, -# parameter ADDR_WIDTH_VIRT = 32, -# parameter N_FLAGS = 4, -# parameter AXI_DATA_WIDTH = 64, -# parameter AXI_ADDR_WIDTH = 32, -# parameter MISS_META_WIDTH = 
10, // <= FIFO_WIDTH -# parameter MH_FIFO_DEPTH = 16 -# ) -# ( -# input logic Clk_CI, -# input logic Rst_RBI, -# -# // AXI Lite interface -# input logic [AXI_ADDR_WIDTH-1:0] s_axi_awaddr, -# input logic s_axi_awvalid, -# output logic s_axi_awready, -# input logic [AXI_DATA_WIDTH/8-1:0][7:0] s_axi_wdata, -# input logic [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb, -# input logic s_axi_wvalid, -# output logic s_axi_wready, -# output logic [1:0] s_axi_bresp, -# output logic s_axi_bvalid, -# input logic s_axi_bready, -# input logic [AXI_ADDR_WIDTH-1:0] s_axi_araddr, -# input logic s_axi_arvalid, -# output logic s_axi_arready, -# output logic [AXI_DATA_WIDTH-1:0] s_axi_rdata, -# output logic [1:0] s_axi_rresp, -# output logic s_axi_rvalid, -# input logic s_axi_rready, -# -# // Slice configuration -# output logic [N_REGS-1:0][63:0] L1Cfg_DO, -# output logic L1AllowMultiHit_SO, -# -# // Miss handling -# input logic [ADDR_WIDTH_VIRT-1:0] MissAddr_DI, -# input logic [MISS_META_WIDTH-1:0] MissMeta_DI, -# input logic Miss_SI, -# output logic MhFifoFull_SO, -# -# // L2 TLB -# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2, -# output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2, -# output logic [N_PORTS-1:0] wren_l2 -# ); -# -""" #docstring_begin - - localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32, - // because RAB slices are 64 bit wide. 
- localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1; - - localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2 - - localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES; - - localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2; - - logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit] - genvar j; - - // █████╗ ██╗ ██╗██╗██╗ ██╗ ██╗ ██╗████████╗███████╗ - // ██╔══██╗╚██╗██╔╝██║██║ ██║ ██║ ██║╚══██╔══╝██╔════╝ - // ███████║ ╚███╔╝ ██║███████║█████╗██║ ██║ ██║ █████╗ - // ██╔══██║ ██╔██╗ ██║╚════██║╚════╝██║ ██║ ██║ ██╔══╝ - // ██║ ██║██╔╝ ██╗██║ ██║ ███████╗██║ ██║ ███████╗ - // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ ╚══════╝ - // - logic [AXI_ADDR_WIDTH-1:0] awaddr_reg; - logic awaddr_done_rise; - logic awaddr_done_reg; - logic awaddr_done_reg_dly; - - logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg; - logic [AXI_DATA_WIDTH/8-1:0] wstrb_reg; - logic wdata_done_rise; - logic wdata_done_reg; - logic wdata_done_reg_dly; - - logic wresp_done_reg; - logic wresp_running_reg; - - logic [AXI_ADDR_WIDTH-1:0] araddr_reg; - logic araddr_done_reg; - - logic [AXI_DATA_WIDTH-1:0] rdata_reg; - logic rresp_done_reg; - logic rresp_running_reg; - - logic awready; - logic wready; - logic bvalid; - - logic arready; - logic rvalid; - - logic wren; - logic wren_l1; - - assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg ); - assign wdata_done_rise = wdata_done_reg & ~wdata_done_reg_dly; - assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly; - - // reg_dly - always @(posedge Clk_CI or negedge Rst_RBI) - begin - if (!Rst_RBI) - begin - wdata_done_reg_dly <= 1'b0; - awaddr_done_reg_dly <= 1'b0; - end - else - begin - wdata_done_reg_dly <= wdata_done_reg; - awaddr_done_reg_dly <= awaddr_done_reg; - end - end - - // AW Channel - always @(posedge Clk_CI or negedge Rst_RBI) - begin - if (!Rst_RBI) - begin - awaddr_done_reg <= 1'b0; - awaddr_reg <= '0; - awready <= 1'b1; - end - else - 
begin - if (awready && s_axi_awvalid) - begin - awready <= 1'b0; - awaddr_done_reg <= 1'b1; - awaddr_reg <= s_axi_awaddr; - end - else if (awaddr_done_reg && wresp_done_reg) - begin - awready <= 1'b1; - awaddr_done_reg <= 1'b0; - end - end - end - - // W Channel - always @(posedge Clk_CI or negedge Rst_RBI) - begin - if (!Rst_RBI) - begin - wdata_done_reg <= 1'b0; - wready <= 1'b1; - wdata_reg <= '0; - wstrb_reg <= '0; - end - else - begin - if (wready && s_axi_wvalid) - begin - wready <= 1'b0; - wdata_done_reg <= 1'b1; - wdata_reg <= s_axi_wdata; - wstrb_reg <= s_axi_wstrb; - end - else if (wdata_done_reg && wresp_done_reg) - begin - wready <= 1'b1; - wdata_done_reg <= 1'b0; - end - end - end - - // B Channel - always @(posedge Clk_CI or negedge Rst_RBI) - begin - if (!Rst_RBI) - begin - bvalid <= 1'b0; - wresp_done_reg <= 1'b0; - wresp_running_reg <= 1'b0; - end - else - begin - if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg) - begin - if (!wresp_running_reg) - begin - bvalid <= 1'b1; - wresp_running_reg <= 1'b1; - end - else if (s_axi_bready) - begin - bvalid <= 1'b0; - wresp_done_reg <= 1'b1; - wresp_running_reg <= 1'b0; - end - end - else - begin - bvalid <= 1'b0; - wresp_done_reg <= 1'b0; - wresp_running_reg <= 1'b0; - end - end - end - - // AR Channel - always @(posedge Clk_CI or negedge Rst_RBI) - begin - if (!Rst_RBI) - begin - araddr_done_reg <= 1'b0; - arready <= 1'b1; - araddr_reg <= '0; - end - else - begin - if (arready && s_axi_arvalid) - begin - arready <= 1'b0; - araddr_done_reg <= 1'b1; - araddr_reg <= s_axi_araddr; - end - else if (araddr_done_reg && rresp_done_reg) - begin - arready <= 1'b1; - araddr_done_reg <= 1'b0; - end - end - end - - // R Channel - always @(posedge Clk_CI or negedge Rst_RBI) - begin - if (!Rst_RBI) - begin - rresp_done_reg <= 1'b0; - rvalid <= 1'b0; - rresp_running_reg <= 1'b0; - end - else - begin - if (araddr_done_reg && !rresp_done_reg) - begin - if (!rresp_running_reg) - begin - rvalid <= 1'b1; - 
rresp_running_reg <= 1'b1; - end - else if (s_axi_rready) - begin - rvalid <= 1'b0; - rresp_done_reg <= 1'b1; - rresp_running_reg <= 1'b0; - end - end - else - begin - rvalid <= 1'b0; - rresp_done_reg <= 1'b0; - rresp_running_reg <= 1'b0; - end - end - end - - // ██╗ ██╗ ██████╗███████╗ ██████╗ ██████╗ ███████╗ ██████╗ - // ██║ ███║ ██╔════╝██╔════╝██╔════╝ ██╔══██╗██╔════╝██╔════╝ - // ██║ ╚██║ ██║ █████╗ ██║ ███╗ ██████╔╝█████╗ ██║ ███╗ - // ██║ ██║ ██║ ██╔══╝ ██║ ██║ ██╔══██╗██╔══╝ ██║ ██║ - // ███████╗██║ ╚██████╗██║ ╚██████╔╝ ██║ ██║███████╗╚██████╔╝ - // ╚══════╝╚═╝ ╚═════╝╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ ╚═════╝ - // - assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE); - - always @( posedge Clk_CI or negedge Rst_RBI ) - begin - var integer idx_reg, idx_byte; - if ( Rst_RBI == 1'b0 ) - begin - for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ ) - L1Cfg_DP[idx_reg] <= '0; - end - else if ( wren_l1 ) - begin - if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin // VIRT_ADDR - for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin - if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin - if ( wstrb_reg[idx_byte] ) begin - L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte]; - end - end - else begin // Let synthesizer optimize away unused registers. - L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0; - end - end - end - else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin // PHYS_ADDR - for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin - if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin - if ( wstrb_reg[idx_byte] ) begin - L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte]; - end - end - else begin // Let synthesizer optimize away unused registers. 
- L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0; - end - end - end - else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 ) // FLAGS - for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin - if ( (idx_byte < 1) ) begin - if ( wstrb_reg[idx_byte] ) begin - L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} }; - end - end - else begin // Let synthesizer optimize away unused registers. - L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0; - end - end - end - end - end // always @ ( posedge Clk_CI or negedge Rst_RBI ) - - generate - // Mask unused bits -> Synthesizer should optimize away unused registers - for( j=0; j= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR); - assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000); - assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000); - end else begin - assign l2_addr_is_in_va_rams[j] = 1'b0; - assign upper_word_is_written[j] = 1'b0; - assign lower_word_is_written[j] = 1'b0; - end - - always @( posedge Clk_CI or negedge Rst_RBI ) begin - var integer idx_byte, off_byte; - if ( Rst_RBI == 1'b0 ) - begin - wren_l2[j] <= 1'b0; - wdata_l2[j] <= '0; - end - else if (wren) - begin - if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) ) - wren_l2[j] <= 1'b1; - if (AXI_DATA_WIDTH == 32) begin - for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) - wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}}; - end - else if (AXI_DATA_WIDTH == 64) begin - if (lower_word_is_written[j] == 1'b1) - off_byte = 0; - else - off_byte = 4; - // always put the payload in the lower word and set upper word to 0 - for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ ) - wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}}; - 
wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0; - end - // pragma translate_off - else - $fatal(1, "Unsupported AXI_DATA_WIDTH!"); - // pragma translate_on - end - else - wren_l2[j] <= '0; - end // always @ ( posedge Clk_CI or negedge Rst_RBI ) - - // Properly align the 32-bit word address when writing from 64-bit interface: - // Depending on the system, the incoming address is (non-)aligned to the 64-bit - // word when writing the upper 32-bit word. - always_comb begin - waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4; - if (wren_l2[j]) begin - if (AXI_DATA_WIDTH == 64) begin - if (upper_word_is_written[j] == 1'b1) begin - // address must be non-aligned - waddr_l2[j][0] = 1'b1; - end - end - // pragma translate_off - else if (AXI_DATA_WIDTH != 32) begin - $fatal(1, "Unsupported AXI_DATA_WIDTH!"); - end - // pragma translate_on - end - end - - // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data - // systems. - // pragma translate_off - always_ff @ (posedge Clk_CI) begin - if (AXI_DATA_WIDTH == 64) begin - if (l2_addr_is_in_va_rams[j]) begin - if (upper_word_is_written[j]) begin - assert (!lower_word_is_written[j]) - else $error("Unsupported write across two 32-bit words to VA RAMs!"); - end - else if (lower_word_is_written[j]) begin - assert (!upper_word_is_written[j]) - else $error("Unsupported write across two 32-bit words to VA RAMs!"); - end - end - end - end - // pragma translate_on - - end // for (j=0; j< N_PORTS; j++) - endgenerate - - // ███╗ ███╗██╗ ██╗ ███████╗██╗███████╗ ██████╗ ███████╗ - // ████╗ ████║██║ ██║ ██╔════╝██║██╔════╝██╔═══██╗██╔════╝ - // ██╔████╔██║███████║ █████╗ ██║█████╗ ██║ ██║███████╗ - // ██║╚██╔╝██║██╔══██║ ██╔══╝ ██║██╔══╝ ██║ ██║╚════██║ - // ██║ ╚═╝ ██║██║ ██║ ██║ ██║██║ ╚██████╔╝███████║ - // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝╚═╝ ╚═════╝ ╚══════╝ - // - logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D; - logic AddrFifoWen_S; - logic AddrFifoRen_S; - logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D; - 
logic AddrFifoFull_S; - logic AddrFifoEmpty_S; - logic AddrFifoEmpty_SB; - logic AddrFifoFull_SB; - - logic [MISS_META_WIDTH-1:0] MetaFifoDin_D; - logic MetaFifoWen_S; - logic MetaFifoRen_S; - logic [MISS_META_WIDTH-1:0] MetaFifoDout_D; - logic MetaFifoFull_S; - logic MetaFifoEmpty_S; - logic MetaFifoEmpty_SB; - logic MetaFifoFull_SB; - - logic FifosDisabled_S; - logic ConfRegWen_S; - logic [1:0] ConfReg_DN; - logic [1:0] ConfReg_DP; - - logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec; - - assign FifosDisabled_S = ConfReg_DP[0]; - assign L1AllowMultiHit_SO = ConfReg_DP[1]; - - assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB; - assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB; - - assign AddrFifoFull_S = ~AddrFifoFull_SB; - assign MetaFifoFull_S = ~MetaFifoFull_SB; - - assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S); - - generate - for ( j=0; j -# * Conrad Burchert -# * Maheshwara Sharma -# * Andreas Kurth -# * Johannes Weinbuch -# * Pirmin Vogel -# */ -# -# //`include "pulp_soc_defines.sv" -# -# ////import CfMath::log2; -# -# module axi_rab_top -# -# // Parameters {{{ -# #( -# parameter N_PORTS = 2, -# parameter N_L2_SETS = 32, -# parameter N_L2_SET_ENTRIES = 32, -# parameter AXI_DATA_WIDTH = 64, -# parameter AXI_S_ADDR_WIDTH = 32, -# parameter AXI_M_ADDR_WIDTH = 40, -# parameter AXI_LITE_DATA_WIDTH = 64, -# parameter AXI_LITE_ADDR_WIDTH = 32, -# parameter AXI_ID_WIDTH = 10, -# parameter AXI_USER_WIDTH = 6, -# parameter MH_FIFO_DEPTH = 16 -# ) -# // }}} -# -# // Ports {{{ -# ( -# -# input logic Clk_CI, // This clock may be gated. -# input logic NonGatedClk_CI, -# input logic Rst_RBI, -# -# // For every slave port there are two master ports. 
The master -# // port to use can be set using the master_select flag of the protection -# // bits of a slice -# -# // AXI4 Slave {{{ -# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_awid, -# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr, -# input logic [N_PORTS-1:0] s_axi4_awvalid, -# output logic [N_PORTS-1:0] s_axi4_awready, -# input logic [N_PORTS-1:0] [7:0] s_axi4_awlen, -# input logic [N_PORTS-1:0] [2:0] s_axi4_awsize, -# input logic [N_PORTS-1:0] [1:0] s_axi4_awburst, -# input logic [N_PORTS-1:0] s_axi4_awlock, -# input logic [N_PORTS-1:0] [2:0] s_axi4_awprot, -# input logic [N_PORTS-1:0] [3:0] s_axi4_awcache, -# input logic [N_PORTS-1:0] [3:0] s_axi4_awregion, -# input logic [N_PORTS-1:0] [3:0] s_axi4_awqos, -# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_awuser, -# -# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_wdata, -# input logic [N_PORTS-1:0] s_axi4_wvalid, -# output logic [N_PORTS-1:0] s_axi4_wready, -# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb, -# input logic [N_PORTS-1:0] s_axi4_wlast, -# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_wuser, -# -# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_bid, -# output logic [N_PORTS-1:0] [1:0] s_axi4_bresp, -# output logic [N_PORTS-1:0] s_axi4_bvalid, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_buser, -# input logic [N_PORTS-1:0] s_axi4_bready, -# -# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_arid, -# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr, -# input logic [N_PORTS-1:0] s_axi4_arvalid, -# output logic [N_PORTS-1:0] s_axi4_arready, -# input logic [N_PORTS-1:0] [7:0] s_axi4_arlen, -# input logic [N_PORTS-1:0] [2:0] s_axi4_arsize, -# input logic [N_PORTS-1:0] [1:0] s_axi4_arburst, -# input logic [N_PORTS-1:0] s_axi4_arlock, -# input logic [N_PORTS-1:0] [2:0] s_axi4_arprot, -# input logic [N_PORTS-1:0] [3:0] s_axi4_arcache, -# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_aruser, -# -# 
output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_rid, -# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_rdata, -# output logic [N_PORTS-1:0] [1:0] s_axi4_rresp, -# output logic [N_PORTS-1:0] s_axi4_rvalid, -# input logic [N_PORTS-1:0] s_axi4_rready, -# output logic [N_PORTS-1:0] s_axi4_rlast, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_ruser, -# // }}} -# -# // AXI4 Master 0 {{{ -# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_awid, -# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr, -# output logic [N_PORTS-1:0] m0_axi4_awvalid, -# input logic [N_PORTS-1:0] m0_axi4_awready, -# output logic [N_PORTS-1:0] [7:0] m0_axi4_awlen, -# output logic [N_PORTS-1:0] [2:0] m0_axi4_awsize, -# output logic [N_PORTS-1:0] [1:0] m0_axi4_awburst, -# output logic [N_PORTS-1:0] m0_axi4_awlock, -# output logic [N_PORTS-1:0] [2:0] m0_axi4_awprot, -# output logic [N_PORTS-1:0] [3:0] m0_axi4_awcache, -# output logic [N_PORTS-1:0] [3:0] m0_axi4_awregion, -# output logic [N_PORTS-1:0] [3:0] m0_axi4_awqos, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_awuser, -# -# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_wdata, -# output logic [N_PORTS-1:0] m0_axi4_wvalid, -# input logic [N_PORTS-1:0] m0_axi4_wready, -# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb, -# output logic [N_PORTS-1:0] m0_axi4_wlast, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_wuser, -# -# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_bid, -# input logic [N_PORTS-1:0] [1:0] m0_axi4_bresp, -# input logic [N_PORTS-1:0] m0_axi4_bvalid, -# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_buser, -# output logic [N_PORTS-1:0] m0_axi4_bready, -# -# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_arid, -# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr, -# output logic [N_PORTS-1:0] m0_axi4_arvalid, -# input logic [N_PORTS-1:0] m0_axi4_arready, -# output logic [N_PORTS-1:0] [7:0] m0_axi4_arlen, 
-# output logic [N_PORTS-1:0] [2:0] m0_axi4_arsize, -# output logic [N_PORTS-1:0] [1:0] m0_axi4_arburst, -# output logic [N_PORTS-1:0] m0_axi4_arlock, -# output logic [N_PORTS-1:0] [2:0] m0_axi4_arprot, -# output logic [N_PORTS-1:0] [3:0] m0_axi4_arcache, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_aruser, -# -# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_rid, -# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_rdata, -# input logic [N_PORTS-1:0] [1:0] m0_axi4_rresp, -# input logic [N_PORTS-1:0] m0_axi4_rvalid, -# output logic [N_PORTS-1:0] m0_axi4_rready, -# input logic [N_PORTS-1:0] m0_axi4_rlast, -# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_ruser, -# // }}} -# -# // AXI4 Master 1 {{{ -# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_awid, -# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr, -# output logic [N_PORTS-1:0] m1_axi4_awvalid, -# input logic [N_PORTS-1:0] m1_axi4_awready, -# output logic [N_PORTS-1:0] [7:0] m1_axi4_awlen, -# output logic [N_PORTS-1:0] [2:0] m1_axi4_awsize, -# output logic [N_PORTS-1:0] [1:0] m1_axi4_awburst, -# output logic [N_PORTS-1:0] m1_axi4_awlock, -# output logic [N_PORTS-1:0] [2:0] m1_axi4_awprot, -# output logic [N_PORTS-1:0] [3:0] m1_axi4_awcache, -# output logic [N_PORTS-1:0] [3:0] m1_axi4_awregion, -# output logic [N_PORTS-1:0] [3:0] m1_axi4_awqos, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_awuser, -# -# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_wdata, -# output logic [N_PORTS-1:0] m1_axi4_wvalid, -# input logic [N_PORTS-1:0] m1_axi4_wready, -# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb, -# output logic [N_PORTS-1:0] m1_axi4_wlast, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_wuser, -# -# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_bid, -# input logic [N_PORTS-1:0] [1:0] m1_axi4_bresp, -# input logic [N_PORTS-1:0] m1_axi4_bvalid, -# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] 
m1_axi4_buser, -# output logic [N_PORTS-1:0] m1_axi4_bready, -# -# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_arid, -# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr, -# output logic [N_PORTS-1:0] m1_axi4_arvalid, -# input logic [N_PORTS-1:0] m1_axi4_arready, -# output logic [N_PORTS-1:0] [7:0] m1_axi4_arlen, -# output logic [N_PORTS-1:0] [2:0] m1_axi4_arsize, -# output logic [N_PORTS-1:0] [1:0] m1_axi4_arburst, -# output logic [N_PORTS-1:0] m1_axi4_arlock, -# output logic [N_PORTS-1:0] [2:0] m1_axi4_arprot, -# output logic [N_PORTS-1:0] [3:0] m1_axi4_arcache, -# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_aruser, -# -# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_rid, -# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_rdata, -# input logic [N_PORTS-1:0] [1:0] m1_axi4_rresp, -# input logic [N_PORTS-1:0] m1_axi4_rvalid, -# output logic [N_PORTS-1:0] m1_axi4_rready, -# input logic [N_PORTS-1:0] m1_axi4_rlast, -# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_ruser, -# // }}} -# -# // AXI 4 Lite Slave (Configuration Interface) {{{ -# // AXI4-Lite port to setup the rab slices -# // use this to program the configuration registers -# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr, -# input logic s_axi4lite_awvalid, -# output logic s_axi4lite_awready, -# -# input logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata, -# input logic s_axi4lite_wvalid, -# output logic s_axi4lite_wready, -# input logic [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb, -# -# output logic [1:0] s_axi4lite_bresp, -# output logic s_axi4lite_bvalid, -# input logic s_axi4lite_bready, -# -# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr, -# input logic s_axi4lite_arvalid, -# output logic s_axi4lite_arready, -# -# output logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata, -# output logic [1:0] s_axi4lite_rresp, -# output logic s_axi4lite_rvalid, -# input logic s_axi4lite_rready, -# // }}} -# -# // BRAMs {{{ -# //`ifdef 
RAB_AX_LOG_EN -# // BramPort.Slave ArBram_PS, -# // BramPort.Slave AwBram_PS, -# //`endif -# // }}} -# -# // Logger Control {{{ -# //`ifdef RAB_AX_LOG_EN -# // input logic LogEn_SI, -# // input logic ArLogClr_SI, -# // input logic AwLogClr_SI, -# // output logic ArLogRdy_SO, -# // output logic AwLogRdy_SO, -# //`endif -# // }}} -# -# // Interrupt Outputs {{{ -# // Interrupt lines to handle misses, collisions of slices/multiple hits, -# // protection faults and overflow of the miss handling fifo -# //`ifdef RAB_AX_LOG_EN -# // output logic int_ar_log_full, -# // output logic int_aw_log_full, -# //`endif -# output logic [N_PORTS-1:0] int_miss, -# output logic [N_PORTS-1:0] int_multi, -# output logic [N_PORTS-1:0] int_prot, -# output logic int_mhf_full -# // }}} -# -# ); -# -"""#docstring_begin - - // }}} - - // Signals {{{ - // ███████╗██╗ ██████╗ ███╗ ██╗ █████╗ ██╗ ███████╗ - // ██╔════╝██║██╔════╝ ████╗ ██║██╔══██╗██║ ██╔════╝ - // ███████╗██║██║ ███╗██╔██╗ ██║███████║██║ ███████╗ - // ╚════██║██║██║ ██║██║╚██╗██║██╔══██║██║ ╚════██║ - // ███████║██║╚██████╔╝██║ ╚████║██║ ██║███████╗███████║ - // ╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚══════╝ - // - - // Internal AXI4 lines, these connect buffers on the slave side to the rab core and - // multiplexers which switch between the two master outputs - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_awid; - logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_awaddr; - logic [N_PORTS-1:0] int_awvalid; - logic [N_PORTS-1:0] int_awready; - logic [N_PORTS-1:0] [7:0] int_awlen; - logic [N_PORTS-1:0] [2:0] int_awsize; - logic [N_PORTS-1:0] [1:0] int_awburst; - logic [N_PORTS-1:0] int_awlock; - logic [N_PORTS-1:0] [2:0] int_awprot; - logic [N_PORTS-1:0] [3:0] int_awcache; - logic [N_PORTS-1:0] [3:0] int_awregion; - logic [N_PORTS-1:0] [3:0] int_awqos; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_awuser; - - logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_wdata; - logic [N_PORTS-1:0] int_wvalid; - logic [N_PORTS-1:0] int_wready; - 
logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] int_wstrb; - logic [N_PORTS-1:0] int_wlast; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_wuser; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_bid; - logic [N_PORTS-1:0] [1:0] int_bresp; - logic [N_PORTS-1:0] int_bvalid; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_buser; - logic [N_PORTS-1:0] int_bready; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_arid; - logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_araddr; - logic [N_PORTS-1:0] int_arvalid; - logic [N_PORTS-1:0] int_arready; - logic [N_PORTS-1:0] [7:0] int_arlen; - logic [N_PORTS-1:0] [2:0] int_arsize; - logic [N_PORTS-1:0] [1:0] int_arburst; - logic [N_PORTS-1:0] int_arlock; - logic [N_PORTS-1:0] [2:0] int_arprot; - logic [N_PORTS-1:0] [3:0] int_arcache; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_aruser; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_rid; - logic [N_PORTS-1:0] [1:0] int_rresp; - logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_rdata; - logic [N_PORTS-1:0] int_rlast; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_ruser; - logic [N_PORTS-1:0] int_rvalid; - logic [N_PORTS-1:0] int_rready; - - // rab_core outputs - logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr; - logic [N_PORTS-1:0] int_wtrans_accept; - logic [N_PORTS-1:0] int_wtrans_drop; - logic [N_PORTS-1:0] int_wtrans_miss; - logic [N_PORTS-1:0] int_wtrans_sent; - logic [N_PORTS-1:0] int_wtrans_cache_coherent; - logic [N_PORTS-1:0] int_wmaster_select; - - logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr; - logic [N_PORTS-1:0] int_rtrans_accept; - logic [N_PORTS-1:0] int_rtrans_drop; - logic [N_PORTS-1:0] int_rtrans_miss; - logic [N_PORTS-1:0] int_rtrans_sent; - logic [N_PORTS-1:0] int_rtrans_cache_coherent; - logic [N_PORTS-1:0] int_rmaster_select; - - logic [N_PORTS-1:0] w_master_select; - - // Internal master0 AXI4 lines. 
These connect the first master port to the - // multiplexers - // For channels read address, write address and write data the other lines - // are ignored if valid is not set, therefore we only need to multiplex those - logic [N_PORTS-1:0] int_m0_awvalid; - logic [N_PORTS-1:0] int_m0_awready; - - logic [N_PORTS-1:0] int_m0_wvalid; - logic [N_PORTS-1:0] int_m0_wready; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_bid; - logic [N_PORTS-1:0] [1:0] int_m0_bresp; - logic [N_PORTS-1:0] int_m0_bvalid; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_buser; - logic [N_PORTS-1:0] int_m0_bready; - - logic [N_PORTS-1:0] int_m0_arvalid; - logic [N_PORTS-1:0] int_m0_arready; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_rid; - logic [N_PORTS-1:0] [1:0] int_m0_rresp; - logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m0_rdata; - logic [N_PORTS-1:0] int_m0_rlast; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_ruser; - logic [N_PORTS-1:0] int_m0_rready; - logic [N_PORTS-1:0] int_m0_rvalid; - - logic [N_PORTS-1:0] l1_m0_ar_accept; - logic [N_PORTS-1:0] l1_m0_ar_drop; - logic [N_PORTS-1:0] l1_m0_ar_save; - logic [N_PORTS-1:0] l1_m0_ar_done; - logic [N_PORTS-1:0] l2_m0_ar_accept; - logic [N_PORTS-1:0] l2_m0_ar_drop; - logic [N_PORTS-1:0] l2_m0_ar_done; - logic [N_PORTS-1:0] l2_m0_ar_sending; - - logic [N_PORTS-1:0] l1_m0_aw_accept; - logic [N_PORTS-1:0] l1_m0_aw_drop; - logic [N_PORTS-1:0] l1_m0_aw_save; - logic [N_PORTS-1:0] l1_m0_aw_done; - logic [N_PORTS-1:0] l2_m0_aw_accept; - logic [N_PORTS-1:0] l2_m0_aw_drop; - logic [N_PORTS-1:0] l2_m0_aw_done; - logic [N_PORTS-1:0] l2_m0_aw_sending; - - // Internal master1 AXI4 lines. 
These connect the second master port to the - // multiplexers - // For channels read address, write address and write data the other lines - // are ignored if valid is not set, therefore we only need to multiplex those - logic [N_PORTS-1:0] int_m1_awvalid; - logic [N_PORTS-1:0] int_m1_awready; - - logic [N_PORTS-1:0] int_m1_wvalid; - logic [N_PORTS-1:0] int_m1_wready; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_bid; - logic [N_PORTS-1:0] [1:0] int_m1_bresp; - logic [N_PORTS-1:0] int_m1_bvalid; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_buser; - logic [N_PORTS-1:0] int_m1_bready; - - logic [N_PORTS-1:0] int_m1_arvalid; - logic [N_PORTS-1:0] int_m1_arready; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_rid; - logic [N_PORTS-1:0] [1:0] int_m1_rresp; - logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m1_rdata; - logic [N_PORTS-1:0] int_m1_rlast; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_ruser; - logic [N_PORTS-1:0] int_m1_rvalid; - logic [N_PORTS-1:0] int_m1_rready; - - logic [N_PORTS-1:0] l1_m1_ar_accept; - logic [N_PORTS-1:0] l1_m1_ar_drop; - logic [N_PORTS-1:0] l1_m1_ar_save; - logic [N_PORTS-1:0] l1_m1_ar_done; - logic [N_PORTS-1:0] l2_m1_ar_accept; - logic [N_PORTS-1:0] l2_m1_ar_drop; - logic [N_PORTS-1:0] l2_m1_ar_done; - - logic [N_PORTS-1:0] l1_m1_aw_accept; - logic [N_PORTS-1:0] l1_m1_aw_drop; - logic [N_PORTS-1:0] l1_m1_aw_save; - logic [N_PORTS-1:0] l1_m1_aw_done; - logic [N_PORTS-1:0] l2_m1_aw_accept; - logic [N_PORTS-1:0] l2_m1_aw_drop; - logic [N_PORTS-1:0] l2_m1_aw_done; - - // L1 outputs - logic [N_PORTS-1:0] rab_miss; // L1 RAB miss - logic [N_PORTS-1:0] rab_prot; - logic [N_PORTS-1:0] rab_multi; - logic [N_PORTS-1:0] rab_prefetch; - - // - // Signals used to support L2 TLB - // - // L2 RAM configuration signals - logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D; - logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D; - logic [N_PORTS-1:0] L2CfgWE_S; - - // L1 output and drop Buffer - logic [N_PORTS-1:0] 
L1OutRwType_D, L1DropRwType_DP; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP; - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP; - logic [N_PORTS-1:0] [7:0] L1OutLen_D, L1DropLen_DP; - logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP; - logic [N_PORTS-1:0] L1OutProt_D, L1DropProt_DP; - logic [N_PORTS-1:0] L1OutMulti_D, L1DropMulti_DP; - logic [N_PORTS-1:0] L1DropEn_S; - logic [N_PORTS-1:0] L1DropPrefetch_S; - - logic [N_PORTS-1:0] L1DropValid_SN, L1DropValid_SP; - - // L2 input Buffer - logic [N_PORTS-1:0] L2InRwType_DP; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2InUser_DP; - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2InId_DP; - logic [N_PORTS-1:0] [7:0] L2InLen_DP; - logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP; - logic [N_PORTS-1:0] L2InEn_S; - - // L2 output Buffer - logic [N_PORTS-1:0] L2OutRwType_DP; - logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2OutUser_DP; - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2OutId_DP; - logic [N_PORTS-1:0] [7:0] L2OutLen_DP; - logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP; - - logic [N_PORTS-1:0] L2OutHit_SN, L2OutHit_SP; - logic [N_PORTS-1:0] L2OutMiss_SN, L2OutMiss_SP; - logic [N_PORTS-1:0] L2OutProt_SN, L2OutProt_SP; - logic [N_PORTS-1:0] L2OutMulti_SN, L2OutMulti_SP; - logic [N_PORTS-1:0] L2OutCC_SN, L2OutCC_SP; - logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP; - - logic [N_PORTS-1:0] L2OutValid_SN, L2OutValid_SP; - logic [N_PORTS-1:0] L2OutPrefetch_S; - logic [N_PORTS-1:0] L2OutReady_S; - logic [N_PORTS-1:0] L2OutEn_S; - - // L2 outputs - logic [N_PORTS-1:0] L2Busy_S; - logic [N_PORTS-1:0] L2OutValid_S; - - logic [N_PORTS-1:0] L2Miss_S; - - // Signals for interfacing the AXI modules - logic [N_PORTS-1:0] l1_ar_accept; - logic [N_PORTS-1:0] l1_aw_accept; - logic [N_PORTS-1:0] l1_w_accept; - logic [N_PORTS-1:0] l1_xw_accept; - - logic [N_PORTS-1:0] l1_ar_drop; - logic [N_PORTS-1:0] l1_aw_drop; - logic [N_PORTS-1:0] l1_w_drop; - 
logic [N_PORTS-1:0] l1_xw_drop; - - logic [N_PORTS-1:0] l1_ar_save; - logic [N_PORTS-1:0] l1_aw_save; - logic [N_PORTS-1:0] l1_w_save; - logic [N_PORTS-1:0] l1_xw_save; - - logic [N_PORTS-1:0] l1_ar_done; - logic [N_PORTS-1:0] l1_r_done; - logic [N_PORTS-1:0] l1_r_drop; - logic [N_PORTS-1:0] lx_r_drop; - logic [N_PORTS-1:0] lx_r_done; - - logic [N_PORTS-1:0] l1_aw_done; - logic [N_PORTS-1:0] l1_w_done; - logic [N_PORTS-1:0] l1_xw_done; - logic [N_PORTS-1:0] l1_aw_done_SP; - logic [N_PORTS-1:0] l1_w_done_SP; - - logic [N_PORTS-1:0] l2_ar_accept; - logic [N_PORTS-1:0] l2_aw_accept; - logic [N_PORTS-1:0] l2_w_accept; - logic [N_PORTS-1:0] l2_xw_accept; - - logic [N_PORTS-1:0] l2_ar_drop; - logic [N_PORTS-1:0] l2_r_drop; - logic [N_PORTS-1:0] l2_xr_drop; - logic [N_PORTS-1:0] l2_aw_drop; - logic [N_PORTS-1:0] l2_w_drop; - logic [N_PORTS-1:0] l2_xw_drop; - - logic [N_PORTS-1:0] l2_aw_done; - logic [N_PORTS-1:0] l2_w_done; - logic [N_PORTS-1:0] l2_xw_done; - logic [N_PORTS-1:0] l2_aw_done_SP; - logic [N_PORTS-1:0] l2_w_done_SP; - - logic [N_PORTS-1:0] l2_ar_done; - logic [N_PORTS-1:0] l2_r_done; - logic [N_PORTS-1:0] l2_xr_done; - logic [N_PORTS-1:0] l2_ar_done_SP; - logic [N_PORTS-1:0] l2_r_done_SP; - - logic [N_PORTS-1:0] l1_mx_aw_done; - logic [N_PORTS-1:0] l1_mx_ar_done; - logic [N_PORTS-1:0] l1_m0_aw_done_SP; - logic [N_PORTS-1:0] l1_m0_ar_done_SP; - logic [N_PORTS-1:0] l1_m1_aw_done_SP; - logic [N_PORTS-1:0] l1_m1_ar_done_SP; - - logic [N_PORTS-1:0] l2_mx_aw_done; - logic [N_PORTS-1:0] l2_mx_ar_done; - logic [N_PORTS-1:0] l2_m0_aw_done_SP; - logic [N_PORTS-1:0] l2_m0_ar_done_SP; - logic [N_PORTS-1:0] l2_m1_aw_done_SP; - logic [N_PORTS-1:0] l2_m1_ar_done_SP; - - logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop; - logic [N_PORTS-1:0] [7:0] l1_len_drop, lx_len_drop; - logic [N_PORTS-1:0] l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop; - logic [N_PORTS-1:0] l1_hit_drop, lx_hit_drop, b_hit_drop; - - logic [N_PORTS-1:0] b_drop; - logic 
[N_PORTS-1:0] b_done; - - logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr; - logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr; - - logic [N_PORTS-1:0] l2_cache_coherent; - logic [N_PORTS-1:0] l2_master_select; - - logic [N_PORTS-1:0] aw_in_stall; - logic [N_PORTS-1:0] aw_out_stall; - - genvar i; - - // RRESP FSM - typedef enum logic {IDLE, BUSY} r_resp_mux_ctrl_state_t; - r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP; - logic [N_PORTS-1:0] RRespSel_SN, RRespSel_SP; - logic [N_PORTS-1:0] RRespBurst_S; - logic [N_PORTS-1:0] RRespSelIm_S; - - // }}} - - // Local parameters {{{ - - // Enable L2 for select ports - localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY; - - // L2TLB parameters - localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13; - - // }}} - - // Derive `master_select` from cache coherency flag. {{{ - `ifdef EN_ACP - assign int_wmaster_select = int_wtrans_cache_coherent; - assign int_rmaster_select = int_rtrans_cache_coherent; - assign l2_master_select = l2_cache_coherent; - `else - assign int_wmaster_select = '0; - assign int_rmaster_select = '0; - assign l2_master_select = '0; - `endif - // }}} - - // Buf and Send {{{ - // ██████╗ ██╗ ██╗███████╗ ██╗ ███████╗███████╗███╗ ██╗██████╗ - // ██╔══██╗██║ ██║██╔════╝ ██║ ██╔════╝██╔════╝████╗ ██║██╔══██╗ - // ██████╔╝██║ ██║█████╗ ████████╗ ███████╗█████╗ ██╔██╗ ██║██║ ██║ - // ██╔══██╗██║ ██║██╔══╝ ██╔═██╔═╝ ╚════██║██╔══╝ ██║╚██╗██║██║ ██║ - // ██████╔╝╚██████╔╝██║ ██████║ ███████║███████╗██║ ╚████║██████╔╝ - // ╚═════╝ ╚═════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝╚═╝ ╚═══╝╚═════╝ - // - logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst; - logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst; - - generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND - - // Write Address channel (aw) {{{ - /* - * write address channel (aw) - * - * ██╗ ██╗██████╗ ██╗████████╗███████╗ █████╗ ██████╗ ██████╗ ██████╗ - * ██║ 
██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗ - * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ███████║██║ ██║██║ ██║██████╔╝ - * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██║██║ ██║██║ ██║██╔══██╗ - * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║██████╔╝██████╔╝██║ ██║ - * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝ - * - */ - - axi4_aw_buffer - #( - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_aw_buffer - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_awid ( s_axi4_awid[i] ), - .s_axi4_awaddr ( s_axi4_awaddr[i] ), - .s_axi4_awvalid ( s_axi4_awvalid[i] ), - .s_axi4_awready ( s_axi4_awready[i] ), - .s_axi4_awlen ( s_axi4_awlen[i] ), - .s_axi4_awsize ( s_axi4_awsize[i] ), - .s_axi4_awburst ( s_axi4_awburst[i] ), - .s_axi4_awlock ( s_axi4_awlock[i] ), - .s_axi4_awprot ( s_axi4_awprot[i] ), - .s_axi4_awcache ( s_axi4_awcache[i] ), - .s_axi4_awregion ( s_axi4_awregion[i] ), - .s_axi4_awqos ( s_axi4_awqos[i] ), - .s_axi4_awuser ( s_axi4_awuser[i] ), - .m_axi4_awid ( int_awid[i] ), - .m_axi4_awaddr ( int_awaddr[i] ), - .m_axi4_awvalid ( int_awvalid[i] ), - .m_axi4_awready ( int_awready[i] ), - .m_axi4_awlen ( int_awlen[i] ), - .m_axi4_awsize ( int_awsize[i] ), - .m_axi4_awburst ( int_awburst[i] ), - .m_axi4_awlock ( int_awlock[i] ), - .m_axi4_awprot ( int_awprot[i] ), - .m_axi4_awcache ( int_awcache[i] ), - .m_axi4_awregion ( int_awregion[i] ), - .m_axi4_awqos ( int_awqos[i] ), - .m_axi4_awuser ( int_awuser[i] ) - ); - - axi4_aw_sender - #( - .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ), - .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) - ) - u_aw_sender_m0 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .l1_done_o ( l1_m0_aw_done[i] ), - .l1_accept_i ( l1_m0_aw_accept[i] ), - .l1_drop_i ( l1_m0_aw_drop[i] ), - .l1_save_i ( l1_m0_aw_save[i] ), - .l2_done_o ( l2_m0_aw_done[i] ), - .l2_accept_i ( l2_m0_aw_accept[i] ), - .l2_drop_i ( 
l2_m0_aw_drop[i] ), - .l2_sending_o ( l2_m0_aw_sending[i] ), - .l1_awaddr_i ( int_wtrans_addr[i] ), - .l2_awaddr_i ( l2_aw_addr[i] ), - .s_axi4_awid ( int_awid[i] ), - .s_axi4_awvalid ( int_m0_awvalid[i] ), - .s_axi4_awready ( int_m0_awready[i] ), - .s_axi4_awlen ( int_awlen[i] ), - .s_axi4_awsize ( int_awsize[i] ), - .s_axi4_awburst ( int_awburst[i] ), - .s_axi4_awlock ( int_awlock[i] ), - .s_axi4_awprot ( int_awprot[i] ), - .s_axi4_awcache ( int_awcache[i] ), - .s_axi4_awregion ( int_awregion[i] ), - .s_axi4_awqos ( int_awqos[i] ), - .s_axi4_awuser ( int_awuser[i] ), - .m_axi4_awid ( m0_axi4_awid[i] ), - .m_axi4_awaddr ( m0_axi4_awaddr[i] ), - .m_axi4_awvalid ( m0_axi4_awvalid[i] ), - .m_axi4_awready ( m0_axi4_awready[i] ), - .m_axi4_awlen ( m0_axi4_awlen[i] ), - .m_axi4_awsize ( m0_axi4_awsize[i] ), - .m_axi4_awburst ( m0_axi4_awburst[i] ), - .m_axi4_awlock ( m0_axi4_awlock[i] ), - .m_axi4_awprot ( m0_axi4_awprot[i] ), - .m_axi4_awcache ( ), - .m_axi4_awregion ( m0_axi4_awregion[i] ), - .m_axi4_awqos ( m0_axi4_awqos[i] ), - .m_axi4_awuser ( m0_axi4_awuser[i] ) - ); - - // The AXCACHE signals are set according to burstiness and cache coherence or statically - // when not connected to ACP on Zynq (implemented below). 
- assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00); - `ifndef EN_ACP - always_comb begin - if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin - if (m0_write_is_burst[i]) begin - m0_axi4_awcache[i] = 4'b0111; - end else begin - m0_axi4_awcache[i] = 4'b1111; - end - end else begin - m0_axi4_awcache[i] = 4'b0011; - end - end - `else - assign m0_axi4_awcache[i] = 4'b0011; - `endif - - axi4_aw_sender - #( - .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ), - .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) - ) - u_aw_sender_m1 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .l1_accept_i ( l1_m1_aw_accept[i] ), - .l1_drop_i ( l1_m1_aw_drop[i] ), - .l1_save_i ( l1_m1_aw_save[i] ), - .l1_done_o ( l1_m1_aw_done[i] ), - .l2_accept_i ( l2_m1_aw_accept[i] ), - .l2_drop_i ( l2_m1_aw_drop[i] ), - .l2_done_o ( l2_m1_aw_done[i] ), - .l2_sending_o ( ), // just helps to set axcache - .l1_awaddr_i ( int_wtrans_addr[i] ), - .l2_awaddr_i ( l2_aw_addr[i] ), - .s_axi4_awid ( int_awid[i] ), - .s_axi4_awvalid ( int_m1_awvalid[i] ), - .s_axi4_awready ( int_m1_awready[i] ), - .s_axi4_awlen ( int_awlen[i] ), - .s_axi4_awsize ( int_awsize[i] ), - .s_axi4_awburst ( int_awburst[i] ), - .s_axi4_awlock ( int_awlock[i] ), - .s_axi4_awprot ( int_awprot[i] ), - .s_axi4_awcache ( int_awcache[i] ), - .s_axi4_awregion ( int_awregion[i] ), - .s_axi4_awqos ( int_awqos[i] ), - .s_axi4_awuser ( int_awuser[i] ), - .m_axi4_awid ( m1_axi4_awid[i] ), - .m_axi4_awaddr ( m1_axi4_awaddr[i] ), - .m_axi4_awvalid ( m1_axi4_awvalid[i] ), - .m_axi4_awready ( m1_axi4_awready[i] ), - .m_axi4_awlen ( m1_axi4_awlen[i] ), - .m_axi4_awsize ( m1_axi4_awsize[i] ), - .m_axi4_awburst ( m1_axi4_awburst[i] ), - .m_axi4_awlock ( m1_axi4_awlock[i] ), - .m_axi4_awprot ( m1_axi4_awprot[i] ), - .m_axi4_awcache ( ), - .m_axi4_awregion ( m1_axi4_awregion[i] ), - .m_axi4_awqos ( m1_axi4_awqos[i] ), - 
.m_axi4_awuser ( m1_axi4_awuser[i] ) - ); - - // The AXCACHE signals are set according to burstiness and cache coherence or statically - // when not connected to ACP on Zynq (implemented below). - assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00); - `ifdef EN_ACP - always_comb begin - if (m1_write_is_burst[i]) begin - m1_axi4_awcache[i] = 4'b1011; - end else begin - m1_axi4_awcache[i] = 4'b1111; - end - end - `else - assign m1_axi4_awcache[i] = 4'b0011; - `endif - - // }}} - - // Write Data channel (w) {{{ - /* - * write data channel (w) - * - * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ █████╗ ████████╗ █████╗ - * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗ - * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██║ ██║███████║ ██║ ███████║ - * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██║ ██║██╔══██║ ██║ ██╔══██║ - * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██████╔╝██║ ██║ ██║ ██║ ██║ - * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ - * - */ - axi4_w_buffer - #( - .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ), - .ENABLE_L2TLB ( ENABLE_L2TLB[i] ), - .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH ) - ) - u_w_buffer - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - - // L1 interface - .l1_done_o ( l1_w_done[i] ), - .l1_accept_i ( l1_w_accept[i] ), - .l1_save_i ( l1_w_save[i] ), - .l1_drop_i ( l1_w_drop[i] ), - .l1_master_i ( int_wmaster_select[i] ), - .l1_id_i ( l1_id_drop[i] ), - .l1_len_i ( l1_len_drop[i] ), - .l1_prefetch_i ( l1_prefetch_drop[i] ), - .l1_hit_i ( l1_hit_drop[i] ), - - // L2 interface - .l2_done_o ( l2_w_done[i] ), - .l2_accept_i ( l2_w_accept[i] ), - .l2_drop_i ( l2_w_drop[i] ), - .l2_master_i ( l2_master_select[i] ), - .l2_id_i ( lx_id_drop[i] ), - .l2_len_i ( lx_len_drop[i] ), - .l2_prefetch_i ( lx_prefetch_drop[i] ), - .l2_hit_i ( lx_hit_drop[i] ), - - // Top-level control outputs - .master_select_o ( w_master_select[i] ), - 
.input_stall_o ( aw_in_stall[i] ), // stall L1 AW input if request buffers full - .output_stall_o ( aw_out_stall[i] ), // stall L1 AW hit forwarding if bypass not possible - - // B sender interface - .b_drop_o ( b_drop[i] ), - .b_done_i ( b_done[i] ), - .id_o ( b_id_drop[i] ), - .prefetch_o ( b_prefetch_drop[i] ), - .hit_o ( b_hit_drop[i] ), - - // AXI W channel interfaces - .s_axi4_wdata ( s_axi4_wdata[i] ), - .s_axi4_wvalid ( s_axi4_wvalid[i] ), - .s_axi4_wready ( s_axi4_wready[i] ), - .s_axi4_wstrb ( s_axi4_wstrb[i] ), - .s_axi4_wlast ( s_axi4_wlast[i] ), - .s_axi4_wuser ( s_axi4_wuser[i] ), - .m_axi4_wdata ( int_wdata[i] ), - .m_axi4_wvalid ( int_wvalid[i] ), - .m_axi4_wready ( int_wready[i] ), - .m_axi4_wstrb ( int_wstrb[i] ), - .m_axi4_wlast ( int_wlast[i] ), - .m_axi4_wuser ( int_wuser[i] ) - ); - - axi4_w_sender - #( - .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_w_sender_m0 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_wdata ( int_wdata[i] ), - .s_axi4_wvalid ( int_m0_wvalid[i] ), - .s_axi4_wready ( int_m0_wready[i] ), - .s_axi4_wstrb ( int_wstrb[i] ), - .s_axi4_wlast ( int_wlast[i] ), - .s_axi4_wuser ( int_wuser[i] ), - .m_axi4_wdata ( m0_axi4_wdata[i] ), - .m_axi4_wvalid ( m0_axi4_wvalid[i] ), - .m_axi4_wready ( m0_axi4_wready[i] ), - .m_axi4_wstrb ( m0_axi4_wstrb[i] ), - .m_axi4_wlast ( m0_axi4_wlast[i] ), - .m_axi4_wuser ( m0_axi4_wuser[i] ) - ); - - axi4_w_sender - #( - .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - - ) - u_w_sender_m1 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_wdata ( int_wdata[i] ), - .s_axi4_wvalid ( int_m1_wvalid[i] ), - .s_axi4_wready ( int_m1_wready[i] ), - .s_axi4_wstrb ( int_wstrb[i] ), - .s_axi4_wlast ( int_wlast[i] ), - .s_axi4_wuser ( int_wuser[i] ), - .m_axi4_wdata ( m1_axi4_wdata[i] ), - .m_axi4_wvalid ( m1_axi4_wvalid[i] ), - .m_axi4_wready ( m1_axi4_wready[i] ), - .m_axi4_wstrb ( m1_axi4_wstrb[i] ), - .m_axi4_wlast 
( m1_axi4_wlast[i] ), - .m_axi4_wuser ( m1_axi4_wuser[i] ) - ); - - /* - * Multiplexer to switch between the two output master ports on the write data (w) channel - */ - always_comb begin - /* Only one output can be selected at any time */ - if (w_master_select[i] == 1'b0) begin - int_m0_wvalid[i] = int_wvalid[i]; - int_m1_wvalid[i] = 1'b0; - int_wready[i] = int_m0_wready[i]; - end else begin - int_m0_wvalid[i] = 1'b0; - int_m1_wvalid[i] = int_wvalid[i]; - int_wready[i] = int_m1_wready[i]; - end - end - - // }}} - - // Write Response channel (b) {{{ - /* - * write response channel (b) - * - * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ ███████╗███████╗██████╗ - * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔════╝██╔════╝██╔══██╗ - * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██████╔╝█████╗ ███████╗██████╔╝ - * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██╗██╔══╝ ╚════██║██╔═══╝ - * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║███████╗███████║██║ - * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ - * - */ - axi4_b_buffer - #( - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_b_buffer_m0 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_bid ( int_m0_bid[i] ), - .s_axi4_bresp ( int_m0_bresp[i] ), - .s_axi4_bvalid ( int_m0_bvalid[i] ), - .s_axi4_buser ( int_m0_buser[i] ), - .s_axi4_bready ( int_m0_bready[i] ), - .m_axi4_bid ( m0_axi4_bid[i] ), - .m_axi4_bresp ( m0_axi4_bresp[i] ), - .m_axi4_bvalid ( m0_axi4_bvalid[i] ), - .m_axi4_buser ( m0_axi4_buser[i] ), - .m_axi4_bready ( m0_axi4_bready[i] ) - ); - - axi4_b_buffer - #( - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_b_buffer_m1 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_bid ( int_m1_bid[i] ), - .s_axi4_bresp ( int_m1_bresp[i] ), - .s_axi4_bvalid ( int_m1_bvalid[i] ), - .s_axi4_buser ( int_m1_buser[i] ), - .s_axi4_bready ( int_m1_bready[i] ), - .m_axi4_bid ( m1_axi4_bid[i] ), - .m_axi4_bresp ( m1_axi4_bresp[i] ), - 
.m_axi4_bvalid ( m1_axi4_bvalid[i] ), - .m_axi4_buser ( m1_axi4_buser[i] ), - .m_axi4_bready ( m1_axi4_bready[i] ) - ); - - axi4_b_sender - #( - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_b_sender - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .drop_i ( b_drop[i] ), - .done_o ( b_done[i] ), - .id_i ( b_id_drop[i] ), - .prefetch_i ( b_prefetch_drop[i] ), - .hit_i ( b_hit_drop[i] ), - .s_axi4_bid ( s_axi4_bid[i] ), - .s_axi4_bresp ( s_axi4_bresp[i] ), - .s_axi4_bvalid ( s_axi4_bvalid[i] ), - .s_axi4_buser ( s_axi4_buser[i] ), - .s_axi4_bready ( s_axi4_bready[i] ), - .m_axi4_bid ( int_bid[i] ), - .m_axi4_bresp ( int_bresp[i] ), - .m_axi4_bvalid ( int_bvalid[i] ), - .m_axi4_buser ( int_buser[i] ), - .m_axi4_bready ( int_bready[i] ) - ); - - /* - * Multiplexer to switch between the two output master ports on the write response (b) channel - */ - always_comb begin - /* Output 1 always gets priority, so if it has something to send connect - it and let output 0 wait using rready = 0 */ - if (int_m1_bvalid[i] == 1'b1) begin - int_m0_bready[i] = 1'b0; - int_m1_bready[i] = int_bready[i]; - - int_bid[i] = int_m1_bid[i]; - int_bresp[i] = int_m1_bresp[i]; - int_buser[i] = int_m1_buser[i]; - int_bvalid[i] = int_m1_bvalid[i]; - end else begin - int_m0_bready[i] = int_bready[i]; - int_m1_bready[i] = 1'b0; - - int_bid[i] = int_m0_bid[i]; - int_bresp[i] = int_m0_bresp[i]; - int_buser[i] = int_m0_buser[i]; - int_bvalid[i] = int_m0_bvalid[i]; - end - end - - // }}} - - // Read Address channel (ar) {{{ - /* - * read address channel (ar) - * - * ██████╗ ███████╗ █████╗ ██████╗ █████╗ ██████╗ ██████╗ ██████╗ - * ██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗ - * ██████╔╝█████╗ ███████║██║ ██║ ███████║██║ ██║██║ ██║██████╔╝ - * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██║██║ ██║██║ ██║██╔══██╗ - * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║██████╔╝██████╔╝██║ ██║ - * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝ - * - */ 
- axi4_ar_buffer - #( - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_ar_buffer - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_arid ( s_axi4_arid[i] ), - .s_axi4_araddr ( s_axi4_araddr[i] ), - .s_axi4_arvalid ( s_axi4_arvalid[i] ), - .s_axi4_arready ( s_axi4_arready[i] ), - .s_axi4_arlen ( s_axi4_arlen[i] ), - .s_axi4_arsize ( s_axi4_arsize[i] ), - .s_axi4_arburst ( s_axi4_arburst[i] ), - .s_axi4_arlock ( s_axi4_arlock[i] ), - .s_axi4_arprot ( s_axi4_arprot[i] ), - .s_axi4_arcache ( s_axi4_arcache[i] ), - .s_axi4_aruser ( s_axi4_aruser[i] ), - .m_axi4_arid ( int_arid[i] ), - .m_axi4_araddr ( int_araddr[i] ), - .m_axi4_arvalid ( int_arvalid[i] ), - .m_axi4_arready ( int_arready[i] ), - .m_axi4_arlen ( int_arlen[i] ), - .m_axi4_arsize ( int_arsize[i] ), - .m_axi4_arburst ( int_arburst[i] ), - .m_axi4_arlock ( int_arlock[i] ), - .m_axi4_arprot ( int_arprot[i] ), - .m_axi4_arcache ( int_arcache[i] ), - .m_axi4_aruser ( int_aruser[i] ) - ); - - axi4_ar_sender - #( - .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ), - .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) - ) - u_ar_sender_m0 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .l1_done_o ( l1_m0_ar_done[i] ), - .l1_accept_i ( l1_m0_ar_accept[i] ), - .l1_drop_i ( l1_m0_ar_drop[i] ), - .l1_save_i ( l1_m0_ar_save[i] ), - .l2_done_o ( l2_m0_ar_done[i] ), - .l2_accept_i ( l2_m0_ar_accept[i] ), - .l2_drop_i ( l2_m0_ar_drop[i] ), - .l2_sending_o ( l2_m0_ar_sending[i] ), - .l1_araddr_i ( int_rtrans_addr[i] ), - .l2_araddr_i ( l2_ar_addr[i] ), - .s_axi4_arid ( int_arid[i] ), - .s_axi4_arvalid ( int_m0_arvalid[i] ), - .s_axi4_arready ( int_m0_arready[i] ), - .s_axi4_arlen ( int_arlen[i] ), - .s_axi4_arsize ( int_arsize[i] ), - .s_axi4_arburst ( int_arburst[i] ), - .s_axi4_arlock ( int_arlock[i] ), - .s_axi4_arprot ( int_arprot[i] ), - .s_axi4_arcache ( int_arcache[i] ), - .s_axi4_aruser ( int_aruser[i] ), - .m_axi4_arid 
( m0_axi4_arid[i] ), - .m_axi4_araddr ( m0_axi4_araddr[i] ), - .m_axi4_arvalid ( m0_axi4_arvalid[i] ), - .m_axi4_arready ( m0_axi4_arready[i] ), - .m_axi4_arlen ( m0_axi4_arlen[i] ), - .m_axi4_arsize ( m0_axi4_arsize[i] ), - .m_axi4_arburst ( m0_axi4_arburst[i] ), - .m_axi4_arlock ( m0_axi4_arlock[i] ), - .m_axi4_arprot ( m0_axi4_arprot[i] ), - .m_axi4_arcache ( ), - .m_axi4_aruser ( m0_axi4_aruser[i] ) - ); - - // The AXCACHE signals are set according to burstiness and cache coherence or statically - // when not connected to ACP on Zynq (implemented below). - assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00); - `ifndef EN_ACP - always_comb begin - if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin - if (m0_read_is_burst[i]) begin - m0_axi4_arcache[i] = 4'b1011; - end else begin - m0_axi4_arcache[i] = 4'b1111; - end - end else begin - m0_axi4_arcache[i] = 4'b0011; - end - end - `else - assign m0_axi4_arcache[i] = 4'b0011; - `endif - - axi4_ar_sender - #( - .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ), - .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) - ) - u_ar_sender_m1 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .l1_done_o ( l1_m1_ar_done[i] ), - .l1_accept_i ( l1_m1_ar_accept[i] ), - .l1_drop_i ( l1_m1_ar_drop[i] ), - .l1_save_i ( l1_m1_ar_save[i] ), - .l2_done_o ( l2_m1_ar_done[i] ), - .l2_accept_i ( l2_m1_ar_accept[i] ), - .l2_drop_i ( l2_m1_ar_drop[i] ), - .l2_sending_o ( ), // just helps to set axcache - .l1_araddr_i ( int_rtrans_addr[i] ), - .l2_araddr_i ( l2_ar_addr[i] ), - .s_axi4_arid ( int_arid[i] ), - .s_axi4_arvalid ( int_m1_arvalid[i] ), - .s_axi4_arready ( int_m1_arready[i] ), - .s_axi4_arlen ( int_arlen[i] ), - .s_axi4_arsize ( int_arsize[i] ), - .s_axi4_arburst ( int_arburst[i] ), - .s_axi4_arlock ( int_arlock[i] ), - .s_axi4_arprot ( int_arprot[i] ), - .s_axi4_arcache ( int_arcache[i] ), - .s_axi4_aruser 
( int_aruser[i] ), - .m_axi4_arid ( m1_axi4_arid[i] ), - .m_axi4_araddr ( m1_axi4_araddr[i] ), - .m_axi4_arvalid ( m1_axi4_arvalid[i] ), - .m_axi4_arready ( m1_axi4_arready[i] ), - .m_axi4_arlen ( m1_axi4_arlen[i] ), - .m_axi4_arsize ( m1_axi4_arsize[i] ), - .m_axi4_arburst ( m1_axi4_arburst[i] ), - .m_axi4_arlock ( m1_axi4_arlock[i] ), - .m_axi4_arprot ( m1_axi4_arprot[i] ), - .m_axi4_arcache ( ), - .m_axi4_aruser ( m1_axi4_aruser[i] ) - ); - - // The AXCACHE signals are set according to burstiness and cache coherence or statically - // when not connected to ACP on Zynq (implemented below). - assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00); - `ifdef EN_ACP - always_comb begin - if (m1_read_is_burst[i]) begin - m1_axi4_arcache[i] = 4'b1011; - end else begin - m1_axi4_arcache[i] = 4'b1111; - end - end - `else - assign m1_axi4_arcache[i] = 4'b0011; - `endif - - // }}} - - // Read Response channel (r) {{{ - /* - * read response channel (r) - * - * ██████╗ ███████╗ █████╗ ██████╗ ██████╗ ███████╗███████╗██████╗ - * ██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔════╝██╔════╝██╔══██╗ - * ██████╔╝█████╗ ███████║██║ ██║ ██████╔╝█████╗ ███████╗██████╔╝ - * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██╗██╔══╝ ╚════██║██╔═══╝ - * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║███████╗███████║██║ - * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ - * - */ - axi4_r_buffer - #( - .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_r_buffer_m0 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_rid ( int_m0_rid[i] ), - .s_axi4_rresp ( int_m0_rresp[i] ), - .s_axi4_rdata ( int_m0_rdata[i] ), - .s_axi4_rlast ( int_m0_rlast[i] ), - .s_axi4_rvalid ( int_m0_rvalid[i] ), - .s_axi4_ruser ( int_m0_ruser[i] ), - .s_axi4_rready ( int_m0_rready[i] ), - .m_axi4_rid ( m0_axi4_rid[i] ), - .m_axi4_rresp ( m0_axi4_rresp[i] ), - .m_axi4_rdata ( m0_axi4_rdata[i] ), - .m_axi4_rlast ( 
m0_axi4_rlast[i] ), - .m_axi4_rvalid ( m0_axi4_rvalid[i] ), - .m_axi4_ruser ( m0_axi4_ruser[i] ), - .m_axi4_rready ( m0_axi4_rready[i] ) - ); - - axi4_r_buffer - #( - .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_r_buffer_m1 - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .s_axi4_rid ( int_m1_rid[i] ), - .s_axi4_rresp ( int_m1_rresp[i] ), - .s_axi4_rdata ( int_m1_rdata[i] ), - .s_axi4_rlast ( int_m1_rlast[i] ), - .s_axi4_rvalid ( int_m1_rvalid[i] ), - .s_axi4_ruser ( int_m1_ruser[i] ), - .s_axi4_rready ( int_m1_rready[i] ), - .m_axi4_rid ( m1_axi4_rid[i] ), - .m_axi4_rresp ( m1_axi4_rresp[i] ), - .m_axi4_rdata ( m1_axi4_rdata[i] ), - .m_axi4_rlast ( m1_axi4_rlast[i] ), - .m_axi4_rvalid ( m1_axi4_rvalid[i] ), - .m_axi4_ruser ( m1_axi4_ruser[i] ), - .m_axi4_rready ( m1_axi4_rready[i] ) - ); - - axi4_r_sender - #( - .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ) - ) - u_r_sender - ( - .axi4_aclk ( Clk_CI ), - .axi4_arstn ( Rst_RBI ), - .drop_i ( lx_r_drop[i] ), - .drop_len_i ( lx_len_drop[i] ), - .done_o ( lx_r_done[i] ), - .id_i ( lx_id_drop[i] ), - .prefetch_i ( lx_prefetch_drop[i] ), - .hit_i ( lx_hit_drop[i] ), - .s_axi4_rid ( s_axi4_rid[i] ), - .s_axi4_rresp ( s_axi4_rresp[i] ), - .s_axi4_rdata ( s_axi4_rdata[i] ), - .s_axi4_rlast ( s_axi4_rlast[i] ), - .s_axi4_rvalid ( s_axi4_rvalid[i] ), - .s_axi4_ruser ( s_axi4_ruser[i] ), - .s_axi4_rready ( s_axi4_rready[i] ), - .m_axi4_rid ( int_rid[i] ), - .m_axi4_rresp ( int_rresp[i] ), - .m_axi4_rdata ( int_rdata[i] ), - .m_axi4_rlast ( int_rlast[i] ), - .m_axi4_rvalid ( int_rvalid[i] ), - .m_axi4_ruser ( int_ruser[i] ), - .m_axi4_rready ( int_rready[i] ) - ); - - /* - * Multiplexer to switch between the two output master ports on the read response(r) channel - * - * Do not perform read burst interleaving as the DMA does not support it. 
This means we can only - * switch between the two masters upon sending rlast or when idle. - * - * However, if the downstream already performs burst interleaving, this cannot be undone here. - * Also, the downstream may interleave a burst reponse with a single-beat transaction. In this - * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving - * after such an event, it gives priority to the master which received the last burst in case - * both have a have a burst ready (rvalid). - * - * Order of priority: - * 1. Ongoing burst transaction - * 2. Single-beat transaction on Master 1. - * 3. Single-beat transaction on Master 0. - * 4. Burst transaction on master that received the last burst. - */ - // Select signal - always_ff @(posedge Clk_CI) begin - if (Rst_RBI == 0) begin - RRespSel_SP[i] <= 1'b0; - end else begin - RRespSel_SP[i] <= RRespSel_SN[i]; - end - end - - // FSM - always_comb begin : RRespMuxFsm - RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i]; - RRespSel_SN[i] = RRespSel_SP[i]; - - RRespBurst_S[i] = 1'b0; - RRespSelIm_S[i] = 1'b0; - - unique case (RRespMuxCtrl_SP[i]) - - IDLE: begin - // immediately forward single-beat transactions - if (int_m1_rvalid[i] && int_m1_rlast[i]) - RRespSelIm_S[i] = 1'b1; - else if (int_m0_rvalid[i] && int_m0_rlast[i]) - RRespSelIm_S[i] = 1'b0; - - // bursts - they also start immediately - else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin - RRespMuxCtrl_SN[i] = BUSY; - - // in case both are ready, continue with the master that had the last burst - if (int_m1_rvalid[i] && int_m0_rvalid[i]) begin - RRespSel_SN[i] = RRespSel_SP[i]; - RRespSelIm_S[i] = RRespSel_SP[i]; - end else if (int_m1_rvalid[i]) begin - RRespSel_SN[i] = 1'b1; - RRespSelIm_S[i] = 1'b1; - end else begin - RRespSel_SN[i] = 1'b0; - RRespSelIm_S[i] = 1'b0; - end - end - end - - BUSY: begin - RRespBurst_S[i] = 1'b1; - // detect last handshake of currently ongoing transfer - if (int_rvalid[i] && int_rready[i] && int_rlast[i]) - 
RRespMuxCtrl_SN[i] = IDLE; - end - - default: begin - RRespMuxCtrl_SN[i] = IDLE; - end - - endcase - end - - // FSM state - always_ff @(posedge Clk_CI) begin - if (Rst_RBI == 0) begin - RRespMuxCtrl_SP[i] <= IDLE; - end else begin - RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i]; - end - end - - // Actual multiplexer - always_comb begin - if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin - int_m0_rready[i] = 1'b0; - int_m1_rready[i] = int_rready[i]; - - int_rid[i] = int_m1_rid[i]; - int_rresp[i] = int_m1_rresp[i]; - int_rdata[i] = int_m1_rdata[i]; - int_rlast[i] = int_m1_rlast[i]; - int_ruser[i] = int_m1_ruser[i]; - int_rvalid[i] = int_m1_rvalid[i]; - end else begin - int_m0_rready[i] = int_rready[i]; - int_m1_rready[i] = 1'b0; - - int_rid[i] = int_m0_rid[i]; - int_rresp[i] = int_m0_rresp[i]; - int_rdata[i] = int_m0_rdata[i]; - int_rlast[i] = int_m0_rlast[i]; - int_ruser[i] = int_m0_ruser[i]; - int_rvalid[i] = int_m0_rvalid[i]; - end - end - - end // BUF & SEND - - // }}} - - endgenerate // BUF & SEND }}} - - // Log {{{ - -`ifdef RAB_AX_LOG_EN - AxiBramLogger - #( - .AXI_ID_BITW ( AXI_ID_WIDTH ), - .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ), - .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES ) - ) - u_aw_logger - ( - .Clk_CI ( NonGatedClk_CI ), - .TimestampClk_CI ( Clk_CI ), - .Rst_RBI ( Rst_RBI ), - .AxiValid_SI ( s_axi4_awvalid[1] ), - .AxiReady_SI ( s_axi4_awready[1] ), - .AxiId_DI ( s_axi4_awid[1] ), - .AxiAddr_DI ( s_axi4_awaddr[1] ), - .AxiLen_DI ( s_axi4_awlen[1] ), - .Clear_SI ( AwLogClr_SI ), - .LogEn_SI ( LogEn_SI ), - .Full_SO ( int_aw_log_full ), - .Ready_SO ( AwLogRdy_SO ), - .Bram_PS ( AwBram_PS ) - ); - - AxiBramLogger - #( - .AXI_ID_BITW ( AXI_ID_WIDTH ), - .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ), - .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES ) - ) - u_ar_logger - ( - .Clk_CI ( NonGatedClk_CI ), - .TimestampClk_CI ( Clk_CI ), - .Rst_RBI ( Rst_RBI ), - .AxiValid_SI ( s_axi4_arvalid[1] ), - .AxiReady_SI ( s_axi4_arready[1] ), - .AxiId_DI ( 
s_axi4_arid[1] ), - .AxiAddr_DI ( s_axi4_araddr[1] ), - .AxiLen_DI ( s_axi4_arlen[1] ), - .Clear_SI ( ArLogClr_SI ), - .LogEn_SI ( LogEn_SI ), - .Full_SO ( int_ar_log_full ), - .Ready_SO ( ArLogRdy_SO ), - .Bram_PS ( ArBram_PS ) - ); -`endif - - // }}} - - // RAB Core {{{ - // ██████╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗ ███████╗ - // ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔═══██╗██╔══██╗██╔════╝ - // ██████╔╝███████║██████╔╝ ██║ ██║ ██║██████╔╝█████╗ - // ██╔══██╗██╔══██║██╔══██╗ ██║ ██║ ██║██╔══██╗██╔══╝ - // ██║ ██║██║ ██║██████╔╝ ╚██████╗╚██████╔╝██║ ██║███████╗ - // ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ - // - /* - * rab_core - * - * The rab core translates addresses. It has two ports, which can be used - * independently, however they will compete for time internally, as lookups - * are serialized. - * - * type is the read(0) or write(1) used to check the protection flags. If they - * don't match an interrupt is created on the int_prot line. - */ - - rab_core - #( - .N_PORTS ( N_PORTS ), - .N_L2_SETS ( N_L2_SETS ), - .N_L2_SET_ENTRIES ( N_L2_SET_ENTRIES ), - .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), - .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ), - .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), - .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ), - .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ), - .AXI_ID_WIDTH ( AXI_ID_WIDTH ), - .AXI_USER_WIDTH ( AXI_USER_WIDTH ), - .MH_FIFO_DEPTH ( MH_FIFO_DEPTH ) - ) - u_rab_core - ( - .Clk_CI ( Clk_CI ), - .Rst_RBI ( Rst_RBI ), - - // Config IF - .s_axi_awaddr ( s_axi4lite_awaddr ), - .s_axi_awvalid ( s_axi4lite_awvalid ), - .s_axi_awready ( s_axi4lite_awready ), - .s_axi_wdata ( s_axi4lite_wdata ), - .s_axi_wstrb ( s_axi4lite_wstrb ), - .s_axi_wvalid ( s_axi4lite_wvalid ), - .s_axi_wready ( s_axi4lite_wready ), - .s_axi_bresp ( s_axi4lite_bresp ), - .s_axi_bvalid ( s_axi4lite_bvalid ), - .s_axi_bready ( s_axi4lite_bready ), - .s_axi_araddr ( s_axi4lite_araddr ), - .s_axi_arvalid ( s_axi4lite_arvalid ), - .s_axi_arready ( 
s_axi4lite_arready ), - .s_axi_rready ( s_axi4lite_rready ), - .s_axi_rdata ( s_axi4lite_rdata ), - .s_axi_rresp ( s_axi4lite_rresp ), - .s_axi_rvalid ( s_axi4lite_rvalid ), - - // L1 miss info outputs -> L2 TLB arbitration - .int_miss ( rab_miss ), - .int_multi ( rab_multi ), - .int_prot ( rab_prot ), - .int_prefetch ( rab_prefetch ), - .int_mhf_full ( int_mhf_full ), - - // L1 transaction info outputs -> L2 TLB arbitration - .int_axaddr_o ( L1OutAddr_D ), - .int_axid_o ( L1OutId_D ), - .int_axlen_o ( L1OutLen_D ), - .int_axuser_o ( L1OutUser_D ), - - // Write Req IF - .port1_addr ( int_awaddr ), - .port1_id ( int_awid ), - .port1_len ( int_awlen ), - .port1_size ( int_awsize ), - .port1_addr_valid ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests - .port1_type ( {N_PORTS{1'b1}} ), - .port1_user ( int_awuser ), - .port1_sent ( int_wtrans_sent ), // signal done to L1 FSM - .port1_out_addr ( int_wtrans_addr ), - .port1_cache_coherent ( int_wtrans_cache_coherent ), - .port1_accept ( int_wtrans_accept ), - .port1_drop ( int_wtrans_drop ), - .port1_miss ( int_wtrans_miss ), - - // Read Req IF - .port2_addr ( int_araddr ), - .port2_id ( int_arid ), - .port2_len ( int_arlen ), - .port2_size ( int_arsize ), - .port2_addr_valid ( int_arvalid ), - .port2_type ( {N_PORTS{1'b0}} ), - .port2_user ( int_aruser ), - .port2_sent ( int_rtrans_sent ), // signal done to L1 FSM - .port2_out_addr ( int_rtrans_addr ), - .port2_cache_coherent ( int_rtrans_cache_coherent ), - .port2_accept ( int_rtrans_accept ), - .port2_drop ( int_rtrans_drop ), - .port2_miss ( int_rtrans_miss ), - - // L2 miss info inputs -> axi_rab_cfg - .miss_l2_i ( L2Miss_S ), - .miss_l2_addr_i ( L2OutInAddr_DP ), - .miss_l2_id_i ( L2OutId_DP ), - .miss_l2_user_i ( L2OutUser_DP ), - - // L2 config outputs - .wdata_l2_o ( L2CfgWData_D ), - .waddr_l2_o ( L2CfgWAddr_D ), - .wren_l2_o ( L2CfgWE_S ) - ); - - // }}} - - // AX SPLITS {{{ - // █████╗ ██╗ ██╗ ███████╗██████╗ ██╗ ██╗████████╗ - // 
██╔══██╗╚██╗██╔╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝ - // ███████║ ╚███╔╝ ███████╗██████╔╝██║ ██║ ██║ - // ██╔══██║ ██╔██╗ ╚════██║██╔═══╝ ██║ ██║ ██║ - // ██║ ██║██╔╝ ██╗ ███████║██║ ███████╗██║ ██║ - // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝ - // - /** - * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels. - * - * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or - * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be - * saved until the L2 outputs are available. - */ - generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT - - /* - * When accepting L1 transactions, we must just do so on the selected master. Drop requests must - * be performed on any one of the two masters. Save requests must be performed by both masters. - */ - always_comb begin : AW_L1_SPLIT - - // TLB handshake - l1_m0_aw_accept[i] = 1'b0; - l1_m1_aw_accept[i] = 1'b0; - l1_m0_aw_drop[i] = 1'b0; - l1_m1_aw_drop[i] = 1'b0; - l1_m0_aw_save[i] = 1'b0; - l1_m1_aw_save[i] = 1'b0; - - l1_mx_aw_done[i] = 1'b0; - - // AXI sender input handshake - int_m0_awvalid[i] = 1'b0; - int_m1_awvalid[i] = 1'b0; - int_awready[i] = 1'b0; - - // accept on selected master only - if (l1_aw_accept[i]) begin - if (int_wmaster_select[i]) begin - l1_m1_aw_accept[i] = 1'b1; - l1_mx_aw_done[i] = l1_m1_aw_done[i]; - - int_m1_awvalid[i] = int_awvalid[i]; - int_awready[i] = int_m1_awready[i]; - - end else begin - l1_m0_aw_accept[i] = 1'b1; - l1_mx_aw_done[i] = l1_m0_aw_done[i]; - - int_m0_awvalid[i] = int_awvalid[i]; - int_awready[i] = int_m0_awready[i]; - end - - // drop on Master 0 only - end else if (l1_aw_drop[i]) begin - l1_m0_aw_drop[i] = 1'b1; - l1_mx_aw_done[i] = l1_m0_aw_done[i]; - - int_m0_awvalid[i] = int_awvalid[i]; - int_awready[i] = l1_m0_aw_done[i]; - - // save on both masters - end else if (l1_aw_save[i]) begin - // split save - l1_m0_aw_save[i] = ~l1_m0_aw_done_SP[i]; - l1_m1_aw_save[i] = 
~l1_m1_aw_done_SP[i]; - - // combine done - l1_mx_aw_done[i] = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i]; - - int_m0_awvalid[i] = int_awvalid[i]; - int_m1_awvalid[i] = int_awvalid[i]; - int_awready[i] = l1_mx_aw_done[i]; - end - end - - // signal back to handshake splitter - assign l1_aw_done[i] = l1_mx_aw_done[i]; - - always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG - if (Rst_RBI == 0) begin - l1_m0_aw_done_SP[i] <= 1'b0; - l1_m1_aw_done_SP[i] <= 1'b0; - end else if (l1_mx_aw_done[i]) begin - l1_m0_aw_done_SP[i] <= 1'b0; - l1_m1_aw_done_SP[i] <= 1'b0; - end else begin - l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i]; - l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i]; - end - end - - /* - * When accepting L2 transactions, we must drop the corresponding transaction from the other - * master to make it available again for save requests from L1_DROP_SAVE. - */ - always_comb begin : AW_L2_SPLIT - - l2_m0_aw_accept[i] = 1'b0; - l2_m1_aw_accept[i] = 1'b0; - l2_m0_aw_drop[i] = 1'b0; - l2_m1_aw_drop[i] = 1'b0; - - // de-assert request signals individually upon handshakes - if (l2_aw_accept[i]) begin - if (l2_master_select[i]) begin - l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i]; - l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i]; - - end else begin - l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i]; - l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i]; - - end - end else begin - l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0; - l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i] ? 
l2_aw_drop[i] : 1'b0; - - end - - // combine done - l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i]; - - l2_aw_done[i] = l2_mx_aw_done[i]; - end - - always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG - if (Rst_RBI == 0) begin - l2_m0_aw_done_SP[i] <= 1'b0; - l2_m1_aw_done_SP[i] <= 1'b0; - end else if (l2_mx_aw_done[i]) begin - l2_m0_aw_done_SP[i] <= 1'b0; - l2_m1_aw_done_SP[i] <= 1'b0; - end else begin - l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i]; - l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i]; - end - end - - /* - * When accepting L1 transactions, we must just do so on the selected master. Drop requests must - * be performed on any one of the two masters. Save requests must be performed by both masters. - */ - always_comb begin : AR_L1_SPLIT - - // TLB handshake - l1_m0_ar_accept[i] = 1'b0; - l1_m1_ar_accept[i] = 1'b0; - l1_m0_ar_drop[i] = 1'b0; - l1_m1_ar_drop[i] = 1'b0; - l1_m0_ar_save[i] = 1'b0; - l1_m1_ar_save[i] = 1'b0; - - l1_mx_ar_done[i] = 1'b0; - - // AXI sender input handshake - int_m0_arvalid[i] = 1'b0; - int_m1_arvalid[i] = 1'b0; - int_arready[i] = 1'b0; - - // accept on selected master only - if (l1_ar_accept[i]) begin - if (int_rmaster_select[i]) begin - l1_m1_ar_accept[i] = 1'b1; - l1_mx_ar_done[i] = l1_m1_ar_done[i]; - - int_m1_arvalid[i] = int_arvalid[i]; - int_arready[i] = int_m1_arready[i]; - - end else begin - l1_m0_ar_accept[i] = 1'b1; - l1_mx_ar_done[i] = l1_m0_ar_done[i]; - - int_m0_arvalid[i] = int_arvalid[i]; - int_arready[i] = int_m0_arready[i]; - end - - // drop on Master 0 only - end else if (l1_ar_drop[i]) begin - l1_m0_ar_drop[i] = 1'b1; - l1_mx_ar_done[i] = l1_m0_ar_done[i]; - - int_m0_arvalid[i] = int_arvalid[i]; - int_arready[i] = l1_m0_ar_done[i]; - - // save on both masters - end else if (l1_ar_save[i]) begin - // split save - l1_m0_ar_save[i] = ~l1_m0_ar_done_SP[i]; - l1_m1_ar_save[i] = ~l1_m1_ar_done_SP[i]; - - // combine done - l1_mx_ar_done[i] = l1_m0_ar_done_SP[i] & 
l1_m1_ar_done_SP[i]; - - int_m0_arvalid[i] = int_arvalid[i]; - int_m1_arvalid[i] = int_arvalid[i]; - int_arready[i] = l1_mx_ar_done[i]; - end - end - - // signal back to handshake splitter - assign l1_ar_done[i] = l1_mx_ar_done[i]; - - always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG - if (Rst_RBI == 0) begin - l1_m0_ar_done_SP[i] <= 1'b0; - l1_m1_ar_done_SP[i] <= 1'b0; - end else if (l1_mx_ar_done[i]) begin - l1_m0_ar_done_SP[i] <= 1'b0; - l1_m1_ar_done_SP[i] <= 1'b0; - end else begin - l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i]; - l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i]; - end - end - - /* - * When accepting L2 transactions, we must drop the corresponding transaction from the other - * master to make it available again for save requests from L1_DROP_SAVE. - */ - always_comb begin : AR_L2_SPLIT - - l2_m0_ar_accept[i] = 1'b0; - l2_m1_ar_accept[i] = 1'b0; - l2_m0_ar_drop[i] = 1'b0; - l2_m1_ar_drop[i] = 1'b0; - - // de-assert request signals individually upon handshakes - if (l2_ar_accept[i]) begin - if (l2_master_select[i]) begin - l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i]; - l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i]; - - end else begin - l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i]; - l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i]; - - end - end else if (l2_ar_drop[i]) begin - l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0; - l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i] ? 
l2_ar_drop[i] : 1'b0; - - end - - // combine done - l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i]; - - l2_ar_done[i] = l2_mx_ar_done[i]; - end - - always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG - if (Rst_RBI == 0) begin - l2_m0_ar_done_SP[i] <= 1'b0; - l2_m1_ar_done_SP[i] <= 1'b0; - end else if (l2_mx_ar_done[i]) begin - l2_m0_ar_done_SP[i] <= 1'b0; - l2_m1_ar_done_SP[i] <= 1'b0; - end else begin - l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i]; - l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i]; - end - end - - end // AX_SPLIT - endgenerate // AX_SPLIT - - // }}} - - // HANDSHAKE SPLITS {{{ - // ██╗ ██╗███████╗ ███████╗██████╗ ██╗ ██╗████████╗ - // ██║ ██║██╔════╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝ - // ███████║███████╗ ███████╗██████╔╝██║ ██║ ██║ - // ██╔══██║╚════██║ ╚════██║██╔═══╝ ██║ ██║ ██║ - // ██║ ██║███████║ ███████║██║ ███████╗██║ ██║ - // ╚═╝ ╚═╝╚══════╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝ - // - /* - * We need to perform combined handshakes with multiple AXI modules - * upon transactions drops, accepts, saves etc. from two TLBs. 
- */ - generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT - - assign l1_xw_accept[i] = int_wtrans_accept[i] & ~aw_out_stall[i]; - assign int_wtrans_sent[i] = l1_xw_done[i]; - - assign l1_ar_accept[i] = int_rtrans_accept[i]; - assign int_rtrans_sent[i] = l1_ar_done[i]; - - /* - * L1 AW sender + W buffer handshake split - */ - // forward - assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i]; - assign l1_w_accept[i] = l1_xw_accept[i] & ~l1_w_done_SP[i]; - - assign l1_aw_save[i] = l1_xw_save[i] & ~l1_aw_done_SP[i]; - assign l1_w_save[i] = l1_xw_save[i] & ~l1_w_done_SP[i]; - - assign l1_aw_drop[i] = l1_xw_drop[i] & ~l1_aw_done_SP[i]; - assign l1_w_drop[i] = l1_xw_drop[i] & ~l1_w_done_SP[i]; - - // backward - assign l1_xw_done[i] = l1_aw_done_SP[i] & l1_w_done_SP[i]; - - always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT - if (Rst_RBI == 0) begin - l1_aw_done_SP[i] <= 1'b0; - l1_w_done_SP[i] <= 1'b0; - end else if (l1_xw_done[i]) begin - l1_aw_done_SP[i] <= 1'b0; - l1_w_done_SP[i] <= 1'b0; - end else begin - l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i]; - l1_w_done_SP[i] <= l1_w_done_SP[i] | l1_w_done[i]; - end - end - - if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT - - /* - * L1 AR sender + R sender handshake split - * - * AR and R do not need to be strictly in sync. We thus use separate handshakes. - * But the handshake signals for the R sender are multiplexed with the those for - * the L2. However, L2_ACCEPT_DROP_SAVE has always higher priority. - */ - assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i]; - assign l1_r_done[i] = l2_r_drop[i] ? 1'b0 : lx_r_done[i]; - assign l2_r_done[i] = l2_r_drop[i] ? 
lx_r_done[i] : 1'b0; - - /* - * L2 AW sender + W buffer handshake split - */ - // forward - assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i]; - assign l2_w_accept[i] = l2_xw_accept[i] & ~l2_w_done_SP[i]; - - assign l2_aw_drop[i] = l2_xw_drop[i] & ~l2_aw_done_SP[i]; - assign l2_w_drop[i] = l2_xw_drop[i] & ~l2_w_done_SP[i]; - - // backward - assign l2_xw_done[i] = l2_aw_done_SP[i] & l2_w_done_SP[i]; - - always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT - if (Rst_RBI == 0) begin - l2_aw_done_SP[i] <= 1'b0; - l2_w_done_SP[i] <= 1'b0; - end else if (l2_xw_done[i]) begin - l2_aw_done_SP[i] <= 1'b0; - l2_w_done_SP[i] <= 1'b0; - end else begin - l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i]; - l2_w_done_SP[i] <= l2_w_done_SP[i] | l2_w_done[i]; - end - end - - /* - * L2 AR + R sender handshake split - */ - // forward - assign l2_ar_drop[i] = l2_xr_drop[i] & ~l2_ar_done_SP[i]; - assign l2_r_drop[i] = l2_xr_drop[i] & ~l2_r_done_SP[i]; - - // backward - make sure to always clear L2_XR_HS_SPLIT - always_comb begin - if (l2_xr_drop[i]) begin - l2_xr_done[i] = l2_ar_done_SP[i] & l2_r_done_SP[i]; - end else begin - l2_xr_done[i] = l2_ar_done_SP[i]; - end - end - - always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT - if (Rst_RBI == 0) begin - l2_ar_done_SP[i] <= 1'b0; - l2_r_done_SP[i] <= 1'b0; - end else if (l2_xr_done[i]) begin - l2_ar_done_SP[i] <= 1'b0; - l2_r_done_SP[i] <= 1'b0; - end else begin - l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i]; - l2_r_done_SP[i] <= l2_r_done_SP[i] | l2_r_done[i]; - end - end - - end else begin // if (ENABLE_L2TLB[i] == 1) - - assign lx_r_drop[i] = l1_r_drop[i]; - assign l1_r_done[i] = lx_r_done[i]; - - assign l2_aw_accept[i] = 1'b0; - assign l2_w_accept[i] = 1'b0; - assign l2_aw_drop[i] = 1'b0; - assign l2_w_drop[i] = 1'b0; - assign l2_xw_done[i] = 1'b0; - assign l2_aw_done_SP[i] = 1'b0; - assign l2_w_done_SP[i] = 1'b0; - - assign l2_ar_accept[i] = 1'b0; - assign l2_ar_drop[i] = 1'b0; - assign l2_r_drop[i] = 1'b0; - 
assign l2_xr_done[i] = 1'b0; - assign l2_r_done[i] = 1'b0; - assign l2_ar_done_SP[i] = 1'b0; - assign l2_r_done_SP[i] = 1'b0; - - end // if (ENABLE_L2TLB[i] == 1) - - end // HANDSHAKE_SPLIT - endgenerate // HANDSHAKE_SPLIT - - // }}} - - // L2 TLB {{{ - // ██╗ ██████╗ ████████╗██╗ ██████╗ - // ██║ ╚════██╗ ╚══██╔══╝██║ ██╔══██╗ - // ██║ █████╔╝ ██║ ██║ ██████╔╝ - // ██║ ██╔═══╝ ██║ ██║ ██╔══██╗ - // ███████╗███████╗ ██║ ███████╗██████╔╝ - // ╚══════╝╚══════╝ ╚═╝ ╚══════╝╚═════╝ - // - /* - * l2_tlb - * - * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core). - * - * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy, - * the L1 is stalled untill the L2 is available again. - * - */ - generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB - if (ENABLE_L2TLB[i] == 1) begin : L2_TLB - - /* - * L1 output selector - */ - assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0; - assign L1OutProt_D[i] = rab_prot[i]; - assign L1OutMulti_D[i] = rab_multi[i]; - - /* - * L1 output control + L1_DROP_BUF, L2_IN_BUF management - * - * Forward the L1 drop request to AR/AW sender modules if - * 1. the transactions needs to be dropped (L1 multi, prot, prefetch), or - * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full. - * - * The AR/AW senders do not support more than 1 oustanding L1 miss. The push back towards - * the upstream is realized by not accepting the save request (saving the L1 transaction) - * in the senders as long as the L2 TLB is busy or has valid output. This ultimately - * blocks the L1 TLB. - * - * Together with the AW drop/save, we also perform the W drop/save as AW and W need to - * absolutely remain in order. 
In contrast, the R drop is performed - */ - always_comb begin : L1_DROP_SAVE - - l1_ar_drop[i] = 1'b0; - l1_ar_save[i] = 1'b0; - l1_xw_drop[i] = 1'b0; - l1_xw_save[i] = 1'b0; - - l1_id_drop[i] = L1OutId_D[i]; - l1_len_drop[i] = L1OutLen_D[i]; - l1_prefetch_drop[i] = rab_prefetch[i]; - l1_hit_drop[i] = 1'b1; // there are no drops for L1 misses - - L1DropEn_S[i] = 1'b0; - L2InEn_S[i] = 1'b0; - - if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin - // 1. Drop - l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i]; - l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i]; - - // Store to L1_DROP_BUF upon handshake - L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) | - (l1_xw_drop[i] & l1_xw_done[i]); - - end else if ( rab_miss[i] ) begin - // 2. Save - Make sure L2 is really available. - l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i]; - l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i]; - - // Store to L2_IN_BUF upon handshake - triggers the L2 TLB - L2InEn_S[i] = (l1_ar_save[i] & l1_ar_done[i]) | - (l1_xw_save[i] & l1_xw_done[i]); - end - end - - /* - * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control - * - * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs - * require the B response to be sent only after consuming/discarding the corresponding data - * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop - * request to the B sender is then sent by the W buffer autonomously. - * - * L1 AW/W drop requests are managed by L1_DROP_SAVE. 
- */ - always_comb begin : L2_ACCEPT_DROP_SAVE - - l2_ar_addr[i] = 'b0; - l2_aw_addr[i] = 'b0; - l2_ar_accept[i] = 1'b0; - l2_xr_drop[i] = 1'b0; - l2_xw_accept[i] = 1'b0; - l2_xw_drop[i] = 1'b0; - - l1_r_drop[i] = 1'b0; - - lx_id_drop[i] = 'b0; - lx_len_drop[i] = 'b0; - lx_prefetch_drop[i] = 1'b0; - lx_hit_drop[i] = 1'b0; - - L1DropValid_SN[i] = L1DropValid_SP[i] | L1DropEn_S[i]; - L2OutValid_SN[i] = L2OutValid_SP[i]; - L2OutReady_S[i] = 1'b0; - L2OutEn_S[i] = 1'b0; - - L2Miss_S[i] = 1'b0; - int_multi[i] = 1'b0; - int_prot[i] = 1'b0; - - if (L2OutValid_SP[i] == 1'b0) begin - - // Drop L1 from R senders - if (L1DropValid_SP[i] == 1'b1) begin - - // Only perform the R sender drop here. - if (~L1DropRwType_DP[i]) begin - - l1_r_drop[i] = 1'b1; - lx_id_drop[i] = L1DropId_DP[i]; - lx_len_drop[i] = L1DropLen_DP[i]; - lx_prefetch_drop[i] = L1DropPrefetch_S[i]; - lx_hit_drop[i] = 1'b1; // there are no drops for L1 misses - - // Invalidate L1_DROP_BUF upon handshake - if ( l1_r_drop[i] & l1_r_done[i] ) begin - - L1DropValid_SN[i] = 1'b0; - int_prot[i] = L1DropProt_DP[i]; - int_multi[i] = L1DropMulti_DP[i]; - end - - end else begin - // Invalidate L1_DROP_BUF - L1DropValid_SN[i] = 1'b0; - int_prot[i] = L1DropProt_DP[i]; - int_multi[i] = L1DropMulti_DP[i]; - end - end - - end else begin // L2_OUT_BUF has valid data - - if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin - - l2_ar_addr[i] = L2OutAddr_DP[i]; - l2_aw_addr[i] = L2OutAddr_DP[i]; - - l2_ar_accept[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1; - l2_xw_accept[i] = L2OutRwType_DP[i] ? 
1'b1 : 1'b0; - - // Invalidate L2_OUT_BUF upon handshake - L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) | - (l2_xw_accept[i] & l2_xw_done[i]) ); - end else begin - - lx_id_drop[i] = L2OutId_DP[i]; - lx_len_drop[i] = L2OutLen_DP[i]; - lx_prefetch_drop[i] = L2OutPrefetch_S[i]; - lx_hit_drop[i] = L2OutHit_SP[i]; - - // The l2_xr_drop will also perform the handshake with the R sender - l2_xr_drop[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1; - l2_xw_drop[i] = L2OutRwType_DP[i] ? 1'b1 : 1'b0; - - // Invalidate L1_DROP_BUF upon handshake - if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin - - L2OutValid_SN[i] = 1'b0; - L2Miss_S[i] = ~L2OutHit_SP[i]; - int_prot[i] = L2OutProt_SP[i]; - int_multi[i] = L2OutMulti_SP[i]; - end - end - end - - // Only accept new L2 output after ongoing drops have finished. - if ( (l2_xr_drop[i] == l2_xr_done[i]) & - (l2_xw_drop[i] == l2_xw_done[i]) & - (l1_r_drop[i] == l1_r_done[i] ) ) begin - // Store to L2_OUT_BUF upon handshake with L2 TLB module - if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin - L2OutValid_SN[i] = 1'b1; - L2OutReady_S[i] = 1'b1; - L2OutEn_S[i] = 1'b1; - end - end - end - - /* - * L1 drop buffer - * - * Used in case of multi, prot and prefetch hits in the L1 TLB. 
- */ - always_ff @(posedge Clk_CI) begin : L1_DROP_BUF - if (Rst_RBI == 0) begin - L1DropProt_DP[i] <= 1'b0; - L1DropMulti_DP[i] <= 1'b0; - L1DropRwType_DP[i] <= 1'b0; - L1DropUser_DP[i] <= 'b0; - L1DropId_DP[i] <= 'b0; - L1DropLen_DP[i] <= 'b0; - L1DropAddr_DP[i] <= 'b0; - end else if (L1DropEn_S[i] == 1'b1) begin - L1DropProt_DP[i] <= L1OutProt_D[i] ; - L1DropMulti_DP[i] <= L1OutMulti_D[i] ; - L1DropRwType_DP[i] <= L1OutRwType_D[i]; - L1DropUser_DP[i] <= L1OutUser_D[i] ; - L1DropId_DP[i] <= L1OutId_D[i] ; - L1DropLen_DP[i] <= L1OutLen_D[i] ; - L1DropAddr_DP[i] <= L1OutAddr_D[i] ; - end - end // always_ff @ (posedge Clk_CI) - - /* - * L2 input buffer - * - * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB. - */ - always_ff @(posedge Clk_CI) begin : L2_IN_BUF - if (Rst_RBI == 0) begin - L2InRwType_DP[i] <= 1'b0; - L2InUser_DP[i] <= 'b0; - L2InId_DP[i] <= 'b0; - L2InLen_DP[i] <= 'b0; - L2InAddr_DP[i] <= 'b0; - end else if (L2InEn_S[i] == 1'b1) begin - L2InRwType_DP[i] <= L1OutRwType_D[i]; - L2InUser_DP[i] <= L1OutUser_D[i] ; - L2InId_DP[i] <= L1OutId_D[i] ; - L2InLen_DP[i] <= L1OutLen_D[i] ; - L2InAddr_DP[i] <= L1OutAddr_D[i] ; - end - end // always_ff @ (posedge Clk_CI) - - l2_tlb - #( - .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ), - .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), - .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ), - .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ), - .N_SETS ( `RAB_L2_N_SETS ), - .N_OFFSETS ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS ), - .N_PAR_VA_RAMS ( `RAB_L2_N_PAR_VA_RAMS ), - .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) ) - ) - u_l2_tlb - ( - .clk_i ( Clk_CI ), - .rst_ni ( Rst_RBI ), - - // Config inputs - .we_i ( L2CfgWE_S[i] ), - .waddr_i ( L2CfgWAddr_D[i] ), - .wdata_i ( L2CfgWData_D[i] ), - - // Request input - .start_i ( L2InEn_S[i] ), - .busy_o ( L2Busy_S[i] ), - .rw_type_i ( L2InRwType_DP[i] ), - .in_addr_i ( L2InAddr_DP[i] ), - - // Response output - .out_ready_i ( 
L2OutReady_S[i] ), - .out_valid_o ( L2OutValid_S[i] ), - .hit_o ( L2OutHit_SN[i] ), - .miss_o ( L2OutMiss_SN[i] ), - .prot_o ( L2OutProt_SN[i] ), - .multi_o ( L2OutMulti_SN[i] ), - .cache_coherent_o ( L2OutCC_SN[i] ), - .out_addr_o ( L2OutAddr_DN[i] ) - ); - - /* - * L2 output buffer - * - * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB. - */ - always_ff @(posedge Clk_CI) begin : L2_OUT_BUF - if (Rst_RBI == 0) begin - L2OutRwType_DP[i] <= 1'b0; - L2OutUser_DP[i] <= 'b0; - L2OutLen_DP[i] <= 'b0; - L2OutId_DP[i] <= 'b0; - L2OutInAddr_DP[i] <= 'b0; - - L2OutHit_SP[i] <= 1'b0; - L2OutMiss_SP[i] <= 1'b0; - L2OutProt_SP[i] <= 1'b0; - L2OutMulti_SP[i] <= 1'b0; - L2OutCC_SP[i] <= 1'b0; - L2OutAddr_DP[i] <= 'b0; - end else if (L2OutEn_S[i] == 1'b1) begin - L2OutRwType_DP[i] <= L2InRwType_DP[i]; - L2OutUser_DP[i] <= L2InUser_DP[i] ; - L2OutLen_DP[i] <= L2InLen_DP[i] ; - L2OutId_DP[i] <= L2InId_DP[i] ; - L2OutInAddr_DP[i] <= L2InAddr_DP[i] ; - - L2OutHit_SP[i] <= L2OutHit_SN[i] ; - L2OutMiss_SP[i] <= L2OutMiss_SN[i] ; - L2OutProt_SP[i] <= L2OutProt_SN[i] ; - L2OutMulti_SP[i] <= L2OutMulti_SN[i]; - L2OutCC_SP[i] <= L2OutCC_SN[i] ; - L2OutAddr_DP[i] <= L2OutAddr_DN[i] ; - end - end // always_ff @ (posedge Clk_CI) - - always_ff @(posedge Clk_CI) begin : BUF_VALID - if (Rst_RBI == 0) begin - L1DropValid_SP[i] = 1'b0; - L2OutValid_SP[i] = 1'b0; - end else begin - L1DropValid_SP[i] = L1DropValid_SN[i]; - L2OutValid_SP[i] = L2OutValid_SN[i]; - end - end - - always_comb begin : BUF_TO_PREFETCH - // L1 Drop Buf - if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}}) - L1DropPrefetch_S[i] = 1'b1; - else - L1DropPrefetch_S[i] = 1'b0; - - // L2 Out Buf - if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}}) - L2OutPrefetch_S[i] = 1'b1; - else - L2OutPrefetch_S[i] = 1'b0; - end - - assign l2_cache_coherent[i] = L2OutCC_SP[i]; - assign int_miss[i] = L2Miss_S[i]; - - end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1) - - assign l1_ar_drop[i] = int_rtrans_drop[i]; - 
assign l1_r_drop[i] = int_rtrans_drop[i]; - assign l1_xw_drop[i] = int_wtrans_drop[i]; - - assign l1_ar_save[i] = 1'b0; - assign l1_xw_save[i] = 1'b0; - assign l2_xw_accept[i] = 1'b0; - assign l2_xr_drop[i] = 1'b0; - assign l2_xw_drop[i] = 1'b0; - - assign l2_ar_addr[i] = 'b0; - assign l2_aw_addr[i] = 'b0; - - assign l1_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] : - int_rtrans_drop[i] ? int_arid[i] : - '0; - assign l1_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] : - int_rtrans_drop[i] ? int_arlen[i] : - '0; - assign l1_prefetch_drop[i] = rab_prefetch[i]; - assign l1_hit_drop[i] = ~rab_miss[i]; - - assign lx_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] : - int_rtrans_drop[i] ? int_arid[i] : - '0; - assign lx_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] : - int_rtrans_drop[i] ? int_arlen[i] : - '0; - assign lx_prefetch_drop[i] = rab_prefetch[i]; - assign lx_hit_drop[i] = ~rab_miss[i]; - - assign l2_cache_coherent[i] = 1'b0; - - assign int_miss[i] = rab_miss[i]; - assign int_prot[i] = rab_prot[i]; - assign int_multi[i] = rab_multi[i]; - - // unused signals - assign L2Miss_S[i] = 1'b0; - - assign L1OutRwType_D[i] = 1'b0; - assign L1OutProt_D[i] = 1'b0; - assign L1OutMulti_D[i] = 1'b0; - - assign L1DropRwType_DP[i] = 1'b0; - assign L1DropUser_DP[i] = 'b0; - assign L1DropId_DP[i] = 'b0; - assign L1DropLen_DP[i] = 'b0; - assign L1DropAddr_DP[i] = 'b0; - assign L1DropProt_DP[i] = 1'b0; - assign L1DropMulti_DP[i] = 1'b0; - - assign L1DropEn_S[i] = 1'b0; - assign L1DropPrefetch_S[i] = 1'b0; - assign L1DropValid_SN[i] = 1'b0; - assign L1DropValid_SP[i] = 1'b0; - - assign L2InRwType_DP[i] = 1'b0; - assign L2InUser_DP[i] = 'b0; - assign L2InId_DP[i] = 'b0; - assign L2InLen_DP[i] = 'b0; - assign L2InAddr_DP[i] = 'b0; - - assign L2InEn_S[i] = 1'b0; - - assign L2OutHit_SN[i] = 1'b0; - assign L2OutMiss_SN[i] = 1'b0; - assign L2OutProt_SN[i] = 1'b0; - assign L2OutMulti_SN[i] = 1'b0; - assign L2OutCC_SN[i] = 1'b0; - assign L2OutAddr_DN[i] = 'b0; - - assign L2OutRwType_DP[i] 
= 1'b0; - assign L2OutUser_DP[i] = 'b0; - assign L2OutId_DP[i] = 'b0; - assign L2OutLen_DP[i] = 'b0; - assign L2OutInAddr_DP[i] = 'b0; - assign L2OutHit_SP[i] = 1'b0; - assign L2OutMiss_SP[i] = 1'b0; - assign L2OutProt_SP[i] = 1'b0; - assign L2OutMulti_SP[i] = 1'b0; - assign L2OutCC_SP[i] = 1'b0; - assign L2OutAddr_DP[i] = 'b0; - - assign L2OutEn_S[i] = 1'b0; - assign L2OutPrefetch_S[i] = 1'b0; - assign L2Busy_S[i] = 1'b0; - assign L2OutValid_S[i] = 1'b0; - assign L2OutValid_SN[i] = 1'b0; - assign L2OutValid_SP[i] = 1'b0; - assign L2OutReady_S[i] = 1'b0; - - end // !`ifdef ENABLE_L2TLB - end // for (i = 0; i < N_PORTS; i++) - endgenerate - -// }}} -""" -# endmodule -# -# -# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker -# -# diff --git a/src/iommu/axi_rab/check_ram.py b/src/iommu/axi_rab/check_ram.py deleted file mode 100644 index 31bf32ea..00000000 --- a/src/iommu/axi_rab/check_ram.py +++ /dev/null @@ -1,240 +0,0 @@ -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable - - -class check_ram(Elaboratable): - - def __init__(self): - self.clk_i = Signal() # input - self.rst_ni = Signal() # input - self.in_addr = Signal(ADDR_WIDTH) # input - self.rw_type = Signal() # input - self.ram_we = Signal() # input - self.port0_addr = Signal(1+ERROR p_expression_25) # input - self.port1_addr = Signal(1+ERROR p_expression_25) # input - self.ram_wdata = Signal(RAM_DATA_WIDTH) # input - self.output_sent = Signal() # input - self.output_valid = Signal() # input - self.offset_addr_d = Signal(OFFSET_WIDTH) # input - self.hit_addr = Signal(1+ERROR p_expression_25) # output - self.master = Signal() # output - self.hit = Signal() # output - self.multi_hit = Signal() # output - self.prot = Signal() # output - - def elaborate(self, platform=None): - m = Module() - return m - - -# // Copyright 2018 ETH Zurich and University of Bologna. 
-# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# //import CfMath::log2; -# -# //`define MULTI_HIT_FULL_SET -# -# module check_ram -# //#( -# // parameter ADDR_WIDTH = 32, -# // parameter RAM_DATA_WIDTH = 32, -# // parameter PAGE_SIZE = 4096, // 4kB -# // parameter SET_WIDTH = 5, -# // parameter OFFSET_WIDTH = 4 -# // ) -# ( -# input logic clk_i, -# input logic rst_ni, -# input logic [ADDR_WIDTH-1:0] in_addr, -# input logic rw_type, // 1 => write, 0=> read -# input logic ram_we, -# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, -# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr, -# input logic [RAM_DATA_WIDTH-1:0] ram_wdata, -# input logic output_sent, -# input logic output_valid, -# input logic [OFFSET_WIDTH-1:0] offset_addr_d, -# output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr, -# output logic master, -# output logic hit, -# output logic multi_hit, -# output logic prot -# ); -# -""" #docstring_begin - - localparam IGNORE_LSB = log2(PAGE_SIZE); // 12 - - logic [RAM_DATA_WIDTH-1:0] port0_data_o, port1_data_o; // RAM read data outputs - logic port0_hit, port1_hit; // Ram output matches in_addr - - logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved; - - // Hit FSM Signals - typedef enum logic {SEARCH, HIT} hit_state_t; - hit_state_t hit_SP; // Hit FSM state - hit_state_t hit_SN; // Hit FSM next state - - // Multi Hit FSM signals -`ifdef 
MULTI_HIT_FULL_SET - typedef enum logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t; - multi_state_t multi_SP; // Multi Hit FSM state - multi_state_t multi_SN; // Multi Hit FSM next state - - logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved; - logic master_saved; -`endif - - //// --------------- Block RAM (Dual Port) -------------- //// - - // The outputs of the BRAMs are only valid if in the previous cycle: - // 1. the inputs were valid, and - // 2. the BRAM was not written to. - // Otherwise, the outputs must be ignored which is controlled by the output_valid signal. - // This signal is driven by the uppler level L2 TLB module. - ram_tp_no_change #( - .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ), - .DATA_WIDTH( RAM_DATA_WIDTH ) - ) - ram_tp_no_change_0 - ( - .clk ( clk_i ), - .we ( ram_we ), - .addr0 ( port0_addr ), - .addr1 ( port1_addr ), - .d_i ( ram_wdata ), - .d0_o ( port0_data_o ), - .d1_o ( port1_data_o ) - ); - - //// Check Ram Outputs - assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]); - assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]); - //// ----------------------------------------------------- ///// - - //// ------------------- Check if Hit ------------------------ //// - // FSM - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - hit_SP <= SEARCH; - end else begin - hit_SP <= hit_SN; - end - end - - always_ff @(posedge clk_i, negedge rst_ni) begin - if (!rst_ni) begin - port0_addr_saved <= '0; - port1_addr_saved <= '0; - end else begin - port0_addr_saved <= port0_addr; - port1_addr_saved <= port1_addr; - end - end - - always_comb begin - hit_SN = hit_SP; - hit = 1'b0; - hit_addr = 0; - master = 1'b0; - unique case(hit_SP) - SEARCH : - if (output_valid) - if (port0_hit || port1_hit) begin - hit_SN = HIT; - hit = 1'b1; - hit_addr = port0_hit ? 
{port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} : - port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} : - 0; - master = port0_hit ? port0_data_o[3] : - port1_hit ? port1_data_o[3] : - 1'b0; - end - - HIT : begin -`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later. - hit = 1'b1; - hit_addr = hit_addr_saved; - master = master_saved; -`endif - if (output_sent) - hit_SN = SEARCH; - end - - default : begin - hit_SN = SEARCH; - end - endcase // case (hit_SP) - end // always_comb begin - - //// ------------------------------------------- //// - - assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) : - output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) : - 1'b0; - - //// ------------------- Multi ------------------- //// -`ifdef MULTI_HIT_FULL_SET - - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - hit_addr_saved <= 0; - master_saved <= 1'b0; - end else if (output_valid) begin - hit_addr_saved <= hit_addr; - master_saved <= master; - end - end - - // FSM - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - multi_SP <= NO_HITS; - end else begin - multi_SP <= multi_SN; - end - end - - always_comb begin - multi_SN = multi_SP; - multi_hit = 1'b0; - unique case(multi_SP) - NO_HITS : - if(output_valid && (port0_hit && port1_hit)) begin - multi_SN = MULTI_HIT; - multi_hit = 1'b1; - end else if(output_valid && (port0_hit || port1_hit)) - multi_SN = ONE_HIT; - - ONE_HIT : - if(output_valid && (port0_hit || port1_hit)) begin - multi_SN = MULTI_HIT; - multi_hit = 1'b1; - end else if (output_sent) - multi_SN = NO_HITS; - - MULTI_HIT : begin - multi_hit = 1'b1; - if (output_sent) - multi_SN = NO_HITS; - end - - endcase // case (multi_SP) - end // always_comb begin - -`else // !`ifdef MULTI_HIT_FULL_SET - assign multi_hit = output_valid && 
port0_hit && port1_hit; -`endif // !`ifdef MULTI_HIT_FULL_SET - //// ------------------------------------------- //// -""" -# endmodule -# -# diff --git a/src/iommu/axi_rab/coreconfig.py b/src/iommu/axi_rab/coreconfig.py deleted file mode 100644 index 247d0ce3..00000000 --- a/src/iommu/axi_rab/coreconfig.py +++ /dev/null @@ -1,6 +0,0 @@ -class CoreConfig: - def __init__(self): - self.N_SLICES = 16 - self.N_REGS = 4*self.N_SLICES - self.ADDR_WIDTH_PHYS = 40 - self.ADDR_WIDTH_VIRT = 32 diff --git a/src/iommu/axi_rab/fsm.py b/src/iommu/axi_rab/fsm.py deleted file mode 100644 index d64b1cb4..00000000 --- a/src/iommu/axi_rab/fsm.py +++ /dev/null @@ -1,243 +0,0 @@ -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable - - -class fsm(Elaboratable): - - def __init__(self): - self.Clk_CI = Signal() # input - self.Rst_RBI = Signal() # input - self.port1_addr_valid_i = Signal() # input - self.port2_addr_valid_i = Signal() # input - self.port1_sent_i = Signal() # input - self.port2_sent_i = Signal() # input - self.select_i = Signal() # input - self.no_hit_i = Signal() # input - self.multi_hit_i = Signal() # input - self.no_prot_i = Signal() # input - self.prefetch_i = Signal() # input - self.out_addr_i = Signal(AXI_M_ADDR_WIDTH) # input - self.cache_coherent_i = Signal() # input - self.port1_accept_o = Signal() # output - self.port1_drop_o = Signal() # output - self.port1_miss_o = Signal() # output - self.port2_accept_o = Signal() # output - self.port2_drop_o = Signal() # output - self.port2_miss_o = Signal() # output - self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output - self.cache_coherent_o = Signal() # output - self.miss_o = Signal() # output - self.multi_o = Signal() # output - self.prot_o = Signal() # output - self.prefetch_o = Signal() # output - self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input - self.in_id_i = Signal(AXI_ID_WIDTH) # input - self.in_len_i = Signal(8) # input - self.in_user_i = Signal(AXI_USER_WIDTH) 
# input - self.in_addr_o = Signal(AXI_S_ADDR_WIDTH) # output - self.in_id_o = Signal(AXI_ID_WIDTH) # output - self.in_len_o = Signal(8) # output - self.in_user_o = Signal(AXI_USER_WIDTH) # output - - def elaborate(self, platform=None): - m = Module() - return m - - -# // Copyright 2018 ETH Zurich and University of Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# //`timescale 1ns / 1ps -# -# module fsm -# #( -# parameter AXI_M_ADDR_WIDTH = 40, -# parameter AXI_S_ADDR_WIDTH = 32, -# parameter AXI_ID_WIDTH = 8, -# parameter AXI_USER_WIDTH = 6 -# ) -# ( -# input logic Clk_CI, -# input logic Rst_RBI, -# -# input logic port1_addr_valid_i, -# input logic port2_addr_valid_i, -# input logic port1_sent_i, -# input logic port2_sent_i, -# input logic select_i, -# input logic no_hit_i, -# input logic multi_hit_i, -# input logic no_prot_i, -# input logic prefetch_i, -# input logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i, -# input logic cache_coherent_i, -# output logic port1_accept_o, -# output logic port1_drop_o, -# output logic port1_miss_o, -# output logic port2_accept_o, -# output logic port2_drop_o, -# output logic port2_miss_o, -# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o, -# output logic cache_coherent_o, -# output logic miss_o, -# output logic multi_o, -# output logic prot_o, -# output logic prefetch_o, -# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i, -# input logic 
[AXI_ID_WIDTH-1:0] in_id_i, -# input logic [7:0] in_len_i, -# input logic [AXI_USER_WIDTH-1:0] in_user_i, -# output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o, -# output logic [AXI_ID_WIDTH-1:0] in_id_o, -# output logic [7:0] in_len_o, -# output logic [AXI_USER_WIDTH-1:0] in_user_o -# ); -# -""" #docstring_begin - - //-------------Internal Signals---------------------- - - typedef enum logic {IDLE, WAIT} state_t; - logic state_SP; // Present state - logic state_SN; // Next State - - logic port1_accept_SN; - logic port1_drop_SN; - logic port1_miss_SN; - logic port2_accept_SN; - logic port2_drop_SN; - logic port2_miss_SN; - logic miss_SN; - logic multi_SN; - logic prot_SN; - logic prefetch_SN; - logic cache_coherent_SN; - logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN; - - logic out_reg_en_S; - - //----------FSM comb------------------------------ - - always_comb begin: FSM_COMBO - state_SN = state_SP; - - port1_accept_SN = 1'b0; - port1_drop_SN = 1'b0; - port1_miss_SN = 1'b0; - port2_accept_SN = 1'b0; - port2_drop_SN = 1'b0; - port2_miss_SN = 1'b0; - miss_SN = 1'b0; - multi_SN = 1'b0; - prot_SN = 1'b0; - prefetch_SN = 1'b0; - cache_coherent_SN = 1'b0; - out_addr_DN = '0; - - out_reg_en_S = 1'b0; // by default hold register output - - unique case(state_SP) - IDLE : - if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin - out_reg_en_S = 1'b1; - state_SN = WAIT; - - // Select inputs for output registers - if (port1_addr_valid_i & select_i) begin - port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i); - port1_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i); - port1_miss_SN = no_hit_i; - port2_accept_SN = 1'b0; - port2_drop_SN = 1'b0; - port2_miss_SN = 1'b0; - end else if (port2_addr_valid_i & ~select_i) begin - port1_accept_SN = 1'b0; - port1_drop_SN = 1'b0; - port1_miss_SN = 1'b0; - port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i); - port2_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i); - 
port2_miss_SN = no_hit_i; - end - - miss_SN = port1_miss_SN | port2_miss_SN; - multi_SN = multi_hit_i; - prot_SN = ~no_prot_i; - prefetch_SN = ~no_hit_i & prefetch_i; - - cache_coherent_SN = cache_coherent_i; - out_addr_DN = out_addr_i; - end - - WAIT : - if ( port1_sent_i | port2_sent_i ) begin - out_reg_en_S = 1'b1; // "clear" the register - state_SN = IDLE; - end - - default : begin - state_SN = IDLE; - end - endcase - end - - //----------FSM seq------------------------------- - - always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ - if (Rst_RBI == 1'b0) - state_SP <= IDLE; - else - state_SP <= state_SN; - end - - //----------Output seq-------------------------- - - always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ - if (Rst_RBI == 1'b0) begin - port1_accept_o = 1'b0; - port1_drop_o = 1'b0; - port1_miss_o = 1'b0; - port2_accept_o = 1'b0; - port2_drop_o = 1'b0; - port2_miss_o = 1'b0; - miss_o = 1'b0; - multi_o = 1'b0; - prot_o = 1'b0; - prefetch_o = 1'b0; - cache_coherent_o = 1'b0; - out_addr_o = '0; - in_addr_o = '0; - in_id_o = '0; - in_len_o = '0; - in_user_o = '0; - end else if (out_reg_en_S == 1'b1) begin - port1_accept_o = port1_accept_SN; - port1_drop_o = port1_drop_SN; - port1_miss_o = port1_miss_SN; - port2_accept_o = port2_accept_SN; - port2_drop_o = port2_drop_SN; - port2_miss_o = port2_miss_SN; - miss_o = miss_SN; - multi_o = multi_SN; - prot_o = prot_SN; - prefetch_o = prefetch_SN; - cache_coherent_o = cache_coherent_SN; - out_addr_o = out_addr_DN; - in_addr_o = in_addr_i; - in_id_o = in_id_i; - in_len_o = in_len_i; - in_user_o = in_user_i; - end - end // block: OUTPUT_SEQ -""" -# -# endmodule -# -# diff --git a/src/iommu/axi_rab/l2_tlb.py b/src/iommu/axi_rab/l2_tlb.py deleted file mode 100644 index 11983f64..00000000 --- a/src/iommu/axi_rab/l2_tlb.py +++ /dev/null @@ -1,550 +0,0 @@ -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable - - -class l2_tlb(Elaboratable): - - def 
__init__(self): - self.clk_i = Signal() # input - self.rst_ni = Signal() # input - self.we_i = Signal() # input - self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH) # input - self.wdata_i = Signal(AXI_LITE_DATA_WIDTH) # input - self.start_i = Signal() # input - self.busy_o = Signal() # output - self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input - self.rw_type_i = Signal() # input - self.out_ready_i = Signal() # input - self.out_valid_o = Signal() # output - self.hit_o = Signal() # output - self.miss_o = Signal() # output - self.prot_o = Signal() # output - self.multi_o = Signal() # output - self.cache_coherent_o = Signal() # output - self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output - - def elaborate(self, platform=None): - m = Module() - return m - - -# // Copyright 2018 ETH Zurich and University of Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# //`include "pulp_soc_defines.sv" -# -# ////import CfMath::log2; -# -# //`define MULTI_HIT_FULL_SET // Enable full multi hit detection. Always the entire set is searched. -# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected. 
-# -# //`ifdef MULTI_HIT_FULL_SET -# // `ifndef MULTI_HIT_CUR_CYCLE -# // `define MULTI_HIT_CUR_CYCLE -# // `endif -# //`endif -# -# module l2_tlb -# //#( -# // parameter AXI_S_ADDR_WIDTH = 32, -# // parameter AXI_M_ADDR_WIDTH = 40, -# // parameter AXI_LITE_DATA_WIDTH = 64, -# // parameter AXI_LITE_ADDR_WIDTH = 32, -# // parameter N_SETS = 32, -# // parameter N_OFFSETS = 4, //per port. There are 2 ports. -# // parameter PAGE_SIZE = 4096, // 4kB -# // parameter N_PAR_VA_RAMS = 4, -# // parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH -# // ) -# ( -# input logic clk_i, -# input logic rst_ni, -# -# input logic we_i, -# input logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i, -# input logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i, -# -# input logic start_i, -# output logic busy_o, -# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i, -# input logic rw_type_i, //1 => write, 0=> read -# -# input logic out_ready_i, -# output logic out_valid_o, -# output logic hit_o, -# output logic miss_o, -# output logic prot_o, -# output logic multi_o, -# output logic cache_coherent_o, -# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o -# ); -# -""" #docstring_begin - - localparam VA_RAM_DEPTH = N_SETS * N_OFFSETS * 2; - localparam PA_RAM_DEPTH = VA_RAM_DEPTH * N_PAR_VA_RAMS; - localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH); - localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH); - localparam SET_WIDTH = log2(N_SETS); - localparam OFFSET_WIDTH = log2(N_OFFSETS); - localparam LL_WIDTH = log2(N_PAR_VA_RAMS); - localparam IGNORE_LSB = log2(PAGE_SIZE); - - localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4; - localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB; - - logic [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent; - logic [N_PAR_VA_RAMS-1:0] ram_we; - logic last_search, last_search_next; - logic first_search, first_search_next; - logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr; - logic 
[N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr; - logic pa_ram_we; - logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr; - logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE; - logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr - logic [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data - logic pa_ram_store_data_SN, pa_ram_store_data_SP; - logic hit_top, prot_top, multi_hit_top, first_hit_top; - logic output_sent; - int hit_block_num; - - logic searching, search_done; - logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr - logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr - logic [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d; - logic [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr; - logic [SET_WIDTH-1:0] set_num; - - logic va_output_valid; - logic searching_q; - - genvar z; - - // Search FSM - typedef enum logic [1:0] {IDLE, SEARCH, DONE} search_state_t; - search_state_t search_SP; // Present state - search_state_t search_SN; // Next State - - // Output FSM - typedef enum logic [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t; - out_state_t out_SP; // Present state - out_state_t out_SN; // Next State - - logic miss_next; - logic hit_next; - logic prot_next; - logic multi_next; - logic cache_coherent_next; - - // Generate the VA Block rams and their surrounding logic - generate - for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS - check_ram - #( - .ADDR_WIDTH ( AXI_S_ADDR_WIDTH ), - .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ), - .PAGE_SIZE ( PAGE_SIZE ), - .SET_WIDTH ( SET_WIDTH ), - .OFFSET_WIDTH ( OFFSET_WIDTH ) - ) - u_check_ram - ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .in_addr ( in_addr_i ), - .rw_type ( rw_type_i ), - .ram_we ( ram_we[z] ), - .port0_addr ( port0_addr ), - .port1_addr ( port1_addr ), - .ram_wdata ( 
wdata_i[VA_RAM_DATA_WIDTH-1:0] ), - .output_sent ( output_sent ), - .output_valid ( va_output_valid ), - .offset_addr_d ( offset_addr_d ), - .hit_addr ( hit_addr[z] ), - .master ( cache_coherent[z] ), - .hit ( hit[z] ), - .multi_hit ( multi_hit[z] ), - .prot ( prot[z] ) - ); - end // for (z = 0; z < N_PORTS; z++) - endgenerate - - ////////////////// ---------------- Control and Address --------------- //////////////////////// - // FSM - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - search_SP <= IDLE; - end else begin - search_SP <= search_SN; - end - end - - always_comb begin : SEARCH_FSM - search_SN = search_SP; - busy_o = 1'b0; - searching = 1'b0; - search_done = 1'b0; - last_search_next = 1'b0; - first_search_next = first_search; - - unique case (search_SP) - IDLE : begin - if (start_i) begin - search_SN = SEARCH; - first_search_next = 1'b1; - end - end - - SEARCH : begin - busy_o = 1'b1; - - // detect last search cycle - if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) ) - last_search_next = 1'b1; - - // pause search during VA RAM reconfigration - if (|ram_we) begin - searching = 1'b0; - end else begin - searching = 1'b1; - first_search_next = 1'b0; - end - - if (va_output_valid) begin - // stop search -`ifdef MULTI_HIT_FULL_SET - if (last_search | prot_top | multi_hit_top) begin -`else - if (last_search | prot_top | multi_hit_top | hit_top ) begin -`endif - search_SN = DONE; - search_done = 1'b1; - end - end - end - - DONE : begin - busy_o = 1'b1; - if (out_valid_o & out_ready_i) - search_SN = IDLE; - end - - default : begin - search_SN = IDLE; - end - endcase // case (prot_SP) - end // always_comb begin - - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - last_search <= 1'b0; - first_search <= 1'b0; - end else begin - last_search <= last_search_next; - first_search <= first_search_next; - end - end - - /* - * VA RAM address generation - * - * The input address and set number, and thus the offset start address, are 
available in the - * cycle after the start signal. The buffered offset_addr becomes available one cycle later. - * During the first search cycle, we therefore directly use offset_addr_start for the lookup. - */ - assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB]; - - assign port0_raddr[OFFSET_WIDTH] = 1'b0; - assign port1_addr [OFFSET_WIDTH] = 1'b1; - - assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr; - assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr; - - assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num; - assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num; - - assign port0_addr = ram_we ? ram_waddr : port0_raddr; - - // The outputs of the BRAMs are only valid if in the previous cycle: - // 1. the inputs were valid, and - // 2. the BRAMs were not written to. - // Otherwise, the outputs must be ignored. - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - searching_q <= 1'b0; - end else begin - searching_q <= searching; - end - end - assign va_output_valid = searching_q; - - // Address offset for looking up the VA RAMs - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - offset_addr <= 0; - end else if (first_search) begin - offset_addr <= offset_start_addr + 1'b1; - end else if (searching) begin - offset_addr <= offset_addr + 1'b1; - end - end - - // Delayed address offest for looking up the PA RAM upon a hit in the VA RAMs - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - offset_addr_d <= 0; - end else if (first_search) begin - offset_addr_d <= offset_start_addr; - end else if (searching) begin - offset_addr_d <= offset_addr_d + 1'b1; - end - end - - // Store the offset addr for hit to reduce latency for next search. 
- generate - if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE -`ifndef MULTI_HIT_FULL_SET - logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET. - logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_reg; - - assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} }; - assign offset_end_addr = hit_offset_addr[set_num]-1'b1; - - // Register the hit addr - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - hit_addr_reg <= 0; - end else if (hit_top) begin - hit_addr_reg <= hit_addr[hit_block_num]; - end - end - - // Store hit addr for each set. The next search in the same set will start from the saved addr. - always_ff @(posedge clk_i) begin - if (rst_ni == 0) begin - hit_offset_addr <= 0; - end else if (hit_o) begin - hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)]; - end - end -`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched. - assign offset_start_addr = 0; - assign offset_end_addr = {OFFSET_WIDTH{1'b1}}; -`endif - end else begin // if (HIT_OFFSET_STORE_WIDTH > 0) - assign offset_start_addr = 0; - assign offset_end_addr = {OFFSET_WIDTH{1'b1}}; - end - endgenerate - - assign prot_top = |prot; - - ////////////////////////////////////////////////////////////////////////////////////// - // check for hit, multi hit - // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit. - // In case of a multi hit in the same VA RAM, Port 0 is given priority. 
- always_comb begin : HIT_CHECK - hit_top = |hit; - hit_block_num = 0; - first_hit_top = 1'b0; - multi_hit_top = 1'b0; - for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin - if (hit[i] == 1'b1) begin -`ifdef MULTI_HIT_CUR_CYCLE - if (multi_hit[i] | first_hit_top ) begin - multi_hit_top = 1'b1; - end -`endif - first_hit_top = 1'b1; - hit_block_num = i; - end - end // for (int i=0; i port1 active - // select = 0 -> port2 active - select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx]; - - p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx]; - p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx]; - - // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary - if (port1_size[idx] == 3'b001) - p1_mask[idx] = 3'b110; - else if (port1_size[idx] == 3'b010) - p1_mask[idx] = 3'b100; - else if (port1_size[idx] == 3'b011) - p1_mask[idx] = 3'b000; - else - p1_mask[idx] = 3'b111; - - p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH]; - p1_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx]; - - if (port2_size[idx] == 3'b001) - p2_mask[idx] = 3'b110; - else if (port2_size[idx] == 3'b010) - p2_mask[idx] = 3'b100; - else if (port2_size[idx] == 3'b011) - p2_mask[idx] = 3'b000; - else - p2_mask[idx] = 3'b111; - - if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}}) - p1_prefetch[idx] = 1'b1; - else - p1_prefetch[idx] = 1'b0; - - if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}}) - p2_prefetch[idx] = 1'b1; - else - p2_prefetch[idx] = 1'b0; - - p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH]; - p2_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx]; - - p1_max_addr[idx] = p1_align_addr[idx] + p1_burst_size[idx] - 1; - p2_max_addr[idx] = p2_align_addr[idx] + p2_burst_size[idx] - 1; - - int_addr_min[idx] = select[idx] ? 
port1_addr[idx] : port2_addr[idx]; - int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx]; - int_rw[idx] = select[idx] ? port1_type[idx] : port2_type[idx]; - int_id[idx] = select[idx] ? port1_id[idx] : port2_id[idx]; - int_len[idx] = select[idx] ? port1_len[idx] : port2_len[idx]; - int_user[idx] = select[idx] ? port1_user[idx] : port2_user[idx]; - prefetch[idx] = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx]; - - hit [idx] = | hit_slices [idx]; - prot[idx] = | prot_slices[idx]; - - no_hit [idx] = ~hit [idx]; - no_prot[idx] = ~prot[idx]; - - port1_out_addr[idx] = out_addr_reg[idx]; - port2_out_addr[idx] = out_addr_reg[idx]; - - port1_cache_coherent[idx] = cache_coherent_reg[idx]; - port2_cache_coherent[idx] = cache_coherent_reg[idx]; - end - end - - always_comb - begin - var integer idx_port, idx_slice; - var integer reg_num; - reg_num=0; - for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin - for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin - int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num]; - reg_num++; - end - // int_cfg_regs_slices[idx_port][N_SLICES_MAX:N_SLICES[idx_port]] will be dangling - // Fix to zero. Synthesis will remove these signals. - // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0; - end - end - - always @(posedge Clk_CI or negedge Rst_RBI) - begin : PORT_PRIORITY - var integer idx; - if (Rst_RBI == 1'b0) - curr_priority = 'h0; - else begin - for (idx=0; idx= cfg_min) ? 1'b1 : 1'b0; - # assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0; - # assign max_below_max = (in_addr_max <= cfg_max) ? 
1'b1 : 1'b0; - # assign out_hit = cfg_en & min_above_min & min_below_max & max_below_max; - # assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren)); - # assign out_addr = in_addr_min - cfg_min + cfg_offset; - m.d.comb += [ - min_above_min.eq(self.in_addr_min >= self.cfg_min), - min_below_max.eq(self.in_addr_min <= self.cfg_max), - max_below_max.eq(self.in_addr_max <= self.cfg_max), - self.out_hit.eq(self.cfg_en & min_above_min & - min_below_max & max_below_max), - self.out_prot.eq(self.out_hit & ( - (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))), - self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset) - ] - - return m diff --git a/src/iommu/axi_rab/ram_tp_no_change.py b/src/iommu/axi_rab/ram_tp_no_change.py deleted file mode 100644 index bdcd5550..00000000 --- a/src/iommu/axi_rab/ram_tp_no_change.py +++ /dev/null @@ -1,97 +0,0 @@ -# // Copyright 2018 ETH Zurich and University of Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# /* -# * ram_tp_no_change -# * -# * This code implements a parameterizable two-port memory. Port 0 can read and -# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with -# * Port 0 in "no change" mode, i.e., during a write, it retains the last read -# * value on the output. Port 1 (read-only) is in "write first" mode. 
Still, it -# * outputs the old data during the write cycle. Note: Port 1 outputs invalid -# * data in the cycle after the write when reading the same address. -# * -# * For more information, see Xilinx PG058 Block Memory Generator Product Guide. -# */ - -from nmigen import Signal, Module, Const, Cat, Elaboratable -from nmigen import Memory - -import math - -# -# module ram_tp_no_change -# #( -ADDR_WIDTH = 10 -DATA_WIDTH = 36 -# ) -# ( -# input clk, -# input we, -# input [ADDR_WIDTH-1:0] addr0, -# input [ADDR_WIDTH-1:0] addr1, -# input [DATA_WIDTH-1:0] d_i, -# output [DATA_WIDTH-1:0] d0_o, -# output [DATA_WIDTH-1:0] d1_o -# ); - - -class ram_tp_no_change(Elaboratable): - - def __init__(self): - self.we = Signal() # input - self.addr0 = Signal(ADDR_WIDTH) # input - self.addr1 = Signal(ADDR_WIDTH) # input - self.d_i = Signal(DATA_WIDTH) # input - self.d0_o = Signal(DATA_WIDTH) # output - self.d1_o = Signal(DATA_WIDTH) # output - - DEPTH = int(math.pow(2, ADDR_WIDTH)) - self.ram = Memory(DATA_WIDTH, DEPTH) - # - # localparam DEPTH = 2**ADDR_WIDTH; - # - # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH]; - # reg [DATA_WIDTH-1:0] d0; - # reg [DATA_WIDTH-1:0] d1; - # - # always_ff @(posedge clk) begin - # if(we == 1'b1) begin - # ram[addr0] <= d_i; - # end else begin - # only change data if we==false - # d0 <= ram[addr0]; - # end - # d1 <= ram[addr1]; - # end - # - # assign d0_o = d0; - # assign d1_o = d1; - # - - def elaborate(self, platform=None): - m = Module() - m.submodules.read_ram0 = read_ram0 = self.ram.read_port() - m.submodules.read_ram1 = read_ram1 = self.ram.read_port() - m.submodules.write_ram = write_ram = self.ram.write_port() - - # write port - m.d.comb += write_ram.en.eq(self.we) - m.d.comb += write_ram.addr.eq(self.addr0) - m.d.comb += write_ram.data.eq(self.d_i) - - # read ports - m.d.comb += read_ram0.addr.eq(self.addr0) - m.d.comb += read_ram1.addr.eq(self.addr1) - with m.If(self.we == 0): - m.d.sync += self.d0_o.eq(read_ram0.data) - 
m.d.sync += self.d1_o.eq(read_ram1.data) - - return m diff --git a/src/iommu/axi_rab/ram_tp_write_first.py b/src/iommu/axi_rab/ram_tp_write_first.py deleted file mode 100644 index 7a21969c..00000000 --- a/src/iommu/axi_rab/ram_tp_write_first.py +++ /dev/null @@ -1,93 +0,0 @@ -# // Copyright 2018 ETH Zurich and University of Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. -# -# /* -# * ram_tp_write_first -# * -# * This code implements a parameterizable two-port memory. Port 0 can read and -# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in -# * "write first" mode, i.e., upon a read and write to the same address, the -# * new value is read. Note: Port 1 outputs invalid data in the cycle after -# * the write when reading the same address. -# * -# * For more information, see Xilinx PG058 Block Memory Generator Product Guide. 
-# */ - -from nmigen import Signal, Module, Const, Cat, Elaboratable -from nmigen import Memory - -import math -# -# module ram_tp_write_first -# #( -ADDR_WIDTH = 10 -DATA_WIDTH = 36 -# ) -# ( -# input clk, -# input we, -# input [ADDR_WIDTH-1:0] addr0, -# input [ADDR_WIDTH-1:0] addr1, -# input [DATA_WIDTH-1:0] d_i, -# output [DATA_WIDTH-1:0] d0_o, -# output [DATA_WIDTH-1:0] d1_o -# ); - - -class ram_tp_write_first(Elaboratable): - - def __init__(self): - self.we = Signal() # input - self.addr0 = Signal(ADDR_WIDTH) # input - self.addr1 = Signal(ADDR_WIDTH) # input - self.d_i = Signal(DATA_WIDTH) # input - self.d0_o = Signal(DATA_WIDTH) # output - self.d1_o = Signal(DATA_WIDTH) # output - - DEPTH = int(math.pow(2, ADDR_WIDTH)) - self.ram = Memory(DATA_WIDTH, DEPTH) - - # - # localparam DEPTH = 2**ADDR_WIDTH; - # - # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH]; - # reg [ADDR_WIDTH-1:0] raddr0; - # reg [ADDR_WIDTH-1:0] raddr1; - # - # always_ff @(posedge clk) begin - # if(we == 1'b1) begin - # ram[addr0] <= d_i; - # end - # raddr0 <= addr0; - # raddr1 <= addr1; - # end - # - # assign d0_o = ram[raddr0]; - # assign d1_o = ram[raddr1]; - # - - def elaborate(self, platform=None): - m = Module() - m.submodules.read_ram0 = read_ram0 = self.ram.read_port() - m.submodules.read_ram1 = read_ram1 = self.ram.read_port() - m.submodules.write_ram = write_ram = self.ram.write_port() - - # write port - m.d.comb += write_ram.en.eq(self.we) - m.d.comb += write_ram.addr.eq(self.addr0) - m.d.comb += write_ram.data.eq(self.d_i) - - # read ports - m.d.comb += read_ram0.addr.eq(self.addr0) - m.d.comb += read_ram1.addr.eq(self.addr1) - m.d.sync += self.d0_o.eq(read_ram0.data) - m.d.sync += self.d1_o.eq(read_ram1.data) - - return m diff --git a/src/iommu/axi_rab/slice_top.py b/src/iommu/axi_rab/slice_top.py deleted file mode 100644 index 6eedb1cd..00000000 --- a/src/iommu/axi_rab/slice_top.py +++ /dev/null @@ -1,141 +0,0 @@ -# // Copyright 2018 ETH Zurich and University of 
Bologna. -# // Copyright and related rights are licensed under the Solderpad Hardware -# // License, Version 0.51 (the "License"); you may not use this file except in -# // compliance with the License. You may obtain a copy of the License at -# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -# // or agreed to in writing, software, hardware and materials distributed under -# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# // CONDITIONS OF ANY KIND, either express or implied. See the License for the -# // specific language governing permissions and limitations under the License. - -# this file has been generated by sv2nmigen - -from nmigen import Signal, Module, Const, Cat, Elaboratable -import rab_slice -import coreconfig - -# -# module slice_top -# //#( -# // parameter N_SLICES = 16, -# // parameter N_REGS = 4*N_SLICES, -# // parameter ADDR_WIDTH_PHYS = 40, -# // parameter ADDR_WIDTH_VIRT = 32 -# // ) -# ( -# input logic [N_REGS-1:0] [63:0] int_cfg_regs, -# input logic int_rw, -# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_min, -# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_max, -# input logic multi_hit_allow, -# output logic multi_hit, -# output logic [N_SLICES-1:0] prot, -# output logic [N_SLICES-1:0] hit, -# output logic cache_coherent, -# output logic [ADDR_WIDTH_PHYS-1:0] out_addr -# ); -# - - -class slice_top(Elaboratable): - - def __init__(self): - # FIXME self.int_cfg_regs = Signal() # input - self.params = coreconfig.CoreConfig() # rename ? 
- self.int_rw = Signal() # input - self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT) # input - self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT) # input - self.multi_hit_allow = Signal() # input - self.multi_hit = Signal() # output - self.prot = Signal(self.params.N_SLICES) # output - self.hit = Signal(self.params.N_SLICES) # output - self.cache_coherent = Signal() # output - self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS) # output - - def elaborate(self, platform=None): - m = Module() - - first_hit = Signal() - - for i in range(self.params.N_SLICES): - # TODO pass params / core config here - u_slice = rab_slice.rab_slice(self.params) - setattr(m.submodules, "u_slice%d" % i, u_slice) - # TODO set param and connect ports - - # In case of a multi hit, the lowest slice with a hit is selected. - # TODO always_comb begin : HIT_CHECK - m.d.comb += [ - first_hit.eq(0), - self.multi_hit.eq(0), - self.out_addr.eq(0), - self.cache_coherent.eq(0)] - - for j in range(self.params.N_SLICES): - with m.If(self.hit[j] == 1): - with m.If(first_hit == 1): - with m.If(self.multi_hit_allow == 0): - m.d.comb += [self.multi_hit.eq(1)] - with m.Elif(first_hit == 1): - m.d.comb += [first_hit.eq(1) - # only output first slice that was hit - # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]), - # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]), - ] - return m - - # TODO translate generate statement - - -""" - logic [ADDR_WIDTH_PHYS*N_SLICES-1:0] slice_out_addr; - - generate - for ( i=0; i I have used bits <11:6> as they are not translated (4KB pages) -> and larger than a cache line (64 bytes). -> I have used bits <11:4> when the L1 cache was QuadW sized and -> the L2 cache was Line sized. 
-""" - -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Const, Array, Cat, Elaboratable - -from nmutil.latch import latchregister, SRLatch - - -class PartialAddrMatch(Elaboratable): - """A partial address matcher - """ - def __init__(self, n_adr, bitwid): - self.n_adr = n_adr - self.bitwid = bitwid - # inputs - self.addrs_i = Array(Signal(bitwid, name="addr") for i in range(n_adr)) - self.addr_we_i = Signal(n_adr) # write-enable for incoming address - self.addr_en_i = Signal(n_adr) # address latched in - self.addr_rs_i = Signal(n_adr) # address deactivated - - # output - self.addr_nomatch_o = Signal(n_adr, name="nomatch_o") - self.addr_nomatch_a_o = Array(Signal(n_adr, name="nomatch_array_o") \ - for i in range(n_adr)) - - def elaborate(self, platform): - m = Module() - return self._elaborate(m, platform) - - def _elaborate(self, m, platform): - comb = m.d.comb - sync = m.d.sync - - m.submodules.l = l = SRLatch(llen=self.n_adr, sync=False) - addrs_r = Array(Signal(self.bitwid, name="a_r") \ - for i in range(self.n_adr)) - - # latch set/reset - comb += l.s.eq(self.addr_en_i) - comb += l.r.eq(self.addr_rs_i) - - # copy in addresses (and "enable" signals) - for i in range(self.n_adr): - latchregister(m, self.addrs_i[i], addrs_r[i], l.q[i]) - - # is there a clash, yes/no - matchgrp = [] - for i in range(self.n_adr): - match = [] - for j in range(self.n_adr): - if i == j: - match.append(Const(0)) # don't match against self! 
- else: - match.append(addrs_r[i] == addrs_r[j]) - comb += self.addr_nomatch_a_o[i].eq(~Cat(*match) & l.q) - matchgrp.append(self.addr_nomatch_a_o[i] == l.q) - comb += self.addr_nomatch_o.eq(Cat(*matchgrp) & l.q) - - return m - - def __iter__(self): - yield from self.addrs_i - yield self.addr_we_i - yield self.addr_en_i - yield from self.addr_nomatch_a_o - yield self.addr_nomatch_o - - def ports(self): - return list(self) - - -def part_addr_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_part_addr(): - dut = PartialAddrMatch(3, 10) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_part_addr.il", "w") as f: - f.write(vl) - - run_simulation(dut, part_addr_sim(dut), vcd_name='test_part_addr.vcd') - -if __name__ == '__main__': - test_part_addr() diff --git a/src/scoreboard/dependence_cell.py b/src/scoreboard/dependence_cell.py deleted file mode 100644 index 16108229..00000000 --- a/src/scoreboard/dependence_cell.py +++ /dev/null @@ -1,169 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl -from nmutil.latch import SRLatch -from functools import reduce -from operator import or_ - - -class DependencyRow(Elaboratable): - """ implements 11.4.7 mitch alsup dependence cell, p27 - adjusted to be clock-sync'd on rising edge only. 
- mitch design (as does 6600) requires alternating rising/falling clock - - * SET mode: issue_i HI, go_i LO, reg_i HI - register is captured - - FWD is DISABLED (~issue_i) - - RSEL DISABLED - * QRY mode: issue_i LO, go_i LO, haz_i HI - FWD is ASSERTED - reg_i HI - ignored - * GO mode : issue_i LO, go_i HI - RSEL is ASSERTED - haz_i HI - FWD still can be ASSERTED - - FWD assertion (hazard protection) therefore still occurs in both - Query and Go Modes, for this cycle, due to the cq register - - GO mode works for one cycle, again due to the cq register capturing - the latch output. Without the cq register, the SR Latch (which is - asynchronous) would be reset at the exact moment that GO was requested, - and the RSEL would be garbage. - """ - def __init__(self, n_reg, n_src, cancel_mode=False): - self.cancel_mode = cancel_mode - self.n_reg = n_reg - self.n_src = n_src - # arrays - src = [] - rsel = [] - fwd = [] - for i in range(n_src): - j = i + 1 # name numbering to match src1/src2 - src.append(Signal(n_reg, name="src%d" % j, reset_less=True)) - rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True)) - fwd.append(Signal(n_reg, name="src%d_fwd_o" % j, reset_less=True)) - - # inputs - self.dest_i = Signal(n_reg, reset_less=True) # Dest in (top) - self.src_i = Array(src) # operands in (top) - self.issue_i = Signal(reset_less=True) # Issue in (top) - - self.rd_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top) - self.wr_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top) - self.v_rd_rsel_o = Signal(n_reg, reset_less=True) # Read pend out (bot) - self.v_wr_rsel_o = Signal(n_reg, reset_less=True) # Write pend out (bot) - - self.go_wr_i = Signal(reset_less=True) # Go Write in (left) - self.go_rd_i = Signal(reset_less=True) # Go Read in (left) - if self.cancel_mode: - self.go_die_i = Signal(n_reg, reset_less=True) # Go Die in (left) - else: - self.go_die_i = Signal(reset_less=True) # Go Die in (left) - - # for Register File Select Lines 
(vertical) - self.dest_rsel_o = Signal(n_reg, reset_less=True) # dest reg sel (bot) - self.src_rsel_o = Array(rsel) # src reg sel (bot) - self.src2_rsel_o = Signal(n_reg, reset_less=True) # src2 reg sel (bot) - - # for Function Unit "forward progress" (horizontal) - self.dest_fwd_o = Signal(n_reg, reset_less=True) # dest FU fw (right) - self.src_fwd_o = Array(fwd) # src FU fw (right) - - def elaborate(self, platform): - m = Module() - m.submodules.dest_c = dest_c = SRLatch(sync=False, llen=self.n_reg) - src_c = [] - for i in range(self.n_src): - src_l = SRLatch(sync=False, llen=self.n_reg) - setattr(m.submodules, "src%d_c" % (i+1), src_l) - src_c.append(src_l) - - # connect go_rd / go_wr (dest->wr, src->rd) - wr_die = Signal(self.n_reg, reset_less=True) - rd_die = Signal(self.n_reg, reset_less=True) - if self.cancel_mode: - go_die = self.go_die_i - else: - go_die = Repl(self.go_die_i, self.n_reg) - m.d.comb += wr_die.eq(Repl(self.go_wr_i, self.n_reg) | go_die) - m.d.comb += rd_die.eq(Repl(self.go_rd_i, self.n_reg) | go_die) - m.d.comb += dest_c.r.eq(wr_die) - for i in range(self.n_src): - m.d.comb += src_c[i].r.eq(rd_die) - - # connect input reg bit (unary) - i_ext = Repl(self.issue_i, self.n_reg) - m.d.comb += dest_c.s.eq(i_ext & self.dest_i) - for i in range(self.n_src): - m.d.comb += src_c[i].s.eq(i_ext & self.src_i[i]) - - # connect up hazard checks: read-after-write and write-after-read - m.d.comb += self.dest_fwd_o.eq(dest_c.q & self.rd_pend_i) - for i in range(self.n_src): - m.d.comb += self.src_fwd_o[i].eq(src_c[i].q & self.wr_pend_i) - - # connect reg-sel outputs - rd_ext = Repl(self.go_rd_i, self.n_reg) - wr_ext = Repl(self.go_wr_i, self.n_reg) - m.d.comb += self.dest_rsel_o.eq(dest_c.qlq & wr_ext) - for i in range(self.n_src): - m.d.comb += self.src_rsel_o[i].eq(src_c[i].qlq & rd_ext) - - # to be accumulated to indicate if register is in use (globally) - # after ORing, is fed back in to rd_pend_i / wr_pend_i - src_q = [] - for i in range(self.n_src): - 
src_q.append(src_c[i].qlq) - m.d.comb += self.v_rd_rsel_o.eq(reduce(or_, src_q)) - m.d.comb += self.v_wr_rsel_o.eq(dest_c.qlq) - - return m - - def __iter__(self): - yield self.dest_i - yield from self.src_i - yield self.rd_pend_i - yield self.wr_pend_i - yield self.issue_i - yield self.go_wr_i - yield self.go_rd_i - yield self.go_die_i - yield self.dest_rsel_o - yield from self.src_rsel_o - yield self.dest_fwd_o - yield from self.src_fwd_o - - def ports(self): - return list(self) - - -def dcell_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_dcell(): - dut = DependencyRow(4, 2, True) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_drow.il", "w") as f: - f.write(vl) - - run_simulation(dut, dcell_sim(dut), vcd_name='test_dcell.vcd') - -if __name__ == '__main__': - test_dcell() diff --git a/src/scoreboard/fn_unit.py b/src/scoreboard/fn_unit.py deleted file mode 100644 index 63beb70b..00000000 --- a/src/scoreboard/fn_unit.py +++ /dev/null @@ -1,321 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Cat, Array, Const, Elaboratable -from nmigen.lib.coding import Decoder - -from nmutil.latch import SRLatch, latchregister - -from scoreboard.shadow import Shadow - - -class FnUnit(Elaboratable): - """ implements 11.4.8 function unit, p31 - also implements optional shadowing 11.5.1, p55 - - shadowing can be used for branches as well as exceptions (interrupts), - load/store hold (exceptions again), and vector-element predication - (once the predicate is known, which it may not be at instruction issue) - - Inputs - - * :wid: register file width - * :shadow_wid: number of 
shadow/fail/good/go_die sets - * :n_dests: number of destination regfile(s) (index: rfile_sel_i) - * :wr_pend: if true, writable observes the g_wr_pend_i vector - otherwise observes g_rd_pend_i - - notes: - - * dest_i / src1_i / src2_i are in *binary*, whereas... - * ...g_rd_pend_i / g_wr_pend_i and rd_pend_o / wr_pend_o are UNARY - * req_rel_i (request release) is the direct equivalent of pipeline - "output valid" (valid_o) - * recover is a local python variable (actually go_die_o) - * when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing) - * wr_pend is set False for the majority of uses: however for - use in a STORE Function Unit it is set to True - """ - def __init__(self, wid, shadow_wid=0, n_dests=1, wr_pend=False): - self.reg_width = wid - self.n_dests = n_dests - self.shadow_wid = shadow_wid - self.wr_pend = wr_pend - - # inputs - if n_dests > 1: - self.rfile_sel_i = Signal(max=n_dests, reset_less=True) - else: - self.rfile_sel_i = Const(0) # no selection. gets Array[0] - self.dest_i = Signal(max=wid, reset_less=True) # Dest R# in (top) - self.src1_i = Signal(max=wid, reset_less=True) # oper1 R# in (top) - self.src2_i = Signal(max=wid, reset_less=True) # oper2 R# in (top) - self.issue_i = Signal(reset_less=True) # Issue in (top) - - self.go_wr_i = Signal(reset_less=True) # Go Write in (left) - self.go_rd_i = Signal(reset_less=True) # Go Read in (left) - self.req_rel_i = Signal(reset_less=True) # request release (left) - - self.g_xx_pend_i = Array(Signal(wid, reset_less=True, name="g_pend_i") \ - for i in range(n_dests)) # global rd (right) - self.g_wr_pend_i = Signal(wid, reset_less=True) # global wr (right) - - if shadow_wid: - self.shadow_i = Signal(shadow_wid, reset_less=True) - self.s_fail_i = Signal(shadow_wid, reset_less=True) - self.s_good_i = Signal(shadow_wid, reset_less=True) - self.go_die_o = Signal(reset_less=True) - - # outputs - self.readable_o = Signal(reset_less=True) # Readable out (right) - self.writable_o = 
Array(Signal(reset_less=True, name="writable_o") \ - for i in range(n_dests)) # writable out (right) - self.busy_o = Signal(reset_less=True) # busy out (left) - - self.src1_pend_o = Signal(wid, reset_less=True) # src1 pending - self.src2_pend_o = Signal(wid, reset_less=True) # src1 pending - self.rd_pend_o = Signal(wid, reset_less=True) # rd pending (right) - self.xx_pend_o = Array(Signal(wid, reset_less=True, name="pend_o") \ - for i in range(n_dests))# wr pending (right) - - def elaborate(self, platform): - m = Module() - m.submodules.rd_l = rd_l = SRLatch(sync=False) - m.submodules.wr_l = wr_l = SRLatch(sync=False) - m.submodules.dest_d = dest_d = Decoder(self.reg_width) - m.submodules.src1_d = src1_d = Decoder(self.reg_width) - m.submodules.src2_d = src2_d = Decoder(self.reg_width) - - # shadow / recover (optional: shadow_wid > 0) - m.submodules.shadow = shadow = Shadow(self.shadow_wid) - if self.shadow_wid: - m.d.comb += shadow.issue_i.eq(self.issue_i) - m.d.comb += shadow.s_fail_i.eq(self.s_fail_i) - m.d.comb += shadow.s_good_i.eq(self.s_good_i) - m.d.comb += shadow.shadow_i.eq(self.shadow_i) - shadown = shadow.shadown_o - recover = shadow.go_die_o - - # selector - xx_pend_o = self.xx_pend_o[self.rfile_sel_i] - writable_o = self.writable_o[self.rfile_sel_i] - g_pend_i = self.g_xx_pend_i[self.rfile_sel_i] - - for i in range(self.n_dests): - m.d.comb += self.xx_pend_o[i].eq(0) # initialise all array - m.d.comb += self.writable_o[i].eq(0) # to zero - m.d.comb += self.readable_o.eq(0) # to zero - - # go_wr latch: reset on go_wr HI, set on issue - m.d.comb += wr_l.s.eq(self.issue_i) - m.d.comb += wr_l.r.eq(self.go_wr_i | recover) - - # src1 latch: reset on go_rd HI, set on issue - m.d.comb += rd_l.s.eq(self.issue_i) - m.d.comb += rd_l.r.eq(self.go_rd_i | recover) - - # latch/registers for dest / src1 / src2 - dest_r = Signal(max=self.reg_width, reset_less=True) - src1_r = Signal(max=self.reg_width, reset_less=True) - src2_r = Signal(max=self.reg_width, 
reset_less=True) - # XXX latch based on *issue* rather than !latch (as in book) - latchregister(m, self.dest_i, dest_r, self.issue_i) #wr_l.qn) - latchregister(m, self.src1_i, src1_r, self.issue_i) #wr_l.qn) - latchregister(m, self.src2_i, src2_r, self.issue_i) #wr_l.qn) - - # dest decoder (use dest reg as input): write-pending out - m.d.comb += dest_d.i.eq(dest_r) - m.d.comb += dest_d.n.eq(wr_l.qn) # decode is inverted - m.d.comb += self.busy_o.eq(wr_l.q) # busy if set - m.d.comb += xx_pend_o.eq(dest_d.o) - - # src1/src2 decoder (use src1/2 regs as input): read-pending out - m.d.comb += src1_d.i.eq(src1_r) - m.d.comb += src1_d.n.eq(rd_l.qn) # decode is inverted - m.d.comb += src2_d.i.eq(src2_r) - m.d.comb += src2_d.n.eq(rd_l.qn) # decode is inverted - m.d.comb += self.src1_pend_o.eq(src1_d.o) - m.d.comb += self.src2_pend_o.eq(src2_d.o) - m.d.comb += self.rd_pend_o.eq(src1_d.o | src2_d.o) - - # readable output signal - g_rd = Signal(self.reg_width, reset_less=True) - ro = Signal(reset_less=True) - m.d.comb += g_rd.eq(~self.g_wr_pend_i & self.rd_pend_o) - m.d.comb += ro.eq(~g_rd.bool()) - m.d.comb += self.readable_o.eq(ro) - - # writable output signal - g_wr_v = Signal(self.reg_width, reset_less=True) - g_wr = Signal(reset_less=True) - wo = Signal(reset_less=True) - m.d.comb += g_wr_v.eq(g_pend_i & xx_pend_o) - m.d.comb += g_wr.eq(~g_wr_v.bool()) - m.d.comb += wo.eq(g_wr & rd_l.qn & self.req_rel_i & shadown) - m.d.comb += writable_o.eq(wo) - - return m - - def __iter__(self): - yield self.dest_i - yield self.src1_i - yield self.src2_i - yield self.issue_i - yield self.go_wr_i - yield self.go_rd_i - yield self.req_rel_i - yield from self.g_xx_pend_i - yield self.g_wr_pend_i - yield self.readable_o - yield from self.writable_o - yield self.rd_pend_o - yield from self.xx_pend_o - - def ports(self): - return list(self) - -############# ############### -# --- --- # -# --- renamed / redirected from base class --- # -# --- --- # -# --- below are convenience classes which 
match the names --- # -# --- of the various mitch alsup book chapter gate diagrams --- # -# --- --- # -############# ############### - - -class IntFnUnit(FnUnit): - def __init__(self, wid, shadow_wid=0): - FnUnit.__init__(self, wid, shadow_wid) - self.int_rd_pend_o = self.rd_pend_o - self.int_wr_pend_o = self.xx_pend_o[0] - self.g_int_wr_pend_i = self.g_wr_pend_i - self.g_int_rd_pend_i = self.g_xx_pend_i[0] - self.int_readable_o = self.readable_o - self.int_writable_o = self.writable_o[0] - - self.int_rd_pend_o.name = "int_rd_pend_o" - self.int_wr_pend_o.name = "int_wr_pend_o" - self.g_int_rd_pend_i.name = "g_int_rd_pend_i" - self.g_int_wr_pend_i.name = "g_int_wr_pend_i" - self.int_readable_o.name = "int_readable_o" - self.int_writable_o.name = "int_writable_o" - - -class FPFnUnit(FnUnit): - def __init__(self, wid, shadow_wid=0): - FnUnit.__init__(self, wid, shadow_wid) - self.fp_rd_pend_o = self.rd_pend_o - self.fp_wr_pend_o = self.xx_pend_o[0] - self.g_fp_wr_pend_i = self.g_wr_pend_i - self.g_fp_rd_pend_i = self.g_xx_pend_i[0] - self.fp_writable_o = self.writable_o[0] - self.fp_readable_o = self.readable_o - - self.fp_rd_pend_o.name = "fp_rd_pend_o" - self.fp_wr_pend_o.name = "fp_wr_pend_o" - self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i" - self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i" - self.fp_writable_o.name = "fp_writable_o" - self.fp_readable_o.name = "fp_readable_o" - - -class LDFnUnit(FnUnit): - """ number of dest selectors: 2. 
assumes len(int_regfile) == len(fp_regfile) - * when rfile_sel_i == 0, int_wr_pend_o is set - * when rfile_sel_i == 1, fp_wr_pend_o is set - """ - def __init__(self, wid, shadow_wid=0): - FnUnit.__init__(self, wid, shadow_wid, n_dests=2) - self.int_rd_pend_o = self.rd_pend_o - self.int_wr_pend_o = self.xx_pend_o[0] - self.fp_wr_pend_o = self.xx_pend_o[1] - self.g_int_wr_pend_i = self.g_wr_pend_i - self.g_int_rd_pend_i = self.g_xx_pend_i[0] - self.g_fp_rd_pend_i = self.g_xx_pend_i[1] - self.int_readable_o = self.readable_o - self.int_writable_o = self.writable_o[0] - self.fp_writable_o = self.writable_o[1] - - self.int_rd_pend_o.name = "int_rd_pend_o" - self.int_wr_pend_o.name = "int_wr_pend_o" - self.fp_wr_pend_o.name = "fp_wr_pend_o" - self.g_int_wr_pend_i.name = "g_int_wr_pend_i" - self.g_int_rd_pend_i.name = "g_int_rd_pend_i" - self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i" - self.int_readable_o.name = "int_readable_o" - self.int_writable_o.name = "int_writable_o" - self.fp_writable_o.name = "fp_writable_o" - - -class STFnUnit(FnUnit): - """ number of dest selectors: 2. 
assumes len(int_regfile) == len(fp_regfile) - * wr_pend=False indicates to observe global fp write pending - * when rfile_sel_i == 0, int_wr_pend_o is set - * when rfile_sel_i == 1, fp_wr_pend_o is set - * - """ - def __init__(self, wid, shadow_wid=0): - FnUnit.__init__(self, wid, shadow_wid, n_dests=2, wr_pend=True) - self.int_rd_pend_o = self.rd_pend_o # 1st int read-pending vector - self.int2_rd_pend_o = self.xx_pend_o[0] # 2nd int read-pending vector - self.fp_rd_pend_o = self.xx_pend_o[1] # 1x FP read-pending vector - # yes overwrite FnUnit base class g_wr_pend_i vector - self.g_int_wr_pend_i = self.g_wr_pend_i = self.g_xx_pend_i[0] - self.g_fp_wr_pend_i = self.g_xx_pend_i[1] - self.int_readable_o = self.readable_o - self.int_writable_o = self.writable_o[0] - self.fp_writable_o = self.writable_o[1] - - self.int_rd_pend_o.name = "int_rd_pend_o" - self.int2_rd_pend_o.name = "int2_rd_pend_o" - self.fp_rd_pend_o.name = "fp_rd_pend_o" - self.g_int_wr_pend_i.name = "g_int_wr_pend_i" - self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i" - self.int_readable_o.name = "int_readable_o" - self.int_writable_o.name = "int_writable_o" - self.fp_writable_o.name = "fp_writable_o" - - - -def int_fn_unit_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_int_fn_unit(): - dut = FnUnit(32, 2, 2) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_fn_unit.il", "w") as f: - f.write(vl) - - dut = LDFnUnit(32, 2) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_ld_fn_unit.il", "w") as f: - f.write(vl) - - dut = STFnUnit(32, 0) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_st_fn_unit.il", "w") as f: - f.write(vl) - - run_simulation(dut, 
int_fn_unit_sim(dut), vcd_name='test_fn_unit.vcd') - -if __name__ == '__main__': - test_int_fn_unit() diff --git a/src/scoreboard/fu_dep_cell.py b/src/scoreboard/fu_dep_cell.py deleted file mode 100644 index 9946dcb5..00000000 --- a/src/scoreboard/fu_dep_cell.py +++ /dev/null @@ -1,92 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Const, Elaboratable -from nmutil.latch import SRLatch - - -class FUDependenceCell(Elaboratable): - """ implements 11.4.7 mitch alsup dependence cell, p27 - """ - def __init__(self, dummy, n_fu=1): - self.n_fu = n_fu - self.dummy = Const(~(1< - self.rd_pend_i = Signal(n_fu_row, reset_less=True) # Rd pending (left) - self.wr_pend_i = Signal(n_fu_row, reset_less=True) # Wr pending (left) - self.issue_i = Signal(n_fu_col, reset_less=True) # Issue in (top) - - self.go_wr_i = Signal(n_fu_row, reset_less=True) # Go Write in (left) - self.go_rd_i = Signal(n_fu_row, reset_less=True) # Go Read in (left) - self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) - - # for Function Unit Readable/Writable (horizontal) - self.readable_o = Signal(n_fu_col, reset_less=True) # readable (bot) - self.writable_o = Signal(n_fu_col, reset_less=True) # writable (bot) - - def elaborate(self, platform): - m = Module() - - # --- - # matrix of dependency cells - # --- - dm = Array(FUDependenceCell(f, self.n_fu_col) \ - for f in range(self.n_fu_row)) - for y in range(self.n_fu_row): - setattr(m.submodules, "dm%d" % y, dm[y]) - - # --- - # array of Function Unit Readable/Writable: row-length, horizontal - # --- - fur = Array(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col)) - for x in range(self.n_fu_col): - setattr(m.submodules, "fur_x%d" % (x), fur[x]) - - # --- - # connect FU Readable/Writable vector - # --- - readable = [] - writable = [] - for y in range(self.n_fu_row): - fu = fur[y] - # accumulate Readable/Writable Vector outputs - 
readable.append(fu.readable_o) - writable.append(fu.writable_o) - - # ... and output them from this module (horizontal, width=REGs) - m.d.comb += self.readable_o.eq(Cat(*readable)) - m.d.comb += self.writable_o.eq(Cat(*writable)) - - # --- - # connect FU Pending - # --- - for y in range(self.n_fu_row): - dc = dm[y] - fu = fur[y] - # connect cell reg-select outputs to Reg Vector In - m.d.comb += [fu.rd_pend_i.eq(dc.rd_wait_o), - fu.wr_pend_i.eq(dc.wr_wait_o), - ] - - # --- - # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i - # --- - for x in range(self.n_fu_col): - issue_i = [] - for y in range(self.n_fu_row): - dc = dm[y] - # accumulate cell inputs issue - issue_i.append(dc.issue_i[x]) - # wire up inputs from module to row cell inputs - m.d.comb += Cat(*issue_i).eq(self.issue_i) - - # --- - # connect Matrix go_rd_i/go_wr_i to module readable/writable - # --- - for y in range(self.n_fu_row): - dc = dm[y] - # wire up inputs from module to row cell inputs - m.d.comb += [dc.go_rd_i.eq(self.go_rd_i), - dc.go_wr_i.eq(self.go_wr_i), - dc.go_die_i.eq(self.go_die_i), - ] - - # --- - # connect Matrix pending - # --- - for y in range(self.n_fu_row): - dc = dm[y] - # wire up inputs from module to row cell inputs - m.d.comb += [dc.rd_pend_i.eq(self.rd_pend_i), - dc.wr_pend_i.eq(self.wr_pend_i), - ] - - return m - - def __iter__(self): - yield self.rd_pend_i - yield self.wr_pend_i - yield self.issue_i - yield self.go_wr_i - yield self.go_rd_i - yield self.readable_o - yield self.writable_o - - def ports(self): - return list(self) - -def d_matrix_sim(dut): - """ XXX TODO - """ - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_fu_fu_matrix(): - dut = FUFUDepMatrix(n_fu_row=3, 
n_fu_col=4) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_fu_fu_matrix.il", "w") as f: - f.write(vl) - - run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_fu_matrix.vcd') - -if __name__ == '__main__': - test_fu_fu_matrix() diff --git a/src/scoreboard/fu_mem_matrix.py b/src/scoreboard/fu_mem_matrix.py deleted file mode 100644 index baaa02be..00000000 --- a/src/scoreboard/fu_mem_matrix.py +++ /dev/null @@ -1,155 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Elaboratable, Array, Cat, Const - -from scoreboard.fumem_dep_cell import FUMemDependenceCell -from scoreboard.fu_mem_picker_vec import FUMem_Pick_Vec - -""" - - 6600 Function Unit Dependency Table Matrix inputs / outputs - ----------------------------------------------------------- - -""" - -class FUMemDepMatrix(Elaboratable): - """ implements FU-to-FU Memory Dependency Matrix - """ - def __init__(self, n_fu_row, n_fu_col): - self.n_fu_row = n_fu_row # Y (FU row#) ^v - self.n_fu_col = n_fu_col # X (FU col #) <> - self.st_pend_i = Signal(n_fu_row, reset_less=True) # Rd pending (left) - self.ld_pend_i = Signal(n_fu_row, reset_less=True) # Wr pending (left) - self.issue_i = Signal(n_fu_col, reset_less=True) # Issue in (top) - - self.go_ld_i = Signal(n_fu_row, reset_less=True) # Go Write in (left) - self.go_st_i = Signal(n_fu_row, reset_less=True) # Go Read in (left) - self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) - - # for Function Unit Readable/Writable (horizontal) - self.storable_o = Signal(n_fu_col, reset_less=True) # storable (bot) - self.loadable_o = Signal(n_fu_col, reset_less=True) # loadable (bot) - - def elaborate(self, platform): - m = Module() - - # --- - # matrix of dependency cells - # --- - dm = Array(FUMemDependenceCell(f, self.n_fu_col) \ - for f in range(self.n_fu_row)) - for y in range(self.n_fu_row): - setattr(m.submodules, "dm%d" % y, dm[y]) - - # --- - # array of 
Function Unit Readable/Writable: row-length, horizontal - # --- - fur = Array(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col)) - for x in range(self.n_fu_col): - setattr(m.submodules, "fur_x%d" % (x), fur[x]) - - # --- - # connect FU Readable/Writable vector - # --- - storable = [] - loadable = [] - for y in range(self.n_fu_row): - fu = fur[y] - # accumulate Readable/Writable Vector outputs - storable.append(fu.storable_o) - loadable.append(fu.loadable_o) - - # ... and output them from this module (horizontal, width=REGs) - m.d.comb += self.storable_o.eq(Cat(*storable)) - m.d.comb += self.loadable_o.eq(Cat(*loadable)) - - # --- - # connect FU Pending - # --- - for y in range(self.n_fu_row): - dc = dm[y] - fu = fur[y] - # connect cell reg-select outputs to Reg Vector In - m.d.comb += [fu.st_pend_i.eq(dc.st_wait_o), - fu.ld_pend_i.eq(dc.ld_wait_o), - ] - - # --- - # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i - # --- - for x in range(self.n_fu_col): - issue_i = [] - for y in range(self.n_fu_row): - dc = dm[y] - # accumulate cell inputs issue - issue_i.append(dc.issue_i[x]) - # wire up inputs from module to row cell inputs - m.d.comb += Cat(*issue_i).eq(self.issue_i) - - # --- - # connect Matrix go_st_i/go_ld_i to module storable/loadable - # --- - for y in range(self.n_fu_row): - dc = dm[y] - # wire up inputs from module to row cell inputs - m.d.comb += [dc.go_st_i.eq(self.go_st_i), - dc.go_ld_i.eq(self.go_ld_i), - dc.go_die_i.eq(self.go_die_i), - ] - - # --- - # connect Matrix pending - # --- - for y in range(self.n_fu_row): - dc = dm[y] - # wire up inputs from module to row cell inputs - m.d.comb += [dc.st_pend_i.eq(self.st_pend_i), - dc.ld_pend_i.eq(self.ld_pend_i), - ] - - return m - - def __iter__(self): - yield self.st_pend_i - yield self.ld_pend_i - yield self.issue_i - yield self.go_ld_i - yield self.go_st_i - yield self.storable_o - yield self.loadable_o - - def ports(self): - return list(self) - -def d_matrix_sim(dut): - """ 
XXX TODO - """ - yield dut.ld_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.st_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_st_i.eq(1) - yield - yield dut.go_st_i.eq(0) - yield - yield dut.go_ld_i.eq(1) - yield - yield dut.go_ld_i.eq(0) - yield - -def test_fu_fu_matrix(): - dut = FUMemDepMatrix(n_fu_row=3, n_fu_col=3) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_fu_mem_matrix.il", "w") as f: - f.write(vl) - - run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd') - -if __name__ == '__main__': - test_fu_fu_matrix() diff --git a/src/scoreboard/fu_mem_picker_vec.py b/src/scoreboard/fu_mem_picker_vec.py deleted file mode 100644 index dc40bd09..00000000 --- a/src/scoreboard/fu_mem_picker_vec.py +++ /dev/null @@ -1,26 +0,0 @@ -from nmigen import Elaboratable, Module, Signal, Cat - - -class FUMem_Pick_Vec(Elaboratable): - """ these are allocated per-FU (horizontally), - and are of length fu_row_n - """ - def __init__(self, fu_row_n): - self.fu_row_n = fu_row_n - self.st_pend_i = Signal(fu_row_n, reset_less=True) - self.ld_pend_i = Signal(fu_row_n, reset_less=True) - - self.storable_o = Signal(reset_less=True) - self.loadable_o = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - - # Readable if there are no writes pending - m.d.comb += self.storable_o.eq(~self.ld_pend_i.bool()) - - # Writable if there are no reads pending - m.d.comb += self.loadable_o.eq(~self.st_pend_i.bool()) - - return m - diff --git a/src/scoreboard/fu_picker_vec.py b/src/scoreboard/fu_picker_vec.py deleted file mode 100644 index d38bbfae..00000000 --- a/src/scoreboard/fu_picker_vec.py +++ /dev/null @@ -1,26 +0,0 @@ -from nmigen import Elaboratable, Module, Signal, Cat - - -class FU_Pick_Vec(Elaboratable): - """ these are allocated per-FU (horizontally), - and are of length fu_row_n - """ - def __init__(self, fu_row_n): - self.fu_row_n = fu_row_n - 
self.rd_pend_i = Signal(fu_row_n, reset_less=True) - self.wr_pend_i = Signal(fu_row_n, reset_less=True) - - self.readable_o = Signal(reset_less=True) - self.writable_o = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - - # Readable if there are no writes pending - m.d.comb += self.readable_o.eq(~self.wr_pend_i.bool()) - - # Writable if there are no reads pending - m.d.comb += self.writable_o.eq(~self.rd_pend_i.bool()) - - return m - diff --git a/src/scoreboard/fu_reg_matrix.py b/src/scoreboard/fu_reg_matrix.py deleted file mode 100644 index 8ca1494e..00000000 --- a/src/scoreboard/fu_reg_matrix.py +++ /dev/null @@ -1,304 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl - -from scoreboard.dependence_cell import DependencyRow -from scoreboard.fu_wr_pending import FU_RW_Pend -from scoreboard.reg_select import Reg_Rsv -from scoreboard.global_pending import GlobalPending - -""" - - 6600 Dependency Table Matrix inputs / outputs - --------------------------------------------- - - d s1 s2 i d s1 s2 i d s1 s2 i d s1 s2 i - | | | | | | | | | | | | | | | | - v v v v v v v v v v v v v v v v - go_rd/go_wr -> dm-r0-fu0 dm-r1-fu0 dm-r2-fu0 dm-r3-fu0 -> wr/rd-pend - go_rd/go_wr -> dm-r0-fu1 dm-r1-fu1 dm-r2-fu1 dm-r3-fu1 -> wr/rd-pend - go_rd/go_wr -> dm-r0-fu2 dm-r1-fu2 dm-r2-fu2 dm-r3-fu2 -> wr/rd-pend - | | | | | | | | | | | | - v v v v v v v v v v v v - d s1 s2 d s1 s2 d s1 s2 d s1 s2 - reg sel reg sel reg sel reg sel - -""" - -class FURegDepMatrix(Elaboratable): - """ implements 11.4.7 mitch alsup FU-to-Reg Dependency Matrix, p26 - """ - def __init__(self, n_fu_row, n_reg_col, n_src, cancel=None): - self.n_src = n_src - self.n_fu_row = nf = n_fu_row # Y (FUs) ^v - self.n_reg_col = n_reg = n_reg_col # X (Regs) <> - - # arrays - src = [] - rsel = [] - for i in range(n_src): - j = i + 1 # name numbering to match src1/src2 - 
src.append(Signal(n_reg, name="src%d" % j, reset_less=True)) - rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True)) - pend = [] - for i in range(nf): - j = i + 1 # name numbering to match src1/src2 - pend.append(Signal(nf, name="rd_src%d_pend_o" % j, reset_less=True)) - - self.dest_i = Signal(n_reg_col, reset_less=True) # Dest in (top) - self.src_i = Array(src) # oper in (top) - - # cancellation array (from Address Matching), ties in with go_die_i - self.cancel = cancel - - # Register "Global" vectors for determining RaW and WaR hazards - self.wr_pend_i = Signal(n_reg_col, reset_less=True) # wr pending (top) - self.rd_pend_i = Signal(n_reg_col, reset_less=True) # rd pending (top) - self.v_wr_rsel_o = Signal(n_reg_col, reset_less=True) # wr pending (bot) - self.v_rd_rsel_o = Signal(n_reg_col, reset_less=True) # rd pending (bot) - - self.issue_i = Signal(n_fu_row, reset_less=True) # Issue in (top) - self.go_wr_i = Signal(n_fu_row, reset_less=True) # Go Write in (left) - self.go_rd_i = Signal(n_fu_row, reset_less=True) # Go Read in (left) - self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) - - # for Register File Select Lines (horizontal), per-reg - self.dest_rsel_o = Signal(n_reg_col, reset_less=True) # dest reg (bot) - self.src_rsel_o = Array(rsel) # src reg (bot) - - # for Function Unit "forward progress" (vertical), per-FU - self.wr_pend_o = Signal(n_fu_row, reset_less=True) # wr pending (right) - self.rd_pend_o = Signal(n_fu_row, reset_less=True) # rd pending (right) - self.rd_src_pend_o = Array(pend) # src1 pending - - def elaborate(self, platform): - m = Module() - return self._elaborate(m, platform) - - def _elaborate(self, m, platform): - - # --- - # matrix of dependency cells - # --- - cancel_mode = self.cancel is not None - dm = Array(DependencyRow(self.n_reg_col, self.n_src, cancel_mode) \ - for r in range(self.n_fu_row)) - for fu in range(self.n_fu_row): - setattr(m.submodules, "dr_fu%d" % fu, dm[fu]) - - # --- - # array 
of Function Unit Pending vectors - # --- - fupend = Array(FU_RW_Pend(self.n_reg_col, self.n_src) \ - for f in range(self.n_fu_row)) - for fu in range(self.n_fu_row): - setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu]) - - # --- - # array of Register Reservation vectors - # --- - regrsv = Array(Reg_Rsv(self.n_fu_row, self.n_src) \ - for r in range(self.n_reg_col)) - for rn in range(self.n_reg_col): - setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn]) - - # --- - # connect Function Unit vector - # --- - wr_pend = [] - rd_pend = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - fup = fupend[fu] - dest_fwd_o = [] - for rn in range(self.n_reg_col): - # accumulate cell fwd outputs for dest/src1/src2 - dest_fwd_o.append(dc.dest_fwd_o[rn]) - # connect cell fwd outputs to FU Vector in [Cat is gooood] - m.d.comb += [fup.dest_fwd_i.eq(Cat(*dest_fwd_o)), - ] - # accumulate FU Vector outputs - wr_pend.append(fup.reg_wr_pend_o) - rd_pend.append(fup.reg_rd_pend_o) - - # ... and output them from this module (vertical, width=FUs) - m.d.comb += self.wr_pend_o.eq(Cat(*wr_pend)) - m.d.comb += self.rd_pend_o.eq(Cat(*rd_pend)) - - # same for src - for i in range(self.n_src): - rd_src_pend = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - fup = fupend[fu] - src_fwd_o = [] - for rn in range(self.n_reg_col): - # accumulate cell fwd outputs for dest/src1/src2 - src_fwd_o.append(dc.src_fwd_o[i][rn]) - # connect cell fwd outputs to FU Vector in [Cat is gooood] - m.d.comb += [fup.src_fwd_i[i].eq(Cat(*src_fwd_o)), - ] - # accumulate FU Vector outputs - rd_src_pend.append(fup.reg_rd_src_pend_o[i]) - # ... 
and output them from this module (vertical, width=FUs) - m.d.comb += self.rd_src_pend_o[i].eq(Cat(*rd_src_pend)) - - # --- - # connect Reg Selection vector - # --- - dest_rsel = [] - for rn in range(self.n_reg_col): - rsv = regrsv[rn] - dest_rsel_o = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - # accumulate cell reg-select outputs dest/src1/src2 - dest_rsel_o.append(dc.dest_rsel_o[rn]) - # connect cell reg-select outputs to Reg Vector In - m.d.comb += rsv.dest_rsel_i.eq(Cat(*dest_rsel_o)), - - # accumulate Reg-Sel Vector outputs - dest_rsel.append(rsv.dest_rsel_o) - - # ... and output them from this module (horizontal, width=REGs) - m.d.comb += self.dest_rsel_o.eq(Cat(*dest_rsel)) - - # same for src - for i in range(self.n_src): - src_rsel = [] - for rn in range(self.n_reg_col): - rsv = regrsv[rn] - src_rsel_o = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - # accumulate cell reg-select outputs dest/src1/src2 - src_rsel_o.append(dc.src_rsel_o[i][rn]) - # connect cell reg-select outputs to Reg Vector In - m.d.comb += rsv.src_rsel_i[i].eq(Cat(*src_rsel_o)), - # accumulate Reg-Sel Vector outputs - src_rsel.append(rsv.src_rsel_o[i]) - - # ... and output them from this module (horizontal, width=REGs) - m.d.comb += self.src_rsel_o[i].eq(Cat(*src_rsel)) - - # --- - # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i - # --- - for fu in range(self.n_fu_row): - dc = dm[fu] - # wire up inputs from module to row cell inputs (Cat is gooood) - m.d.comb += [dc.dest_i.eq(self.dest_i), - dc.rd_pend_i.eq(self.rd_pend_i), - dc.wr_pend_i.eq(self.wr_pend_i), - ] - # same for src - for i in range(self.n_src): - for fu in range(self.n_fu_row): - dc = dm[fu] - # wire up inputs from module to row cell inputs (Cat is gooood) - m.d.comb += dc.src_i[i].eq(self.src_i[i]) - - # accumulate rsel bits into read/write pending vectors. 
- rd_pend_v = [] - wr_pend_v = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - rd_pend_v.append(dc.v_rd_rsel_o) - wr_pend_v.append(dc.v_wr_rsel_o) - rd_v = GlobalPending(self.n_reg_col, rd_pend_v) - wr_v = GlobalPending(self.n_reg_col, wr_pend_v) - m.submodules.rd_v = rd_v - m.submodules.wr_v = wr_v - - m.d.comb += self.v_rd_rsel_o.eq(rd_v.g_pend_o) - m.d.comb += self.v_wr_rsel_o.eq(wr_v.g_pend_o) - - # --- - # connect Dep issue_i/go_rd_i/go_wr_i to module issue_i/go_rd/go_wr - # --- - go_rd_i = [] - go_wr_i = [] - issue_i = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - # accumulate cell fwd outputs for dest/src1/src2 - go_rd_i.append(dc.go_rd_i) - go_wr_i.append(dc.go_wr_i) - issue_i.append(dc.issue_i) - # wire up inputs from module to row cell inputs (Cat is gooood) - m.d.comb += [Cat(*go_rd_i).eq(self.go_rd_i), - Cat(*go_wr_i).eq(self.go_wr_i), - Cat(*issue_i).eq(self.issue_i), - ] - - # --- - # connect Dep go_die_i - # --- - if cancel_mode: - for fu in range(self.n_fu_row): - dc = dm[fu] - go_die = Repl(self.go_die_i[fu], self.n_fu_row) - go_die = go_die | self.cancel[fu] - m.d.comb += dc.go_die_i.eq(go_die) - else: - go_die_i = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - # accumulate cell fwd outputs for dest/src1/src2 - go_die_i.append(dc.go_die_i) - # wire up inputs from module to row cell inputs (Cat is gooood) - m.d.comb += Cat(*go_die_i).eq(self.go_die_i) - return m - - def __iter__(self): - yield self.dest_i - yield from self.src_i - yield self.issue_i - yield self.go_wr_i - yield self.go_rd_i - yield self.go_die_i - yield self.dest_rsel_o - yield from self.src_rsel_o - yield self.wr_pend_o - yield self.rd_pend_o - yield self.wr_pend_i - yield self.rd_pend_i - yield self.v_wr_rsel_o - yield self.v_rd_rsel_o - yield from self.rd_src_pend_o - - def ports(self): - return list(self) - -def d_matrix_sim(dut): - """ XXX TODO - """ - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield 
dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_d_matrix(): - dut = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_fu_reg_matrix.il", "w") as f: - f.write(vl) - - run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_reg_matrix.vcd') - -if __name__ == '__main__': - test_d_matrix() diff --git a/src/scoreboard/fu_wr_pending.py b/src/scoreboard/fu_wr_pending.py deleted file mode 100644 index d0bcb954..00000000 --- a/src/scoreboard/fu_wr_pending.py +++ /dev/null @@ -1,29 +0,0 @@ -from nmigen import Elaboratable, Module, Signal, Array - - -class FU_RW_Pend(Elaboratable): - """ these are allocated per-FU (horizontally), - and are of length reg_count - """ - def __init__(self, reg_count, n_src): - self.n_src = n_src - self.reg_count = reg_count - self.dest_fwd_i = Signal(reg_count, reset_less=True) - src = [] - for i in range(n_src): - j = i + 1 # name numbering to match src1/src2 - src.append(Signal(reg_count, name="src%d" % j, reset_less=True)) - self.src_fwd_i = Array(src) - - self.reg_wr_pend_o = Signal(reset_less=True) - self.reg_rd_pend_o = Signal(reset_less=True) - self.reg_rd_src_pend_o = Signal(n_src, reset_less=True) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.reg_wr_pend_o.eq(self.dest_fwd_i.bool()) - for i in range(self.n_src): - m.d.comb += self.reg_rd_src_pend_o[i].eq(self.src_fwd_i[i].bool()) - m.d.comb += self.reg_rd_pend_o.eq(self.reg_rd_src_pend_o.bool()) - return m - diff --git a/src/scoreboard/fumem_dep_cell.py b/src/scoreboard/fumem_dep_cell.py deleted file mode 100644 index 982b55a3..00000000 --- a/src/scoreboard/fumem_dep_cell.py +++ /dev/null @@ -1,92 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, 
Signal, Const, Elaboratable -from nmutil.latch import SRLatch - - -class FUMemDependenceCell(Elaboratable): - """ implements 11.4.7 mitch alsup dependence cell, p27 - """ - def __init__(self, dummy, n_fu=1): - self.n_fu = n_fu - self.dummy = Const(~(1< self.qlen_o) - with m.If(qinmax): - comb += self.n_sub_o.eq(self.qlen_o) - with m.Else(): - comb += self.n_sub_o.eq(self.n_sub_i) - - # work out how many new items are going to be in the queue - comb += left.eq(self.qlen_o )#- self.n_sub_o) - comb += spare.eq(mqlen - self.p_add_i) - comb += qmaxed.eq(left <= spare) - comb += self.p_ready_o.eq(qmaxed & (self.p_add_i != 0)) - - # put q (flattened) into output - for i in range(self.n_out): - opos = Signal(mqbits) - comb += opos.eq(end_q + i) - comb += cat(self.data_o[i]).eq(self.q[opos]) - - with m.If(self.n_sub_o): - # ok now the end's moved - sync += end_q.eq(end_q + self.n_sub_o) - - with m.If(self.p_ready_o): - # copy in the input... insanely gate-costly... *sigh*... - for i in range(self.n_in): - with m.If(self.p_add_i > Const(i, len(self.p_add_i))): - ipos = Signal(mqbits) - comb += ipos.eq(start_q + i) # should roll round - sync += self.q[ipos].eq(cat(self.data_i[i])) - sync += start_q.eq(start_q + self.p_add_i) - - with m.If(self.p_ready_o): - # update the queue length - add2 = Signal(mqbits+1) - comb += add2.eq(self.qlen_o + self.p_add_i) - sync += self.qlen_o.eq(add2 - self.n_sub_o) - with m.Else(): - sync += self.qlen_o.eq(self.qlen_o - self.n_sub_o) - - return m - - def __iter__(self): - yield from self.q - - yield self.p_ready_o - for o in self.data_i: - yield from list(o) - yield self.p_add_i - - for o in self.data_o: - yield from list(o) - yield self.n_sub_i - yield self.n_sub_o - - def ports(self): - return list(self) - - -def instruction_q_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - 
yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_instruction_q(): - dut = InstructionQ(16, 4, 4, n_in=2, n_out=2) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_instruction_q.il", "w") as f: - f.write(vl) - - run_simulation(dut, instruction_q_sim(dut), - vcd_name='test_instruction_q.vcd') - -if __name__ == '__main__': - test_instruction_q() diff --git a/src/scoreboard/issue_unit.py b/src/scoreboard/issue_unit.py deleted file mode 100644 index 3ec2a31c..00000000 --- a/src/scoreboard/issue_unit.py +++ /dev/null @@ -1,278 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Cat, Array, Const, Repl, Elaboratable -from nmigen.lib.coding import Decoder - -from scoreboard.group_picker import PriorityPicker - - -class RegDecode(Elaboratable): - """ decodes registers into unary - - Inputs - - * :wid: register file width - """ - def __init__(self, wid): - self.reg_width = wid - - # inputs - self.enable_i = Signal(reset_less=True) # enable decoders - self.dest_i = Signal(range(wid), reset_less=True) # Dest R# in - self.src1_i = Signal(range(wid), reset_less=True) # oper1 R# in - self.src2_i = Signal(range(wid), reset_less=True) # oper2 R# in - - # outputs - self.dest_o = Signal(wid, reset_less=True) # Dest unary out - self.src1_o = Signal(wid, reset_less=True) # oper1 unary out - self.src2_o = Signal(wid, reset_less=True) # oper2 unary out - - def elaborate(self, platform): - m = Module() - m.submodules.dest_d = dest_d = Decoder(self.reg_width) - m.submodules.src1_d = src1_d = Decoder(self.reg_width) - m.submodules.src2_d = src2_d = Decoder(self.reg_width) - - # dest decoder: write-pending - for d, i, o in [(dest_d, self.dest_i, self.dest_o), - (src1_d, self.src1_i, self.src1_o), - (src2_d, self.src2_i, self.src2_o)]: - m.d.comb += d.i.eq(i) - m.d.comb += d.n.eq(~self.enable_i) - m.d.comb 
+= o.eq(d.o) - - return m - - def __iter__(self): - yield self.enable_i - yield self.dest_i - yield self.src1_i - yield self.src2_i - yield self.dest_o - yield self.src1_o - yield self.src2_o - - def ports(self): - return list(self) - - -class IssueUnitGroup(Elaboratable): - """ Manages a batch of Computation Units all of which can do the same task - - A priority picker will allocate one instruction in this cycle based - on whether the others are busy. - - insn_i indicates to this module that there is an instruction to be - issued which this group can handle - - busy_i is a vector of signals that indicate, in this cycle, which - of the units are currently busy. - - busy_o indicates whether it is "safe to proceed" i.e. whether - there is a unit here that can *be* issued an instruction - - fn_issue_o indicates, out of the available (non-busy) units, - which one may be selected - """ - def __init__(self, n_insns): - """ Set up inputs and outputs for the Group - - Input Parameters - - * :n_insns: number of instructions in this issue unit. 
- """ - self.n_insns = n_insns - - # inputs - self.insn_i = Signal(reset_less=True, name="insn_i") - self.busy_i = Signal(n_insns, reset_less=True, name="busy_i") - - # outputs - self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o") - self.busy_o = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - - if self.n_insns == 0: - return m - - m.submodules.pick = pick = PriorityPicker(self.n_insns) - - # temporaries - allissue = Signal(self.n_insns, reset_less=True) - - m.d.comb += allissue.eq(Repl(self.insn_i, self.n_insns)) - # Pick one (and only one) of the units to proceed in this cycle - m.d.comb += pick.i.eq(~self.busy_i & allissue) - - # "Safe to issue" condition is basically when all units are not busy - m.d.comb += self.busy_o.eq(~((~self.busy_i).bool())) - - # Picker only raises one signal, therefore it's also the fn_issue - m.d.comb += self.fn_issue_o.eq(pick.o & Repl(~self.busy_o, self.n_insns)) - - return m - - def __iter__(self): - yield self.insn_i - yield self.busy_i - yield self.fn_issue_o - yield self.g_issue_o - - def ports(self): - return list(self) - - -class IssueUnitArray(Elaboratable): - """ Convenience module that amalgamates the issue and busy signals - - unit issue_i is to be set externally, at the same time as the - ALU group oper_i - """ - def __init__(self, units): - self.units = units - self.issue_o = Signal(reset_less=True) - n_insns = 0 - for u in self.units: - n_insns += len(u.fn_issue_o) - self.busy_i = Signal(n_insns, reset_less=True) - self.fn_issue_o = Signal(n_insns, reset_less=True) - self.n_insns = n_insns - - def elaborate(self, platform): - m = Module() - for i, u in enumerate(self.units): - setattr(m.submodules, "issue%d" % i, u) - - g_issue_o = [] - busy_i = [] - fn_issue_o = [] - for u in self.units: - busy_i.append(u.busy_i) - g_issue_o.append(u.busy_o) - fn_issue_o.append(u.fn_issue_o) - m.d.comb += self.issue_o.eq(~(Cat(*g_issue_o).bool())) - m.d.comb += 
self.fn_issue_o.eq(Cat(*fn_issue_o)) - m.d.comb += Cat(*busy_i).eq(self.busy_i) - - return m - - def ports(self): - yield self.busy_i - yield self.issue_o - yield self.fn_issue_o - yield from self.units - - - -class IssueUnit(Elaboratable): - """ implements 11.4.14 issue unit, p50 - - Inputs - - * :n_insns: number of instructions in this issue unit. - """ - def __init__(self, n_insns): - self.n_insns = n_insns - - # inputs - self.insn_i = Signal(n_insns, reset_less=True, name="insn_i") - self.busy_i = Signal(n_insns, reset_less=True, name="busy_i") - - # outputs - self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o") - self.g_issue_o = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - - if self.n_insns == 0: - return m - - # temporaries - fu_stall = Signal(reset_less=True) - - ib_l = [] - for i in range(self.n_insns): - ib_l.append(self.insn_i[i] & self.busy_i[i]) - m.d.comb += fu_stall.eq(Cat(*ib_l).bool()) - m.d.comb += self.g_issue_o.eq(~(fu_stall)) - for i in range(self.n_insns): - m.d.comb += self.fn_issue_o[i].eq(self.g_issue_o & self.insn_i[i]) - - return m - - def __iter__(self): - yield self.insn_i - yield self.busy_i - yield self.fn_issue_o - yield self.g_issue_o - - def ports(self): - return list(self) - - -class IntFPIssueUnit(Elaboratable): - def __init__(self, n_int_insns, n_fp_insns): - self.i = IssueUnit(n_int_insns) - self.f = IssueUnit(n_fp_insns) - self.issue_o = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - m.submodules.intissue = self.i - m.submodules.fpissue = self.f - - m.d.comb += self.issue_o.eq(self.i.g_issue_o | self.f.g_issue_o) - - return m - - def ports(self): - yield self.issue_o - yield from self.i - yield from self.f - - -def issue_unit_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield 
dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_issue_unit(): - dut = IssueUnitGroup(3) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_issue_unit_group.il", "w") as f: - f.write(vl) - - dut = IssueUnit(32, 3) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_issue_unit.il", "w") as f: - f.write(vl) - - dut = IntFPIssueUnit(32, 3, 3) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_intfp_issue_unit.il", "w") as f: - f.write(vl) - - run_simulation(dut, issue_unit_sim(dut), vcd_name='test_issue_unit.vcd') - -if __name__ == '__main__': - test_issue_unit() diff --git a/src/scoreboard/ldst_dep_cell.py b/src/scoreboard/ldst_dep_cell.py deleted file mode 100644 index 70f4b9ba..00000000 --- a/src/scoreboard/ldst_dep_cell.py +++ /dev/null @@ -1,116 +0,0 @@ -""" Mitch Alsup 6600-style LD/ST scoreboard Dependency Cell - -Relevant bugreports: - -* http://bugs.libre-riscv.org/show_bug.cgi?id=81 - -""" - -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Repl, Elaboratable -from nmutil.latch import SRLatch - - -class LDSTDepCell(Elaboratable): - """ implements 11.4.12 mitch alsup load/store dependence cell, p45 - """ - def __init__(self, n_ls=1): - self.n_ls = n_ls - # inputs - self.load_h_i = Signal(reset_less=True) # load in (left) - self.stor_h_i = Signal(reset_less=True) # store in (left) - self.load_v_i = Signal(n_ls, reset_less=True) # load in (top) - self.stor_v_i = Signal(n_ls, reset_less=True) # store in (top) - self.issue_i = Signal(reset_less=True) # Issue in (left) - self.go_die_i = Signal(reset_less=True) # Issue in (left) - - # load / store hit - basically connect these to go_wr from LD/STCompUnit - # LD.go_wr -> load_hit_i, ST.go_wr -> stwd_hit_i. 
- self.load_hit_i = Signal(n_ls, reset_less=True) # ld hit in (right) - self.stwd_hit_i = Signal(n_ls, reset_less=True) # st w/ hit in (right) - - # outputs (latched rd/wr pend) - self.ld_hold_st_o = Signal(reset_less=True) # ld holds st out (l) - self.st_hold_ld_o = Signal(reset_less=True) # st holds ld out (l) - - def elaborate(self, platform): - m = Module() - m.submodules.war_l = war_l = SRLatch(sync=False, llen=self.n_ls) # WaR - m.submodules.raw_l = raw_l = SRLatch(sync=False, llen=self.n_ls) # RaW - - # temporaries (repeat-extend) - issue = Repl(self.issue_i, self.n_ls) - die = Repl(self.go_die_i, self.n_ls) - - # issue & store & load - used for WAR Setting. LD is left, ST is top - i_s = Signal(reset_less=True) - i_s_l = Signal(self.n_ls, reset_less=True) - m.d.comb += i_s.eq(issue & self.stor_h_i) # horizontal single-signal - m.d.comb += i_s_l.eq(Repl(i_s, self.n_ls) & self.load_v_i) # multi, vert - - # issue & load & store - used for RAW Setting. ST is left, LD is top - i_l = Signal(reset_less=True) - i_l_s = Signal(self.n_ls, reset_less=True) - m.d.comb += i_l.eq(issue & self.load_h_i) # horizontal single-signal - m.d.comb += i_l_s.eq(Repl(i_l, self.n_ls) & self.stor_v_i) # multi, vert - - # write after read latch: loads block stores - m.d.comb += war_l.s.eq(i_s_l) - m.d.comb += war_l.r.eq(die | ~self.load_v_i) # reset on LD - - # read after write latch: stores block loads - m.d.comb += raw_l.s.eq(i_s_l) - m.d.comb += raw_l.r.eq(die | ~self.stor_v_i) # reset on ST - - # Hold results (read out horizontally, accumulate in OR fashion) - m.d.comb += self.ld_hold_st_o.eq((war_l.qn & self.load_hit_i).bool()) - m.d.comb += self.st_hold_ld_o.eq((raw_l.qn & self.stwd_hit_i).bool()) - - return m - - def __iter__(self): - yield self.load_h_i - yield self.load_v_i - yield self.stor_h_i - yield self.stor_h_i - yield self.issue_i - yield self.load_hit_i - yield self.stwd_hit_i - yield self.ld_hold_st_o - yield self.st_hold_ld_o - - def ports(self): - return list(self) 
- - -def dcell_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_dcell(): - dut = LDSTDepCell() - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_ldst_dcell.il", "w") as f: - f.write(vl) - - run_simulation(dut, dcell_sim(dut), vcd_name='test_ldst_dcell.vcd') - -if __name__ == '__main__': - test_dcell() diff --git a/src/scoreboard/ldst_matrix.py b/src/scoreboard/ldst_matrix.py deleted file mode 100644 index 1bb75b03..00000000 --- a/src/scoreboard/ldst_matrix.py +++ /dev/null @@ -1,163 +0,0 @@ -""" Mitch Alsup 6600-style LD/ST Memory Scoreboard Matrix (sparse vector) - -6600 LD/ST Dependency Table Matrix inputs / outputs ---------------------------------------------------- - -Relevant comments (p45-46): - -* If there are no WAR dependencies on a Load instruction with a computed - address it can assert Bank_Addressable and Translate_Addressable. - -* If there are no RAW dependencies on a Store instruction with both a - write permission and store data present it can assert Bank_Addressable - -Relevant bugreports: - -* http://bugs.libre-riscv.org/show_bug.cgi?id=81 - -Notes: - -* Load Hit (or Store Hit with Data) are asserted by the LD/ST Computation - Unit when it has data and address ready - -* Asserting the ld_hit_i (or stwd_hit_i) *requires* that the output be - captured or at least taken into consideration for the next LD/STs - *right then*. Failure to observe the xx_hold_xx_o *will* result in - data corruption, as they are *only* asserted if xx_hit_i is asserted - -* The hold signals still have to go through "maybe address clashes" - detection, they cannot just be used as-is to stop a LD/ST. 
- -""" - -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Elaboratable, Array, Cat, Const - -from ldst_dep_cell import LDSTDepCell - - -class LDSTDepMatrix(Elaboratable): - """ implements 11.4.12 mitch alsup LD/ST Dependency Matrix, p46 - actually a sparse matrix along the diagonal. - - load-hold-store and store-hold-load accumulate in a priority-picking - fashion, ORing together. the OR gate from the dependency cell is - here. - """ - def __init__(self, n_ldst): - self.n_ldst = n_ldst # X and Y (FUs) - self.ld_pend_i = Signal(n_ldst, reset_less=True) # load pending in - self.st_pend_i = Signal(n_ldst, reset_less=True) # store pending in - self.issue_i = Signal(n_ldst, reset_less=True) # Issue in - self.go_die_i = Signal(n_ldst, reset_less=True) # Die/Reset in - - self.load_hit_i = Signal(n_ldst, reset_less=True) # load hit in - self.stwd_hit_i = Signal(n_ldst, reset_less=True) # store w/data hit in - - # outputs - self.ld_hold_st_o = Signal(n_ldst, reset_less=True) # load holds st out - self.st_hold_ld_o = Signal(n_ldst, reset_less=True) # st holds load out - - def elaborate(self, platform): - m = Module() - - # --- - # matrix of dependency cells. 
actually, LDSTDepCell is a row, now - # --- - dm = Array(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst)) - for fu in range(self.n_ldst): - setattr(m.submodules, "dm_fu%d" % (fu), dm[fu]) - - # --- - # connect Function Unit vector, all horizontal - # --- - lhs_l = [] - shl_l = [] - issue_l = [] - go_die_l = [] - lh_l = [] - sh_l = [] - for fu in range(self.n_ldst): - dc = dm[fu] - # accumulate load-hold-store / store-hold-load bits (horizontal) - lhs_l.append(dc.ld_hold_st_o) - shl_l.append(dc.st_hold_ld_o) - # accumulate inputs (for Cat'ing later) - TODO: must be a better way - issue_l.append(dc.issue_i) - go_die_l.append(dc.go_die_i) - - # load-hit and store-with-data-hit go in vertically (top) - m.d.comb += [dc.load_hit_i.eq(self.load_hit_i), - dc.stwd_hit_i.eq(self.stwd_hit_i), - dc.load_v_i.eq(self.ld_pend_i), - dc.stor_v_i.eq(self.st_pend_i), - ] - - # connect cell inputs using Cat(*list_of_stuff) - m.d.comb += [Cat(*issue_l).eq(self.issue_i), - Cat(*go_die_l).eq(self.go_die_i), - ] - # connect the load-hold-store / store-hold-load OR-accumulated outputs - m.d.comb += self.ld_hold_st_o.eq(Cat(*lhs_l)) - m.d.comb += self.st_hold_ld_o.eq(Cat(*shl_l)) - - # the load/store input also needs to be connected to "top" (vertically) - for fu in range(self.n_ldst): - load_h_l = [] - stor_h_l = [] - for fux in range(self.n_ldst): - dc = dm[fux] - load_h_l.append(dc.load_h_i) - stor_h_l.append(dc.stor_h_i) - m.d.comb += [Cat(*load_h_l).eq(self.ld_pend_i), - Cat(*stor_h_l).eq(self.st_pend_i), - ] - - return m - - def __iter__(self): - yield self.ld_pend_i - yield self.st_pend_i - yield self.issue_i - yield self.go_die_i - yield self.load_hit_i - yield self.stwd_hit_i - yield self.ld_hold_st_o - yield self.st_hold_ld_o - - def ports(self): - return list(self) - -def d_matrix_sim(dut): - """ XXX TODO - """ - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield 
dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_d_matrix(): - dut = LDSTDepMatrix(n_ldst=4) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_ld_st_matrix.il", "w") as f: - f.write(vl) - - run_simulation(dut, d_matrix_sim(dut), vcd_name='test_ld_st_matrix.vcd') - -if __name__ == '__main__': - test_d_matrix() diff --git a/src/scoreboard/mdm.py b/src/scoreboard/mdm.py deleted file mode 100644 index 184931ef..00000000 --- a/src/scoreboard/mdm.py +++ /dev/null @@ -1,22 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module - -from scoreboard.fu_reg_matrix import FURegDepMatrix -from scoreboard.addr_match import PartialAddrMatch - -class FUMemMatchMatrix(FURegDepMatrix, PartialAddrMatch): - """ implement a FU-Regs overload with memory-address matching - """ - def __init__(self, n_fu, addrbitwid): - PartialAddrMatch.__init__(self, n_fu, addrbitwid) - FURegDepMatrix.__init__(self, n_fu, n_fu, 1, self.addr_nomatch_o) - - def elaborate(self, platform): - m = Module() - PartialAddrMatch._elaborate(self, m, platform) - FURegDepMatrix._elaborate(self, m, platform) - - return m - - diff --git a/src/scoreboard/mem_dependence_cell.py b/src/scoreboard/mem_dependence_cell.py deleted file mode 100644 index 2958d864..00000000 --- a/src/scoreboard/mem_dependence_cell.py +++ /dev/null @@ -1,120 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl -from nmutil.latch import SRLatch - - -class MemDepRow(Elaboratable): - """ implements 1st phase Memory Depencency cell - """ - def __init__(self, n_reg): - self.n_reg = n_reg - # inputs - self.ld_i = Signal(n_reg, reset_less=True) # Dest in (top) - self.st_i = Signal(n_reg, reset_less=True) # oper1 in (top) - self.issue_i = 
Signal(reset_less=True) # Issue in (top) - - self.st_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top) - self.ld_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top) - self.v_st_rsel_o = Signal(n_reg, reset_less=True) # Read pend out (bot) - self.v_ld_rsel_o = Signal(n_reg, reset_less=True) # Write pend out (bot) - - self.go_ld_i = Signal(reset_less=True) # Go Write in (left) - self.go_st_i = Signal(reset_less=True) # Go Read in (left) - self.go_die_i = Signal(reset_less=True) # Go Die in (left) - - # for Register File Select Lines (vertical) - self.ld_rsel_o = Signal(n_reg, reset_less=True) # dest reg sel (bot) - self.st_rsel_o = Signal(n_reg, reset_less=True) # src1 reg sel (bot) - - # for Function Unit "forward progress" (horizontal) - self.ld_fwd_o = Signal(n_reg, reset_less=True) # dest FU fw (right) - self.st_fwd_o = Signal(n_reg, reset_less=True) # src1 FU fw (right) - - def elaborate(self, platform): - m = Module() - m.submodules.ld_c = ld_c = SRLatch(sync=False, llen=self.n_reg) - m.submodules.st_c = st_c = SRLatch(sync=False, llen=self.n_reg) - - # connect go_rd / go_wr (dest->wr, src->rd) - ld_die = Signal(reset_less=True) - st_die = Signal(reset_less=True) - m.d.comb += ld_die.eq(self.go_ld_i | self.go_die_i) - m.d.comb += st_die.eq(self.go_st_i | self.go_die_i) - m.d.comb += ld_c.r.eq(Repl(ld_die, self.n_reg)) - m.d.comb += st_c.r.eq(Repl(st_die, self.n_reg)) - - # connect input reg bit (unary) - i_ext = Repl(self.issue_i, self.n_reg) - m.d.comb += ld_c.s.eq(i_ext & self.ld_i) - m.d.comb += st_c.s.eq(i_ext & self.st_i) - - # connect up hazard checks: read-after-write and write-after-read - m.d.comb += self.ld_fwd_o.eq(ld_c.q & self.st_pend_i) - m.d.comb += self.st_fwd_o.eq(st_c.q & self.ld_pend_i) - - # connect reg-sel outputs - st_ext = Repl(self.go_st_i, self.n_reg) - ld_ext = Repl(self.go_ld_i, self.n_reg) - m.d.comb += self.ld_rsel_o.eq(ld_c.qlq & ld_ext) - m.d.comb += self.st_rsel_o.eq(st_c.qlq & st_ext) - - # to be accumulated 
to indicate if register is in use (globally) - # after ORing, is fed back in to st_pend_i / ld_pend_i - m.d.comb += self.v_st_rsel_o.eq(st_c.qlq) - m.d.comb += self.v_ld_rsel_o.eq(ld_c.qlq) - - return m - - def __iter__(self): - yield self.ld_i - yield self.st_i - yield self.st_pend_i - yield self.ld_pend_i - yield self.issue_i - yield self.go_ld_i - yield self.go_st_i - yield self.go_die_i - yield self.v_ld_rsel_o - yield self.v_st_rsel_o - yield self.ld_rsel_o - yield self.st_rsel_o - yield self.ld_fwd_o - yield self.st_fwd_o - - def ports(self): - return list(self) - - -def dcell_sim(dut): - yield dut.ld_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.st_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_st_i.eq(1) - yield - yield dut.go_st_i.eq(0) - yield - yield dut.go_ld_i.eq(1) - yield - yield dut.go_ld_i.eq(0) - yield - -def test_dcell(): - dut = MemDepRow(4) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_mem_drow.il", "w") as f: - f.write(vl) - - run_simulation(dut, dcell_sim(dut), vcd_name='test_mem_dcell.vcd') - -if __name__ == '__main__': - test_dcell() diff --git a/src/scoreboard/mem_fu_matrix.py b/src/scoreboard/mem_fu_matrix.py deleted file mode 100644 index 98595996..00000000 --- a/src/scoreboard/mem_fu_matrix.py +++ /dev/null @@ -1,218 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Elaboratable, Array, Cat - -from scoreboard.mem_dependence_cell import MemDepRow -from scoreboard.mem_fu_pending import MemFU_Pend -from scoreboard.mem_select import Mem_Rsv -from scoreboard.global_pending import GlobalPending - -""" - -""" - -class MemFUDepMatrix(Elaboratable): - """ implements 1st phase Memory-to-FU Dependency Matrix - """ - def __init__(self, n_fu_row, n_reg_col): - self.n_fu_row = n_fu_row # Y (FUs) ^v - self.n_reg_col = n_reg_col # X (Regs) <> - self.ld_i = 
Signal(n_reg_col, reset_less=True) # LD in (top) - self.st_i = Signal(n_reg_col, reset_less=True) # ST in (top) - - # Register "Global" vectors for determining RaW and WaR hazards - self.ld_pend_i = Signal(n_reg_col, reset_less=True) # ld pending (top) - self.st_pend_i = Signal(n_reg_col, reset_less=True) # st pending (top) - self.v_ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld pending (bot) - self.v_st_rsel_o = Signal(n_reg_col, reset_less=True) # st pending (bot) - - self.issue_i = Signal(n_fu_row, reset_less=True) # Issue in (top) - self.go_ld_i = Signal(n_fu_row, reset_less=True) # Go LOAD in (left) - self.go_st_i = Signal(n_fu_row, reset_less=True) # Go STOR in (left) - self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) - - # for Register File Select Lines (horizontal), per-reg - self.ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld reg (bot) - self.st_rsel_o = Signal(n_reg_col, reset_less=True) # st reg (bot) - - # for Function Unit "forward progress" (vertical), per-FU - self.ld_pend_o = Signal(n_fu_row, reset_less=True) # ld pending (right) - self.st_pend_o = Signal(n_fu_row, reset_less=True) # st pending (right) - - def elaborate(self, platform): - m = Module() - - # --- - # matrix of dependency cells - # --- - dm = Array(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row)) - for fu in range(self.n_fu_row): - setattr(m.submodules, "dr_fu%d" % fu, dm[fu]) - - # --- - # array of Function Unit Pending vectors - # --- - fupend = Array(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row)) - for fu in range(self.n_fu_row): - setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu]) - - # --- - # array of Register Reservation vectors - # --- - regrsv = Array(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col)) - for rn in range(self.n_reg_col): - setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn]) - - # --- - # connect Function Unit vector - # --- - ld_pend = [] - st_pend = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - fup = 
fupend[fu] - ld_fwd_o = [] - st_fwd_o = [] - for rn in range(self.n_reg_col): - # accumulate cell fwd outputs for dest/src1 - ld_fwd_o.append(dc.ld_fwd_o[rn]) - st_fwd_o.append(dc.st_fwd_o[rn]) - # connect cell fwd outputs to FU Vector in [Cat is gooood] - m.d.comb += [fup.ld_fwd_i.eq(Cat(*ld_fwd_o)), - fup.st_fwd_i.eq(Cat(*st_fwd_o)), - ] - # accumulate FU Vector outputs - ld_pend.append(fup.reg_ld_pend_o) - st_pend.append(fup.reg_st_pend_o) - - # ... and output them from this module (vertical, width=FUs) - m.d.comb += self.ld_pend_o.eq(Cat(*ld_pend)) - m.d.comb += self.st_pend_o.eq(Cat(*st_pend)) - - # --- - # connect Reg Selection vector - # --- - ld_rsel = [] - st_rsel = [] - for rn in range(self.n_reg_col): - rsv = regrsv[rn] - ld_rsel_o = [] - st_rsel_o = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - # accumulate cell reg-select outputs dest/src1 - ld_rsel_o.append(dc.ld_rsel_o[rn]) - st_rsel_o.append(dc.st_rsel_o[rn]) - # connect cell reg-select outputs to Reg Vector In - m.d.comb += [rsv.ld_rsel_i.eq(Cat(*ld_rsel_o)), - rsv.st_rsel_i.eq(Cat(*st_rsel_o)), - ] - # accumulate Reg-Sel Vector outputs - ld_rsel.append(rsv.ld_rsel_o) - st_rsel.append(rsv.st_rsel_o) - - # ... and output them from this module (horizontal, width=REGs) - m.d.comb += self.ld_rsel_o.eq(Cat(*ld_rsel)) - m.d.comb += self.st_rsel_o.eq(Cat(*st_rsel)) - - # --- - # connect Dependency Matrix dest/src1/issue to module d/s/s/i - # --- - for fu in range(self.n_fu_row): - dc = dm[fu] - # wire up inputs from module to row cell inputs (Cat is gooood) - m.d.comb += [dc.ld_i.eq(self.ld_i), - dc.st_i.eq(self.st_i), - dc.st_pend_i.eq(self.st_pend_i), - dc.ld_pend_i.eq(self.ld_pend_i), - ] - - # accumulate rsel bits into read/write pending vectors. 
- st_pend_v = [] - ld_pend_v = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - st_pend_v.append(dc.v_st_rsel_o) - ld_pend_v.append(dc.v_ld_rsel_o) - st_v = GlobalPending(self.n_reg_col, st_pend_v) - ld_v = GlobalPending(self.n_reg_col, ld_pend_v) - m.submodules.st_v = st_v - m.submodules.ld_v = ld_v - - m.d.comb += self.v_st_rsel_o.eq(st_v.g_pend_o) - m.d.comb += self.v_ld_rsel_o.eq(ld_v.g_pend_o) - - # --- - # connect Dep issue_i/go_st_i/go_ld_i to module issue_i/go_rd/go_wr - # --- - go_st_i = [] - go_ld_i = [] - go_die_i = [] - issue_i = [] - for fu in range(self.n_fu_row): - dc = dm[fu] - # accumulate cell fwd outputs for dest/src1 - go_st_i.append(dc.go_st_i) - go_ld_i.append(dc.go_ld_i) - go_die_i.append(dc.go_die_i) - issue_i.append(dc.issue_i) - # wire up inputs from module to row cell inputs (Cat is gooood) - m.d.comb += [Cat(*go_st_i).eq(self.go_st_i), - Cat(*go_ld_i).eq(self.go_ld_i), - Cat(*go_die_i).eq(self.go_die_i), - Cat(*issue_i).eq(self.issue_i), - ] - - return m - - def __iter__(self): - yield self.ld_i - yield self.st_i - yield self.issue_i - yield self.go_ld_i - yield self.go_st_i - yield self.go_die_i - yield self.ld_rsel_o - yield self.st_rsel_o - yield self.ld_pend_o - yield self.st_pend_o - yield self.ld_pend_i - yield self.st_pend_i - yield self.ld_rsel_o - yield self.st_rsel_o - - def ports(self): - return list(self) - -def d_matrix_sim(dut): - """ XXX TODO - """ - yield dut.ld_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.st_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_st_i.eq(1) - yield - yield dut.go_st_i.eq(0) - yield - yield dut.go_ld_i.eq(1) - yield - yield dut.go_ld_i.eq(0) - yield - -def test_d_matrix(): - dut = MemFUDepMatrix(n_fu_row=3, n_reg_col=3) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_fu_mem_matrix.il", "w") as f: - f.write(vl) - - run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd') - -if 
__name__ == '__main__': - test_d_matrix() diff --git a/src/scoreboard/mem_fu_pending.py b/src/scoreboard/mem_fu_pending.py deleted file mode 100644 index 951f7ac1..00000000 --- a/src/scoreboard/mem_fu_pending.py +++ /dev/null @@ -1,22 +0,0 @@ -from nmigen import Elaboratable, Module, Signal, Cat - - -class MemFU_Pend(Elaboratable): - """ these are allocated per-FU (horizontally), - and are of length reg_count - """ - def __init__(self, reg_count): - self.reg_count = reg_count - self.ld_fwd_i = Signal(reg_count, reset_less=True) - self.st_fwd_i = Signal(reg_count, reset_less=True) - - self.reg_ld_pend_o = Signal(reset_less=True) - self.reg_st_pend_o = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.reg_ld_pend_o.eq(self.ld_fwd_i.bool()) - m.d.comb += self.reg_st_pend_o.eq(self.st_fwd_i.bool()) - - return m - diff --git a/src/scoreboard/mem_select.py b/src/scoreboard/mem_select.py deleted file mode 100644 index 627d7d10..00000000 --- a/src/scoreboard/mem_select.py +++ /dev/null @@ -1,20 +0,0 @@ -from nmigen import Elaboratable, Module, Signal - - -class Mem_Rsv(Elaboratable): - """ these are allocated per-Register (vertically), - and are each of length fu_count - """ - def __init__(self, fu_count): - self.fu_count = fu_count - self.ld_rsel_i = Signal(fu_count, reset_less=True) - self.st_rsel_i = Signal(fu_count, reset_less=True) - self.ld_rsel_o = Signal(reset_less=True) - self.st_rsel_o = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.ld_rsel_o.eq(self.ld_rsel_i.bool()) - m.d.comb += self.st_rsel_o.eq(self.st_rsel_i.bool()) - return m - diff --git a/src/scoreboard/memfu.py b/src/scoreboard/memfu.py deleted file mode 100644 index 857d96c9..00000000 --- a/src/scoreboard/memfu.py +++ /dev/null @@ -1,120 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Array, Elaboratable - -from 
scoreboard.fu_fu_matrix import FUFUDepMatrix -from scoreboard.mdm import FUMemMatchMatrix - - -class MemFunctionUnits(Elaboratable): - - def __init__(self, n_ldsts, addrbitwid): - self.n_ldsts = n_ldsts - self.bitwid = addrbitwid - - self.st_i = Signal(n_ldsts, reset_less=True) # Dest R# in - self.ld_i = Signal(n_ldsts, reset_less=True) # oper1 R# in - - self.g_int_ld_pend_o = Signal(n_ldsts, reset_less=True) - self.g_int_st_pend_o = Signal(n_ldsts, reset_less=True) - - self.st_rsel_o = Signal(n_ldsts, reset_less=True) # dest reg (bot) - self.ld_rsel_o = Signal(n_ldsts, reset_less=True) # src1 reg (bot) - - self.loadable_o = Signal(n_ldsts, reset_less=True) - self.storable_o = Signal(n_ldsts, reset_less=True) - self.addr_nomatch_o = Signal(n_ldsts, reset_less=True) - - self.go_ld_i = Signal(n_ldsts, reset_less=True) - self.go_st_i = Signal(n_ldsts, reset_less=True) - self.go_die_i = Signal(n_ldsts, reset_less=True) - self.fn_issue_i = Signal(n_ldsts, reset_less=True) - - # address matching - self.addrs_i = Array(Signal(self.bitwid, name="addrs_i%d" % i) \ - for i in range(n_ldsts)) - self.addr_we_i = Signal(n_ldsts) # write-enable for incoming address - self.addr_en_i = Signal(n_ldsts) # address latched in - self.addr_rs_i = Signal(n_ldsts) # address deactivated - - # Note: FURegs st_pend_o is also outputted from here, for use in WaWGrid - - def elaborate(self, platform): - m = Module() - comb = m.d.comb - sync = m.d.sync - - n_fus = self.n_ldsts - - # Integer FU-FU Dep Matrix - intfudeps = FUFUDepMatrix(n_fus, n_fus) - m.submodules.intfudeps = intfudeps - # Integer FU-Reg Dep Matrix - intregdeps = FUMemMatchMatrix(n_fus, self.bitwid) - m.submodules.intregdeps = intregdeps - - # ok, because we do not know in advance what the AGEN (address gen) - # is, we have to make a transitive dependency set. i.e. the LD - # (or ST) being requested now must depend on ALL prior LDs *AND* STs. - # these get dropped very rapidly once AGEN is carried out. 
- # XXX TODO - - # connect fureg matrix as a mem system - comb += self.g_int_ld_pend_o.eq(intregdeps.v_rd_rsel_o) - comb += self.g_int_st_pend_o.eq(intregdeps.v_wr_rsel_o) - - comb += intregdeps.rd_pend_i.eq(intregdeps.v_rd_rsel_o) - comb += intregdeps.wr_pend_i.eq(intregdeps.v_wr_rsel_o) - - comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o) - comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o) - self.st_pend_o = intregdeps.wr_pend_o # also output for use in WaWGrid - - comb += intfudeps.issue_i.eq(self.fn_issue_i) - comb += intfudeps.go_rd_i.eq(self.go_ld_i) - comb += intfudeps.go_wr_i.eq(self.go_st_i) - comb += intfudeps.go_die_i.eq(self.go_die_i) - comb += self.loadable_o.eq(intfudeps.readable_o) - comb += self.storable_o.eq(intfudeps.writable_o) - comb += self.addr_nomatch_o.eq(intregdeps.addr_nomatch_o) - - # Connect function issue / arrays, and dest/src1/src2 - comb += intregdeps.dest_i.eq(self.st_i) - comb += intregdeps.src_i[0].eq(self.ld_i) - - comb += intregdeps.go_rd_i.eq(self.go_ld_i) - comb += intregdeps.go_wr_i.eq(self.go_st_i) - comb += intregdeps.go_die_i.eq(self.go_die_i) - comb += intregdeps.issue_i.eq(self.fn_issue_i) - - comb += self.st_rsel_o.eq(intregdeps.dest_rsel_o) - comb += self.ld_rsel_o.eq(intregdeps.src_rsel_o[0]) - - # connect address matching: these get connected to the Addr CUs - for i in range(self.n_ldsts): - comb += intregdeps.addrs_i[i].eq(self.addrs_i[i]) - comb += intregdeps.addr_we_i.eq(self.addr_we_i) - comb += intregdeps.addr_en_i.eq(self.addr_en_i) - comb += intregdeps.addr_rs_i.eq(self.addr_rs_i) - - return m - - def __iter__(self): - yield self.ld_i - yield self.st_i - yield self.g_int_st_pend_o - yield self.g_int_ld_pend_o - yield self.ld_rsel_o - yield self.st_rsel_o - yield self.loadable_o - yield self.storable_o - yield self.go_st_i - yield self.go_ld_i - yield self.go_die_i - yield self.fn_issue_i - yield from self.addrs_i - yield self.addr_we_i - yield self.addr_en_i - - def ports(self): - return list(self) diff 
--git a/src/scoreboard/reg_select.py b/src/scoreboard/reg_select.py deleted file mode 100644 index 3919cce3..00000000 --- a/src/scoreboard/reg_select.py +++ /dev/null @@ -1,24 +0,0 @@ -from nmigen import Elaboratable, Module, Signal, Array - - -class Reg_Rsv(Elaboratable): - """ these are allocated per-Register (vertically), - and are each of length fu_count - """ - def __init__(self, fu_count, n_src): - self.n_src = n_src - self.fu_count = fu_count - self.dest_rsel_i = Signal(fu_count, reset_less=True) - self.src_rsel_i = Array(Signal(fu_count, name="src_rsel_i", - reset_less=True) \ - for i in range(n_src)) - self.dest_rsel_o = Signal(reset_less=True) - self.src_rsel_o = Signal(n_src, reset_less=True) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.dest_rsel_o.eq(self.dest_rsel_i.bool()) - for i in range(self.n_src): - m.d.comb += self.src_rsel_o[i].eq(self.src_rsel_i[i].bool()) - return m - diff --git a/src/scoreboard/shadow.py b/src/scoreboard/shadow.py deleted file mode 100644 index 12f20893..00000000 --- a/src/scoreboard/shadow.py +++ /dev/null @@ -1,226 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Cat, Array, Const, Elaboratable, Repl -from nmigen.lib.coding import Decoder - -from scoreboard.shadow_fn import ShadowFn - - -class ShadowMatrix(Elaboratable): - """ Matrix of Shadow Functions. One per FU. - - Inputs - * :n_fus: register file width - * :shadow_wid: number of shadow/fail/good/go_die sets - - Notes: - - * Shadow enable/fail/good are all connected to all Shadow Functions - (incoming at the top) - - * Output is an array of "shadow active" (schroedinger wires: neither - alive nor dead) and an array of "go die" signals, one per FU. - - * the shadown must be connected to the Computation Unit's - write release request, preventing it (ANDing) from firing - (and thus preventing Writable. 
this by the way being the - whole point of having the Shadow Matrix...) - - * go_die_o must be connected to *both* the Computation Unit's - src-operand and result-operand latch resets, causing both - of them to reset. - - * go_die_o also needs to be wired into the Dependency and Function - Unit Matrices by way of over-enabling (ORing) into Go_Read and - Go_Write, resetting every cell that is required to "die" - """ - def __init__(self, n_fus, shadow_wid=0, syncreset=False): - self.syncreset = syncreset - self.n_fus = n_fus - self.shadow_wid = shadow_wid - - # inputs - self.issue_i = Signal(n_fus, reset_less=True) - self.reset_i = Signal(n_fus, reset_less=True) - self.shadow_i = Array(Signal(shadow_wid, name="sh_i", reset_less=True) \ - for f in range(n_fus)) - self.s_fail_i = Array(Signal(shadow_wid, name="fl_i", reset_less=True) \ - for f in range(n_fus)) - self.s_good_i = Array(Signal(shadow_wid, name="gd_i", reset_less=True) \ - for f in range(n_fus)) - # outputs - self.go_die_o = Signal(n_fus, reset_less=True) - self.shadown_o = Signal(n_fus, reset_less=True) - - def elaborate(self, platform): - m = Module() - shadows = [] - for i in range(self.n_fus): - sh = ShadowFn(self.shadow_wid, self.syncreset) - setattr(m.submodules, "sh%d" % i, sh) - shadows.append(sh) - # connect shadow/fail/good to all shadows - m.d.comb += sh.s_fail_i.eq(self.s_fail_i[i]) - m.d.comb += sh.s_good_i.eq(self.s_good_i[i]) - # this one is the matrix (shadow enables) - m.d.comb += sh.shadow_i.eq(self.shadow_i[i]) - - # connect all shadow outputs and issue input - issue_l = [] - reset_l = [] - sho_l = [] - rec_l = [] - for l in shadows: - issue_l.append(l.issue_i) - reset_l.append(l.reset_i) - sho_l.append(l.shadown_o) - rec_l.append(l.go_die_o) - m.d.comb += Cat(*issue_l).eq(self.issue_i) - m.d.comb += Cat(*reset_l).eq(self.reset_i) - m.d.comb += self.shadown_o.eq(Cat(*sho_l)) - m.d.comb += self.go_die_o.eq(Cat(*rec_l)) - - return m - - def __iter__(self): - yield self.issue_i - yield 
self.reset_i - yield from self.shadow_i - yield from self.s_fail_i - yield from self.s_good_i - yield self.go_die_o - yield self.shadown_o - - def ports(self): - return list(self) - - -class BranchSpeculationRecord(Elaboratable): - """ A record of which function units will be cancelled and which - allowed to proceed, on a branch. - - Whilst the input is a pair that says whether the instruction is - under the "success" branch shadow (good_i) or the "fail" shadow - (fail_i path), when the branch result is known, the "good" path - must be cancelled if "fail" occurred, and the "fail" path cancelled - if "good" occurred. - - therefore, use "good|~fail" and "fail|~good" respectively as - output. - """ - - def __init__(self, n_fus): - self.n_fus = n_fus - - # inputs: record *expected* status - self.active_i = Signal(reset_less=True) - self.good_i = Signal(n_fus, reset_less=True) - self.fail_i = Signal(n_fus, reset_less=True) - - # inputs: status of branch (when result was known) - self.br_i = Signal(reset_less=True) - self.br_ok_i = Signal(reset_less=True) - - # outputs: true if the *expected* outcome matched the *actual* outcome - self.match_f_o = Signal(n_fus, reset_less=True) - self.match_g_o = Signal(n_fus, reset_less=True) - - def elaborate(self, platform): - m = Module() - - # registers to record *expected* status - good_r = Signal(self.n_fus) - fail_r = Signal(self.n_fus) - - for i in range(self.n_fus): - with m.If(self.active_i): - m.d.sync += good_r[i].eq(good_r[i] | self.good_i[i]) - m.d.sync += fail_r[i].eq(fail_r[i] | self.fail_i[i]) - with m.If(self.br_i): - with m.If(good_r[i]): - # we expected good, return OK that good was EXPECTED - m.d.comb += self.match_g_o[i].eq(self.br_ok_i) - m.d.comb += self.match_f_o[i].eq(~self.br_ok_i) - with m.If(fail_r[i]): - # we expected fail, return OK that fail was EXPECTED - m.d.comb += self.match_g_o[i].eq(~self.br_ok_i) - m.d.comb += self.match_f_o[i].eq(self.br_ok_i) - m.d.sync += good_r[i].eq(0) # might be set if issue 
set as well - m.d.sync += fail_r[i].eq(0) # might be set if issue set as well - - return m - - def __iter__(self): - yield self.active_i - yield self.good_i - yield self.fail_i - yield self.br_i - yield self.br_good_i - yield self.br_fail_i - yield self.good_o - yield self.fail_o - - def ports(self): - return list(self) - - - -class WaWGrid(Elaboratable): - """ An NxM grid-selector which raises a 2D bit selected by N and M - """ - - def __init__(self, n_fus, shadow_wid): - self.n_fus = n_fus - self.shadow_wid = shadow_wid - - self.shadow_i = Signal(shadow_wid, reset_less=True) - self.fu_i = Signal(n_fus, reset_less=True) - - self.waw_o = Array(Signal(shadow_wid, name="waw_o", reset_less=True) \ - for f in range(n_fus)) - - def elaborate(self, platform): - m = Module() - for i in range(self.n_fus): - v = Repl(self.fu_i[i], self.shadow_wid) - m.d.comb += self.waw_o[i].eq(v & self.shadow_i) - return m - - -def shadow_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - -def test_shadow(): - dut = ShadowMatrix(4, 2) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_shadow.il", "w") as f: - f.write(vl) - - dut = BranchSpeculationRecord(4) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_branchspecrecord.il", "w") as f: - f.write(vl) - - run_simulation(dut, shadow_sim(dut), vcd_name='test_shadow.vcd') - -if __name__ == '__main__': - test_shadow() diff --git a/src/scoreboard/shadow_fn.py b/src/scoreboard/shadow_fn.py deleted file mode 100644 index 69a56a5c..00000000 --- a/src/scoreboard/shadow_fn.py +++ /dev/null @@ -1,111 +0,0 @@ -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Cat, 
Repl, Const, Elaboratable -from nmutil.latch import SRLatch - - -class ShadowFn(Elaboratable): - """ implements shadowing 11.5.1, p55, just the individual shadow function - - shadowing can be used for branches as well as exceptions (interrupts), - load/store hold (exceptions again), and vector-element predication - (once the predicate is known, which it may not be at instruction issue) - - Inputs - * :shadow_wid: number of shadow/fail/good/go_die sets - - notes: - * when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing) - """ - def __init__(self, slen, syncreset=False): - - self.slen = slen - self.syncreset = syncreset - - if self.slen: - # inputs - self.issue_i = Signal(reset_less=True) - self.shadow_i = Signal(slen, reset_less=True) - self.reset_i = Signal(reset_less=True) - self.s_fail_i = Signal(slen, reset_less=True) - self.s_good_i = Signal(slen, reset_less=True) - - # outputs - self.shadown_o = Signal(reset_less=True) - self.go_die_o = Signal(reset_less=True) - else: - # outputs when no shadowing needed - self.shadown_o = Const(1) - self.go_die_o = Const(0) - - def elaborate(self, platform): - m = Module() - if self.slen == 0: - return - - m.submodules.sl = sl = SRLatch(sync=False, llen=self.slen) - - r_ext = Repl(self.reset_i, self.slen) - reset_r = Signal(self.slen) - if self.syncreset: - m.d.comb += reset_r.eq(self.s_good_i | self.s_fail_i | r_ext) - else: - m.d.comb += reset_r.eq(self.s_good_i | self.s_fail_i | r_ext) - - i_ext = Repl(self.issue_i, self.slen) - m.d.comb += sl.s.eq(self.shadow_i & i_ext & \ - ~self.s_good_i & ~reset_r) - m.d.comb += sl.r.eq(r_ext | reset_r | self.s_good_i | \ - (i_ext & ~self.shadow_i)) - m.d.comb += self.go_die_o.eq((sl.qlq & self.s_fail_i).bool()) - m.d.comb += self.shadown_o.eq(~sl.qlq.bool()) - - return m - - def __iter__(self): - yield self.issue_i - yield self.reset_i - yield self.shadow_i - yield self.s_fail_i - yield self.s_good_i - yield self.shadown_o - yield self.go_die_o - - def ports(self): - 
return list(self) - - -def shadow_fn_unit_sim(dut): - yield dut.dest_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield dut.issue_i.eq(0) - yield - yield dut.src1_i.eq(1) - yield dut.issue_i.eq(1) - yield - yield - yield - yield dut.issue_i.eq(0) - yield - yield dut.go_rd_i.eq(1) - yield - yield dut.go_rd_i.eq(0) - yield - yield dut.go_wr_i.eq(1) - yield - yield dut.go_wr_i.eq(0) - yield - - -def test_shadow_fn_unit(): - dut = ShadowFn(4) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_shadow_fn_unit.il", "w") as f: - f.write(vl) - - run_simulation(dut, shadow_fn_unit_sim(dut), - vcd_name='test_shadow_fn_unit.vcd') - -if __name__ == '__main__': - test_shadow_fn_unit() diff --git a/src/scoreboard/test_iq.py b/src/scoreboard/test_iq.py deleted file mode 100644 index 94ceac7e..00000000 --- a/src/scoreboard/test_iq.py +++ /dev/null @@ -1,126 +0,0 @@ -""" testing of InstructionQ -""" - -from copy import deepcopy -from random import randint -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil - -from scoreboard.instruction_q import InstructionQ -from nmutil.nmoperator import eq - - -class IQSim: - def __init__(self, dut, iq, n_in, n_out): - self.dut = dut - self.iq = iq - self.oq = [] - self.n_in = n_in - self.n_out = n_out - - def send(self): - i = 0 - while i < len(self.iq): - sendlen = randint(1, self.n_in) - sendlen = 1 - sendlen = min(len(self.iq) - i, sendlen) - print ("sendlen", len(self.iq)-i, sendlen) - for idx in range(sendlen): - instr = self.iq[i+idx] - yield from eq(self.dut.data_i[idx], instr) - di = yield self.dut.data_i[idx]#.src1_i - print ("senddata %d %x" % ((i+idx), di)) - self.oq.append(di) - yield self.dut.p_add_i.eq(sendlen) - yield - o_p_ready = yield self.dut.p_ready_o - while not o_p_ready: - yield - o_p_ready = yield self.dut.p_ready_o - - yield self.dut.p_add_i.eq(0) - - print ("send", len(self.iq), i, sendlen) - - # wait random period of time before queueing another value - for j in 
range(randint(0, 3)): - yield - - i += sendlen - - yield self.dut.p_add_i.eq(0) - yield - - print ("send ended") - - ## wait random period of time before queueing another value - #for i in range(randint(0, 3)): - # yield - - #send_range = randint(0, 3) - #if send_range == 0: - # send = True - #else: - # send = randint(0, send_range) != 0 - - def rcv(self): - i = 0 - yield - yield - yield - while i < len(self.iq): - rcvlen = randint(1, self.n_out) - #print ("outreq", rcvlen) - yield self.dut.n_sub_i.eq(rcvlen) - n_sub_o = yield self.dut.n_sub_o - print ("recv", n_sub_o) - for j in range(n_sub_o): - r = yield self.dut.data_o[j]#.src1_i - print ("recvdata %x %s" % (r, repr(self.iq[i+j]))) - assert r == self.oq[i+j] - yield - if n_sub_o == 0: - continue - yield self.dut.n_sub_i.eq(0) - - i += n_sub_o - - print ("recv ended") - - -def mk_insns(n_insns, wid, opwid): - res = [] - for i in range(n_insns): - op1 = randint(0, (1<>self.ddepth] - - def st(self, addr, data): - self.mem[addr>>self.ddepth] = data & ((1<>self.ddepth] - - def st(self, addr, data): - self.mem[addr>>self.ddepth] = data & ((1< 0 + with m.Else(): + # Multiple Match if encoder n is invalid + with m.If(self.encoder.n): + m.d.comb += [ + self.single_match.eq(0), + self.multiple_match.eq(1) + ] + # Single Match if encoder n is valid + with m.Else(): + m.d.comb += [ + self.single_match.eq(1), + self.multiple_match.eq(0) + ] + # Always set output based on priority encoder output + m.d.comb += self.o.eq(self.p_encoder.o) + return m diff --git a/src/soc/TLB/Cam.py b/src/soc/TLB/Cam.py new file mode 100644 index 00000000..e7d901ff --- /dev/null +++ b/src/soc/TLB/Cam.py @@ -0,0 +1,125 @@ +from nmigen import Array, Cat, Module, Signal, Elaboratable +from nmigen.lib.coding import Decoder +from nmigen.cli import main #, verilog + +from .CamEntry import CamEntry +from .AddressEncoder import AddressEncoder + + +class Cam(Elaboratable): + """ Content Addressable Memory (CAM) + + The purpose of this module is to 
quickly look up whether an + entry exists given a data key. + This module will search for the given data in all internal entries + and output whether a single or multiple match was found. + If an single entry is found the address be returned and single_match + is set HIGH. If multiple entries are found the lowest address is + returned and multiple_match is set HIGH. If neither single_match or + multiple_match are HIGH this implies no match was found. To write + to the CAM set the address bus to the desired entry and set write_enable + HIGH. Entry managment should be performed one level above this block + as lookup is performed within. + + Notes: + The read and write operations take one clock cycle to complete. + Currently the read_warning line is present for interfacing but + is not necessary for this design. This module is capable of writing + in the first cycle, reading on the second, and output the correct + address on the third. + """ + + def __init__(self, data_size, cam_size): + """ Arguments: + * data_size: (bits) The bit size of the data + * cam_size: (number) The number of entries in the CAM + """ + + # Internal + self.cam_size = cam_size + self.encoder = AddressEncoder(cam_size) + self.decoder = Decoder(cam_size) + self.entry_array = Array(CamEntry(data_size) for x in range(cam_size)) + + # Input + self.enable = Signal(1) + self.write_enable = Signal(1) + self.data_in = Signal(data_size) # The data to be written + self.data_mask = Signal(data_size) # mask for ternary writes + self.address_in = Signal(max=cam_size) # address of CAM Entry to write + + # Output + self.read_warning = Signal(1) # High when a read interrupts a write + self.single_match = Signal(1) # High when there is only one match + self.multiple_match = Signal(1) # High when there at least two matches + self.match_address = Signal(max=cam_size) # The lowest address matched + + def elaborate(self, platform=None): + m = Module() + # AddressEncoder for match types and output address + 
m.submodules.AddressEncoder = self.encoder + # Decoder is used to select which entry will be written to + m.submodules.Decoder = self.decoder + # CamEntry Array Submodules + # Note these area added anonymously + entry_array = self.entry_array + m.submodules += entry_array + + # Decoder logic + m.d.comb += [ + self.decoder.i.eq(self.address_in), + self.decoder.n.eq(0) + ] + + encoder_vector = [] + with m.If(self.enable): + # Set the key value for every CamEntry + for index in range(self.cam_size): + + # Write Operation + with m.If(self.write_enable): + with m.If(self.decoder.o[index]): + m.d.comb += entry_array[index].command.eq(2) + with m.Else(): + m.d.comb += entry_array[index].command.eq(0) + + # Read Operation + with m.Else(): + m.d.comb += entry_array[index].command.eq(1) + + # Send data input to all entries + m.d.comb += entry_array[index].data_in.eq(self.data_in) + # Send all entry matches to encoder + ematch = entry_array[index].match + encoder_vector.append(ematch) + + # Give input to and accept output from encoder module + m.d.comb += [ + self.encoder.i.eq(Cat(*encoder_vector)), + self.single_match.eq(self.encoder.single_match), + self.multiple_match.eq(self.encoder.multiple_match), + self.match_address.eq(self.encoder.o) + ] + + # If the CAM is not enabled set all outputs to 0 + with m.Else(): + m.d.comb += [ + self.read_warning.eq(0), + self.single_match.eq(0), + self.multiple_match.eq(0), + self.match_address.eq(0) + ] + + return m + + def ports(self): + return [self.enable, self.write_enable, + self.data_in, self.data_mask, + self.read_warning, self.single_match, + self.multiple_match, self.match_address] + + +if __name__ == '__main__': + cam = Cam(4, 4) + main(cam, ports=cam.ports()) + diff --git a/src/soc/TLB/CamEntry.py b/src/soc/TLB/CamEntry.py new file mode 100644 index 00000000..b1d93082 --- /dev/null +++ b/src/soc/TLB/CamEntry.py @@ -0,0 +1,46 @@ +from nmigen import Module, Signal, Elaboratable + + +class CamEntry(Elaboratable): + """ Content 
Addressable Memory (CAM) Entry + + The purpose of this module is to represent an entry within a CAM. + This module when given a read command will compare the given data + and output whether a match was found or not. When given a write + command it will write the given data into internal registers. + """ + + def __init__(self, data_size): + """ Arguments: + * data_size: (bit count) The size of the data + """ + # Input + self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset + self.data_in = Signal(data_size) # Data input when writing + + # Output + self.match = Signal(1) # Result of the internal/input key comparison + self.data = Signal(data_size) + + def elaborate(self, platform=None): + m = Module() + with m.Switch(self.command): + with m.Case("00"): + m.d.sync += self.match.eq(0) + with m.Case("01"): + with m.If(self.data == self.data_in): + m.d.sync += self.match.eq(1) + with m.Else(): + m.d.sync += self.match.eq(0) + with m.Case("10"): + m.d.sync += [ + self.data.eq(self.data_in), + self.match.eq(0) + ] + with m.Case(): + m.d.sync += [ + self.match.eq(0), + self.data.eq(0) + ] + + return m diff --git a/src/soc/TLB/LFSR.py b/src/soc/TLB/LFSR.py new file mode 100644 index 00000000..d8b606ec --- /dev/null +++ b/src/soc/TLB/LFSR.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# See Notices.txt for copyright information +from nmigen import Signal, Module, Const, Cat, Elaboratable +from nmigen.cli import verilog, rtlil + + +class LFSRPolynomial(set): + """ implements a polynomial for use in LFSR + """ + def __init__(self, exponents=()): + for e in exponents: + assert isinstance(e, int), TypeError("%s must be an int" % repr(e)) + assert (e >= 0), ValueError("%d must not be negative" % e) + set.__init__(self, set(exponents).union({0})) # must contain zero + + @property + def max_exponent(self): + return max(self) # derived from set, so this returns the max exponent + + @property + def exponents(self): + exponents = list(self) # get 
elements of set as a list + exponents.sort(reverse=True) + return exponents + + def __str__(self): + expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2) + retval = map(lambda i: expd[min(i,2)].format(i), self.exponents) + return " + ".join(retval) + + def __repr__(self): + return "LFSRPolynomial(%s)" % self.exponents + + +# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs # noqa +LFSR_POLY_2 = LFSRPolynomial([2, 1, 0]) +LFSR_POLY_3 = LFSRPolynomial([3, 2, 0]) +LFSR_POLY_4 = LFSRPolynomial([4, 3, 0]) +LFSR_POLY_5 = LFSRPolynomial([5, 3, 0]) +LFSR_POLY_6 = LFSRPolynomial([6, 5, 0]) +LFSR_POLY_7 = LFSRPolynomial([7, 6, 0]) +LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0]) +LFSR_POLY_9 = LFSRPolynomial([9, 5, 0]) +LFSR_POLY_10 = LFSRPolynomial([10, 7, 0]) +LFSR_POLY_11 = LFSRPolynomial([11, 9, 0]) +LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0]) +LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0]) +LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0]) +LFSR_POLY_15 = LFSRPolynomial([15, 14, 0]) +LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0]) +LFSR_POLY_17 = LFSRPolynomial([17, 14, 0]) +LFSR_POLY_18 = LFSRPolynomial([18, 11, 0]) +LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0]) +LFSR_POLY_20 = LFSRPolynomial([20, 17, 0]) +LFSR_POLY_21 = LFSRPolynomial([21, 19, 0]) +LFSR_POLY_22 = LFSRPolynomial([22, 21, 0]) +LFSR_POLY_23 = LFSRPolynomial([23, 18, 0]) +LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0]) + + +class LFSR(LFSRPolynomial, Elaboratable): + """ implements a Linear Feedback Shift Register + """ + def __init__(self, polynomial): + """ Inputs: + ------ + :polynomial: the polynomial to feedback on. may be a LFSRPolynomial + instance or an iterable of ints (list/tuple/generator) + :enable: enable (set LO to disable. NOTE: defaults to HI) + + Outputs: + ------- + :state: the LFSR state. 
bitwidth is taken from the polynomial + maximum exponent. + + Note: if an LFSRPolynomial is passed in as the input, because + LFSRPolynomial is derived from set() it's ok: + LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p) + """ + LFSRPolynomial.__init__(self, polynomial) + self.state = Signal(self.max_exponent, reset=1) + self.enable = Signal(reset=1) + + def elaborate(self, platform): + m = Module() + # do absolutely nothing if the polynomial is empty (always has a zero) + if self.max_exponent <= 1: + return m + + # create XOR-bunch, select bits from state based on exponent + feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain) + for exponent in self: + if exponent > 0: # don't have to skip, saves CPU cycles though + feedback ^= self.state[exponent - 1] + + # if enabled, shift-and-feedback + with m.If(self.enable): + # shift up lower bits by Cat'ing in a new bit zero (feedback) + newstate = Cat(feedback, self.state[:-1]) + m.d.sync += self.state.eq(newstate) + + return m + + +# example: Poly24 +if __name__ == '__main__': + p24 = rtlil.convert(LFSR(LFSR_POLY_24)) + with open("lfsr2_p24.il", "w") as f: + f.write(p24) diff --git a/src/soc/TLB/LFSR.pyi b/src/soc/TLB/LFSR.pyi new file mode 100644 index 00000000..64eb9115 --- /dev/null +++ b/src/soc/TLB/LFSR.pyi @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# See Notices.txt for copyright information +from nmigen import Module +from typing import Iterable, Optional, Iterator, Any, Union +from typing_extensions import final + + +@final +class LFSRPolynomial(set): + def __init__(self, exponents: Iterable[int] = ()): + def elements() -> Iterable[int]: ... + @property + def exponents(self) -> list[int]: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + + +@final +class LFSR: + def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ... + @property + def width(self) -> int: ... + def elaborate(self, platform: Any) -> Module: ... 
diff --git a/src/soc/TLB/Makefile b/src/soc/TLB/Makefile new file mode 100644 index 00000000..1eb67acc --- /dev/null +++ b/src/soc/TLB/Makefile @@ -0,0 +1,2 @@ +verilog: + python3 Cam.py generate -t v > Cam.v diff --git a/src/soc/TLB/MemorySet.py b/src/soc/TLB/MemorySet.py new file mode 100644 index 00000000..ea61bdf5 --- /dev/null +++ b/src/soc/TLB/MemorySet.py @@ -0,0 +1,66 @@ +from nmigen import Cat, Memory, Module, Signal, Elaboratable +from nmigen.cli import main +from nmigen.cli import verilog, rtlil + + +class MemorySet(Elaboratable): + def __init__(self, data_size, tag_size, set_count, active): + self.active = active + input_size = tag_size + data_size # Size of the input data + memory_width = input_size + 1 # The width of the cache memory + self.active = active + self.data_size = data_size + self.tag_size = tag_size + + # XXX TODO, use rd-enable and wr-enable? + self.mem = Memory(memory_width, set_count) + self.r = self.mem.read_port() + self.w = self.mem.write_port() + + # inputs (address) + self.cset = Signal(max=set_count) # The set to be checked + self.tag = Signal(tag_size) # The tag to find + self.data_i = Signal(data_size) # Incoming data + + # outputs + self.valid = Signal() + self.data_o = Signal(data_size) # Outgoing data (excludes tag) + + def elaborate(self, platform): + m = Module() + m.submodules.mem = self.mem + m.submodules.r = self.r + m.submodules.w = self.w + + # temporaries + active_bit = Signal() + tag_valid = Signal() + data_start = self.active + 1 + data_end = data_start + self.data_size + tag_start = data_end + tag_end = tag_start + self.tag_size + + # connect the read port address to the set/entry + read_port = self.r + m.d.comb += read_port.addr.eq(self.cset) + # Pull out active bit from data + data = read_port.data + m.d.comb += active_bit.eq(data[self.active]) + # Validate given tag vs stored tag + tag = data[tag_start:tag_end] + m.d.comb += tag_valid.eq(self.tag == tag) + # An entry is only valid if the tags match AND + # is 
marked as a valid entry + m.d.comb += self.valid.eq(tag_valid & active_bit) + + # output data: TODO, check rd-enable? + m.d.comb += self.data_o.eq(data[data_start:data_end]) + + # connect the write port addr to the set/entry (only if write enabled) + # (which is only done on a match, see SAC.write_entry below) + write_port = self.w + with m.If(write_port.en): + m.d.comb += write_port.addr.eq(self.cset) + m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag)) + + return m diff --git a/src/soc/TLB/PermissionValidator.py b/src/soc/TLB/PermissionValidator.py new file mode 100644 index 00000000..0107c0e9 --- /dev/null +++ b/src/soc/TLB/PermissionValidator.py @@ -0,0 +1,68 @@ +from nmigen import Module, Signal, Elaboratable +from nmigen.cli import main + +from TLB.PteEntry import PteEntry + + +class PermissionValidator(Elaboratable): + """ The purpose of this Module is to check the Permissions of a given PTE + against the requested access permissions. + + This module will either validate (by setting the valid bit HIGH) + the request or find a permission fault and invalidate (by setting + the valid bit LOW) the request + """ + + def __init__(self, asid_size, pte_size): + """ Arguments: + * asid_size: (bit count) The size of the asid to be processed + * pte_size: (bit count) The size of the pte to be processed + + Return: + * valid HIGH when permissions are correct + """ + # Internal + self.pte_entry = PteEntry(asid_size, pte_size) + + # Input + self.data = Signal(asid_size + pte_size); + self.xwr = Signal(3) # Execute, Write, Read + self.super_mode = Signal(1) # Supervisor Mode + self.super_access = Signal(1) # Supervisor Access + self.asid = Signal(15) # Address Space IDentifier (ASID) + + # Output + self.valid = Signal(1) # Denotes if the permissions are correct + + def elaborate(self, platform=None): + m = Module() + + m.submodules.pte_entry = self.pte_entry + + m.d.comb += self.pte_entry.i.eq(self.data) + + # Check if the entry is valid + with 
m.If(self.pte_entry.v): + # ASID match or Global Permission + # Note that the MSB bound is exclusive + with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g): + # Check Execute, Write, Read (XWR) Permissions + with m.If(self.pte_entry.xwr == self.xwr): + # Supervisor Logic + with m.If(self.super_mode): + # Valid if entry is not in user mode or supervisor + # has Supervisor User Memory (SUM) access via the + # SUM bit in the sstatus register + m.d.comb += self.valid.eq((~self.pte_entry.u) \ + | self.super_access) + # User logic + with m.Else(): + # Valid if the entry is in user mode only + m.d.comb += self.valid.eq(self.pte_entry.u) + with m.Else(): + m.d.comb += self.valid.eq(0) + with m.Else(): + m.d.comb += self.valid.eq(0) + with m.Else(): + m.d.comb += self.valid.eq(0) + return m diff --git a/src/soc/TLB/PteEntry.py b/src/soc/TLB/PteEntry.py new file mode 100644 index 00000000..73ea9220 --- /dev/null +++ b/src/soc/TLB/PteEntry.py @@ -0,0 +1,67 @@ +from nmigen import Module, Signal, Elaboratable +from nmigen.cli import main + + +class PteEntry(Elaboratable): + """ The purpose of this Module is to centralize the parsing of Page + Table Entries (PTE) into one module to prevent common mistakes + and duplication of code. The control bits are parsed out for + ease of use. + + This module parses according to the standard PTE given by the + Volume II: RISC-V Privileged Architectures V1.10 Pg 60. + The Address Space IDentifier (ASID) is appended to the MSB of the input + and is parsed out as such. + + An valid input Signal would be: + ASID PTE + Bits:[78-64][63-0] + + The output PTE value will include the control bits. 
+ """ + def __init__(self, asid_size, pte_size): + """ Arguments: + * asid_size: (bit count) The size of the asid to be processed + * pte_size: (bit count) The size of the pte to be processed + + Return: + * d The Dirty bit from the PTE portion of i + * a The Accessed bit from the PTE portion of i + * g The Global bit from the PTE portion of i + * u The User Mode bit from the PTE portion of i + * xwr The Execute/Write/Read bit from the PTE portion of i + * v The Valid bit from the PTE portion of i + * asid The asid portion of i + * pte The pte portion of i + """ + # Internal + self.asid_start = pte_size + self.asid_end = pte_size + asid_size + + # Input + self.i = Signal(asid_size + pte_size) + + # Output + self.d = Signal(1) # Dirty bit (From pte) + self.a = Signal(1) # Accessed bit (From pte) + self.g = Signal(1) # Global Access (From pte) + self.u = Signal(1) # User Mode (From pte) + self.xwr = Signal(3) # Execute Read Write (From pte) + self.v = Signal(1) # Valid (From pte) + self.asid = Signal(asid_size) # Associated Address Space IDentifier + self.pte = Signal(pte_size) # Full Page Table Entry + + def elaborate(self, platform=None): + m = Module() + # Pull out all control bites from PTE + m.d.comb += [ + self.d.eq(self.i[7]), + self.a.eq(self.i[6]), + self.g.eq(self.i[5]), + self.u.eq(self.i[4]), + self.xwr.eq(self.i[1:4]), + self.v.eq(self.i[0]) + ] + m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end]) + m.d.comb += self.pte.eq(self.i[0:self.asid_start]) + return m diff --git a/src/soc/TLB/SetAssociativeCache.py b/src/soc/TLB/SetAssociativeCache.py new file mode 100644 index 00000000..70c075da --- /dev/null +++ b/src/soc/TLB/SetAssociativeCache.py @@ -0,0 +1,272 @@ +""" + +Online simulator of 4-way set-associative cache: +http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html + +Python simulator of a N-way set-associative cache: +https://github.com/vaskevich/CacheSim/blob/master/cachesim.py +""" + +from nmigen import Array, Cat, Memory, 
Module, Signal, Mux, Elaboratable +from nmigen.compat.genlib import fsm +from nmigen.cli import main +from nmigen.cli import verilog, rtlil + +from .AddressEncoder import AddressEncoder +from .MemorySet import MemorySet + +# TODO: use a LFSR that advances continuously and picking the bottom +# few bits from it to select which cache line to replace, instead of PLRU +# http://bugs.libre-riscv.org/show_bug.cgi?id=71 +from .ariane.plru import PLRU +from .LFSR import LFSR, LFSR_POLY_24 + +SA_NA = "00" # no action (none) +SA_RD = "01" # read +SA_WR = "10" # write + + +class SetAssociativeCache(Elaboratable): + """ Set Associative Cache Memory + + The purpose of this module is to generate a memory cache given the + constraints passed in. This will create a n-way set associative cache. + It is expected for the SV TLB that the VMA will provide the set number + while the ASID provides the tag (still to be decided). + + """ + def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False): + """ Arguments + * tag_size (bits): The bit count of the tag + * data_size (bits): The bit count of the data to be stored + * set_count (number): The number of sets/entries in the cache + * way_count (number): The number of slots a data can be stored + in one set + * lfsr: if set, use an LFSR for (pseudo-randomly) selecting + set/entry to write to. 
otherwise, use a PLRU + """ + # Internals + self.lfsr_mode = lfsr + self.way_count = way_count # The number of slots in one set + self.tag_size = tag_size # The bit count of the tag + self.data_size = data_size # The bit count of the data to be stored + + # set up Memory array + self.mem_array = Array() # memory array + for i in range(way_count): + ms = MemorySet(data_size, tag_size, set_count, active=0) + self.mem_array.append(ms) + + # Finds valid entries + self.encoder = AddressEncoder(way_count) + + # setup PLRU or LFSR + if lfsr: + # LFSR mode + self.lfsr = LFSR(LFSR_POLY_24) + else: + # PLRU mode + self.plru = PLRU(way_count) # One block to handle plru calculations + self.plru_array = Array() # PLRU data on each set + for i in range(set_count): + name="plru%d" % i + self.plru_array.append(Signal(self.plru.TLBSZ, name=name)) + + # Input + self.enable = Signal(1) # Whether the cache is enabled + self.command = Signal(2) # 00=None, 01=Read, 10=Write (see SA_XX) + self.cset = Signal(max=set_count) # The set to be checked + self.tag = Signal(tag_size) # The tag to find + self.data_i = Signal(data_size) # The input data + + # Output + self.ready = Signal(1) # 0 => Processing 1 => Ready for commands + self.hit = Signal(1) # Tag matched one way in the given set + self.multiple_hit = Signal(1) # Tag matched many ways in the given set + self.data_o = Signal(data_size) # The data linked to the matched tag + + def check_tags(self, m): + """ Validate the tags in the selected set. If one and only one + tag matches set its state to zero and increment all others + by one. We only advance to next state if a single hit is found. 
+ """ + # Vector to store way valid results + # A zero denotes a way is invalid + valid_vector = [] + # Loop through memory to prep read/write ports and set valid_vector + for i in range(self.way_count): + valid_vector.append(self.mem_array[i].valid) + + # Pass encoder the valid vector + m.d.comb += self.encoder.i.eq(Cat(*valid_vector)) + + # Only one entry should be marked + # This is due to already verifying the tags + # matched and the valid bit is high + with m.If(self.hit): + m.next = "FINISHED_READ" + # Pull out data from the read port + data = self.mem_array[self.encoder.o].data_o + m.d.comb += self.data_o.eq(data) + if not self.lfsr_mode: + self.access_plru(m) + + # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k + with m.Elif(self.multiple_hit): + # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck + m.d.comb += self.data_o.eq(0) + + # No tag matches means no data + with m.Else(): + # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck + m.d.comb += self.data_o.eq(0) + + def access_plru(self, m): + """ An entry was accessed and the plru tree must now be updated + """ + # Pull out the set's entry being edited + plru_entry = self.plru_array[self.cset] + m.d.comb += [ + # Set the plru data to the current state + self.plru.plru_tree.eq(plru_entry), + # Set that the cache was accessed + self.plru.lu_access_i.eq(1) + ] + + def read(self, m): + """ Go through the read process of the cache. + This takes two cycles to complete. First it checks for a valid tag + and secondly it updates the LRU values. 
+ """ + with m.FSM() as fsm_read: + with m.State("READY"): + m.d.comb += self.ready.eq(0) + # check_tags will set the state if the conditions are met + self.check_tags(m) + with m.State("FINISHED_READ"): + m.next = "READY" + m.d.comb += self.ready.eq(1) + if not self.lfsr_mode: + plru_tree_o = self.plru.plru_tree_o + m.d.sync += self.plru_array[self.cset].eq(plru_tree_o) + + def write_entry(self, m): + if not self.lfsr_mode: + m.d.comb += [# set cset (mem address) into PLRU + self.plru.plru_tree.eq(self.plru_array[self.cset]), + # and connect plru to encoder for write + self.encoder.i.eq(self.plru.replace_en_o) + ] + write_port = self.mem_array[self.encoder.o].w + else: + # use the LFSR to generate a random(ish) one of the mem array + lfsr_output = Signal(max=self.way_count) + lfsr_random = Signal(max=self.way_count) + m.d.comb += lfsr_output.eq(self.lfsr.state) # lose some bits + # address too big, limit to range of array + m.d.comb += lfsr_random.eq(Mux(lfsr_output > self.way_count, + lfsr_output - self.way_count, + lfsr_output)) + write_port = self.mem_array[lfsr_random].w + + # then if there is a match from the encoder, enable the selected write + with m.If(self.encoder.single_match): + m.d.comb += write_port.en.eq(1) + + def write(self, m): + """ Go through the write process of the cache. + This takes two cycles to complete. 
First it writes the entry, + and secondly it updates the PLRU (in plru mode) + """ + with m.FSM() as fsm_write: + with m.State("READY"): + m.d.comb += self.ready.eq(0) + self.write_entry(m) + m.next ="FINISHED_WRITE" + with m.State("FINISHED_WRITE"): + m.d.comb += self.ready.eq(1) + if not self.lfsr_mode: + plru_entry = self.plru_array[self.cset] + m.d.sync += plru_entry.eq(self.plru.plru_tree_o) + m.next = "READY" + + + def elaborate(self, platform=None): + m = Module() + + # ---- + # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array + # ---- + + m.submodules.AddressEncoder = self.encoder + if self.lfsr_mode: + m.submodules.LFSR = self.lfsr + else: + m.submodules.PLRU = self.plru + + for i, mem in enumerate(self.mem_array): + setattr(m.submodules, "mem%d" % i, mem) + + # ---- + # select mode: PLRU connect to encoder, LFSR do... something + # ---- + + if not self.lfsr_mode: + # Set what entry was hit + m.d.comb += self.plru.lu_hit.eq(self.encoder.o) + else: + # enable LFSR + m.d.comb += self.lfsr.enable.eq(self.enable) + + # ---- + # connect hit/multiple hit to encoder output + # ---- + + m.d.comb += [ + self.hit.eq(self.encoder.single_match), + self.multiple_hit.eq(self.encoder.multiple_match), + ] + + # ---- + # connect incoming data/tag/cset(addr) to mem_array + # ---- + + for mem in self.mem_array: + write_port = mem.w + m.d.comb += [mem.cset.eq(self.cset), + mem.tag.eq(self.tag), + mem.data_i.eq(self.data_i), + write_port.en.eq(0), # default: disable write + ] + # ---- + # Commands: READ/WRITE/TODO + # ---- + + with m.If(self.enable): + with m.Switch(self.command): + # Search all sets at a particular tag + with m.Case(SA_RD): + self.read(m) + with m.Case(SA_WR): + self.write(m) + # Maybe catch multiple tags write here? + # TODO + # TODO: invalidate/flush, flush-all? 
+ + return m + + def ports(self): + return [self.enable, self.command, self.cset, self.tag, self.data_i, + self.ready, self.hit, self.multiple_hit, self.data_o] + + +if __name__ == '__main__': + sac = SetAssociativeCache(4, 8, 4, 6) + vl = rtlil.convert(sac, ports=sac.ports()) + with open("SetAssociativeCache.il", "w") as f: + f.write(vl) + + sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True) + vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports()) + with open("SetAssociativeCacheLFSR.il", "w") as f: + f.write(vl) diff --git a/src/soc/TLB/TLB.py b/src/soc/TLB/TLB.py new file mode 100644 index 00000000..98c9af72 --- /dev/null +++ b/src/soc/TLB/TLB.py @@ -0,0 +1,175 @@ +""" TLB Module + + The expected form of the data is: + * Item (Bits) + * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0) +""" + +from nmigen import Memory, Module, Signal, Cat, Elaboratable +from nmigen.cli import main + +from .PermissionValidator import PermissionValidator +from .Cam import Cam + +class TLB(Elaboratable): + def __init__(self, asid_size, vma_size, pte_size, L1_size): + """ Arguments + * asid_size: Address Space IDentifier (ASID) typically 15 bits + * vma_size: Virtual Memory Address (VMA) typically 36 bits + * pte_size: Page Table Entry (PTE) typically 64 bits + + Notes: + These arguments should represent the largest possible size + defined by the MODE settings. 
See + Volume II: RISC-V Privileged Architectures V1.10 Page 57 + """ + + # Internal + self.state = 0 + # L1 Cache Modules + self.cam_L1 = Cam(vma_size, L1_size) + self.mem_L1 = Memory(asid_size + pte_size, L1_size) + + # Permission Validator + self.perm_validator = PermissionValidator(asid_size, pte_size) + + # Inputs + self.supermode = Signal(1) # Supervisor Mode + self.super_access = Signal(1) # Supervisor Access + self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2 + self.xwr = Signal(3) # Execute, Write, Read + self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64 + self.address_L1 = Signal(max=L1_size) + self.asid = Signal(asid_size) # Address Space IDentifier (ASID) + self.vma = Signal(vma_size) # Virtual Memory Address (VMA) + self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE) + + # Outputs + self.hit = Signal(1) # Denotes if the VMA had a mapped PTE + self.perm_valid = Signal(1) # Denotes if the permissions are correct + self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA + + def search(self, m, read_L1, write_L1): + """ searches the TLB + """ + m.d.comb += [ + write_L1.en.eq(0), + self.cam_L1.write_enable.eq(0), + self.cam_L1.data_in.eq(self.vma) + ] + # Match found in L1 CAM + match_found = Signal(reset_less=True) + m.d.comb += match_found.eq(self.cam_L1.single_match + | self.cam_L1.multiple_match) + with m.If(match_found): + # Memory shortcut variables + mem_address = self.cam_L1.match_address + # Memory Logic + m.d.comb += read_L1.addr.eq(mem_address) + # Permission Validator Logic + m.d.comb += [ + self.hit.eq(1), + # Set permission validator data to the correct + # register file data according to CAM match + # address + self.perm_validator.data.eq(read_L1.data), + # Execute, Read, Write + self.perm_validator.xwr.eq(self.xwr), + # Supervisor Mode + self.perm_validator.super_mode.eq(self.supermode), + # Supverisor Access + self.perm_validator.super_access.eq(self.super_access), + # Address Space 
IDentifier (ASID) + self.perm_validator.asid.eq(self.asid), + # Output result of permission validation + self.perm_valid.eq(self.perm_validator.valid) + ] + # Only output PTE if permissions are valid + with m.If(self.perm_validator.valid): + # XXX TODO - dummy for now + reg_data = Signal.like(self.pte_out) + m.d.comb += [ + self.pte_out.eq(reg_data) + ] + with m.Else(): + m.d.comb += [ + self.pte_out.eq(0) + ] + # Miss Logic + with m.Else(): + m.d.comb += [ + self.hit.eq(0), + self.perm_valid.eq(0), + self.pte_out.eq(0) + ] + + def write_l1(self, m, read_L1, write_L1): + """ writes to the L1 cache + """ + # Memory_L1 Logic + m.d.comb += [ + write_L1.en.eq(1), + write_L1.addr.eq(self.address_L1), + # The Cat places arguments from LSB -> MSB + write_L1.data.eq(Cat(self.pte_in, self.asid)) + ] + # CAM_L1 Logic + m.d.comb += [ + self.cam_L1.write_enable.eq(1), + self.cam_L1.data_in.eq(self.vma), #data_in is sent to all entries + # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected + + ] + + def elaborate(self, platform): + m = Module() + # Add submodules + # Submodules for L1 Cache + m.submodules.cam_L1 = self.cam_L1 + m.submodules.read_L1 = read_L1 = self.mem_L1.read_port() + m.submodules.write_L1 = write_L1 = self.mem_L1.write_port() + + # Permission Validator Submodule + m.submodules.perm_valididator = self.perm_validator + + # When MODE specifies translation + # TODO add in different bit length handling ie prefix 0s + tlb_enable = Signal(reset_less=True) + m.d.comb += tlb_enable.eq(self.mode != 0) + + with m.If(tlb_enable): + m.d.comb += [ + self.cam_L1.enable.eq(1) + ] + with m.Switch(self.command): + # Search + with m.Case("01"): + self.search(m, read_L1, write_L1) + + # Write L1 + # Expected that the miss will be handled in software + with m.Case("10"): + self.write_l1(m, read_L1, write_L1) + + # TODO + #with m.Case("11"): + + # When disabled + with m.Else(): + m.d.comb += [ + self.cam_L1.enable.eq(0), + # XXX TODO - self.reg_file.enable.eq(0), + 
self.hit.eq(0), + self.perm_valid.eq(0), # XXX TODO, check this + self.pte_out.eq(0) + ] + return m + + +if __name__ == '__main__': + tlb = TLB(15, 36, 64, 4) + main(tlb, ports=[ tlb.supermode, tlb.super_access, tlb.command, + tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid, + tlb.vma, tlb.pte_in, + tlb.hit, tlb.perm_valid, tlb.pte_out, + ] + tlb.cam_L1.ports()) diff --git a/src/soc/TLB/__init__.py b/src/soc/TLB/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/soc/TLB/ariane/TreePLRU.cpp b/src/soc/TLB/ariane/TreePLRU.cpp new file mode 100644 index 00000000..2f6aeea5 --- /dev/null +++ b/src/soc/TLB/ariane/TreePLRU.cpp @@ -0,0 +1,211 @@ +#include +#include +#include + + +#define NWAY 4 +#define NLINE 256 +#define HIT 0 +#define MISS 1 +#define MS 1000 +/* +Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing +Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt +four-way set associative - three bits + each bit represents one branch point in a binary decision tree; let 1 + represent that the left side has been referenced more recently than the + right side, and 0 vice-versa + are all 4 lines valid? + / \ + yes no, use an invalid line + | + | + | + bit_0 == 0? state | replace ref to | next state + / \ ------+-------- -------+----------- + y n 00x | line_0 line_0 | 11_ + / \ 01x | line_1 line_1 | 10_ + bit_1 == 0? bit_2 == 0? 
1x0 | line_2 line_2 | 0_1 + / \ / \ 1x1 | line_3 line_3 | 0_0 + y n y n + / \ / \ ('x' means ('_' means unchanged) + line_0 line_1 line_2 line_3 don't care) + 8-way set associative - 7 = 1+2+4 bits +16-way set associative - 15 = 1+2+4+8 bits +32-way set associative - 31 = 1+2+4+8+16 bits +64-way set associative - 63 = 1+2+4+8+16+32 bits +*/ +using namespace std; +struct AddressField { + uint64_t wd_idx : 2;//Unused + uint64_t offset : 4;//Unused + uint64_t index : 8;//NLINE = 256 = 2^8 + uint64_t tag : 50; +}; + +union Address { + uint32_t* p; + AddressField fields; +}; + +struct Cell { + bool v; + uint64_t tag; + + Cell() : v(false), tag(0) {} + + bool isHit(uint64_t tag) { + return v && (tag == this->tag); + } + + void fetch(uint32_t* address) { + Address addr; + addr.p = address; + addr.fields.offset = 0; + addr.fields.wd_idx = 0; + tag = addr.fields.tag; + v = true; + } +}; + +ostream& operator<<(ostream & out, const Cell& cell) { + out << " v:" << cell.v << " tag:" << hex << cell.tag; + return out; +} + +struct Block { + Cell cell[NWAY]; + uint32_t state; + uint64_t *mask;//Mask the state to get accurate value for specified 1 bit. + uint64_t *value; + uint64_t *next_value; + + Block() : state(0) { + switch (NWAY) { + case 4: + mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101}; + value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101}; + next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000}; + break; + case 8: + mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001, + 0b1010001}; + value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000, + 0b1010001}; + next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000, + 0b0000001, 0b0000000}; + break; + //TODO - more NWAY goes here. 
+ default: + std::cout << "Error definition NWAY = " << NWAY << std::endl; + } + } + + uint32_t *getByTag(uint64_t tag, uint32_t *pway) { + for (int i = 0; i < NWAY; ++i) { + if (cell[i].isHit(tag)) { + *pway = i; + return pway; + } + } + return NULL; + } + + void setLRU(uint32_t *address) { + int way = 0; + uint32_t st = state; + for (int i = 0; i < NWAY; ++i) { + if ((state & mask[i]) == value[i]) { + state ^= mask[i]; + way = i; + break; + } + } + cell[way].fetch(address); + cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl; + } + + uint32_t *get(uint32_t *address, uint32_t *pway) { + Address addr; + addr.p = address; + uint32_t *d = getByTag(addr.fields.tag, pway); + if (d != NULL) { + return &d[addr.fields.offset]; + } + return d; + } + + int set(uint32_t *address) { + uint32_t way = 0; + uint32_t *p = get(address, &way); + if (p != NULL) { + printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state); + state &= ~mask[way]; + printf("%X --> ", state); + state |= next_value[way]; + printf("%X\n", state); + // *p = *address; //skip since address is fake. 
+ return HIT; + } else { + setLRU(address); + return MISS; + } + } +}; + +ostream& operator<<(ostream & out, const Block& block) { + out << "state:" << block.state << " "; + for (int i = 0; i cacheline refill) + self.miss_gnt_o = Signal(NR_PORTS) + self.active_serving_o = Signal(NR_PORTS) + + self.critical_word_o = Signal(64) + self.critical_word_valid_o = Signal() + output ariane_axi::req_t axi_data_o, + input ariane_axi::resp_t axi_data_i, + + self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \ + for i in range(NR_PORTS)) + self.mshr_addr_matches_o = Signal(NR_PORTS) + self.mshr_index_matches_o = Signal(NR_PORTS) + + # AMO + self.amo_req_i = AMOReq() + self.amo_resp_o = AMOResp() + # Port to SRAMs, for refill and eviction + self.req_o = Signal(DCACHE_SET_ASSOC) + self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array + self.data_o = CacheLine() + self.be_o = CLBE() + self.data_i = Array(CacheLine() \ + for i in range(DCACHE_SET_ASSOC)) + self.we_o = Signal() + + def elaborate(self, platform): + # Registers + mshr_t mshr_d, mshr_q; + logic [DCACHE_INDEX_WIDTH-1:0] cnt_d, cnt_q; + logic [DCACHE_SET_ASSOC-1:0] evict_way_d, evict_way_q; + # cache line to evict + cache_line_t evict_cl_d, evict_cl_q; + + logic serve_amo_d, serve_amo_q; + # Request from one FSM + miss_req_valid = Signal(self.NR_PORTS) + miss_req_bypass = Signal(self.NR_PORTS) + miss_req_addr = Array(Signal(name="miss_req_addr", 64) \ + for i in range(NR_PORTS)) + miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \ + for i in range(NR_PORTS)) + miss_req_we = Signal(self.NR_PORTS) + miss_req_be = Array(Signal(name="miss_req_be", 8) \ + for i in range(NR_PORTS)) + miss_req_size = Array(Signal(name="miss_req_size", 2) \ + for i in range(NR_PORTS)) + + # Cache Line Refill <-> AXI + req_fsm_miss_valid = Signal() + req_fsm_miss_addr = Signal(64) + req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH) + req_fsm_miss_we = Signal() + req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8) + 
ariane_axi::ad_req_t req_fsm_miss_req; + req_fsm_miss_size = Signal(2) + + gnt_miss_fsm = Signal() + valid_miss_fsm = Signal() + nmiss = DCACHE_LINE_WIDTH//64 + data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \ + for i in range(nmiss)) + + # Cache Management <-> LFSR + lfsr_enable = Signal() + lfsr_oh = Signal(DCACHE_SET_ASSOC) + lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1)) + # AMOs + ariane_pkg::amo_t amo_op; + amo_operand_a = Signal(64) + amo_operand_b = Signal(64) + amo_result_o = Signal(64) + + struct packed { + logic [63:3] address; + logic valid; + } reservation_d, reservation_q; + + # ------------------------------ + # Cache Management + # ------------------------------ + evict_way = Signal(DCACHE_SET_ASSOC) + valid_way = Signal(DCACHE_SET_ASSOC) + + for (i in range(DCACHE_SET_ASSOC): + comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty) + comb += valid_way[i].eq(data_i[i].valid) + + # ---------------------- + # Default Assignments + # ---------------------- + # to AXI refill + req_fsm_miss_req = ariane_axi::CACHE_LINE_REQ; + req_fsm_miss_size = Const(0b11, 2) + # core + serve_amo_d = serve_amo_q; + # -------------------------------- + # Flush and Miss operation + # -------------------------------- + state_d = state_q; + cnt_d = cnt_q; + evict_way_d = evict_way_q; + evict_cl_d = evict_cl_q; + mshr_d = mshr_q; + # communicate to the requester which unit we are currently serving + active_serving_o[mshr_q.id] = mshr_q.valid; + # AMOs + # silence the unit when not used + amo_op = amo_req_i.amo_op; + + reservation_d = reservation_q; + with m.FSM() as state_q: + + with m.Case("IDLE"): + # lowest priority are AMOs, wait until everything else + # is served before going for the AMOs + with m.If (amo_req_i.req & ~busy_i): + # 1. Flush the cache + with m.If(~serve_amo_q): + m.next = "FLUSH_REQ_STATUS" + serve_amo_d.eq(0b1 + cnt_d.eq(0 + # 2. 
Do the AMO + with m.Else(): + m.next = "AMO_LOAD" + serve_amo_d.eq(0b0 + + # check if we want to flush and can flush + # e.g.: we are not busy anymore + # TODO: Check that the busy flag is indeed needed + with m.If (flush_i & ~busy_i): + m.next = "FLUSH_REQ_STATUS" + cnt_d = 0 + + # check if one of the state machines missed + for i in range(NR_PORTS): + # here comes the refill portion of code + with m.If (miss_req_valid[i] & ~miss_req_bypass[i]): + m.next = "MISS" + # we are taking another request so don't + # take the AMO + serve_amo_d = 0b0; + # save to MSHR + wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH + comb += [ mshr_d.valid.eq(0b1), + mshr_d.we.eq(miss_req_we[i]), + mshr_d.id.eq(i), + mshr_d.addr.eq(miss_req_addr[i][0:wid]), + mshr_d.wdata.eq(miss_req_wdata[i]), + mshr_d.be.eq(miss_req_be[i]), + ] + break + + # ~> we missed on the cache + with m.Case("MISS"): + # 1. Check if there is an empty cache-line + # 2. If not -> evict one + comb += req_o.eq(1) + sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH] + m.next = "MISS_REPL" + comb += miss_o.eq(1) + + # ~> second miss cycle + with m.Case("MISS_REPL"): + # if all are valid we need to evict one, + # pseudo random from LFSR + with m.If(~(~valid_way).bool()): + comb += lfsr_enable.eq(0b1) + comb += evict_way_d.eq(lfsr_oh) + # do we need to write back the cache line? 
+ with m.If(data_i[lfsr_bin].dirty): + state_d = WB_CACHELINE_MISS; + comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag) + comb += evict_cl_d.data.eq(data_i[lfsr_bin].data) + comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]) + # no - we can request a cache line now + with m.Else(): + m.next = "REQ_CACHELINE" + # we have at least one free way + with m.Else(): + # get victim cache-line by looking for the + # first non-valid bit + comb += evict_way_d.eq(get_victim_cl(~valid_way) + m.next = "REQ_CACHELINE" + + # ~> we can just load the cache-line, + # the way is store in evict_way_q + with m.Case("REQ_CACHELINE"): + comb += req_fsm_miss_valid .eq(1) + sync += req_fsm_miss_addr .eq(mshr_q.addr) + + with m.If (gnt_miss_fsm): + m.next = "SAVE_CACHELINE" + comb += miss_gnt_o[mshr_q.id].eq(1) + + # ~> replace the cacheline + with m.Case("SAVE_CACHELINE"): + # calculate cacheline offset + automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset; + sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6) + # we've got a valid response from refill unit + with m.If (valid_miss_fsm): + wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH + sync += addr_o .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]) + sync += req_o .eq(evict_way_q) + comb += we_o .eq(1) + comb += be_o .eq(1) + sync += be_o.vldrty .eq(evict_way_q) + sync += data_o.tag .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid] + comb += data_o.data .eq(data_miss_fsm) + comb += data_o.valid.eq(1) + comb += data_o.dirty.eq(0) + + # is this a write? 
+ with m.If (mshr_q.we): + # Yes, so safe the updated data now + for i in range(8): + # check if we really want to write + # the corresponding byte + with m.If (mshr_q.be[i]): + sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i]; + # it's immediately dirty if we write + comb += data_o.dirty.eq(1) + + # reset MSHR + comb += mshr_d.valid.eq(0) + # go back to idle + m.next = 'IDLE' + + # ------------------------------ + # Write Back Operation + # ------------------------------ + # ~> evict a cache line from way saved in evict_way_q + with m.Case("WB_CACHELINE_FLUSH"): + with m.Case("WB_CACHELINE_MISS"): + + comb += req_fsm_miss_valid .eq(0b1) + sync += req_fsm_miss_addr .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}}; + comb += req_fsm_miss_be .eq(1) + comb += req_fsm_miss_we .eq(0b1) + sync += req_fsm_miss_wdata .eq(evict_cl_q.data; + + # we've got a grant --> this is timing critical, think about it + if (gnt_miss_fsm) begin + # write status array + sync += addr_o .eq(cnt_q) + comb += req_o .eq(0b1) + comb += we_o .eq(0b1) + comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 
0b0 : 0b1) + # invalidate + sync += be_o.vldrty.eq(evict_way_q) + # go back to handling the miss or flushing, + # depending on where we came from + with m.If(state_q == WB_CACHELINE_MISS): + m.next = "MISS" + with m.Else(): + m.next = "FLUSH_REQ_STATUS" + + # ------------------------------ + # Flushing & Initialization + # ------------------------------ + # ~> make another request to check the same + # cache-line if there are still some valid entries + with m.Case("FLUSH_REQ_STATUS"): + comb += req_o .eq(1) + sync += addr_o .eq(cnt_q) + m.next = "FLUSHING" + + with m.Case("FLUSHING"): + # this has priority + # at least one of the cache lines is dirty + with m.If(~evict_way): + # evict cache line, look for the first + # cache-line which is dirty + comb += evict_way_d.eq(get_victim_cl(evict_way)) + comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)]) + state_d = WB_CACHELINE_FLUSH; + # not dirty ~> increment and continue + with m.Else(): + # increment and re-request + sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET)) + m.next = "FLUSH_REQ_STATUS" + sync += addr_o .eq(cnt_q) + comb += req_o .eq(1) + comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 
1 : 0) + comb += we_o .eq(1) + # finished with flushing operation, go back to idle + with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \ + == DCACHE_NUM_WORDS-1): + # only acknowledge if the flush wasn't + # triggered by an atomic + sync += flush_ack_o.eq(~serve_amo_q) + m.next = "IDLE" + + # ~> only called after reset + with m.Case("INIT"): + # initialize status array + sync += addr_o.eq(cnt_q) + comb += req_o .eq(1) + comb += we_o .eq(1) + # only write the dirty array + comb += be_o.vldrty.eq(1) + sync += cnt_d .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET)) + # finished initialization + with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \ + == DCACHE_NUM_WORDS-1) + m.next = "IDLE" + + # ---------------------- + # AMOs + # ---------------------- + # TODO(zarubaf) Move this closer to memory + # ~> we are here because we need to do the AMO, + # the cache is clean at this point + # start by executing the load + with m.Case("AMO_LOAD"): + comb += req_fsm_miss_valid.eq(1) + # address is in operand a + comb += req_fsm_miss_addr.eq(amo_req_i.operand_a) + comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ) + comb += req_fsm_miss_size.eq(amo_req_i.size) + # the request has been granted + with m.If(gnt_miss_fsm): + m.next = "AMO_SAVE_LOAD" + # save the load value + with m.Case("AMO_SAVE_LOAD"): + with m.If (valid_miss_fsm): + # we are only concerned about the lower 64-bit + comb += mshr_d.wdata.eq(data_miss_fsm[0]) + m.next = "AMO_STORE" + # and do the store + with m.Case("AMO_STORE"): + load_data = Signal(64) + # re-align load data + comb += load_data.eq(data_align(amo_req_i.operand_a[:3], + mshr_q.wdata)) + # Sign-extend for word operation + with m.If (amo_req_i.size == 0b10): + comb += amo_operand_a.eq(sext32(load_data[:32])) + comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32])) + with m.Else(): + comb += amo_operand_a.eq(load_data) + comb += amo_operand_b.eq(amo_req_i.operand_b) + + # we do not need a store request for load reserved + # or a failing store 
conditional + # we can bail-out without making any further requests + with m.If ((amo_req_i.amo_op == AMO_LR) | \ + ((amo_req_i.amo_op == AMO_SC) & \ + ((reservation_q.valid & \ + (reservation_q.address != \ + amo_req_i.operand_a[3:64])) | \ + ~reservation_q.valid))): + comb += req_fsm_miss_valid.eq(0) + m.next = "IDLE" + comb += amo_resp_o.ack.eq(1) + # write-back the result + comb += amo_resp_o.result.eq(amo_operand_a) + # we know that the SC failed + with m.If (amo_req_i.amo_op == AMO_SC): + comb += amo_resp_o.result.eq(1) + # also clear the reservation + comb += reservation_d.valid.eq(0) + with m.Else(): + comb += req_fsm_miss_valid.eq(1) + + comb += req_fsm_miss_we .eq(1) + comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ) + comb += req_fsm_miss_size.eq(amo_req_i.size) + comb += req_fsm_miss_addr.eq(amo_req_i.operand_a) + + comb += req_fsm_miss_wdata.eq( + data_align(amo_req_i.operand_a[0:3], amo_result_o)) + comb += req_fsm_miss_be.eq( + be_gen(amo_req_i.operand_a[0:3], amo_req_i.size)) + + # place a reservation on the memory + with m.If (amo_req_i.amo_op == AMO_LR): + comb += reservation_d.address.eq(amo_req_i.operand_a[3:64]) + comb += reservation_d.valid.eq(1) + + # the request is valid or we didn't need to go for another store + with m.If (valid_miss_fsm): + m.next = "IDLE" + comb += amo_resp_o.ack.eq(1) + # write-back the result + comb += amo_resp_o.result.eq(amo_operand_a; + + if (amo_req_i.amo_op == AMO_SC) begin + comb += amo_resp_o.result.eq(0) + # An SC must fail if there is another SC + # (to any address) between the LR and the SC in + # program order (even to the same address). 
+ # in any case destroy the reservation + comb += reservation_d.valid.eq(0) + + # check MSHR for aliasing + + comb += mshr_addr_matches_o .eq(0) + comb += mshr_index_matches_o.eq() + + for i in range(NR_PORTS): + # check mshr for potential matching of other units, + # exclude the unit currently being served + with m.If (mshr_q.valid & \ + (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \ + mshr_q.addr[DCACHE_BYTE_OFFSET:56])): + comb += mshr_addr_matches_o[i].eq(1) + + # same as previous, but checking only the index + with m.If (mshr_q.valid & \ + (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \ + mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])): + mshr_index_matches_o[i].eq(1) + + # -------------------- + # Sequential Process + # -------------------- + + """ + #pragma translate_off + `ifndef VERILATOR + # assert that cache only hits on one way + assert property ( + @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded"); + `endif + #pragma translate_on + """ + + # ---------------------- + # Bypass Arbiter + # ---------------------- + # Connection Arbiter <-> AXI + req_fsm_bypass_valid = Signal() + req_fsm_bypass_addr = Signal(64) + req_fsm_bypass_wdata = Signal(64) + req_fsm_bypass_we = Signal() + req_fsm_bypass_be = Signal(8) + req_fsm_bypass_size = Signal(2) + gnt_bypass_fsm = Signal() + valid_bypass_fsm = Signal() + data_bypass_fsm = Signal(64) + logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass; + logic [3:0] id_bypass_fsm; + logic [3:0] gnt_id_bypass_fsm; + + i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64) + comb += [ + # Master Side + ib.data_req_i .eq( miss_req_valid & miss_req_bypass ), + ib.address_i .eq( miss_req_addr ), + ib.data_wdata_i .eq( miss_req_wdata ), + ib.data_we_i .eq( miss_req_we ), + ib.data_be_i .eq( miss_req_be ), + ib.data_size_i .eq( miss_req_size ), + ib.data_gnt_o .eq( bypass_gnt_o ), + ib.data_rvalid_o .eq( bypass_valid_o ), + ib.data_rdata_o .eq( bypass_data_o ), + # Slave Sid + ib.id_i .eq( 
id_bypass_fsm[$clog2(NR_PORTS)-1:0] ), + ib.id_o .eq( id_fsm_bypass ), + ib.gnt_id_i .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0] ), + ib.address_o .eq( req_fsm_bypass_addr ), + ib.data_wdata_o .eq( req_fsm_bypass_wdata ), + ib.data_req_o .eq( req_fsm_bypass_valid ), + ib.data_we_o .eq( req_fsm_bypass_we ), + ib.data_be_o .eq( req_fsm_bypass_be ), + ib.data_size_o .eq( req_fsm_bypass_size ), + ib.data_gnt_i .eq( gnt_bypass_fsm ), + ib.data_rvalid_i .eq( valid_bypass_fsm ), + ib.data_rdata_i .eq( data_bypass_fsm ), + ] + + axi_adapter #( + .DATA_WIDTH ( 64 ), + .AXI_ID_WIDTH ( 4 ), + .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET ) + ) i_bypass_axi_adapter ( + .clk_i, + .rst_ni, + .req_i ( req_fsm_bypass_valid ), + .type_i ( ariane_axi::SINGLE_REQ ), + .gnt_o ( gnt_bypass_fsm ), + .addr_i ( req_fsm_bypass_addr ), + .we_i ( req_fsm_bypass_we ), + .wdata_i ( req_fsm_bypass_wdata ), + .be_i ( req_fsm_bypass_be ), + .size_i ( req_fsm_bypass_size ), + .id_i ( Cat(id_fsm_bypass, 0, 0) ), + .valid_o ( valid_bypass_fsm ), + .rdata_o ( data_bypass_fsm ), + .gnt_id_o ( gnt_id_bypass_fsm ), + .id_o ( id_bypass_fsm ), + .critical_word_o ( ), # not used for single requests + .critical_word_valid_o ( ), # not used for single requests + .axi_req_o ( axi_bypass_o ), + .axi_resp_i ( axi_bypass_i ) + ); + + # ---------------------- + # Cache Line AXI Refill + # ---------------------- + axi_adapter #( + .DATA_WIDTH ( DCACHE_LINE_WIDTH ), + .AXI_ID_WIDTH ( 4 ), + .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET ) + ) i_miss_axi_adapter ( + .clk_i, + .rst_ni, + .req_i ( req_fsm_miss_valid ), + .type_i ( req_fsm_miss_req ), + .gnt_o ( gnt_miss_fsm ), + .addr_i ( req_fsm_miss_addr ), + .we_i ( req_fsm_miss_we ), + .wdata_i ( req_fsm_miss_wdata ), + .be_i ( req_fsm_miss_be ), + .size_i ( req_fsm_miss_size ), + .id_i ( Const(0b1100, 4) ), + .gnt_id_o ( ), # open + .valid_o ( valid_miss_fsm ), + .rdata_o ( data_miss_fsm ), + .id_o ( ), + .critical_word_o, + .critical_word_valid_o, + .axi_req_o ( 
class AXIArbiter:
    """Arbitrate several master ports onto one AXI refill/bypass port.

    A pending request is selected by fixed priority (lowest port index
    first).  It is granted, latched, and then held on the slave port until
    the slave answers with ``data_rvalid_i``; the read data is routed back
    to the master that issued the request.

    Arguments:
    * NR_PORTS: number of master ports
    * DATA_WIDTH: width in bits of the slave-side data bus
    """
    def __init__(self, NR_PORTS=3, DATA_WIDTH=64):
        self.NR_PORTS = NR_PORTS
        self.DATA_WIDTH = DATA_WIDTH
        # exact integer ceil(log2(NR_PORTS)): no float rounding involved
        self.pwid = pwid = (NR_PORTS - 1).bit_length()
        be_width = DATA_WIDTH // 8      # Signal widths must be integers

        # master ports (one entry / one bit per master)
        self.data_req_i = Signal(NR_PORTS)
        self.address_i = Array(Signal(64, name="address_i%d" % i)
                               for i in range(NR_PORTS))
        self.data_wdata_i = Array(Signal(64, name="data_wdata_i%d" % i)
                                  for i in range(NR_PORTS))
        self.data_we_i = Signal(NR_PORTS)
        self.data_be_i = Array(Signal(be_width, name="data_be_i%d" % i)
                               for i in range(NR_PORTS))
        self.data_size_i = Array(Signal(2, name="data_size_i%d" % i)
                                 for i in range(NR_PORTS))
        self.data_gnt_o = Signal(NR_PORTS)
        self.data_rvalid_o = Signal(NR_PORTS)
        self.data_rdata_o = Array(Signal(64, name="data_rdata_o%d" % i)
                                  for i in range(NR_PORTS))

        # slave port
        self.id_i = Signal(pwid)
        self.id_o = Signal(pwid)
        self.gnt_id_i = Signal(pwid)
        self.data_req_o = Signal()
        self.address_o = Signal(64)
        self.data_wdata_o = Signal(DATA_WIDTH)
        self.data_we_o = Signal()
        self.data_be_o = Signal(be_width)
        self.data_size_o = Signal(2)
        self.data_gnt_i = Signal()
        self.data_rvalid_i = Signal()
        self.data_rdata_i = Signal(DATA_WIDTH)

    def elaborate(self, platform):
        """Build the IDLE/SERVING arbiter state machine and return it."""
        # local import: the file-level import block is outside this block
        from nmigen import Module

        m = Module()
        comb, sync = m.d.comb, m.d.sync

        class Packet:
            """Registered copy of the request currently being served."""
            def __init__(self, pwid, DATA_WIDTH):
                self.id = Signal(pwid)
                self.address = Signal(64)
                self.data = Signal(64)
                self.size = Signal(2)
                self.be = Signal(DATA_WIDTH // 8)
                self.we = Signal()

        request_index = Signal(self.pwid)
        req_q = Packet(self.pwid, self.DATA_WIDTH)

        # a Signal cannot be indexed by another Signal: go through an Array
        we_bits = Array(self.data_we_i[i] for i in range(self.NR_PORTS))

        # request port: by default replay the latched request
        comb += [self.address_o.eq(req_q.address),
                 self.data_wdata_o.eq(req_q.data),
                 self.data_be_o.eq(req_q.be),
                 self.data_size_o.eq(req_q.size),
                 self.data_we_o.eq(req_q.we),
                 self.id_o.eq(req_q.id),
                 ]
        # read port: only the served master sees the slave's read data;
        # data_gnt_o / data_rvalid_o / other rdata default to zero
        comb += self.data_rdata_o[req_q.id].eq(self.data_rdata_i)

        # select one request (priority-based, lowest index wins)
        m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS)
        comb += pp.i.eq(self.data_req_i)
        comb += request_index.eq(pp.o)

        with m.FSM():

            with m.State("IDLE"):
                # wait for incoming requests
                with m.If(~pp.n):  # at least one master is requesting
                    comb += self.data_req_o.eq(1)
                    for i in range(self.NR_PORTS):
                        with m.If(request_index == i):
                            comb += self.data_gnt_o[i].eq(1)
                    # save the request
                    sync += [req_q.address.eq(self.address_i[request_index]),
                             req_q.id.eq(request_index),
                             req_q.data.eq(self.data_wdata_i[request_index]),
                             req_q.size.eq(self.data_size_i[request_index]),
                             req_q.be.eq(self.data_be_i[request_index]),
                             req_q.we.eq(we_bits[request_index]),
                             ]
                    m.next = "SERVING"

                # while idle, pass the selected master straight through
                comb += [
                    self.address_o.eq(self.address_i[request_index]),
                    self.data_wdata_o.eq(self.data_wdata_i[request_index]),
                    self.data_be_o.eq(self.data_be_i[request_index]),
                    self.data_size_o.eq(self.data_size_i[request_index]),
                    self.data_we_o.eq(we_bits[request_index]),
                    self.id_o.eq(request_index),
                ]

            with m.State("SERVING"):
                comb += self.data_req_o.eq(1)
                with m.If(self.data_rvalid_i):
                    for i in range(self.NR_PORTS):
                        with m.If(req_q.id == i):
                            comb += self.data_rvalid_o[i].eq(1)
                    m.next = "IDLE"

        # ------------
        # Assertions (SystemVerilog originals, not yet ported):
        #  * after a grant (data_gnt_i) an rvalid must eventually follow
        #  * there is no grant without a request (data_gnt_i |-> data_req_o)
        #  * address_o must not contain X while data_req_o is set
        # ------------

        return m
class RVException:
    """Exception record: cause, trap value and a valid flag."""

    def __init__(self):
        # cause of exception
        self.cause = Signal(64)
        # more info of causing exception (e.g.: instruction causing it),
        # address of LD/ST fault
        self.tval = Signal(64)
        self.valid = Signal()

    def eq(self, inp):
        """Return field-by-field assignments from another exception record."""
        return [dst.eq(src) for dst, src in zip(self.ports(), inp.ports())]

    def __iter__(self):
        yield from (self.cause, self.tval, self.valid)

    def ports(self):
        return [*self]


class ICacheReqI:
    """Fetch response record: valid flag, physical address, exception."""

    def __init__(self):
        # address translation valid
        self.fetch_valid = Signal()
        # physical address in
        self.fetch_paddr = Signal(64)
        # exception occurred during fetch
        self.fetch_exception = RVException()

    def __iter__(self):
        yield from (self.fetch_valid, self.fetch_paddr)
        yield from self.fetch_exception

    def ports(self):
        return [*self]
class MMU:
    """Memory Management Unit: instruction TLB, data TLB and page walker.

    Translates fetch addresses (``icache_areq_i`` -> ``icache_areq_o``) and
    load/store addresses (``lsu_*``), raising access/page-fault exceptions
    where the looked-up page table entry does not permit the access.

    Arguments:
    * instr_tlb_entries: number of entries in the instruction TLB
    * data_tlb_entries: number of entries in the data TLB
    * asid_width: width in bits of the address-space identifier
    """
    def __init__(self, instr_tlb_entries=4,
                 data_tlb_entries=4,
                 asid_width=1):
        self.instr_tlb_entries = instr_tlb_entries
        self.data_tlb_entries = data_tlb_entries
        self.asid_width = asid_width

        self.flush_i = Signal()
        self.enable_translation_i = Signal()
        self.en_ld_st_translation_i = Signal()  # enable VM translation LD/ST
        # IF interface
        self.icache_areq_i = ICacheReqO()
        self.icache_areq_o = ICacheReqI()
        # LSU interface
        # this is a more minimalistic interface because the actual addressing
        # logic is handled in the LSU as we distinguish load and stores,
        # what we do here is simple address translation
        self.misaligned_ex_i = RVException()
        self.lsu_req_i = Signal()       # request address translation
        self.lsu_vaddr_i = Signal(64)   # virtual address in
        self.lsu_is_store_i = Signal()  # translation is requested by a store
        # if we need to walk the page table we can't grant in the same cycle

        # Cycle 0
        self.lsu_dtlb_hit_o = Signal()  # sent in the same cycle as the
                                        # request if translation hits in DTLB
        # Cycle 1
        self.lsu_valid_o = Signal()     # translation is valid
        self.lsu_paddr_o = Signal(64)   # translated address
        self.lsu_exception_o = RVException()  # addr translate exception

        # General control signals
        self.priv_lvl_i = Signal(2)
        self.ld_st_priv_lvl_i = Signal(2)
        self.sum_i = Signal()
        self.mxr_i = Signal()
        # input logic flag_mprv_i,
        self.satp_ppn_i = Signal(44)
        self.asid_i = Signal(self.asid_width)
        self.flush_tlb_i = Signal()
        # Performance counters
        self.itlb_miss_o = Signal()
        self.dtlb_miss_o = Signal()
        # PTW memory interface
        self.req_port_i = DCacheReqO()
        self.req_port_o = DCacheReqI()

    def elaborate(self, platform):
        """Instantiate both TLBs and the PTW and build the two interfaces."""
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        fetch_req = self.icache_areq_i.fetch_req
        fetch_vaddr = self.icache_areq_i.fetch_vaddr
        fe = self.icache_areq_o.fetch_exception
        le = self.lsu_exception_o

        iaccess_err = Signal()    # insufficient priv to access instr page
        daccess_err = Signal()    # insufficient priv to access data page
        ptw_active = Signal()     # PTW is currently walking a page table
        walking_instr = Signal()  # PTW is walking because of an ITLB miss
        ptw_error = Signal()      # PTW threw an exception

        update_vaddr = Signal(48)
        # zero-extend the 48-bit walk address to exactly 64 bits
        uaddr64 = Cat(update_vaddr, Const(0, 64 - 48))
        update_ptw_itlb = TLBUpdate(self.asid_width)
        update_ptw_dtlb = TLBUpdate(self.asid_width)

        itlb_lu_access = Signal()
        itlb_content = PTE()
        itlb_is_2M = Signal()
        itlb_is_1G = Signal()
        itlb_is_512G = Signal()
        itlb_lu_hit = Signal()

        dtlb_lu_access = Signal()
        dtlb_content = PTE()
        dtlb_is_2M = Signal()
        dtlb_is_1G = Signal()
        dtlb_is_512G = Signal()
        dtlb_lu_hit = Signal()

        # Assignments
        comb += [itlb_lu_access.eq(fetch_req),
                 dtlb_lu_access.eq(self.lsu_req_i)
                 ]

        # ITLB
        m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries,
                                         self.asid_width)
        comb += [i_tlb.flush_i.eq(self.flush_tlb_i),
                 i_tlb.update_i.eq(update_ptw_itlb),
                 i_tlb.lu_access_i.eq(itlb_lu_access),
                 i_tlb.lu_asid_i.eq(self.asid_i),
                 i_tlb.lu_vaddr_i.eq(fetch_vaddr),
                 itlb_content.eq(i_tlb.lu_content_o),
                 itlb_is_2M.eq(i_tlb.lu_is_2M_o),
                 itlb_is_1G.eq(i_tlb.lu_is_1G_o),
                 itlb_is_512G.eq(i_tlb.lu_is_512G_o),
                 itlb_lu_hit.eq(i_tlb.lu_hit_o),
                 ]

        # DTLB
        m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries,
                                         self.asid_width)
        comb += [d_tlb.flush_i.eq(self.flush_tlb_i),
                 d_tlb.update_i.eq(update_ptw_dtlb),
                 d_tlb.lu_access_i.eq(dtlb_lu_access),
                 d_tlb.lu_asid_i.eq(self.asid_i),
                 d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i),
                 dtlb_content.eq(d_tlb.lu_content_o),
                 dtlb_is_2M.eq(d_tlb.lu_is_2M_o),
                 dtlb_is_1G.eq(d_tlb.lu_is_1G_o),
                 dtlb_is_512G.eq(d_tlb.lu_is_512G_o),
                 dtlb_lu_hit.eq(d_tlb.lu_hit_o),
                 ]

        # PTW
        m.submodules.ptw = ptw = PTW(self.asid_width)
        comb += [ptw_active.eq(ptw.ptw_active_o),
                 walking_instr.eq(ptw.walking_instr_o),
                 ptw_error.eq(ptw.ptw_error_o),
                 ptw.enable_translation_i.eq(self.enable_translation_i),

                 # control/CSR inputs the walker reads; without these it
                 # would walk from satp=0 and never see a DTLB miss
                 ptw.flush_i.eq(self.flush_i),
                 ptw.en_ld_st_translation_i.eq(self.en_ld_st_translation_i),
                 ptw.lsu_is_store_i.eq(self.lsu_is_store_i),
                 ptw.asid_i.eq(self.asid_i),
                 ptw.satp_ppn_i.eq(self.satp_ppn_i),
                 ptw.mxr_i.eq(self.mxr_i),

                 update_vaddr.eq(ptw.update_vaddr_o),
                 update_ptw_itlb.eq(ptw.itlb_update_o),
                 update_ptw_dtlb.eq(ptw.dtlb_update_o),

                 ptw.itlb_access_i.eq(itlb_lu_access),
                 ptw.itlb_hit_i.eq(itlb_lu_hit),
                 ptw.itlb_vaddr_i.eq(fetch_vaddr),

                 ptw.dtlb_access_i.eq(dtlb_lu_access),
                 ptw.dtlb_hit_i.eq(dtlb_lu_hit),
                 ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i),

                 ptw.req_port_i.eq(self.req_port_i),
                 self.req_port_o.eq(ptw.req_port_o),

                 # performance counters come straight from the walker
                 self.itlb_miss_o.eq(ptw.itlb_miss_o),
                 self.dtlb_miss_o.eq(ptw.dtlb_miss_o),
                 ]

        # -----------------------
        # Instruction Interface
        # -----------------------
        # The instruction interface is a simple request response interface

        # MMU disabled: just pass through
        comb += [self.icache_areq_o.fetch_valid.eq(fetch_req),
                 # play through in case we disabled address translation
                 self.icache_areq_o.fetch_paddr.eq(fetch_vaddr)
                 ]
        # two potential exception sources:
        # 1. HPTW threw an exception -> signal with a page fault exception
        # 2. We got an access error because of insufficient permissions ->
        #    throw an access exception
        comb += fe.valid.eq(0)
        # Check whether we are allowed to access this memory region
        # from a fetch perspective

        # PLATEN TODO: use PermissionValidator instead [we like modules]
        comb += iaccess_err.eq(fetch_req &
                               (((self.priv_lvl_i == PRIV_LVL_U) &
                                 ~itlb_content.u) |
                                ((self.priv_lvl_i == PRIV_LVL_S) &
                                 itlb_content.u)))

        # MMU enabled: address from TLB, request delayed until hit.
        # Error when TLB hit and no access right or TLB hit and
        # translated address not valid (e.g. AXI decode error),
        # or when PTW performs walk due to ITLB miss and raises
        # an error.
        with m.If(self.enable_translation_i):
            # we work with SV48, so if VM is enabled, the upper address
            # bits must be all-ones or all-zeros.  Every comparison is
            # parenthesised: Python's "|" binds tighter than "==".
            upper = fetch_vaddr[47:64]
            upper_all_ones = ((~upper) == 0)
            upper_all_zero = (upper == 0)
            with m.If(fetch_req & ~(upper_all_ones | upper_all_zero)):
                comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
                         fe.tval.eq(fetch_vaddr),
                         fe.valid.eq(1)
                         ]

            comb += self.icache_areq_o.fetch_valid.eq(0)

            # 4K page
            paddr = Signal.like(self.icache_areq_o.fetch_paddr)
            comb += paddr.eq(Cat(fetch_vaddr[0:12], itlb_content.ppn))
            # Mega page
            with m.If(itlb_is_2M):
                comb += paddr[12:21].eq(fetch_vaddr[12:21])
            # Giga page
            with m.If(itlb_is_1G):
                comb += paddr[12:30].eq(fetch_vaddr[12:30])
            # Tera page
            with m.If(itlb_is_512G):
                comb += paddr[12:39].eq(fetch_vaddr[12:39])
            comb += self.icache_areq_o.fetch_paddr.eq(paddr)

            # ---------
            # ITLB Hit
            # --------
            # if we hit the ITLB output the request signal immediately
            with m.If(itlb_lu_hit):
                comb += self.icache_areq_o.fetch_valid.eq(fetch_req)
                # we got an access error
                with m.If(iaccess_err):
                    comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
                             fe.tval.eq(fetch_vaddr),
                             fe.valid.eq(1)
                             ]
            # ---------
            # ITLB Miss
            # ---------
            # watch out for exceptions happening during walking the page
            # table
            with m.Elif(ptw_active & walking_instr):
                comb += self.icache_areq_o.fetch_valid.eq(ptw_error)
                comb += [fe.cause.eq(INSTR_PAGE_FAULT),
                         fe.tval.eq(uaddr64),
                         fe.valid.eq(1)
                         ]

        # -----------------------
        # Data Interface
        # -----------------------

        lsu_vaddr = Signal(64)
        dtlb_pte = PTE()
        misaligned_ex = RVException()
        lsu_req = Signal()
        lsu_is_store = Signal()
        dtlb_hit = Signal()

        # check if we need to do translation or if we are always
        # ready (e.g.: we are not translating anything)
        comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i,
                                           dtlb_lu_hit, 1))

        # The data interface is simpler and only consists of a
        # request/response interface
        comb += [
            # save request and DTLB response
            lsu_vaddr.eq(self.lsu_vaddr_i),
            lsu_req.eq(self.lsu_req_i),
            misaligned_ex.eq(self.misaligned_ex_i),
            dtlb_pte.eq(dtlb_content),
            dtlb_hit.eq(dtlb_lu_hit),
            lsu_is_store.eq(self.lsu_is_store_i),
        ]
        sync += [
            self.lsu_paddr_o.eq(lsu_vaddr),
            self.lsu_valid_o.eq(lsu_req),
            self.lsu_exception_o.eq(misaligned_ex),
        ]

        sverr = Signal()
        usrerr = Signal()

        comb += [
            # mute misaligned exceptions if there is no request
            # otherwise they will throw accidental exceptions
            misaligned_ex.valid.eq(self.misaligned_ex_i.valid &
                                   self.lsu_req_i),

            # SUM is not set and we are trying to access a user
            # page in supervisor mode ("&" binds tighter than "==",
            # so the comparison needs its own parentheses)
            sverr.eq((self.ld_st_priv_lvl_i == PRIV_LVL_S) &
                     ~self.sum_i & dtlb_pte.u),
            # this is not a user page but we are in user mode and
            # trying to access it
            usrerr.eq((self.ld_st_priv_lvl_i == PRIV_LVL_U) & ~dtlb_pte.u),

            # Check if the User flag is set, then we may only
            # access it in supervisor mode if SUM is enabled
            daccess_err.eq(sverr | usrerr),
        ]

        # translation is enabled and no misaligned exception occurred
        with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid):
            comb += lsu_req.eq(0)
            # 4K page: the frame number comes from the *data* TLB entry
            dpaddr = Signal.like(lsu_vaddr)
            comb += dpaddr.eq(Cat(lsu_vaddr[0:12], dtlb_pte.ppn))
            # Mega page
            with m.If(dtlb_is_2M):
                comb += dpaddr[12:21].eq(lsu_vaddr[12:21])
            # Giga page
            with m.If(dtlb_is_1G):
                comb += dpaddr[12:30].eq(lsu_vaddr[12:30])
            # Tera page (mirrors the instruction side)
            with m.If(dtlb_is_512G):
                comb += dpaddr[12:39].eq(lsu_vaddr[12:39])
            sync += self.lsu_paddr_o.eq(dpaddr)

            # ---------
            # DTLB Hit
            # --------
            # test the raw request input: lsu_req is driven inside this
            # branch, so reading it here would be a combinational loop
            with m.If(dtlb_hit & self.lsu_req_i):
                comb += lsu_req.eq(1)
                # this is a store
                with m.If(lsu_is_store):
                    # check if the page is write-able and
                    # we are not violating privileges
                    # also check if the dirty flag is set
                    with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d):
                        sync += [le.cause.eq(STORE_PAGE_FAULT),
                                 le.tval.eq(lsu_vaddr),
                                 le.valid.eq(1)
                                 ]

                # this is a load, check for sufficient access
                # privileges - throw a page fault if necessary
                with m.Elif(daccess_err):
                    sync += [le.cause.eq(LOAD_PAGE_FAULT),
                             le.tval.eq(lsu_vaddr),
                             le.valid.eq(1)
                             ]
            # ---------
            # DTLB Miss
            # ---------
            # watch out for exceptions
            with m.Elif(ptw_active & ~walking_instr):
                # page table walker threw an exception
                with m.If(ptw_error):
                    # an error makes the translation valid
                    comb += lsu_req.eq(1)
                    # the page table walker can only throw page faults
                    with m.If(lsu_is_store):
                        sync += [le.cause.eq(STORE_PAGE_FAULT),
                                 le.tval.eq(uaddr64),
                                 le.valid.eq(1)
                                 ]
                    with m.Else():
                        sync += [le.cause.eq(LOAD_PAGE_FAULT),
                                 le.tval.eq(uaddr64),
                                 le.valid.eq(1)
                                 ]

        return m

    def ports(self):
        """Return the flat list of top-level signals for conversion."""
        return [self.flush_i, self.enable_translation_i,
                self.en_ld_st_translation_i,
                self.lsu_req_i,
                self.lsu_vaddr_i, self.lsu_is_store_i, self.lsu_dtlb_hit_o,
                self.lsu_valid_o, self.lsu_paddr_o,
                self.priv_lvl_i, self.ld_st_priv_lvl_i, self.sum_i,
                self.mxr_i,
                self.satp_ppn_i, self.asid_i, self.flush_tlb_i,
                self.itlb_miss_o, self.dtlb_miss_o] + \
            self.icache_areq_i.ports() + self.icache_areq_o.ports() + \
            self.req_port_i.ports() + self.req_port_o.ports() + \
            self.misaligned_ex_i.ports() + self.lsu_exception_o.ports()
class PLRU(Elaboratable):
    r""" PLRU - Pseudo Least Recently Used Replacement

        PLRU-tree indexing:
        lvl0        0
                   / \
                  /   \
        lvl1     1     2
                / \   / \
        lvl2   3   4 5   6
              / \ /\/\  /\
             ... ... ... ...

        Inputs:
        * lu_hit: one bit per entry, set for the entry that was hit
        * lu_access_i: qualifies lu_hit (a lookup is taking place)
        * plru_tree: current tree state

        Outputs:
        * plru_tree_o: tree nodes rewritten for the entry that was hit
        * replace_en_o: one bit per entry, set for the entry whose tree
          path matches the current tree state (next to be replaced)
    """
    def __init__(self, entries):
        self.entries = entries
        self.lu_hit = Signal(entries)
        self.replace_en_o = Signal(entries)
        self.lu_access_i = Signal()
        # Tree (bit per entry)
        self.TLBSZ = 2*(self.entries-1)
        self.plru_tree = Signal(self.TLBSZ)
        self.plru_tree_o = Signal(self.TLBSZ)

    def elaborate(self, platform=None):
        m = Module()
        comb = m.d.comb

        # Just predefine which nodes will be set/cleared
        # E.g. for a TLB with 8 entries, the for-loop is semantically
        # equivalent to the following pseudo-code:
        # unique case (1'b1)
        # lu_hit[7]: plru_tree[0, 2, 6] = {1, 1, 1};
        # lu_hit[6]: plru_tree[0, 2, 6] = {1, 1, 0};
        # lu_hit[5]: plru_tree[0, 2, 5] = {1, 0, 1};
        # lu_hit[4]: plru_tree[0, 2, 5] = {1, 0, 0};
        # lu_hit[3]: plru_tree[0, 1, 4] = {0, 1, 1};
        # lu_hit[2]: plru_tree[0, 1, 4] = {0, 1, 0};
        # lu_hit[1]: plru_tree[0, 1, 3] = {0, 0, 1};
        # lu_hit[0]: plru_tree[0, 1, 3] = {0, 0, 0};
        # default: begin /* No hit */ end
        # endcase
        LOG_TLB = int(log2(self.entries))
        for i in range(self.entries):
            # we got a hit so update the pointer as it was least recently
            # used
            hit = Signal(reset_less=True)
            comb += hit.eq(self.lu_hit[i] & self.lu_access_i)
            with m.If(hit):
                # Set the nodes to the values we would expect
                for lvl in range(LOG_TLB):
                    # index of the first node on this tree level
                    idx_base = (1 << lvl) - 1
                    # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
                    shift = LOG_TLB - lvl
                    # store the inverse of this level's index bit
                    path_bit = (i >> (shift - 1)) & 1
                    new_idx = Const(1 - path_bit, 1)
                    plru_idx = idx_base + (i >> shift)
                    comb += self.plru_tree_o[plru_idx].eq(new_idx)

        # Decode tree to write enable signals
        # Next for-loop basically creates the following logic for e.g.
        # an 8 entry TLB (note: pseudo-code obviously):
        # replace_en[7] = &plru_tree[ 6, 2, 0]; #plru_tree[0,2,6]=={1,1,1}
        # replace_en[6] = &plru_tree[~6, 2, 0]; #plru_tree[0,2,6]=={1,1,0}
        # replace_en[5] = &plru_tree[ 5,~2, 0]; #plru_tree[0,2,5]=={1,0,1}
        # replace_en[4] = &plru_tree[~5,~2, 0]; #plru_tree[0,2,5]=={1,0,0}
        # replace_en[3] = &plru_tree[ 4, 1,~0]; #plru_tree[0,1,4]=={0,1,1}
        # replace_en[2] = &plru_tree[~4, 1,~0]; #plru_tree[0,1,4]=={0,1,0}
        # replace_en[1] = &plru_tree[ 3,~1,~0]; #plru_tree[0,1,3]=={0,0,1}
        # replace_en[0] = &plru_tree[~3,~1,~0]; #plru_tree[0,1,3]=={0,0,0}
        # For each entry traverse the tree. If every tree-node matches
        # the corresponding bit of the entry's index, this is
        # the next entry to replace.
        replace = []
        for i in range(self.entries):
            mismatch = []
            for lvl in range(LOG_TLB):
                idx_base = (1 << lvl) - 1
                # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
                shift = LOG_TLB - lvl
                new_idx = (i >> (shift - 1)) & 1
                plru_idx = idx_base + (i >> shift)
                plru = Signal(reset_less=True,
                              name="plru-%d-%d-%d" % (i, lvl, plru_idx))
                comb += plru.eq(self.plru_tree[plru_idx])
                # en &= plru_tree_q[idx_base + (i>>shift)] == new_idx;
                # collect the *mismatch* condition per level: the final
                # result is formed as ~(any mismatch) below
                if new_idx:
                    mismatch.append(~plru)
                else:
                    mismatch.append(plru)
            # boolean logic manipulation:
            # plru0 & plru1 & plru2 == ~(~plru0 | ~plru1 | ~plru2)
            replace.append(~Cat(*mismatch).bool())
        comb += self.replace_en_o.eq(Cat(*replace))

        return m

    def ports(self):
        # only Signals: self.entries is a plain Python int, not a port
        return [self.lu_hit, self.replace_en_o,
                self.lu_access_i, self.plru_tree, self.plru_tree_o]
class DCacheReqI:
    """Data-cache request record (address, write data and control flags)."""

    def __init__(self):
        self.address_index = Signal(DCACHE_INDEX_WIDTH)
        self.address_tag = Signal(DCACHE_TAG_WIDTH)
        self.data_wdata = Signal(64)
        self.data_req = Signal()
        self.data_we = Signal()
        self.data_be = Signal(8)
        self.data_size = Signal(2)
        self.kill_req = Signal()
        self.tag_valid = Signal()

    def eq(self, inp):
        """Return field-by-field assignments from another request record."""
        return [dst.eq(src) for dst, src in zip(self.ports(), inp.ports())]

    def ports(self):
        address = [self.address_index, self.address_tag]
        data = [self.data_wdata, self.data_req,
                self.data_we, self.data_be, self.data_size]
        control = [self.kill_req, self.tag_valid]
        return address + data + control
class PTE:  # (RecordObject):
    """Page table entry, field order LSB to MSB as yielded by __iter__."""

    def __init__(self):
        self.v = Signal()
        self.r = Signal()
        self.w = Signal()
        self.x = Signal()
        self.u = Signal()
        self.g = Signal()
        self.a = Signal()
        self.d = Signal()
        self.rsw = Signal(2)
        self.ppn = Signal(44)
        self.reserved = Signal(10)

    def flatten(self):
        """Return all fields concatenated into one value (LSB first)."""
        return Cat(*self.ports())

    def eq(self, x):
        """Return an assignment of every field from x.

        x is either another PTE-like object providing flatten(), or an
        ArrayProxy whose attributes are looked up by each field's signal
        name.
        """
        if isinstance(x, ArrayProxy):
            res = []
            for o in self.ports():
                i = getattr(x, o.name)
                res.append(i)
            x = Cat(*res)
        else:
            x = x.flatten()
        return self.flatten().eq(x)

    def __iter__(self):
        """ order is critical so that flatten creates LSB to MSB
        """
        yield self.v
        yield self.r
        yield self.w
        yield self.x
        yield self.u
        yield self.g
        yield self.a
        yield self.d
        yield self.rsw
        yield self.ppn
        yield self.reserved

    def ports(self):
        return list(self)


class TLBUpdate:
    """TLB update record: valid flag, page-size flags, VPN, ASID and PTE.

    Arguments:
    * asid_width: width in bits of the address-space identifier
    """
    def __init__(self, asid_width):
        self.valid = Signal()      # valid flag
        self.is_2M = Signal()
        self.is_1G = Signal()
        self.is_512G = Signal()
        self.vpn = Signal(36)
        self.asid = Signal(asid_width)
        self.content = PTE()

    def flatten(self):
        """Return all fields concatenated into one value (LSB first)."""
        return Cat(*self.ports())

    def eq(self, x):
        """Return an assignment of every field from another TLBUpdate."""
        return self.flatten().eq(x.flatten())

    def ports(self):
        # is_512G must be listed: flatten()/eq() are built from this list,
        # so an omitted field is silently never copied
        return [self.valid, self.is_2M, self.is_1G, self.is_512G,
                self.vpn, self.asid] + self.content.ports()
everything we do is speculative at this stage + # e.g.: there could be a CSR instruction that changes everything + self.ptw_active_o = Signal(reset=1) # active if not IDLE + self.walking_instr_o = Signal() # set when walking for TLB + self.ptw_error_o = Signal() # set when an error occurred + self.enable_translation_i = Signal() # CSRs indicate to enable SV48 + self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st + + self.lsu_is_store_i = Signal() # translation triggered by store + # PTW memory interface + self.req_port_i = DCacheReqO() + self.req_port_o = DCacheReqI() + + # to TLBs, update logic + self.itlb_update_o = TLBUpdate(asid_width) + self.dtlb_update_o = TLBUpdate(asid_width) + + self.update_vaddr_o = Signal(48) + + self.asid_i = Signal(self.asid_width) + # from TLBs + # did we miss? + self.itlb_access_i = Signal() + self.itlb_hit_i = Signal() + self.itlb_vaddr_i = Signal(64) + + self.dtlb_access_i = Signal() + self.dtlb_hit_i = Signal() + self.dtlb_vaddr_i = Signal(64) + # from CSR file + self.satp_ppn_i = Signal(44) # ppn from satp + self.mxr_i = Signal() + # Performance counters + self.itlb_miss_o = Signal() + self.dtlb_miss_o = Signal() + + def ports(self): + return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o, + ] + return [ + self.enable_translation_i, self.en_ld_st_translation_i, + self.lsu_is_store_i, self.req_port_i, self.req_port_o, + self.update_vaddr_o, + self.asid_i, + self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i, + self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i, + self.satp_ppn_i, self.mxr_i, + self.itlb_miss_o, self.dtlb_miss_o + ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports() + + def elaborate(self, platform): + m = Module() + + # input registers + data_rvalid = Signal() + data_rdata = Signal(64) + + # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata + # is spec'd in 64-bit binary-format: better to spec as Record? 
+ pte = PTE() + m.d.comb += pte.flatten().eq(data_rdata) + + # SV48 defines four levels of page tables + ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above) + ptw_lvl1 = Signal() + ptw_lvl2 = Signal() + ptw_lvl3 = Signal() + ptw_lvl4 = Signal() + m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1), + ptw_lvl2.eq(ptw_lvl == LVL2), + ptw_lvl3.eq(ptw_lvl == LVL3), + ptw_lvl4.eq(ptw_lvl == LVL4) + ] + + # is this an instruction page table walk? + is_instr_ptw = Signal() + global_mapping = Signal() + # latched tag signal + tag_valid = Signal() + # register the ASID + tlb_update_asid = Signal(self.asid_width) + # register VPN we need to walk, SV48 defines a 48 bit virtual addr + vaddr = Signal(64) + # 4 byte aligned physical pointer + ptw_pptr = Signal(56) + + end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH + m.d.sync += [ + # Assignments + self.update_vaddr_o.eq(vaddr), + + self.walking_instr_o.eq(is_instr_ptw), + # directly output the correct physical address + self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]), + self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]), + # we are never going to kill this request + self.req_port_o.kill_req.eq(0), # XXX assign comb? + # we are never going to write with the HPTW + self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb? 
+ # ----------- + # TLB Update + # ----------- + self.itlb_update_o.vpn.eq(vaddr[12:48]), + self.dtlb_update_o.vpn.eq(vaddr[12:48]), + # update the correct page table level + self.itlb_update_o.is_2M.eq(ptw_lvl3), + self.itlb_update_o.is_1G.eq(ptw_lvl2), + self.itlb_update_o.is_512G.eq(ptw_lvl1), + self.dtlb_update_o.is_2M.eq(ptw_lvl3), + self.dtlb_update_o.is_1G.eq(ptw_lvl2), + self.dtlb_update_o.is_512G.eq(ptw_lvl1), + + # output the correct ASID + self.itlb_update_o.asid.eq(tlb_update_asid), + self.dtlb_update_o.asid.eq(tlb_update_asid), + # set the global mapping bit + self.itlb_update_o.content.eq(pte), + self.itlb_update_o.content.g.eq(global_mapping), + self.dtlb_update_o.content.eq(pte), + self.dtlb_update_o.content.g.eq(global_mapping), + + self.req_port_o.tag_valid.eq(tag_valid), + ] + + #------------------- + # Page table walker #needs update + #------------------- + # A virtual address va is translated into a physical address pa as + # follows: + # 1. Let a be sptbr.ppn × PAGESIZE, and let i = LEVELS-1. (For Sv48, + # PAGESIZE=2^12 and LEVELS=4.) + # 2. Let pte be the value of the PTE at address a+va.vpn[i]×PTESIZE. + # (For Sv32, PTESIZE=4.) + # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an + # access exception. + # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to + # step 5. Otherwise, this PTE is a pointer to the next level of + # the page table. + # Let i=i-1. If i < 0, stop and raise an access exception. + # Otherwise, let a = pte.ppn × PAGESIZE and go to step 2. + # 5. A leaf PTE has been found. Determine if the requested memory + # access is allowed by the pte.r, pte.w, and pte.x bits. If not, + # stop and raise an access exception. Otherwise, the translation is + # successful. Set pte.a to 1, and, if the memory access is a + # store, set pte.d to 1. + # The translated physical address is given as follows: + # - pa.pgoff = va.pgoff. 
+ # - If i > 0, then this is a superpage translation and + # pa.ppn[i-1:0] = va.vpn[i-1:0]. + # - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i]. + # 6. If i > 0 and pa.ppn[i − 1 : 0] != 0, this is a misaligned + # superpage stop and raise a page-fault exception. + + m.d.sync += tag_valid.eq(0) + + # default assignments + m.d.comb += [ + # PTW memory interface + self.req_port_o.data_req.eq(0), + self.req_port_o.data_be.eq(Const(0xFF, 8)), + self.req_port_o.data_size.eq(Const(0b11, 2)), + self.req_port_o.data_we.eq(0), + self.ptw_error_o.eq(0), + self.itlb_update_o.valid.eq(0), + self.dtlb_update_o.valid.eq(0), + + self.itlb_miss_o.eq(0), + self.dtlb_miss_o.eq(0), + ] + + # ------------ + # State Machine + # ------------ + + with m.FSM() as fsm: + + with m.State("IDLE"): + self.idle(m, is_instr_ptw, ptw_lvl, global_mapping, + ptw_pptr, vaddr, tlb_update_asid) + + with m.State("WAIT_GRANT"): + self.grant(m, tag_valid, data_rvalid) + + with m.State("PTE_LOOKUP"): + # we wait for the valid signal + with m.If(data_rvalid): + self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4, + data_rvalid, global_mapping, + is_instr_ptw, ptw_pptr) + + # Propagate error to MMU/LSU + with m.State("PROPAGATE_ERROR"): + m.next = "IDLE" + m.d.comb += self.ptw_error_o.eq(1) + + # wait for the rvalid before going back to IDLE + with m.State("WAIT_RVALID"): + with m.If(data_rvalid): + m.next = "IDLE" + + m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata), + data_rvalid.eq(self.req_port_i.data_rvalid) + ] + + return m + + def set_grant_state(self, m): + # should we have flushed before we got an rvalid, + # wait for it until going back to IDLE + with m.If(self.flush_i): + with m.If (self.req_port_i.data_gnt): + m.next = "WAIT_RVALID" + with m.Else(): + m.next = "IDLE" + with m.Else(): + m.next = "WAIT_GRANT" + + def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping, + ptw_pptr, vaddr, tlb_update_asid): + # by default we start with the top-most page table + m.d.sync += 
[is_instr_ptw.eq(0), + ptw_lvl.eq(LVL1), + global_mapping.eq(0), + self.ptw_active_o.eq(0), # deactive (IDLE) + ] + # work out itlb/dtlb miss + m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \ + self.itlb_access_i & \ + ~self.itlb_hit_i & \ + ~self.dtlb_access_i) + m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \ + self.dtlb_access_i & \ + ~self.dtlb_hit_i) + # we got an ITLB miss? + with m.If(self.itlb_miss_o): + pptr = Cat(Const(0, 3), self.itlb_vaddr_i[30:48], + self.satp_ppn_i) + m.d.sync += [ptw_pptr.eq(pptr), + is_instr_ptw.eq(1), + vaddr.eq(self.itlb_vaddr_i), + tlb_update_asid.eq(self.asid_i), + ] + self.set_grant_state(m) + + # we got a DTLB miss? + with m.Elif(self.dtlb_miss_o): + pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:48], + self.satp_ppn_i) + m.d.sync += [ptw_pptr.eq(pptr), + vaddr.eq(self.dtlb_vaddr_i), + tlb_update_asid.eq(self.asid_i), + ] + self.set_grant_state(m) + + def grant(self, m, tag_valid, data_rvalid): + # we've got a data WAIT_GRANT so tell the + # cache that the tag is valid + + # send a request out + m.d.comb += self.req_port_o.data_req.eq(1) + # wait for the WAIT_GRANT + with m.If(self.req_port_i.data_gnt): + # send the tag valid signal one cycle later + m.d.sync += tag_valid.eq(1) + # should we have flushed before we got an rvalid, + # wait for it until going back to IDLE + with m.If(self.flush_i): + with m.If (~data_rvalid): + m.next = "WAIT_RVALID" + with m.Else(): + m.next = "IDLE" + with m.Else(): + m.next = "PTE_LOOKUP" + + def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4, + data_rvalid, global_mapping, + is_instr_ptw, ptw_pptr): + # temporaries + pte_rx = Signal(reset_less=True) + pte_exe = Signal(reset_less=True) + pte_inv = Signal(reset_less=True) + pte_a = Signal(reset_less=True) + st_wd = Signal(reset_less=True) + m.d.comb += [pte_rx.eq(pte.r | pte.x), + pte_exe.eq(~pte.x | ~pte.a), + pte_inv.eq(~pte.v | (~pte.r & pte.w)), + pte_a.eq(pte.a & (pte.r | (pte.x & 
self.mxr_i))), + st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))] + + l1err = Signal(reset_less=True) + l2err = Signal(reset_less=True) + l3err = Signal(reset_less=True) + m.d.comb += [l3err.eq((ptw_lvl3) & pte.ppn[0:9] != Const(0,0)), + l2err.eq((ptw_lvl2) & pte.ppn[0:18] != Const(0, 18)), + l1err.eq((ptw_lvl1) & pte.ppn[0:27] != Const(0, 27))] + + # check if the global mapping bit is set + with m.If (pte.g): + m.d.sync += global_mapping.eq(1) + + m.next = "IDLE" + + # ------------- + # Invalid PTE + # ------------- + # If pte.v = 0, or if pte.r = 0 and pte.w = 1, + # stop and raise a page-fault exception. + with m.If (pte_inv): + m.next = "PROPAGATE_ERROR" + + # ----------- + # Valid PTE + # ----------- + + # it is a valid PTE + # if pte.r = 1 or pte.x = 1 it is a valid PTE + with m.Elif (pte_rx): + # Valid translation found (either 1G, 2M or 4K) + with m.If(is_instr_ptw): + # ------------ + # Update ITLB + # ------------ + # If page not executable, we can directly raise error. + # This doesn't put a useless entry into the TLB. + # The same idea applies to the access flag since we let + # the access flag be managed by SW. + with m.If (pte_exe): + m.next = "IDLE" + with m.Else(): + m.d.comb += self.itlb_update_o.valid.eq(1) + + with m.Else(): + # ------------ + # Update DTLB + # ------------ + # Check if the access flag has been set, otherwise + # throw page-fault and let software handle those bits. + # If page not readable (there are no write-only pages) + # directly raise an error. This doesn't put a useless + # entry into the TLB. 
+ with m.If(pte_a): + m.d.comb += self.dtlb_update_o.valid.eq(1) + with m.Else(): + m.next = "PROPAGATE_ERROR" + # Request is a store: perform additional checks + # If the request was a store and the page not + # write-able, raise an error + # the same applies if the dirty flag is not set + with m.If (st_wd): + m.d.comb += self.dtlb_update_o.valid.eq(0) + m.next = "PROPAGATE_ERROR" + + # check if the ppn is correctly aligned: Case (6) + with m.If(l1err | l2err | l3err): + m.next = "PROPAGATE_ERROR" + m.d.comb += [self.dtlb_update_o.valid.eq(0), + self.itlb_update_o.valid.eq(0)] + + # this is a pointer to the next TLB level + with m.Else(): + # pointer to next level of page table + with m.If (ptw_lvl1): + # we are in the second level now + pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn) + m.d.sync += [ptw_pptr.eq(pptr), + ptw_lvl.eq(LVL2) + ] + with m.If(ptw_lvl2): + # here we received a pointer to the third level + pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn) + m.d.sync += [ptw_pptr.eq(pptr), + ptw_lvl.eq(LVL3) + ] + with m.If(ptw_lvl3): #guess: shift page levels by one + # here we received a pointer to the fourth level + # the last one is near the page offset + pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn) + m.d.sync += [ptw_pptr.eq(pptr), + ptw_lvl.eq(LVL4) + ] + self.set_grant_state(m) + + with m.If (ptw_lvl4): + # Should already be the last level + # page table => Error + m.d.sync += ptw_lvl.eq(LVL4) + m.next = "PROPAGATE_ERROR" + + +if __name__ == '__main__': + ptw = PTW() + vl = rtlil.convert(ptw, ports=ptw.ports()) + with open("test_ptw.il", "w") as f: + f.write(vl) diff --git a/src/soc/TLB/ariane/test/test_plru.py b/src/soc/TLB/ariane/test/test_plru.py new file mode 100644 index 00000000..68dcfa58 --- /dev/null +++ b/src/soc/TLB/ariane/test/test_plru.py @@ -0,0 +1,15 @@ +import sys +sys.path.append("../src") +sys.path.append("../../../TestUtil") + +from TLB.ariane.plru import PLRU + +from nmigen.compat.sim import 
run_simulation + +def tbench(dut): + yield + +if __name__ == "__main__": + dut = PLRU(4) + run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd") + print("PLRU Unit Test Success") diff --git a/src/soc/TLB/ariane/test/test_ptw.py b/src/soc/TLB/ariane/test/test_ptw.py new file mode 100644 index 00000000..b5deb28b --- /dev/null +++ b/src/soc/TLB/ariane/test/test_ptw.py @@ -0,0 +1,130 @@ +import sys +sys.path.append("../src") +sys.path.append("../../../TestUtil") + +from nmigen.compat.sim import run_simulation + +from TLB.ariane.ptw import PTW, PTE + +# unit was changed, test needs to be changed + +def tbench(dut): + + addr = 0x8000000 + + #pte = PTE() + #yield pte.v.eq(1) + #yield pte.r.eq(1) + + yield dut.req_port_i.data_gnt.eq(1) + yield dut.req_port_i.data_rvalid.eq(1) + yield dut.req_port_i.data_rdata.eq(0x43)#pte.flatten()) + + # data lookup + yield dut.en_ld_st_translation_i.eq(1) + yield dut.asid_i.eq(1) + + yield dut.dtlb_access_i.eq(1) + yield dut.dtlb_hit_i.eq(0) + yield dut.dtlb_vaddr_i.eq(0x400000000) + + yield + yield + yield + + yield dut.dtlb_access_i.eq(1) + yield dut.dtlb_hit_i.eq(0) + yield dut.dtlb_vaddr_i.eq(0x200000) + + yield + yield + yield + + yield dut.req_port_i.data_gnt.eq(0) + yield dut.dtlb_access_i.eq(1) + yield dut.dtlb_hit_i.eq(0) + yield dut.dtlb_vaddr_i.eq(0x400000011) + + yield + yield dut.req_port_i.data_gnt.eq(1) + yield + yield + + # data lookup, PTW levels 1-2-3 + addr = 0x4000000 + yield dut.dtlb_vaddr_i.eq(addr) + yield dut.mxr_i.eq(0x1) + yield dut.req_port_i.data_gnt.eq(1) + yield dut.req_port_i.data_rvalid.eq(1) + yield dut.req_port_i.data_rdata.eq(0x41 | (addr>>12)<<10)#pte.flatten()) + + yield dut.en_ld_st_translation_i.eq(1) + yield dut.asid_i.eq(1) + + yield dut.dtlb_access_i.eq(1) + yield dut.dtlb_hit_i.eq(0) + yield dut.dtlb_vaddr_i.eq(addr) + + yield + yield + yield + yield + yield + yield + yield + yield + + yield dut.req_port_i.data_gnt.eq(0) + yield dut.dtlb_access_i.eq(1) + yield dut.dtlb_hit_i.eq(0) + yield 
dut.dtlb_vaddr_i.eq(0x400000011) + + yield + yield dut.req_port_i.data_gnt.eq(1) + yield + yield + yield + yield + + + # instruction lookup + yield dut.en_ld_st_translation_i.eq(0) + yield dut.enable_translation_i.eq(1) + yield dut.asid_i.eq(1) + + yield dut.itlb_access_i.eq(1) + yield dut.itlb_hit_i.eq(0) + yield dut.itlb_vaddr_i.eq(0x800000) + + yield + yield + yield + + yield dut.itlb_access_i.eq(1) + yield dut.itlb_hit_i.eq(0) + yield dut.itlb_vaddr_i.eq(0x200000) + + yield + yield + yield + + yield dut.req_port_i.data_gnt.eq(0) + yield dut.itlb_access_i.eq(1) + yield dut.itlb_hit_i.eq(0) + yield dut.itlb_vaddr_i.eq(0x800011) + + yield + yield dut.req_port_i.data_gnt.eq(1) + yield + yield + + yield + + +def test_ptw(): + dut = PTW() + run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd") + print("PTW Unit Test Success") + +if __name__ == "__main__": + test_ptw() diff --git a/src/soc/TLB/ariane/test/test_tlb.py b/src/soc/TLB/ariane/test/test_tlb.py new file mode 100644 index 00000000..b94438ff --- /dev/null +++ b/src/soc/TLB/ariane/test/test_tlb.py @@ -0,0 +1,70 @@ +import sys +sys.path.append("../src") +sys.path.append("../../../TestUtil") + +from nmigen.compat.sim import run_simulation + +from TLB.ariane.tlb import TLB + +def set_vaddr(addr): + yield dut.lu_vaddr_i.eq(addr) + yield dut.update_i.vpn.eq(addr>>12) + + +def tbench(dut): + yield dut.lu_access_i.eq(1) + yield dut.lu_asid_i.eq(1) + yield dut.update_i.valid.eq(1) + yield dut.update_i.is_1G.eq(0) + yield dut.update_i.is_2M.eq(0) + yield dut.update_i.asid.eq(1) + yield dut.update_i.content.ppn.eq(0) + yield dut.update_i.content.rsw.eq(0) + yield dut.update_i.content.r.eq(1) + + yield + + addr = 0x80000 + yield from set_vaddr(addr) + yield + + addr = 0x90001 + yield from set_vaddr(addr) + yield + + addr = 0x28000000 + yield from set_vaddr(addr) + yield + + addr = 0x28000001 + yield from set_vaddr(addr) + + addr = 0x28000001 + yield from set_vaddr(addr) + yield + + addr = 0x1000040000 + yield from 
set_vaddr(addr) + yield + + addr = 0x1000040001 + yield from set_vaddr(addr) + yield + + yield dut.update_i.is_1G.eq(1) + addr = 0x2040000 + yield from set_vaddr(addr) + yield + + yield dut.update_i.is_1G.eq(1) + addr = 0x2040001 + yield from set_vaddr(addr) + yield + + yield + + +if __name__ == "__main__": + dut = TLB() + run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd") + print("TLB Unit Test Success") diff --git a/src/soc/TLB/ariane/test/test_tlb_content.py b/src/soc/TLB/ariane/test/test_tlb_content.py new file mode 100644 index 00000000..145ded7d --- /dev/null +++ b/src/soc/TLB/ariane/test/test_tlb_content.py @@ -0,0 +1,63 @@ +import sys +sys.path.append("../src") +sys.path.append("../../../TestUtil") + +from nmigen.compat.sim import run_simulation + +from TLB.ariane.tlb_content import TLBContent +from TestUtil.test_helper import assert_op, assert_eq + +def update(dut,a,t,g,m): + yield dut.replace_en_i.eq(1) + yield dut.update_i.valid.eq(1) + yield dut.update_i.is_512G.eq(t) + yield dut.update_i.is_1G.eq(g) + yield dut.update_i.is_2M.eq(m) + yield dut.update_i.vpn.eq(a) + yield + yield + +def check_hit(dut,hit,pagesize): + hit_d = yield dut.lu_hit_o + assert_eq("hit", hit_d, hit) + + if(hit): + if(pagesize=="t"): + hitp = yield dut.lu_is_512G_o + assert_eq("lu_is_512G_o", hitp, 1) + elif(pagesize=="g"): + hitp = yield dut.lu_is_1G_o + assert_eq("lu_is_1G_o", hitp, 1) + elif(pagesize=="m"): + hitp = yield dut.lu_is_2M_o + assert_eq("lu_is_2M_o", hitp, 1) + +def addr(a,b,c,d): + return a | b << 9 | c << 18 | d << 27 + +def tbench(dut): + yield dut.vpn0.eq(0x0A) + yield dut.vpn1.eq(0x0B) + yield dut.vpn2.eq(0x0C) + yield dut.vpn3.eq(0x0D) + yield from update(dut,addr(0xFF,0xFF,0xFF,0x0D),1,0,0) + yield from check_hit(dut,1,"t") + + yield from update(dut,addr(0xFF,0xFF,0x0C,0x0D),0,1,0) + yield from check_hit(dut,1,"g") + + yield from update(dut,addr(0xFF,0x0B,0x0C,0x0D),0,0,1) + yield from check_hit(dut,1,"m") + + yield from 
update(dut,addr(0x0A,0x0B,0x0C,0x0D),0,0,0) + yield from check_hit(dut,1,"") + + yield from update(dut,addr(0xAA,0xBB,0xCC,0xDD),0,0,0) + yield from check_hit(dut,0,"miss") + + +if __name__ == "__main__": + dut = TLBContent(4,4) + # + run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd") + print("TLBContent Unit Test Success") diff --git a/src/soc/TLB/ariane/tlb.py b/src/soc/TLB/ariane/tlb.py new file mode 100644 index 00000000..cf4af57a --- /dev/null +++ b/src/soc/TLB/ariane/tlb.py @@ -0,0 +1,175 @@ +""" +# Copyright 2018 ETH Zurich and University of Bologna. +# Copyright and related rights are licensed under the Solderpad Hardware +# License, Version 0.51 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# or agreed to in writing, software, hardware and materials distributed under +# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# +# Author: David Schaffenrath, TU Graz +# Author: Florian Zaruba, ETH Zurich +# Date: 21.4.2017 +# Description: Translation Lookaside Buffer, SV48 +# fully set-associative + +Implementation in c++: +https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp + +Text description: +https://people.cs.clemson.edu/~mark/464/p_lru.txt + +Online simulator: +http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html +""" +from math import log2 +from nmigen import Signal, Module, Cat, Const, Array, Elaboratable +from nmigen.cli import verilog, rtlil +from nmigen.lib.coding import Encoder + +from TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH +from TLB.ariane.plru import PLRU +from TLB.ariane.tlb_content import TLBContent + +TLB_ENTRIES = 8 + +class TLB(Elaboratable): + def __init__(self, tlb_entries=8, asid_width=8): + self.tlb_entries = tlb_entries + self.asid_width = asid_width + + self.flush_i = Signal() # Flush signal + # Lookup signals + self.lu_access_i = Signal() + self.lu_asid_i = Signal(self.asid_width) + self.lu_vaddr_i = Signal(64) + self.lu_content_o = PTE() + self.lu_is_2M_o = Signal() + self.lu_is_1G_o = Signal() + self.lu_is_512G_o = Signal() + self.lu_hit_o = Signal() + # Update TLB + self.pte_width = len(self.lu_content_o.flatten()) + self.update_i = TLBUpdate(asid_width) + + def elaborate(self, platform): + m = Module() + + vpn3 = Signal(9) #FIXME unused signal + vpn2 = Signal(9) + vpn1 = Signal(9) + vpn0 = Signal(9) + + #------------- + # Translation + #------------- + + # SV48 defines four levels of page tables + m.d.comb += [ vpn0.eq(self.lu_vaddr_i[12:21]), + vpn1.eq(self.lu_vaddr_i[21:30]), + vpn2.eq(self.lu_vaddr_i[30:39]), + vpn3.eq(self.lu_vaddr_i[39:48]), ### FIXME + ] + + tc = [] + for i in range(self.tlb_entries): + tlc = TLBContent(self.pte_width, self.asid_width) + setattr(m.submodules, "tc%d" % i, tlc) + tc.append(tlc) + # connect inputs + tlc.update_i = self.update_i # saves a lot of graphviz links + m.d.comb += 
[tlc.vpn0.eq(vpn0), + tlc.vpn1.eq(vpn1), + tlc.vpn2.eq(vpn2), + # TODO 4th + tlc.flush_i.eq(self.flush_i), + #tlc.update_i.eq(self.update_i), + tlc.lu_asid_i.eq(self.lu_asid_i)] + tc = Array(tc) + + #-------------- + # Select hit + #-------------- + + # use Encoder to select hit index + # XXX TODO: assert that there's only one valid entry (one lu_hit) + hitsel = Encoder(self.tlb_entries) + m.submodules.hitsel = hitsel + + hits = [] + for i in range(self.tlb_entries): + hits.append(tc[i].lu_hit_o) + m.d.comb += hitsel.i.eq(Cat(*hits)) # (goes into plru as well) + idx = hitsel.o + + active = Signal(reset_less=True) + m.d.comb += active.eq(~hitsel.n) + with m.If(active): + # active hit, send selected as output + m.d.comb += [ self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o), + self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o), + self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o), + self.lu_hit_o.eq(1), + self.lu_content_o.flatten().eq(tc[idx].lu_content_o), + ] + + #-------------- + # PLRU. + #-------------- + + p = PLRU(self.tlb_entries) + plru_tree = Signal(p.TLBSZ) + m.submodules.plru = p + + # connect PLRU inputs/outputs + # XXX TODO: assert that there's only one valid entry (one replace_en) + en = [] + for i in range(self.tlb_entries): + en.append(tc[i].replace_en_i) + m.d.comb += [Cat(*en).eq(p.replace_en_o), # output from PLRU into tags + p.lu_hit.eq(hitsel.i), + p.lu_access_i.eq(self.lu_access_i), + p.plru_tree.eq(plru_tree)] + m.d.sync += plru_tree.eq(p.plru_tree_o) + + #-------------- + # Sanity checks + #-------------- + + assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \ + "TLB size must be a multiple of 2 and greater than 1" + assert (self.asid_width >= 1), \ + "ASID width must be at least 1" + + return m + + """ + # Just for checking + function int countSetBits(logic[self.tlb_entries-1:0] vector); + automatic int count = 0; + foreach (vector[idx]) begin + count += vector[idx]; + end + return count; + endfunction + + assert property (@(posedge 
clk_i)(countSetBits(lu_hit) <= 1)) + else $error("More then one hit in TLB!"); $stop(); end + assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1)) + else $error("More then one TLB entry selected for next replace!"); + """ + + def ports(self): + return [self.flush_i, self.lu_access_i, + self.lu_asid_i, self.lu_vaddr_i, + self.lu_is_2M_o, self.lu_1G_o, self.lu_is_512G_o, self.lu_hit_o + ] + self.lu_content_o.ports() + self.update_i.ports() + +if __name__ == '__main__': + tlb = TLB() + vl = rtlil.convert(tlb, ports=tlb.ports()) + with open("test_tlb.il", "w") as f: + f.write(vl) + diff --git a/src/soc/TLB/ariane/tlb_content.py b/src/soc/TLB/ariane/tlb_content.py new file mode 100644 index 00000000..3384c885 --- /dev/null +++ b/src/soc/TLB/ariane/tlb_content.py @@ -0,0 +1,145 @@ +from nmigen import Signal, Module, Cat, Const, Elaboratable + +from TLB.ariane.ptw import TLBUpdate, PTE + + +class TLBEntry: + def __init__(self, asid_width): + self.asid = Signal(asid_width,name="ent_asid") + # SV48 defines four levels of page tables + self.vpn0 = Signal(9,name="ent_vpn0") + self.vpn1 = Signal(9,name="ent_vpn1") + self.vpn2 = Signal(9,name="ent_vpn2") + self.vpn3 = Signal(9,name="ent_vpn3") + self.is_2M = Signal(name="ent_is_2M") + self.is_1G = Signal(name="ent_is_1G") + self.is_512G = Signal(name="ent_is_512G") + self.valid = Signal(name="ent_valid") + + def flatten(self): + return Cat(*self.ports()) + + def eq(self, x): + return self.flatten().eq(x.flatten()) + + def ports(self): + return [self.asid, self.vpn0, self.vpn1, self.vpn2, + self.is_2M, self.is_1G, self.valid] + + +class TLBContent(Elaboratable): + def __init__(self, pte_width, asid_width): + self.asid_width = asid_width + self.pte_width = pte_width + self.flush_i = Signal() # Flush signal + # Update TLB + self.update_i = TLBUpdate(asid_width) + self.vpn3 = Signal(9) + self.vpn2 = Signal(9) + self.vpn1 = Signal(9) + self.vpn0 = Signal(9) + self.replace_en_i = Signal() # replace the following entry, + 
# set by replacement strategy + # Lookup signals + self.lu_asid_i = Signal(asid_width) + self.lu_content_o = Signal(pte_width) + self.lu_is_512G_o = Signal() + self.lu_is_2M_o = Signal() + self.lu_is_1G_o = Signal() + self.lu_hit_o = Signal() + + def elaborate(self, platform): + m = Module() + + tags = TLBEntry(self.asid_width) + + + content = Signal(self.pte_width) + + m.d.comb += [self.lu_hit_o.eq(0), + self.lu_is_512G_o.eq(0), + self.lu_is_2M_o.eq(0), + self.lu_is_1G_o.eq(0)] + + # temporaries for lookup + asid_ok = Signal(reset_less=True) + # tags_ok = Signal(reset_less=True) + + vpn3_ok = Signal(reset_less=True) + vpn2_ok = Signal(reset_less=True) + vpn1_ok = Signal(reset_less=True) + vpn0_ok = Signal(reset_less=True) + + #tags_2M = Signal(reset_less=True) + vpn0_or_2M = Signal(reset_less=True) + + m.d.comb += [ + #compare asid and vpn* + asid_ok.eq(tags.asid == self.lu_asid_i), + vpn3_ok.eq(tags.vpn3 == self.vpn3), + vpn2_ok.eq(tags.vpn2 == self.vpn2), + vpn1_ok.eq(tags.vpn1 == self.vpn1), + vpn0_ok.eq(tags.vpn0 == self.vpn0), + vpn0_or_2M.eq(tags.is_2M | vpn0_ok) + ] + + + with m.If(asid_ok & tags.valid): + # first level, only vpn3 needs to match + with m.If (tags.is_512G & vpn3_ok): + m.d.comb += [ self.lu_content_o.eq(content), + self.lu_is_512G_o.eq(1), + self.lu_hit_o.eq(1), + ] + # second level , second level vpn2 and vpn3 need to match + with m.Elif (tags.is_1G & vpn2_ok & vpn3_ok): + m.d.comb += [ self.lu_content_o.eq(content), + self.lu_is_1G_o.eq(1), + self.lu_hit_o.eq(1), + ] + # not a giga page hit nor a tera page hit so check further + with m.Elif(vpn1_ok): + # this could be a 2 mega page hit or a 4 kB hit + # output accordingly + with m.If(vpn0_or_2M): + m.d.comb += [ self.lu_content_o.eq(content), + self.lu_is_2M_o.eq(tags.is_2M), + self.lu_hit_o.eq(1), + ] + # ------------------ + # Update or Flush + # ------------------ + + # temporaries + replace_valid = Signal(reset_less=True) + m.d.comb += replace_valid.eq(self.update_i.valid & 
self.replace_en_i) + + # flush + with m.If (self.flush_i): + # invalidate (flush) conditions: all if zero or just this ASID + with m.If (self.lu_asid_i == Const(0, self.asid_width) | + (self.lu_asid_i == tags.asid)): + m.d.sync += tags.valid.eq(0) + + # normal replacement + with m.Elif(replace_valid): + m.d.sync += [ # update tag array + tags.asid.eq(self.update_i.asid), + tags.vpn3.eq(self.update_i.vpn[27:36]), + tags.vpn2.eq(self.update_i.vpn[18:27]), + tags.vpn1.eq(self.update_i.vpn[9:18]), + tags.vpn0.eq(self.update_i.vpn[0:9]), + tags.is_512G.eq(self.update_i.is_512G), + tags.is_1G.eq(self.update_i.is_1G), + tags.is_2M.eq(self.update_i.is_2M), + tags.valid.eq(1), + # and content as well + content.eq(self.update_i.content.flatten()) + ] + return m + + def ports(self): + return [self.flush_i, + self.lu_asid_i, + self.lu_is_2M_o, self.lu_is_1G_o,self.lu_is_512G_o, self.lu_hit_o, + ] + self.update_i.content.ports() + self.update_i.ports() diff --git a/src/soc/TLB/test/__init__.py b/src/soc/TLB/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/soc/TLB/test/test_LFSR2.py b/src/soc/TLB/test/test_LFSR2.py new file mode 100644 index 00000000..c05f55b7 --- /dev/null +++ b/src/soc/TLB/test/test_LFSR2.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# See Notices.txt for copyright information +from TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3 + +from nmigen.back.pysim import Simulator, Delay, Tick +import unittest + + +class TestLFSR(unittest.TestCase): + def test_poly(self): + v = LFSRPolynomial() + self.assertEqual(repr(v), "LFSRPolynomial([0])") + self.assertEqual(str(v), "1") + v = LFSRPolynomial([1]) + self.assertEqual(repr(v), "LFSRPolynomial([1, 0])") + self.assertEqual(str(v), "x + 1") + v = LFSRPolynomial([0, 1]) + self.assertEqual(repr(v), "LFSRPolynomial([1, 0])") + self.assertEqual(str(v), "x + 1") + v = LFSRPolynomial([1, 2]) + self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])") + 
self.assertEqual(str(v), "x^2 + x + 1") + v = LFSRPolynomial([2]) + self.assertEqual(repr(v), "LFSRPolynomial([2, 0])") + self.assertEqual(str(v), "x^2 + 1") + self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1") + + def test_lfsr_3(self): + module = LFSR(LFSR_POLY_3) + traces = [module.state, module.enable] + with Simulator(module, + vcd_file=open("Waveforms/test_LFSR2.vcd", "w"), + gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"), + traces=traces) as sim: + sim.add_clock(1e-6, 0.25e-6) + delay = Delay(1e-7) + + def async_process(): + yield module.enable.eq(0) + yield Tick() + self.assertEqual((yield module.state), 0x1) + yield Tick() + self.assertEqual((yield module.state), 0x1) + yield module.enable.eq(1) + yield Tick() + yield delay + self.assertEqual((yield module.state), 0x2) + yield Tick() + yield delay + self.assertEqual((yield module.state), 0x5) + yield Tick() + yield delay + self.assertEqual((yield module.state), 0x3) + yield Tick() + yield delay + self.assertEqual((yield module.state), 0x7) + yield Tick() + yield delay + self.assertEqual((yield module.state), 0x6) + yield Tick() + yield delay + self.assertEqual((yield module.state), 0x4) + yield Tick() + yield delay + self.assertEqual((yield module.state), 0x1) + yield Tick() + + sim.add_process(async_process) + sim.run() + diff --git a/src/soc/TLB/test/test_address_encoder.py b/src/soc/TLB/test/test_address_encoder.py new file mode 100644 index 00000000..0aad35b4 --- /dev/null +++ b/src/soc/TLB/test/test_address_encoder.py @@ -0,0 +1,105 @@ +from nmigen.compat.sim import run_simulation +from TLB.AddressEncoder import AddressEncoder +from TestUtil.test_helper import assert_eq, assert_ne, assert_op + + +# This function allows for the easy setting of values to the AddressEncoder +# Arguments: +# dut: The AddressEncoder being tested +# i (Input): The array of single bits to be written +def set_encoder(dut, i): + yield dut.i.eq(i) + yield + +# Checks the single match of the AddressEncoder +# Arguments: +# 
dut: The AddressEncoder being tested +# sm (Single Match): The expected match result +# op (Operation): (0 => ==), (1 => !=) +def check_single_match(dut, sm, op): + out_sm = yield dut.single_match + assert_op("Single Match", out_sm, sm, op) + +# Checks the multiple match of the AddressEncoder +# Arguments: +# dut: The AddressEncoder being tested +# mm (Multiple Match): The expected match result +# op (Operation): (0 => ==), (1 => !=) +def check_multiple_match(dut, mm, op): + out_mm = yield dut.multiple_match + assert_op("Multiple Match", out_mm, mm, op) + +# Checks the output of the AddressEncoder +# Arguments: +# dut: The AddressEncoder being tested +# o (Output): The expected output +# op (Operation): (0 => ==), (1 => !=) +def check_output(dut, o, op): + out_o = yield dut.o + assert_op("Output", out_o, o, op) + +# Checks the state of the AddressEncoder +# Arguments: +# dut: The AddressEncoder being tested +# sm (Single Match): The expected match result +# mm (Multiple Match): The expected match result +# o (Output): The expected output +# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) +# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) +# o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) +def check_all(dut, sm, mm, o, sm_op, mm_op, o_op): + yield from check_single_match(dut, sm, sm_op) + yield from check_multiple_match(dut, mm, mm_op) + yield from check_output(dut, o, o_op) + +def tbench(dut): + # Check invalid input + in_val = 0b000 + single_match = 0 + multiple_match = 0 + output = 0 + yield from set_encoder(dut, in_val) + yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) + + # Check single bit + in_val = 0b001 + single_match = 1 + multiple_match = 0 + output = 0 + yield from set_encoder(dut, in_val) + yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) + + # Check another single bit + in_val = 0b100 + single_match = 1 + multiple_match = 0 + 
output = 2 + yield from set_encoder(dut, in_val) + yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) + + # Check multiple match + # We expected the lowest bit to be returned which is address 0 + in_val = 0b101 + single_match = 0 + multiple_match = 1 + output = 0 + yield from set_encoder(dut, in_val) + yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) + + # Check another multiple match + # We expected the lowest bit to be returned which is address 1 + in_val = 0b110 + single_match = 0 + multiple_match = 1 + output = 1 + yield from set_encoder(dut, in_val) + yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0) + +def test_addr(): + dut = AddressEncoder(4) + run_simulation(dut, tbench(dut), + vcd_name="Waveforms/test_address_encoder.vcd") + print("AddressEncoder Unit Test Success") + +if __name__ == "__main__": + test_addr() diff --git a/src/soc/TLB/test/test_cam.py b/src/soc/TLB/test/test_cam.py new file mode 100644 index 00000000..f11c48ad --- /dev/null +++ b/src/soc/TLB/test/test_cam.py @@ -0,0 +1,206 @@ +from nmigen.compat.sim import run_simulation + +from TLB.Cam import Cam + +from TestUtil.test_helper import assert_eq, assert_ne, assert_op + +# This function allows for the easy setting of values to the Cam +# Arguments: +# dut: The Cam being tested +# e (Enable): Whether the block is going to be enabled +# we (Write Enable): Whether the Cam will write on the next cycle +# a (Address): Where the data will be written if write enable is high +# d (Data): Either what we are looking for or will write to the address +def set_cam(dut, e, we, a, d): + yield dut.enable.eq(e) + yield dut.write_enable.eq(we) + yield dut.address_in.eq(a) + yield dut.data_in.eq(d) + yield + +# Checks the multiple match of the Cam +# Arguments: +# dut: The Cam being tested +# mm (Multiple Match): The expected match result +# op (Operation): (0 => ==), (1 => !=) +def check_multiple_match(dut, mm, op): + out_mm = yield 
dut.multiple_match + assert_op("Multiple Match", out_mm, mm, op) + +# Checks the single match of the Cam +# Arguments: +# dut: The Cam being tested +# sm (Single Match): The expected match result +# op (Operation): (0 => ==), (1 => !=) +def check_single_match(dut, sm, op): + out_sm = yield dut.single_match + assert_op("Single Match", out_sm, sm, op) + +# Checks the address output of the Cam +# Arguments: +# dut: The Cam being tested +# ma (Match Address): The expected match result +# op (Operation): (0 => ==), (1 => !=) +def check_match_address(dut, ma, op): + out_ma = yield dut.match_address + assert_op("Match Address", out_ma, ma, op) + +# Checks the state of the Cam +# Arguments: +# dut: The Cam being tested +# sm (Single Match): The expected match result +# mm (Multiple Match): The expected match result +# ma: (Match Address): The expected address output +# ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) +# mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) +# ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=) +def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op): + yield from check_multiple_match(dut, mm, mm_op) + yield from check_single_match(dut, sm, sm_op) + yield from check_match_address(dut, ma, ma_op) + +def tbench(dut): + # NA + enable = 0 + write_enable = 0 + address = 0 + data = 0 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_single_match(dut, single_match, 0) + + # Read Miss Multiple + # Note that the default starting entry data bits are all 0 + enable = 1 + write_enable = 0 + address = 0 + data = 0 + multiple_match = 1 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_multiple_match(dut, multiple_match, 0) + + # Read Miss + # Note that the default starting entry data bits are all 0 + enable = 1 + write_enable = 0 + address = 0 + data = 1 + multiple_match = 0 
+ single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_single_match(dut, single_match, 0) + + # Write Entry 0 + enable = 1 + write_enable = 1 + address = 0 + data = 4 + multiple_match = 0 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_single_match(dut, single_match, 0) + + # Read Hit Entry 0 + enable = 1 + write_enable = 0 + address = 0 + data = 4 + multiple_match = 0 + single_match = 1 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0) + + # Search Hit + enable = 1 + write_enable = 0 + address = 0 + data = 4 + multiple_match = 0 + single_match = 1 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0) + + # Search Miss + enable = 1 + write_enable = 0 + address = 0 + data = 5 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_single_match(dut, single_match, 0) + + # Multiple Match test + # Write Entry 1 + enable = 1 + write_enable = 1 + address = 1 + data = 5 + multiple_match = 0 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_single_match(dut, single_match, 0) + + # Write Entry 2 + # Same data as Entry 1 + enable = 1 + write_enable = 1 + address = 2 + data = 5 + multiple_match = 0 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_single_match(dut, single_match, 0) + + # Read Hit Data 5 + enable = 1 + write_enable = 0 + address = 1 + data = 5 + multiple_match = 1 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_all(dut, multiple_match, single_match, address,0,0,0) + + # Verify read_warning is not caused + # Write Entry 0 + enable = 1 
+ write_enable = 1 + address = 0 + data = 7 + multiple_match = 0 + single_match = 0 + yield from set_cam(dut, enable, write_enable, address, data) + # Note there is no yield we immediately attempt to read in the next cycle + + # Read Hit Data 7 + enable = 1 + write_enable = 0 + address = 0 + data = 7 + multiple_match = 0 + single_match = 1 + yield from set_cam(dut, enable, write_enable, address, data) + yield + yield from check_single_match(dut, single_match, 0) + + yield + + +def test_cam(): + dut = Cam(4, 4) + run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd") + print("Cam Unit Test Success") + +if __name__ == "__main__": + test_cam() diff --git a/src/soc/TLB/test/test_cam_entry.py b/src/soc/TLB/test/test_cam_entry.py new file mode 100644 index 00000000..43b699d2 --- /dev/null +++ b/src/soc/TLB/test/test_cam_entry.py @@ -0,0 +1,110 @@ +from nmigen.compat.sim import run_simulation + +from TestUtil.test_helper import assert_eq, assert_ne, assert_op +from TLB.CamEntry import CamEntry + +# This function allows for the easy setting of values to the Cam Entry +# Arguments: +# dut: The CamEntry being tested +# c (command): NA (0), Read (1), Write (2), Reserve (3) +# d (data): The data to be set +def set_cam_entry(dut, c, d): + # Write desired values + yield dut.command.eq(c) + yield dut.data_in.eq(d) + yield + # Reset all lines + yield dut.command.eq(0) + yield dut.data_in.eq(0) + yield + +# Checks the data state of the CAM entry +# Arguments: +# dut: The CamEntry being tested +# d (Data): The expected data +# op (Operation): (0 => ==), (1 => !=) +def check_data(dut, d, op): + out_d = yield dut.data + assert_op("Data", out_d, d, op) + +# Checks the match state of the CAM entry +# Arguments: +# dut: The CamEntry being tested +# m (Match): The expected match +# op (Operation): (0 => ==), (1 => !=) +def check_match(dut, m, op): + out_m = yield dut.match + assert_op("Match", out_m, m, op) + +# Checks the state of the CAM entry +# Arguments: +# dut: The 
CamEntry being tested +# d (data): The expected data +# m (match): The expected match +# d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=) +# m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=) +def check_all(dut, d, m, d_op, m_op): + yield from check_data(dut, d, d_op) + yield from check_match(dut, m, m_op) + +# This tbench goes through the paces of testing the CamEntry module +# It is done by writing and then reading various combinations of key/data pairs +# and reading the results with varying keys to verify the resulting stored +# data is correct. +def tbench(dut): + # Check write + command = 2 + data = 1 + match = 0 + yield from set_cam_entry(dut, command, data) + yield from check_all(dut, data, match, 0, 0) + + # Check read miss + command = 1 + data = 2 + match = 0 + yield from set_cam_entry(dut, command, data) + yield from check_all(dut, data, match, 1, 0) + + # Check read hit + command = 1 + data = 1 + match = 1 + yield from set_cam_entry(dut, command, data) + yield from check_all(dut, data, match, 0, 0) + + # Check overwrite + command = 2 + data = 5 + match = 0 + yield from set_cam_entry(dut, command, data) + yield + yield from check_all(dut, data, match, 0, 0) + + # Check read hit + command = 1 + data = 5 + match = 1 + yield from set_cam_entry(dut, command, data) + yield from check_all(dut, data, match, 0, 0) + + # Check reset + command = 3 + data = 0 + match = 0 + yield from set_cam_entry(dut, command, data) + yield from check_all(dut, data, match, 0, 0) + + # Extra clock cycle for waveform + yield + + +def test_camentry(): + dut = CamEntry(4) + run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd") + print("CamEntry Unit Test Success") + + +if __name__ == "__main__": + test_camentry() + diff --git a/src/soc/TLB/test/test_permission_validator.py b/src/soc/TLB/test/test_permission_validator.py new file mode 100644 index 00000000..81873d79 --- /dev/null +++ 
b/src/soc/TLB/test/test_permission_validator.py @@ -0,0 +1,146 @@ +from nmigen.compat.sim import run_simulation + +from TLB.PermissionValidator import PermissionValidator + +from TestUtil.test_helper import assert_op + + +def set_validator(dut, d, xwr, sm, sa, asid): + yield dut.data.eq(d) + yield dut.xwr.eq(xwr) + yield dut.super_mode.eq(sm) + yield dut.super_access.eq(sa) + yield dut.asid.eq(asid) + yield + +def check_valid(dut, v, op): + out_v = yield dut.valid + assert_op("Valid", out_v, v, op) + +def tbench(dut): + # 80 bits represented. Ignore the MSB as it will be truncated + # ASID is bits first 4 hex values (bits 64 - 78) + + # Test user mode entry valid + # Global Bit matching ASID + # Ensure that user mode and valid is enabled! + data = 0x7FFF0000000000000031 + # Ignore MSB it will be truncated + asid = 0x7FFF + super_mode = 0 + super_access = 0 + xwr = 0 + valid = 1 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test user mode entry valid + # Global Bit nonmatching ASID + # Ensure that user mode and valid is enabled! + data = 0x7FFF0000000000000031 + # Ignore MSB it will be truncated + asid = 0x7FF6 + super_mode = 0 + super_access = 0 + xwr = 0 + valid = 1 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test user mode entry invalid + # Global Bit nonmatching ASID + # Ensure that user mode and valid is enabled! + data = 0x7FFF0000000000000021 + # Ignore MSB it will be truncated + asid = 0x7FF6 + super_mode = 0 + super_access = 0 + xwr = 0 + valid = 0 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test user mode entry valid + # Ensure that user mode and valid is enabled! 
+ data = 0x7FFF0000000000000011 + # Ignore MSB it will be truncated + asid = 0x7FFF + super_mode = 0 + super_access = 0 + xwr = 0 + valid = 1 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test user mode entry invalid + # Ensure that user mode and valid is enabled! + data = 0x7FFF0000000000000011 + # Ignore MSB it will be truncated + asid = 0x7FF6 + super_mode = 0 + super_access = 0 + xwr = 0 + valid = 0 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test supervisor mode entry valid + # The entry is NOT in user mode + # Ensure that user mode and valid is enabled! + data = 0x7FFF0000000000000001 + # Ignore MSB it will be truncated + asid = 0x7FFF + super_mode = 1 + super_access = 0 + xwr = 0 + valid = 1 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test supervisor mode entry invalid + # The entry is in user mode + # Ensure that user mode and valid is enabled! + data = 0x7FFF0000000000000011 + # Ignore MSB it will be truncated + asid = 0x7FFF + super_mode = 1 + super_access = 0 + xwr = 0 + valid = 0 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test supervisor mode entry valid + # The entry is NOT in user mode with access + # Ensure that user mode and valid is enabled! + data = 0x7FFF0000000000000001 + # Ignore MSB it will be truncated + asid = 0x7FFF + super_mode = 1 + super_access = 1 + xwr = 0 + valid = 1 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + # Test supervisor mode entry valid + # The entry is in user mode with access + # Ensure that user mode and valid is enabled! 
+ data = 0x7FFF0000000000000011 + # Ignore MSB it will be truncated + asid = 0x7FFF + super_mode = 1 + super_access = 1 + xwr = 0 + valid = 1 + yield from set_validator(dut, data, xwr, super_mode, super_access, asid) + yield from check_valid(dut, valid, 0) + + +def test_permv(): + dut = PermissionValidator(15, 64); + run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_permission_validator.vcd") + print("PermissionValidator Unit Test Success") + +if __name__ == "__main__": + test_permv() diff --git a/src/soc/TLB/test/test_pte_entry.py b/src/soc/TLB/test/test_pte_entry.py new file mode 100644 index 00000000..5c0c34dc --- /dev/null +++ b/src/soc/TLB/test/test_pte_entry.py @@ -0,0 +1,102 @@ +from nmigen.compat.sim import run_simulation + +from TLB.PteEntry import PteEntry + +from TestUtil.test_helper import assert_op + +def set_entry(dut, i): + yield dut.i.eq(i) + yield + +def check_dirty(dut, d, op): + out_d = yield dut.d + assert_op("Dirty", out_d, d, op) + +def check_accessed(dut, a, op): + out_a = yield dut.a + assert_op("Accessed", out_a, a, op) + +def check_global(dut, o, op): + out = yield dut.g + assert_op("Global", out, o, op) + +def check_user(dut, o, op): + out = yield dut.u + assert_op("User Mode", out, o, op) + +def check_xwr(dut, o, op): + out = yield dut.xwr + assert_op("XWR", out, o, op) + +def check_asid(dut, o, op): + out = yield dut.asid + assert_op("ASID", out, o, op) + +def check_pte(dut, o, op): + out = yield dut.pte + assert_op("ASID", out, o, op) + +def check_valid(dut, v, op): + out_v = yield dut.v + assert_op("Valid", out_v, v, op) + +def check_all(dut, d, a, g, u, xwr, v, asid, pte): + yield from check_dirty(dut, d, 0) + yield from check_accessed(dut, a, 0) + yield from check_global(dut, g, 0) + yield from check_user(dut, u, 0) + yield from check_xwr(dut, xwr, 0) + yield from check_asid(dut, asid, 0) + yield from check_pte(dut, pte, 0) + yield from check_valid(dut, v, 0) + +def tbench(dut): + # 80 bits represented. 
Ignore the MSB as it will be truncated + # ASID is bits first 4 hex values (bits 64 - 78) + + i = 0x7FFF0000000000000031 + dirty = 0 + access = 0 + glob = 1 + user = 1 + xwr = 0 + valid = 1 + asid = 0x7FFF + pte = 0x0000000000000031 + yield from set_entry(dut, i) + yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte) + + i = 0x0FFF00000000000000FF + dirty = 1 + access = 1 + glob = 1 + user = 1 + xwr = 7 + valid = 1 + asid = 0x0FFF + pte = 0x00000000000000FF + yield from set_entry(dut, i) + yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte) + + i = 0x0721000000001100001F + dirty = 0 + access = 0 + glob = 0 + user = 1 + xwr = 7 + valid = 1 + asid = 0x0721 + pte = 0x000000001100001F + yield from set_entry(dut, i) + yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte) + + yield + + +def test_pteentry(): + dut = PteEntry(15, 64); + run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd") + print("PteEntry Unit Test Success") + +if __name__ == "__main__": + test_pteentry() diff --git a/src/soc/TLB/test/test_set_associative_cache.py b/src/soc/TLB/test/test_set_associative_cache.py new file mode 100644 index 00000000..0641b556 --- /dev/null +++ b/src/soc/TLB/test/test_set_associative_cache.py @@ -0,0 +1,38 @@ +from nmigen.compat.sim import run_simulation + +from TLB.SetAssociativeCache import SetAssociativeCache + +from TestUtil.test_helper import assert_eq, assert_ne, assert_op + +def set_sac(dut, e, c, s, t, d): + yield dut.enable.eq(e) + yield dut.command.eq(c) + yield dut.cset.eq(s) + yield dut.tag.eq(t) + yield dut.data_i.eq(d) + yield + +def tbench(dut): + enable = 1 + command = 2 + cset = 1 + tag = 2 + data = 3 + yield from set_sac(dut, enable, command, cset, tag, data) + yield + + enable = 1 + command = 2 + cset = 1 + tag = 5 + data = 8 + yield from set_sac(dut, enable, command, cset, tag, data) + yield + +def test_assoc_cache(): + dut = SetAssociativeCache(4, 4, 4, 4) + 
run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_set_associative_cache.vcd") + print("Set Associative Cache Unit Test Success") + +if __name__ == "__main__": + test_assoc_cache() diff --git a/src/soc/TLB/test/test_tlb.py b/src/soc/TLB/test/test_tlb.py new file mode 100644 index 00000000..e9cc9d69 --- /dev/null +++ b/src/soc/TLB/test/test_tlb.py @@ -0,0 +1,80 @@ +#import tracemalloc +#tracemalloc.start() + +from nmigen.compat.sim import run_simulation + +from TLB.TLB import TLB + +from TestUtil.test_helper import assert_op, assert_eq + +#self.supermode = Signal(1) # Supervisor Mode +#self.super_access = Signal(1) # Supervisor Access +#self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2 +#self.xwr = Signal(3) # Execute, Write, Read +#self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64 +#self.address_L1 = Signal(max=L1_size) +#self.asid = Signal(asid_size) # Address Space IDentifier (ASID) +#self.vma = Signal(vma_size) # Virtual Memory Address (VMA) +#self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE) +# +#self.hit = Signal(1) # Denotes if the VMA had a mapped PTE +#self.perm_valid = Signal(1) # Denotes if the permissions are correct +#self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA + +COMMAND_READ=1 +COMMAND_WRITE_L1=2 + +# Checks the data state of the CAM entry +# Arguments: +# dut: The CamEntry being tested +# d (Data): The expected data +# op (Operation): (0 => ==), (1 => !=) +def check_hit(dut, d): + hit_d = yield dut.hit + #assert_eq("hit", hit_d, d) + +def test_command(dut,cmd,xwr,cycles): + yield dut.command.eq(cmd) + yield dut.xwr.eq(xwr) + for i in range(0,cycles): + yield + +def test_write_L1(dut,vma,address_L1,asid,pte_in): + yield dut.address_L1.eq(address_L1) + yield dut.asid.eq(asid) + yield dut.vma.eq(vma) + yield dut.pte_in.eq(pte_in) + yield from test_command(dut,COMMAND_WRITE_L1,7,2) + +def test_search(dut,vma,found): + yield dut.vma.eq(vma) + yield from 
test_command(dut,COMMAND_READ,7,1) + yield from check_hit(dut,found) + +def zero(dut): + yield dut.supermode.eq(0) + yield dut.super_access.eq(0) + yield dut.mode.eq(0) + yield dut.address_L1.eq(0) + yield dut.asid.eq(0) + yield dut.vma.eq(0) + yield dut.pte_in.eq(0) + +def tbench(dut): + yield from zero(dut) + yield dut.mode.eq(0xF) # enable TLB + #test hit + yield from test_write_L1(dut,0xFEEDFACE,0,0xFFFF,0xF0F0) + yield from test_search(dut,0xFEEDFACE,1) + yield from test_search(dut,0xFACEFEED,0) + + + + +def test_tlb(): + dut = TLB(15,36,64,8) + run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd") + print("TLB Unit Test Success") + +if __name__ == "__main__": + test_tlb() diff --git a/src/soc/TestUtil/test_helper.py b/src/soc/TestUtil/test_helper.py new file mode 100644 index 00000000..c42990d6 --- /dev/null +++ b/src/soc/TestUtil/test_helper.py @@ -0,0 +1,30 @@ +def assert_op(pre, o, e, op): + """ Verifies the given values given the particular operand + Arguments: + p (Prefix): Appended to the front of the assert statement + e (Expected): The expected value + o (Output): The output result + op (Operation): (0 => ==), (1 => !=) + """ + if op == 0: + assert_eq(pre, o, e) + else: + assert_ne(pre, o, e) + +def assert_eq(p, o, e): + """ Verifies the given values are equal + Arguments: + p (Prefix): Appended to the front of the assert statement + e (Expected): The expected value + o (Output): The output result + """ + assert o == e, p + " Output " + str(o) + " Expected " + str(e) + +def assert_ne(p, o, e): + """ Verifies the given values are not equal + Arguments: + p (Prefix): Appended to the front of the assert statement + e (Expected): The expected value + o (Output): The output result + """ + assert o != e, p + " Output " + str(o) + " Not Expecting " + str(e) diff --git a/src/soc/decoder/.gitignore b/src/soc/decoder/.gitignore new file mode 100644 index 00000000..afed0735 --- /dev/null +++ b/src/soc/decoder/.gitignore @@ -0,0 +1 @@ +*.csv diff 
--git a/src/soc/decoder/power_decoder.py b/src/soc/decoder/power_decoder.py new file mode 100644 index 00000000..5b5e7103 --- /dev/null +++ b/src/soc/decoder/power_decoder.py @@ -0,0 +1,275 @@ +"""Cascading Power ISA Decoder + +This module uses CSV tables in a hierarchical/peer cascading fashion, +to create a multi-level instruction decoder by recognising appropriate +patterns. The output is a flattened (1-level) series of fields suitable +for a simple RISC engine. + +This is based on Anton Blanchard's excellent microwatt work: +https://github.com/antonblanchard/microwatt/blob/master/decode1.vhdl + +The basic principle is that the python code does the heavy lifting +(reading the CSV files, constructing the hierarchy), creating the HDL +AST with for-loops generating switch-case statements. + +PowerDecoder takes a *list* of CSV files with an associated bit-range +that it is requested to match against the "opcode" row of the CSV file. +This pattern can be either an integer, a binary number, *or* a wildcard +nmigen Case pattern of the form "001--1-100". + +Subdecoders are *additional* cases with further decoding. The "pattern" +argument is specified as one of the Case statements (a peer of the opcode +row in the CSV file), and thus further fields of the opcode may be decoded +giving increasing levels of detail. + +Top Level: + + [ (extra.csv: bit-fields entire 32-bit range + opcode -> matches + 000000---------------01000000000 -> ILLEGAL instruction + 01100000000000000000000000000000 -> SIM_CONFIG instruction + ................................ -> + ), + (major.csv: first 6 bits ONLY + opcode -> matches + 001100 -> ALU,OP_ADD (add) + 001101 -> ALU,OP_ADD (another type of add) + ...... -> ... + ...... -> ... + subdecoders: + 001011 this must match *MAJOR*.CSV + [ (minor_19.csv: bits 21 through 30 inclusive: + opcode -> matches + 0b0000000000 -> ALU,OP_MCRF + ............ -> .... 
+ ), + (minor_19_00000.csv: bits 21 through 25 inclusive: + opcode -> matches + 0b00010 -> ALU,add_pcis + ) + ] + ), + ] + +""" + +from nmigen import Module, Elaboratable, Signal +from nmigen.cli import rtlil +from power_enums import (Function, Form, InternalOp, In1Sel, In2Sel, In3Sel, + OutSel, RC, LdstLen, CryIn, get_csv, single_bit_flags, + get_signal_name, default_values) +from collections import namedtuple +from power_fields import DecodeFields +from power_fieldsn import SigDecode, SignalBitRange + +Subdecoder = namedtuple("Subdecoder", ["pattern", "opcodes", "opint", + "bitsel", "suffix", "subdecoders"]) + + +class PowerOp: + """PowerOp: spec for execution. op type (ADD etc.) reg specs etc. + """ + + def __init__(self): + self.function_unit = Signal(Function, reset_less=True) + self.internal_op = Signal(InternalOp, reset_less=True) + self.form = Signal(Form, reset_less=True) + self.in1_sel = Signal(In1Sel, reset_less=True) + self.in2_sel = Signal(In2Sel, reset_less=True) + self.in3_sel = Signal(In3Sel, reset_less=True) + self.out_sel = Signal(OutSel, reset_less=True) + self.ldst_len = Signal(LdstLen, reset_less=True) + self.rc_sel = Signal(RC, reset_less=True) + self.cry_in = Signal(CryIn, reset_less=True) + for bit in single_bit_flags: + name = get_signal_name(bit) + setattr(self, name, Signal(reset_less=True, name=name)) + + def _eq(self, row=None): + if row is None: + row = default_values + res = [self.function_unit.eq(Function[row['unit']]), + self.form.eq(Form[row['form']]), + self.internal_op.eq(InternalOp[row['internal op']]), + self.in1_sel.eq(In1Sel[row['in1']]), + self.in2_sel.eq(In2Sel[row['in2']]), + self.in3_sel.eq(In3Sel[row['in3']]), + self.out_sel.eq(OutSel[row['out']]), + self.ldst_len.eq(LdstLen[row['ldst len']]), + self.rc_sel.eq(RC[row['rc']]), + self.cry_in.eq(CryIn[row['cry in']]), + ] + for bit in single_bit_flags: + sig = getattr(self, get_signal_name(bit)) + res.append(sig.eq(int(row.get(bit, 0)))) + return res + + def eq(self, 
otherop): + res = [self.function_unit.eq(otherop.function_unit), + self.form.eq(otherop.form), + self.internal_op.eq(otherop.internal_op), + self.in1_sel.eq(otherop.in1_sel), + self.in2_sel.eq(otherop.in2_sel), + self.in3_sel.eq(otherop.in3_sel), + self.out_sel.eq(otherop.out_sel), + self.rc_sel.eq(otherop.rc_sel), + self.ldst_len.eq(otherop.ldst_len), + self.cry_in.eq(otherop.cry_in)] + for bit in single_bit_flags: + sig = getattr(self, get_signal_name(bit)) + res.append(sig.eq(getattr(otherop, get_signal_name(bit)))) + return res + + def ports(self): + regular = [self.function_unit, + self.in1_sel, + self.in2_sel, + self.in3_sel, + self.out_sel, + self.ldst_len, + self.rc_sel, + self.internal_op, + self.form] + single_bit_ports = [getattr(self, get_signal_name(x)) + for x in single_bit_flags] + return regular + single_bit_ports + + +class PowerDecoder(Elaboratable): + """PowerDecoder - decodes an incoming opcode into the type of operation + """ + + def __init__(self, width, dec): + if not isinstance(dec, list): + dec = [dec] + self.dec = dec + self.opcode_in = Signal(width, reset_less=True) + + self.op = PowerOp() + for d in dec: + if d.suffix is not None and d.suffix >= width: + d.suffix = None + self.width = width + + def suffix_mask(self, d): + return ((1 << d.suffix) - 1) + + def divide_opcodes(self, d): + divided = {} + mask = self.suffix_mask(d) + print("mask", hex(mask)) + for row in d.opcodes: + opcode = row['opcode'] + if d.opint and '-' not in opcode: + opcode = int(opcode, 0) + key = opcode & mask + opcode = opcode >> d.suffix + if key not in divided: + divided[key] = [] + r = row.copy() + r['opcode'] = opcode + divided[key].append(r) + return divided + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # note: default opcode is "illegal" as this is a combinatorial block + + # go through the list of CSV decoders first + for d in self.dec: + opcode_switch = Signal(d.bitsel[1] - d.bitsel[0], + reset_less=True) + comb += 
opcode_switch.eq(self.opcode_in[d.bitsel[0]:d.bitsel[1]]) + if d.suffix: + opcodes = self.divide_opcodes(d) + opc_in = Signal(d.suffix, reset_less=True) + comb += opc_in.eq(opcode_switch[:d.suffix]) + with m.Switch(opc_in): + for key, row in opcodes.items(): + bitsel = (d.suffix+d.bitsel[0], d.bitsel[1]) + sd = Subdecoder(pattern=None, opcodes=row, + bitsel=bitsel, suffix=None, + opint=False, subdecoders=[]) + subdecoder = PowerDecoder(width=32, dec=sd) + setattr(m.submodules, "dec_sub%d" % key, subdecoder) + comb += subdecoder.opcode_in.eq(self.opcode_in) + with m.Case(key): + comb += self.op.eq(subdecoder.op) + else: + # TODO: arguments, here (all of them) need to be a list. + # a for-loop around the *list* of decoder args. + with m.Switch(opcode_switch): + self.handle_subdecoders(m, d) + for row in d.opcodes: + opcode = row['opcode'] + if d.opint and '-' not in opcode: + opcode = int(opcode, 0) + if not row['unit']: + continue + with m.Case(opcode): + comb += self.op._eq(row) + return m + + def handle_subdecoders(self, m, d): + for dec in d.subdecoders: + subdecoder = PowerDecoder(self.width, dec) + if isinstance(dec, list): # XXX HACK: take first pattern + dec = dec[0] + setattr(m.submodules, "dec%d" % dec.pattern, subdecoder) + m.d.comb += subdecoder.opcode_in.eq(self.opcode_in) + with m.Case(dec.pattern): + m.d.comb += self.op.eq(subdecoder.op) + + def ports(self): + return [self.opcode_in] + self.op.ports() + + +class TopPowerDecoder(PowerDecoder, DecodeFields): + + def __init__(self, width, dec): + PowerDecoder.__init__(self, width, dec) + DecodeFields.__init__(self, SignalBitRange, [self.opcode_in]) + self.create_specs() + + +def create_pdecode(): + + # minor 19 has extra patterns + m19 = [] + m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19.csv"), + opint=True, bitsel=(1, 11), suffix=None, subdecoders=[])) + m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19_00000.csv"), + opint=True, bitsel=(1, 6), suffix=None, subdecoders=[])) + + # 
minor opcodes. + pminor = [ + m19, + Subdecoder(pattern=30, opcodes=get_csv("minor_30.csv"), + opint=True, bitsel=(1, 6), suffix=None, subdecoders=[]), + Subdecoder(pattern=31, opcodes=get_csv("minor_31.csv"), + opint=True, bitsel=(1, 11), suffix=0b00101, subdecoders=[]), + Subdecoder(pattern=58, opcodes=get_csv("minor_58.csv"), + opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]), + Subdecoder(pattern=62, opcodes=get_csv("minor_62.csv"), + opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]), + ] + + # top level: extra merged with major + dec = [] + opcodes = get_csv("major.csv") + dec.append(Subdecoder(pattern=None, opint=True, opcodes=opcodes, + bitsel=(26, 32), suffix=None, subdecoders=pminor)) + opcodes = get_csv("extra.csv") + dec.append(Subdecoder(pattern=None, opint=False, opcodes=opcodes, + bitsel=(0, 32), suffix=None, subdecoders=[])) + + return TopPowerDecoder(32, dec) + + +if __name__ == '__main__': + pdecode = create_pdecode() + vl = rtlil.convert(pdecode, ports=pdecode.ports()) + with open("decoder.il", "w") as f: + f.write(vl) diff --git a/src/soc/decoder/power_decoder2.py b/src/soc/decoder/power_decoder2.py new file mode 100644 index 00000000..1b7435a0 --- /dev/null +++ b/src/soc/decoder/power_decoder2.py @@ -0,0 +1,429 @@ +"""Power ISA Decoder second stage + +based on Anton Blanchard microwatt decode2.vhdl + +""" +from nmigen import Module, Elaboratable, Signal, Mux, Const +from nmigen.cli import rtlil + +from power_decoder import create_pdecode +from power_enums import (InternalOp, CryIn, Function, LdstLen, + In1Sel, In2Sel, In3Sel, OutSel, SPR, RC) + + +class DecodeA(Elaboratable): + """DecodeA from instruction + + decodes register RA, whether immediate-zero, implicit and + explicit CSRs + """ + + def __init__(self, dec): + self.dec = dec + self.sel_in = Signal(In1Sel, reset_less=True) + self.insn_in = Signal(32, reset_less=True) + self.reg_out = Data(5, name="reg_a") + self.immz_out = Signal(reset_less=True) + self.spr_out = Data(10, 
"spr_a") + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # select Register A field + with m.If((self.sel_in == In1Sel.RA) | + ((self.sel_in == In1Sel.RA_OR_ZERO) & + (self.reg_out.data != Const(0, 5)))): + comb += self.reg_out.data.eq(self.dec.RA[0:-1]) + comb += self.reg_out.ok.eq(1) + + # zero immediate requested + with m.If((self.sel_in == In1Sel.RA_OR_ZERO) & + (self.reg_out.data == Const(0, 5))): + comb += self.immz_out.eq(1) + + # decode SPR1 based on instruction type + op = self.dec.op + # BC or BCREG: potential implicit register (CTR) + with m.If((op.internal_op == InternalOp.OP_BC) | + (op.internal_op == InternalOp.OP_BCREG)): + with m.If(~self.dec.BO[2]): # 3.0B p38 BO2=0, use CTR reg + comb += self.spr_out.data.eq(SPR.CTR) # constant: CTR + comb += self.spr_out.ok.eq(1) + # MFSPR or MTSPR: move-from / move-to SPRs + with m.If((op.internal_op == InternalOp.OP_MFSPR) | + (op.internal_op == InternalOp.OP_MTSPR)): + comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # SPR field, XFX + comb += self.spr_out.ok.eq(1) + + return m + +class Data: + + def __init__(self, width, name): + + self.data = Signal(width, name=name, reset_less=True) + self.ok = Signal(name="%s_ok" % name, reset_less=True) + + def eq(self, rhs): + return [self.data.eq(rhs.data), + self.ok.eq(rhs.ok)] + + def ports(self): + return [self.data, self.ok] + + +class DecodeB(Elaboratable): + """DecodeB from instruction + + decodes register RB, different forms of immediate (signed, unsigned), + and implicit SPRs + """ + + def __init__(self, dec): + self.dec = dec + self.sel_in = Signal(In2Sel, reset_less=True) + self.insn_in = Signal(32, reset_less=True) + self.reg_out = Data(5, "reg_b") + self.imm_out = Data(64, "imm_b") + self.spr_out = Data(10, "spr_b") + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # select Register B field + with m.Switch(self.sel_in): + with m.Case(In2Sel.RB): + comb += self.reg_out.data.eq(self.dec.RB[0:-1]) + comb += 
self.reg_out.ok.eq(1) + with m.Case(In2Sel.CONST_UI): + comb += self.imm_out.data.eq(self.dec.UI[0:-1]) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_SI): # TODO: sign-extend here? + comb += self.imm_out.data.eq(self.dec.SI[0:-1]) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_UI_HI): + comb += self.imm_out.data.eq(self.dec.UI[0:-1]<<4) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_SI_HI): # TODO: sign-extend here? + comb += self.imm_out.data.eq(self.dec.SI[0:-1]<<4) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_LI): + comb += self.imm_out.data.eq(self.dec.LI[0:-1]<<2) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_BD): + comb += self.imm_out.data.eq(self.dec.BD[0:-1]<<2) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_DS): + comb += self.imm_out.data.eq(self.dec.DS[0:-1]<<2) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_M1): + comb += self.imm_out.data.eq(~Const(0, 64)) # all 1s + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_SH): + comb += self.imm_out.data.eq(self.dec.sh[0:-1]) + comb += self.imm_out.ok.eq(1) + with m.Case(In2Sel.CONST_SH32): + comb += self.imm_out.data.eq(self.dec.SH32[0:-1]) + comb += self.imm_out.ok.eq(1) + + # decode SPR2 based on instruction type + op = self.dec.op + # BCREG implicitly uses CTR or LR for 2nd reg + with m.If(op.internal_op == InternalOp.OP_BCREG): + with m.If(self.dec.FormXL.XO[9]): # 3.0B p38 top bit of XO + comb += self.spr_out.data.eq(SPR.CTR) + with m.Else(): + comb += self.spr_out.data.eq(SPR.LR) + comb += self.spr_out.ok.eq(1) + + return m + + +class DecodeC(Elaboratable): + """DecodeC from instruction + + decodes register RC + """ + + def __init__(self, dec): + self.dec = dec + self.sel_in = Signal(In3Sel, reset_less=True) + self.insn_in = Signal(32, reset_less=True) + self.reg_out = Data(5, "reg_c") + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # select Register C field + with m.If(self.sel_in 
== In3Sel.RS): + comb += self.reg_out.data.eq(self.dec.RS[0:-1]) + comb += self.reg_out.ok.eq(1) + + return m + + +class DecodeOut(Elaboratable): + """DecodeOut from instruction + + decodes output register RA, RT or SPR + """ + + def __init__(self, dec): + self.dec = dec + self.sel_in = Signal(OutSel, reset_less=True) + self.insn_in = Signal(32, reset_less=True) + self.reg_out = Data(5, "reg_o") + self.spr_out = Data(10, "spr_o") + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # select Register out field + with m.Switch(self.sel_in): + with m.Case(OutSel.RT): + comb += self.reg_out.data.eq(self.dec.RT[0:-1]) + comb += self.reg_out.ok.eq(1) + with m.Case(OutSel.RA): + comb += self.reg_out.data.eq(self.dec.RA[0:-1]) + comb += self.reg_out.ok.eq(1) + with m.Case(OutSel.SPR): + comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # from XFX + comb += self.spr_out.ok.eq(1) + + return m + + +class DecodeRC(Elaboratable): + """DecodeRc from instruction + + decodes Record bit Rc + """ + def __init__(self, dec): + self.dec = dec + self.sel_in = Signal(RC, reset_less=True) + self.insn_in = Signal(32, reset_less=True) + self.rc_out = Data(1, "rc") + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # select Record bit out field + with m.Switch(self.sel_in): + with m.Case(RC.RC): + comb += self.rc_out.data.eq(self.dec.Rc[0:-1]) + comb += self.rc_out.ok.eq(1) + with m.Case(RC.ONE): + comb += self.rc_out.data.eq(1) + comb += self.rc_out.ok.eq(1) + with m.Case(RC.NONE): + comb += self.rc_out.data.eq(0) + comb += self.rc_out.ok.eq(1) + + return m + + +class DecodeOE(Elaboratable): + """DecodeOE from instruction + + decodes OE field: uses RC decode detection which might not be good + + -- For now, use "rc" in the decode table to decide whether oe exists. + -- This is not entirely correct architecturally: For mulhd and + -- mulhdu, the OE field is reserved. 
It remains to be seen what an + -- actual POWER9 does if we set it on those instructions, for now we + -- test that further down when assigning to the multiplier oe input. + """ + def __init__(self, dec): + self.dec = dec + self.sel_in = Signal(RC, reset_less=True) + self.insn_in = Signal(32, reset_less=True) + self.oe_out = Data(1, "oe") + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # select OE bit out field + with m.Switch(self.sel_in): + with m.Case(RC.RC): + comb += self.oe_out.data.eq(self.dec.OE[0:-1]) + comb += self.oe_out.ok.eq(1) + + return m + + +class XerBits: + def __init__(self): + self.ca = Signal(reset_less=True) + self.ca32 = Signal(reset_less=True) + self.ov = Signal(reset_less=True) + self.ov32 = Signal(reset_less=True) + self.so = Signal(reset_less=True) + + def ports(self): + return [self.ca, self.ca32, self.ov, self.ov32, self.so, ] + + +class Decode2ToExecute1Type: + + def __init__(self): + + self.valid = Signal(reset_less=True) + self.insn_type = Signal(InternalOp, reset_less=True) + self.nia = Signal(64, reset_less=True) + self.write_reg = Data(5, name="rego") + self.read_reg1 = Data(5, name="reg1") + self.read_reg2 = Data(5, name="reg2") + self.read_reg3 = Data(5, name="reg3") + self.imm_data = Data(64, name="imm") + self.write_spr = Data(10, name="spro") + self.read_spr1 = Data(10, name="spr1") + self.read_spr2 = Data(10, name="spr2") + #self.read_data1 = Signal(64, reset_less=True) + #self.read_data2 = Signal(64, reset_less=True) + #self.read_data3 = Signal(64, reset_less=True) + #self.cr = Signal(32, reset_less=True) # NO: this is from the CR SPR + #self.xerc = XerBits() # NO: this is from the XER SPR + self.lk = Signal(reset_less=True) + self.rc = Data(1, "rc") + self.oe = Data(1, "oe") + self.invert_a = Signal(reset_less=True) + self.invert_out = Signal(reset_less=True) + self.input_carry = Signal(CryIn, reset_less=True) + self.output_carry = Signal(reset_less=True) + self.input_cr = Signal(reset_less=True) + 
self.output_cr = Signal(reset_less=True) + self.is_32bit = Signal(reset_less=True) + self.is_signed = Signal(reset_less=True) + self.insn = Signal(32, reset_less=True) + self.data_len = Signal(4, reset_less=True) # bytes + self.byte_reverse = Signal(reset_less=True) + self.sign_extend = Signal(reset_less=True)# do we need this? + self.update = Signal(reset_less=True) # is this an update instruction? + + def ports(self): + return [self.valid, self.insn_type, self.nia, + #self.read_data1, self.read_data2, self.read_data3, + #self.cr, + self.lk, + self.invert_a, self.invert_out, + self.input_carry, self.output_carry, + self.input_cr, self.output_cr, + self.is_32bit, self.is_signed, + self.insn, + self.data_len, self.byte_reverse , self.sign_extend , + self.update] + \ + self.oe.ports() + \ + self.rc.ports() + \ + self.write_spr.ports() + \ + self.read_spr1.ports() + \ + self.read_spr2.ports() + \ + self.write_reg.ports() + \ + self.read_reg1.ports() + \ + self.read_reg2.ports() + \ + self.read_reg3.ports() + \ + self.imm_data.ports() + # + self.xerc.ports() + +class PowerDecode2(Elaboratable): + + def __init__(self, dec): + + self.dec = dec + self.e = Decode2ToExecute1Type() + + def ports(self): + return self.dec.ports() + self.e.ports() + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # set up submodule decoders + m.submodules.dec = self.dec + m.submodules.dec_a = dec_a = DecodeA(self.dec) + m.submodules.dec_b = dec_b = DecodeB(self.dec) + m.submodules.dec_c = dec_c = DecodeC(self.dec) + m.submodules.dec_o = dec_o = DecodeOut(self.dec) + m.submodules.dec_rc = dec_rc = DecodeRC(self.dec) + m.submodules.dec_oe = dec_oe = DecodeOE(self.dec) + + # copy instruction through... 
+ for i in [self.e.insn, dec_a.insn_in, dec_b.insn_in, + dec_c.insn_in, dec_o.insn_in, dec_rc.insn_in, + dec_oe.insn_in]: + comb += i.eq(self.dec.opcode_in) + + # ...and subdecoders' input fields + comb += dec_a.sel_in.eq(self.dec.op.in1_sel) + comb += dec_b.sel_in.eq(self.dec.op.in2_sel) + comb += dec_c.sel_in.eq(self.dec.op.in3_sel) + comb += dec_o.sel_in.eq(self.dec.op.out_sel) + comb += dec_rc.sel_in.eq(self.dec.op.rc_sel) + comb += dec_oe.sel_in.eq(self.dec.op.rc_sel) # XXX should be OE sel + + # decode LD/ST length + with m.Switch(self.dec.op.ldst_len): + with m.Case(LdstLen.is1B): + comb += self.e.data_len.eq(1) + with m.Case(LdstLen.is2B): + comb += self.e.data_len.eq(2) + with m.Case(LdstLen.is4B): + comb += self.e.data_len.eq(4) + with m.Case(LdstLen.is8B): + comb += self.e.data_len.eq(8) + + #comb += self.e.nia.eq(self.dec.nia) # XXX TODO + itype = Mux(self.dec.op.function_unit == Function.NONE, + InternalOp.OP_ILLEGAL, + self.dec.op.internal_op) + comb += self.e.insn_type.eq(itype) + + # registers a, b, c and out + comb += self.e.read_reg1.eq(dec_a.reg_out) + comb += self.e.read_reg2.eq(dec_b.reg_out) + comb += self.e.read_reg3.eq(dec_c.reg_out) + comb += self.e.write_reg.eq(dec_o.reg_out) + comb += self.e.imm_data.eq(dec_b.imm_out) + + # rc and oe out + comb += self.e.rc.eq(dec_rc.rc_out) + comb += self.e.oe.eq(dec_oe.oe_out) + + # SPRs out + comb += self.e.read_spr1.eq(dec_a.spr_out) + comb += self.e.read_spr2.eq(dec_b.spr_out) + comb += self.e.write_spr.eq(dec_o.spr_out) + + # decoded/selected instruction flags + comb += self.e.invert_a.eq(self.dec.op.inv_a) + comb += self.e.invert_out.eq(self.dec.op.inv_out) + comb += self.e.input_carry.eq(self.dec.op.cry_in) + comb += self.e.output_carry.eq(self.dec.op.cry_out) + comb += self.e.is_32bit.eq(self.dec.op.is_32b) + comb += self.e.is_signed.eq(self.dec.op.sgn) + with m.If(self.dec.op.lk): + comb += self.e.lk.eq(self.dec.LK[0:-1]) # XXX TODO: accessor + + comb += self.e.byte_reverse.eq(self.dec.op.br) + 
comb += self.e.sign_extend.eq(self.dec.op.sgn_ext) + comb += self.e.update.eq(self.dec.op.upd) + + comb += self.e.input_cr.eq(self.dec.op.cr_in) + comb += self.e.output_cr.eq(self.dec.op.cr_out) + + return m + + +if __name__ == '__main__': + pdecode = create_pdecode() + dec2 = PowerDecode2(pdecode) + vl = rtlil.convert(dec2, ports=dec2.ports() + pdecode.ports()) + with open("dec2.il", "w") as f: + f.write(vl) + diff --git a/src/soc/decoder/power_enums.py b/src/soc/decoder/power_enums.py new file mode 100644 index 00000000..dcf5cad2 --- /dev/null +++ b/src/soc/decoder/power_enums.py @@ -0,0 +1,229 @@ +from enum import Enum, unique +import csv +import os +import requests + + +def get_csv(name): + file_dir = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(file_dir, name) + if not os.path.isfile(file_path): + url = 'https://libre-riscv.org/openpower/isatables/' + name + r = requests.get(url, allow_redirects=True) + with open(file_path, 'w') as outfile: + outfile.write(r.content.decode("utf-8")) + with open(file_path, 'r') as csvfile: + reader = csv.DictReader(csvfile) + return list(reader) + + +# names of the fields in the tables that don't correspond to an enum +single_bit_flags = ['CR in', 'CR out', 'inv A', 'inv out', + 'cry out', 'BR', 'sgn ext', 'upd', 'rsrv', '32b', + 'sgn', 'lk', 'sgl pipe'] + +# default values for fields in the table +default_values = {'unit': "NONE", 'internal op': "OP_ILLEGAL", + 'in1': "RA", 'in2': 'NONE', 'in3': 'NONE', 'out': 'NONE', + 'ldst len': 'NONE', + 'rc' : 'NONE', 'cry in' : 'ZERO', 'form': 'NONE'} + +def get_signal_name(name): + if name[0].isdigit(): + name = "is_" + name + return name.lower().replace(' ', '_') + + +@unique +class Function(Enum): + NONE = 0 + ALU = 1 + LDST = 2 + + +@unique +class Form(Enum): + NONE = 0 + I = 1 + B = 2 + SC = 3 + D = 4 + DS = 5 + DQ = 6 + DX = 7 + X = 8 + XL = 9 + XFX = 10 + XFL = 11 + XX1 = 12 + XX2 = 13 + XX3 = 14 + XX4 = 15 + XS = 16 + XO = 17 + A = 18 + M = 19 + MD = 20 
+ MDS = 21 + VA = 22 + VC = 23 + VX = 24 + EVX = 25 + EVS = 26 + Z22 = 27 + Z23 = 28 + + + +@unique +class InternalOp(Enum): + OP_ILLEGAL = 0 + OP_NOP = 1 + OP_ADD = 2 + OP_ADDPCIS = 3 + OP_AND = 4 + OP_ATTN = 5 + OP_B = 6 + OP_BC = 7 + OP_BCREG = 8 + OP_BPERM = 9 + OP_CMP = 10 + OP_CMPB = 11 + OP_CMPEQB = 12 + OP_CMPRB = 13 + OP_CNTZ = 14 + OP_CRAND = 15 + OP_CRANDC = 16 + OP_CREQV = 17 + OP_CRNAND = 18 + OP_CRNOR = 19 + OP_CROR = 20 + OP_CRORC = 21 + OP_CRXOR = 22 + OP_DARN = 23 + OP_DCBF = 24 + OP_DCBST = 25 + OP_DCBT = 26 + OP_DCBTST = 27 + OP_DCBZ = 28 + OP_DIV = 29 + OP_DIVE = 30 + OP_EXTS = 31 + OP_EXTSWSLI = 32 + OP_ICBI = 33 + OP_ICBT = 34 + OP_ISEL = 35 + OP_ISYNC = 36 + OP_LOAD = 37 + OP_STORE = 38 + OP_MADDHD = 39 + OP_MADDHDU = 40 + OP_MADDLD = 41 + OP_MCRF = 42 + OP_MCRXR = 43 + OP_MCRXRX = 44 + OP_MFCR = 45 + OP_MFSPR = 46 + OP_MOD = 47 + OP_MTCRF = 48 + OP_MTSPR = 49 + OP_MUL_L64 = 50 + OP_MUL_H64 = 51 + OP_MUL_H32 = 52 + OP_OR = 53 + OP_POPCNT = 54 + OP_PRTY = 55 + OP_RLC = 56 + OP_RLCL = 57 + OP_RLCR = 58 + OP_SETB = 59 + OP_SHL = 60 + OP_SHR = 61 + OP_SYNC = 62 + OP_TD = 63 + OP_TDI = 64 + OP_TW = 65 + OP_TWI = 66 + OP_XOR = 67 + OP_SIM_CONFIG = 68 + + +@unique +class In1Sel(Enum): + RA = 0 + RA_OR_ZERO = 1 + NONE = 2 + SPR = 3 + + +@unique +class In2Sel(Enum): + NONE = 0 + RB = 1 + CONST_UI = 2 + CONST_SI = 3 + CONST_UI_HI = 4 + CONST_SI_HI = 5 + CONST_LI = 6 + CONST_BD = 7 + CONST_DS = 8 + CONST_M1 = 9 + CONST_SH = 10 + CONST_SH32 = 11 + SPR = 12 + + +@unique +class In3Sel(Enum): + NONE = 0 + RS = 1 + + +@unique +class OutSel(Enum): + NONE = 0 + RT = 1 + RA = 2 + SPR = 3 + + +@unique +class LdstLen(Enum): + NONE = 0 + is1B = 1 + is2B = 2 + is4B = 3 + is8B = 4 + + +@unique +class RC(Enum): + NONE = 0 + ONE = 1 + RC = 2 + + +@unique +class CryIn(Enum): + ZERO = 0 + ONE = 1 + CA = 2 + +@unique +class SPR(Enum): + XER = 1 + LR = 8 + CTR = 9 + TB = 268 + SRR0 = 26 + SRR1 = 27 + HSRR0 = 314 + HSRR1 = 315 + SPRG0 = 272 + SPRG1 = 273 + SPRG2 = 274 + 
SPRG3 = 275 + SPRG3U = 259 + HSPRG0 = 304 + HSPRG1 = 305 + diff --git a/src/soc/decoder/power_fields.py b/src/soc/decoder/power_fields.py new file mode 100644 index 00000000..3457331e --- /dev/null +++ b/src/soc/decoder/power_fields.py @@ -0,0 +1,242 @@ +from collections import OrderedDict, namedtuple + + +class BitRange(OrderedDict): + """BitRange: remaps from straight indices (0,1,2..) to bit numbers + """ + def __getitem__(self, subscript): + if isinstance(subscript, slice): + return list(self)[subscript] + else: + return self[subscript] + +def decode_instructions(form): + res = {} + accum = [] + for l in form: + if l.strip().startswith("Formats"): + l = l.strip().split(":")[-1] + l = l.replace(" ", "") + l = l.split(",") + for fmt in l: + if fmt not in res: + res[fmt] = [accum[0]] + else: + res[fmt].append(accum[0]) + accum = [] + else: + accum.append(l.strip()) + return res + +def decode_form_header(hdr): + res = {} + count = 0 + hdr = hdr.strip() + print (hdr.split('|')) + for f in hdr.split("|"): + if not f: + continue + if f[0].isdigit(): + idx = int(f.strip().split(' ')[0]) + res[count] = idx + count += len(f) + 1 + return res + +def find_unique(d, key): + if key not in d: + return key + idx = 1 + while "%s_%d" % (key, idx) in d: + idx += 1 + return "%s_%d" % (key, idx) + + +def decode_line(header, line): + line = line.strip() + res = {} + count = 0 + print ("line", line) + prev_fieldname = None + for f in line.split("|"): + if not f: + continue + end = count + len(f) + 1 + fieldname = f.strip() + if not fieldname or fieldname.startswith('/'): + if prev_fieldname is not None: + res[prev_fieldname] = (res[prev_fieldname], header[count]) + prev_fieldname = None + count = end + continue + bitstart = header[count] + if prev_fieldname is not None: + res[prev_fieldname] = (res[prev_fieldname], bitstart) + res[fieldname] = bitstart + count = end + prev_fieldname = fieldname + res[prev_fieldname] = (bitstart, 32) + return res + + +def decode_form(form): + header = 
decode_form_header(form[0]) + res = [] + print ("header", header) + for line in form[1:]: + dec = decode_line(header, line) + if dec: + res.append(dec) + fields = {} + falternate = {} + for l in res: + for k, (start,end) in l.items(): + if k in fields: + if (start, end) == fields[k]: + continue # already in and matching for this Form + if k in falternate: + alternate = "%s_%d" % (k, falternate[k]) + if (start, end) == fields[alternate]: + continue + falternate[k] = fidx = falternate.get(k, 0) + 1 + fields["%s_%d" % (k, fidx)] = (start, end) + else: + fields[k] = (start, end) + return fields + + +class DecodeFields: + + def __init__(self, bitkls=BitRange, bitargs=(), fname="fields.txt"): + self.bitkls = bitkls + self.bitargs = bitargs + self.fname = fname + + def create_specs(self): + self.forms, self.instrs = self.decode_fields() + self.form_names = forms = self.instrs.keys() + for form in forms: + fields = self.instrs[form] + fk = fields.keys() + Fields = namedtuple("Fields", fk) + instr = Fields(**fields) + setattr(self, "Form%s" % form, instr) + # now add in some commonly-used fields (should be done automatically) + # note that these should only be ones which are the same on all Forms + # note: these are from microwatt insn_helpers.vhdl + self.RS = self.FormX.RS + self.RT = self.FormX.RT + self.RA = self.FormX.RA + self.RB = self.FormX.RB + self.SI = self.FormD.SI + self.UI = self.FormD.UI + self.L = self.FormD.L + self.SH32 = self.FormM.SH + self.sh = self.FormMD.sh + self.MB32 = self.FormM.MB + self.ME32 = self.FormM.ME + self.LI = self.FormI.LI + self.LK = self.FormI.LK + self.AA = self.FormB.AA + self.Rc = self.FormX.Rc + self.OE = self.FormXO.Rc + self.BD = self.FormB.BD + self.BF = self.FormX.BF + self.CR = self.FormXL.XO # used by further mcrf decoding + self.BB = self.FormXL.BB + self.BA = self.FormXL.BA + self.BT = self.FormXL.BT + self.FXM = self.FormXFX.FXM + self.BO = self.FormXL.BO + self.BI = self.FormXL.BI + self.BH = self.FormXL.BH + self.D = 
self.FormD.D + self.DS = self.FormDS.DS + self.TO = self.FormX.TO + self.BC = self.FormA.BC + self.SH = self.FormX.SH + self.ME = self.FormM.ME + self.MB = self.FormM.MB + self.SPR = self.FormXFX.SPR + + def decode_fields(self): + with open(self.fname) as f: + txt = f.readlines() + forms = {} + reading_data = False + for l in txt: + print ("line", l) + l = l.strip() + if len(l) == 0: + continue + if reading_data: + if l[0] == '#': + reading_data = False + else: + forms[heading].append(l) + if not reading_data: + assert l[0] == '#' + heading = l[1:].strip() + #if heading.startswith('1.6.28'): # skip instr fields for now + #break + heading = heading.split(' ')[-1] + print ("heading", heading) + reading_data = True + forms[heading] = [] + + res = {} + inst = {} + + for hdr, form in forms.items(): + print ("heading", hdr) + if heading == 'Fields': + i = decode_instructions(form) + for form, field in i.items(): + inst[form] = self.decode_instruction_fields(field) + #else: + # res[hdr] = decode_form(form) + return res, inst + + def decode_instruction_fields(self, fields): + res = {} + for field in fields: + f, spec = field.strip().split(" ") + d = self.bitkls(*self.bitargs) + idx = 0 + for s in spec[1:-1].split(","): + s = s.split(':') + if len(s) == 1: + d[idx] = int(s[0]) + idx += 1 + else: + start = int(s[0]) + end = int(s[1]) + while start <= end: + d[idx] = start + idx += 1 + start += 1 + f = f.replace(",", "_") + unique = find_unique(res, f) + res[unique] = d + + return res + +if __name__ == '__main__': + dec = DecodeFields() + dec.create_specs() + forms, instrs = dec.forms, dec.instrs + for hdr, form in forms.items(): + print () + print (hdr) + for k, v in form.items(): + #print ("line", l) + #for k, v in l.items(): + print ("%s: %d-%d" % (k, v[0], v[1])) + for form, field in instrs.items(): + print () + print (form) + for f, vals in field.items(): + print (" ", f, vals) + print (dec.FormX) + print (dec.FormX.A) + print (dir(dec.FormX)) + print (dec.FormX._fields) 
diff --git a/src/soc/decoder/power_fieldsn.py b/src/soc/decoder/power_fieldsn.py new file mode 100644 index 00000000..e603bbd3 --- /dev/null +++ b/src/soc/decoder/power_fieldsn.py @@ -0,0 +1,74 @@ +from collections import OrderedDict +from power_fields import DecodeFields, BitRange +from nmigen import Module, Elaboratable, Signal, Cat +from nmigen.cli import rtlil + + +class SignalBitRange(BitRange): + def __init__(self, signal): + BitRange.__init__(self) + self.signal = signal + + def __getitem__(self, subs): + # *sigh* field numberings are bit-inverted. PowerISA 3.0B section 1.3.2 + width = self.signal.shape()[0] + print (dir(self)) + print (self.items()) + if isinstance(subs, slice): + res = [] + print (subs) + start, stop, step = subs.start, subs.stop, subs.step + if step is None: + step = 1 + if start is None: + start = 0 + if stop is None: + stop = -1 + if start < 0: + start = len(self) - start - 1 + if stop < 0: + stop = len(self) - stop - 1 + print ("range", start, stop, step) + for t in range(start, stop, step): + k = OrderedDict.__getitem__(self, t) + print ("t", t, k) + res.append(self.signal[width-k-1]) + return Cat(*res) + else: + k = OrderedDict.__getitem__(self, subs) + return self.signal[width-k-1] + + print ("translated", subs, translated) + + +class SigDecode(Elaboratable): + + def __init__(self, width): + self.opcode_in = Signal(width, reset_less=False) + self.df = DecodeFields(SignalBitRange, [self.opcode_in]) + self.df.create_specs() + self.x_s = Signal(len(self.df.FormX.S), reset_less=True) + self.x_sh = Signal(len(self.df.FormX.SH), reset_less=True) + self.dq_xs_s = Signal(len(self.df.FormDQ.SX_S), reset_less=True) + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + comb += self.x_s.eq(self.df.FormX.S[0]) + comb += self.x_sh.eq(self.df.FormX.SH[0:-1]) + comb += self.dq_xs_s.eq(self.df.FormDQ.SX_S[0:-1]) + return m + + def ports(self): + return [self.opcode_in, self.x_s, self.x_sh] + +def create_sigdecode(): + s = 
SigDecode(32) + return s + +if __name__ == '__main__': + sigdecode = create_sigdecode() + vl = rtlil.convert(sigdecode, ports=sigdecode.ports()) + with open("decoder.il", "w") as f: + f.write(vl) + diff --git a/src/soc/decoder/test/test_power_decoder.py b/src/soc/decoder/test/test_power_decoder.py new file mode 100644 index 00000000..f64f4b96 --- /dev/null +++ b/src/soc/decoder/test/test_power_decoder.py @@ -0,0 +1,130 @@ +from nmigen import Module, Signal +from nmigen.back.pysim import Simulator, Delay +from nmigen.test.utils import FHDLTestCase +from nmigen.cli import rtlil +import sys +import os +import unittest +sys.path.append("../") +from power_decoder import (PowerDecoder, pdecode) +from power_enums import (Function, InternalOp, In1Sel, In2Sel, In3Sel, + OutSel, RC, LdstLen, CryIn, single_bit_flags, + get_signal_name, get_csv) + + +class DecoderTestCase(FHDLTestCase): + + def run_tst(self, bitsel, csvname, minor=None, suffix=None, opint=True): + m = Module() + comb = m.d.comb + opcode = Signal(32) + function_unit = Signal(Function) + internal_op = Signal(InternalOp) + in1_sel = Signal(In1Sel) + in2_sel = Signal(In2Sel) + in3_sel = Signal(In3Sel) + out_sel = Signal(OutSel) + rc_sel = Signal(RC) + ldst_len = Signal(LdstLen) + cry_in = Signal(CryIn) + + # opcodes = get_csv(csvname) + # m.submodules.dut = dut = PowerDecoder(32, opcodes, bitsel=bitsel, + # opint=opint, suffix=suffix) + m.submodules.dut = dut = pdecode + comb += [dut.opcode_in.eq(opcode), + function_unit.eq(dut.op.function_unit), + in1_sel.eq(dut.op.in1_sel), + in2_sel.eq(dut.op.in2_sel), + in3_sel.eq(dut.op.in3_sel), + out_sel.eq(dut.op.out_sel), + rc_sel.eq(dut.op.rc_sel), + ldst_len.eq(dut.op.ldst_len), + cry_in.eq(dut.op.cry_in), + internal_op.eq(dut.op.internal_op)] + + sim = Simulator(m) + opcodes = get_csv(csvname) + + def process(): + for row in opcodes: + if not row['unit']: + continue + op = row['opcode'] + if not opint: # HACK: convert 001---10 to 0b00100010 + op = "0b" + 
op.replace('-', '0') + print ("opint", opint, row['opcode'], op) + print(row) + yield opcode.eq(0) + yield opcode[bitsel[0]:bitsel[1]].eq(int(op, 0)) + if minor: + print(minor) + minorbits = minor[1] + yield opcode[minorbits[0]:minorbits[1]].eq(minor[0]) + yield Delay(1e-6) + signals = [(function_unit, Function, 'unit'), + (internal_op, InternalOp, 'internal op'), + (in1_sel, In1Sel, 'in1'), + (in2_sel, In2Sel, 'in2'), + (in3_sel, In3Sel, 'in3'), + (out_sel, OutSel, 'out'), + (rc_sel, RC, 'rc'), + (cry_in, CryIn, 'cry in'), + (ldst_len, LdstLen, 'ldst len')] + for sig, enm, name in signals: + result = yield sig + expected = enm[row[name]] + msg = f"{sig.name} == {enm(result)}, expected: {expected}" + self.assertEqual(enm(result), expected, msg) + for bit in single_bit_flags: + sig = getattr(dut.op, get_signal_name(bit)) + result = yield sig + expected = int(row[bit]) + msg = f"{sig.name} == {result}, expected: {expected}" + self.assertEqual(expected, result, msg) + sim.add_process(process) + prefix = os.path.splitext(csvname)[0] + with sim.write_vcd("%s.vcd" % prefix, "%s.gtkw" % prefix, traces=[ + opcode, function_unit, internal_op, + in1_sel, in2_sel]): + sim.run() + + def generate_ilang(self): + vl = rtlil.convert(pdecode, ports=pdecode.ports()) + with open("decoder.il", "w") as f: + f.write(vl) + + def test_major(self): + self.run_tst((26, 32), "major.csv") + self.generate_ilang() + + def test_minor_19(self): + self.run_tst((1, 11), "minor_19.csv", minor=(19, (26, 32)), + suffix=(0, 5)) + + # def test_minor_19_00000(self): + # self.run_tst((1, 11), "minor_19_00000.csv") + + def test_minor_30(self): + self.run_tst((1, 5), "minor_30.csv", minor=(30, (26, 32))) + + def test_minor_31(self): + self.run_tst((1, 11), "minor_31.csv", minor=(31, (26, 32))) + + def test_minor_58(self): + self.run_tst((0, 2), "minor_58.csv", minor=(58, (26, 32))) + + def test_minor_62(self): + self.run_tst((0, 2), "minor_62.csv", minor=(62, (26, 32))) + + + # #def 
test_minor_31_prefix(self): + # # self.run_tst(10, "minor_31.csv", suffix=(5, 10)) + + # def test_extra(self): + # self.run_tst(32, "extra.csv", opint=False) + # self.generate_ilang(32, "extra.csv", opint=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py new file mode 100644 index 00000000..9659059c --- /dev/null +++ b/src/soc/experiment/alu_hier.py @@ -0,0 +1,239 @@ +from nmigen import Elaboratable, Signal, Module, Const, Mux +from nmigen.cli import main +from nmigen.cli import verilog, rtlil + +import operator + + +class Adder(Elaboratable): + def __init__(self, width): + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.eq(self.a + self.b) + return m + + +class Subtractor(Elaboratable): + def __init__(self, width): + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.eq(self.a - self.b) + return m + + +class Multiplier(Elaboratable): + def __init__(self, width): + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.eq(self.a * self.b) + return m + + +class Shifter(Elaboratable): + def __init__(self, width): + self.width = width + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + + def elaborate(self, platform): + m = Module() + btrunc = Signal(self.width) + m.d.comb += btrunc.eq(self.b & Const((1<<self.width)-1)) + m.d.comb += self.o.eq(self.a >> btrunc) + return m + + +class ALU(Elaboratable): + def __init__(self, width): + self.p_valid_i = Signal() + self.p_ready_o = Signal() + self.n_ready_i = Signal() + self.n_valid_o = Signal() + self.counter = Signal(4) + self.op = Signal(2) + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + self.width = width + + def elaborate(self, platform): + m = Module() + add 
= Adder(self.width) + sub = Subtractor(self.width) + mul = Multiplier(self.width) + shf = Shifter(self.width) + + m.submodules.add = add + m.submodules.sub = sub + m.submodules.mul = mul + m.submodules.shf = shf + for mod in [add, sub, mul, shf]: + m.d.comb += [ + mod.a.eq(self.a), + mod.b.eq(self.b), + ] + go_now = Signal(reset_less=True) # testing no-delay ALU + + with m.If(self.p_valid_i): + # input is valid. next check, if we already said "ready" or not + with m.If(~self.p_ready_o): + # we didn't say "ready" yet, so say so and initialise + m.d.sync += self.p_ready_o.eq(1) + + # as this is a "fake" pipeline, just grab the output right now + with m.Switch(self.op): + for i, mod in enumerate([add, sub, mul, shf]): + with m.Case(i): + m.d.sync += self.o.eq(mod.o) + with m.If(self.op == 2): # MUL, to take 5 instructions + m.d.sync += self.counter.eq(5) + with m.Elif(self.op == 3): # SHIFT to take 7 + m.d.sync += self.counter.eq(7) + with m.Elif(self.op == 1): # SUB to take 1, straight away + m.d.sync += self.counter.eq(1) + m.d.comb += go_now.eq(1) + with m.Else(): # ADD to take 2 + m.d.sync += self.counter.eq(2) + with m.Else(): + # input says no longer valid, so drop ready as well. + # a "proper" ALU would have had to sync in the opcode and a/b ops + m.d.sync += self.p_ready_o.eq(0) + + # ok so the counter's running: when it gets to 1, fire the output + with m.If((self.counter == 1) | go_now): + # set the output as valid if the recipient is ready for it + m.d.sync += self.n_valid_o.eq(1) + with m.If(self.n_ready_i & self.n_valid_o): + m.d.sync += self.n_valid_o.eq(0) + # recipient said it was ready: reset back to known-good. 
+ m.d.sync += self.counter.eq(0) # reset the counter + m.d.sync += self.o.eq(0) # clear the output for tidiness sake + + # countdown to 1 (transition from 1 to 0 only on acknowledgement) + with m.If(self.counter > 1): + m.d.sync += self.counter.eq(self.counter - 1) + + return m + + def __iter__(self): + yield self.op + yield self.a + yield self.b + yield self.o + + def ports(self): + return list(self) + + +class BranchOp(Elaboratable): + def __init__(self, width, op): + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + self.op = op + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0)) + return m + + +class BranchALU(Elaboratable): + def __init__(self, width): + self.p_valid_i = Signal() + self.p_ready_o = Signal() + self.n_ready_i = Signal() + self.n_valid_o = Signal() + self.counter = Signal(4) + self.op = Signal(2) + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + self.width = width + + def elaborate(self, platform): + m = Module() + bgt = BranchOp(self.width, operator.gt) + blt = BranchOp(self.width, operator.lt) + beq = BranchOp(self.width, operator.eq) + bne = BranchOp(self.width, operator.ne) + + m.submodules.bgt = bgt + m.submodules.blt = blt + m.submodules.beq = beq + m.submodules.bne = bne + for mod in [bgt, blt, beq, bne]: + m.d.comb += [ + mod.a.eq(self.a), + mod.b.eq(self.b), + ] + + go_now = Signal(reset_less=True) # testing no-delay ALU + with m.If(self.p_valid_i): + # input is valid. 
next check, if we already said "ready" or not + with m.If(~self.p_ready_o): + # we didn't say "ready" yet, so say so and initialise + m.d.sync += self.p_ready_o.eq(1) + + # as this is a "fake" pipeline, just grab the output right now + with m.Switch(self.op): + for i, mod in enumerate([bgt, blt, beq, bne]): + with m.Case(i): + m.d.sync += self.o.eq(mod.o) + m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake) + #m.d.comb += go_now.eq(1) + with m.Else(): + # input says no longer valid, so drop ready as well. + # a "proper" ALU would have had to sync in the opcode and a/b ops + m.d.sync += self.p_ready_o.eq(0) + + # ok so the counter's running: when it gets to 1, fire the output + with m.If((self.counter == 1) | go_now): + # set the output as valid if the recipient is ready for it + m.d.sync += self.n_valid_o.eq(1) + with m.If(self.n_ready_i & self.n_valid_o): + m.d.sync += self.n_valid_o.eq(0) + # recipient said it was ready: reset back to known-good. + m.d.sync += self.counter.eq(0) # reset the counter + m.d.sync += self.o.eq(0) # clear the output for tidiness sake + + # countdown to 1 (transition from 1 to 0 only on acknowledgement) + with m.If(self.counter > 1): + m.d.sync += self.counter.eq(self.counter - 1) + + return m + + def __iter__(self): + yield self.op + yield self.a + yield self.b + yield self.o + + def ports(self): + return list(self) + + +if __name__ == "__main__": + alu = ALU(width=16) + vl = rtlil.convert(alu, ports=alu.ports()) + with open("test_alu.il", "w") as f: + f.write(vl) + + alu = BranchALU(width=16) + vl = rtlil.convert(alu, ports=alu.ports()) + with open("test_branch_alu.il", "w") as f: + f.write(vl) + diff --git a/src/soc/experiment/compalu.py b/src/soc/experiment/compalu.py new file mode 100644 index 00000000..7da6b5cf --- /dev/null +++ b/src/soc/experiment/compalu.py @@ -0,0 +1,207 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Mux, Elaboratable + 
+from nmutil.latch import SRLatch, latchregister + +""" Computation Unit (aka "ALU Manager"). + + This module runs a "revolving door" set of three latches, based on + * Issue + * Go_Read + * Go_Write + where one of them cannot be set on any given cycle. + (Note however that opc_l has been inverted (and qn used), due to SRLatch + default reset state being "0" rather than "1") + + * When issue is first raised, a busy signal is sent out. + The src1 and src2 registers and the operand can be latched in + at this point + + * Read request is set, which is acknowledged through the Scoreboard + to the priority picker, which generates (one and only one) Go_Read + at a time. One of those will (eventually) be this Computation Unit. + + * Once Go_Read is set, the src1/src2/operand latch door shuts (locking + src1/src2/operand in place), and the ALU is told to proceed. + + * As this is currently a "demo" unit, a countdown timer is activated + to simulate an ALU "pipeline", which activates "write request release", + and the ALU's output is captured into a temporary register. + + * Write request release will go through a similar process as Read request, + resulting (eventually) in Go_Write being asserted. + + * When Go_Write is asserted, two things happen: (1) the data in the temp + register is placed combinatorially onto the output, and (2) the + req_l latch is cleared, busy is dropped, and the Comp Unit is back + through its revolving door to do another task. 
+ + Notes on oper_i: + + * bits[0:2] are for the ALU, add=0, sub=1, mul=2, shift=3 + * bit[2] is the immediate flag (bit[2]=1 == immediate mode) +""" + +class ComputationUnitNoDelay(Elaboratable): + def __init__(self, rwid, opwid, alu): + self.opwid = opwid + self.rwid = rwid + self.alu = alu + + self.counter = Signal(4) + self.go_rd_i = Signal(reset_less=True) # go read in + self.go_wr_i = Signal(reset_less=True) # go write in + self.issue_i = Signal(reset_less=True) # fn issue in + self.shadown_i = Signal(reset=1) # shadow function, defaults to ON + self.go_die_i = Signal() # go die (reset) + + self.oper_i = Signal(opwid, reset_less=True) # opcode in + self.imm_i = Signal(rwid, reset_less=True) # immediate in + self.src1_i = Signal(rwid, reset_less=True) # oper1 in + self.src2_i = Signal(rwid, reset_less=True) # oper2 in + + self.busy_o = Signal(reset_less=True) # fn busy out + self.data_o = Signal(rwid, reset_less=True) # Dest out + self.rd_rel_o = Signal(reset_less=True) # release src1/src2 request + self.req_rel_o = Signal(reset_less=True) # release request out (valid_o) + + def elaborate(self, platform): + m = Module() + m.submodules.alu = self.alu + m.submodules.src_l = src_l = SRLatch(sync=False) + m.submodules.opc_l = opc_l = SRLatch(sync=False) + m.submodules.req_l = req_l = SRLatch(sync=False) + + # shadow/go_die + reset_w = Signal(reset_less=True) + reset_r = Signal(reset_less=True) + m.d.comb += reset_w.eq(self.go_wr_i | self.go_die_i) + m.d.comb += reset_r.eq(self.go_rd_i | self.go_die_i) + + # This is fascinating and very important to observe that this + # is in effect a "3-way revolving door". At no time may all 3 + # latches be set at the same time. + + # opcode latch (not using go_rd_i) - inverted so that busy resets to 0 + m.d.sync += opc_l.s.eq(self.issue_i) # XXX NOTE: INVERTED FROM book! + m.d.sync += opc_l.r.eq(reset_w) # XXX NOTE: INVERTED FROM book! 
+ + # src operand latch (not using go_wr_i) + m.d.sync += src_l.s.eq(self.issue_i) + m.d.sync += src_l.r.eq(reset_r) + + # dest operand latch (not using issue_i) + m.d.sync += req_l.s.eq(self.go_rd_i) + m.d.sync += req_l.r.eq(reset_w) + + + # create a latch/register for the opcode + oper_r = Signal(self.opwid+1, reset_less=True) # opcode reg + latchregister(m, self.oper_i, oper_r, self.issue_i) + + # and one for the output from the ALU + data_r = Signal(self.rwid, reset_less=True) # Dest register + latchregister(m, self.alu.o, data_r, req_l.q) + + # get the low 2 bits (bits[0:2]) for the ALU + m.d.comb += self.alu.op.eq(oper_r[0:2]) + + # 3rd bit is whether this is an immediate or not + op_is_imm = Signal(reset_less=True) + m.d.comb += op_is_imm.eq(oper_r[2]) + + # select immediate if opcode says so. however also change the latch + # to trigger *from* the opcode latch instead. + src2_or_imm = Signal(self.rwid, reset_less=True) + src_sel = Signal(reset_less=True) + m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q)) + m.d.comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i)) + + # create a latch/register for src1/src2 + latchregister(m, self.src1_i, self.alu.a, src_l.q) + latchregister(m, src2_or_imm, self.alu.b, src_sel) + + # ----- + # outputs + # ----- + + # all request signals gated by busy_o. prevents picker problems + busy_o = self.busy_o + m.d.comb += busy_o.eq(opc_l.q) # busy out + m.d.comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel + + # on a go_read, tell the ALU we're accepting data. + # NOTE: this spells TROUBLE if the ALU isn't ready! + # go_read is only valid for one clock! + with m.If(self.go_rd_i): # src operands ready, GO! + with m.If(~self.alu.p_ready_o): # no ACK yet + m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid + + # only proceed if ALU says its output is valid + with m.If(self.alu.n_valid_o): + # when ALU ready, write req release out. 
waits for shadow + m.d.comb += self.req_rel_o.eq(req_l.q & busy_o & self.shadown_i) + # when output latch is ready, and ALU says ready, accept ALU output + with m.If(self.req_rel_o): + m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it" + + # output the data from the latch on go_write + with m.If(self.go_wr_i): + m.d.comb += self.data_o.eq(data_r) + + return m + + def __iter__(self): + yield self.go_rd_i + yield self.go_wr_i + yield self.issue_i + yield self.shadown_i + yield self.go_die_i + yield self.oper_i + yield self.imm_i + yield self.src1_i + yield self.src2_i + yield self.busy_o + yield self.rd_rel_o + yield self.req_rel_o + yield self.data_o + + def ports(self): + return list(self) + + +def scoreboard_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_read_i.eq(1) + yield + yield dut.go_read_i.eq(0) + yield + yield dut.go_write_i.eq(1) + yield + yield dut.go_write_i.eq(0) + yield + +def test_scoreboard(): + from alu_hier import ALU + alu = ALU(16) + dut = ComputationUnitNoDelay(16, 8, alu) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_compalu.il", "w") as f: + f.write(vl) + + run_simulation(dut, scoreboard_sim(dut), vcd_name='test_compalu.vcd') + +if __name__ == '__main__': + test_scoreboard() diff --git a/src/soc/experiment/compldst.py b/src/soc/experiment/compldst.py new file mode 100644 index 00000000..77ad39dd --- /dev/null +++ b/src/soc/experiment/compldst.py @@ -0,0 +1,288 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Mux, Cat, Elaboratable + +from nmutil.latch import SRLatch, latchregister + +""" LOAD / STORE Computation Unit. 
Also capable of doing ADD and ADD immediate + + This module runs a "revolving door" set of four latches, based on + * Issue + * Go_Read + * Go_Addr + * Go_Write *OR* Go_Store + + (Note that opc_l has been inverted (and qn used), due to SRLatch + default reset state being "0" rather than "1") +""" + +# internal opcodes. hypothetically this could do more combinations. +# meanings: +# * bit 0: 0 = ADD , 1 = SUB +# * bit 1: 0 = src1, 1 = IMM +# * bit 2: 1 = LD +# * bit 3: 1 = ST +LDST_OP_ADDI = 0b0000 # plain ADD (src1 + src2) +LDST_OP_SUBI = 0b0001 # plain SUB (src1 - src2) +LDST_OP_ADD = 0b0010 # immed ADD (imm + src1) +LDST_OP_SUB = 0b0011 # immed SUB (imm - src1) +LDST_OP_ST = 0b0110 # immed ADD plus LD op. ADD result is address +LDST_OP_LD = 0b1010 # immed ADD plus ST op. ADD result is address + + +class LDSTCompUnit(Elaboratable): + """ LOAD / STORE / ADD / SUB Computation Unit + + Inputs + ------ + + * :rwid: register width + * :alu: an ALU module + * :mem: a Memory Module (read-write capable) + + Control Signals (In) + -------------------- + + * :issue_i: LD/ST is being "issued". 
+ * :isalu_i: ADD/SUB is being "issued" (aka issue_alu_i) + * :shadown_i: Inverted-shadow is being held (stops STORE *and* WRITE) + * :go_rd_i: read is being actioned (latches in src regs) + * :go_ad_i: address is being actioned (triggers actual mem LD) + * :go_st_i: store is being actioned (triggers actual mem STORE) + * :go_die_i: resets the unit back to "wait for issue" + """ + def __init__(self, rwid, opwid, alu, mem): + self.opwid = opwid + self.rwid = rwid + self.alu = alu + self.mem = mem + + self.counter = Signal(4) + self.go_rd_i = Signal(reset_less=True) # go read in + self.go_ad_i = Signal(reset_less=True) # go address in + self.go_wr_i = Signal(reset_less=True) # go write in + self.go_st_i = Signal(reset_less=True) # go store in + self.issue_i = Signal(reset_less=True) # fn issue in + self.isalu_i = Signal(reset_less=True) # fn issue as ALU in + self.shadown_i = Signal(reset=1) # shadow function, defaults to ON + self.go_die_i = Signal() # go die (reset) + + self.oper_i = Signal(opwid, reset_less=True) # opcode in + self.imm_i = Signal(rwid, reset_less=True) # immediate in + self.src1_i = Signal(rwid, reset_less=True) # oper1 in + self.src2_i = Signal(rwid, reset_less=True) # oper2 in + + self.busy_o = Signal(reset_less=True) # fn busy out + self.rd_rel_o = Signal(reset_less=True) # request src1/src2 + self.adr_rel_o = Signal(reset_less=True) # request address (from mem) + self.sto_rel_o = Signal(reset_less=True) # request store (to mem) + self.req_rel_o = Signal(reset_less=True) # request write (result) + self.data_o = Signal(rwid, reset_less=True) # Dest out (LD or ALU) + self.addr_o = Signal(rwid, reset_less=True) # Address out (LD or ST) + + # hmm... TODO... 
move these to outside of LDSTCompUnit + self.load_mem_o = Signal(reset_less=True) # activate memory LOAD + self.stwd_mem_o = Signal(reset_less=True) # activate memory STORE + self.ld_o = Signal(reset_less=True) # operation is a LD + self.st_o = Signal(reset_less=True) # operation is a ST + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + sync = m.d.sync + + m.submodules.alu = self.alu + m.submodules.src_l = src_l = SRLatch(sync=False) + m.submodules.opc_l = opc_l = SRLatch(sync=False) + m.submodules.adr_l = adr_l = SRLatch(sync=False) + m.submodules.req_l = req_l = SRLatch(sync=False) + m.submodules.sto_l = sto_l = SRLatch(sync=False) + + # shadow/go_die + reset_b = Signal(reset_less=True) + reset_w = Signal(reset_less=True) + reset_a = Signal(reset_less=True) + reset_s = Signal(reset_less=True) + reset_r = Signal(reset_less=True) + comb += reset_b.eq(self.go_st_i | self.go_wr_i | self.go_die_i) + comb += reset_w.eq(self.go_wr_i | self.go_die_i) + comb += reset_s.eq(self.go_st_i | self.go_die_i) + comb += reset_r.eq(self.go_rd_i | self.go_die_i) + # this one is slightly different, issue_alu_i selects go_wr_i) + a_sel = Mux(self.isalu_i, self.go_wr_i, self.go_ad_i) + comb += reset_a.eq(a_sel| self.go_die_i) + + # opcode decode + op_alu = Signal(reset_less=True) + op_is_ld = Signal(reset_less=True) + op_is_st = Signal(reset_less=True) + op_ldst = Signal(reset_less=True) + op_is_imm = Signal(reset_less=True) + + # select immediate or src2 reg to add + src2_or_imm = Signal(self.rwid, reset_less=True) + src_sel = Signal(reset_less=True) + + # issue can be either issue_i or issue_alu_i (isalu_i) + issue_i = Signal(reset_less=True) + comb += issue_i.eq(self.issue_i | self.isalu_i) + + # Ripple-down the latches, each one set cancels the previous. + # NOTE: use sync to stop combinatorial loops. + + # opcode latch - inverted so that busy resets to 0 + sync += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book! 
+ sync += opc_l.r.eq(reset_b) # XXX NOTE: INVERTED FROM book! + + # src operand latch + sync += src_l.s.eq(issue_i) + sync += src_l.r.eq(reset_r) + + # addr latch + sync += adr_l.s.eq(self.go_rd_i) + sync += adr_l.r.eq(reset_a) + + # dest operand latch + sync += req_l.s.eq(self.go_ad_i) + sync += req_l.r.eq(reset_w) + + # store latch + sync += sto_l.s.eq(self.go_ad_i) + sync += sto_l.r.eq(reset_s) + + # outputs: busy and release signals + busy_o = self.busy_o + comb += self.busy_o.eq(opc_l.q) # busy out + comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel + comb += self.sto_rel_o.eq(sto_l.q & busy_o & self.shadown_i & op_is_st) + + # request release enabled based on if op is a LD/ST or a plain ALU + # if op is an ADD/SUB or a LD, req_rel activates. + wr_q = Signal(reset_less=True) + comb += wr_q.eq(req_l.q & (~op_ldst | op_is_ld)) + + alulatch = Signal(reset_less=True) + comb += alulatch.eq((op_ldst & self.adr_rel_o) | \ + (~op_ldst & self.req_rel_o)) + + # only proceed if ALU says its output is valid + with m.If(self.alu.n_valid_o): + + # write req release out. waits until shadow is dropped. + comb += self.req_rel_o.eq(wr_q & busy_o & self.shadown_i) + # address release only happens on LD/ST, and is shadowed. + comb += self.adr_rel_o.eq(adr_l.q & op_ldst & busy_o & \ + self.shadown_i) + # when output latch is ready, and ALU says ready, accept ALU output + with m.If(self.req_rel_o): + m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it" + + # select immediate if opcode says so. however also change the latch + # to trigger *from* the opcode latch instead. 
+ comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q)) + comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i)) + + # create a latch/register for src1/src2 (include immediate select) + latchregister(m, self.src1_i, self.alu.a, src_l.q) + latchregister(m, src2_or_imm, self.alu.b, src_sel) + + # create a latch/register for the operand + oper_r = Signal(self.opwid, reset_less=True) # Dest register + latchregister(m, self.oper_i, oper_r, self.issue_i) + alu_op = Cat(op_alu, 0, op_is_imm) # using alu_hier, here. + comb += self.alu.op.eq(alu_op) + + # and one for the output from the ALU + data_r = Signal(self.rwid, reset_less=True) # Dest register + latchregister(m, self.alu.o, data_r, alulatch) + + # decode bits of operand (latched) + comb += op_alu.eq(oper_r[0]) + comb += op_is_imm.eq(oper_r[1]) + comb += op_is_ld.eq(oper_r[2]) + comb += op_is_st.eq(oper_r[3]) + comb += op_ldst.eq(op_is_ld | op_is_st) + comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i) + comb += self.stwd_mem_o.eq(op_is_st & self.go_st_i) + comb += self.ld_o.eq(op_is_ld) + comb += self.st_o.eq(op_is_st) + + # on a go_read, tell the ALU we're accepting data. + # NOTE: this spells TROUBLE if the ALU isn't ready! + # go_read is only valid for one clock! + with m.If(self.go_rd_i): # src operands ready, GO! 
+ with m.If(~self.alu.p_ready_o): # no ACK yet + m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid + + # put the register directly onto the output bus on a go_write + with m.If(self.go_wr_i): + comb += self.data_o.eq(data_r) + + # put the register directly onto the address bus + with m.If(self.go_ad_i): + comb += self.addr_o.eq(data_r) + + return m + + def __iter__(self): + yield self.go_rd_i + yield self.go_ad_i + yield self.go_wr_i + yield self.go_st_i + yield self.issue_i + yield self.isalu_i + yield self.shadown_i + yield self.go_die_i + yield self.oper_i + yield self.imm_i + yield self.src1_i + yield self.src2_i + yield self.busy_o + yield self.rd_rel_o + yield self.adr_rel_o + yield self.sto_rel_o + yield self.req_rel_o + yield self.data_o + yield self.load_mem_o + yield self.stwd_mem_o + + def ports(self): + return list(self) + + +def scoreboard_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_read_i.eq(1) + yield + yield dut.go_read_i.eq(0) + yield + yield dut.go_write_i.eq(1) + yield + yield dut.go_write_i.eq(0) + yield + + +def test_scoreboard(): + from alu_hier import ALU + alu = ALU(16) + mem = alu # fake + dut = LDSTCompUnit(16, 4, alu, mem) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_ldst_comp.il", "w") as f: + f.write(vl) + + run_simulation(dut, scoreboard_sim(dut), vcd_name='test_ldst_comp.vcd') + +if __name__ == '__main__': + test_scoreboard() diff --git a/src/soc/experiment/cscore.py b/src/soc/experiment/cscore.py new file mode 100644 index 00000000..18b71c80 --- /dev/null +++ b/src/soc/experiment/cscore.py @@ -0,0 +1,435 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Const, Signal, Array, Cat, Elaboratable + +from regfile.regfile import RegFileArray, treereduce +from 
scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit +from scoreboard.fu_fu_matrix import FUFUDepMatrix +from scoreboard.fu_reg_matrix import FURegDepMatrix +from scoreboard.global_pending import GlobalPending +from scoreboard.group_picker import GroupPicker +from scoreboard.issue_unit import IntFPIssueUnit, RegDecode + +from compalu import ComputationUnitNoDelay + +from alu_hier import ALU +from nmutil.latch import SRLatch + +from random import randint + + +class Scoreboard(Elaboratable): + def __init__(self, rwid, n_regs): + """ Inputs: + + * :rwid: bit width of register file(s) - both FP and INT + * :n_regs: depth of register file(s) - number of FP and INT regs + """ + self.rwid = rwid + self.n_regs = n_regs + + # Register Files + self.intregs = RegFileArray(rwid, n_regs) + self.fpregs = RegFileArray(rwid, n_regs) + + # inputs + self.int_store_i = Signal(reset_less=True) # instruction is a store + self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in + self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in + self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in + + self.issue_o = Signal(reset_less=True) # instruction was accepted + + def elaborate(self, platform): + m = Module() + + m.submodules.intregs = self.intregs + m.submodules.fpregs = self.fpregs + + # register ports + int_dest = self.intregs.write_port("dest") + int_src1 = self.intregs.read_port("src1") + int_src2 = self.intregs.read_port("src2") + + fp_dest = self.fpregs.write_port("dest") + fp_src1 = self.fpregs.read_port("src1") + fp_src2 = self.fpregs.read_port("src2") + + # Int ALUs + add = ALU(self.rwid) + sub = ALU(self.rwid) + m.submodules.comp1 = comp1 = ComputationUnitNoDelay(self.rwid, 1, add) + m.submodules.comp2 = comp2 = ComputationUnitNoDelay(self.rwid, 1, sub) + int_alus = [comp1, comp2] + + m.d.comb += comp1.oper_i.eq(Const(0)) # temporary/experiment: op=add + m.d.comb += comp2.oper_i.eq(Const(1)) # temporary/experiment: op=sub + + # 
Int FUs + if_l = [] + int_src1_pend_v = [] + int_src2_pend_v = [] + int_rd_pend_v = [] + int_wr_pend_v = [] + for i, a in enumerate(int_alus): + # set up Integer Function Unit, add to module (and python list) + fu = IntFnUnit(self.n_regs, shadow_wid=0) + setattr(m.submodules, "intfu%d" % i, fu) + if_l.append(fu) + # collate the read/write pending vectors (to go into global pending) + int_src1_pend_v.append(fu.src1_pend_o) + int_src2_pend_v.append(fu.src2_pend_o) + int_rd_pend_v.append(fu.int_rd_pend_o) + int_wr_pend_v.append(fu.int_wr_pend_o) + int_fus = Array(if_l) + + # Count of number of FUs + n_int_fus = len(if_l) + n_fp_fus = 0 # for now + + n_fus = n_int_fus + n_fp_fus # plus FP FUs + + # XXX replaced by array of FUs? *FnUnit + # # Integer FU-FU Dep Matrix + # m.submodules.intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus) + # Integer FU-Reg Dep Matrix + # intregdeps = FURegDepMatrix(self.n_regs, n_int_fus) + # m.submodules.intregdeps = intregdeps + + # Integer Priority Picker 1: Adder + Subtractor + intpick1 = GroupPicker(2) # picks between add and sub + m.submodules.intpick1 = intpick1 + + # Global Pending Vectors (INT and FP) + # NOTE: number of vectors is NOT same as number of FUs. 
+ g_int_src1_pend_v = GlobalPending(self.n_regs, int_src1_pend_v) + g_int_src2_pend_v = GlobalPending(self.n_regs, int_src2_pend_v) + g_int_rd_pend_v = GlobalPending(self.n_regs, int_rd_pend_v, True) + g_int_wr_pend_v = GlobalPending(self.n_regs, int_wr_pend_v, True) + m.submodules.g_int_src1_pend_v = g_int_src1_pend_v + m.submodules.g_int_src2_pend_v = g_int_src2_pend_v + m.submodules.g_int_rd_pend_v = g_int_rd_pend_v + m.submodules.g_int_wr_pend_v = g_int_wr_pend_v + + # INT/FP Issue Unit + regdecode = RegDecode(self.n_regs) + m.submodules.regdecode = regdecode + issueunit = IntFPIssueUnit(self.n_regs, n_int_fus, n_fp_fus) + m.submodules.issueunit = issueunit + + # FU-FU Dependency Matrices + intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus) + m.submodules.intfudeps = intfudeps + + #--------- + # ok start wiring things together... + # "now hear de word of de looord... dem bones dem bones dem dryy bones" + # https://www.youtube.com/watch?v=pYb8Wm6-QfA + #--------- + + #--------- + # Issue Unit is where it starts. 
set up some in/outs for this module + #--------- + m.d.comb += [issueunit.i.store_i.eq(self.int_store_i), + regdecode.dest_i.eq(self.int_dest_i), + regdecode.src1_i.eq(self.int_src1_i), + regdecode.src2_i.eq(self.int_src2_i), + regdecode.enable_i.eq(1), + self.issue_o.eq(issueunit.issue_o), + issueunit.i.dest_i.eq(regdecode.dest_o), + ] + self.int_insn_i = issueunit.i.insn_i # enabled by instruction decode + + # connect global rd/wr pending vectors + m.d.comb += issueunit.i.g_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o) + # TODO: issueunit.f (FP) + + # and int function issue / busy arrays, and dest/src1/src2 + fn_issue_l = [] + fn_busy_l = [] + for i, fu in enumerate(if_l): + fn_issue_l.append(fu.issue_i) + fn_busy_l.append(fu.busy_o) + m.d.sync += fu.issue_i.eq(issueunit.i.fn_issue_o[i]) + m.d.sync += fu.dest_i.eq(self.int_dest_i) + m.d.sync += fu.src1_i.eq(self.int_src1_i) + m.d.sync += fu.src2_i.eq(self.int_src2_i) + # XXX sync, so as to stop a simulation infinite loop + m.d.comb += issueunit.i.busy_i[i].eq(fu.busy_o) + + #--------- + # connect Function Units + #--------- + + # Group Picker... done manually for now. 
TODO: cat array of pick sigs + m.d.comb += if_l[0].go_rd_i.eq(intpick1.go_rd_o[0]) # add rd + m.d.comb += if_l[0].go_wr_i.eq(intpick1.go_wr_o[0]) # add wr + + m.d.comb += if_l[1].go_rd_i.eq(intpick1.go_rd_o[1]) # subtract rd + m.d.comb += if_l[1].go_wr_i.eq(intpick1.go_wr_o[1]) # subtract wr + + # create read-pending FU-FU vectors + intfu_rd_pend_v = Signal(n_int_fus, reset_less = True) + intfu_wr_pend_v = Signal(n_int_fus, reset_less = True) + for i in range(n_int_fus): + #m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_rd_pend_o.bool()) + #m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_wr_pend_o.bool()) + m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_readable_o) + m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_writable_o) + + # Connect INT Fn Unit global wr/rd pending + for fu in if_l: + m.d.comb += fu.g_int_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o) + m.d.comb += fu.g_int_rd_pend_i.eq(g_int_rd_pend_v.g_pend_o) + + # Connect FU-FU Matrix, NOTE: FN Units readable/writable considered + # to be unit "read-pending / write-pending" + m.d.comb += intfudeps.rd_pend_i.eq(intfu_rd_pend_v) + m.d.comb += intfudeps.wr_pend_i.eq(intfu_wr_pend_v) + m.d.comb += intfudeps.issue_i.eq(issueunit.i.fn_issue_o) + for i in range(n_int_fus): + m.d.comb += intfudeps.go_rd_i[i].eq(intpick1.go_rd_o[i]) + m.d.comb += intfudeps.go_wr_i[i].eq(intpick1.go_wr_o[i]) + + # Connect Picker (note connection to FU-FU) + #--------- + readable_o = intfudeps.readable_o + writable_o = intfudeps.writable_o + m.d.comb += intpick1.rd_rel_i[0].eq(int_alus[0].rd_rel_o) + m.d.comb += intpick1.rd_rel_i[1].eq(int_alus[1].rd_rel_o) + m.d.comb += intpick1.req_rel_i[0].eq(int_alus[0].req_rel_o) + m.d.comb += intpick1.req_rel_i[1].eq(int_alus[1].req_rel_o) + m.d.comb += intpick1.readable_i[0].eq(readable_o[0]) # add rd + m.d.comb += intpick1.writable_i[0].eq(writable_o[0]) # add wr + m.d.comb += intpick1.readable_i[1].eq(readable_o[1]) # sub rd + m.d.comb += intpick1.writable_i[1].eq(writable_o[1]) # sub wr + + #--------- 
+ # Connect Register File(s) + #--------- + #with m.If(if_l[0].go_wr_i | if_l[1].go_wr_i): + m.d.sync += int_dest.wen.eq(g_int_wr_pend_v.g_pend_o) + #with m.If(intpick1.go_rd_o): + #with m.If(if_l[0].go_rd_i | if_l[1].go_rd_i): + m.d.sync += int_src1.ren.eq(g_int_src1_pend_v.g_pend_o) + m.d.sync += int_src2.ren.eq(g_int_src2_pend_v.g_pend_o) + + # merge (OR) all integer FU / ALU outputs to a single value + # bit of a hack: treereduce needs a list with an item named "dest_o" + dest_o = treereduce(int_alus) + m.d.sync += int_dest.data_i.eq(dest_o) + + # connect ALUs + for i, alu in enumerate(int_alus): + m.d.comb += alu.go_rd_i.eq(intpick1.go_rd_o[i]) + m.d.comb += alu.go_wr_i.eq(intpick1.go_wr_o[i]) + m.d.comb += alu.issue_i.eq(fn_issue_l[i]) + #m.d.comb += fn_busy_l[i].eq(alu.busy_o) # XXX ignore, use fnissue + m.d.comb += alu.src1_i.eq(int_src1.data_o) + m.d.comb += alu.src2_i.eq(int_src2.data_o) + m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o) # pipe out ready + + return m + + + def __iter__(self): + yield from self.intregs + yield from self.fpregs + yield self.int_store_i + yield self.int_dest_i + yield self.int_src1_i + yield self.int_src2_i + yield self.issue_o + #yield from self.int_src1 + #yield from self.int_dest + #yield from self.int_src1 + #yield from self.int_src2 + #yield from self.fp_dest + #yield from self.fp_src1 + #yield from self.fp_src2 + + def ports(self): + return list(self) + +IADD = 0 +ISUB = 1 + +class RegSim: + def __init__(self, rwidth, nregs): + self.rwidth = rwidth + self.regs = [0] * nregs + + def op(self, op, src1, src2, dest): + src1 = self.regs[src1] + src2 = self.regs[src2] + if op == IADD: + val = (src1 + src2) & ((1<<(self.rwidth))-1) + elif op == ISUB: + val = (src1 - src2) & ((1<<(self.rwidth))-1) + self.regs[dest] = val + + def setval(self, dest, val): + self.regs[dest] = val + + def dump(self, dut): + for i, val in enumerate(self.regs): + reg = yield dut.intregs.regs[i].reg + okstr = "OK" if reg == val else "!ok" + print("reg 
%d expected %x received %x %s" % (i, val, reg, okstr)) + + def check(self, dut): + for i, val in enumerate(self.regs): + reg = yield dut.intregs.regs[i].reg + if reg != val: + print("reg %d expected %x received %x\n" % (i, val, reg)) + yield from self.dump(dut) + assert False + +def int_instr(dut, alusim, op, src1, src2, dest): + for i in range(len(dut.int_insn_i)): + yield dut.int_insn_i[i].eq(0) + yield dut.int_dest_i.eq(dest) + yield dut.int_src1_i.eq(src1) + yield dut.int_src2_i.eq(src2) + yield dut.int_insn_i[op].eq(1) + alusim.op(op, src1, src2, dest) + + +def print_reg(dut, rnums): + rs = [] + for rnum in rnums: + reg = yield dut.intregs.regs[rnum].reg + rs.append("%x" % reg) + rnums = map(str, rnums) + print ("reg %s: %s" % (','.join(rnums), ','.join(rs))) + + +def scoreboard_sim(dut, alusim): + yield dut.int_store_i.eq(0) + + for i in range(1, dut.n_regs): + yield dut.intregs.regs[i].reg.eq(i) + alusim.setval(i, i) + + if False: + yield from int_instr(dut, alusim, IADD, 4, 3, 5) + yield from print_reg(dut, [3,4,5]) + yield + yield from int_instr(dut, alusim, IADD, 5, 2, 5) + yield from print_reg(dut, [3,4,5]) + yield + yield from int_instr(dut, alusim, ISUB, 5, 1, 3) + yield from print_reg(dut, [3,4,5]) + yield + for i in range(len(dut.int_insn_i)): + yield dut.int_insn_i[i].eq(0) + yield from print_reg(dut, [3,4,5]) + yield + yield from print_reg(dut, [3,4,5]) + yield + yield from print_reg(dut, [3,4,5]) + yield + + yield from alusim.check(dut) + + for i in range(2): + src1 = randint(1, dut.n_regs-1) + src2 = randint(1, dut.n_regs-1) + while True: + dest = randint(1, dut.n_regs-1) + break + if dest not in [src1, src2]: + break + op = randint(0, 1) + if False: + if i % 2 == 0: + src1 = 6 + src2 = 6 + dest = 1 + else: + src1 = 1 + src2 = 7 + dest = 2 + #src1 = 2 + #src2 = 3 + #dest = 2 + + op = i + + if True: + if i == 0: + src1 = 2 + src2 = 3 + dest = 3 + else: + src1 = 5 + src2 = 3 + dest = 4 + + #op = (i+1) % 2 + op = i + + print ("random %d: %d %d %d 
%d\n" % (i, op, src1, src2, dest)) + yield from int_instr(dut, alusim, op, src1, src2, dest) + yield from print_reg(dut, [3,4,5]) + while True: + yield + issue_o = yield dut.issue_o + if issue_o: + yield from print_reg(dut, [3,4,5]) + for i in range(len(dut.int_insn_i)): + yield dut.int_insn_i[i].eq(0) + break + print ("busy",) + yield from print_reg(dut, [3,4,5]) + yield + yield + yield + + + yield + yield from print_reg(dut, [3,4,5]) + yield + yield from print_reg(dut, [3,4,5]) + yield + yield from print_reg(dut, [3,4,5]) + yield + yield from print_reg(dut, [3,4,5]) + yield + yield + yield + yield + yield + yield + yield + yield + yield + yield from alusim.check(dut) + yield from alusim.dump(dut) + + +def explore_groups(dut): + from nmigen.hdl.ir import Fragment + from nmigen.hdl.xfrm import LHSGroupAnalyzer + + fragment = dut.elaborate(platform=None) + fr = Fragment.get(fragment, platform=None) + + groups = LHSGroupAnalyzer()(fragment._statements) + + print (groups) + + +def test_scoreboard(): + dut = Scoreboard(16, 8) + alusim = RegSim(16, 8) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_scoreboard.il", "w") as f: + f.write(vl) + + run_simulation(dut, scoreboard_sim(dut, alusim), + vcd_name='test_scoreboard.vcd') + + +if __name__ == '__main__': + test_scoreboard() diff --git a/src/soc/experiment/score6600.py b/src/soc/experiment/score6600.py new file mode 100644 index 00000000..209bc99c --- /dev/null +++ b/src/soc/experiment/score6600.py @@ -0,0 +1,1296 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen.hdl.ast import unsigned +from nmigen import Module, Const, Signal, Array, Cat, Elaboratable, Memory + +from regfile.regfile import RegFileArray, treereduce +from scoreboard.fu_fu_matrix import FUFUDepMatrix +from scoreboard.fu_reg_matrix import FURegDepMatrix +from scoreboard.global_pending import GlobalPending +from scoreboard.group_picker import GroupPicker +from scoreboard.issue_unit import 
IssueUnitGroup, IssueUnitArray, RegDecode +from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord +from scoreboard.instruction_q import Instruction, InstructionQ +from scoreboard.memfu import MemFunctionUnits + +from compalu import ComputationUnitNoDelay +from compldst import LDSTCompUnit + +from alu_hier import ALU, BranchALU +from nmutil.latch import SRLatch +from nmutil.nmoperator import eq + +from random import randint, seed +from copy import deepcopy +from math import log + + +class TestMemory(Elaboratable): + def __init__(self, regwid, addrw): + self.ddepth = 1 # regwid //8 + depth = (1<>self.ddepth] + + def st(self, addr, data): + self.mem[addr>>self.ddepth] = data & ((1< Mem FUs + comb += memfus.addr_en_i.eq(cul.adr_rel_o) # Match enable on adr rel + comb += memfus.addr_rs_i.eq(reset_b) # reset same as LDSTCompUnit + + # LD/STs have to accumulate prior LD/STs (TODO: multi-issue as well, + # in a transitive fashion). This cycle activates based on LDSTCompUnit + # issue_i. multi-issue gets a bit more complex but not a lot. + prior_ldsts = Signal(cul.n_units, reset_less=True) + sync += prior_ldsts.eq(memfus.g_int_ld_pend_o | memfus.g_int_st_pend_o) + with m.If(self.ls_oper_i[2]): # LD bit of operand + comb += memfus.ld_i.eq(cul.issue_i | prior_ldsts) + with m.If(self.ls_oper_i[3]): # ST bit of operand + comb += memfus.st_i.eq(cul.issue_i | prior_ldsts) + + # TODO: adr_rel_o needs to go into L1 Cache. for now, + # just immediately activate go_adr + comb += cul.go_ad_i.eq(cul.adr_rel_o) + + # connect up address data + comb += memfus.addrs_i[0].eq(cul.units[0].addr_o) + comb += memfus.addrs_i[1].eq(cul.units[1].addr_o) + + # connect loadable / storable to go_ld/go_st. + # XXX should only be done when the memory ld/st has actually happened! 
+ go_st_i = Signal(cul.n_units, reset_less=True) + go_ld_i = Signal(cul.n_units, reset_less=True) + comb += go_ld_i.eq(memfus.loadable_o & memfus.addr_nomatch_o &\ + cul.req_rel_o & cul.ld_o) + comb += go_st_i.eq(memfus.storable_o & memfus.addr_nomatch_o &\ + cul.sto_rel_o & cul.st_o) + comb += memfus.go_ld_i.eq(go_ld_i) + comb += memfus.go_st_i.eq(go_st_i) + #comb += cul.go_wr_i.eq(go_ld_i) + comb += cul.go_st_i.eq(go_st_i) + + #comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) + #comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) + #comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus]) + + #--------- + # merge shadow matrices outputs + #--------- + + # these are explained in ShadowMatrix docstring, and are to be + # connected to the FUReg and FUFU Matrices, to get them to reset + anydie = Signal(n_intfus, reset_less=True) + allshadown = Signal(n_intfus, reset_less=True) + shreset = Signal(n_intfus, reset_less=True) + comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o) + comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o) + comb += shreset.eq(bspec.match_g_o | bspec.match_f_o) + + #--------- + # connect fu-fu matrix + #--------- + + # Group Picker... done manually for now. 
+ go_rd_o = intpick1.go_rd_o + go_wr_o = intpick1.go_wr_o + go_rd_i = intfus.go_rd_i + go_wr_i = intfus.go_wr_i + go_die_i = intfus.go_die_i + # NOTE: connect to the shadowed versions so that they can "die" (reset) + comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd + comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr + comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die + + # Connect Picker + #--------- + comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus]) + comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus]) + int_rd_o = intfus.readable_o + int_wr_o = intfus.writable_o + comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus]) + comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus]) + + #--------- + # Shadow Matrix + #--------- + + comb += shadows.issue_i.eq(fn_issue_o) + #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus]) + comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus]) + #--------- + # NOTE; this setup is for the instruction order preservation... + + # connect shadows / go_dies to Computation Units + comb += cu.shadown_i[0:n_intfus].eq(allshadown) + comb += cu.go_die_i[0:n_intfus].eq(anydie) + + # ok connect first n_int_fu shadows to busy lines, to create an + # instruction-order linked-list-like arrangement, using a bit-matrix + # (instead of e.g. a ring buffer). + + # when written, the shadow can be cancelled (and was good) + for i in range(n_intfus): + comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus]) + + # *previous* instruction shadows *current* instruction, and, obviously, + # if the previous is completed (!busy) don't cast the shadow! + comb += prev_shadow.eq(~fn_issue_o & cu.busy_o) + for i in range(n_intfus): + comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow) + + #--------- + # ... and this is for branch speculation. 
it uses the extra bit + # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1) + # only needs to set shadow_i, s_fail_i and s_good_i + + # issue captures shadow_i (if enabled) + comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus]) + + bactive = Signal(reset_less=True) + comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i) + + # instruction being issued (fn_issue_o) has a shadow cast by the branch + with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)): + comb += bshadow.issue_i.eq(fn_issue_o) + for i in range(n_intfus): + with m.If(fn_issue_o & (Const(1<> (src2 & maxbits) + elif op == IBGT: + val = int(src1 > src2) + elif op == IBLT: + val = int(src1 < src2) + elif op == IBEQ: + val = int(src1 == src2) + elif op == IBNE: + val = int(src1 != src2) + else: + return 0 # LD/ST TODO + val &= maxbits + self.setval(dest, val) + return val + + def setval(self, dest, val): + print ("sim setval", dest, hex(val)) + self.regs[dest] = val + + def dump(self, dut): + for i, val in enumerate(self.regs): + reg = yield dut.intregs.regs[i].reg + okstr = "OK" if reg == val else "!ok" + print("reg %d expected %x received %x %s" % (i, val, reg, okstr)) + + def check(self, dut): + for i, val in enumerate(self.regs): + reg = yield dut.intregs.regs[i].reg + if reg != val: + print("reg %d expected %x received %x\n" % (i, val, reg)) + yield from self.dump(dut) + assert False + +def instr_q(dut, op, op_imm, imm, src1, src2, dest, + branch_success, branch_fail): + instrs = [{'oper_i': op, 'dest_i': dest, 'imm_i': imm, 'opim_i': op_imm, + 'src1_i': src1, 'src2_i': src2}] + + sendlen = 1 + for idx in range(sendlen): + yield from eq(dut.data_i[idx], instrs[idx]) + di = yield dut.data_i[idx] + print ("senddata %d %x" % (idx, di)) + yield dut.p_add_i.eq(sendlen) + yield + o_p_ready = yield dut.p_ready_o + while not o_p_ready: + yield + o_p_ready = yield dut.p_ready_o + + yield dut.p_add_i.eq(0) + + +def int_instr(dut, op, imm, src1, src2, dest, branch_success, 
branch_fail): + yield from disable_issue(dut) + yield dut.int_dest_i.eq(dest) + yield dut.int_src1_i.eq(src1) + yield dut.int_src2_i.eq(src2) + if (op & (0x3<<2)) != 0: # branch + yield dut.brissue.insn_i.eq(1) + yield dut.br_oper_i.eq(Const(op & 0x3, 2)) + yield dut.br_imm_i.eq(imm) + dut_issue = dut.brissue + else: + yield dut.aluissue.insn_i.eq(1) + yield dut.alu_oper_i.eq(Const(op & 0x3, 2)) + yield dut.alu_imm_i.eq(imm) + dut_issue = dut.aluissue + yield dut.reg_enable_i.eq(1) + + # these indicate that the instruction is to be made shadow-dependent on + # (either) branch success or branch fail + yield dut.branch_fail_i.eq(branch_fail) + yield dut.branch_succ_i.eq(branch_success) + + yield + yield from wait_for_issue(dut, dut_issue) + + +def print_reg(dut, rnums): + rs = [] + for rnum in rnums: + reg = yield dut.intregs.regs[rnum].reg + rs.append("%x" % reg) + rnums = map(str, rnums) + print ("reg %s: %s" % (','.join(rnums), ','.join(rs))) + + +def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3): + insts = [] + for i in range(n_ops): + src1 = randint(1, dut.n_regs-1) + src2 = randint(1, dut.n_regs-1) + imm = randint(1, (1<= 4 + if is_branch: + branch_ok, branch_fail = dest + dest = src2 + # ok zip up the branch success / fail instructions and + # drop them into the queue, one marked "to have branch success" + # the other to be marked shadow branch "fail". 
+ # one out of each of these will be cancelled + for ok, fl in zip(branch_ok, branch_fail): + if ok: + instrs.append((ok[0], ok[1], ok[2], ok[3], (1, 0))) + if fl: + instrs.append((fl[0], fl[1], fl[2], fl[3], (0, 1))) + print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \ + (i, src1, src2, dest, op, shadow_on, shadow_off)) + yield from int_instr(dut, op, src1, src2, dest, + shadow_on, shadow_off) + + # wait for all instructions to stop before checking + yield + yield from wait_for_busy_clear(dut) + + i = -1 + while siminsts: + instr = siminsts.pop(0) + if instr is None: + continue + (src1, src2, dest, op, (shadow_on, shadow_off)) = instr + i += 1 + is_branch = op >= 4 + if is_branch: + branch_ok, branch_fail = dest + dest = src2 + print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \ + (i, src1, src2, dest, op, shadow_on, shadow_off)) + branch_res = alusim.op(op, src1, src2, dest) + if is_branch: + if branch_res: + siminsts += branch_ok + else: + siminsts += branch_fail + + # check status + yield from alusim.check(dut) + yield from alusim.dump(dut) + + +def scoreboard_sim(dut, alusim): + + seed(0) + + for i in range(1): + + # set random values in the registers + for i in range(1, dut.n_regs): + val = randint(0, (1< 1) || (block_forwarding == 1'b1); + + generate + if (ENABLE_L2TLB == 1) begin : HUM_BUFFER + + axi_buffer_rab_bram + #( + .DATA_WIDTH ( BUFFER_WIDTH ), + .BUFFER_DEPTH ( HUM_BUFFER_DEPTH ) + ) + u_hum_buf + ( + .clk ( axi4_aclk ), + .rstn ( axi4_arstn ), + // Push + .data_in ( {axi4_wuser, axi4_wstrb, axi4_wdata, axi4_wlast} ), + .valid_in ( hum_buf_valid_in ), + .ready_out ( hum_buf_ready_out ), + // Pop + .data_out ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ), + .valid_out ( hum_buf_valid_out ), + .ready_in ( hum_buf_ready_in ), + // Clear + .almost_full ( hum_buf_almost_full ), + .underfull ( hum_buf_underfull ), + .drop_req ( hum_buf_drop_req_SP ), + .drop_len ( hum_buf_drop_len_SP ) + ); + + axi_buffer_rab + #( + .DATA_WIDTH ( 
2+AXI_ID_WIDTH+8+3 ), + .BUFFER_DEPTH ( L2_FIFO_DEPTH ) + ) + u_l2_fifo + ( + .clk ( axi4_aclk ), + .rstn ( axi4_arstn ), + // Push + .data_in ( {l2_prefetch_i, l2_hit_i, l2_id_i, l2_len_i, l2_master_i, l2_accept_i, l2_drop_i} ), + .valid_in ( l2_fifo_valid_in ), + .ready_out ( l2_fifo_ready_out ), + // Pop + .data_out ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur} ), + .valid_out ( l2_fifo_valid_out ), + .ready_in ( l2_fifo_ready_in ) + ); + + // Push upon receiving new result from TLB. + assign l2_req = l2_accept_i | l2_drop_i; + assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out; + + assign wlast_in = axi4_wlast & hum_buf_valid_in & hum_buf_ready_out; + assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in; + + always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin + if (axi4_arstn == 0) begin + fifo_select_SP <= 1'b0; + hum_buf_drop_len_SP <= 'b0; + hum_buf_drop_req_SP <= 1'b0; + hum_buf_SP <= STORE; + n_wlast_SP <= 'b0; + end else begin + fifo_select_SP <= fifo_select_SN; + hum_buf_drop_len_SP <= hum_buf_drop_len_SN; + hum_buf_drop_req_SP <= hum_buf_drop_req_SN; + hum_buf_SP <= hum_buf_SN; + n_wlast_SP <= n_wlast_SN; + end + end + + always_comb begin + n_wlast_SN = n_wlast_SP; + if (hum_buf_drop_req_SP) begin // Happens exactly once per burst to be dropped. 
+ n_wlast_SN -= 1; + end + if (wlast_in) begin + n_wlast_SN += 1; + end + if (wlast_out) begin + n_wlast_SN -= 1; + end + end + + always_comb begin : HUM_BUFFER_FSM + hum_buf_SN = hum_buf_SP; + + m_axi4_wlast = 1'b0; + m_axi4_wdata = 'b0; + m_axi4_wstrb = 'b0; + m_axi4_wuser = 'b0; + + m_axi4_wvalid = 1'b0; + axi4_wready = 1'b0; + + hum_buf_valid_in = 1'b0; + hum_buf_ready_in = 1'b0; + + hum_buf_drop_req_SN = hum_buf_drop_req_SP; + hum_buf_drop_len_SN = hum_buf_drop_len_SP; + master_select_o = 1'b0; + + w_done = 1'b0; // read from FIFO without handshake with B sender + b_drop_o = 1'b0; // send data from FIFO to B sender (with handshake) + fifo_select = 1'b0; + + fifo_select_SN = fifo_select_SP; + stop_store = 1'b0; + + block_forwarding = 1'b0; + + unique case (hum_buf_SP) + + STORE : begin + // Simply store the data in the buffer. + hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out; + axi4_wready = hum_buf_ready_out; + + // We have got a full burst in the HUM buffer, thus stop storing. + if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin + hum_buf_SN = WAIT_L1_BYPASS_YES; + + // The buffer is full, thus wait for decision. + end else if (~hum_buf_ready_out) begin + hum_buf_SN = WAIT_L1_BYPASS_NO; + end + + // Avoid the forwarding of L1 hits until we know whether we can bypass. + if (l1_fifo_valid_out & l1_save_cur) begin + block_forwarding = 1'b1; + end + end + + WAIT_L1_BYPASS_YES : begin + // Wait for orders from L1 TLB. + if (l1_fifo_valid_out) begin + + // L1 hit - forward data from buffer + if (l1_accept_cur) begin + m_axi4_wlast = hum_buf_wlast; + m_axi4_wdata = hum_buf_wdata; + m_axi4_wstrb = hum_buf_wstrb; + m_axi4_wuser = hum_buf_wuser; + + m_axi4_wvalid = hum_buf_valid_out; + hum_buf_ready_in = m_axi4_wready; + + master_select_o = l1_master_cur; + + // Detect last data beat. 
+ if (wlast_out) begin + fifo_select = 1'b0; + w_done = 1'b1; + hum_buf_SN = STORE; + end + + // L1 miss - wait for L2 + end else if (l1_save_cur) begin + fifo_select = 1'b0; + w_done = 1'b1; + hum_buf_SN = WAIT_L2_BYPASS_YES; + + // L1 prefetch, prot, multi - drop data + end else if (l1_drop_cur) begin + fifo_select_SN = 1'b0; // L1 + hum_buf_drop_req_SN = 1'b1; + hum_buf_drop_len_SN = l1_len_cur; + hum_buf_SN = FLUSH; + end + end + end + + WAIT_L2_BYPASS_YES : begin + // Wait for orders from L2 TLB. + if (l2_fifo_valid_out) begin + + // L2 hit - forward data from buffer + if (l2_accept_cur) begin + m_axi4_wlast = hum_buf_wlast; + m_axi4_wdata = hum_buf_wdata; + m_axi4_wstrb = hum_buf_wstrb; + m_axi4_wuser = hum_buf_wuser; + + m_axi4_wvalid = hum_buf_valid_out; + hum_buf_ready_in = m_axi4_wready; + + master_select_o = l2_master_cur; + + // Detect last data beat. + if (wlast_out) begin + fifo_select = 1'b1; + w_done = 1'b1; + hum_buf_SN = STORE; + end + + // L2 miss/prefetch hit + end else if (l2_drop_cur) begin + fifo_select_SN = 1'b1; // L2 + hum_buf_drop_req_SN = 1'b1; + hum_buf_drop_len_SN = l2_len_cur; + hum_buf_SN = FLUSH; + end + + // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions. + end else if (l1_fifo_valid_out) begin + + // L1 hit + if (l1_accept_cur) begin + hum_buf_SN = BYPASS; + + // L1 prefetch/prot/multi + end else if (l1_drop_cur) begin + hum_buf_SN = DISCARD; + end + end + end + + FLUSH : begin + // Clear HUM buffer flush request. + hum_buf_drop_req_SN = 1'b0; + + // perform handshake with B sender + fifo_select = fifo_select_SP; + b_drop_o = 1'b1; + if (b_done_i) begin + hum_buf_SN = STORE; + end + end + + BYPASS : begin + // Forward one full transaction from input buffer. 
+ m_axi4_wlast = axi4_wlast; + m_axi4_wdata = axi4_wdata; + m_axi4_wstrb = axi4_wstrb; + m_axi4_wuser = axi4_wuser; + + m_axi4_wvalid = axi4_wvalid; + axi4_wready = m_axi4_wready; + + master_select_o = l1_master_cur; + + // We have got a full transaction. + if (axi4_wlast & axi4_wready & axi4_wvalid) begin + fifo_select = 1'b0; + w_done = 1'b1; + hum_buf_SN = WAIT_L2_BYPASS_YES; + end + end + + DISCARD : begin + // Discard one full transaction from input buffer. + axi4_wready = 1'b1; + + // We have got a full transaction. + if (axi4_wlast & axi4_wready & axi4_wvalid) begin + // Try to perform handshake with B sender. + fifo_select = 1'b0; + b_drop_o = 1'b1; + // We cannot wait here due to axi4_wready. + if (b_done_i) begin + hum_buf_SN = WAIT_L2_BYPASS_YES; + end else begin + hum_buf_SN = DISCARD_FINISH; + end + end + end + + DISCARD_FINISH : begin + // Perform handshake with B sender. + fifo_select = 1'b0; + b_drop_o = 1'b1; + if (b_done_i) begin + hum_buf_SN = WAIT_L2_BYPASS_YES; + end + end + + WAIT_L1_BYPASS_NO : begin + // Do not allow the forwarding of L1 hits. + block_forwarding = 1'b1; + + // Wait for orders from L1 TLB. + if (l1_fifo_valid_out) begin + + // L1 hit - forward data from/through HUM buffer and refill the buffer + if (l1_accept_cur) begin + // Forward data from HUM buffer. + m_axi4_wlast = hum_buf_wlast; + m_axi4_wdata = hum_buf_wdata; + m_axi4_wstrb = hum_buf_wstrb; + m_axi4_wuser = hum_buf_wuser; + + m_axi4_wvalid = hum_buf_valid_out; + hum_buf_ready_in = m_axi4_wready; + + master_select_o = l1_master_cur; + + // Refill the HUM buffer. Stop when buffer full. + stop_store = ~hum_buf_ready_out; + hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ; + axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out; + + // Detect last data beat. 
+ if (wlast_out) begin + fifo_select = 1'b0; + w_done = 1'b1; + if (~hum_buf_ready_out | hum_buf_almost_full) begin + hum_buf_SN = WAIT_L1_BYPASS_NO; + end else begin + hum_buf_SN = STORE; + end + end + + // Allow the forwarding of L1 hits. + block_forwarding = 1'b0; + + // L1 miss - wait for L2 + end else if (l1_save_cur) begin + fifo_select = 1'b0; + w_done = 1'b1; + hum_buf_SN = WAIT_L2_BYPASS_NO; + + // L1 prefetch, prot, multi - drop data + end else if (l1_drop_cur) begin + fifo_select_SN = 1'b0; // L1 + hum_buf_drop_req_SN = 1'b1; + hum_buf_drop_len_SN = l1_len_cur; + hum_buf_SN = FLUSH; + + // Allow the forwarding of L1 hits. + block_forwarding = 1'b0; + end + end + end + + WAIT_L2_BYPASS_NO : begin + // Do not allow the forwarding of L1 hits. + block_forwarding = 1'b1; + + // Wait for orders from L2 TLB. + if (l2_fifo_valid_out) begin + + // L2 hit - forward first part from HUM buffer, rest from input buffer + if (l2_accept_cur) begin + // Forward data from HUM buffer. + m_axi4_wlast = hum_buf_wlast; + m_axi4_wdata = hum_buf_wdata; + m_axi4_wstrb = hum_buf_wstrb; + m_axi4_wuser = hum_buf_wuser; + + m_axi4_wvalid = hum_buf_valid_out; + hum_buf_ready_in = m_axi4_wready; + + master_select_o = l2_master_cur; + + // Refill the HUM buffer. Stop when buffer full. + stop_store = ~hum_buf_ready_out; + hum_buf_valid_in = stop_store ? 1'b0 : axi4_wvalid ; + axi4_wready = stop_store ? 1'b0 : hum_buf_ready_out; + + // Detect last data beat. + if (wlast_out) begin + fifo_select = 1'b1; + w_done = 1'b1; + if (~hum_buf_ready_out | hum_buf_almost_full) begin + hum_buf_SN = WAIT_L1_BYPASS_NO; + end else begin + hum_buf_SN = STORE; + end + end + + // Allow the forwarding of L1 hits. + block_forwarding = 1'b0; + + // L2 miss/prefetch hit - drop data + end else if (l2_drop_cur) begin + fifo_select_SN = 1'b1; // L2 + hum_buf_drop_req_SN = 1'b1; + hum_buf_drop_len_SN = l2_len_cur; + hum_buf_SN = FLUSH; + + // Allow the forwarding of L1 hits. 
+ block_forwarding = 1'b0; + end + end + end + + + default: begin + hum_buf_SN = STORE; + end + + endcase // hum_buf_SP + end // HUM_BUFFER_FSM + + assign b_drop_set = 1'b0; + + end else begin // HUM_BUFFER + + // register to perform the handshake with B sender + always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin + if (axi4_arstn == 0) begin + b_drop_o <= 1'b0; + end else if (b_done_i) begin + b_drop_o <= 1'b0; + end else if (b_drop_set) begin + b_drop_o <= 1'b1;; + end + end + + always_comb begin : OUTPUT_CTRL + + fifo_select = 1'b0; + w_done = 1'b0; + b_drop_set = 1'b0; + + m_axi4_wlast = 1'b0; + m_axi4_wdata = 'b0; + m_axi4_wstrb = 'b0; + m_axi4_wuser = 'b0; + + m_axi4_wvalid = 1'b0; + axi4_wready = 1'b0; + + if (l1_fifo_valid_out) begin + // forward data + if (l1_accept_cur) begin + m_axi4_wlast = axi4_wlast; + m_axi4_wdata = axi4_wdata; + m_axi4_wstrb = axi4_wstrb; + m_axi4_wuser = axi4_wuser; + + m_axi4_wvalid = axi4_wvalid; + axi4_wready = m_axi4_wready; + + // Simply pop from FIFO upon last data beat. + w_done = axi4_wlast & axi4_wvalid & axi4_wready; + + // discard entire burst + end else if (b_drop_o == 1'b0) begin + axi4_wready = 1'b1; + + // Simply pop from FIFO upon last data beat. Perform handshake with B sender. 
+ if (axi4_wlast & axi4_wvalid & axi4_wready) + b_drop_set = 1'b1; + end + end + + end // OUTPUT_CTRL + + assign master_select_o = l1_master_cur; + assign l2_fifo_ready_out = 1'b1; + assign block_forwarding = 1'b0; + + // unused signals + assign hum_buf_ready_out = 1'b0; + assign hum_buf_valid_in = 1'b0; + assign hum_buf_ready_in = 1'b0; + assign hum_buf_valid_out = 1'b0; + assign hum_buf_wdata = 'b0; + assign hum_buf_wstrb = 'b0; + assign hum_buf_wlast = 1'b0; + assign hum_buf_wuser = 'b0; + assign hum_buf_drop_len_SN = 'b0; + assign hum_buf_drop_req_SN = 1'b0; + assign hum_buf_almost_full = 1'b0; + + assign l2_fifo_valid_in = 1'b0; + assign l2_fifo_valid_out = 1'b0; + assign l2_prefetch_cur = 1'b0; + assign l2_hit_cur = 1'b0; + assign l2_id_cur = 'b0; + assign l2_len_cur = 'b0; + assign l2_master_cur = 1'b0; + assign l2_accept_cur = 1'b0; + assign l2_drop_cur = 1'b0; + + assign l2_req = 1'b0; + + assign fifo_select_SN = 1'b0; + assign fifo_select_SP = 1'b0; + + assign stop_store = 1'b0; + assign n_wlast_SP = 'b0; + assign wlast_in = 1'b0; + assign wlast_out = 1'b0; + + end // HUM_BUFFER + + endgenerate +""" diff --git a/src/soc/iommu/axi_rab/axi4_w_sender.py b/src/soc/iommu/axi_rab/axi4_w_sender.py new file mode 100644 index 00000000..9916334f --- /dev/null +++ b/src/soc/iommu/axi_rab/axi4_w_sender.py @@ -0,0 +1,78 @@ +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable + + +class axi4_w_sender(Elaboratable): + + def __init__(self): + self.axi4_aclk = Signal() # input + self.axi4_arstn = Signal() # input + self.s_axi4_wdata = Signal() # input + self.s_axi4_wvalid = Signal() # input + self.s_axi4_wready = Signal() # output + self.s_axi4_wstrb = Signal() # input + self.s_axi4_wlast = Signal() # input + self.s_axi4_wuser = Signal() # input + self.m_axi4_wdata = Signal() # output + self.m_axi4_wvalid = Signal() # output + self.m_axi4_wready = Signal() # input + self.m_axi4_wstrb = Signal() # output + 
self.m_axi4_wlast = Signal() # output + self.m_axi4_wuser = Signal() # output + + def elaborate(self, platform=None): + m = Module() + m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata) + m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb) + m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast) + m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser) + m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid) + m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready) + return m + +# // Copyright 2018 ETH Zurich and University of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. 
+# +# module axi4_w_sender +# #( +# parameter AXI_DATA_WIDTH = 32, +# parameter AXI_USER_WIDTH = 2 +# ) +# ( +# input axi4_aclk, +# input axi4_arstn, +# +# input [AXI_DATA_WIDTH-1:0] s_axi4_wdata, +# input s_axi4_wvalid, +# output s_axi4_wready, +# input [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb, +# input s_axi4_wlast, +# input [AXI_USER_WIDTH-1:0] s_axi4_wuser, +# +# output [AXI_DATA_WIDTH-1:0] m_axi4_wdata, +# output m_axi4_wvalid, +# input m_axi4_wready, +# output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb, +# output m_axi4_wlast, +# output [AXI_USER_WIDTH-1:0] m_axi4_wuser +# ); +# +# assign m_axi4_wdata = s_axi4_wdata; +# assign m_axi4_wstrb = s_axi4_wstrb; +# assign m_axi4_wlast = s_axi4_wlast; +# assign m_axi4_wuser = s_axi4_wuser; +# +# assign m_axi4_wvalid = s_axi4_wvalid; +# assign s_axi4_wready = m_axi4_wready; +# +# endmodule +# +# diff --git a/src/soc/iommu/axi_rab/axi_buffer_rab.py b/src/soc/iommu/axi_rab/axi_buffer_rab.py new file mode 100644 index 00000000..b4d99299 --- /dev/null +++ b/src/soc/iommu/axi_rab/axi_buffer_rab.py @@ -0,0 +1,151 @@ +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable + + +class axi_buffer_rab(Elaboratable): + + def __init__(self): + self.clk = Signal() # input + self.rstn = Signal() # input + self.data_out = Signal(DATA_WIDTH) # output + self.valid_out = Signal() # output + self.ready_in = Signal() # input + self.valid_in = Signal() # input + self.data_in = Signal(DATA_WIDTH) # input + self.ready_out = Signal() # output + + def elaborate(self, platform=None): + m = Module() + m.d.comb += self.full.eq(self.None) + m.d.comb += self.data_out.eq(self.None) + m.d.comb += self.valid_out.eq(self.None) + m.d.comb += self.ready_out.eq(self.None) + return m + +# // Copyright 2018 ETH Zurich and University of Bologna. 
+# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. +# +# //import CfMath::log2; +# +# module axi_buffer_rab +# //#( +# // parameter DATA_WIDTH, +# // parameter BUFFER_DEPTH +# //) +# ( +# input logic clk, +# input logic rstn, +# +# // Downstream port +# output logic [DATA_WIDTH-1:0] data_out, +# output logic valid_out, +# input logic ready_in, +# +# // Upstream port +# input logic valid_in, +# input logic [DATA_WIDTH-1:0] data_in, +# output logic ready_out +# ); +# +# localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH); +# +# // Internal data structures +# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in; // location to which we last wrote +# reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out; // location from which we last sent +# reg [LOG_BUFFER_DEPTH : 0] elements; // number of elements in the buffer +# reg [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0]; +# +# wire full; +# +# integer loop1; +# +# assign full = (elements == BUFFER_DEPTH); +# +# always @(posedge clk or negedge rstn) +# begin: elements_sequential +# if (rstn == 1'b0) +# elements <= 0; +# else +# begin +# // ------------------ +# // Are we filling up? 
+# // ------------------ +# // One out, none in +# if (ready_in && valid_out && (!valid_in || full)) +# elements <= elements - 1; +# // None out, one in +# else if ((!valid_out || !ready_in) && valid_in && !full) +# elements <= elements + 1; +# // Else, either one out and one in, or none out and none in - stays unchanged +# end +# end +# +# always @(posedge clk or negedge rstn) +# begin: buffers_sequential +# if (rstn == 1'b0) +# begin +# for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1) +# buffer[loop1] <= 0; +# end +# else +# begin +# // Update the memory +# if (valid_in && !full) +# buffer[pointer_in] <= data_in; +# end +# end +# +# always @(posedge clk or negedge rstn) +# begin: sequential +# if (rstn == 1'b0) +# begin +# pointer_out <= 0; +# pointer_in <= 0; +# end +# else +# begin +# // ------------------------------------ +# // Check what to do with the input side +# // ------------------------------------ +# // We have some input, increase by 1 the input pointer +# if (valid_in && !full) +# begin +# if (pointer_in == $unsigned(BUFFER_DEPTH - 1)) +# pointer_in <= 0; +# else +# pointer_in <= pointer_in + 1; +# end +# // Else we don't have any input, the input pointer stays the same +# +# // ------------------------------------- +# // Check what to do with the output side +# // ------------------------------------- +# // We had pushed one flit out, we can try to go for the next one +# if (ready_in && valid_out) +# begin +# if (pointer_out == $unsigned(BUFFER_DEPTH - 1)) +# pointer_out <= 0; +# else +# pointer_out <= pointer_out + 1; +# end +# // Else stay on the same output location +# end +# end +# +# // Update output ports +# assign data_out = buffer[pointer_out]; +# assign valid_out = (elements != 0); +# +# assign ready_out = ~full; +# +# endmodule +# +# diff --git a/src/soc/iommu/axi_rab/axi_buffer_rab_bram.py b/src/soc/iommu/axi_rab/axi_buffer_rab_bram.py new file mode 100644 index 00000000..349b314e --- /dev/null +++ 
b/src/soc/iommu/axi_rab/axi_buffer_rab_bram.py @@ -0,0 +1,209 @@ +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable + + +class axi_buffer_rab_bram(Elaboratable): + + def __init__(self): + self.clk = Signal() # input + self.rstn = Signal() # input + self.data_out = Signal(DATA_WIDTH) # output + self.valid_out = Signal() # output + self.ready_in = Signal() # input + self.valid_in = Signal() # input + self.data_in = Signal(DATA_WIDTH) # input + self.ready_out = Signal() # output + self.almost_full = Signal() # output + self.underfull = Signal() # output + self.drop_req = Signal() # input + self.drop_len = Signal(8) # input + + def elaborate(self, platform=None): + m = Module() + return m + + +# // Copyright 2018 ETH Zurich and University of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. 
+# +# ////import CfMath::log2; +# +# module axi_buffer_rab_bram +# //#( +# // parameter DATA_WIDTH, +# // parameter BUFFER_DEPTH +# // ) +# ( +# input logic clk, +# input logic rstn, +# +# // Downstream port +# output logic [DATA_WIDTH-1:0] data_out, +# output logic valid_out, +# input logic ready_in, +# +# // Upstream port +# input logic valid_in, +# input logic [DATA_WIDTH-1:0] data_in, +# output logic ready_out, +# +# // Status and drop control +# output logic almost_full, +# output logic underfull, +# input logic drop_req, +# // Number of items to drop. As for AXI lengths, counting starts at zero, i.e., `drop_len == 0` +# // and `drop_req` means drop one item. +# input logic [7:0] drop_len +# ); +# +""" #docstring_begin + // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior. + // To still push and pop simultaneously if the buffer is full, we internally increase the + // buffer depth by 1. + localparam ACT_BUFFER_DEPTH = BUFFER_DEPTH+1; + localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1); + + /** + * Internal data structures + */ + // Location to which we last wrote + logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d, ptr_in_q; + // Location from which we last sent + logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d, ptr_out_q; + // Required for fall-through behavior on the first word + logic [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram; + // Number of elements in the buffer. Can be negative if elements that have been dropped have not + // yet been written. 
+ logic signed [ACT_LOG_BUFFER_DEPTH:0] n_elems_d, n_elems_q; + + logic [DATA_WIDTH-1:0] data_out_bram, data_out_q; + logic valid_out_q; + + logic full; + + assign almost_full = (n_elems_q == BUFFER_DEPTH-1); + assign full = (n_elems_q == BUFFER_DEPTH); + + always_ff @(posedge clk, negedge rstn) begin + if (~rstn) begin + n_elems_q <= '0; + ptr_in_q <= '0; + ptr_out_q <= '0; + end else begin + n_elems_q <= n_elems_d; + ptr_in_q <= ptr_in_d; + ptr_out_q <= ptr_out_d; + end + end + + // Update the number of elements. + always_comb begin + n_elems_d = n_elems_q; + if (drop_req) begin + n_elems_d -= (drop_len + 1); + end + if (valid_in && ready_out) begin + n_elems_d += 1; + end + if (valid_out && ready_in) begin + n_elems_d -= 1; + end + end + + // Update the output pointer. + always_comb begin + ptr_out_d = ptr_out_q; + if (drop_req) begin + if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin + ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q); + end else begin + ptr_out_d += (drop_len + 1); + end + end + if (valid_out && ready_in) begin + if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin + ptr_out_d = '0; + end else begin + ptr_out_d += 1; + end + end + end + + // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for + // first-word fall-through FIFO behavior. + //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1); + assign ptr_out_bram = ptr_out_d; + + // Update the input pointer. + always_comb begin + ptr_in_d = ptr_in_q; + if (valid_in && ready_out) begin + if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin + ptr_in_d = '0; + end else begin + ptr_in_d += 1; + end + end + end + + // Update output ports. 
+ assign valid_out = (n_elems_q > $signed(0)); + assign underfull = (n_elems_q < $signed(0)); + assign ready_out = ~full; + + ram_tp_write_first #( + .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ), + .DATA_WIDTH ( DATA_WIDTH ) + ) + ram_tp_write_first_0 + ( + .clk ( clk ), + .we ( valid_in & ~full ), + .addr0 ( ptr_in_q ), + .addr1 ( ptr_out_bram ), + .d_i ( data_in ), + .d0_o ( ), + .d1_o ( data_out_bram ) + ); + + // When reading from/writing two the same address on both ports ("Write-Read Collision"), + // the data on the read port is invalid (during the write cycle). In this implementation, + // this can happen only when the buffer is empty. Thus, we forward the data from an + // register in this case. + always @(posedge clk) begin + if (rstn == 1'b0) begin + data_out_q <= 'b0; + end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin + data_out_q <= data_in; + end + end + + always @(posedge clk) begin + if (rstn == 1'b0) begin + valid_out_q <= 'b0; + end else begin + valid_out_q <= valid_out; + end + end + + // Drive output data + always_comb begin + if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO + data_out = data_out_q; + end else begin + data_out = data_out_bram; + end + end + +""" +# endmodule +# +# diff --git a/src/soc/iommu/axi_rab/axi_rab_cfg.py b/src/soc/iommu/axi_rab/axi_rab_cfg.py new file mode 100644 index 00000000..43843b95 --- /dev/null +++ b/src/soc/iommu/axi_rab/axi_rab_cfg.py @@ -0,0 +1,707 @@ +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable + + +class axi_rab_cfg(Elaboratable): + + def __init__(self): + self.Clk_CI = Signal() # input + self.Rst_RBI = Signal() # input + self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH) # input + self.s_axi_awvalid = Signal() # input + self.s_axi_awready = Signal() # output + self.s_axi_wdata = Signal() # input + self.s_axi_wstrb = Signal(1+ERROR p_expression_25) # input + self.s_axi_wvalid = Signal() # input + 
self.s_axi_wready = Signal() # output + self.s_axi_bresp = Signal(2) # output + self.s_axi_bvalid = Signal() # output + self.s_axi_bready = Signal() # input + self.s_axi_araddr = Signal(AXI_ADDR_WIDTH) # input + self.s_axi_arvalid = Signal() # input + self.s_axi_arready = Signal() # output + self.s_axi_rdata = Signal(AXI_DATA_WIDTH) # output + self.s_axi_rresp = Signal(2) # output + self.s_axi_rvalid = Signal() # output + self.s_axi_rready = Signal() # input + self.L1Cfg_DO = Signal() # output + self.L1AllowMultiHit_SO = Signal() # output + self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT) # input + self.MissMeta_DI = Signal(MISS_META_WIDTH) # input + self.Miss_SI = Signal() # input + self.MhFifoFull_SO = Signal() # output + self.wdata_l2 = Signal() # output + self.waddr_l2 = Signal() # output + self.wren_l2 = Signal(N_PORTS) # output + + def elaborate(self, platform=None): + m = Module() + return m + + +# // Copyright 2018 ETH Zurich and University of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. 
+# +# // --=========================================================================-- +# // +# // █████╗ ██╗ ██╗██╗ ██████╗ █████╗ ██████╗ ██████╗███████╗ ██████╗ +# // ██╔══██╗╚██╗██╔╝██║ ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔════╝██╔════╝ +# // ███████║ ╚███╔╝ ██║ ██████╔╝███████║██████╔╝ ██║ █████╗ ██║ ███╗ +# // ██╔══██║ ██╔██╗ ██║ ██╔══██╗██╔══██║██╔══██╗ ██║ ██╔══╝ ██║ ██║ +# // ██║ ██║██╔╝ ██╗██║ ██║ ██║██║ ██║██████╔╝ ╚██████╗██║ ╚██████╔╝ +# // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝╚═╝ ╚═════╝ +# // +# // +# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch +# // +# // Purpose : AXI4-Lite configuration and miss handling interface for RAB +# // +# // --=========================================================================-- +# +# //import CfMath::log2; +# +# module axi_rab_cfg +# #( +# parameter N_PORTS = 3, +# parameter N_REGS = 196, +# parameter N_L2_SETS = 32, +# parameter N_L2_SET_ENTRIES= 32, +# parameter ADDR_WIDTH_PHYS = 40, +# parameter ADDR_WIDTH_VIRT = 32, +# parameter N_FLAGS = 4, +# parameter AXI_DATA_WIDTH = 64, +# parameter AXI_ADDR_WIDTH = 32, +# parameter MISS_META_WIDTH = 10, // <= FIFO_WIDTH +# parameter MH_FIFO_DEPTH = 16 +# ) +# ( +# input logic Clk_CI, +# input logic Rst_RBI, +# +# // AXI Lite interface +# input logic [AXI_ADDR_WIDTH-1:0] s_axi_awaddr, +# input logic s_axi_awvalid, +# output logic s_axi_awready, +# input logic [AXI_DATA_WIDTH/8-1:0][7:0] s_axi_wdata, +# input logic [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb, +# input logic s_axi_wvalid, +# output logic s_axi_wready, +# output logic [1:0] s_axi_bresp, +# output logic s_axi_bvalid, +# input logic s_axi_bready, +# input logic [AXI_ADDR_WIDTH-1:0] s_axi_araddr, +# input logic s_axi_arvalid, +# output logic s_axi_arready, +# output logic [AXI_DATA_WIDTH-1:0] s_axi_rdata, +# output logic [1:0] s_axi_rresp, +# output logic s_axi_rvalid, +# input logic s_axi_rready, +# +# // Slice configuration +# output logic [N_REGS-1:0][63:0] L1Cfg_DO, +# output logic L1AllowMultiHit_SO, 
+# +# // Miss handling +# input logic [ADDR_WIDTH_VIRT-1:0] MissAddr_DI, +# input logic [MISS_META_WIDTH-1:0] MissMeta_DI, +# input logic Miss_SI, +# output logic MhFifoFull_SO, +# +# // L2 TLB +# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2, +# output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2, +# output logic [N_PORTS-1:0] wren_l2 +# ); +# +""" #docstring_begin + + localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32, + // because RAB slices are 64 bit wide. + localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1; + + localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2 + + localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES; + + localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2; + + logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit] + genvar j; + + // █████╗ ██╗ ██╗██╗██╗ ██╗ ██╗ ██╗████████╗███████╗ + // ██╔══██╗╚██╗██╔╝██║██║ ██║ ██║ ██║╚══██╔══╝██╔════╝ + // ███████║ ╚███╔╝ ██║███████║█████╗██║ ██║ ██║ █████╗ + // ██╔══██║ ██╔██╗ ██║╚════██║╚════╝██║ ██║ ██║ ██╔══╝ + // ██║ ██║██╔╝ ██╗██║ ██║ ███████╗██║ ██║ ███████╗ + // ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ ╚══════╝ + // + logic [AXI_ADDR_WIDTH-1:0] awaddr_reg; + logic awaddr_done_rise; + logic awaddr_done_reg; + logic awaddr_done_reg_dly; + + logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg; + logic [AXI_DATA_WIDTH/8-1:0] wstrb_reg; + logic wdata_done_rise; + logic wdata_done_reg; + logic wdata_done_reg_dly; + + logic wresp_done_reg; + logic wresp_running_reg; + + logic [AXI_ADDR_WIDTH-1:0] araddr_reg; + logic araddr_done_reg; + + logic [AXI_DATA_WIDTH-1:0] rdata_reg; + logic rresp_done_reg; + logic rresp_running_reg; + + logic awready; + logic wready; + logic bvalid; + + logic arready; + logic rvalid; + + logic wren; + logic wren_l1; + + assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg ); + assign wdata_done_rise = wdata_done_reg & ~wdata_done_reg_dly; + 
assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly; + + // reg_dly + always @(posedge Clk_CI or negedge Rst_RBI) + begin + if (!Rst_RBI) + begin + wdata_done_reg_dly <= 1'b0; + awaddr_done_reg_dly <= 1'b0; + end + else + begin + wdata_done_reg_dly <= wdata_done_reg; + awaddr_done_reg_dly <= awaddr_done_reg; + end + end + + // AW Channel + always @(posedge Clk_CI or negedge Rst_RBI) + begin + if (!Rst_RBI) + begin + awaddr_done_reg <= 1'b0; + awaddr_reg <= '0; + awready <= 1'b1; + end + else + begin + if (awready && s_axi_awvalid) + begin + awready <= 1'b0; + awaddr_done_reg <= 1'b1; + awaddr_reg <= s_axi_awaddr; + end + else if (awaddr_done_reg && wresp_done_reg) + begin + awready <= 1'b1; + awaddr_done_reg <= 1'b0; + end + end + end + + // W Channel + always @(posedge Clk_CI or negedge Rst_RBI) + begin + if (!Rst_RBI) + begin + wdata_done_reg <= 1'b0; + wready <= 1'b1; + wdata_reg <= '0; + wstrb_reg <= '0; + end + else + begin + if (wready && s_axi_wvalid) + begin + wready <= 1'b0; + wdata_done_reg <= 1'b1; + wdata_reg <= s_axi_wdata; + wstrb_reg <= s_axi_wstrb; + end + else if (wdata_done_reg && wresp_done_reg) + begin + wready <= 1'b1; + wdata_done_reg <= 1'b0; + end + end + end + + // B Channel + always @(posedge Clk_CI or negedge Rst_RBI) + begin + if (!Rst_RBI) + begin + bvalid <= 1'b0; + wresp_done_reg <= 1'b0; + wresp_running_reg <= 1'b0; + end + else + begin + if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg) + begin + if (!wresp_running_reg) + begin + bvalid <= 1'b1; + wresp_running_reg <= 1'b1; + end + else if (s_axi_bready) + begin + bvalid <= 1'b0; + wresp_done_reg <= 1'b1; + wresp_running_reg <= 1'b0; + end + end + else + begin + bvalid <= 1'b0; + wresp_done_reg <= 1'b0; + wresp_running_reg <= 1'b0; + end + end + end + + // AR Channel + always @(posedge Clk_CI or negedge Rst_RBI) + begin + if (!Rst_RBI) + begin + araddr_done_reg <= 1'b0; + arready <= 1'b1; + araddr_reg <= '0; + end + else + begin + if (arready && s_axi_arvalid) + 
begin + arready <= 1'b0; + araddr_done_reg <= 1'b1; + araddr_reg <= s_axi_araddr; + end + else if (araddr_done_reg && rresp_done_reg) + begin + arready <= 1'b1; + araddr_done_reg <= 1'b0; + end + end + end + + // R Channel + always @(posedge Clk_CI or negedge Rst_RBI) + begin + if (!Rst_RBI) + begin + rresp_done_reg <= 1'b0; + rvalid <= 1'b0; + rresp_running_reg <= 1'b0; + end + else + begin + if (araddr_done_reg && !rresp_done_reg) + begin + if (!rresp_running_reg) + begin + rvalid <= 1'b1; + rresp_running_reg <= 1'b1; + end + else if (s_axi_rready) + begin + rvalid <= 1'b0; + rresp_done_reg <= 1'b1; + rresp_running_reg <= 1'b0; + end + end + else + begin + rvalid <= 1'b0; + rresp_done_reg <= 1'b0; + rresp_running_reg <= 1'b0; + end + end + end + + // ██╗ ██╗ ██████╗███████╗ ██████╗ ██████╗ ███████╗ ██████╗ + // ██║ ███║ ██╔════╝██╔════╝██╔════╝ ██╔══██╗██╔════╝██╔════╝ + // ██║ ╚██║ ██║ █████╗ ██║ ███╗ ██████╔╝█████╗ ██║ ███╗ + // ██║ ██║ ██║ ██╔══╝ ██║ ██║ ██╔══██╗██╔══╝ ██║ ██║ + // ███████╗██║ ╚██████╗██║ ╚██████╔╝ ██║ ██║███████╗╚██████╔╝ + // ╚══════╝╚═╝ ╚═════╝╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ ╚═════╝ + // + assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE); + + always @( posedge Clk_CI or negedge Rst_RBI ) + begin + var integer idx_reg, idx_byte; + if ( Rst_RBI == 1'b0 ) + begin + for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ ) + L1Cfg_DP[idx_reg] <= '0; + end + else if ( wren_l1 ) + begin + if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin // VIRT_ADDR + for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin + if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin + if ( wstrb_reg[idx_byte] ) begin + L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte]; + end + end + else begin // Let synthesizer optimize away unused registers. 
+ L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0; + end + end + end + else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin // PHYS_ADDR + for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin + if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin + if ( wstrb_reg[idx_byte] ) begin + L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte]; + end + end + else begin // Let synthesizer optimize away unused registers. + L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0; + end + end + end + else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 ) // FLAGS + for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin + if ( (idx_byte < 1) ) begin + if ( wstrb_reg[idx_byte] ) begin + L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} }; + end + end + else begin // Let synthesizer optimize away unused registers. + L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0; + end + end + end + end + end // always @ ( posedge Clk_CI or negedge Rst_RBI ) + + generate + // Mask unused bits -> Synthesizer should optimize away unused registers + for( j=0; j= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR); + assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000); + assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000); + end else begin + assign l2_addr_is_in_va_rams[j] = 1'b0; + assign upper_word_is_written[j] = 1'b0; + assign lower_word_is_written[j] = 1'b0; + end + + always @( posedge Clk_CI or negedge Rst_RBI ) begin + var integer idx_byte, off_byte; + if ( Rst_RBI == 1'b0 ) + begin + wren_l2[j] <= 1'b0; + wdata_l2[j] <= '0; + end + else if (wren) + begin + if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) ) + wren_l2[j] <= 1'b1; + if (AXI_DATA_WIDTH == 32) begin + for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) + wdata_l2[j][idx_byte*8 +: 
8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}}; + end + else if (AXI_DATA_WIDTH == 64) begin + if (lower_word_is_written[j] == 1'b1) + off_byte = 0; + else + off_byte = 4; + // always put the payload in the lower word and set upper word to 0 + for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ ) + wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}}; + wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0; + end + // pragma translate_off + else + $fatal(1, "Unsupported AXI_DATA_WIDTH!"); + // pragma translate_on + end + else + wren_l2[j] <= '0; + end // always @ ( posedge Clk_CI or negedge Rst_RBI ) + + // Properly align the 32-bit word address when writing from 64-bit interface: + // Depending on the system, the incoming address is (non-)aligned to the 64-bit + // word when writing the upper 32-bit word. + always_comb begin + waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4; + if (wren_l2[j]) begin + if (AXI_DATA_WIDTH == 64) begin + if (upper_word_is_written[j] == 1'b1) begin + // address must be non-aligned + waddr_l2[j][0] = 1'b1; + end + end + // pragma translate_off + else if (AXI_DATA_WIDTH != 32) begin + $fatal(1, "Unsupported AXI_DATA_WIDTH!"); + end + // pragma translate_on + end + end + + // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data + // systems. 
+ // pragma translate_off + always_ff @ (posedge Clk_CI) begin + if (AXI_DATA_WIDTH == 64) begin + if (l2_addr_is_in_va_rams[j]) begin + if (upper_word_is_written[j]) begin + assert (!lower_word_is_written[j]) + else $error("Unsupported write across two 32-bit words to VA RAMs!"); + end + else if (lower_word_is_written[j]) begin + assert (!upper_word_is_written[j]) + else $error("Unsupported write across two 32-bit words to VA RAMs!"); + end + end + end + end + // pragma translate_on + + end // for (j=0; j< N_PORTS; j++) + endgenerate + + // ███╗ ███╗██╗ ██╗ ███████╗██╗███████╗ ██████╗ ███████╗ + // ████╗ ████║██║ ██║ ██╔════╝██║██╔════╝██╔═══██╗██╔════╝ + // ██╔████╔██║███████║ █████╗ ██║█████╗ ██║ ██║███████╗ + // ██║╚██╔╝██║██╔══██║ ██╔══╝ ██║██╔══╝ ██║ ██║╚════██║ + // ██║ ╚═╝ ██║██║ ██║ ██║ ██║██║ ╚██████╔╝███████║ + // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝╚═╝ ╚═════╝ ╚══════╝ + // + logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D; + logic AddrFifoWen_S; + logic AddrFifoRen_S; + logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D; + logic AddrFifoFull_S; + logic AddrFifoEmpty_S; + logic AddrFifoEmpty_SB; + logic AddrFifoFull_SB; + + logic [MISS_META_WIDTH-1:0] MetaFifoDin_D; + logic MetaFifoWen_S; + logic MetaFifoRen_S; + logic [MISS_META_WIDTH-1:0] MetaFifoDout_D; + logic MetaFifoFull_S; + logic MetaFifoEmpty_S; + logic MetaFifoEmpty_SB; + logic MetaFifoFull_SB; + + logic FifosDisabled_S; + logic ConfRegWen_S; + logic [1:0] ConfReg_DN; + logic [1:0] ConfReg_DP; + + logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec; + + assign FifosDisabled_S = ConfReg_DP[0]; + assign L1AllowMultiHit_SO = ConfReg_DP[1]; + + assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB; + assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB; + + assign AddrFifoFull_S = ~AddrFifoFull_SB; + assign MetaFifoFull_S = ~MetaFifoFull_SB; + + assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S); + + generate + for ( j=0; j +# * Conrad Burchert +# * Maheshwara Sharma +# * Andreas Kurth +# * Johannes Weinbuch +# * 
Pirmin Vogel +# */ +# +# //`include "pulp_soc_defines.sv" +# +# ////import CfMath::log2; +# +# module axi_rab_top +# +# // Parameters {{{ +# #( +# parameter N_PORTS = 2, +# parameter N_L2_SETS = 32, +# parameter N_L2_SET_ENTRIES = 32, +# parameter AXI_DATA_WIDTH = 64, +# parameter AXI_S_ADDR_WIDTH = 32, +# parameter AXI_M_ADDR_WIDTH = 40, +# parameter AXI_LITE_DATA_WIDTH = 64, +# parameter AXI_LITE_ADDR_WIDTH = 32, +# parameter AXI_ID_WIDTH = 10, +# parameter AXI_USER_WIDTH = 6, +# parameter MH_FIFO_DEPTH = 16 +# ) +# // }}} +# +# // Ports {{{ +# ( +# +# input logic Clk_CI, // This clock may be gated. +# input logic NonGatedClk_CI, +# input logic Rst_RBI, +# +# // For every slave port there are two master ports. The master +# // port to use can be set using the master_select flag of the protection +# // bits of a slice +# +# // AXI4 Slave {{{ +# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_awid, +# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr, +# input logic [N_PORTS-1:0] s_axi4_awvalid, +# output logic [N_PORTS-1:0] s_axi4_awready, +# input logic [N_PORTS-1:0] [7:0] s_axi4_awlen, +# input logic [N_PORTS-1:0] [2:0] s_axi4_awsize, +# input logic [N_PORTS-1:0] [1:0] s_axi4_awburst, +# input logic [N_PORTS-1:0] s_axi4_awlock, +# input logic [N_PORTS-1:0] [2:0] s_axi4_awprot, +# input logic [N_PORTS-1:0] [3:0] s_axi4_awcache, +# input logic [N_PORTS-1:0] [3:0] s_axi4_awregion, +# input logic [N_PORTS-1:0] [3:0] s_axi4_awqos, +# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_awuser, +# +# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_wdata, +# input logic [N_PORTS-1:0] s_axi4_wvalid, +# output logic [N_PORTS-1:0] s_axi4_wready, +# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb, +# input logic [N_PORTS-1:0] s_axi4_wlast, +# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_wuser, +# +# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_bid, +# output logic [N_PORTS-1:0] [1:0] s_axi4_bresp, +# output logic 
[N_PORTS-1:0] s_axi4_bvalid, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_buser, +# input logic [N_PORTS-1:0] s_axi4_bready, +# +# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_arid, +# input logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr, +# input logic [N_PORTS-1:0] s_axi4_arvalid, +# output logic [N_PORTS-1:0] s_axi4_arready, +# input logic [N_PORTS-1:0] [7:0] s_axi4_arlen, +# input logic [N_PORTS-1:0] [2:0] s_axi4_arsize, +# input logic [N_PORTS-1:0] [1:0] s_axi4_arburst, +# input logic [N_PORTS-1:0] s_axi4_arlock, +# input logic [N_PORTS-1:0] [2:0] s_axi4_arprot, +# input logic [N_PORTS-1:0] [3:0] s_axi4_arcache, +# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_aruser, +# +# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] s_axi4_rid, +# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] s_axi4_rdata, +# output logic [N_PORTS-1:0] [1:0] s_axi4_rresp, +# output logic [N_PORTS-1:0] s_axi4_rvalid, +# input logic [N_PORTS-1:0] s_axi4_rready, +# output logic [N_PORTS-1:0] s_axi4_rlast, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] s_axi4_ruser, +# // }}} +# +# // AXI4 Master 0 {{{ +# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_awid, +# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr, +# output logic [N_PORTS-1:0] m0_axi4_awvalid, +# input logic [N_PORTS-1:0] m0_axi4_awready, +# output logic [N_PORTS-1:0] [7:0] m0_axi4_awlen, +# output logic [N_PORTS-1:0] [2:0] m0_axi4_awsize, +# output logic [N_PORTS-1:0] [1:0] m0_axi4_awburst, +# output logic [N_PORTS-1:0] m0_axi4_awlock, +# output logic [N_PORTS-1:0] [2:0] m0_axi4_awprot, +# output logic [N_PORTS-1:0] [3:0] m0_axi4_awcache, +# output logic [N_PORTS-1:0] [3:0] m0_axi4_awregion, +# output logic [N_PORTS-1:0] [3:0] m0_axi4_awqos, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_awuser, +# +# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_wdata, +# output logic [N_PORTS-1:0] m0_axi4_wvalid, +# input logic [N_PORTS-1:0] m0_axi4_wready, 
+# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb, +# output logic [N_PORTS-1:0] m0_axi4_wlast, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_wuser, +# +# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_bid, +# input logic [N_PORTS-1:0] [1:0] m0_axi4_bresp, +# input logic [N_PORTS-1:0] m0_axi4_bvalid, +# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_buser, +# output logic [N_PORTS-1:0] m0_axi4_bready, +# +# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_arid, +# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr, +# output logic [N_PORTS-1:0] m0_axi4_arvalid, +# input logic [N_PORTS-1:0] m0_axi4_arready, +# output logic [N_PORTS-1:0] [7:0] m0_axi4_arlen, +# output logic [N_PORTS-1:0] [2:0] m0_axi4_arsize, +# output logic [N_PORTS-1:0] [1:0] m0_axi4_arburst, +# output logic [N_PORTS-1:0] m0_axi4_arlock, +# output logic [N_PORTS-1:0] [2:0] m0_axi4_arprot, +# output logic [N_PORTS-1:0] [3:0] m0_axi4_arcache, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_aruser, +# +# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m0_axi4_rid, +# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m0_axi4_rdata, +# input logic [N_PORTS-1:0] [1:0] m0_axi4_rresp, +# input logic [N_PORTS-1:0] m0_axi4_rvalid, +# output logic [N_PORTS-1:0] m0_axi4_rready, +# input logic [N_PORTS-1:0] m0_axi4_rlast, +# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m0_axi4_ruser, +# // }}} +# +# // AXI4 Master 1 {{{ +# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_awid, +# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr, +# output logic [N_PORTS-1:0] m1_axi4_awvalid, +# input logic [N_PORTS-1:0] m1_axi4_awready, +# output logic [N_PORTS-1:0] [7:0] m1_axi4_awlen, +# output logic [N_PORTS-1:0] [2:0] m1_axi4_awsize, +# output logic [N_PORTS-1:0] [1:0] m1_axi4_awburst, +# output logic [N_PORTS-1:0] m1_axi4_awlock, +# output logic [N_PORTS-1:0] [2:0] m1_axi4_awprot, +# output logic [N_PORTS-1:0] [3:0] m1_axi4_awcache, 
+# output logic [N_PORTS-1:0] [3:0] m1_axi4_awregion, +# output logic [N_PORTS-1:0] [3:0] m1_axi4_awqos, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_awuser, +# +# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_wdata, +# output logic [N_PORTS-1:0] m1_axi4_wvalid, +# input logic [N_PORTS-1:0] m1_axi4_wready, +# output logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb, +# output logic [N_PORTS-1:0] m1_axi4_wlast, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_wuser, +# +# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_bid, +# input logic [N_PORTS-1:0] [1:0] m1_axi4_bresp, +# input logic [N_PORTS-1:0] m1_axi4_bvalid, +# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_buser, +# output logic [N_PORTS-1:0] m1_axi4_bready, +# +# output logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_arid, +# output logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr, +# output logic [N_PORTS-1:0] m1_axi4_arvalid, +# input logic [N_PORTS-1:0] m1_axi4_arready, +# output logic [N_PORTS-1:0] [7:0] m1_axi4_arlen, +# output logic [N_PORTS-1:0] [2:0] m1_axi4_arsize, +# output logic [N_PORTS-1:0] [1:0] m1_axi4_arburst, +# output logic [N_PORTS-1:0] m1_axi4_arlock, +# output logic [N_PORTS-1:0] [2:0] m1_axi4_arprot, +# output logic [N_PORTS-1:0] [3:0] m1_axi4_arcache, +# output logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_aruser, +# +# input logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] m1_axi4_rid, +# input logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] m1_axi4_rdata, +# input logic [N_PORTS-1:0] [1:0] m1_axi4_rresp, +# input logic [N_PORTS-1:0] m1_axi4_rvalid, +# output logic [N_PORTS-1:0] m1_axi4_rready, +# input logic [N_PORTS-1:0] m1_axi4_rlast, +# input logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] m1_axi4_ruser, +# // }}} +# +# // AXI4-Lite Slave (Configuration Interface) {{{ +# // AXI4-Lite port to set up the RAB slices +# // use this to program the configuration registers +# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr, +# input
logic s_axi4lite_awvalid, +# output logic s_axi4lite_awready, +# +# input logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata, +# input logic s_axi4lite_wvalid, +# output logic s_axi4lite_wready, +# input logic [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb, +# +# output logic [1:0] s_axi4lite_bresp, +# output logic s_axi4lite_bvalid, +# input logic s_axi4lite_bready, +# +# input logic [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr, +# input logic s_axi4lite_arvalid, +# output logic s_axi4lite_arready, +# +# output logic [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata, +# output logic [1:0] s_axi4lite_rresp, +# output logic s_axi4lite_rvalid, +# input logic s_axi4lite_rready, +# // }}} +# +# // BRAMs {{{ +# //`ifdef RAB_AX_LOG_EN +# // BramPort.Slave ArBram_PS, +# // BramPort.Slave AwBram_PS, +# //`endif +# // }}} +# +# // Logger Control {{{ +# //`ifdef RAB_AX_LOG_EN +# // input logic LogEn_SI, +# // input logic ArLogClr_SI, +# // input logic AwLogClr_SI, +# // output logic ArLogRdy_SO, +# // output logic AwLogRdy_SO, +# //`endif +# // }}} +# +# // Interrupt Outputs {{{ +# // Interrupt lines to handle misses, collisions of slices/multiple hits, +# // protection faults and overflow of the miss handling fifo +# //`ifdef RAB_AX_LOG_EN +# // output logic int_ar_log_full, +# // output logic int_aw_log_full, +# //`endif +# output logic [N_PORTS-1:0] int_miss, +# output logic [N_PORTS-1:0] int_multi, +# output logic [N_PORTS-1:0] int_prot, +# output logic int_mhf_full +# // }}} +# +# ); +# +"""#docstring_begin + + // }}} + + // Signals {{{ + // ███████╗██╗ ██████╗ ███╗ ██╗ █████╗ ██╗ ███████╗ + // ██╔════╝██║██╔════╝ ████╗ ██║██╔══██╗██║ ██╔════╝ + // ███████╗██║██║ ███╗██╔██╗ ██║███████║██║ ███████╗ + // ╚════██║██║██║ ██║██║╚██╗██║██╔══██║██║ ╚════██║ + // ███████║██║╚██████╔╝██║ ╚████║██║ ██║███████╗███████║ + // ╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚═╝ ╚═╝╚══════╝╚══════╝ + // + + // Internal AXI4 lines, these connect buffers on the slave side to the rab core and + // multiplexers which 
switch between the two master outputs + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_awid; + logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_awaddr; + logic [N_PORTS-1:0] int_awvalid; + logic [N_PORTS-1:0] int_awready; + logic [N_PORTS-1:0] [7:0] int_awlen; + logic [N_PORTS-1:0] [2:0] int_awsize; + logic [N_PORTS-1:0] [1:0] int_awburst; + logic [N_PORTS-1:0] int_awlock; + logic [N_PORTS-1:0] [2:0] int_awprot; + logic [N_PORTS-1:0] [3:0] int_awcache; + logic [N_PORTS-1:0] [3:0] int_awregion; + logic [N_PORTS-1:0] [3:0] int_awqos; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_awuser; + + logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_wdata; + logic [N_PORTS-1:0] int_wvalid; + logic [N_PORTS-1:0] int_wready; + logic [N_PORTS-1:0] [AXI_DATA_WIDTH/8-1:0] int_wstrb; + logic [N_PORTS-1:0] int_wlast; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_wuser; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_bid; + logic [N_PORTS-1:0] [1:0] int_bresp; + logic [N_PORTS-1:0] int_bvalid; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_buser; + logic [N_PORTS-1:0] int_bready; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_arid; + logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] int_araddr; + logic [N_PORTS-1:0] int_arvalid; + logic [N_PORTS-1:0] int_arready; + logic [N_PORTS-1:0] [7:0] int_arlen; + logic [N_PORTS-1:0] [2:0] int_arsize; + logic [N_PORTS-1:0] [1:0] int_arburst; + logic [N_PORTS-1:0] int_arlock; + logic [N_PORTS-1:0] [2:0] int_arprot; + logic [N_PORTS-1:0] [3:0] int_arcache; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_aruser; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_rid; + logic [N_PORTS-1:0] [1:0] int_rresp; + logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_rdata; + logic [N_PORTS-1:0] int_rlast; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_ruser; + logic [N_PORTS-1:0] int_rvalid; + logic [N_PORTS-1:0] int_rready; + + // rab_core outputs + logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr; + logic [N_PORTS-1:0] int_wtrans_accept; + logic [N_PORTS-1:0] 
int_wtrans_drop; + logic [N_PORTS-1:0] int_wtrans_miss; + logic [N_PORTS-1:0] int_wtrans_sent; + logic [N_PORTS-1:0] int_wtrans_cache_coherent; + logic [N_PORTS-1:0] int_wmaster_select; + + logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr; + logic [N_PORTS-1:0] int_rtrans_accept; + logic [N_PORTS-1:0] int_rtrans_drop; + logic [N_PORTS-1:0] int_rtrans_miss; + logic [N_PORTS-1:0] int_rtrans_sent; + logic [N_PORTS-1:0] int_rtrans_cache_coherent; + logic [N_PORTS-1:0] int_rmaster_select; + + logic [N_PORTS-1:0] w_master_select; + + // Internal master0 AXI4 lines. These connect the first master port to the + // multiplexers + // For channels read address, write address and write data the other lines + // are ignored if valid is not set, therefore we only need to multiplex those + logic [N_PORTS-1:0] int_m0_awvalid; + logic [N_PORTS-1:0] int_m0_awready; + + logic [N_PORTS-1:0] int_m0_wvalid; + logic [N_PORTS-1:0] int_m0_wready; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_bid; + logic [N_PORTS-1:0] [1:0] int_m0_bresp; + logic [N_PORTS-1:0] int_m0_bvalid; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_buser; + logic [N_PORTS-1:0] int_m0_bready; + + logic [N_PORTS-1:0] int_m0_arvalid; + logic [N_PORTS-1:0] int_m0_arready; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m0_rid; + logic [N_PORTS-1:0] [1:0] int_m0_rresp; + logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m0_rdata; + logic [N_PORTS-1:0] int_m0_rlast; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m0_ruser; + logic [N_PORTS-1:0] int_m0_rready; + logic [N_PORTS-1:0] int_m0_rvalid; + + logic [N_PORTS-1:0] l1_m0_ar_accept; + logic [N_PORTS-1:0] l1_m0_ar_drop; + logic [N_PORTS-1:0] l1_m0_ar_save; + logic [N_PORTS-1:0] l1_m0_ar_done; + logic [N_PORTS-1:0] l2_m0_ar_accept; + logic [N_PORTS-1:0] l2_m0_ar_drop; + logic [N_PORTS-1:0] l2_m0_ar_done; + logic [N_PORTS-1:0] l2_m0_ar_sending; + + logic [N_PORTS-1:0] l1_m0_aw_accept; + logic [N_PORTS-1:0] l1_m0_aw_drop; + logic [N_PORTS-1:0] l1_m0_aw_save; + 
logic [N_PORTS-1:0] l1_m0_aw_done; + logic [N_PORTS-1:0] l2_m0_aw_accept; + logic [N_PORTS-1:0] l2_m0_aw_drop; + logic [N_PORTS-1:0] l2_m0_aw_done; + logic [N_PORTS-1:0] l2_m0_aw_sending; + + // Internal master1 AXI4 lines. These connect the second master port to the + // multiplexers + // For channels read address, write address and write data the other lines + // are ignored if valid is not set, therefore we only need to multiplex those + logic [N_PORTS-1:0] int_m1_awvalid; + logic [N_PORTS-1:0] int_m1_awready; + + logic [N_PORTS-1:0] int_m1_wvalid; + logic [N_PORTS-1:0] int_m1_wready; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_bid; + logic [N_PORTS-1:0] [1:0] int_m1_bresp; + logic [N_PORTS-1:0] int_m1_bvalid; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_buser; + logic [N_PORTS-1:0] int_m1_bready; + + logic [N_PORTS-1:0] int_m1_arvalid; + logic [N_PORTS-1:0] int_m1_arready; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] int_m1_rid; + logic [N_PORTS-1:0] [1:0] int_m1_rresp; + logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] int_m1_rdata; + logic [N_PORTS-1:0] int_m1_rlast; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] int_m1_ruser; + logic [N_PORTS-1:0] int_m1_rvalid; + logic [N_PORTS-1:0] int_m1_rready; + + logic [N_PORTS-1:0] l1_m1_ar_accept; + logic [N_PORTS-1:0] l1_m1_ar_drop; + logic [N_PORTS-1:0] l1_m1_ar_save; + logic [N_PORTS-1:0] l1_m1_ar_done; + logic [N_PORTS-1:0] l2_m1_ar_accept; + logic [N_PORTS-1:0] l2_m1_ar_drop; + logic [N_PORTS-1:0] l2_m1_ar_done; + + logic [N_PORTS-1:0] l1_m1_aw_accept; + logic [N_PORTS-1:0] l1_m1_aw_drop; + logic [N_PORTS-1:0] l1_m1_aw_save; + logic [N_PORTS-1:0] l1_m1_aw_done; + logic [N_PORTS-1:0] l2_m1_aw_accept; + logic [N_PORTS-1:0] l2_m1_aw_drop; + logic [N_PORTS-1:0] l2_m1_aw_done; + + // L1 outputs + logic [N_PORTS-1:0] rab_miss; // L1 RAB miss + logic [N_PORTS-1:0] rab_prot; + logic [N_PORTS-1:0] rab_multi; + logic [N_PORTS-1:0] rab_prefetch; + + // + // Signals used to support L2 TLB + // + // L2 RAM configuration signals 
+ logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D; + logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D; + logic [N_PORTS-1:0] L2CfgWE_S; + + // L1 output and drop Buffer + logic [N_PORTS-1:0] L1OutRwType_D, L1DropRwType_DP; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP; + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP; + logic [N_PORTS-1:0] [7:0] L1OutLen_D, L1DropLen_DP; + logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP; + logic [N_PORTS-1:0] L1OutProt_D, L1DropProt_DP; + logic [N_PORTS-1:0] L1OutMulti_D, L1DropMulti_DP; + logic [N_PORTS-1:0] L1DropEn_S; + logic [N_PORTS-1:0] L1DropPrefetch_S; + + logic [N_PORTS-1:0] L1DropValid_SN, L1DropValid_SP; + + // L2 input Buffer + logic [N_PORTS-1:0] L2InRwType_DP; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2InUser_DP; + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2InId_DP; + logic [N_PORTS-1:0] [7:0] L2InLen_DP; + logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP; + logic [N_PORTS-1:0] L2InEn_S; + + // L2 output Buffer + logic [N_PORTS-1:0] L2OutRwType_DP; + logic [N_PORTS-1:0] [AXI_USER_WIDTH-1:0] L2OutUser_DP; + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] L2OutId_DP; + logic [N_PORTS-1:0] [7:0] L2OutLen_DP; + logic [N_PORTS-1:0] [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP; + + logic [N_PORTS-1:0] L2OutHit_SN, L2OutHit_SP; + logic [N_PORTS-1:0] L2OutMiss_SN, L2OutMiss_SP; + logic [N_PORTS-1:0] L2OutProt_SN, L2OutProt_SP; + logic [N_PORTS-1:0] L2OutMulti_SN, L2OutMulti_SP; + logic [N_PORTS-1:0] L2OutCC_SN, L2OutCC_SP; + logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP; + + logic [N_PORTS-1:0] L2OutValid_SN, L2OutValid_SP; + logic [N_PORTS-1:0] L2OutPrefetch_S; + logic [N_PORTS-1:0] L2OutReady_S; + logic [N_PORTS-1:0] L2OutEn_S; + + // L2 outputs + logic [N_PORTS-1:0] L2Busy_S; + logic [N_PORTS-1:0] L2OutValid_S; + + logic [N_PORTS-1:0] L2Miss_S; + + // Signals for interfacing the AXI modules + logic [N_PORTS-1:0] l1_ar_accept; + 
logic [N_PORTS-1:0] l1_aw_accept; + logic [N_PORTS-1:0] l1_w_accept; + logic [N_PORTS-1:0] l1_xw_accept; + + logic [N_PORTS-1:0] l1_ar_drop; + logic [N_PORTS-1:0] l1_aw_drop; + logic [N_PORTS-1:0] l1_w_drop; + logic [N_PORTS-1:0] l1_xw_drop; + + logic [N_PORTS-1:0] l1_ar_save; + logic [N_PORTS-1:0] l1_aw_save; + logic [N_PORTS-1:0] l1_w_save; + logic [N_PORTS-1:0] l1_xw_save; + + logic [N_PORTS-1:0] l1_ar_done; + logic [N_PORTS-1:0] l1_r_done; + logic [N_PORTS-1:0] l1_r_drop; + logic [N_PORTS-1:0] lx_r_drop; + logic [N_PORTS-1:0] lx_r_done; + + logic [N_PORTS-1:0] l1_aw_done; + logic [N_PORTS-1:0] l1_w_done; + logic [N_PORTS-1:0] l1_xw_done; + logic [N_PORTS-1:0] l1_aw_done_SP; + logic [N_PORTS-1:0] l1_w_done_SP; + + logic [N_PORTS-1:0] l2_ar_accept; + logic [N_PORTS-1:0] l2_aw_accept; + logic [N_PORTS-1:0] l2_w_accept; + logic [N_PORTS-1:0] l2_xw_accept; + + logic [N_PORTS-1:0] l2_ar_drop; + logic [N_PORTS-1:0] l2_r_drop; + logic [N_PORTS-1:0] l2_xr_drop; + logic [N_PORTS-1:0] l2_aw_drop; + logic [N_PORTS-1:0] l2_w_drop; + logic [N_PORTS-1:0] l2_xw_drop; + + logic [N_PORTS-1:0] l2_aw_done; + logic [N_PORTS-1:0] l2_w_done; + logic [N_PORTS-1:0] l2_xw_done; + logic [N_PORTS-1:0] l2_aw_done_SP; + logic [N_PORTS-1:0] l2_w_done_SP; + + logic [N_PORTS-1:0] l2_ar_done; + logic [N_PORTS-1:0] l2_r_done; + logic [N_PORTS-1:0] l2_xr_done; + logic [N_PORTS-1:0] l2_ar_done_SP; + logic [N_PORTS-1:0] l2_r_done_SP; + + logic [N_PORTS-1:0] l1_mx_aw_done; + logic [N_PORTS-1:0] l1_mx_ar_done; + logic [N_PORTS-1:0] l1_m0_aw_done_SP; + logic [N_PORTS-1:0] l1_m0_ar_done_SP; + logic [N_PORTS-1:0] l1_m1_aw_done_SP; + logic [N_PORTS-1:0] l1_m1_ar_done_SP; + + logic [N_PORTS-1:0] l2_mx_aw_done; + logic [N_PORTS-1:0] l2_mx_ar_done; + logic [N_PORTS-1:0] l2_m0_aw_done_SP; + logic [N_PORTS-1:0] l2_m0_ar_done_SP; + logic [N_PORTS-1:0] l2_m1_aw_done_SP; + logic [N_PORTS-1:0] l2_m1_ar_done_SP; + + logic [N_PORTS-1:0] [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop; + logic [N_PORTS-1:0] 
[7:0] l1_len_drop, lx_len_drop; + logic [N_PORTS-1:0] l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop; + logic [N_PORTS-1:0] l1_hit_drop, lx_hit_drop, b_hit_drop; + + logic [N_PORTS-1:0] b_drop; + logic [N_PORTS-1:0] b_done; + + logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr; + logic [N_PORTS-1:0] [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr; + + logic [N_PORTS-1:0] l2_cache_coherent; + logic [N_PORTS-1:0] l2_master_select; + + logic [N_PORTS-1:0] aw_in_stall; + logic [N_PORTS-1:0] aw_out_stall; + + genvar i; + + // RRESP FSM + typedef enum logic {IDLE, BUSY} r_resp_mux_ctrl_state_t; + r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP; + logic [N_PORTS-1:0] RRespSel_SN, RRespSel_SP; + logic [N_PORTS-1:0] RRespBurst_S; + logic [N_PORTS-1:0] RRespSelIm_S; + + // }}} + + // Local parameters {{{ + + // Enable L2 for select ports + localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY; + + // L2TLB parameters + localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13; + + // }}} + + // Derive `master_select` from cache coherency flag. 
{{{ + `ifdef EN_ACP + assign int_wmaster_select = int_wtrans_cache_coherent; + assign int_rmaster_select = int_rtrans_cache_coherent; + assign l2_master_select = l2_cache_coherent; + `else + assign int_wmaster_select = '0; + assign int_rmaster_select = '0; + assign l2_master_select = '0; + `endif + // }}} + + // Buf and Send {{{ + // ██████╗ ██╗ ██╗███████╗ ██╗ ███████╗███████╗███╗ ██╗██████╗ + // ██╔══██╗██║ ██║██╔════╝ ██║ ██╔════╝██╔════╝████╗ ██║██╔══██╗ + // ██████╔╝██║ ██║█████╗ ████████╗ ███████╗█████╗ ██╔██╗ ██║██║ ██║ + // ██╔══██╗██║ ██║██╔══╝ ██╔═██╔═╝ ╚════██║██╔══╝ ██║╚██╗██║██║ ██║ + // ██████╔╝╚██████╔╝██║ ██████║ ███████║███████╗██║ ╚████║██████╔╝ + // ╚═════╝ ╚═════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝╚═╝ ╚═══╝╚═════╝ + // + logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst; + logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst; + + generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND + + // Write Address channel (aw) {{{ + /* + * write address channel (aw) + * + * ██╗ ██╗██████╗ ██╗████████╗███████╗ █████╗ ██████╗ ██████╗ ██████╗ + * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗ + * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ███████║██║ ██║██║ ██║██████╔╝ + * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██║██║ ██║██║ ██║██╔══██╗ + * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║██████╔╝██████╔╝██║ ██║ + * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝ + * + */ + + axi4_aw_buffer + #( + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_aw_buffer + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_awid ( s_axi4_awid[i] ), + .s_axi4_awaddr ( s_axi4_awaddr[i] ), + .s_axi4_awvalid ( s_axi4_awvalid[i] ), + .s_axi4_awready ( s_axi4_awready[i] ), + .s_axi4_awlen ( s_axi4_awlen[i] ), + .s_axi4_awsize ( s_axi4_awsize[i] ), + .s_axi4_awburst ( s_axi4_awburst[i] ), + .s_axi4_awlock ( s_axi4_awlock[i] ), + .s_axi4_awprot ( s_axi4_awprot[i] ), + .s_axi4_awcache ( s_axi4_awcache[i] ), + 
.s_axi4_awregion ( s_axi4_awregion[i] ), + .s_axi4_awqos ( s_axi4_awqos[i] ), + .s_axi4_awuser ( s_axi4_awuser[i] ), + .m_axi4_awid ( int_awid[i] ), + .m_axi4_awaddr ( int_awaddr[i] ), + .m_axi4_awvalid ( int_awvalid[i] ), + .m_axi4_awready ( int_awready[i] ), + .m_axi4_awlen ( int_awlen[i] ), + .m_axi4_awsize ( int_awsize[i] ), + .m_axi4_awburst ( int_awburst[i] ), + .m_axi4_awlock ( int_awlock[i] ), + .m_axi4_awprot ( int_awprot[i] ), + .m_axi4_awcache ( int_awcache[i] ), + .m_axi4_awregion ( int_awregion[i] ), + .m_axi4_awqos ( int_awqos[i] ), + .m_axi4_awuser ( int_awuser[i] ) + ); + + axi4_aw_sender + #( + .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ), + .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) + ) + u_aw_sender_m0 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .l1_done_o ( l1_m0_aw_done[i] ), + .l1_accept_i ( l1_m0_aw_accept[i] ), + .l1_drop_i ( l1_m0_aw_drop[i] ), + .l1_save_i ( l1_m0_aw_save[i] ), + .l2_done_o ( l2_m0_aw_done[i] ), + .l2_accept_i ( l2_m0_aw_accept[i] ), + .l2_drop_i ( l2_m0_aw_drop[i] ), + .l2_sending_o ( l2_m0_aw_sending[i] ), + .l1_awaddr_i ( int_wtrans_addr[i] ), + .l2_awaddr_i ( l2_aw_addr[i] ), + .s_axi4_awid ( int_awid[i] ), + .s_axi4_awvalid ( int_m0_awvalid[i] ), + .s_axi4_awready ( int_m0_awready[i] ), + .s_axi4_awlen ( int_awlen[i] ), + .s_axi4_awsize ( int_awsize[i] ), + .s_axi4_awburst ( int_awburst[i] ), + .s_axi4_awlock ( int_awlock[i] ), + .s_axi4_awprot ( int_awprot[i] ), + .s_axi4_awcache ( int_awcache[i] ), + .s_axi4_awregion ( int_awregion[i] ), + .s_axi4_awqos ( int_awqos[i] ), + .s_axi4_awuser ( int_awuser[i] ), + .m_axi4_awid ( m0_axi4_awid[i] ), + .m_axi4_awaddr ( m0_axi4_awaddr[i] ), + .m_axi4_awvalid ( m0_axi4_awvalid[i] ), + .m_axi4_awready ( m0_axi4_awready[i] ), + .m_axi4_awlen ( m0_axi4_awlen[i] ), + .m_axi4_awsize ( m0_axi4_awsize[i] ), + .m_axi4_awburst ( m0_axi4_awburst[i] ), + .m_axi4_awlock ( m0_axi4_awlock[i] ), + .m_axi4_awprot ( 
m0_axi4_awprot[i] ), + .m_axi4_awcache ( ), + .m_axi4_awregion ( m0_axi4_awregion[i] ), + .m_axi4_awqos ( m0_axi4_awqos[i] ), + .m_axi4_awuser ( m0_axi4_awuser[i] ) + ); + + // The AXCACHE signals are set according to burstiness and cache coherence or statically + // when not connected to ACP on Zynq (implemented below). + assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00); + `ifndef EN_ACP + always_comb begin + if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin + if (m0_write_is_burst[i]) begin + m0_axi4_awcache[i] = 4'b0111; + end else begin + m0_axi4_awcache[i] = 4'b1111; + end + end else begin + m0_axi4_awcache[i] = 4'b0011; + end + end + `else + assign m0_axi4_awcache[i] = 4'b0011; + `endif + + axi4_aw_sender + #( + .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ), + .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) + ) + u_aw_sender_m1 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .l1_accept_i ( l1_m1_aw_accept[i] ), + .l1_drop_i ( l1_m1_aw_drop[i] ), + .l1_save_i ( l1_m1_aw_save[i] ), + .l1_done_o ( l1_m1_aw_done[i] ), + .l2_accept_i ( l2_m1_aw_accept[i] ), + .l2_drop_i ( l2_m1_aw_drop[i] ), + .l2_done_o ( l2_m1_aw_done[i] ), + .l2_sending_o ( ), // just helps to set axcache + .l1_awaddr_i ( int_wtrans_addr[i] ), + .l2_awaddr_i ( l2_aw_addr[i] ), + .s_axi4_awid ( int_awid[i] ), + .s_axi4_awvalid ( int_m1_awvalid[i] ), + .s_axi4_awready ( int_m1_awready[i] ), + .s_axi4_awlen ( int_awlen[i] ), + .s_axi4_awsize ( int_awsize[i] ), + .s_axi4_awburst ( int_awburst[i] ), + .s_axi4_awlock ( int_awlock[i] ), + .s_axi4_awprot ( int_awprot[i] ), + .s_axi4_awcache ( int_awcache[i] ), + .s_axi4_awregion ( int_awregion[i] ), + .s_axi4_awqos ( int_awqos[i] ), + .s_axi4_awuser ( int_awuser[i] ), + .m_axi4_awid ( m1_axi4_awid[i] ), + .m_axi4_awaddr ( m1_axi4_awaddr[i] ), + .m_axi4_awvalid ( m1_axi4_awvalid[i] ), + .m_axi4_awready ( 
m1_axi4_awready[i] ), + .m_axi4_awlen ( m1_axi4_awlen[i] ), + .m_axi4_awsize ( m1_axi4_awsize[i] ), + .m_axi4_awburst ( m1_axi4_awburst[i] ), + .m_axi4_awlock ( m1_axi4_awlock[i] ), + .m_axi4_awprot ( m1_axi4_awprot[i] ), + .m_axi4_awcache ( ), + .m_axi4_awregion ( m1_axi4_awregion[i] ), + .m_axi4_awqos ( m1_axi4_awqos[i] ), + .m_axi4_awuser ( m1_axi4_awuser[i] ) + ); + + // The AXCACHE signals are set according to burstiness and cache coherence or statically + // when not connected to ACP on Zynq (implemented below). + assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00); + `ifdef EN_ACP + always_comb begin + if (m1_write_is_burst[i]) begin + m1_axi4_awcache[i] = 4'b1011; + end else begin + m1_axi4_awcache[i] = 4'b1111; + end + end + `else + assign m1_axi4_awcache[i] = 4'b0011; + `endif + + // }}} + + // Write Data channel (w) {{{ + /* + * write data channel (w) + * + * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ █████╗ ████████╗ █████╗ + * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗ + * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██║ ██║███████║ ██║ ███████║ + * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██║ ██║██╔══██║ ██║ ██╔══██║ + * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██████╔╝██║ ██║ ██║ ██║ ██║ + * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ + * + */ + axi4_w_buffer + #( + .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ), + .ENABLE_L2TLB ( ENABLE_L2TLB[i] ), + .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH ) + ) + u_w_buffer + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + + // L1 interface + .l1_done_o ( l1_w_done[i] ), + .l1_accept_i ( l1_w_accept[i] ), + .l1_save_i ( l1_w_save[i] ), + .l1_drop_i ( l1_w_drop[i] ), + .l1_master_i ( int_wmaster_select[i] ), + .l1_id_i ( l1_id_drop[i] ), + .l1_len_i ( l1_len_drop[i] ), + .l1_prefetch_i ( l1_prefetch_drop[i] ), + .l1_hit_i ( l1_hit_drop[i] ), + + // L2 interface + .l2_done_o ( l2_w_done[i] ), 
+ .l2_accept_i ( l2_w_accept[i] ), + .l2_drop_i ( l2_w_drop[i] ), + .l2_master_i ( l2_master_select[i] ), + .l2_id_i ( lx_id_drop[i] ), + .l2_len_i ( lx_len_drop[i] ), + .l2_prefetch_i ( lx_prefetch_drop[i] ), + .l2_hit_i ( lx_hit_drop[i] ), + + // Top-level control outputs + .master_select_o ( w_master_select[i] ), + .input_stall_o ( aw_in_stall[i] ), // stall L1 AW input if request buffers full + .output_stall_o ( aw_out_stall[i] ), // stall L1 AW hit forwarding if bypass not possible + + // B sender interface + .b_drop_o ( b_drop[i] ), + .b_done_i ( b_done[i] ), + .id_o ( b_id_drop[i] ), + .prefetch_o ( b_prefetch_drop[i] ), + .hit_o ( b_hit_drop[i] ), + + // AXI W channel interfaces + .s_axi4_wdata ( s_axi4_wdata[i] ), + .s_axi4_wvalid ( s_axi4_wvalid[i] ), + .s_axi4_wready ( s_axi4_wready[i] ), + .s_axi4_wstrb ( s_axi4_wstrb[i] ), + .s_axi4_wlast ( s_axi4_wlast[i] ), + .s_axi4_wuser ( s_axi4_wuser[i] ), + .m_axi4_wdata ( int_wdata[i] ), + .m_axi4_wvalid ( int_wvalid[i] ), + .m_axi4_wready ( int_wready[i] ), + .m_axi4_wstrb ( int_wstrb[i] ), + .m_axi4_wlast ( int_wlast[i] ), + .m_axi4_wuser ( int_wuser[i] ) + ); + + axi4_w_sender + #( + .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_w_sender_m0 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_wdata ( int_wdata[i] ), + .s_axi4_wvalid ( int_m0_wvalid[i] ), + .s_axi4_wready ( int_m0_wready[i] ), + .s_axi4_wstrb ( int_wstrb[i] ), + .s_axi4_wlast ( int_wlast[i] ), + .s_axi4_wuser ( int_wuser[i] ), + .m_axi4_wdata ( m0_axi4_wdata[i] ), + .m_axi4_wvalid ( m0_axi4_wvalid[i] ), + .m_axi4_wready ( m0_axi4_wready[i] ), + .m_axi4_wstrb ( m0_axi4_wstrb[i] ), + .m_axi4_wlast ( m0_axi4_wlast[i] ), + .m_axi4_wuser ( m0_axi4_wuser[i] ) + ); + + axi4_w_sender + #( + .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + + ) + u_w_sender_m1 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_wdata ( int_wdata[i] ), + .s_axi4_wvalid ( 
int_m1_wvalid[i] ), + .s_axi4_wready ( int_m1_wready[i] ), + .s_axi4_wstrb ( int_wstrb[i] ), + .s_axi4_wlast ( int_wlast[i] ), + .s_axi4_wuser ( int_wuser[i] ), + .m_axi4_wdata ( m1_axi4_wdata[i] ), + .m_axi4_wvalid ( m1_axi4_wvalid[i] ), + .m_axi4_wready ( m1_axi4_wready[i] ), + .m_axi4_wstrb ( m1_axi4_wstrb[i] ), + .m_axi4_wlast ( m1_axi4_wlast[i] ), + .m_axi4_wuser ( m1_axi4_wuser[i] ) + ); + + /* + * Multiplexer to switch between the two output master ports on the write data (w) channel + */ + always_comb begin + /* Only one output can be selected at any time */ + if (w_master_select[i] == 1'b0) begin + int_m0_wvalid[i] = int_wvalid[i]; + int_m1_wvalid[i] = 1'b0; + int_wready[i] = int_m0_wready[i]; + end else begin + int_m0_wvalid[i] = 1'b0; + int_m1_wvalid[i] = int_wvalid[i]; + int_wready[i] = int_m1_wready[i]; + end + end + + // }}} + + // Write Response channel (b) {{{ + /* + * write response channel (b) + * + * ██╗ ██╗██████╗ ██╗████████╗███████╗ ██████╗ ███████╗███████╗██████╗ + * ██║ ██║██╔══██╗██║╚══██╔══╝██╔════╝ ██╔══██╗██╔════╝██╔════╝██╔══██╗ + * ██║ █╗ ██║██████╔╝██║ ██║ █████╗ ██████╔╝█████╗ ███████╗██████╔╝ + * ██║███╗██║██╔══██╗██║ ██║ ██╔══╝ ██╔══██╗██╔══╝ ╚════██║██╔═══╝ + * ╚███╔███╔╝██║ ██║██║ ██║ ███████╗ ██║ ██║███████╗███████║██║ + * ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ + * + */ + axi4_b_buffer + #( + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_b_buffer_m0 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_bid ( int_m0_bid[i] ), + .s_axi4_bresp ( int_m0_bresp[i] ), + .s_axi4_bvalid ( int_m0_bvalid[i] ), + .s_axi4_buser ( int_m0_buser[i] ), + .s_axi4_bready ( int_m0_bready[i] ), + .m_axi4_bid ( m0_axi4_bid[i] ), + .m_axi4_bresp ( m0_axi4_bresp[i] ), + .m_axi4_bvalid ( m0_axi4_bvalid[i] ), + .m_axi4_buser ( m0_axi4_buser[i] ), + .m_axi4_bready ( m0_axi4_bready[i] ) + ); + + axi4_b_buffer + #( + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + 
u_b_buffer_m1 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_bid ( int_m1_bid[i] ), + .s_axi4_bresp ( int_m1_bresp[i] ), + .s_axi4_bvalid ( int_m1_bvalid[i] ), + .s_axi4_buser ( int_m1_buser[i] ), + .s_axi4_bready ( int_m1_bready[i] ), + .m_axi4_bid ( m1_axi4_bid[i] ), + .m_axi4_bresp ( m1_axi4_bresp[i] ), + .m_axi4_bvalid ( m1_axi4_bvalid[i] ), + .m_axi4_buser ( m1_axi4_buser[i] ), + .m_axi4_bready ( m1_axi4_bready[i] ) + ); + + axi4_b_sender + #( + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_b_sender + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .drop_i ( b_drop[i] ), + .done_o ( b_done[i] ), + .id_i ( b_id_drop[i] ), + .prefetch_i ( b_prefetch_drop[i] ), + .hit_i ( b_hit_drop[i] ), + .s_axi4_bid ( s_axi4_bid[i] ), + .s_axi4_bresp ( s_axi4_bresp[i] ), + .s_axi4_bvalid ( s_axi4_bvalid[i] ), + .s_axi4_buser ( s_axi4_buser[i] ), + .s_axi4_bready ( s_axi4_bready[i] ), + .m_axi4_bid ( int_bid[i] ), + .m_axi4_bresp ( int_bresp[i] ), + .m_axi4_bvalid ( int_bvalid[i] ), + .m_axi4_buser ( int_buser[i] ), + .m_axi4_bready ( int_bready[i] ) + ); + + /* + * Multiplexer to switch between the two output master ports on the write response (b) channel + */ + always_comb begin + /* Output 1 always gets priority, so if it has something to send connect + it and let output 0 wait using bready = 0 */ + if (int_m1_bvalid[i] == 1'b1) begin + int_m0_bready[i] = 1'b0; + int_m1_bready[i] = int_bready[i]; + + int_bid[i] = int_m1_bid[i]; + int_bresp[i] = int_m1_bresp[i]; + int_buser[i] = int_m1_buser[i]; + int_bvalid[i] = int_m1_bvalid[i]; + end else begin + int_m0_bready[i] = int_bready[i]; + int_m1_bready[i] = 1'b0; + + int_bid[i] = int_m0_bid[i]; + int_bresp[i] = int_m0_bresp[i]; + int_buser[i] = int_m0_buser[i]; + int_bvalid[i] = int_m0_bvalid[i]; + end + end + + // }}} + + // Read Address channel (ar) {{{ + /* + * read address channel (ar) + * + * ██████╗ ███████╗ █████╗ ██████╗ █████╗ ██████╗ ██████╗ ██████╗ + *
██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔══██╗██╔══██╗██╔══██╗ + * ██████╔╝█████╗ ███████║██║ ██║ ███████║██║ ██║██║ ██║██████╔╝ + * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██║██║ ██║██║ ██║██╔══██╗ + * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║██████╔╝██████╔╝██║ ██║ + * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═╝ ╚═╝ + * + */ + axi4_ar_buffer + #( + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_ar_buffer + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_arid ( s_axi4_arid[i] ), + .s_axi4_araddr ( s_axi4_araddr[i] ), + .s_axi4_arvalid ( s_axi4_arvalid[i] ), + .s_axi4_arready ( s_axi4_arready[i] ), + .s_axi4_arlen ( s_axi4_arlen[i] ), + .s_axi4_arsize ( s_axi4_arsize[i] ), + .s_axi4_arburst ( s_axi4_arburst[i] ), + .s_axi4_arlock ( s_axi4_arlock[i] ), + .s_axi4_arprot ( s_axi4_arprot[i] ), + .s_axi4_arcache ( s_axi4_arcache[i] ), + .s_axi4_aruser ( s_axi4_aruser[i] ), + .m_axi4_arid ( int_arid[i] ), + .m_axi4_araddr ( int_araddr[i] ), + .m_axi4_arvalid ( int_arvalid[i] ), + .m_axi4_arready ( int_arready[i] ), + .m_axi4_arlen ( int_arlen[i] ), + .m_axi4_arsize ( int_arsize[i] ), + .m_axi4_arburst ( int_arburst[i] ), + .m_axi4_arlock ( int_arlock[i] ), + .m_axi4_arprot ( int_arprot[i] ), + .m_axi4_arcache ( int_arcache[i] ), + .m_axi4_aruser ( int_aruser[i] ) + ); + + axi4_ar_sender + #( + .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ), + .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) + ) + u_ar_sender_m0 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .l1_done_o ( l1_m0_ar_done[i] ), + .l1_accept_i ( l1_m0_ar_accept[i] ), + .l1_drop_i ( l1_m0_ar_drop[i] ), + .l1_save_i ( l1_m0_ar_save[i] ), + .l2_done_o ( l2_m0_ar_done[i] ), + .l2_accept_i ( l2_m0_ar_accept[i] ), + .l2_drop_i ( l2_m0_ar_drop[i] ), + .l2_sending_o ( l2_m0_ar_sending[i] ), + .l1_araddr_i ( int_rtrans_addr[i] ), + .l2_araddr_i ( l2_ar_addr[i] ), + .s_axi4_arid ( int_arid[i] ), + 
.s_axi4_arvalid ( int_m0_arvalid[i] ), + .s_axi4_arready ( int_m0_arready[i] ), + .s_axi4_arlen ( int_arlen[i] ), + .s_axi4_arsize ( int_arsize[i] ), + .s_axi4_arburst ( int_arburst[i] ), + .s_axi4_arlock ( int_arlock[i] ), + .s_axi4_arprot ( int_arprot[i] ), + .s_axi4_arcache ( int_arcache[i] ), + .s_axi4_aruser ( int_aruser[i] ), + .m_axi4_arid ( m0_axi4_arid[i] ), + .m_axi4_araddr ( m0_axi4_araddr[i] ), + .m_axi4_arvalid ( m0_axi4_arvalid[i] ), + .m_axi4_arready ( m0_axi4_arready[i] ), + .m_axi4_arlen ( m0_axi4_arlen[i] ), + .m_axi4_arsize ( m0_axi4_arsize[i] ), + .m_axi4_arburst ( m0_axi4_arburst[i] ), + .m_axi4_arlock ( m0_axi4_arlock[i] ), + .m_axi4_arprot ( m0_axi4_arprot[i] ), + .m_axi4_arcache ( ), + .m_axi4_aruser ( m0_axi4_aruser[i] ) + ); + + // The AXCACHE signals are set according to burstiness and cache coherence or statically + // when not connected to ACP on Zynq (implemented below). + assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00); + `ifndef EN_ACP + always_comb begin + if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin + if (m0_read_is_burst[i]) begin + m0_axi4_arcache[i] = 4'b1011; + end else begin + m0_axi4_arcache[i] = 4'b1111; + end + end else begin + m0_axi4_arcache[i] = 4'b0011; + end + end + `else + assign m0_axi4_arcache[i] = 4'b0011; + `endif + + axi4_ar_sender + #( + .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ), + .ENABLE_L2TLB ( ENABLE_L2TLB[i] ) + ) + u_ar_sender_m1 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .l1_done_o ( l1_m1_ar_done[i] ), + .l1_accept_i ( l1_m1_ar_accept[i] ), + .l1_drop_i ( l1_m1_ar_drop[i] ), + .l1_save_i ( l1_m1_ar_save[i] ), + .l2_done_o ( l2_m1_ar_done[i] ), + .l2_accept_i ( l2_m1_ar_accept[i] ), + .l2_drop_i ( l2_m1_ar_drop[i] ), + .l2_sending_o ( ), // just helps to set axcache + .l1_araddr_i ( int_rtrans_addr[i] ), + .l2_araddr_i ( l2_ar_addr[i] ), 
+ .s_axi4_arid ( int_arid[i] ), + .s_axi4_arvalid ( int_m1_arvalid[i] ), + .s_axi4_arready ( int_m1_arready[i] ), + .s_axi4_arlen ( int_arlen[i] ), + .s_axi4_arsize ( int_arsize[i] ), + .s_axi4_arburst ( int_arburst[i] ), + .s_axi4_arlock ( int_arlock[i] ), + .s_axi4_arprot ( int_arprot[i] ), + .s_axi4_arcache ( int_arcache[i] ), + .s_axi4_aruser ( int_aruser[i] ), + .m_axi4_arid ( m1_axi4_arid[i] ), + .m_axi4_araddr ( m1_axi4_araddr[i] ), + .m_axi4_arvalid ( m1_axi4_arvalid[i] ), + .m_axi4_arready ( m1_axi4_arready[i] ), + .m_axi4_arlen ( m1_axi4_arlen[i] ), + .m_axi4_arsize ( m1_axi4_arsize[i] ), + .m_axi4_arburst ( m1_axi4_arburst[i] ), + .m_axi4_arlock ( m1_axi4_arlock[i] ), + .m_axi4_arprot ( m1_axi4_arprot[i] ), + .m_axi4_arcache ( ), + .m_axi4_aruser ( m1_axi4_aruser[i] ) + ); + + // The AXCACHE signals are set according to burstiness and cache coherence or statically + // when not connected to ACP on Zynq (implemented below). + assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00); + `ifdef EN_ACP + always_comb begin + if (m1_read_is_burst[i]) begin + m1_axi4_arcache[i] = 4'b1011; + end else begin + m1_axi4_arcache[i] = 4'b1111; + end + end + `else + assign m1_axi4_arcache[i] = 4'b0011; + `endif + + // }}} + + // Read Response channel (r) {{{ + /* + * read response channel (r) + * + * ██████╗ ███████╗ █████╗ ██████╗ ██████╗ ███████╗███████╗██████╗ + * ██╔══██╗██╔════╝██╔══██╗██╔══██╗ ██╔══██╗██╔════╝██╔════╝██╔══██╗ + * ██████╔╝█████╗ ███████║██║ ██║ ██████╔╝█████╗ ███████╗██████╔╝ + * ██╔══██╗██╔══╝ ██╔══██║██║ ██║ ██╔══██╗██╔══╝ ╚════██║██╔═══╝ + * ██║ ██║███████╗██║ ██║██████╔╝ ██║ ██║███████╗███████║██║ + * ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝ + * + */ + axi4_r_buffer + #( + .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_r_buffer_m0 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_rid ( int_m0_rid[i] ), + 
.s_axi4_rresp ( int_m0_rresp[i] ), + .s_axi4_rdata ( int_m0_rdata[i] ), + .s_axi4_rlast ( int_m0_rlast[i] ), + .s_axi4_rvalid ( int_m0_rvalid[i] ), + .s_axi4_ruser ( int_m0_ruser[i] ), + .s_axi4_rready ( int_m0_rready[i] ), + .m_axi4_rid ( m0_axi4_rid[i] ), + .m_axi4_rresp ( m0_axi4_rresp[i] ), + .m_axi4_rdata ( m0_axi4_rdata[i] ), + .m_axi4_rlast ( m0_axi4_rlast[i] ), + .m_axi4_rvalid ( m0_axi4_rvalid[i] ), + .m_axi4_ruser ( m0_axi4_ruser[i] ), + .m_axi4_rready ( m0_axi4_rready[i] ) + ); + + axi4_r_buffer + #( + .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_r_buffer_m1 + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .s_axi4_rid ( int_m1_rid[i] ), + .s_axi4_rresp ( int_m1_rresp[i] ), + .s_axi4_rdata ( int_m1_rdata[i] ), + .s_axi4_rlast ( int_m1_rlast[i] ), + .s_axi4_rvalid ( int_m1_rvalid[i] ), + .s_axi4_ruser ( int_m1_ruser[i] ), + .s_axi4_rready ( int_m1_rready[i] ), + .m_axi4_rid ( m1_axi4_rid[i] ), + .m_axi4_rresp ( m1_axi4_rresp[i] ), + .m_axi4_rdata ( m1_axi4_rdata[i] ), + .m_axi4_rlast ( m1_axi4_rlast[i] ), + .m_axi4_rvalid ( m1_axi4_rvalid[i] ), + .m_axi4_ruser ( m1_axi4_ruser[i] ), + .m_axi4_rready ( m1_axi4_rready[i] ) + ); + + axi4_r_sender + #( + .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ) + ) + u_r_sender + ( + .axi4_aclk ( Clk_CI ), + .axi4_arstn ( Rst_RBI ), + .drop_i ( lx_r_drop[i] ), + .drop_len_i ( lx_len_drop[i] ), + .done_o ( lx_r_done[i] ), + .id_i ( lx_id_drop[i] ), + .prefetch_i ( lx_prefetch_drop[i] ), + .hit_i ( lx_hit_drop[i] ), + .s_axi4_rid ( s_axi4_rid[i] ), + .s_axi4_rresp ( s_axi4_rresp[i] ), + .s_axi4_rdata ( s_axi4_rdata[i] ), + .s_axi4_rlast ( s_axi4_rlast[i] ), + .s_axi4_rvalid ( s_axi4_rvalid[i] ), + .s_axi4_ruser ( s_axi4_ruser[i] ), + .s_axi4_rready ( s_axi4_rready[i] ), + .m_axi4_rid ( int_rid[i] ), + .m_axi4_rresp ( int_rresp[i] ), + .m_axi4_rdata ( int_rdata[i] ), + .m_axi4_rlast ( 
int_rlast[i] ), + .m_axi4_rvalid ( int_rvalid[i] ), + .m_axi4_ruser ( int_ruser[i] ), + .m_axi4_rready ( int_rready[i] ) + ); + + /* + * Multiplexer to switch between the two output master ports on the read response (r) channel + * + * Do not perform read burst interleaving as the DMA does not support it. This means we can only + * switch between the two masters upon sending rlast or when idle. + * + * However, if the downstream already performs burst interleaving, this cannot be undone here. + * Also, the downstream may interleave a burst response with a single-beat transaction. In this + * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving + * after such an event, it gives priority to the master which received the last burst in case + * both have a burst ready (rvalid). + * + * Order of priority: + * 1. Ongoing burst transaction + * 2. Single-beat transaction on Master 1. + * 3. Single-beat transaction on Master 0. + * 4. Burst transaction on master that received the last burst.
+ */ + // Select signal + always_ff @(posedge Clk_CI) begin + if (Rst_RBI == 0) begin + RRespSel_SP[i] <= 1'b0; + end else begin + RRespSel_SP[i] <= RRespSel_SN[i]; + end + end + + // FSM + always_comb begin : RRespMuxFsm + RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i]; + RRespSel_SN[i] = RRespSel_SP[i]; + + RRespBurst_S[i] = 1'b0; + RRespSelIm_S[i] = 1'b0; + + unique case (RRespMuxCtrl_SP[i]) + + IDLE: begin + // immediately forward single-beat transactions + if (int_m1_rvalid[i] && int_m1_rlast[i]) + RRespSelIm_S[i] = 1'b1; + else if (int_m0_rvalid[i] && int_m0_rlast[i]) + RRespSelIm_S[i] = 1'b0; + + // bursts - they also start immediately + else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin + RRespMuxCtrl_SN[i] = BUSY; + + // in case both are ready, continue with the master that had the last burst + if (int_m1_rvalid[i] && int_m0_rvalid[i]) begin + RRespSel_SN[i] = RRespSel_SP[i]; + RRespSelIm_S[i] = RRespSel_SP[i]; + end else if (int_m1_rvalid[i]) begin + RRespSel_SN[i] = 1'b1; + RRespSelIm_S[i] = 1'b1; + end else begin + RRespSel_SN[i] = 1'b0; + RRespSelIm_S[i] = 1'b0; + end + end + end + + BUSY: begin + RRespBurst_S[i] = 1'b1; + // detect last handshake of currently ongoing transfer + if (int_rvalid[i] && int_rready[i] && int_rlast[i]) + RRespMuxCtrl_SN[i] = IDLE; + end + + default: begin + RRespMuxCtrl_SN[i] = IDLE; + end + + endcase + end + + // FSM state + always_ff @(posedge Clk_CI) begin + if (Rst_RBI == 0) begin + RRespMuxCtrl_SP[i] <= IDLE; + end else begin + RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i]; + end + end + + // Actual multiplexer + always_comb begin + if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin + int_m0_rready[i] = 1'b0; + int_m1_rready[i] = int_rready[i]; + + int_rid[i] = int_m1_rid[i]; + int_rresp[i] = int_m1_rresp[i]; + int_rdata[i] = int_m1_rdata[i]; + int_rlast[i] = int_m1_rlast[i]; + int_ruser[i] = int_m1_ruser[i]; + int_rvalid[i] = int_m1_rvalid[i]; + end else begin + int_m0_rready[i] = 
int_rready[i]; + int_m1_rready[i] = 1'b0; + + int_rid[i] = int_m0_rid[i]; + int_rresp[i] = int_m0_rresp[i]; + int_rdata[i] = int_m0_rdata[i]; + int_rlast[i] = int_m0_rlast[i]; + int_ruser[i] = int_m0_ruser[i]; + int_rvalid[i] = int_m0_rvalid[i]; + end + end + + end // BUF & SEND + + // }}} + + endgenerate // BUF & SEND }}} + + // Log {{{ + +`ifdef RAB_AX_LOG_EN + AxiBramLogger + #( + .AXI_ID_BITW ( AXI_ID_WIDTH ), + .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ), + .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES ) + ) + u_aw_logger + ( + .Clk_CI ( NonGatedClk_CI ), + .TimestampClk_CI ( Clk_CI ), + .Rst_RBI ( Rst_RBI ), + .AxiValid_SI ( s_axi4_awvalid[1] ), + .AxiReady_SI ( s_axi4_awready[1] ), + .AxiId_DI ( s_axi4_awid[1] ), + .AxiAddr_DI ( s_axi4_awaddr[1] ), + .AxiLen_DI ( s_axi4_awlen[1] ), + .Clear_SI ( AwLogClr_SI ), + .LogEn_SI ( LogEn_SI ), + .Full_SO ( int_aw_log_full ), + .Ready_SO ( AwLogRdy_SO ), + .Bram_PS ( AwBram_PS ) + ); + + AxiBramLogger + #( + .AXI_ID_BITW ( AXI_ID_WIDTH ), + .AXI_ADDR_BITW ( AXI_S_ADDR_WIDTH ), + .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES ) + ) + u_ar_logger + ( + .Clk_CI ( NonGatedClk_CI ), + .TimestampClk_CI ( Clk_CI ), + .Rst_RBI ( Rst_RBI ), + .AxiValid_SI ( s_axi4_arvalid[1] ), + .AxiReady_SI ( s_axi4_arready[1] ), + .AxiId_DI ( s_axi4_arid[1] ), + .AxiAddr_DI ( s_axi4_araddr[1] ), + .AxiLen_DI ( s_axi4_arlen[1] ), + .Clear_SI ( ArLogClr_SI ), + .LogEn_SI ( LogEn_SI ), + .Full_SO ( int_ar_log_full ), + .Ready_SO ( ArLogRdy_SO ), + .Bram_PS ( ArBram_PS ) + ); +`endif + + // }}} + + // RAB Core {{{ + // ██████╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗ ███████╗ + // ██╔══██╗██╔══██╗██╔══██╗ ██╔════╝██╔═══██╗██╔══██╗██╔════╝ + // ██████╔╝███████║██████╔╝ ██║ ██║ ██║██████╔╝█████╗ + // ██╔══██╗██╔══██║██╔══██╗ ██║ ██║ ██║██╔══██╗██╔══╝ + // ██║ ██║██║ ██║██████╔╝ ╚██████╗╚██████╔╝██║ ██║███████╗ + // ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ + // + /* + * rab_core + * + * The rab core translates addresses. 
It has two ports, which can be used + * independently, however they will compete for time internally, as lookups + * are serialized. + * + * type is the read(0) or write(1) used to check the protection flags. If they + * don't match an interrupt is created on the int_prot line. + */ + + rab_core + #( + .N_PORTS ( N_PORTS ), + .N_L2_SETS ( N_L2_SETS ), + .N_L2_SET_ENTRIES ( N_L2_SET_ENTRIES ), + .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ), + .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ), + .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), + .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ), + .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ), + .AXI_ID_WIDTH ( AXI_ID_WIDTH ), + .AXI_USER_WIDTH ( AXI_USER_WIDTH ), + .MH_FIFO_DEPTH ( MH_FIFO_DEPTH ) + ) + u_rab_core + ( + .Clk_CI ( Clk_CI ), + .Rst_RBI ( Rst_RBI ), + + // Config IF + .s_axi_awaddr ( s_axi4lite_awaddr ), + .s_axi_awvalid ( s_axi4lite_awvalid ), + .s_axi_awready ( s_axi4lite_awready ), + .s_axi_wdata ( s_axi4lite_wdata ), + .s_axi_wstrb ( s_axi4lite_wstrb ), + .s_axi_wvalid ( s_axi4lite_wvalid ), + .s_axi_wready ( s_axi4lite_wready ), + .s_axi_bresp ( s_axi4lite_bresp ), + .s_axi_bvalid ( s_axi4lite_bvalid ), + .s_axi_bready ( s_axi4lite_bready ), + .s_axi_araddr ( s_axi4lite_araddr ), + .s_axi_arvalid ( s_axi4lite_arvalid ), + .s_axi_arready ( s_axi4lite_arready ), + .s_axi_rready ( s_axi4lite_rready ), + .s_axi_rdata ( s_axi4lite_rdata ), + .s_axi_rresp ( s_axi4lite_rresp ), + .s_axi_rvalid ( s_axi4lite_rvalid ), + + // L1 miss info outputs -> L2 TLB arbitration + .int_miss ( rab_miss ), + .int_multi ( rab_multi ), + .int_prot ( rab_prot ), + .int_prefetch ( rab_prefetch ), + .int_mhf_full ( int_mhf_full ), + + // L1 transaction info outputs -> L2 TLB arbitration + .int_axaddr_o ( L1OutAddr_D ), + .int_axid_o ( L1OutId_D ), + .int_axlen_o ( L1OutLen_D ), + .int_axuser_o ( L1OutUser_D ), + + // Write Req IF + .port1_addr ( int_awaddr ), + .port1_id ( int_awid ), + .port1_len ( int_awlen ), + .port1_size ( int_awsize ), + 
.port1_addr_valid ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests + .port1_type ( {N_PORTS{1'b1}} ), + .port1_user ( int_awuser ), + .port1_sent ( int_wtrans_sent ), // signal done to L1 FSM + .port1_out_addr ( int_wtrans_addr ), + .port1_cache_coherent ( int_wtrans_cache_coherent ), + .port1_accept ( int_wtrans_accept ), + .port1_drop ( int_wtrans_drop ), + .port1_miss ( int_wtrans_miss ), + + // Read Req IF + .port2_addr ( int_araddr ), + .port2_id ( int_arid ), + .port2_len ( int_arlen ), + .port2_size ( int_arsize ), + .port2_addr_valid ( int_arvalid ), + .port2_type ( {N_PORTS{1'b0}} ), + .port2_user ( int_aruser ), + .port2_sent ( int_rtrans_sent ), // signal done to L1 FSM + .port2_out_addr ( int_rtrans_addr ), + .port2_cache_coherent ( int_rtrans_cache_coherent ), + .port2_accept ( int_rtrans_accept ), + .port2_drop ( int_rtrans_drop ), + .port2_miss ( int_rtrans_miss ), + + // L2 miss info inputs -> axi_rab_cfg + .miss_l2_i ( L2Miss_S ), + .miss_l2_addr_i ( L2OutInAddr_DP ), + .miss_l2_id_i ( L2OutId_DP ), + .miss_l2_user_i ( L2OutUser_DP ), + + // L2 config outputs + .wdata_l2_o ( L2CfgWData_D ), + .waddr_l2_o ( L2CfgWAddr_D ), + .wren_l2_o ( L2CfgWE_S ) + ); + + // }}} + + // AX SPLITS {{{ + // █████╗ ██╗ ██╗ ███████╗██████╗ ██╗ ██╗████████╗ + // ██╔══██╗╚██╗██╔╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝ + // ███████║ ╚███╔╝ ███████╗██████╔╝██║ ██║ ██║ + // ██╔══██║ ██╔██╗ ╚════██║██╔═══╝ ██║ ██║ ██║ + // ██║ ██║██╔╝ ██╗ ███████║██║ ███████╗██║ ██║ + // ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝ + // + /** + * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels. + * + * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or + * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be + * saved until the L2 outputs are available. 
+ */ + generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT + + /* + * When accepting L1 transactions, we must just do so on the selected master. Drop requests must + * be performed on any one of the two masters. Save requests must be performed by both masters. + */ + always_comb begin : AW_L1_SPLIT + + // TLB handshake + l1_m0_aw_accept[i] = 1'b0; + l1_m1_aw_accept[i] = 1'b0; + l1_m0_aw_drop[i] = 1'b0; + l1_m1_aw_drop[i] = 1'b0; + l1_m0_aw_save[i] = 1'b0; + l1_m1_aw_save[i] = 1'b0; + + l1_mx_aw_done[i] = 1'b0; + + // AXI sender input handshake + int_m0_awvalid[i] = 1'b0; + int_m1_awvalid[i] = 1'b0; + int_awready[i] = 1'b0; + + // accept on selected master only + if (l1_aw_accept[i]) begin + if (int_wmaster_select[i]) begin + l1_m1_aw_accept[i] = 1'b1; + l1_mx_aw_done[i] = l1_m1_aw_done[i]; + + int_m1_awvalid[i] = int_awvalid[i]; + int_awready[i] = int_m1_awready[i]; + + end else begin + l1_m0_aw_accept[i] = 1'b1; + l1_mx_aw_done[i] = l1_m0_aw_done[i]; + + int_m0_awvalid[i] = int_awvalid[i]; + int_awready[i] = int_m0_awready[i]; + end + + // drop on Master 0 only + end else if (l1_aw_drop[i]) begin + l1_m0_aw_drop[i] = 1'b1; + l1_mx_aw_done[i] = l1_m0_aw_done[i]; + + int_m0_awvalid[i] = int_awvalid[i]; + int_awready[i] = l1_m0_aw_done[i]; + + // save on both masters + end else if (l1_aw_save[i]) begin + // split save + l1_m0_aw_save[i] = ~l1_m0_aw_done_SP[i]; + l1_m1_aw_save[i] = ~l1_m1_aw_done_SP[i]; + + // combine done + l1_mx_aw_done[i] = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i]; + + int_m0_awvalid[i] = int_awvalid[i]; + int_m1_awvalid[i] = int_awvalid[i]; + int_awready[i] = l1_mx_aw_done[i]; + end + end + + // signal back to handshake splitter + assign l1_aw_done[i] = l1_mx_aw_done[i]; + + always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG + if (Rst_RBI == 0) begin + l1_m0_aw_done_SP[i] <= 1'b0; + l1_m1_aw_done_SP[i] <= 1'b0; + end else if (l1_mx_aw_done[i]) begin + l1_m0_aw_done_SP[i] <= 1'b0; + l1_m1_aw_done_SP[i] <= 1'b0; + end else begin + 
l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i]; + l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i]; + end + end + + /* + * When accepting L2 transactions, we must drop the corresponding transaction from the other + * master to make it available again for save requests from L1_DROP_SAVE. + */ + always_comb begin : AW_L2_SPLIT + + l2_m0_aw_accept[i] = 1'b0; + l2_m1_aw_accept[i] = 1'b0; + l2_m0_aw_drop[i] = 1'b0; + l2_m1_aw_drop[i] = 1'b0; + + // de-assert request signals individually upon handshakes + if (l2_aw_accept[i]) begin + if (l2_master_select[i]) begin + l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i]; + l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i]; + + end else begin + l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i]; + l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i]; + + end + end else begin + l2_m0_aw_drop[i] = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0; + l2_m1_aw_drop[i] = ~l2_m1_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0; + + end + + // combine done + l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i]; + + l2_aw_done[i] = l2_mx_aw_done[i]; + end + + always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG + if (Rst_RBI == 0) begin + l2_m0_aw_done_SP[i] <= 1'b0; + l2_m1_aw_done_SP[i] <= 1'b0; + end else if (l2_mx_aw_done[i]) begin + l2_m0_aw_done_SP[i] <= 1'b0; + l2_m1_aw_done_SP[i] <= 1'b0; + end else begin + l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i]; + l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i]; + end + end + + /* + * When accepting L1 transactions, we must just do so on the selected master. Drop requests must + * be performed on any one of the two masters. Save requests must be performed by both masters. 
+ */ + always_comb begin : AR_L1_SPLIT + + // TLB handshake + l1_m0_ar_accept[i] = 1'b0; + l1_m1_ar_accept[i] = 1'b0; + l1_m0_ar_drop[i] = 1'b0; + l1_m1_ar_drop[i] = 1'b0; + l1_m0_ar_save[i] = 1'b0; + l1_m1_ar_save[i] = 1'b0; + + l1_mx_ar_done[i] = 1'b0; + + // AXI sender input handshake + int_m0_arvalid[i] = 1'b0; + int_m1_arvalid[i] = 1'b0; + int_arready[i] = 1'b0; + + // accept on selected master only + if (l1_ar_accept[i]) begin + if (int_rmaster_select[i]) begin + l1_m1_ar_accept[i] = 1'b1; + l1_mx_ar_done[i] = l1_m1_ar_done[i]; + + int_m1_arvalid[i] = int_arvalid[i]; + int_arready[i] = int_m1_arready[i]; + + end else begin + l1_m0_ar_accept[i] = 1'b1; + l1_mx_ar_done[i] = l1_m0_ar_done[i]; + + int_m0_arvalid[i] = int_arvalid[i]; + int_arready[i] = int_m0_arready[i]; + end + + // drop on Master 0 only + end else if (l1_ar_drop[i]) begin + l1_m0_ar_drop[i] = 1'b1; + l1_mx_ar_done[i] = l1_m0_ar_done[i]; + + int_m0_arvalid[i] = int_arvalid[i]; + int_arready[i] = l1_m0_ar_done[i]; + + // save on both masters + end else if (l1_ar_save[i]) begin + // split save + l1_m0_ar_save[i] = ~l1_m0_ar_done_SP[i]; + l1_m1_ar_save[i] = ~l1_m1_ar_done_SP[i]; + + // combine done + l1_mx_ar_done[i] = l1_m0_ar_done_SP[i] & l1_m1_ar_done_SP[i]; + + int_m0_arvalid[i] = int_arvalid[i]; + int_m1_arvalid[i] = int_arvalid[i]; + int_arready[i] = l1_mx_ar_done[i]; + end + end + + // signal back to handshake splitter + assign l1_ar_done[i] = l1_mx_ar_done[i]; + + always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG + if (Rst_RBI == 0) begin + l1_m0_ar_done_SP[i] <= 1'b0; + l1_m1_ar_done_SP[i] <= 1'b0; + end else if (l1_mx_ar_done[i]) begin + l1_m0_ar_done_SP[i] <= 1'b0; + l1_m1_ar_done_SP[i] <= 1'b0; + end else begin + l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i]; + l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i]; + end + end + + /* + * When accepting L2 transactions, we must drop the corresponding transaction from the other + * master to make it available 
again for save requests from L1_DROP_SAVE. + */ + always_comb begin : AR_L2_SPLIT + + l2_m0_ar_accept[i] = 1'b0; + l2_m1_ar_accept[i] = 1'b0; + l2_m0_ar_drop[i] = 1'b0; + l2_m1_ar_drop[i] = 1'b0; + + // de-assert request signals individually upon handshakes + if (l2_ar_accept[i]) begin + if (l2_master_select[i]) begin + l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i]; + l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i]; + + end else begin + l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i]; + l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i]; + + end + end else if (l2_ar_drop[i]) begin + l2_m0_ar_drop[i] = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0; + l2_m1_ar_drop[i] = ~l2_m1_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0; + + end + + // combine done + l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i]; + + l2_ar_done[i] = l2_mx_ar_done[i]; + end + + always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG + if (Rst_RBI == 0) begin + l2_m0_ar_done_SP[i] <= 1'b0; + l2_m1_ar_done_SP[i] <= 1'b0; + end else if (l2_mx_ar_done[i]) begin + l2_m0_ar_done_SP[i] <= 1'b0; + l2_m1_ar_done_SP[i] <= 1'b0; + end else begin + l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i]; + l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i]; + end + end + + end // AX_SPLIT + endgenerate // AX_SPLIT + + // }}} + + // HANDSHAKE SPLITS {{{ + // ██╗ ██╗███████╗ ███████╗██████╗ ██╗ ██╗████████╗ + // ██║ ██║██╔════╝ ██╔════╝██╔══██╗██║ ██║╚══██╔══╝ + // ███████║███████╗ ███████╗██████╔╝██║ ██║ ██║ + // ██╔══██║╚════██║ ╚════██║██╔═══╝ ██║ ██║ ██║ + // ██║ ██║███████║ ███████║██║ ███████╗██║ ██║ + // ╚═╝ ╚═╝╚══════╝ ╚══════╝╚═╝ ╚══════╝╚═╝ ╚═╝ + // + /* + * We need to perform combined handshakes with multiple AXI modules + * upon transactions drops, accepts, saves etc. from two TLBs. 
+ */ + generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT + + assign l1_xw_accept[i] = int_wtrans_accept[i] & ~aw_out_stall[i]; + assign int_wtrans_sent[i] = l1_xw_done[i]; + + assign l1_ar_accept[i] = int_rtrans_accept[i]; + assign int_rtrans_sent[i] = l1_ar_done[i]; + + /* + * L1 AW sender + W buffer handshake split + */ + // forward + assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i]; + assign l1_w_accept[i] = l1_xw_accept[i] & ~l1_w_done_SP[i]; + + assign l1_aw_save[i] = l1_xw_save[i] & ~l1_aw_done_SP[i]; + assign l1_w_save[i] = l1_xw_save[i] & ~l1_w_done_SP[i]; + + assign l1_aw_drop[i] = l1_xw_drop[i] & ~l1_aw_done_SP[i]; + assign l1_w_drop[i] = l1_xw_drop[i] & ~l1_w_done_SP[i]; + + // backward + assign l1_xw_done[i] = l1_aw_done_SP[i] & l1_w_done_SP[i]; + + always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT + if (Rst_RBI == 0) begin + l1_aw_done_SP[i] <= 1'b0; + l1_w_done_SP[i] <= 1'b0; + end else if (l1_xw_done[i]) begin + l1_aw_done_SP[i] <= 1'b0; + l1_w_done_SP[i] <= 1'b0; + end else begin + l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i]; + l1_w_done_SP[i] <= l1_w_done_SP[i] | l1_w_done[i]; + end + end + + if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT + + /* + * L1 AR sender + R sender handshake split + * + * AR and R do not need to be strictly in sync. We thus use separate handshakes. + * But the handshake signals for the R sender are multiplexed with the those for + * the L2. However, L2_ACCEPT_DROP_SAVE has always higher priority. + */ + assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i]; + assign l1_r_done[i] = l2_r_drop[i] ? 1'b0 : lx_r_done[i]; + assign l2_r_done[i] = l2_r_drop[i] ? 
lx_r_done[i] : 1'b0; + + /* + * L2 AW sender + W buffer handshake split + */ + // forward + assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i]; + assign l2_w_accept[i] = l2_xw_accept[i] & ~l2_w_done_SP[i]; + + assign l2_aw_drop[i] = l2_xw_drop[i] & ~l2_aw_done_SP[i]; + assign l2_w_drop[i] = l2_xw_drop[i] & ~l2_w_done_SP[i]; + + // backward + assign l2_xw_done[i] = l2_aw_done_SP[i] & l2_w_done_SP[i]; + + always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT + if (Rst_RBI == 0) begin + l2_aw_done_SP[i] <= 1'b0; + l2_w_done_SP[i] <= 1'b0; + end else if (l2_xw_done[i]) begin + l2_aw_done_SP[i] <= 1'b0; + l2_w_done_SP[i] <= 1'b0; + end else begin + l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i]; + l2_w_done_SP[i] <= l2_w_done_SP[i] | l2_w_done[i]; + end + end + + /* + * L2 AR + R sender handshake split + */ + // forward + assign l2_ar_drop[i] = l2_xr_drop[i] & ~l2_ar_done_SP[i]; + assign l2_r_drop[i] = l2_xr_drop[i] & ~l2_r_done_SP[i]; + + // backward - make sure to always clear L2_XR_HS_SPLIT + always_comb begin + if (l2_xr_drop[i]) begin + l2_xr_done[i] = l2_ar_done_SP[i] & l2_r_done_SP[i]; + end else begin + l2_xr_done[i] = l2_ar_done_SP[i]; + end + end + + always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT + if (Rst_RBI == 0) begin + l2_ar_done_SP[i] <= 1'b0; + l2_r_done_SP[i] <= 1'b0; + end else if (l2_xr_done[i]) begin + l2_ar_done_SP[i] <= 1'b0; + l2_r_done_SP[i] <= 1'b0; + end else begin + l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i]; + l2_r_done_SP[i] <= l2_r_done_SP[i] | l2_r_done[i]; + end + end + + end else begin // if (ENABLE_L2TLB[i] == 1) + + assign lx_r_drop[i] = l1_r_drop[i]; + assign l1_r_done[i] = lx_r_done[i]; + + assign l2_aw_accept[i] = 1'b0; + assign l2_w_accept[i] = 1'b0; + assign l2_aw_drop[i] = 1'b0; + assign l2_w_drop[i] = 1'b0; + assign l2_xw_done[i] = 1'b0; + assign l2_aw_done_SP[i] = 1'b0; + assign l2_w_done_SP[i] = 1'b0; + + assign l2_ar_accept[i] = 1'b0; + assign l2_ar_drop[i] = 1'b0; + assign l2_r_drop[i] = 1'b0; + 
assign l2_xr_done[i] = 1'b0; + assign l2_r_done[i] = 1'b0; + assign l2_ar_done_SP[i] = 1'b0; + assign l2_r_done_SP[i] = 1'b0; + + end // if (ENABLE_L2TLB[i] == 1) + + end // HANDSHAKE_SPLIT + endgenerate // HANDSHAKE_SPLIT + + // }}} + + // L2 TLB {{{ + // ██╗ ██████╗ ████████╗██╗ ██████╗ + // ██║ ╚════██╗ ╚══██╔══╝██║ ██╔══██╗ + // ██║ █████╔╝ ██║ ██║ ██████╔╝ + // ██║ ██╔═══╝ ██║ ██║ ██╔══██╗ + // ███████╗███████╗ ██║ ███████╗██████╔╝ + // ╚══════╝╚══════╝ ╚═╝ ╚══════╝╚═════╝ + // + /* + * l2_tlb + * + * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core). + * + * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy, + * the L1 is stalled until the L2 is available again. + * + */ + generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB + if (ENABLE_L2TLB[i] == 1) begin : L2_TLB + + /* + * L1 output selector + */ + assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0; + assign L1OutProt_D[i] = rab_prot[i]; + assign L1OutMulti_D[i] = rab_multi[i]; + + /* + * L1 output control + L1_DROP_BUF, L2_IN_BUF management + * + * Forward the L1 drop request to AR/AW sender modules if + * 1. the transaction needs to be dropped (L1 multi, prot, prefetch), or + * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full. + * + * The AR/AW senders do not support more than 1 outstanding L1 miss. The push back towards + * the upstream is realized by not accepting the save request (saving the L1 transaction) + * in the senders as long as the L2 TLB is busy or has valid output. This ultimately + * blocks the L1 TLB. + * + * Together with the AW drop/save, we also perform the W drop/save as AW and W need to + * absolutely remain in order.
In contrast, the R drop is performed + */ + always_comb begin : L1_DROP_SAVE + + l1_ar_drop[i] = 1'b0; + l1_ar_save[i] = 1'b0; + l1_xw_drop[i] = 1'b0; + l1_xw_save[i] = 1'b0; + + l1_id_drop[i] = L1OutId_D[i]; + l1_len_drop[i] = L1OutLen_D[i]; + l1_prefetch_drop[i] = rab_prefetch[i]; + l1_hit_drop[i] = 1'b1; // there are no drops for L1 misses + + L1DropEn_S[i] = 1'b0; + L2InEn_S[i] = 1'b0; + + if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin + // 1. Drop + l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i]; + l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i]; + + // Store to L1_DROP_BUF upon handshake + L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) | + (l1_xw_drop[i] & l1_xw_done[i]); + + end else if ( rab_miss[i] ) begin + // 2. Save - Make sure L2 is really available. + l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i]; + l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i]; + + // Store to L2_IN_BUF upon handshake - triggers the L2 TLB + L2InEn_S[i] = (l1_ar_save[i] & l1_ar_done[i]) | + (l1_xw_save[i] & l1_xw_done[i]); + end + end + + /* + * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control + * + * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs + * require the B response to be sent only after consuming/discarding the corresponding data + * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop + * request to the B sender is then sent by the W buffer autonomously. + * + * L1 AW/W drop requests are managed by L1_DROP_SAVE. 
+ */ + always_comb begin : L2_ACCEPT_DROP_SAVE + + l2_ar_addr[i] = 'b0; + l2_aw_addr[i] = 'b0; + l2_ar_accept[i] = 1'b0; + l2_xr_drop[i] = 1'b0; + l2_xw_accept[i] = 1'b0; + l2_xw_drop[i] = 1'b0; + + l1_r_drop[i] = 1'b0; + + lx_id_drop[i] = 'b0; + lx_len_drop[i] = 'b0; + lx_prefetch_drop[i] = 1'b0; + lx_hit_drop[i] = 1'b0; + + L1DropValid_SN[i] = L1DropValid_SP[i] | L1DropEn_S[i]; + L2OutValid_SN[i] = L2OutValid_SP[i]; + L2OutReady_S[i] = 1'b0; + L2OutEn_S[i] = 1'b0; + + L2Miss_S[i] = 1'b0; + int_multi[i] = 1'b0; + int_prot[i] = 1'b0; + + if (L2OutValid_SP[i] == 1'b0) begin + + // Drop L1 from R senders + if (L1DropValid_SP[i] == 1'b1) begin + + // Only perform the R sender drop here. + if (~L1DropRwType_DP[i]) begin + + l1_r_drop[i] = 1'b1; + lx_id_drop[i] = L1DropId_DP[i]; + lx_len_drop[i] = L1DropLen_DP[i]; + lx_prefetch_drop[i] = L1DropPrefetch_S[i]; + lx_hit_drop[i] = 1'b1; // there are no drops for L1 misses + + // Invalidate L1_DROP_BUF upon handshake + if ( l1_r_drop[i] & l1_r_done[i] ) begin + + L1DropValid_SN[i] = 1'b0; + int_prot[i] = L1DropProt_DP[i]; + int_multi[i] = L1DropMulti_DP[i]; + end + + end else begin + // Invalidate L1_DROP_BUF + L1DropValid_SN[i] = 1'b0; + int_prot[i] = L1DropProt_DP[i]; + int_multi[i] = L1DropMulti_DP[i]; + end + end + + end else begin // L2_OUT_BUF has valid data + + if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin + + l2_ar_addr[i] = L2OutAddr_DP[i]; + l2_aw_addr[i] = L2OutAddr_DP[i]; + + l2_ar_accept[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1; + l2_xw_accept[i] = L2OutRwType_DP[i] ? 
1'b1 : 1'b0; + + // Invalidate L2_OUT_BUF upon handshake + L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) | + (l2_xw_accept[i] & l2_xw_done[i]) ); + end else begin + + lx_id_drop[i] = L2OutId_DP[i]; + lx_len_drop[i] = L2OutLen_DP[i]; + lx_prefetch_drop[i] = L2OutPrefetch_S[i]; + lx_hit_drop[i] = L2OutHit_SP[i]; + + // The l2_xr_drop will also perform the handshake with the R sender + l2_xr_drop[i] = L2OutRwType_DP[i] ? 1'b0 : 1'b1; + l2_xw_drop[i] = L2OutRwType_DP[i] ? 1'b1 : 1'b0; + + // Invalidate L2_OUT_BUF upon handshake + if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin + + L2OutValid_SN[i] = 1'b0; + L2Miss_S[i] = ~L2OutHit_SP[i]; + int_prot[i] = L2OutProt_SP[i]; + int_multi[i] = L2OutMulti_SP[i]; + end + end + end + + // Only accept new L2 output after ongoing drops have finished. + if ( (l2_xr_drop[i] == l2_xr_done[i]) & + (l2_xw_drop[i] == l2_xw_done[i]) & + (l1_r_drop[i] == l1_r_done[i] ) ) begin + // Store to L2_OUT_BUF upon handshake with L2 TLB module + if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin + L2OutValid_SN[i] = 1'b1; + L2OutReady_S[i] = 1'b1; + L2OutEn_S[i] = 1'b1; + end + end + end + + /* + * L1 drop buffer + * + * Used in case of multi, prot and prefetch hits in the L1 TLB.
+ */ + always_ff @(posedge Clk_CI) begin : L1_DROP_BUF + if (Rst_RBI == 0) begin + L1DropProt_DP[i] <= 1'b0; + L1DropMulti_DP[i] <= 1'b0; + L1DropRwType_DP[i] <= 1'b0; + L1DropUser_DP[i] <= 'b0; + L1DropId_DP[i] <= 'b0; + L1DropLen_DP[i] <= 'b0; + L1DropAddr_DP[i] <= 'b0; + end else if (L1DropEn_S[i] == 1'b1) begin + L1DropProt_DP[i] <= L1OutProt_D[i] ; + L1DropMulti_DP[i] <= L1OutMulti_D[i] ; + L1DropRwType_DP[i] <= L1OutRwType_D[i]; + L1DropUser_DP[i] <= L1OutUser_D[i] ; + L1DropId_DP[i] <= L1OutId_D[i] ; + L1DropLen_DP[i] <= L1OutLen_D[i] ; + L1DropAddr_DP[i] <= L1OutAddr_D[i] ; + end + end // always_ff @ (posedge Clk_CI) + + /* + * L2 input buffer + * + * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB. + */ + always_ff @(posedge Clk_CI) begin : L2_IN_BUF + if (Rst_RBI == 0) begin + L2InRwType_DP[i] <= 1'b0; + L2InUser_DP[i] <= 'b0; + L2InId_DP[i] <= 'b0; + L2InLen_DP[i] <= 'b0; + L2InAddr_DP[i] <= 'b0; + end else if (L2InEn_S[i] == 1'b1) begin + L2InRwType_DP[i] <= L1OutRwType_D[i]; + L2InUser_DP[i] <= L1OutUser_D[i] ; + L2InId_DP[i] <= L1OutId_D[i] ; + L2InLen_DP[i] <= L1OutLen_D[i] ; + L2InAddr_DP[i] <= L1OutAddr_D[i] ; + end + end // always_ff @ (posedge Clk_CI) + + l2_tlb + #( + .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ), + .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ), + .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ), + .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ), + .N_SETS ( `RAB_L2_N_SETS ), + .N_OFFSETS ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS ), + .N_PAR_VA_RAMS ( `RAB_L2_N_PAR_VA_RAMS ), + .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) ) + ) + u_l2_tlb + ( + .clk_i ( Clk_CI ), + .rst_ni ( Rst_RBI ), + + // Config inputs + .we_i ( L2CfgWE_S[i] ), + .waddr_i ( L2CfgWAddr_D[i] ), + .wdata_i ( L2CfgWData_D[i] ), + + // Request input + .start_i ( L2InEn_S[i] ), + .busy_o ( L2Busy_S[i] ), + .rw_type_i ( L2InRwType_DP[i] ), + .in_addr_i ( L2InAddr_DP[i] ), + + // Response output + .out_ready_i ( 
L2OutReady_S[i] ), + .out_valid_o ( L2OutValid_S[i] ), + .hit_o ( L2OutHit_SN[i] ), + .miss_o ( L2OutMiss_SN[i] ), + .prot_o ( L2OutProt_SN[i] ), + .multi_o ( L2OutMulti_SN[i] ), + .cache_coherent_o ( L2OutCC_SN[i] ), + .out_addr_o ( L2OutAddr_DN[i] ) + ); + + /* + * L2 output buffer + * + * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB. + */ + always_ff @(posedge Clk_CI) begin : L2_OUT_BUF + if (Rst_RBI == 0) begin + L2OutRwType_DP[i] <= 1'b0; + L2OutUser_DP[i] <= 'b0; + L2OutLen_DP[i] <= 'b0; + L2OutId_DP[i] <= 'b0; + L2OutInAddr_DP[i] <= 'b0; + + L2OutHit_SP[i] <= 1'b0; + L2OutMiss_SP[i] <= 1'b0; + L2OutProt_SP[i] <= 1'b0; + L2OutMulti_SP[i] <= 1'b0; + L2OutCC_SP[i] <= 1'b0; + L2OutAddr_DP[i] <= 'b0; + end else if (L2OutEn_S[i] == 1'b1) begin + L2OutRwType_DP[i] <= L2InRwType_DP[i]; + L2OutUser_DP[i] <= L2InUser_DP[i] ; + L2OutLen_DP[i] <= L2InLen_DP[i] ; + L2OutId_DP[i] <= L2InId_DP[i] ; + L2OutInAddr_DP[i] <= L2InAddr_DP[i] ; + + L2OutHit_SP[i] <= L2OutHit_SN[i] ; + L2OutMiss_SP[i] <= L2OutMiss_SN[i] ; + L2OutProt_SP[i] <= L2OutProt_SN[i] ; + L2OutMulti_SP[i] <= L2OutMulti_SN[i]; + L2OutCC_SP[i] <= L2OutCC_SN[i] ; + L2OutAddr_DP[i] <= L2OutAddr_DN[i] ; + end + end // always_ff @ (posedge Clk_CI) + + always_ff @(posedge Clk_CI) begin : BUF_VALID + if (Rst_RBI == 0) begin + L1DropValid_SP[i] = 1'b0; + L2OutValid_SP[i] = 1'b0; + end else begin + L1DropValid_SP[i] = L1DropValid_SN[i]; + L2OutValid_SP[i] = L2OutValid_SN[i]; + end + end + + always_comb begin : BUF_TO_PREFETCH + // L1 Drop Buf + if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}}) + L1DropPrefetch_S[i] = 1'b1; + else + L1DropPrefetch_S[i] = 1'b0; + + // L2 Out Buf + if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}}) + L2OutPrefetch_S[i] = 1'b1; + else + L2OutPrefetch_S[i] = 1'b0; + end + + assign l2_cache_coherent[i] = L2OutCC_SP[i]; + assign int_miss[i] = L2Miss_S[i]; + + end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1) + + assign l1_ar_drop[i] = int_rtrans_drop[i]; + 
assign l1_r_drop[i] = int_rtrans_drop[i]; + assign l1_xw_drop[i] = int_wtrans_drop[i]; + + assign l1_ar_save[i] = 1'b0; + assign l1_xw_save[i] = 1'b0; + assign l2_xw_accept[i] = 1'b0; + assign l2_xr_drop[i] = 1'b0; + assign l2_xw_drop[i] = 1'b0; + + assign l2_ar_addr[i] = 'b0; + assign l2_aw_addr[i] = 'b0; + + assign l1_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] : + int_rtrans_drop[i] ? int_arid[i] : + '0; + assign l1_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] : + int_rtrans_drop[i] ? int_arlen[i] : + '0; + assign l1_prefetch_drop[i] = rab_prefetch[i]; + assign l1_hit_drop[i] = ~rab_miss[i]; + + assign lx_id_drop[i] = int_wtrans_drop[i] ? int_awid[i] : + int_rtrans_drop[i] ? int_arid[i] : + '0; + assign lx_len_drop[i] = int_wtrans_drop[i] ? int_awlen[i] : + int_rtrans_drop[i] ? int_arlen[i] : + '0; + assign lx_prefetch_drop[i] = rab_prefetch[i]; + assign lx_hit_drop[i] = ~rab_miss[i]; + + assign l2_cache_coherent[i] = 1'b0; + + assign int_miss[i] = rab_miss[i]; + assign int_prot[i] = rab_prot[i]; + assign int_multi[i] = rab_multi[i]; + + // unused signals + assign L2Miss_S[i] = 1'b0; + + assign L1OutRwType_D[i] = 1'b0; + assign L1OutProt_D[i] = 1'b0; + assign L1OutMulti_D[i] = 1'b0; + + assign L1DropRwType_DP[i] = 1'b0; + assign L1DropUser_DP[i] = 'b0; + assign L1DropId_DP[i] = 'b0; + assign L1DropLen_DP[i] = 'b0; + assign L1DropAddr_DP[i] = 'b0; + assign L1DropProt_DP[i] = 1'b0; + assign L1DropMulti_DP[i] = 1'b0; + + assign L1DropEn_S[i] = 1'b0; + assign L1DropPrefetch_S[i] = 1'b0; + assign L1DropValid_SN[i] = 1'b0; + assign L1DropValid_SP[i] = 1'b0; + + assign L2InRwType_DP[i] = 1'b0; + assign L2InUser_DP[i] = 'b0; + assign L2InId_DP[i] = 'b0; + assign L2InLen_DP[i] = 'b0; + assign L2InAddr_DP[i] = 'b0; + + assign L2InEn_S[i] = 1'b0; + + assign L2OutHit_SN[i] = 1'b0; + assign L2OutMiss_SN[i] = 1'b0; + assign L2OutProt_SN[i] = 1'b0; + assign L2OutMulti_SN[i] = 1'b0; + assign L2OutCC_SN[i] = 1'b0; + assign L2OutAddr_DN[i] = 'b0; + + assign L2OutRwType_DP[i] 
= 1'b0; + assign L2OutUser_DP[i] = 'b0; + assign L2OutId_DP[i] = 'b0; + assign L2OutLen_DP[i] = 'b0; + assign L2OutInAddr_DP[i] = 'b0; + assign L2OutHit_SP[i] = 1'b0; + assign L2OutMiss_SP[i] = 1'b0; + assign L2OutProt_SP[i] = 1'b0; + assign L2OutMulti_SP[i] = 1'b0; + assign L2OutCC_SP[i] = 1'b0; + assign L2OutAddr_DP[i] = 'b0; + + assign L2OutEn_S[i] = 1'b0; + assign L2OutPrefetch_S[i] = 1'b0; + assign L2Busy_S[i] = 1'b0; + assign L2OutValid_S[i] = 1'b0; + assign L2OutValid_SN[i] = 1'b0; + assign L2OutValid_SP[i] = 1'b0; + assign L2OutReady_S[i] = 1'b0; + + end // !`ifdef ENABLE_L2TLB + end // for (i = 0; i < N_PORTS; i++) + endgenerate + +// }}} +""" +# endmodule +# +# +# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker +# +# diff --git a/src/soc/iommu/axi_rab/check_ram.py b/src/soc/iommu/axi_rab/check_ram.py new file mode 100644 index 00000000..31bf32ea --- /dev/null +++ b/src/soc/iommu/axi_rab/check_ram.py @@ -0,0 +1,240 @@ +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable + + +class check_ram(Elaboratable): + + def __init__(self): + self.clk_i = Signal() # input + self.rst_ni = Signal() # input + self.in_addr = Signal(ADDR_WIDTH) # input + self.rw_type = Signal() # input + self.ram_we = Signal() # input + self.port0_addr = Signal(1+ERROR p_expression_25) # input + self.port1_addr = Signal(1+ERROR p_expression_25) # input + self.ram_wdata = Signal(RAM_DATA_WIDTH) # input + self.output_sent = Signal() # input + self.output_valid = Signal() # input + self.offset_addr_d = Signal(OFFSET_WIDTH) # input + self.hit_addr = Signal(1+ERROR p_expression_25) # output + self.master = Signal() # output + self.hit = Signal() # output + self.multi_hit = Signal() # output + self.prot = Signal() # output + + def elaborate(self, platform=None): + m = Module() + return m + + +# // Copyright 2018 ETH Zurich and University of Bologna. 
+# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. +# +# //import CfMath::log2; +# +# //`define MULTI_HIT_FULL_SET +# +# module check_ram +# //#( +# // parameter ADDR_WIDTH = 32, +# // parameter RAM_DATA_WIDTH = 32, +# // parameter PAGE_SIZE = 4096, // 4kB +# // parameter SET_WIDTH = 5, +# // parameter OFFSET_WIDTH = 4 +# // ) +# ( +# input logic clk_i, +# input logic rst_ni, +# input logic [ADDR_WIDTH-1:0] in_addr, +# input logic rw_type, // 1 => write, 0=> read +# input logic ram_we, +# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, +# input logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr, +# input logic [RAM_DATA_WIDTH-1:0] ram_wdata, +# input logic output_sent, +# input logic output_valid, +# input logic [OFFSET_WIDTH-1:0] offset_addr_d, +# output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr, +# output logic master, +# output logic hit, +# output logic multi_hit, +# output logic prot +# ); +# +""" #docstring_begin + + localparam IGNORE_LSB = log2(PAGE_SIZE); // 12 + + logic [RAM_DATA_WIDTH-1:0] port0_data_o, port1_data_o; // RAM read data outputs + logic port0_hit, port1_hit; // Ram output matches in_addr + + logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved; + + // Hit FSM Signals + typedef enum logic {SEARCH, HIT} hit_state_t; + hit_state_t hit_SP; // Hit FSM state + hit_state_t hit_SN; // Hit FSM next state + + // Multi Hit FSM signals +`ifdef 
MULTI_HIT_FULL_SET + typedef enum logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t; + multi_state_t multi_SP; // Multi Hit FSM state + multi_state_t multi_SN; // Multi Hit FSM next state + + logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved; + logic master_saved; +`endif + + //// --------------- Block RAM (Dual Port) -------------- //// + + // The outputs of the BRAMs are only valid if in the previous cycle: + // 1. the inputs were valid, and + // 2. the BRAM was not written to. + // Otherwise, the outputs must be ignored which is controlled by the output_valid signal. + // This signal is driven by the upper-level L2 TLB module. + ram_tp_no_change #( + .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ), + .DATA_WIDTH( RAM_DATA_WIDTH ) + ) + ram_tp_no_change_0 + ( + .clk ( clk_i ), + .we ( ram_we ), + .addr0 ( port0_addr ), + .addr1 ( port1_addr ), + .d_i ( ram_wdata ), + .d0_o ( port0_data_o ), + .d1_o ( port1_data_o ) + ); + + //// Check Ram Outputs + assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]); + assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]); + //// ----------------------------------------------------- ///// + + //// ------------------- Check if Hit ------------------------ //// + // FSM + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + hit_SP <= SEARCH; + end else begin + hit_SP <= hit_SN; + end + end + + always_ff @(posedge clk_i, negedge rst_ni) begin + if (!rst_ni) begin + port0_addr_saved <= '0; + port1_addr_saved <= '0; + end else begin + port0_addr_saved <= port0_addr; + port1_addr_saved <= port1_addr; + end + end + + always_comb begin + hit_SN = hit_SP; + hit = 1'b0; + hit_addr = 0; + master = 1'b0; + unique case(hit_SP) + SEARCH : + if (output_valid) + if (port0_hit || port1_hit) begin + hit_SN = HIT; + hit = 1'b1; + hit_addr = port0_hit ?
{port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} : + port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} : + 0; + master = port0_hit ? port0_data_o[3] : + port1_hit ? port1_data_o[3] : + 1'b0; + end + + HIT : begin +`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later. + hit = 1'b1; + hit_addr = hit_addr_saved; + master = master_saved; +`endif + if (output_sent) + hit_SN = SEARCH; + end + + default : begin + hit_SN = SEARCH; + end + endcase // case (hit_SP) + end // always_comb begin + + //// ------------------------------------------- //// + + assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) : + output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) : + 1'b0; + + //// ------------------- Multi ------------------- //// +`ifdef MULTI_HIT_FULL_SET + + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + hit_addr_saved <= 0; + master_saved <= 1'b0; + end else if (output_valid) begin + hit_addr_saved <= hit_addr; + master_saved <= master; + end + end + + // FSM + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + multi_SP <= NO_HITS; + end else begin + multi_SP <= multi_SN; + end + end + + always_comb begin + multi_SN = multi_SP; + multi_hit = 1'b0; + unique case(multi_SP) + NO_HITS : + if(output_valid && (port0_hit && port1_hit)) begin + multi_SN = MULTI_HIT; + multi_hit = 1'b1; + end else if(output_valid && (port0_hit || port1_hit)) + multi_SN = ONE_HIT; + + ONE_HIT : + if(output_valid && (port0_hit || port1_hit)) begin + multi_SN = MULTI_HIT; + multi_hit = 1'b1; + end else if (output_sent) + multi_SN = NO_HITS; + + MULTI_HIT : begin + multi_hit = 1'b1; + if (output_sent) + multi_SN = NO_HITS; + end + + endcase // case (multi_SP) + end // always_comb begin + +`else // !`ifdef MULTI_HIT_FULL_SET + assign multi_hit = output_valid && 
port0_hit && port1_hit; +`endif // !`ifdef MULTI_HIT_FULL_SET + //// ------------------------------------------- //// +""" +# endmodule +# +# diff --git a/src/soc/iommu/axi_rab/coreconfig.py b/src/soc/iommu/axi_rab/coreconfig.py new file mode 100644 index 00000000..247d0ce3 --- /dev/null +++ b/src/soc/iommu/axi_rab/coreconfig.py @@ -0,0 +1,6 @@ +class CoreConfig: + def __init__(self): + self.N_SLICES = 16 + self.N_REGS = 4*self.N_SLICES + self.ADDR_WIDTH_PHYS = 40 + self.ADDR_WIDTH_VIRT = 32 diff --git a/src/soc/iommu/axi_rab/fsm.py b/src/soc/iommu/axi_rab/fsm.py new file mode 100644 index 00000000..d64b1cb4 --- /dev/null +++ b/src/soc/iommu/axi_rab/fsm.py @@ -0,0 +1,243 @@ +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable + + +class fsm(Elaboratable): + + def __init__(self): + self.Clk_CI = Signal() # input + self.Rst_RBI = Signal() # input + self.port1_addr_valid_i = Signal() # input + self.port2_addr_valid_i = Signal() # input + self.port1_sent_i = Signal() # input + self.port2_sent_i = Signal() # input + self.select_i = Signal() # input + self.no_hit_i = Signal() # input + self.multi_hit_i = Signal() # input + self.no_prot_i = Signal() # input + self.prefetch_i = Signal() # input + self.out_addr_i = Signal(AXI_M_ADDR_WIDTH) # input + self.cache_coherent_i = Signal() # input + self.port1_accept_o = Signal() # output + self.port1_drop_o = Signal() # output + self.port1_miss_o = Signal() # output + self.port2_accept_o = Signal() # output + self.port2_drop_o = Signal() # output + self.port2_miss_o = Signal() # output + self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output + self.cache_coherent_o = Signal() # output + self.miss_o = Signal() # output + self.multi_o = Signal() # output + self.prot_o = Signal() # output + self.prefetch_o = Signal() # output + self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input + self.in_id_i = Signal(AXI_ID_WIDTH) # input + self.in_len_i = Signal(8) # input + self.in_user_i = 
Signal(AXI_USER_WIDTH) # input + self.in_addr_o = Signal(AXI_S_ADDR_WIDTH) # output + self.in_id_o = Signal(AXI_ID_WIDTH) # output + self.in_len_o = Signal(8) # output + self.in_user_o = Signal(AXI_USER_WIDTH) # output + + def elaborate(self, platform=None): + m = Module() + return m + + +# // Copyright 2018 ETH Zurich and University of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. +# +# //`timescale 1ns / 1ps +# +# module fsm +# #( +# parameter AXI_M_ADDR_WIDTH = 40, +# parameter AXI_S_ADDR_WIDTH = 32, +# parameter AXI_ID_WIDTH = 8, +# parameter AXI_USER_WIDTH = 6 +# ) +# ( +# input logic Clk_CI, +# input logic Rst_RBI, +# +# input logic port1_addr_valid_i, +# input logic port2_addr_valid_i, +# input logic port1_sent_i, +# input logic port2_sent_i, +# input logic select_i, +# input logic no_hit_i, +# input logic multi_hit_i, +# input logic no_prot_i, +# input logic prefetch_i, +# input logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i, +# input logic cache_coherent_i, +# output logic port1_accept_o, +# output logic port1_drop_o, +# output logic port1_miss_o, +# output logic port2_accept_o, +# output logic port2_drop_o, +# output logic port2_miss_o, +# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o, +# output logic cache_coherent_o, +# output logic miss_o, +# output logic multi_o, +# output logic prot_o, +# output logic prefetch_o, +# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i, +# input 
logic [AXI_ID_WIDTH-1:0] in_id_i, +# input logic [7:0] in_len_i, +# input logic [AXI_USER_WIDTH-1:0] in_user_i, +# output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o, +# output logic [AXI_ID_WIDTH-1:0] in_id_o, +# output logic [7:0] in_len_o, +# output logic [AXI_USER_WIDTH-1:0] in_user_o +# ); +# +""" #docstring_begin + + //-------------Internal Signals---------------------- + + typedef enum logic {IDLE, WAIT} state_t; + logic state_SP; // Present state + logic state_SN; // Next State + + logic port1_accept_SN; + logic port1_drop_SN; + logic port1_miss_SN; + logic port2_accept_SN; + logic port2_drop_SN; + logic port2_miss_SN; + logic miss_SN; + logic multi_SN; + logic prot_SN; + logic prefetch_SN; + logic cache_coherent_SN; + logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN; + + logic out_reg_en_S; + + //----------FSM comb------------------------------ + + always_comb begin: FSM_COMBO + state_SN = state_SP; + + port1_accept_SN = 1'b0; + port1_drop_SN = 1'b0; + port1_miss_SN = 1'b0; + port2_accept_SN = 1'b0; + port2_drop_SN = 1'b0; + port2_miss_SN = 1'b0; + miss_SN = 1'b0; + multi_SN = 1'b0; + prot_SN = 1'b0; + prefetch_SN = 1'b0; + cache_coherent_SN = 1'b0; + out_addr_DN = '0; + + out_reg_en_S = 1'b0; // by default hold register output + + unique case(state_SP) + IDLE : + if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin + out_reg_en_S = 1'b1; + state_SN = WAIT; + + // Select inputs for output registers + if (port1_addr_valid_i & select_i) begin + port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i); + port1_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i); + port1_miss_SN = no_hit_i; + port2_accept_SN = 1'b0; + port2_drop_SN = 1'b0; + port2_miss_SN = 1'b0; + end else if (port2_addr_valid_i & ~select_i) begin + port1_accept_SN = 1'b0; + port1_drop_SN = 1'b0; + port1_miss_SN = 1'b0; + port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i); + port2_drop_SN = (no_hit_i | multi_hit_i | ~no_prot_i | 
prefetch_i); + port2_miss_SN = no_hit_i; + end + + miss_SN = port1_miss_SN | port2_miss_SN; + multi_SN = multi_hit_i; + prot_SN = ~no_prot_i; + prefetch_SN = ~no_hit_i & prefetch_i; + + cache_coherent_SN = cache_coherent_i; + out_addr_DN = out_addr_i; + end + + WAIT : + if ( port1_sent_i | port2_sent_i ) begin + out_reg_en_S = 1'b1; // "clear" the register + state_SN = IDLE; + end + + default : begin + state_SN = IDLE; + end + endcase + end + + //----------FSM seq------------------------------- + + always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ + if (Rst_RBI == 1'b0) + state_SP <= IDLE; + else + state_SP <= state_SN; + end + + //----------Output seq-------------------------- + + always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ + if (Rst_RBI == 1'b0) begin + port1_accept_o = 1'b0; + port1_drop_o = 1'b0; + port1_miss_o = 1'b0; + port2_accept_o = 1'b0; + port2_drop_o = 1'b0; + port2_miss_o = 1'b0; + miss_o = 1'b0; + multi_o = 1'b0; + prot_o = 1'b0; + prefetch_o = 1'b0; + cache_coherent_o = 1'b0; + out_addr_o = '0; + in_addr_o = '0; + in_id_o = '0; + in_len_o = '0; + in_user_o = '0; + end else if (out_reg_en_S == 1'b1) begin + port1_accept_o = port1_accept_SN; + port1_drop_o = port1_drop_SN; + port1_miss_o = port1_miss_SN; + port2_accept_o = port2_accept_SN; + port2_drop_o = port2_drop_SN; + port2_miss_o = port2_miss_SN; + miss_o = miss_SN; + multi_o = multi_SN; + prot_o = prot_SN; + prefetch_o = prefetch_SN; + cache_coherent_o = cache_coherent_SN; + out_addr_o = out_addr_DN; + in_addr_o = in_addr_i; + in_id_o = in_id_i; + in_len_o = in_len_i; + in_user_o = in_user_i; + end + end // block: OUTPUT_SEQ +""" +# +# endmodule +# +# diff --git a/src/soc/iommu/axi_rab/l2_tlb.py b/src/soc/iommu/axi_rab/l2_tlb.py new file mode 100644 index 00000000..11983f64 --- /dev/null +++ b/src/soc/iommu/axi_rab/l2_tlb.py @@ -0,0 +1,550 @@ +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable + + +class 
l2_tlb(Elaboratable): + + def __init__(self): + self.clk_i = Signal() # input + self.rst_ni = Signal() # input + self.we_i = Signal() # input + self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH) # input + self.wdata_i = Signal(AXI_LITE_DATA_WIDTH) # input + self.start_i = Signal() # input + self.busy_o = Signal() # output + self.in_addr_i = Signal(AXI_S_ADDR_WIDTH) # input + self.rw_type_i = Signal() # input + self.out_ready_i = Signal() # input + self.out_valid_o = Signal() # output + self.hit_o = Signal() # output + self.miss_o = Signal() # output + self.prot_o = Signal() # output + self.multi_o = Signal() # output + self.cache_coherent_o = Signal() # output + self.out_addr_o = Signal(AXI_M_ADDR_WIDTH) # output + + def elaborate(self, platform=None): + m = Module() + return m + + +# // Copyright 2018 ETH Zurich and University of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. +# +# //`include "pulp_soc_defines.sv" +# +# ////import CfMath::log2; +# +# //`define MULTI_HIT_FULL_SET // Enable full multi hit detection. Always the entire set is searched. +# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected. 
+# +# //`ifdef MULTI_HIT_FULL_SET +# // `ifndef MULTI_HIT_CUR_CYCLE +# // `define MULTI_HIT_CUR_CYCLE +# // `endif +# //`endif +# +# module l2_tlb +# //#( +# // parameter AXI_S_ADDR_WIDTH = 32, +# // parameter AXI_M_ADDR_WIDTH = 40, +# // parameter AXI_LITE_DATA_WIDTH = 64, +# // parameter AXI_LITE_ADDR_WIDTH = 32, +# // parameter N_SETS = 32, +# // parameter N_OFFSETS = 4, //per port. There are 2 ports. +# // parameter PAGE_SIZE = 4096, // 4kB +# // parameter N_PAR_VA_RAMS = 4, +# // parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH +# // ) +# ( +# input logic clk_i, +# input logic rst_ni, +# +# input logic we_i, +# input logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i, +# input logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i, +# +# input logic start_i, +# output logic busy_o, +# input logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i, +# input logic rw_type_i, //1 => write, 0=> read +# +# input logic out_ready_i, +# output logic out_valid_o, +# output logic hit_o, +# output logic miss_o, +# output logic prot_o, +# output logic multi_o, +# output logic cache_coherent_o, +# output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o +# ); +# +""" #docstring_begin + + localparam VA_RAM_DEPTH = N_SETS * N_OFFSETS * 2; + localparam PA_RAM_DEPTH = VA_RAM_DEPTH * N_PAR_VA_RAMS; + localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH); + localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH); + localparam SET_WIDTH = log2(N_SETS); + localparam OFFSET_WIDTH = log2(N_OFFSETS); + localparam LL_WIDTH = log2(N_PAR_VA_RAMS); + localparam IGNORE_LSB = log2(PAGE_SIZE); + + localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4; + localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB; + + logic [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent; + logic [N_PAR_VA_RAMS-1:0] ram_we; + logic last_search, last_search_next; + logic first_search, first_search_next; + logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr; + logic 
[N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr; + logic pa_ram_we; + logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr; + logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE; + logic [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr + logic [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data + logic pa_ram_store_data_SN, pa_ram_store_data_SP; + logic hit_top, prot_top, multi_hit_top, first_hit_top; + logic output_sent; + int hit_block_num; + + logic searching, search_done; + logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr + logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr + logic [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d; + logic [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr; + logic [SET_WIDTH-1:0] set_num; + + logic va_output_valid; + logic searching_q; + + genvar z; + + // Search FSM + typedef enum logic [1:0] {IDLE, SEARCH, DONE} search_state_t; + search_state_t search_SP; // Present state + search_state_t search_SN; // Next State + + // Output FSM + typedef enum logic [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t; + out_state_t out_SP; // Present state + out_state_t out_SN; // Next State + + logic miss_next; + logic hit_next; + logic prot_next; + logic multi_next; + logic cache_coherent_next; + + // Generate the VA Block rams and their surrounding logic + generate + for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS + check_ram + #( + .ADDR_WIDTH ( AXI_S_ADDR_WIDTH ), + .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ), + .PAGE_SIZE ( PAGE_SIZE ), + .SET_WIDTH ( SET_WIDTH ), + .OFFSET_WIDTH ( OFFSET_WIDTH ) + ) + u_check_ram + ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .in_addr ( in_addr_i ), + .rw_type ( rw_type_i ), + .ram_we ( ram_we[z] ), + .port0_addr ( port0_addr ), + .port1_addr ( port1_addr ), + .ram_wdata ( 
wdata_i[VA_RAM_DATA_WIDTH-1:0] ), + .output_sent ( output_sent ), + .output_valid ( va_output_valid ), + .offset_addr_d ( offset_addr_d ), + .hit_addr ( hit_addr[z] ), + .master ( cache_coherent[z] ), + .hit ( hit[z] ), + .multi_hit ( multi_hit[z] ), + .prot ( prot[z] ) + ); + end // for (z = 0; z < N_PORTS; z++) + endgenerate + + ////////////////// ---------------- Control and Address --------------- //////////////////////// + // FSM + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + search_SP <= IDLE; + end else begin + search_SP <= search_SN; + end + end + + always_comb begin : SEARCH_FSM + search_SN = search_SP; + busy_o = 1'b0; + searching = 1'b0; + search_done = 1'b0; + last_search_next = 1'b0; + first_search_next = first_search; + + unique case (search_SP) + IDLE : begin + if (start_i) begin + search_SN = SEARCH; + first_search_next = 1'b1; + end + end + + SEARCH : begin + busy_o = 1'b1; + + // detect last search cycle + if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) ) + last_search_next = 1'b1; + + // pause search during VA RAM reconfigration + if (|ram_we) begin + searching = 1'b0; + end else begin + searching = 1'b1; + first_search_next = 1'b0; + end + + if (va_output_valid) begin + // stop search +`ifdef MULTI_HIT_FULL_SET + if (last_search | prot_top | multi_hit_top) begin +`else + if (last_search | prot_top | multi_hit_top | hit_top ) begin +`endif + search_SN = DONE; + search_done = 1'b1; + end + end + end + + DONE : begin + busy_o = 1'b1; + if (out_valid_o & out_ready_i) + search_SN = IDLE; + end + + default : begin + search_SN = IDLE; + end + endcase // case (prot_SP) + end // always_comb begin + + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + last_search <= 1'b0; + first_search <= 1'b0; + end else begin + last_search <= last_search_next; + first_search <= first_search_next; + end + end + + /* + * VA RAM address generation + * + * The input address and set number, and thus the offset start address, are 
available in the + * cycle after the start signal. The buffered offset_addr becomes available one cycle later. + * During the first search cycle, we therefore directly use offset_addr_start for the lookup. + */ + assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB]; + + assign port0_raddr[OFFSET_WIDTH] = 1'b0; + assign port1_addr [OFFSET_WIDTH] = 1'b1; + + assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr; + assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr; + + assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num; + assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num; + + assign port0_addr = ram_we ? ram_waddr : port0_raddr; + + // The outputs of the BRAMs are only valid if in the previous cycle: + // 1. the inputs were valid, and + // 2. the BRAMs were not written to. + // Otherwise, the outputs must be ignored. + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + searching_q <= 1'b0; + end else begin + searching_q <= searching; + end + end + assign va_output_valid = searching_q; + + // Address offset for looking up the VA RAMs + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + offset_addr <= 0; + end else if (first_search) begin + offset_addr <= offset_start_addr + 1'b1; + end else if (searching) begin + offset_addr <= offset_addr + 1'b1; + end + end + + // Delayed address offest for looking up the PA RAM upon a hit in the VA RAMs + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + offset_addr_d <= 0; + end else if (first_search) begin + offset_addr_d <= offset_start_addr; + end else if (searching) begin + offset_addr_d <= offset_addr_d + 1'b1; + end + end + + // Store the offset addr for hit to reduce latency for next search. 
+ generate + if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE +`ifndef MULTI_HIT_FULL_SET + logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET. + logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_reg; + + assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} }; + assign offset_end_addr = hit_offset_addr[set_num]-1'b1; + + // Register the hit addr + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + hit_addr_reg <= 0; + end else if (hit_top) begin + hit_addr_reg <= hit_addr[hit_block_num]; + end + end + + // Store hit addr for each set. The next search in the same set will start from the saved addr. + always_ff @(posedge clk_i) begin + if (rst_ni == 0) begin + hit_offset_addr <= 0; + end else if (hit_o) begin + hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)]; + end + end +`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched. + assign offset_start_addr = 0; + assign offset_end_addr = {OFFSET_WIDTH{1'b1}}; +`endif + end else begin // if (HIT_OFFSET_STORE_WIDTH > 0) + assign offset_start_addr = 0; + assign offset_end_addr = {OFFSET_WIDTH{1'b1}}; + end + endgenerate + + assign prot_top = |prot; + + ////////////////////////////////////////////////////////////////////////////////////// + // check for hit, multi hit + // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit. + // In case of a multi hit in the same VA RAM, Port 0 is given priority. 
+ always_comb begin : HIT_CHECK + hit_top = |hit; + hit_block_num = 0; + first_hit_top = 1'b0; + multi_hit_top = 1'b0; + for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin + if (hit[i] == 1'b1) begin +`ifdef MULTI_HIT_CUR_CYCLE + if (multi_hit[i] | first_hit_top ) begin + multi_hit_top = 1'b1; + end +`endif + first_hit_top = 1'b1; + hit_block_num = i; + end + end // for (int i=0; i port1 active + // select = 0 -> port2 active + select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx]; + + p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx]; + p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx]; + + // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary + if (port1_size[idx] == 3'b001) + p1_mask[idx] = 3'b110; + else if (port1_size[idx] == 3'b010) + p1_mask[idx] = 3'b100; + else if (port1_size[idx] == 3'b011) + p1_mask[idx] = 3'b000; + else + p1_mask[idx] = 3'b111; + + p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH]; + p1_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx]; + + if (port2_size[idx] == 3'b001) + p2_mask[idx] = 3'b110; + else if (port2_size[idx] == 3'b010) + p2_mask[idx] = 3'b100; + else if (port2_size[idx] == 3'b011) + p2_mask[idx] = 3'b000; + else + p2_mask[idx] = 3'b111; + + if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}}) + p1_prefetch[idx] = 1'b1; + else + p1_prefetch[idx] = 1'b0; + + if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}}) + p2_prefetch[idx] = 1'b1; + else + p2_prefetch[idx] = 1'b0; + + p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH]; + p2_align_addr[idx][AXI_SIZE_WIDTH-1:0] = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx]; + + p1_max_addr[idx] = p1_align_addr[idx] + p1_burst_size[idx] - 1; + p2_max_addr[idx] = p2_align_addr[idx] + p2_burst_size[idx] - 1; + + int_addr_min[idx] = select[idx] ? 
port1_addr[idx] : port2_addr[idx]; + int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx]; + int_rw[idx] = select[idx] ? port1_type[idx] : port2_type[idx]; + int_id[idx] = select[idx] ? port1_id[idx] : port2_id[idx]; + int_len[idx] = select[idx] ? port1_len[idx] : port2_len[idx]; + int_user[idx] = select[idx] ? port1_user[idx] : port2_user[idx]; + prefetch[idx] = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx]; + + hit [idx] = | hit_slices [idx]; + prot[idx] = | prot_slices[idx]; + + no_hit [idx] = ~hit [idx]; + no_prot[idx] = ~prot[idx]; + + port1_out_addr[idx] = out_addr_reg[idx]; + port2_out_addr[idx] = out_addr_reg[idx]; + + port1_cache_coherent[idx] = cache_coherent_reg[idx]; + port2_cache_coherent[idx] = cache_coherent_reg[idx]; + end + end + + always_comb + begin + var integer idx_port, idx_slice; + var integer reg_num; + reg_num=0; + for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin + for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin + int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num]; + reg_num++; + end + // int_cfg_regs_slices[idx_port][N_SLICES_MAX:N_SLICES[idx_port]] will be dangling + // Fix to zero. Synthesis will remove these signals. + // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0; + end + end + + always @(posedge Clk_CI or negedge Rst_RBI) + begin : PORT_PRIORITY + var integer idx; + if (Rst_RBI == 1'b0) + curr_priority = 'h0; + else begin + for (idx=0; idx= cfg_min) ? 1'b1 : 1'b0; + # assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0; + # assign max_below_max = (in_addr_max <= cfg_max) ? 
1'b1 : 1'b0; + # assign out_hit = cfg_en & min_above_min & min_below_max & max_below_max; + # assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren)); + # assign out_addr = in_addr_min - cfg_min + cfg_offset; + m.d.comb += [ + min_above_min.eq(self.in_addr_min >= self.cfg_min), + min_below_max.eq(self.in_addr_min <= self.cfg_max), + max_below_max.eq(self.in_addr_max <= self.cfg_max), + self.out_hit.eq(self.cfg_en & min_above_min & + min_below_max & max_below_max), + self.out_prot.eq(self.out_hit & ( + (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))), + self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset) + ] + + return m diff --git a/src/soc/iommu/axi_rab/ram_tp_no_change.py b/src/soc/iommu/axi_rab/ram_tp_no_change.py new file mode 100644 index 00000000..bdcd5550 --- /dev/null +++ b/src/soc/iommu/axi_rab/ram_tp_no_change.py @@ -0,0 +1,97 @@ +# // Copyright 2018 ETH Zurich and University of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. +# +# /* +# * ram_tp_no_change +# * +# * This code implements a parameterizable two-port memory. Port 0 can read and +# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with +# * Port 0 in "no change" mode, i.e., during a write, it retains the last read +# * value on the output. Port 1 (read-only) is in "write first" mode. 
Still, it +# * outputs the old data during the write cycle. Note: Port 1 outputs invalid +# * data in the cycle after the write when reading the same address. +# * +# * For more information, see Xilinx PG058 Block Memory Generator Product Guide. +# */ + +from nmigen import Signal, Module, Const, Cat, Elaboratable +from nmigen import Memory + +import math + +# +# module ram_tp_no_change +# #( +ADDR_WIDTH = 10 +DATA_WIDTH = 36 +# ) +# ( +# input clk, +# input we, +# input [ADDR_WIDTH-1:0] addr0, +# input [ADDR_WIDTH-1:0] addr1, +# input [DATA_WIDTH-1:0] d_i, +# output [DATA_WIDTH-1:0] d0_o, +# output [DATA_WIDTH-1:0] d1_o +# ); + + +class ram_tp_no_change(Elaboratable): + + def __init__(self): + self.we = Signal() # input + self.addr0 = Signal(ADDR_WIDTH) # input + self.addr1 = Signal(ADDR_WIDTH) # input + self.d_i = Signal(DATA_WIDTH) # input + self.d0_o = Signal(DATA_WIDTH) # output + self.d1_o = Signal(DATA_WIDTH) # output + + DEPTH = int(math.pow(2, ADDR_WIDTH)) + self.ram = Memory(DATA_WIDTH, DEPTH) + # + # localparam DEPTH = 2**ADDR_WIDTH; + # + # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH]; + # reg [DATA_WIDTH-1:0] d0; + # reg [DATA_WIDTH-1:0] d1; + # + # always_ff @(posedge clk) begin + # if(we == 1'b1) begin + # ram[addr0] <= d_i; + # end else begin + # only change data if we==false + # d0 <= ram[addr0]; + # end + # d1 <= ram[addr1]; + # end + # + # assign d0_o = d0; + # assign d1_o = d1; + # + + def elaborate(self, platform=None): + m = Module() + m.submodules.read_ram0 = read_ram0 = self.ram.read_port() + m.submodules.read_ram1 = read_ram1 = self.ram.read_port() + m.submodules.write_ram = write_ram = self.ram.write_port() + + # write port + m.d.comb += write_ram.en.eq(self.we) + m.d.comb += write_ram.addr.eq(self.addr0) + m.d.comb += write_ram.data.eq(self.d_i) + + # read ports + m.d.comb += read_ram0.addr.eq(self.addr0) + m.d.comb += read_ram1.addr.eq(self.addr1) + with m.If(self.we == 0): + m.d.sync += self.d0_o.eq(read_ram0.data) + 
m.d.sync += self.d1_o.eq(read_ram1.data) + + return m diff --git a/src/soc/iommu/axi_rab/ram_tp_write_first.py b/src/soc/iommu/axi_rab/ram_tp_write_first.py new file mode 100644 index 00000000..7a21969c --- /dev/null +++ b/src/soc/iommu/axi_rab/ram_tp_write_first.py @@ -0,0 +1,93 @@ +# // Copyright 2018 ETH Zurich and University of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. +# +# /* +# * ram_tp_write_first +# * +# * This code implements a parameterizable two-port memory. Port 0 can read and +# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in +# * "write first" mode, i.e., upon a read and write to the same address, the +# * new value is read. Note: Port 1 outputs invalid data in the cycle after +# * the write when reading the same address. +# * +# * For more information, see Xilinx PG058 Block Memory Generator Product Guide. 
+# */ + +from nmigen import Signal, Module, Const, Cat, Elaboratable +from nmigen import Memory + +import math +# +# module ram_tp_write_first +# #( +ADDR_WIDTH = 10 +DATA_WIDTH = 36 +# ) +# ( +# input clk, +# input we, +# input [ADDR_WIDTH-1:0] addr0, +# input [ADDR_WIDTH-1:0] addr1, +# input [DATA_WIDTH-1:0] d_i, +# output [DATA_WIDTH-1:0] d0_o, +# output [DATA_WIDTH-1:0] d1_o +# ); + + +class ram_tp_write_first(Elaboratable): + + def __init__(self): + self.we = Signal() # input + self.addr0 = Signal(ADDR_WIDTH) # input + self.addr1 = Signal(ADDR_WIDTH) # input + self.d_i = Signal(DATA_WIDTH) # input + self.d0_o = Signal(DATA_WIDTH) # output + self.d1_o = Signal(DATA_WIDTH) # output + + DEPTH = int(math.pow(2, ADDR_WIDTH)) + self.ram = Memory(DATA_WIDTH, DEPTH) + + # + # localparam DEPTH = 2**ADDR_WIDTH; + # + # (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH]; + # reg [ADDR_WIDTH-1:0] raddr0; + # reg [ADDR_WIDTH-1:0] raddr1; + # + # always_ff @(posedge clk) begin + # if(we == 1'b1) begin + # ram[addr0] <= d_i; + # end + # raddr0 <= addr0; + # raddr1 <= addr1; + # end + # + # assign d0_o = ram[raddr0]; + # assign d1_o = ram[raddr1]; + # + + def elaborate(self, platform=None): + m = Module() + m.submodules.read_ram0 = read_ram0 = self.ram.read_port() + m.submodules.read_ram1 = read_ram1 = self.ram.read_port() + m.submodules.write_ram = write_ram = self.ram.write_port() + + # write port + m.d.comb += write_ram.en.eq(self.we) + m.d.comb += write_ram.addr.eq(self.addr0) + m.d.comb += write_ram.data.eq(self.d_i) + + # read ports + m.d.comb += read_ram0.addr.eq(self.addr0) + m.d.comb += read_ram1.addr.eq(self.addr1) + m.d.sync += self.d0_o.eq(read_ram0.data) + m.d.sync += self.d1_o.eq(read_ram1.data) + + return m diff --git a/src/soc/iommu/axi_rab/slice_top.py b/src/soc/iommu/axi_rab/slice_top.py new file mode 100644 index 00000000..6eedb1cd --- /dev/null +++ b/src/soc/iommu/axi_rab/slice_top.py @@ -0,0 +1,141 @@ +# // Copyright 2018 ETH Zurich and University 
of Bologna. +# // Copyright and related rights are licensed under the Solderpad Hardware +# // License, Version 0.51 (the "License"); you may not use this file except in +# // compliance with the License. You may obtain a copy of the License at +# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +# // or agreed to in writing, software, hardware and materials distributed under +# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# // CONDITIONS OF ANY KIND, either express or implied. See the License for the +# // specific language governing permissions and limitations under the License. + +# this file has been generated by sv2nmigen + +from nmigen import Signal, Module, Const, Cat, Elaboratable +import rab_slice +import coreconfig + +# +# module slice_top +# //#( +# // parameter N_SLICES = 16, +# // parameter N_REGS = 4*N_SLICES, +# // parameter ADDR_WIDTH_PHYS = 40, +# // parameter ADDR_WIDTH_VIRT = 32 +# // ) +# ( +# input logic [N_REGS-1:0] [63:0] int_cfg_regs, +# input logic int_rw, +# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_min, +# input logic [ADDR_WIDTH_VIRT-1:0] int_addr_max, +# input logic multi_hit_allow, +# output logic multi_hit, +# output logic [N_SLICES-1:0] prot, +# output logic [N_SLICES-1:0] hit, +# output logic cache_coherent, +# output logic [ADDR_WIDTH_PHYS-1:0] out_addr +# ); +# + + +class slice_top(Elaboratable): + + def __init__(self): + # FIXME self.int_cfg_regs = Signal() # input + self.params = coreconfig.CoreConfig() # rename ? 
+ self.int_rw = Signal() # input + self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT) # input + self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT) # input + self.multi_hit_allow = Signal() # input + self.multi_hit = Signal() # output + self.prot = Signal(self.params.N_SLICES) # output + self.hit = Signal(self.params.N_SLICES) # output + self.cache_coherent = Signal() # output + self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS) # output + + def elaborate(self, platform=None): + m = Module() + + first_hit = Signal() + + for i in range(self.params.N_SLICES): + # TODO pass params / core config here + u_slice = rab_slice.rab_slice(self.params) + setattr(m.submodules, "u_slice%d" % i, u_slice) + # TODO set param and connect ports + + # In case of a multi hit, the lowest slice with a hit is selected. + # TODO always_comb begin : HIT_CHECK + m.d.comb += [ + first_hit.eq(0), + self.multi_hit.eq(0), + self.out_addr.eq(0), + self.cache_coherent.eq(0)] + + for j in range(self.params.N_SLICES): + with m.If(self.hit[j] == 1): + with m.If(first_hit == 1): + with m.If(self.multi_hit_allow == 0): + m.d.comb += [self.multi_hit.eq(1)] + with m.Elif(first_hit == 1): + m.d.comb += [first_hit.eq(1) + # only output first slice that was hit + # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]), + # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]), + ] + return m + + # TODO translate generate statement + + +""" + logic [ADDR_WIDTH_PHYS*N_SLICES-1:0] slice_out_addr; + + generate + for ( i=0; i I have used bits <11:6> as they are not translated (4KB pages) +> and larger than a cache line (64 bytes). +> I have used bits <11:4> when the L1 cache was QuadW sized and +> the L2 cache was Line sized. 
+""" + +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Const, Array, Cat, Elaboratable + +from nmutil.latch import latchregister, SRLatch + + +class PartialAddrMatch(Elaboratable): + """A partial address matcher + """ + def __init__(self, n_adr, bitwid): + self.n_adr = n_adr + self.bitwid = bitwid + # inputs + self.addrs_i = Array(Signal(bitwid, name="addr") for i in range(n_adr)) + self.addr_we_i = Signal(n_adr) # write-enable for incoming address + self.addr_en_i = Signal(n_adr) # address latched in + self.addr_rs_i = Signal(n_adr) # address deactivated + + # output + self.addr_nomatch_o = Signal(n_adr, name="nomatch_o") + self.addr_nomatch_a_o = Array(Signal(n_adr, name="nomatch_array_o") \ + for i in range(n_adr)) + + def elaborate(self, platform): + m = Module() + return self._elaborate(m, platform) + + def _elaborate(self, m, platform): + comb = m.d.comb + sync = m.d.sync + + m.submodules.l = l = SRLatch(llen=self.n_adr, sync=False) + addrs_r = Array(Signal(self.bitwid, name="a_r") \ + for i in range(self.n_adr)) + + # latch set/reset + comb += l.s.eq(self.addr_en_i) + comb += l.r.eq(self.addr_rs_i) + + # copy in addresses (and "enable" signals) + for i in range(self.n_adr): + latchregister(m, self.addrs_i[i], addrs_r[i], l.q[i]) + + # is there a clash, yes/no + matchgrp = [] + for i in range(self.n_adr): + match = [] + for j in range(self.n_adr): + if i == j: + match.append(Const(0)) # don't match against self! 
+ else: + match.append(addrs_r[i] == addrs_r[j]) + comb += self.addr_nomatch_a_o[i].eq(~Cat(*match) & l.q) + matchgrp.append(self.addr_nomatch_a_o[i] == l.q) + comb += self.addr_nomatch_o.eq(Cat(*matchgrp) & l.q) + + return m + + def __iter__(self): + yield from self.addrs_i + yield self.addr_we_i + yield self.addr_en_i + yield from self.addr_nomatch_a_o + yield self.addr_nomatch_o + + def ports(self): + return list(self) + + +def part_addr_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_part_addr(): + dut = PartialAddrMatch(3, 10) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_part_addr.il", "w") as f: + f.write(vl) + + run_simulation(dut, part_addr_sim(dut), vcd_name='test_part_addr.vcd') + +if __name__ == '__main__': + test_part_addr() diff --git a/src/soc/scoreboard/dependence_cell.py b/src/soc/scoreboard/dependence_cell.py new file mode 100644 index 00000000..16108229 --- /dev/null +++ b/src/soc/scoreboard/dependence_cell.py @@ -0,0 +1,169 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl +from nmutil.latch import SRLatch +from functools import reduce +from operator import or_ + + +class DependencyRow(Elaboratable): + """ implements 11.4.7 mitch alsup dependence cell, p27 + adjusted to be clock-sync'd on rising edge only. 
+ mitch design (as does 6600) requires alternating rising/falling clock + + * SET mode: issue_i HI, go_i LO, reg_i HI - register is captured + - FWD is DISABLED (~issue_i) + - RSEL DISABLED + * QRY mode: issue_i LO, go_i LO, haz_i HI - FWD is ASSERTED + reg_i HI - ignored + * GO mode : issue_i LO, go_i HI - RSEL is ASSERTED + haz_i HI - FWD still can be ASSERTED + + FWD assertion (hazard protection) therefore still occurs in both + Query and Go Modes, for this cycle, due to the cq register + + GO mode works for one cycle, again due to the cq register capturing + the latch output. Without the cq register, the SR Latch (which is + asynchronous) would be reset at the exact moment that GO was requested, + and the RSEL would be garbage. + """ + def __init__(self, n_reg, n_src, cancel_mode=False): + self.cancel_mode = cancel_mode + self.n_reg = n_reg + self.n_src = n_src + # arrays + src = [] + rsel = [] + fwd = [] + for i in range(n_src): + j = i + 1 # name numbering to match src1/src2 + src.append(Signal(n_reg, name="src%d" % j, reset_less=True)) + rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True)) + fwd.append(Signal(n_reg, name="src%d_fwd_o" % j, reset_less=True)) + + # inputs + self.dest_i = Signal(n_reg, reset_less=True) # Dest in (top) + self.src_i = Array(src) # operands in (top) + self.issue_i = Signal(reset_less=True) # Issue in (top) + + self.rd_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top) + self.wr_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top) + self.v_rd_rsel_o = Signal(n_reg, reset_less=True) # Read pend out (bot) + self.v_wr_rsel_o = Signal(n_reg, reset_less=True) # Write pend out (bot) + + self.go_wr_i = Signal(reset_less=True) # Go Write in (left) + self.go_rd_i = Signal(reset_less=True) # Go Read in (left) + if self.cancel_mode: + self.go_die_i = Signal(n_reg, reset_less=True) # Go Die in (left) + else: + self.go_die_i = Signal(reset_less=True) # Go Die in (left) + + # for Register File Select Lines 
(vertical) + self.dest_rsel_o = Signal(n_reg, reset_less=True) # dest reg sel (bot) + self.src_rsel_o = Array(rsel) # src reg sel (bot) + self.src2_rsel_o = Signal(n_reg, reset_less=True) # src2 reg sel (bot) + + # for Function Unit "forward progress" (horizontal) + self.dest_fwd_o = Signal(n_reg, reset_less=True) # dest FU fw (right) + self.src_fwd_o = Array(fwd) # src FU fw (right) + + def elaborate(self, platform): + m = Module() + m.submodules.dest_c = dest_c = SRLatch(sync=False, llen=self.n_reg) + src_c = [] + for i in range(self.n_src): + src_l = SRLatch(sync=False, llen=self.n_reg) + setattr(m.submodules, "src%d_c" % (i+1), src_l) + src_c.append(src_l) + + # connect go_rd / go_wr (dest->wr, src->rd) + wr_die = Signal(self.n_reg, reset_less=True) + rd_die = Signal(self.n_reg, reset_less=True) + if self.cancel_mode: + go_die = self.go_die_i + else: + go_die = Repl(self.go_die_i, self.n_reg) + m.d.comb += wr_die.eq(Repl(self.go_wr_i, self.n_reg) | go_die) + m.d.comb += rd_die.eq(Repl(self.go_rd_i, self.n_reg) | go_die) + m.d.comb += dest_c.r.eq(wr_die) + for i in range(self.n_src): + m.d.comb += src_c[i].r.eq(rd_die) + + # connect input reg bit (unary) + i_ext = Repl(self.issue_i, self.n_reg) + m.d.comb += dest_c.s.eq(i_ext & self.dest_i) + for i in range(self.n_src): + m.d.comb += src_c[i].s.eq(i_ext & self.src_i[i]) + + # connect up hazard checks: read-after-write and write-after-read + m.d.comb += self.dest_fwd_o.eq(dest_c.q & self.rd_pend_i) + for i in range(self.n_src): + m.d.comb += self.src_fwd_o[i].eq(src_c[i].q & self.wr_pend_i) + + # connect reg-sel outputs + rd_ext = Repl(self.go_rd_i, self.n_reg) + wr_ext = Repl(self.go_wr_i, self.n_reg) + m.d.comb += self.dest_rsel_o.eq(dest_c.qlq & wr_ext) + for i in range(self.n_src): + m.d.comb += self.src_rsel_o[i].eq(src_c[i].qlq & rd_ext) + + # to be accumulated to indicate if register is in use (globally) + # after ORing, is fed back in to rd_pend_i / wr_pend_i + src_q = [] + for i in range(self.n_src): + 
src_q.append(src_c[i].qlq) + m.d.comb += self.v_rd_rsel_o.eq(reduce(or_, src_q)) + m.d.comb += self.v_wr_rsel_o.eq(dest_c.qlq) + + return m + + def __iter__(self): + yield self.dest_i + yield from self.src_i + yield self.rd_pend_i + yield self.wr_pend_i + yield self.issue_i + yield self.go_wr_i + yield self.go_rd_i + yield self.go_die_i + yield self.dest_rsel_o + yield from self.src_rsel_o + yield self.dest_fwd_o + yield from self.src_fwd_o + + def ports(self): + return list(self) + + +def dcell_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_dcell(): + dut = DependencyRow(4, 2, True) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_drow.il", "w") as f: + f.write(vl) + + run_simulation(dut, dcell_sim(dut), vcd_name='test_dcell.vcd') + +if __name__ == '__main__': + test_dcell() diff --git a/src/soc/scoreboard/fn_unit.py b/src/soc/scoreboard/fn_unit.py new file mode 100644 index 00000000..63beb70b --- /dev/null +++ b/src/soc/scoreboard/fn_unit.py @@ -0,0 +1,321 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Cat, Array, Const, Elaboratable +from nmigen.lib.coding import Decoder + +from nmutil.latch import SRLatch, latchregister + +from scoreboard.shadow import Shadow + + +class FnUnit(Elaboratable): + """ implements 11.4.8 function unit, p31 + also implements optional shadowing 11.5.1, p55 + + shadowing can be used for branches as well as exceptions (interrupts), + load/store hold (exceptions again), and vector-element predication + (once the predicate is known, which it may not be at instruction issue) + + Inputs + + * :wid: register file width + * :shadow_wid: number of 
shadow/fail/good/go_die sets + * :n_dests: number of destination regfile(s) (index: rfile_sel_i) + * :wr_pend: if true, writable observes the g_wr_pend_i vector + otherwise observes g_rd_pend_i + + notes: + + * dest_i / src1_i / src2_i are in *binary*, whereas... + * ...g_rd_pend_i / g_wr_pend_i and rd_pend_o / wr_pend_o are UNARY + * req_rel_i (request release) is the direct equivalent of pipeline + "output valid" (valid_o) + * recover is a local python variable (actually go_die_o) + * when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing) + * wr_pend is set False for the majority of uses: however for + use in a STORE Function Unit it is set to True + """ + def __init__(self, wid, shadow_wid=0, n_dests=1, wr_pend=False): + self.reg_width = wid + self.n_dests = n_dests + self.shadow_wid = shadow_wid + self.wr_pend = wr_pend + + # inputs + if n_dests > 1: + self.rfile_sel_i = Signal(max=n_dests, reset_less=True) + else: + self.rfile_sel_i = Const(0) # no selection. gets Array[0] + self.dest_i = Signal(max=wid, reset_less=True) # Dest R# in (top) + self.src1_i = Signal(max=wid, reset_less=True) # oper1 R# in (top) + self.src2_i = Signal(max=wid, reset_less=True) # oper2 R# in (top) + self.issue_i = Signal(reset_less=True) # Issue in (top) + + self.go_wr_i = Signal(reset_less=True) # Go Write in (left) + self.go_rd_i = Signal(reset_less=True) # Go Read in (left) + self.req_rel_i = Signal(reset_less=True) # request release (left) + + self.g_xx_pend_i = Array(Signal(wid, reset_less=True, name="g_pend_i") \ + for i in range(n_dests)) # global rd (right) + self.g_wr_pend_i = Signal(wid, reset_less=True) # global wr (right) + + if shadow_wid: + self.shadow_i = Signal(shadow_wid, reset_less=True) + self.s_fail_i = Signal(shadow_wid, reset_less=True) + self.s_good_i = Signal(shadow_wid, reset_less=True) + self.go_die_o = Signal(reset_less=True) + + # outputs + self.readable_o = Signal(reset_less=True) # Readable out (right) + self.writable_o = 
Array(Signal(reset_less=True, name="writable_o") \ + for i in range(n_dests)) # writable out (right) + self.busy_o = Signal(reset_less=True) # busy out (left) + + self.src1_pend_o = Signal(wid, reset_less=True) # src1 pending + self.src2_pend_o = Signal(wid, reset_less=True) # src1 pending + self.rd_pend_o = Signal(wid, reset_less=True) # rd pending (right) + self.xx_pend_o = Array(Signal(wid, reset_less=True, name="pend_o") \ + for i in range(n_dests))# wr pending (right) + + def elaborate(self, platform): + m = Module() + m.submodules.rd_l = rd_l = SRLatch(sync=False) + m.submodules.wr_l = wr_l = SRLatch(sync=False) + m.submodules.dest_d = dest_d = Decoder(self.reg_width) + m.submodules.src1_d = src1_d = Decoder(self.reg_width) + m.submodules.src2_d = src2_d = Decoder(self.reg_width) + + # shadow / recover (optional: shadow_wid > 0) + m.submodules.shadow = shadow = Shadow(self.shadow_wid) + if self.shadow_wid: + m.d.comb += shadow.issue_i.eq(self.issue_i) + m.d.comb += shadow.s_fail_i.eq(self.s_fail_i) + m.d.comb += shadow.s_good_i.eq(self.s_good_i) + m.d.comb += shadow.shadow_i.eq(self.shadow_i) + shadown = shadow.shadown_o + recover = shadow.go_die_o + + # selector + xx_pend_o = self.xx_pend_o[self.rfile_sel_i] + writable_o = self.writable_o[self.rfile_sel_i] + g_pend_i = self.g_xx_pend_i[self.rfile_sel_i] + + for i in range(self.n_dests): + m.d.comb += self.xx_pend_o[i].eq(0) # initialise all array + m.d.comb += self.writable_o[i].eq(0) # to zero + m.d.comb += self.readable_o.eq(0) # to zero + + # go_wr latch: reset on go_wr HI, set on issue + m.d.comb += wr_l.s.eq(self.issue_i) + m.d.comb += wr_l.r.eq(self.go_wr_i | recover) + + # src1 latch: reset on go_rd HI, set on issue + m.d.comb += rd_l.s.eq(self.issue_i) + m.d.comb += rd_l.r.eq(self.go_rd_i | recover) + + # latch/registers for dest / src1 / src2 + dest_r = Signal(max=self.reg_width, reset_less=True) + src1_r = Signal(max=self.reg_width, reset_less=True) + src2_r = Signal(max=self.reg_width, 
reset_less=True) + # XXX latch based on *issue* rather than !latch (as in book) + latchregister(m, self.dest_i, dest_r, self.issue_i) #wr_l.qn) + latchregister(m, self.src1_i, src1_r, self.issue_i) #wr_l.qn) + latchregister(m, self.src2_i, src2_r, self.issue_i) #wr_l.qn) + + # dest decoder (use dest reg as input): write-pending out + m.d.comb += dest_d.i.eq(dest_r) + m.d.comb += dest_d.n.eq(wr_l.qn) # decode is inverted + m.d.comb += self.busy_o.eq(wr_l.q) # busy if set + m.d.comb += xx_pend_o.eq(dest_d.o) + + # src1/src2 decoder (use src1/2 regs as input): read-pending out + m.d.comb += src1_d.i.eq(src1_r) + m.d.comb += src1_d.n.eq(rd_l.qn) # decode is inverted + m.d.comb += src2_d.i.eq(src2_r) + m.d.comb += src2_d.n.eq(rd_l.qn) # decode is inverted + m.d.comb += self.src1_pend_o.eq(src1_d.o) + m.d.comb += self.src2_pend_o.eq(src2_d.o) + m.d.comb += self.rd_pend_o.eq(src1_d.o | src2_d.o) + + # readable output signal + g_rd = Signal(self.reg_width, reset_less=True) + ro = Signal(reset_less=True) + m.d.comb += g_rd.eq(~self.g_wr_pend_i & self.rd_pend_o) + m.d.comb += ro.eq(~g_rd.bool()) + m.d.comb += self.readable_o.eq(ro) + + # writable output signal + g_wr_v = Signal(self.reg_width, reset_less=True) + g_wr = Signal(reset_less=True) + wo = Signal(reset_less=True) + m.d.comb += g_wr_v.eq(g_pend_i & xx_pend_o) + m.d.comb += g_wr.eq(~g_wr_v.bool()) + m.d.comb += wo.eq(g_wr & rd_l.qn & self.req_rel_i & shadown) + m.d.comb += writable_o.eq(wo) + + return m + + def __iter__(self): + yield self.dest_i + yield self.src1_i + yield self.src2_i + yield self.issue_i + yield self.go_wr_i + yield self.go_rd_i + yield self.req_rel_i + yield from self.g_xx_pend_i + yield self.g_wr_pend_i + yield self.readable_o + yield from self.writable_o + yield self.rd_pend_o + yield from self.xx_pend_o + + def ports(self): + return list(self) + +############# ############### +# --- --- # +# --- renamed / redirected from base class --- # +# --- --- # +# --- below are convenience classes which 
match the names --- # +# --- of the various mitch alsup book chapter gate diagrams --- # +# --- --- # +############# ############### + + +class IntFnUnit(FnUnit): + def __init__(self, wid, shadow_wid=0): + FnUnit.__init__(self, wid, shadow_wid) + self.int_rd_pend_o = self.rd_pend_o + self.int_wr_pend_o = self.xx_pend_o[0] + self.g_int_wr_pend_i = self.g_wr_pend_i + self.g_int_rd_pend_i = self.g_xx_pend_i[0] + self.int_readable_o = self.readable_o + self.int_writable_o = self.writable_o[0] + + self.int_rd_pend_o.name = "int_rd_pend_o" + self.int_wr_pend_o.name = "int_wr_pend_o" + self.g_int_rd_pend_i.name = "g_int_rd_pend_i" + self.g_int_wr_pend_i.name = "g_int_wr_pend_i" + self.int_readable_o.name = "int_readable_o" + self.int_writable_o.name = "int_writable_o" + + +class FPFnUnit(FnUnit): + def __init__(self, wid, shadow_wid=0): + FnUnit.__init__(self, wid, shadow_wid) + self.fp_rd_pend_o = self.rd_pend_o + self.fp_wr_pend_o = self.xx_pend_o[0] + self.g_fp_wr_pend_i = self.g_wr_pend_i + self.g_fp_rd_pend_i = self.g_xx_pend_i[0] + self.fp_writable_o = self.writable_o[0] + self.fp_readable_o = self.readable_o + + self.fp_rd_pend_o.name = "fp_rd_pend_o" + self.fp_wr_pend_o.name = "fp_wr_pend_o" + self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i" + self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i" + self.fp_writable_o.name = "fp_writable_o" + self.fp_readable_o.name = "fp_readable_o" + + +class LDFnUnit(FnUnit): + """ number of dest selectors: 2. 
assumes len(int_regfile) == len(fp_regfile) + * when rfile_sel_i == 0, int_wr_pend_o is set + * when rfile_sel_i == 1, fp_wr_pend_o is set + """ + def __init__(self, wid, shadow_wid=0): + FnUnit.__init__(self, wid, shadow_wid, n_dests=2) + self.int_rd_pend_o = self.rd_pend_o + self.int_wr_pend_o = self.xx_pend_o[0] + self.fp_wr_pend_o = self.xx_pend_o[1] + self.g_int_wr_pend_i = self.g_wr_pend_i + self.g_int_rd_pend_i = self.g_xx_pend_i[0] + self.g_fp_rd_pend_i = self.g_xx_pend_i[1] + self.int_readable_o = self.readable_o + self.int_writable_o = self.writable_o[0] + self.fp_writable_o = self.writable_o[1] + + self.int_rd_pend_o.name = "int_rd_pend_o" + self.int_wr_pend_o.name = "int_wr_pend_o" + self.fp_wr_pend_o.name = "fp_wr_pend_o" + self.g_int_wr_pend_i.name = "g_int_wr_pend_i" + self.g_int_rd_pend_i.name = "g_int_rd_pend_i" + self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i" + self.int_readable_o.name = "int_readable_o" + self.int_writable_o.name = "int_writable_o" + self.fp_writable_o.name = "fp_writable_o" + + +class STFnUnit(FnUnit): + """ number of dest selectors: 2. 
assumes len(int_regfile) == len(fp_regfile) + * wr_pend=False indicates to observe global fp write pending + * when rfile_sel_i == 0, int_wr_pend_o is set + * when rfile_sel_i == 1, fp_wr_pend_o is set + * + """ + def __init__(self, wid, shadow_wid=0): + FnUnit.__init__(self, wid, shadow_wid, n_dests=2, wr_pend=True) + self.int_rd_pend_o = self.rd_pend_o # 1st int read-pending vector + self.int2_rd_pend_o = self.xx_pend_o[0] # 2nd int read-pending vector + self.fp_rd_pend_o = self.xx_pend_o[1] # 1x FP read-pending vector + # yes overwrite FnUnit base class g_wr_pend_i vector + self.g_int_wr_pend_i = self.g_wr_pend_i = self.g_xx_pend_i[0] + self.g_fp_wr_pend_i = self.g_xx_pend_i[1] + self.int_readable_o = self.readable_o + self.int_writable_o = self.writable_o[0] + self.fp_writable_o = self.writable_o[1] + + self.int_rd_pend_o.name = "int_rd_pend_o" + self.int2_rd_pend_o.name = "int2_rd_pend_o" + self.fp_rd_pend_o.name = "fp_rd_pend_o" + self.g_int_wr_pend_i.name = "g_int_wr_pend_i" + self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i" + self.int_readable_o.name = "int_readable_o" + self.int_writable_o.name = "int_writable_o" + self.fp_writable_o.name = "fp_writable_o" + + + +def int_fn_unit_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_int_fn_unit(): + dut = FnUnit(32, 2, 2) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_fn_unit.il", "w") as f: + f.write(vl) + + dut = LDFnUnit(32, 2) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_ld_fn_unit.il", "w") as f: + f.write(vl) + + dut = STFnUnit(32, 0) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_st_fn_unit.il", "w") as f: + f.write(vl) + + run_simulation(dut, 
int_fn_unit_sim(dut), vcd_name='test_fn_unit.vcd') + +if __name__ == '__main__': + test_int_fn_unit() diff --git a/src/soc/scoreboard/fu_dep_cell.py b/src/soc/scoreboard/fu_dep_cell.py new file mode 100644 index 00000000..9946dcb5 --- /dev/null +++ b/src/soc/scoreboard/fu_dep_cell.py @@ -0,0 +1,92 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Const, Elaboratable +from nmutil.latch import SRLatch + + +class FUDependenceCell(Elaboratable): + """ implements 11.4.7 mitch alsup dependence cell, p27 + """ + def __init__(self, dummy, n_fu=1): + self.n_fu = n_fu + self.dummy = Const(~(1< + self.rd_pend_i = Signal(n_fu_row, reset_less=True) # Rd pending (left) + self.wr_pend_i = Signal(n_fu_row, reset_less=True) # Wr pending (left) + self.issue_i = Signal(n_fu_col, reset_less=True) # Issue in (top) + + self.go_wr_i = Signal(n_fu_row, reset_less=True) # Go Write in (left) + self.go_rd_i = Signal(n_fu_row, reset_less=True) # Go Read in (left) + self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) + + # for Function Unit Readable/Writable (horizontal) + self.readable_o = Signal(n_fu_col, reset_less=True) # readable (bot) + self.writable_o = Signal(n_fu_col, reset_less=True) # writable (bot) + + def elaborate(self, platform): + m = Module() + + # --- + # matrix of dependency cells + # --- + dm = Array(FUDependenceCell(f, self.n_fu_col) \ + for f in range(self.n_fu_row)) + for y in range(self.n_fu_row): + setattr(m.submodules, "dm%d" % y, dm[y]) + + # --- + # array of Function Unit Readable/Writable: row-length, horizontal + # --- + fur = Array(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col)) + for x in range(self.n_fu_col): + setattr(m.submodules, "fur_x%d" % (x), fur[x]) + + # --- + # connect FU Readable/Writable vector + # --- + readable = [] + writable = [] + for y in range(self.n_fu_row): + fu = fur[y] + # accumulate Readable/Writable Vector outputs + 
readable.append(fu.readable_o) + writable.append(fu.writable_o) + + # ... and output them from this module (horizontal, width=REGs) + m.d.comb += self.readable_o.eq(Cat(*readable)) + m.d.comb += self.writable_o.eq(Cat(*writable)) + + # --- + # connect FU Pending + # --- + for y in range(self.n_fu_row): + dc = dm[y] + fu = fur[y] + # connect cell reg-select outputs to Reg Vector In + m.d.comb += [fu.rd_pend_i.eq(dc.rd_wait_o), + fu.wr_pend_i.eq(dc.wr_wait_o), + ] + + # --- + # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i + # --- + for x in range(self.n_fu_col): + issue_i = [] + for y in range(self.n_fu_row): + dc = dm[y] + # accumulate cell inputs issue + issue_i.append(dc.issue_i[x]) + # wire up inputs from module to row cell inputs + m.d.comb += Cat(*issue_i).eq(self.issue_i) + + # --- + # connect Matrix go_rd_i/go_wr_i to module readable/writable + # --- + for y in range(self.n_fu_row): + dc = dm[y] + # wire up inputs from module to row cell inputs + m.d.comb += [dc.go_rd_i.eq(self.go_rd_i), + dc.go_wr_i.eq(self.go_wr_i), + dc.go_die_i.eq(self.go_die_i), + ] + + # --- + # connect Matrix pending + # --- + for y in range(self.n_fu_row): + dc = dm[y] + # wire up inputs from module to row cell inputs + m.d.comb += [dc.rd_pend_i.eq(self.rd_pend_i), + dc.wr_pend_i.eq(self.wr_pend_i), + ] + + return m + + def __iter__(self): + yield self.rd_pend_i + yield self.wr_pend_i + yield self.issue_i + yield self.go_wr_i + yield self.go_rd_i + yield self.readable_o + yield self.writable_o + + def ports(self): + return list(self) + +def d_matrix_sim(dut): + """ XXX TODO + """ + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_fu_fu_matrix(): + dut = FUFUDepMatrix(n_fu_row=3, 
n_fu_col=4) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_fu_fu_matrix.il", "w") as f: + f.write(vl) + + run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_fu_matrix.vcd') + +if __name__ == '__main__': + test_fu_fu_matrix() diff --git a/src/soc/scoreboard/fu_mem_matrix.py b/src/soc/scoreboard/fu_mem_matrix.py new file mode 100644 index 00000000..baaa02be --- /dev/null +++ b/src/soc/scoreboard/fu_mem_matrix.py @@ -0,0 +1,155 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Elaboratable, Array, Cat, Const + +from scoreboard.fumem_dep_cell import FUMemDependenceCell +from scoreboard.fu_mem_picker_vec import FUMem_Pick_Vec + +""" + + 6600 Function Unit Dependency Table Matrix inputs / outputs + ----------------------------------------------------------- + +""" + +class FUMemDepMatrix(Elaboratable): + """ implements FU-to-FU Memory Dependency Matrix + """ + def __init__(self, n_fu_row, n_fu_col): + self.n_fu_row = n_fu_row # Y (FU row#) ^v + self.n_fu_col = n_fu_col # X (FU col #) <> + self.st_pend_i = Signal(n_fu_row, reset_less=True) # Rd pending (left) + self.ld_pend_i = Signal(n_fu_row, reset_less=True) # Wr pending (left) + self.issue_i = Signal(n_fu_col, reset_less=True) # Issue in (top) + + self.go_ld_i = Signal(n_fu_row, reset_less=True) # Go Write in (left) + self.go_st_i = Signal(n_fu_row, reset_less=True) # Go Read in (left) + self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) + + # for Function Unit Readable/Writable (horizontal) + self.storable_o = Signal(n_fu_col, reset_less=True) # storable (bot) + self.loadable_o = Signal(n_fu_col, reset_less=True) # loadable (bot) + + def elaborate(self, platform): + m = Module() + + # --- + # matrix of dependency cells + # --- + dm = Array(FUMemDependenceCell(f, self.n_fu_col) \ + for f in range(self.n_fu_row)) + for y in range(self.n_fu_row): + setattr(m.submodules, "dm%d" % y, dm[y]) + + # --- + # array 
of Function Unit Readable/Writable: row-length, horizontal + # --- + fur = Array(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col)) + for x in range(self.n_fu_col): + setattr(m.submodules, "fur_x%d" % (x), fur[x]) + + # --- + # connect FU Readable/Writable vector + # --- + storable = [] + loadable = [] + for y in range(self.n_fu_row): + fu = fur[y] + # accumulate Readable/Writable Vector outputs + storable.append(fu.storable_o) + loadable.append(fu.loadable_o) + + # ... and output them from this module (horizontal, width=REGs) + m.d.comb += self.storable_o.eq(Cat(*storable)) + m.d.comb += self.loadable_o.eq(Cat(*loadable)) + + # --- + # connect FU Pending + # --- + for y in range(self.n_fu_row): + dc = dm[y] + fu = fur[y] + # connect cell reg-select outputs to Reg Vector In + m.d.comb += [fu.st_pend_i.eq(dc.st_wait_o), + fu.ld_pend_i.eq(dc.ld_wait_o), + ] + + # --- + # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i + # --- + for x in range(self.n_fu_col): + issue_i = [] + for y in range(self.n_fu_row): + dc = dm[y] + # accumulate cell inputs issue + issue_i.append(dc.issue_i[x]) + # wire up inputs from module to row cell inputs + m.d.comb += Cat(*issue_i).eq(self.issue_i) + + # --- + # connect Matrix go_st_i/go_ld_i to module storable/loadable + # --- + for y in range(self.n_fu_row): + dc = dm[y] + # wire up inputs from module to row cell inputs + m.d.comb += [dc.go_st_i.eq(self.go_st_i), + dc.go_ld_i.eq(self.go_ld_i), + dc.go_die_i.eq(self.go_die_i), + ] + + # --- + # connect Matrix pending + # --- + for y in range(self.n_fu_row): + dc = dm[y] + # wire up inputs from module to row cell inputs + m.d.comb += [dc.st_pend_i.eq(self.st_pend_i), + dc.ld_pend_i.eq(self.ld_pend_i), + ] + + return m + + def __iter__(self): + yield self.st_pend_i + yield self.ld_pend_i + yield self.issue_i + yield self.go_ld_i + yield self.go_st_i + yield self.storable_o + yield self.loadable_o + + def ports(self): + return list(self) + +def d_matrix_sim(dut): + 
""" XXX TODO + """ + yield dut.ld_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.st_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_st_i.eq(1) + yield + yield dut.go_st_i.eq(0) + yield + yield dut.go_ld_i.eq(1) + yield + yield dut.go_ld_i.eq(0) + yield + +def test_fu_fu_matrix(): + dut = FUMemDepMatrix(n_fu_row=3, n_fu_col=3) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_fu_mem_matrix.il", "w") as f: + f.write(vl) + + run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd') + +if __name__ == '__main__': + test_fu_fu_matrix() diff --git a/src/soc/scoreboard/fu_mem_picker_vec.py b/src/soc/scoreboard/fu_mem_picker_vec.py new file mode 100644 index 00000000..dc40bd09 --- /dev/null +++ b/src/soc/scoreboard/fu_mem_picker_vec.py @@ -0,0 +1,26 @@ +from nmigen import Elaboratable, Module, Signal, Cat + + +class FUMem_Pick_Vec(Elaboratable): + """ these are allocated per-FU (horizontally), + and are of length fu_row_n + """ + def __init__(self, fu_row_n): + self.fu_row_n = fu_row_n + self.st_pend_i = Signal(fu_row_n, reset_less=True) + self.ld_pend_i = Signal(fu_row_n, reset_less=True) + + self.storable_o = Signal(reset_less=True) + self.loadable_o = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + + # Readable if there are no writes pending + m.d.comb += self.storable_o.eq(~self.ld_pend_i.bool()) + + # Writable if there are no reads pending + m.d.comb += self.loadable_o.eq(~self.st_pend_i.bool()) + + return m + diff --git a/src/soc/scoreboard/fu_picker_vec.py b/src/soc/scoreboard/fu_picker_vec.py new file mode 100644 index 00000000..d38bbfae --- /dev/null +++ b/src/soc/scoreboard/fu_picker_vec.py @@ -0,0 +1,26 @@ +from nmigen import Elaboratable, Module, Signal, Cat + + +class FU_Pick_Vec(Elaboratable): + """ these are allocated per-FU (horizontally), + and are of length fu_row_n + """ + def __init__(self, fu_row_n): + 
self.fu_row_n = fu_row_n + self.rd_pend_i = Signal(fu_row_n, reset_less=True) + self.wr_pend_i = Signal(fu_row_n, reset_less=True) + + self.readable_o = Signal(reset_less=True) + self.writable_o = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + + # Readable if there are no writes pending + m.d.comb += self.readable_o.eq(~self.wr_pend_i.bool()) + + # Writable if there are no reads pending + m.d.comb += self.writable_o.eq(~self.rd_pend_i.bool()) + + return m + diff --git a/src/soc/scoreboard/fu_reg_matrix.py b/src/soc/scoreboard/fu_reg_matrix.py new file mode 100644 index 00000000..8ca1494e --- /dev/null +++ b/src/soc/scoreboard/fu_reg_matrix.py @@ -0,0 +1,304 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl + +from scoreboard.dependence_cell import DependencyRow +from scoreboard.fu_wr_pending import FU_RW_Pend +from scoreboard.reg_select import Reg_Rsv +from scoreboard.global_pending import GlobalPending + +""" + + 6600 Dependency Table Matrix inputs / outputs + --------------------------------------------- + + d s1 s2 i d s1 s2 i d s1 s2 i d s1 s2 i + | | | | | | | | | | | | | | | | + v v v v v v v v v v v v v v v v + go_rd/go_wr -> dm-r0-fu0 dm-r1-fu0 dm-r2-fu0 dm-r3-fu0 -> wr/rd-pend + go_rd/go_wr -> dm-r0-fu1 dm-r1-fu1 dm-r2-fu1 dm-r3-fu1 -> wr/rd-pend + go_rd/go_wr -> dm-r0-fu2 dm-r1-fu2 dm-r2-fu2 dm-r3-fu2 -> wr/rd-pend + | | | | | | | | | | | | + v v v v v v v v v v v v + d s1 s2 d s1 s2 d s1 s2 d s1 s2 + reg sel reg sel reg sel reg sel + +""" + +class FURegDepMatrix(Elaboratable): + """ implements 11.4.7 mitch alsup FU-to-Reg Dependency Matrix, p26 + """ + def __init__(self, n_fu_row, n_reg_col, n_src, cancel=None): + self.n_src = n_src + self.n_fu_row = nf = n_fu_row # Y (FUs) ^v + self.n_reg_col = n_reg = n_reg_col # X (Regs) <> + + # arrays + src = [] + rsel = [] + for i in range(n_src): + j = i + 1 # name numbering to 
match src1/src2 + src.append(Signal(n_reg, name="src%d" % j, reset_less=True)) + rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True)) + pend = [] + for i in range(nf): + j = i + 1 # name numbering to match src1/src2 + pend.append(Signal(nf, name="rd_src%d_pend_o" % j, reset_less=True)) + + self.dest_i = Signal(n_reg_col, reset_less=True) # Dest in (top) + self.src_i = Array(src) # oper in (top) + + # cancellation array (from Address Matching), ties in with go_die_i + self.cancel = cancel + + # Register "Global" vectors for determining RaW and WaR hazards + self.wr_pend_i = Signal(n_reg_col, reset_less=True) # wr pending (top) + self.rd_pend_i = Signal(n_reg_col, reset_less=True) # rd pending (top) + self.v_wr_rsel_o = Signal(n_reg_col, reset_less=True) # wr pending (bot) + self.v_rd_rsel_o = Signal(n_reg_col, reset_less=True) # rd pending (bot) + + self.issue_i = Signal(n_fu_row, reset_less=True) # Issue in (top) + self.go_wr_i = Signal(n_fu_row, reset_less=True) # Go Write in (left) + self.go_rd_i = Signal(n_fu_row, reset_less=True) # Go Read in (left) + self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) + + # for Register File Select Lines (horizontal), per-reg + self.dest_rsel_o = Signal(n_reg_col, reset_less=True) # dest reg (bot) + self.src_rsel_o = Array(rsel) # src reg (bot) + + # for Function Unit "forward progress" (vertical), per-FU + self.wr_pend_o = Signal(n_fu_row, reset_less=True) # wr pending (right) + self.rd_pend_o = Signal(n_fu_row, reset_less=True) # rd pending (right) + self.rd_src_pend_o = Array(pend) # src1 pending + + def elaborate(self, platform): + m = Module() + return self._elaborate(m, platform) + + def _elaborate(self, m, platform): + + # --- + # matrix of dependency cells + # --- + cancel_mode = self.cancel is not None + dm = Array(DependencyRow(self.n_reg_col, self.n_src, cancel_mode) \ + for r in range(self.n_fu_row)) + for fu in range(self.n_fu_row): + setattr(m.submodules, "dr_fu%d" % fu, dm[fu]) + 
+ # --- + # array of Function Unit Pending vectors + # --- + fupend = Array(FU_RW_Pend(self.n_reg_col, self.n_src) \ + for f in range(self.n_fu_row)) + for fu in range(self.n_fu_row): + setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu]) + + # --- + # array of Register Reservation vectors + # --- + regrsv = Array(Reg_Rsv(self.n_fu_row, self.n_src) \ + for r in range(self.n_reg_col)) + for rn in range(self.n_reg_col): + setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn]) + + # --- + # connect Function Unit vector + # --- + wr_pend = [] + rd_pend = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + fup = fupend[fu] + dest_fwd_o = [] + for rn in range(self.n_reg_col): + # accumulate cell fwd outputs for dest/src1/src2 + dest_fwd_o.append(dc.dest_fwd_o[rn]) + # connect cell fwd outputs to FU Vector in [Cat is gooood] + m.d.comb += [fup.dest_fwd_i.eq(Cat(*dest_fwd_o)), + ] + # accumulate FU Vector outputs + wr_pend.append(fup.reg_wr_pend_o) + rd_pend.append(fup.reg_rd_pend_o) + + # ... and output them from this module (vertical, width=FUs) + m.d.comb += self.wr_pend_o.eq(Cat(*wr_pend)) + m.d.comb += self.rd_pend_o.eq(Cat(*rd_pend)) + + # same for src + for i in range(self.n_src): + rd_src_pend = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + fup = fupend[fu] + src_fwd_o = [] + for rn in range(self.n_reg_col): + # accumulate cell fwd outputs for dest/src1/src2 + src_fwd_o.append(dc.src_fwd_o[i][rn]) + # connect cell fwd outputs to FU Vector in [Cat is gooood] + m.d.comb += [fup.src_fwd_i[i].eq(Cat(*src_fwd_o)), + ] + # accumulate FU Vector outputs + rd_src_pend.append(fup.reg_rd_src_pend_o[i]) + # ... 
and output them from this module (vertical, width=FUs) + m.d.comb += self.rd_src_pend_o[i].eq(Cat(*rd_src_pend)) + + # --- + # connect Reg Selection vector + # --- + dest_rsel = [] + for rn in range(self.n_reg_col): + rsv = regrsv[rn] + dest_rsel_o = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + # accumulate cell reg-select outputs dest/src1/src2 + dest_rsel_o.append(dc.dest_rsel_o[rn]) + # connect cell reg-select outputs to Reg Vector In + m.d.comb += rsv.dest_rsel_i.eq(Cat(*dest_rsel_o)), + + # accumulate Reg-Sel Vector outputs + dest_rsel.append(rsv.dest_rsel_o) + + # ... and output them from this module (horizontal, width=REGs) + m.d.comb += self.dest_rsel_o.eq(Cat(*dest_rsel)) + + # same for src + for i in range(self.n_src): + src_rsel = [] + for rn in range(self.n_reg_col): + rsv = regrsv[rn] + src_rsel_o = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + # accumulate cell reg-select outputs dest/src1/src2 + src_rsel_o.append(dc.src_rsel_o[i][rn]) + # connect cell reg-select outputs to Reg Vector In + m.d.comb += rsv.src_rsel_i[i].eq(Cat(*src_rsel_o)), + # accumulate Reg-Sel Vector outputs + src_rsel.append(rsv.src_rsel_o[i]) + + # ... and output them from this module (horizontal, width=REGs) + m.d.comb += self.src_rsel_o[i].eq(Cat(*src_rsel)) + + # --- + # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i + # --- + for fu in range(self.n_fu_row): + dc = dm[fu] + # wire up inputs from module to row cell inputs (Cat is gooood) + m.d.comb += [dc.dest_i.eq(self.dest_i), + dc.rd_pend_i.eq(self.rd_pend_i), + dc.wr_pend_i.eq(self.wr_pend_i), + ] + # same for src + for i in range(self.n_src): + for fu in range(self.n_fu_row): + dc = dm[fu] + # wire up inputs from module to row cell inputs (Cat is gooood) + m.d.comb += dc.src_i[i].eq(self.src_i[i]) + + # accumulate rsel bits into read/write pending vectors. 
+ rd_pend_v = [] + wr_pend_v = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + rd_pend_v.append(dc.v_rd_rsel_o) + wr_pend_v.append(dc.v_wr_rsel_o) + rd_v = GlobalPending(self.n_reg_col, rd_pend_v) + wr_v = GlobalPending(self.n_reg_col, wr_pend_v) + m.submodules.rd_v = rd_v + m.submodules.wr_v = wr_v + + m.d.comb += self.v_rd_rsel_o.eq(rd_v.g_pend_o) + m.d.comb += self.v_wr_rsel_o.eq(wr_v.g_pend_o) + + # --- + # connect Dep issue_i/go_rd_i/go_wr_i to module issue_i/go_rd/go_wr + # --- + go_rd_i = [] + go_wr_i = [] + issue_i = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + # accumulate cell fwd outputs for dest/src1/src2 + go_rd_i.append(dc.go_rd_i) + go_wr_i.append(dc.go_wr_i) + issue_i.append(dc.issue_i) + # wire up inputs from module to row cell inputs (Cat is gooood) + m.d.comb += [Cat(*go_rd_i).eq(self.go_rd_i), + Cat(*go_wr_i).eq(self.go_wr_i), + Cat(*issue_i).eq(self.issue_i), + ] + + # --- + # connect Dep go_die_i + # --- + if cancel_mode: + for fu in range(self.n_fu_row): + dc = dm[fu] + go_die = Repl(self.go_die_i[fu], self.n_fu_row) + go_die = go_die | self.cancel[fu] + m.d.comb += dc.go_die_i.eq(go_die) + else: + go_die_i = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + # accumulate cell fwd outputs for dest/src1/src2 + go_die_i.append(dc.go_die_i) + # wire up inputs from module to row cell inputs (Cat is gooood) + m.d.comb += Cat(*go_die_i).eq(self.go_die_i) + return m + + def __iter__(self): + yield self.dest_i + yield from self.src_i + yield self.issue_i + yield self.go_wr_i + yield self.go_rd_i + yield self.go_die_i + yield self.dest_rsel_o + yield from self.src_rsel_o + yield self.wr_pend_o + yield self.rd_pend_o + yield self.wr_pend_i + yield self.rd_pend_i + yield self.v_wr_rsel_o + yield self.v_rd_rsel_o + yield from self.rd_src_pend_o + + def ports(self): + return list(self) + +def d_matrix_sim(dut): + """ XXX TODO + """ + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield 
dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_d_matrix(): + dut = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_fu_reg_matrix.il", "w") as f: + f.write(vl) + + run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_reg_matrix.vcd') + +if __name__ == '__main__': + test_d_matrix() diff --git a/src/soc/scoreboard/fu_wr_pending.py b/src/soc/scoreboard/fu_wr_pending.py new file mode 100644 index 00000000..d0bcb954 --- /dev/null +++ b/src/soc/scoreboard/fu_wr_pending.py @@ -0,0 +1,29 @@ +from nmigen import Elaboratable, Module, Signal, Array + + +class FU_RW_Pend(Elaboratable): + """ these are allocated per-FU (horizontally), + and are of length reg_count + """ + def __init__(self, reg_count, n_src): + self.n_src = n_src + self.reg_count = reg_count + self.dest_fwd_i = Signal(reg_count, reset_less=True) + src = [] + for i in range(n_src): + j = i + 1 # name numbering to match src1/src2 + src.append(Signal(reg_count, name="src%d" % j, reset_less=True)) + self.src_fwd_i = Array(src) + + self.reg_wr_pend_o = Signal(reset_less=True) + self.reg_rd_pend_o = Signal(reset_less=True) + self.reg_rd_src_pend_o = Signal(n_src, reset_less=True) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.reg_wr_pend_o.eq(self.dest_fwd_i.bool()) + for i in range(self.n_src): + m.d.comb += self.reg_rd_src_pend_o[i].eq(self.src_fwd_i[i].bool()) + m.d.comb += self.reg_rd_pend_o.eq(self.reg_rd_src_pend_o.bool()) + return m + diff --git a/src/soc/scoreboard/fumem_dep_cell.py b/src/soc/scoreboard/fumem_dep_cell.py new file mode 100644 index 00000000..982b55a3 --- /dev/null +++ b/src/soc/scoreboard/fumem_dep_cell.py @@ -0,0 +1,92 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen 
import Module, Signal, Const, Elaboratable +from nmutil.latch import SRLatch + + +class FUMemDependenceCell(Elaboratable): + """ implements 11.4.7 mitch alsup dependence cell, p27 + """ + def __init__(self, dummy, n_fu=1): + self.n_fu = n_fu + self.dummy = Const(~(1< self.qlen_o) + with m.If(qinmax): + comb += self.n_sub_o.eq(self.qlen_o) + with m.Else(): + comb += self.n_sub_o.eq(self.n_sub_i) + + # work out how many new items are going to be in the queue + comb += left.eq(self.qlen_o )#- self.n_sub_o) + comb += spare.eq(mqlen - self.p_add_i) + comb += qmaxed.eq(left <= spare) + comb += self.p_ready_o.eq(qmaxed & (self.p_add_i != 0)) + + # put q (flattened) into output + for i in range(self.n_out): + opos = Signal(mqbits) + comb += opos.eq(end_q + i) + comb += cat(self.data_o[i]).eq(self.q[opos]) + + with m.If(self.n_sub_o): + # ok now the end's moved + sync += end_q.eq(end_q + self.n_sub_o) + + with m.If(self.p_ready_o): + # copy in the input... insanely gate-costly... *sigh*... + for i in range(self.n_in): + with m.If(self.p_add_i > Const(i, len(self.p_add_i))): + ipos = Signal(mqbits) + comb += ipos.eq(start_q + i) # should roll round + sync += self.q[ipos].eq(cat(self.data_i[i])) + sync += start_q.eq(start_q + self.p_add_i) + + with m.If(self.p_ready_o): + # update the queue length + add2 = Signal(mqbits+1) + comb += add2.eq(self.qlen_o + self.p_add_i) + sync += self.qlen_o.eq(add2 - self.n_sub_o) + with m.Else(): + sync += self.qlen_o.eq(self.qlen_o - self.n_sub_o) + + return m + + def __iter__(self): + yield from self.q + + yield self.p_ready_o + for o in self.data_i: + yield from list(o) + yield self.p_add_i + + for o in self.data_o: + yield from list(o) + yield self.n_sub_i + yield self.n_sub_o + + def ports(self): + return list(self) + + +def instruction_q_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield + yield + yield 
dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_instruction_q(): + dut = InstructionQ(16, 4, 4, n_in=2, n_out=2) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_instruction_q.il", "w") as f: + f.write(vl) + + run_simulation(dut, instruction_q_sim(dut), + vcd_name='test_instruction_q.vcd') + +if __name__ == '__main__': + test_instruction_q() diff --git a/src/soc/scoreboard/issue_unit.py b/src/soc/scoreboard/issue_unit.py new file mode 100644 index 00000000..3ec2a31c --- /dev/null +++ b/src/soc/scoreboard/issue_unit.py @@ -0,0 +1,278 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Cat, Array, Const, Repl, Elaboratable +from nmigen.lib.coding import Decoder + +from scoreboard.group_picker import PriorityPicker + + +class RegDecode(Elaboratable): + """ decodes registers into unary + + Inputs + + * :wid: register file width + """ + def __init__(self, wid): + self.reg_width = wid + + # inputs + self.enable_i = Signal(reset_less=True) # enable decoders + self.dest_i = Signal(range(wid), reset_less=True) # Dest R# in + self.src1_i = Signal(range(wid), reset_less=True) # oper1 R# in + self.src2_i = Signal(range(wid), reset_less=True) # oper2 R# in + + # outputs + self.dest_o = Signal(wid, reset_less=True) # Dest unary out + self.src1_o = Signal(wid, reset_less=True) # oper1 unary out + self.src2_o = Signal(wid, reset_less=True) # oper2 unary out + + def elaborate(self, platform): + m = Module() + m.submodules.dest_d = dest_d = Decoder(self.reg_width) + m.submodules.src1_d = src1_d = Decoder(self.reg_width) + m.submodules.src2_d = src2_d = Decoder(self.reg_width) + + # dest decoder: write-pending + for d, i, o in [(dest_d, self.dest_i, self.dest_o), + (src1_d, self.src1_i, self.src1_o), + (src2_d, self.src2_i, self.src2_o)]: + m.d.comb += d.i.eq(i) + m.d.comb += 
d.n.eq(~self.enable_i) + m.d.comb += o.eq(d.o) + + return m + + def __iter__(self): + yield self.enable_i + yield self.dest_i + yield self.src1_i + yield self.src2_i + yield self.dest_o + yield self.src1_o + yield self.src2_o + + def ports(self): + return list(self) + + +class IssueUnitGroup(Elaboratable): + """ Manages a batch of Computation Units all of which can do the same task + + A priority picker will allocate one instruction in this cycle based + on whether the others are busy. + + insn_i indicates to this module that there is an instruction to be + issued which this group can handle + + busy_i is a vector of signals that indicate, in this cycle, which + of the units are currently busy. + + busy_o indicates whether it is "safe to proceed" i.e. whether + there is a unit here that can *be* issued an instruction + + fn_issue_o indicates, out of the available (non-busy) units, + which one may be selected + """ + def __init__(self, n_insns): + """ Set up inputs and outputs for the Group + + Input Parameters + + * :n_insns: number of instructions in this issue unit. 
+ """ + self.n_insns = n_insns + + # inputs + self.insn_i = Signal(reset_less=True, name="insn_i") + self.busy_i = Signal(n_insns, reset_less=True, name="busy_i") + + # outputs + self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o") + self.busy_o = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + + if self.n_insns == 0: + return m + + m.submodules.pick = pick = PriorityPicker(self.n_insns) + + # temporaries + allissue = Signal(self.n_insns, reset_less=True) + + m.d.comb += allissue.eq(Repl(self.insn_i, self.n_insns)) + # Pick one (and only one) of the units to proceed in this cycle + m.d.comb += pick.i.eq(~self.busy_i & allissue) + + # "Safe to issue" condition is basically when all units are not busy + m.d.comb += self.busy_o.eq(~((~self.busy_i).bool())) + + # Picker only raises one signal, therefore it's also the fn_issue + m.d.comb += self.fn_issue_o.eq(pick.o & Repl(~self.busy_o, self.n_insns)) + + return m + + def __iter__(self): + yield self.insn_i + yield self.busy_i + yield self.fn_issue_o + yield self.g_issue_o + + def ports(self): + return list(self) + + +class IssueUnitArray(Elaboratable): + """ Convenience module that amalgamates the issue and busy signals + + unit issue_i is to be set externally, at the same time as the + ALU group oper_i + """ + def __init__(self, units): + self.units = units + self.issue_o = Signal(reset_less=True) + n_insns = 0 + for u in self.units: + n_insns += len(u.fn_issue_o) + self.busy_i = Signal(n_insns, reset_less=True) + self.fn_issue_o = Signal(n_insns, reset_less=True) + self.n_insns = n_insns + + def elaborate(self, platform): + m = Module() + for i, u in enumerate(self.units): + setattr(m.submodules, "issue%d" % i, u) + + g_issue_o = [] + busy_i = [] + fn_issue_o = [] + for u in self.units: + busy_i.append(u.busy_i) + g_issue_o.append(u.busy_o) + fn_issue_o.append(u.fn_issue_o) + m.d.comb += self.issue_o.eq(~(Cat(*g_issue_o).bool())) + m.d.comb += 
self.fn_issue_o.eq(Cat(*fn_issue_o)) + m.d.comb += Cat(*busy_i).eq(self.busy_i) + + return m + + def ports(self): + yield self.busy_i + yield self.issue_o + yield self.fn_issue_o + yield from self.units + + + +class IssueUnit(Elaboratable): + """ implements 11.4.14 issue unit, p50 + + Inputs + + * :n_insns: number of instructions in this issue unit. + """ + def __init__(self, n_insns): + self.n_insns = n_insns + + # inputs + self.insn_i = Signal(n_insns, reset_less=True, name="insn_i") + self.busy_i = Signal(n_insns, reset_less=True, name="busy_i") + + # outputs + self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o") + self.g_issue_o = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + + if self.n_insns == 0: + return m + + # temporaries + fu_stall = Signal(reset_less=True) + + ib_l = [] + for i in range(self.n_insns): + ib_l.append(self.insn_i[i] & self.busy_i[i]) + m.d.comb += fu_stall.eq(Cat(*ib_l).bool()) + m.d.comb += self.g_issue_o.eq(~(fu_stall)) + for i in range(self.n_insns): + m.d.comb += self.fn_issue_o[i].eq(self.g_issue_o & self.insn_i[i]) + + return m + + def __iter__(self): + yield self.insn_i + yield self.busy_i + yield self.fn_issue_o + yield self.g_issue_o + + def ports(self): + return list(self) + + +class IntFPIssueUnit(Elaboratable): + def __init__(self, n_int_insns, n_fp_insns): + self.i = IssueUnit(n_int_insns) + self.f = IssueUnit(n_fp_insns) + self.issue_o = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + m.submodules.intissue = self.i + m.submodules.fpissue = self.f + + m.d.comb += self.issue_o.eq(self.i.g_issue_o | self.f.g_issue_o) + + return m + + def ports(self): + yield self.issue_o + yield from self.i + yield from self.f + + +def issue_unit_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield + yield + yield dut.issue_i.eq(0) + yield + yield 
dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_issue_unit(): + dut = IssueUnitGroup(3) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_issue_unit_group.il", "w") as f: + f.write(vl) + + dut = IssueUnit(32, 3) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_issue_unit.il", "w") as f: + f.write(vl) + + dut = IntFPIssueUnit(32, 3, 3) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_intfp_issue_unit.il", "w") as f: + f.write(vl) + + run_simulation(dut, issue_unit_sim(dut), vcd_name='test_issue_unit.vcd') + +if __name__ == '__main__': + test_issue_unit() diff --git a/src/soc/scoreboard/ldst_dep_cell.py b/src/soc/scoreboard/ldst_dep_cell.py new file mode 100644 index 00000000..70f4b9ba --- /dev/null +++ b/src/soc/scoreboard/ldst_dep_cell.py @@ -0,0 +1,116 @@ +""" Mitch Alsup 6600-style LD/ST scoreboard Dependency Cell + +Relevant bugreports: + +* http://bugs.libre-riscv.org/show_bug.cgi?id=81 + +""" + +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Repl, Elaboratable +from nmutil.latch import SRLatch + + +class LDSTDepCell(Elaboratable): + """ implements 11.4.12 mitch alsup load/store dependence cell, p45 + """ + def __init__(self, n_ls=1): + self.n_ls = n_ls + # inputs + self.load_h_i = Signal(reset_less=True) # load in (left) + self.stor_h_i = Signal(reset_less=True) # store in (left) + self.load_v_i = Signal(n_ls, reset_less=True) # load in (top) + self.stor_v_i = Signal(n_ls, reset_less=True) # store in (top) + self.issue_i = Signal(reset_less=True) # Issue in (left) + self.go_die_i = Signal(reset_less=True) # Issue in (left) + + # load / store hit - basically connect these to go_wr from LD/STCompUnit + # LD.go_wr -> load_hit_i, ST.go_wr -> stwd_hit_i. 
+ self.load_hit_i = Signal(n_ls, reset_less=True) # ld hit in (right) + self.stwd_hit_i = Signal(n_ls, reset_less=True) # st w/ hit in (right) + + # outputs (latched rd/wr pend) + self.ld_hold_st_o = Signal(reset_less=True) # ld holds st out (l) + self.st_hold_ld_o = Signal(reset_less=True) # st holds ld out (l) + + def elaborate(self, platform): + m = Module() + m.submodules.war_l = war_l = SRLatch(sync=False, llen=self.n_ls) # WaR + m.submodules.raw_l = raw_l = SRLatch(sync=False, llen=self.n_ls) # RaW + + # temporaries (repeat-extend) + issue = Repl(self.issue_i, self.n_ls) + die = Repl(self.go_die_i, self.n_ls) + + # issue & store & load - used for WAR Setting. LD is left, ST is top + i_s = Signal(reset_less=True) + i_s_l = Signal(self.n_ls, reset_less=True) + m.d.comb += i_s.eq(issue & self.stor_h_i) # horizontal single-signal + m.d.comb += i_s_l.eq(Repl(i_s, self.n_ls) & self.load_v_i) # multi, vert + + # issue & load & store - used for RAW Setting. ST is left, LD is top + i_l = Signal(reset_less=True) + i_l_s = Signal(self.n_ls, reset_less=True) + m.d.comb += i_l.eq(issue & self.load_h_i) # horizontal single-signal + m.d.comb += i_l_s.eq(Repl(i_l, self.n_ls) & self.stor_v_i) # multi, vert + + # write after read latch: loads block stores + m.d.comb += war_l.s.eq(i_s_l) + m.d.comb += war_l.r.eq(die | ~self.load_v_i) # reset on LD + + # read after write latch: stores block loads + m.d.comb += raw_l.s.eq(i_s_l) + m.d.comb += raw_l.r.eq(die | ~self.stor_v_i) # reset on ST + + # Hold results (read out horizontally, accumulate in OR fashion) + m.d.comb += self.ld_hold_st_o.eq((war_l.qn & self.load_hit_i).bool()) + m.d.comb += self.st_hold_ld_o.eq((raw_l.qn & self.stwd_hit_i).bool()) + + return m + + def __iter__(self): + yield self.load_h_i + yield self.load_v_i + yield self.stor_h_i + yield self.stor_h_i + yield self.issue_i + yield self.load_hit_i + yield self.stwd_hit_i + yield self.ld_hold_st_o + yield self.st_hold_ld_o + + def ports(self): + return list(self) 
+ + +def dcell_sim(dut): + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_dcell(): + dut = LDSTDepCell() + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_ldst_dcell.il", "w") as f: + f.write(vl) + + run_simulation(dut, dcell_sim(dut), vcd_name='test_ldst_dcell.vcd') + +if __name__ == '__main__': + test_dcell() diff --git a/src/soc/scoreboard/ldst_matrix.py b/src/soc/scoreboard/ldst_matrix.py new file mode 100644 index 00000000..1bb75b03 --- /dev/null +++ b/src/soc/scoreboard/ldst_matrix.py @@ -0,0 +1,163 @@ +""" Mitch Alsup 6600-style LD/ST Memory Scoreboard Matrix (sparse vector) + +6600 LD/ST Dependency Table Matrix inputs / outputs +--------------------------------------------------- + +Relevant comments (p45-46): + +* If there are no WAR dependencies on a Load instruction with a computed + address it can assert Bank_Addressable and Translate_Addressable. + +* If there are no RAW dependencies on a Store instruction with both a + write permission and store data present it can assert Bank_Addressable + +Relevant bugreports: + +* http://bugs.libre-riscv.org/show_bug.cgi?id=81 + +Notes: + +* Load Hit (or Store Hit with Data) are asserted by the LD/ST Computation + Unit when it has data and address ready + +* Asserting the ld_hit_i (or stwd_hit_i) *requires* that the output be + captured or at least taken into consideration for the next LD/STs + *right then*. Failure to observe the xx_hold_xx_o *will* result in + data corruption, as they are *only* asserted if xx_hit_i is asserted + +* The hold signals still have to go through "maybe address clashes" + detection, they cannot just be used as-is to stop a LD/ST. 
+ +""" + +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Elaboratable, Array, Cat, Const + +from ldst_dep_cell import LDSTDepCell + + +class LDSTDepMatrix(Elaboratable): + """ implements 11.4.12 mitch alsup LD/ST Dependency Matrix, p46 + actually a sparse matrix along the diagonal. + + load-hold-store and store-hold-load accumulate in a priority-picking + fashion, ORing together. the OR gate from the dependency cell is + here. + """ + def __init__(self, n_ldst): + self.n_ldst = n_ldst # X and Y (FUs) + self.ld_pend_i = Signal(n_ldst, reset_less=True) # load pending in + self.st_pend_i = Signal(n_ldst, reset_less=True) # store pending in + self.issue_i = Signal(n_ldst, reset_less=True) # Issue in + self.go_die_i = Signal(n_ldst, reset_less=True) # Die/Reset in + + self.load_hit_i = Signal(n_ldst, reset_less=True) # load hit in + self.stwd_hit_i = Signal(n_ldst, reset_less=True) # store w/data hit in + + # outputs + self.ld_hold_st_o = Signal(n_ldst, reset_less=True) # load holds st out + self.st_hold_ld_o = Signal(n_ldst, reset_less=True) # st holds load out + + def elaborate(self, platform): + m = Module() + + # --- + # matrix of dependency cells. 
actually, LDSTDepCell is a row, now + # --- + dm = Array(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst)) + for fu in range(self.n_ldst): + setattr(m.submodules, "dm_fu%d" % (fu), dm[fu]) + + # --- + # connect Function Unit vector, all horizontal + # --- + lhs_l = [] + shl_l = [] + issue_l = [] + go_die_l = [] + lh_l = [] + sh_l = [] + for fu in range(self.n_ldst): + dc = dm[fu] + # accumulate load-hold-store / store-hold-load bits (horizontal) + lhs_l.append(dc.ld_hold_st_o) + shl_l.append(dc.st_hold_ld_o) + # accumulate inputs (for Cat'ing later) - TODO: must be a better way + issue_l.append(dc.issue_i) + go_die_l.append(dc.go_die_i) + + # load-hit and store-with-data-hit go in vertically (top) + m.d.comb += [dc.load_hit_i.eq(self.load_hit_i), + dc.stwd_hit_i.eq(self.stwd_hit_i), + dc.load_v_i.eq(self.ld_pend_i), + dc.stor_v_i.eq(self.st_pend_i), + ] + + # connect cell inputs using Cat(*list_of_stuff) + m.d.comb += [Cat(*issue_l).eq(self.issue_i), + Cat(*go_die_l).eq(self.go_die_i), + ] + # connect the load-hold-store / store-hold-load OR-accumulated outputs + m.d.comb += self.ld_hold_st_o.eq(Cat(*lhs_l)) + m.d.comb += self.st_hold_ld_o.eq(Cat(*shl_l)) + + # the load/store input also needs to be connected to "top" (vertically) + for fu in range(self.n_ldst): + load_h_l = [] + stor_h_l = [] + for fux in range(self.n_ldst): + dc = dm[fux] + load_h_l.append(dc.load_h_i) + stor_h_l.append(dc.stor_h_i) + m.d.comb += [Cat(*load_h_l).eq(self.ld_pend_i), + Cat(*stor_h_l).eq(self.st_pend_i), + ] + + return m + + def __iter__(self): + yield self.ld_pend_i + yield self.st_pend_i + yield self.issue_i + yield self.go_die_i + yield self.load_hit_i + yield self.stwd_hit_i + yield self.ld_hold_st_o + yield self.st_hold_ld_o + + def ports(self): + return list(self) + +def d_matrix_sim(dut): + """ XXX TODO + """ + yield dut.dest_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.src1_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield 
dut.issue_i.eq(0) + yield + yield dut.go_rd_i.eq(1) + yield + yield dut.go_rd_i.eq(0) + yield + yield dut.go_wr_i.eq(1) + yield + yield dut.go_wr_i.eq(0) + yield + +def test_d_matrix(): + dut = LDSTDepMatrix(n_ldst=4) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_ld_st_matrix.il", "w") as f: + f.write(vl) + + run_simulation(dut, d_matrix_sim(dut), vcd_name='test_ld_st_matrix.vcd') + +if __name__ == '__main__': + test_d_matrix() diff --git a/src/soc/scoreboard/mdm.py b/src/soc/scoreboard/mdm.py new file mode 100644 index 00000000..184931ef --- /dev/null +++ b/src/soc/scoreboard/mdm.py @@ -0,0 +1,22 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module + +from scoreboard.fu_reg_matrix import FURegDepMatrix +from scoreboard.addr_match import PartialAddrMatch + +class FUMemMatchMatrix(FURegDepMatrix, PartialAddrMatch): + """ implement a FU-Regs overload with memory-address matching + """ + def __init__(self, n_fu, addrbitwid): + PartialAddrMatch.__init__(self, n_fu, addrbitwid) + FURegDepMatrix.__init__(self, n_fu, n_fu, 1, self.addr_nomatch_o) + + def elaborate(self, platform): + m = Module() + PartialAddrMatch._elaborate(self, m, platform) + FURegDepMatrix._elaborate(self, m, platform) + + return m + + diff --git a/src/soc/scoreboard/mem_dependence_cell.py b/src/soc/scoreboard/mem_dependence_cell.py new file mode 100644 index 00000000..2958d864 --- /dev/null +++ b/src/soc/scoreboard/mem_dependence_cell.py @@ -0,0 +1,120 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl +from nmutil.latch import SRLatch + + +class MemDepRow(Elaboratable): + """ implements 1st phase Memory Depencency cell + """ + def __init__(self, n_reg): + self.n_reg = n_reg + # inputs + self.ld_i = Signal(n_reg, reset_less=True) # Dest in (top) + self.st_i = Signal(n_reg, reset_less=True) # oper1 in (top) + self.issue_i 
= Signal(reset_less=True) # Issue in (top) + + self.st_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top) + self.ld_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top) + self.v_st_rsel_o = Signal(n_reg, reset_less=True) # Read pend out (bot) + self.v_ld_rsel_o = Signal(n_reg, reset_less=True) # Write pend out (bot) + + self.go_ld_i = Signal(reset_less=True) # Go Write in (left) + self.go_st_i = Signal(reset_less=True) # Go Read in (left) + self.go_die_i = Signal(reset_less=True) # Go Die in (left) + + # for Register File Select Lines (vertical) + self.ld_rsel_o = Signal(n_reg, reset_less=True) # dest reg sel (bot) + self.st_rsel_o = Signal(n_reg, reset_less=True) # src1 reg sel (bot) + + # for Function Unit "forward progress" (horizontal) + self.ld_fwd_o = Signal(n_reg, reset_less=True) # dest FU fw (right) + self.st_fwd_o = Signal(n_reg, reset_less=True) # src1 FU fw (right) + + def elaborate(self, platform): + m = Module() + m.submodules.ld_c = ld_c = SRLatch(sync=False, llen=self.n_reg) + m.submodules.st_c = st_c = SRLatch(sync=False, llen=self.n_reg) + + # connect go_rd / go_wr (dest->wr, src->rd) + ld_die = Signal(reset_less=True) + st_die = Signal(reset_less=True) + m.d.comb += ld_die.eq(self.go_ld_i | self.go_die_i) + m.d.comb += st_die.eq(self.go_st_i | self.go_die_i) + m.d.comb += ld_c.r.eq(Repl(ld_die, self.n_reg)) + m.d.comb += st_c.r.eq(Repl(st_die, self.n_reg)) + + # connect input reg bit (unary) + i_ext = Repl(self.issue_i, self.n_reg) + m.d.comb += ld_c.s.eq(i_ext & self.ld_i) + m.d.comb += st_c.s.eq(i_ext & self.st_i) + + # connect up hazard checks: read-after-write and write-after-read + m.d.comb += self.ld_fwd_o.eq(ld_c.q & self.st_pend_i) + m.d.comb += self.st_fwd_o.eq(st_c.q & self.ld_pend_i) + + # connect reg-sel outputs + st_ext = Repl(self.go_st_i, self.n_reg) + ld_ext = Repl(self.go_ld_i, self.n_reg) + m.d.comb += self.ld_rsel_o.eq(ld_c.qlq & ld_ext) + m.d.comb += self.st_rsel_o.eq(st_c.qlq & st_ext) + + # to be 
accumulated to indicate if register is in use (globally) + # after ORing, is fed back in to st_pend_i / ld_pend_i + m.d.comb += self.v_st_rsel_o.eq(st_c.qlq) + m.d.comb += self.v_ld_rsel_o.eq(ld_c.qlq) + + return m + + def __iter__(self): + yield self.ld_i + yield self.st_i + yield self.st_pend_i + yield self.ld_pend_i + yield self.issue_i + yield self.go_ld_i + yield self.go_st_i + yield self.go_die_i + yield self.v_ld_rsel_o + yield self.v_st_rsel_o + yield self.ld_rsel_o + yield self.st_rsel_o + yield self.ld_fwd_o + yield self.st_fwd_o + + def ports(self): + return list(self) + + +def dcell_sim(dut): + yield dut.ld_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.st_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_st_i.eq(1) + yield + yield dut.go_st_i.eq(0) + yield + yield dut.go_ld_i.eq(1) + yield + yield dut.go_ld_i.eq(0) + yield + +def test_dcell(): + dut = MemDepRow(4) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_mem_drow.il", "w") as f: + f.write(vl) + + run_simulation(dut, dcell_sim(dut), vcd_name='test_mem_dcell.vcd') + +if __name__ == '__main__': + test_dcell() diff --git a/src/soc/scoreboard/mem_fu_matrix.py b/src/soc/scoreboard/mem_fu_matrix.py new file mode 100644 index 00000000..98595996 --- /dev/null +++ b/src/soc/scoreboard/mem_fu_matrix.py @@ -0,0 +1,218 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Elaboratable, Array, Cat + +from scoreboard.mem_dependence_cell import MemDepRow +from scoreboard.mem_fu_pending import MemFU_Pend +from scoreboard.mem_select import Mem_Rsv +from scoreboard.global_pending import GlobalPending + +""" + +""" + +class MemFUDepMatrix(Elaboratable): + """ implements 1st phase Memory-to-FU Dependency Matrix + """ + def __init__(self, n_fu_row, n_reg_col): + self.n_fu_row = n_fu_row # Y (FUs) ^v + self.n_reg_col = n_reg_col # X 
(Regs) <> + self.ld_i = Signal(n_reg_col, reset_less=True) # LD in (top) + self.st_i = Signal(n_reg_col, reset_less=True) # ST in (top) + + # Register "Global" vectors for determining RaW and WaR hazards + self.ld_pend_i = Signal(n_reg_col, reset_less=True) # ld pending (top) + self.st_pend_i = Signal(n_reg_col, reset_less=True) # st pending (top) + self.v_ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld pending (bot) + self.v_st_rsel_o = Signal(n_reg_col, reset_less=True) # st pending (bot) + + self.issue_i = Signal(n_fu_row, reset_less=True) # Issue in (top) + self.go_ld_i = Signal(n_fu_row, reset_less=True) # Go LOAD in (left) + self.go_st_i = Signal(n_fu_row, reset_less=True) # Go STOR in (left) + self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left) + + # for Register File Select Lines (horizontal), per-reg + self.ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld reg (bot) + self.st_rsel_o = Signal(n_reg_col, reset_less=True) # st reg (bot) + + # for Function Unit "forward progress" (vertical), per-FU + self.ld_pend_o = Signal(n_fu_row, reset_less=True) # ld pending (right) + self.st_pend_o = Signal(n_fu_row, reset_less=True) # st pending (right) + + def elaborate(self, platform): + m = Module() + + # --- + # matrix of dependency cells + # --- + dm = Array(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row)) + for fu in range(self.n_fu_row): + setattr(m.submodules, "dr_fu%d" % fu, dm[fu]) + + # --- + # array of Function Unit Pending vectors + # --- + fupend = Array(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row)) + for fu in range(self.n_fu_row): + setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu]) + + # --- + # array of Register Reservation vectors + # --- + regrsv = Array(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col)) + for rn in range(self.n_reg_col): + setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn]) + + # --- + # connect Function Unit vector + # --- + ld_pend = [] + st_pend = [] + for fu in range(self.n_fu_row): 
+ dc = dm[fu] + fup = fupend[fu] + ld_fwd_o = [] + st_fwd_o = [] + for rn in range(self.n_reg_col): + # accumulate cell fwd outputs for dest/src1 + ld_fwd_o.append(dc.ld_fwd_o[rn]) + st_fwd_o.append(dc.st_fwd_o[rn]) + # connect cell fwd outputs to FU Vector in [Cat is gooood] + m.d.comb += [fup.ld_fwd_i.eq(Cat(*ld_fwd_o)), + fup.st_fwd_i.eq(Cat(*st_fwd_o)), + ] + # accumulate FU Vector outputs + ld_pend.append(fup.reg_ld_pend_o) + st_pend.append(fup.reg_st_pend_o) + + # ... and output them from this module (vertical, width=FUs) + m.d.comb += self.ld_pend_o.eq(Cat(*ld_pend)) + m.d.comb += self.st_pend_o.eq(Cat(*st_pend)) + + # --- + # connect Reg Selection vector + # --- + ld_rsel = [] + st_rsel = [] + for rn in range(self.n_reg_col): + rsv = regrsv[rn] + ld_rsel_o = [] + st_rsel_o = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + # accumulate cell reg-select outputs dest/src1 + ld_rsel_o.append(dc.ld_rsel_o[rn]) + st_rsel_o.append(dc.st_rsel_o[rn]) + # connect cell reg-select outputs to Reg Vector In + m.d.comb += [rsv.ld_rsel_i.eq(Cat(*ld_rsel_o)), + rsv.st_rsel_i.eq(Cat(*st_rsel_o)), + ] + # accumulate Reg-Sel Vector outputs + ld_rsel.append(rsv.ld_rsel_o) + st_rsel.append(rsv.st_rsel_o) + + # ... and output them from this module (horizontal, width=REGs) + m.d.comb += self.ld_rsel_o.eq(Cat(*ld_rsel)) + m.d.comb += self.st_rsel_o.eq(Cat(*st_rsel)) + + # --- + # connect Dependency Matrix dest/src1/issue to module d/s/s/i + # --- + for fu in range(self.n_fu_row): + dc = dm[fu] + # wire up inputs from module to row cell inputs (Cat is gooood) + m.d.comb += [dc.ld_i.eq(self.ld_i), + dc.st_i.eq(self.st_i), + dc.st_pend_i.eq(self.st_pend_i), + dc.ld_pend_i.eq(self.ld_pend_i), + ] + + # accumulate rsel bits into read/write pending vectors. 
+ st_pend_v = [] + ld_pend_v = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + st_pend_v.append(dc.v_st_rsel_o) + ld_pend_v.append(dc.v_ld_rsel_o) + st_v = GlobalPending(self.n_reg_col, st_pend_v) + ld_v = GlobalPending(self.n_reg_col, ld_pend_v) + m.submodules.st_v = st_v + m.submodules.ld_v = ld_v + + m.d.comb += self.v_st_rsel_o.eq(st_v.g_pend_o) + m.d.comb += self.v_ld_rsel_o.eq(ld_v.g_pend_o) + + # --- + # connect Dep issue_i/go_st_i/go_ld_i to module issue_i/go_rd/go_wr + # --- + go_st_i = [] + go_ld_i = [] + go_die_i = [] + issue_i = [] + for fu in range(self.n_fu_row): + dc = dm[fu] + # accumulate cell fwd outputs for dest/src1 + go_st_i.append(dc.go_st_i) + go_ld_i.append(dc.go_ld_i) + go_die_i.append(dc.go_die_i) + issue_i.append(dc.issue_i) + # wire up inputs from module to row cell inputs (Cat is gooood) + m.d.comb += [Cat(*go_st_i).eq(self.go_st_i), + Cat(*go_ld_i).eq(self.go_ld_i), + Cat(*go_die_i).eq(self.go_die_i), + Cat(*issue_i).eq(self.issue_i), + ] + + return m + + def __iter__(self): + yield self.ld_i + yield self.st_i + yield self.issue_i + yield self.go_ld_i + yield self.go_st_i + yield self.go_die_i + yield self.ld_rsel_o + yield self.st_rsel_o + yield self.ld_pend_o + yield self.st_pend_o + yield self.ld_pend_i + yield self.st_pend_i + yield self.ld_rsel_o + yield self.st_rsel_o + + def ports(self): + return list(self) + +def d_matrix_sim(dut): + """ XXX TODO + """ + yield dut.ld_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.st_i.eq(1) + yield dut.issue_i.eq(1) + yield + yield dut.issue_i.eq(0) + yield + yield dut.go_st_i.eq(1) + yield + yield dut.go_st_i.eq(0) + yield + yield dut.go_ld_i.eq(1) + yield + yield dut.go_ld_i.eq(0) + yield + +def test_d_matrix(): + dut = MemFUDepMatrix(n_fu_row=3, n_reg_col=3) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_fu_mem_matrix.il", "w") as f: + f.write(vl) + + run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd') + +if 
__name__ == '__main__': + test_d_matrix() diff --git a/src/soc/scoreboard/mem_fu_pending.py b/src/soc/scoreboard/mem_fu_pending.py new file mode 100644 index 00000000..951f7ac1 --- /dev/null +++ b/src/soc/scoreboard/mem_fu_pending.py @@ -0,0 +1,22 @@ +from nmigen import Elaboratable, Module, Signal, Cat + + +class MemFU_Pend(Elaboratable): + """ these are allocated per-FU (horizontally), + and are of length reg_count + """ + def __init__(self, reg_count): + self.reg_count = reg_count + self.ld_fwd_i = Signal(reg_count, reset_less=True) + self.st_fwd_i = Signal(reg_count, reset_less=True) + + self.reg_ld_pend_o = Signal(reset_less=True) + self.reg_st_pend_o = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.reg_ld_pend_o.eq(self.ld_fwd_i.bool()) + m.d.comb += self.reg_st_pend_o.eq(self.st_fwd_i.bool()) + + return m + diff --git a/src/soc/scoreboard/mem_select.py b/src/soc/scoreboard/mem_select.py new file mode 100644 index 00000000..627d7d10 --- /dev/null +++ b/src/soc/scoreboard/mem_select.py @@ -0,0 +1,20 @@ +from nmigen import Elaboratable, Module, Signal + + +class Mem_Rsv(Elaboratable): + """ these are allocated per-Register (vertically), + and are each of length fu_count + """ + def __init__(self, fu_count): + self.fu_count = fu_count + self.ld_rsel_i = Signal(fu_count, reset_less=True) + self.st_rsel_i = Signal(fu_count, reset_less=True) + self.ld_rsel_o = Signal(reset_less=True) + self.st_rsel_o = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.ld_rsel_o.eq(self.ld_rsel_i.bool()) + m.d.comb += self.st_rsel_o.eq(self.st_rsel_i.bool()) + return m + diff --git a/src/soc/scoreboard/memfu.py b/src/soc/scoreboard/memfu.py new file mode 100644 index 00000000..857d96c9 --- /dev/null +++ b/src/soc/scoreboard/memfu.py @@ -0,0 +1,120 @@ +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen import Module, Signal, Array, Elaboratable + 
from scoreboard.fu_fu_matrix import FUFUDepMatrix
from scoreboard.mdm import FUMemMatchMatrix
# NOTE(review): this file is added under src/soc/scoreboard/; confirm whether
# the two imports above should now read "soc.scoreboard.<module>".


class MemFunctionUnits(Elaboratable):
    """ load/store Function Unit dependency tracking.

        wraps an FU-FU dependency matrix (FUFUDepMatrix) and an address
        match matrix (FUMemMatchMatrix), fans the issue / go / die controls
        into both, and exposes their results.

        Arguments:
        * :n_ldsts:    width of every control vector (used as the FU count
                       of both matrices)
        * :addrbitwid: width, in bits, of each entry of addrs_i

        Note: st_pend_o is only created when elaborate() runs.
    """

    def __init__(self, n_ldsts, addrbitwid):
        self.n_ldsts = n_ldsts
        self.bitwid = addrbitwid

        self.st_i = Signal(n_ldsts, reset_less=True) # Dest R# in
        self.ld_i = Signal(n_ldsts, reset_less=True) # oper1 R# in

        self.g_int_ld_pend_o = Signal(n_ldsts, reset_less=True)
        self.g_int_st_pend_o = Signal(n_ldsts, reset_less=True)

        self.st_rsel_o = Signal(n_ldsts, reset_less=True) # dest reg (bot)
        self.ld_rsel_o = Signal(n_ldsts, reset_less=True) # src1 reg (bot)

        self.loadable_o = Signal(n_ldsts, reset_less=True)
        self.storable_o = Signal(n_ldsts, reset_less=True)
        self.addr_nomatch_o = Signal(n_ldsts, reset_less=True)

        self.go_ld_i = Signal(n_ldsts, reset_less=True)
        self.go_st_i = Signal(n_ldsts, reset_less=True)
        self.go_die_i = Signal(n_ldsts, reset_less=True)
        self.fn_issue_i = Signal(n_ldsts, reset_less=True)

        # address matching
        self.addrs_i = Array(Signal(self.bitwid, name="addrs_i%d" % i) \
                             for i in range(n_ldsts))
        self.addr_we_i = Signal(n_ldsts) # write-enable for incoming address
        self.addr_en_i = Signal(n_ldsts) # address latched in
        self.addr_rs_i = Signal(n_ldsts) # address deactivated

        # Note: FURegs st_pend_o is also outputted from here, for use in WaWGrid

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        n_fus = self.n_ldsts

        # FU-FU Dep Matrix
        intfudeps = FUFUDepMatrix(n_fus, n_fus)
        m.submodules.intfudeps = intfudeps
        # FU-Mem address match Matrix
        intregdeps = FUMemMatchMatrix(n_fus, self.bitwid)
        m.submodules.intregdeps = intregdeps

        # ok, because we do not know in advance what the AGEN (address gen)
        # is, we have to make a transitive dependency set.  i.e. the LD
        # (or ST) being requested now must depend on ALL prior LDs *AND* STs.
        # these get dropped very rapidly once AGEN is carried out.
        # XXX TODO

        # connect fureg matrix as a mem system
        comb += self.g_int_ld_pend_o.eq(intregdeps.v_rd_rsel_o)
        comb += self.g_int_st_pend_o.eq(intregdeps.v_wr_rsel_o)

        # the match matrix's own select outputs are fed back as its
        # pending inputs
        comb += intregdeps.rd_pend_i.eq(intregdeps.v_rd_rsel_o)
        comb += intregdeps.wr_pend_i.eq(intregdeps.v_wr_rsel_o)

        comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o)
        comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o)
        self.st_pend_o = intregdeps.wr_pend_o # also output for use in WaWGrid

        comb += intfudeps.issue_i.eq(self.fn_issue_i)
        comb += intfudeps.go_rd_i.eq(self.go_ld_i)
        comb += intfudeps.go_wr_i.eq(self.go_st_i)
        comb += intfudeps.go_die_i.eq(self.go_die_i)
        comb += self.loadable_o.eq(intfudeps.readable_o)
        comb += self.storable_o.eq(intfudeps.writable_o)
        comb += self.addr_nomatch_o.eq(intregdeps.addr_nomatch_o)

        # Connect function issue / arrays, and dest/src1/src2
        comb += intregdeps.dest_i.eq(self.st_i)
        comb += intregdeps.src_i[0].eq(self.ld_i)

        comb += intregdeps.go_rd_i.eq(self.go_ld_i)
        comb += intregdeps.go_wr_i.eq(self.go_st_i)
        comb += intregdeps.go_die_i.eq(self.go_die_i)
        comb += intregdeps.issue_i.eq(self.fn_issue_i)

        comb += self.st_rsel_o.eq(intregdeps.dest_rsel_o)
        comb += self.ld_rsel_o.eq(intregdeps.src_rsel_o[0])

        # connect address matching: these get connected to the Addr CUs
        for i in range(self.n_ldsts):
            comb += intregdeps.addrs_i[i].eq(self.addrs_i[i])
        comb += intregdeps.addr_we_i.eq(self.addr_we_i)
        comb += intregdeps.addr_en_i.eq(self.addr_en_i)
        comb += intregdeps.addr_rs_i.eq(self.addr_rs_i)

        return m

    def __iter__(self):
        """ yields every signal created in __init__, so that ports()
            exposes the complete interface.
        """
        yield self.ld_i
        yield self.st_i
        yield self.g_int_st_pend_o
        yield self.g_int_ld_pend_o
        yield self.ld_rsel_o
        yield self.st_rsel_o
        yield self.loadable_o
        yield self.storable_o
        yield self.addr_nomatch_o # driven in elaborate, was missing here
        yield self.go_st_i
        yield self.go_ld_i
        yield self.go_die_i
        yield self.fn_issue_i
        yield from self.addrs_i
        yield self.addr_we_i
        yield self.addr_en_i
        yield self.addr_rs_i # read in elaborate, was missing here

    def ports(self):
        return list(self)
# --- src/soc/scoreboard/reg_select.py ---
from nmigen import Elaboratable, Module, Signal, Array


class Reg_Rsv(Elaboratable):
    """ these are allocated per-Register (vertically),
        and are each of length fu_count

        dest_rsel_o, and bit i of src_rsel_o, are high when any bit of
        dest_rsel_i / src_rsel_i[i] respectively is set.
    """
    def __init__(self, fu_count, n_src):
        self.n_src = n_src
        self.fu_count = fu_count
        self.dest_rsel_i = Signal(fu_count, reset_less=True)
        self.src_rsel_i = Array(Signal(fu_count, name="src_rsel_i",
                                       reset_less=True) \
                                for i in range(n_src))
        self.dest_rsel_o = Signal(reset_less=True)
        self.src_rsel_o = Signal(n_src, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        m.d.comb += self.dest_rsel_o.eq(self.dest_rsel_i.bool())
        for i in range(self.n_src):
            m.d.comb += self.src_rsel_o[i].eq(self.src_rsel_i[i].bool())
        return m


# --- src/soc/scoreboard/shadow.py ---
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
from nmigen import Module, Signal, Cat, Array, Const, Elaboratable, Repl
from nmigen.lib.coding import Decoder

from scoreboard.shadow_fn import ShadowFn


class ShadowMatrix(Elaboratable):
    """ Matrix of Shadow Functions.  One per FU.

        Inputs
        * :n_fus:       number of Function Units (one ShadowFn each)
        * :shadow_wid:  number of shadow/fail/good/go_die sets
        * :syncreset:   passed through unchanged to every ShadowFn

        Notes:

        * Shadow enable/fail/good are all connected to all Shadow Functions
          (incoming at the top)

        * Output is an array of "shadow active" (schroedinger wires: neither
          alive nor dead) and an array of "go die" signals, one per FU.

        * the shadown must be connected to the Computation Unit's
          write release request, preventing it (ANDing) from firing
          (and thus preventing Writable.  this by the way being the
          whole point of having the Shadow Matrix...)

        * go_die_o must be connected to *both* the Computation Unit's
          src-operand and result-operand latch resets, causing both
          of them to reset.

        * go_die_o also needs to be wired into the Dependency and Function
          Unit Matrices by way of over-enabling (ORing) into Go_Read and
          Go_Write, resetting every cell that is required to "die"

        * when shadow_wid is 0 no ShadowFn is instantiated: shadown_o is
          tied to all-ones and go_die_o to zero.
    """
    def __init__(self, n_fus, shadow_wid=0, syncreset=False):
        self.syncreset = syncreset
        self.n_fus = n_fus
        self.shadow_wid = shadow_wid

        # inputs
        self.issue_i = Signal(n_fus, reset_less=True)
        self.reset_i = Signal(n_fus, reset_less=True)
        self.shadow_i = Array(Signal(shadow_wid, name="sh_i", reset_less=True) \
                            for f in range(n_fus))
        self.s_fail_i = Array(Signal(shadow_wid, name="fl_i", reset_less=True) \
                            for f in range(n_fus))
        self.s_good_i = Array(Signal(shadow_wid, name="gd_i", reset_less=True) \
                            for f in range(n_fus))
        # outputs
        self.go_die_o = Signal(n_fus, reset_less=True)
        self.shadown_o = Signal(n_fus, reset_less=True)

    def elaborate(self, platform):
        m = Module()

        if self.shadow_wid == 0:
            # ShadowFn(0) creates no input signals and ties its outputs to
            # constants (shadown=1, go_die=0), so wiring its inputs below
            # would fail.  drive the same constants for every FU instead.
            m.d.comb += self.shadown_o.eq((1 << self.n_fus) - 1)
            m.d.comb += self.go_die_o.eq(0)
            return m

        shadows = []
        for i in range(self.n_fus):
            sh = ShadowFn(self.shadow_wid, self.syncreset)
            setattr(m.submodules, "sh%d" % i, sh)
            shadows.append(sh)
            # connect shadow/fail/good to all shadows
            m.d.comb += sh.s_fail_i.eq(self.s_fail_i[i])
            m.d.comb += sh.s_good_i.eq(self.s_good_i[i])
            # this one is the matrix (shadow enables)
            m.d.comb += sh.shadow_i.eq(self.shadow_i[i])

        # connect all shadow outputs and issue input
        issue_l = []
        reset_l = []
        sho_l = []
        rec_l = []
        for l in shadows:
            issue_l.append(l.issue_i)
            reset_l.append(l.reset_i)
            sho_l.append(l.shadown_o)
            rec_l.append(l.go_die_o)
        m.d.comb += Cat(*issue_l).eq(self.issue_i)
        m.d.comb += Cat(*reset_l).eq(self.reset_i)
        m.d.comb += self.shadown_o.eq(Cat(*sho_l))
        m.d.comb += self.go_die_o.eq(Cat(*rec_l))

        return m

    def __iter__(self):
        yield self.issue_i
        yield self.reset_i
        yield from self.shadow_i
        yield from self.s_fail_i
        yield from self.s_good_i
        yield self.go_die_o
        yield self.shadown_o

    def ports(self):
        return list(self)


class BranchSpeculationRecord(Elaboratable):
    """ A record of which function units will be cancelled and which
        allowed to proceed, on a branch.

        Whilst the input is a pair that says whether the instruction is
        under the "success" branch shadow (good_i) or the "fail" shadow
        (fail_i path), when the branch result is known, the "good" path
        must be cancelled if "fail" occurred, and the "fail" path cancelled
        if "good" occurred.

        therefore, use "good|~fail" and "fail|~good" respectively as
        output.

        Arguments:
        * :n_fus: width of the good/fail inputs and of the match outputs
    """

    def __init__(self, n_fus):
        self.n_fus = n_fus

        # inputs: record *expected* status
        self.active_i = Signal(reset_less=True)
        self.good_i = Signal(n_fus, reset_less=True)
        self.fail_i = Signal(n_fus, reset_less=True)

        # inputs: status of branch (when result was known)
        self.br_i = Signal(reset_less=True)
        self.br_ok_i = Signal(reset_less=True)

        # outputs: true if the *expected* outcome matched the *actual* outcome
        self.match_f_o = Signal(n_fus, reset_less=True)
        self.match_g_o = Signal(n_fus, reset_less=True)

    def elaborate(self, platform):
        m = Module()

        # registers to record *expected* status
        good_r = Signal(self.n_fus)
        fail_r = Signal(self.n_fus)

        for i in range(self.n_fus):
            # accumulate expectations whilst active
            with m.If(self.active_i):
                m.d.sync += good_r[i].eq(good_r[i] | self.good_i[i])
                m.d.sync += fail_r[i].eq(fail_r[i] | self.fail_i[i])
            # branch result known: report, then clear the record
            with m.If(self.br_i):
                with m.If(good_r[i]):
                    # we expected good, return OK that good was EXPECTED
                    m.d.comb += self.match_g_o[i].eq(self.br_ok_i)
                    m.d.comb += self.match_f_o[i].eq(~self.br_ok_i)
                with m.If(fail_r[i]):
                    # we expected fail, return OK that fail was EXPECTED
                    m.d.comb += self.match_g_o[i].eq(~self.br_ok_i)
                    m.d.comb += self.match_f_o[i].eq(self.br_ok_i)
                m.d.sync += good_r[i].eq(0) # might be set if issue set as well
                m.d.sync += fail_r[i].eq(0) # might be set if issue set as well

        return m

    def __iter__(self):
        # only the signals created in __init__: yielding names that were
        # never created made ports() raise AttributeError.
        yield self.active_i
        yield self.good_i
        yield self.fail_i
        yield self.br_i
        yield self.br_ok_i
        yield self.match_f_o
        yield self.match_g_o

    def ports(self):
        return list(self)



class WaWGrid(Elaboratable):
    """ An NxM grid-selector which raises a 2D bit selected by N and M

        waw_o[i] is shadow_i when bit i of fu_i is set, otherwise zero.
    """

    def __init__(self, n_fus, shadow_wid):
        self.n_fus = n_fus
        self.shadow_wid = shadow_wid

        self.shadow_i = Signal(shadow_wid, reset_less=True)
        self.fu_i = Signal(n_fus, reset_less=True)

        self.waw_o = Array(Signal(shadow_wid, name="waw_o", reset_less=True) \
                            for f in range(n_fus))

    def elaborate(self, platform):
        m = Module()
        for i in range(self.n_fus):
            # replicate the FU select bit across the whole shadow width
            v = Repl(self.fu_i[i], self.shadow_wid)
            m.d.comb += self.waw_o[i].eq(v & self.shadow_i)
        return m


def shadow_sim(dut):
    """ stimulus for a BranchSpeculationRecord (the dut that test_shadow
        simulates).  the bit patterns assume n_fus >= 2.
    """
    # record expectations: bit 0 under "good", bit 1 under "fail"
    yield dut.good_i.eq(0b01)
    yield dut.fail_i.eq(0b10)
    yield dut.active_i.eq(1)
    yield
    yield dut.active_i.eq(0)
    yield dut.good_i.eq(0)
    yield dut.fail_i.eq(0)
    yield
    # branch result arrives, flagged ok
    yield dut.br_ok_i.eq(1)
    yield dut.br_i.eq(1)
    yield
    yield dut.br_i.eq(0)
    yield dut.br_ok_i.eq(0)
    yield
    # record again, this time the branch result is not ok
    yield dut.good_i.eq(0b01)
    yield dut.fail_i.eq(0b10)
    yield dut.active_i.eq(1)
    yield
    yield dut.active_i.eq(0)
    yield dut.good_i.eq(0)
    yield dut.fail_i.eq(0)
    yield
    yield dut.br_i.eq(1)
    yield
    yield dut.br_i.eq(0)
    yield

def test_shadow():
    dut = ShadowMatrix(4, 2)
    vl = rtlil.convert(dut, ports=dut.ports())
    with open("test_shadow.il", "w") as f:
        f.write(vl)

    dut = BranchSpeculationRecord(4)
    vl = rtlil.convert(dut, ports=dut.ports())
    with open("test_branchspecrecord.il", "w") as f:
        f.write(vl)

    # note: the simulation runs on the BranchSpeculationRecord above
    run_simulation(dut, shadow_sim(dut), vcd_name='test_shadow.vcd')

if __name__ == '__main__':
    test_shadow()
# --- src/soc/scoreboard/shadow_fn.py ---
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
from nmigen import Module, Signal, Cat, Repl, Const, Elaboratable
from nmutil.latch import SRLatch


class ShadowFn(Elaboratable):
    """ implements shadowing 11.5.1, p55, just the individual shadow function

        shadowing can be used for branches as well as exceptions (interrupts),
        load/store hold (exceptions again), and vector-element predication
        (once the predicate is known, which it may not be at instruction issue)

        Inputs
        * :slen:      number of shadow/fail/good/go_die sets
        * :syncreset: stored, but currently has no effect on the logic

        notes:
        * when slen = 0, go_die_o and shadown_o are Consts (i.e. do nothing),
          no input signals are created, and the module is empty
    """
    def __init__(self, slen, syncreset=False):

        self.slen = slen
        self.syncreset = syncreset

        if self.slen:
            # inputs
            self.issue_i = Signal(reset_less=True)
            self.shadow_i = Signal(slen, reset_less=True)
            self.reset_i = Signal(reset_less=True)
            self.s_fail_i = Signal(slen, reset_less=True)
            self.s_good_i = Signal(slen, reset_less=True)

            # outputs
            self.shadown_o = Signal(reset_less=True)
            self.go_die_o = Signal(reset_less=True)
        else:
            # outputs when no shadowing needed
            self.shadown_o = Const(1)
            self.go_die_o = Const(0)

    def elaborate(self, platform):
        m = Module()
        if self.slen == 0:
            # nothing to build, but the (empty) module must still be
            # returned: returning None here was a bug.
            return m

        m.submodules.sl = sl = SRLatch(sync=False, llen=self.slen)

        r_ext = Repl(self.reset_i, self.slen)
        reset_r = Signal(self.slen)
        # NOTE(review): the syncreset and non-syncreset branches were
        # identical (both combinatorial), so they are merged here; confirm
        # whether syncreset was meant to use m.d.sync.
        m.d.comb += reset_r.eq(self.s_good_i | self.s_fail_i | r_ext)

        i_ext = Repl(self.issue_i, self.slen)
        m.d.comb += sl.s.eq(self.shadow_i & i_ext & \
                            ~self.s_good_i & ~reset_r)
        m.d.comb += sl.r.eq(r_ext | reset_r | self.s_good_i | \
                            (i_ext & ~self.shadow_i))
        m.d.comb += self.go_die_o.eq((sl.qlq & self.s_fail_i).bool())
        m.d.comb += self.shadown_o.eq(~sl.qlq.bool())

        return m

    def __iter__(self):
        if self.slen == 0:
            # no input signals exist and the outputs are Consts
            return
        yield self.issue_i
        yield self.reset_i
        yield self.shadow_i
        yield self.s_fail_i
        yield self.s_good_i
        yield self.shadown_o
        yield self.go_die_o

    def ports(self):
        return list(self)


def shadow_fn_unit_sim(dut):
    """ stimulus for a ShadowFn, driving only the signals it defines.
        the bit patterns assume slen >= 2.
    """
    # issue under shadow bit 0
    yield dut.shadow_i.eq(0b01)
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield dut.shadow_i.eq(0)
    yield
    # shadow bit 0 reported good
    yield dut.s_good_i.eq(0b01)
    yield
    yield dut.s_good_i.eq(0)
    yield
    # issue under shadow bit 1
    yield dut.shadow_i.eq(0b10)
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield dut.shadow_i.eq(0)
    yield
    # shadow bit 1 reported failed
    yield dut.s_fail_i.eq(0b10)
    yield
    yield dut.s_fail_i.eq(0)
    yield
    # explicit reset
    yield dut.reset_i.eq(1)
    yield
    yield dut.reset_i.eq(0)
    yield


def test_shadow_fn_unit():
    dut = ShadowFn(4)
    vl = rtlil.convert(dut, ports=dut.ports())
    with open("test_shadow_fn_unit.il", "w") as f:
        f.write(vl)

    run_simulation(dut, shadow_fn_unit_sim(dut),
                   vcd_name='test_shadow_fn_unit.vcd')

if __name__ == '__main__':
    test_shadow_fn_unit()


# --- src/soc/scoreboard/test_iq.py: testing of InstructionQ ---
from copy import deepcopy
from random import randint
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil

from scoreboard.instruction_q import InstructionQ
from nmutil.nmoperator import eq


class IQSim:
    """ simulation driver pair for a queue-like dut: send() pushes the
        entries of iq into the dut, rcv() pulls them back out and checks
        them against what send() recorded in oq.

        Arguments:
        * :dut:   device under test (data_i, p_add_i, p_ready_o, data_o,
                  n_sub_i and n_sub_o are accessed)
        * :iq:    list of entries to send
        * :n_in:  upper bound for the random per-step send count (currently
                  overridden, see send)
        * :n_out: upper bound for the random per-step receive request
    """
    def __init__(self, dut, iq, n_in, n_out):
        self.dut = dut
        self.iq = iq
        self.oq = []
        self.n_in = n_in
        self.n_out = n_out

    def send(self):
        """ generator: feeds iq into the dut, recording in oq the value
            read back from each data_i slot.
        """
        i = 0
        while i < len(self.iq):
            sendlen = randint(1, self.n_in)
            # the random length is deliberately discarded: one entry at a
            # time.  the randint call is kept so that the random sequence
            # seen by the rest of the test is unchanged.
            sendlen = 1
            sendlen = min(len(self.iq) - i, sendlen)
            print("sendlen", len(self.iq)-i, sendlen)
            for idx in range(sendlen):
                instr = self.iq[i+idx]
                yield from eq(self.dut.data_i[idx], instr)
                di = yield self.dut.data_i[idx]
                print("senddata %d %x" % ((i+idx), di))
                self.oq.append(di)
            yield self.dut.p_add_i.eq(sendlen)
            yield
            # hold the request until the dut reports ready
            o_p_ready = yield self.dut.p_ready_o
            while not o_p_ready:
                yield
                o_p_ready = yield self.dut.p_ready_o

            yield self.dut.p_add_i.eq(0)

            print("send", len(self.iq), i, sendlen)

            # wait random period of time before queueing another value
            for j in range(randint(0, 3)):
                yield

            i += sendlen

        yield self.dut.p_add_i.eq(0)
        yield

        print("send ended")

    def rcv(self):
        """ generator: requests a random number of entries at a time and
            asserts that each one matches the corresponding oq entry.
        """
        i = 0
        yield
        yield
        yield
        while i < len(self.iq):
            rcvlen = randint(1, self.n_out)
            yield self.dut.n_sub_i.eq(rcvlen)
            n_sub_o = yield self.dut.n_sub_o
            print("recv", n_sub_o)
            for j in range(n_sub_o):
                r = yield self.dut.data_o[j]
                print("recvdata %x %s" % (r, repr(self.iq[i+j])))
                assert r == self.oq[i+j]
            yield
            if n_sub_o == 0:
                # nothing was handed over this cycle: ask again
                continue
            yield self.dut.n_sub_i.eq(0)

            i += n_sub_o

        print("recv ended")


# NOTE: the remainder of test_iq.py (starting at mk_insns) is truncated and
# garbled in this capture, so it is not reproduced here.