SVSHAPE2[28:29] <- 0b11 # size schedule
# set schedule up for DCT inverse of half-swapped ordering
if (SVRM = 0b0110) then
- vlen[0:6] <- (0b00 || SVxd)
+ vlen[0:6] <- (0b00 || SVxd) + 0b0000001
# set up template in SVSHAPE0
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode
yield self.dec2.dec.raw_opcode_in.eq(ins & 0xffffffff) # v3.0B suffix
yield self.dec2.sv_rm.eq(sv_rm) # svp64 prefix
yield Settle()
+ # store this for use in get_src_dststeps()
+ self.ldstmode = yield self.dec2.rm_dec.ldstmode
def execute_one(self):
"""execute one instruction
in the class for later use. this to avoid problems with yield
"""
# go through all iterators in lock-step, advance to next remap_idx
- srcstep, dststep = self.new_srcstep, self.new_dststep
+ srcstep, dststep = self.get_src_dststeps()
# get four SVSHAPEs. here we are hard-coding
SVSHAPE0 = self.spr['SVSHAPE0']
SVSHAPE1 = self.spr['SVSHAPE1']
self.update_nia()
self.update_pc_next()
return
- srcstep, dststep = self.new_srcstep, self.new_dststep
+ srcstep, dststep = self.get_src_dststeps()
pred_dst_zero = self.pred_dst_zero
pred_src_zero = self.pred_src_zero
vl = self.svstate.vl
op = yield self.dec2.e.do.insn_type
offsmul = 0
if op == MicrOp.OP_LOAD.value:
- offsmul = srcstep
- log("D-field src", imm, offsmul)
+ if remap_active:
+ offsmul = yield self.dec2.in1_step
+ log("D-field REMAP src", imm, offsmul)
+ else:
+ offsmul = srcstep
+ log("D-field src", imm, offsmul)
elif op == MicrOp.OP_STORE.value:
+ # XXX NOTE! no bit-reversed STORE! this should not ever be used
offsmul = dststep
log("D-field dst", imm, offsmul)
- # bit-reverse mode
+ # bit-reverse mode, rev already done through get_src_dststeps()
if ldstmode == SVP64LDSTmode.BITREVERSE.value:
# manually look up RC, sigh
RC = yield self.dec2.dec.RC[0:5]
RC = self.gpr(RC)
- log ("RC", RC.value, "imm", imm, "offs", bin(offsmul),
- "rev", bin(bitrev(offsmul, vl)))
- imm = SelectableInt((imm * bitrev(offsmul, vl)) << RC.value, 32)
+ log ("LD-BITREVERSE:", "VL", vl,
+ "RC", RC.value, "imm", imm,
+ "offs", bin(offsmul),
+ )
+ imm = SelectableInt((imm * offsmul) << RC.value, 32)
# Unit-Strided LD/ST adds offset*width to immediate
elif ldstmode == SVP64LDSTmode.UNITSTRIDE.value:
ldst_len = yield self.dec2.e.do.data_len
yield from self.svstate_post_inc()
else:
log ("SVSTATE_NEXT: post-inc")
+ # use actual src/dst-step here to check end, do NOT
+ # use bit-reversed version
srcstep, dststep = self.new_srcstep, self.new_dststep
remaps = self.get_remap_indices()
remap_idxs = self.remap_idxs
log (" new srcstep", srcstep)
log (" new dststep", dststep)
+ def get_src_dststeps(self):
+ """gets srcstep and dststep but performs bit-reversal on srcstep if
+ required. use this ONLY to perform calculations, do NOT update
+ SVSTATE with the bit-reversed value of srcstep
+
+ ARGH, had to store self.ldstmode and VL due to yield issues
+ """
+ srcstep, dststep = self.new_srcstep, self.new_dststep
+ if self.is_svp64_mode:
+ if self.ldstmode == SVP64LDSTmode.BITREVERSE.value:
+ vl = self.svstate.vl
+ log ("SRCSTEP-BITREVERSE:", "VL", vl, "srcstep", srcstep,
+ "rev", bin(bitrev(srcstep, vl)))
+ srcstep = bitrev(srcstep, vl)
+
+ return (srcstep, dststep)
+
def update_new_svstate_steps(self):
+ # note, do not get the bit-reversed srcstep here!
srcstep, dststep = self.new_srcstep, self.new_dststep
# update SVSTATE with new srcstep
# *indices* are referenced (two levels of indirection at the moment)
# pre-reverse the data-swap list so that it *ends up* in the order 0123..
ji = list(range(n))
- ji = halfrev2(ji, True)
+
+ levels = n.bit_length() - 1
+ ji = halfrev2(ji, False)
+ if False: # swap: TODO, add extra bit-reverse mode
+ ri = [reverse_bits(i, levels) for i in range(n)]
+ ji = [ji[ri[i]] for i in range(n)]
+
# invert order if requested
if SVSHAPE.invxyz[0]:
ji.reverse()
- yield from ji
+ for i, jl in enumerate(ji):
+ y_end = jl == ji[-1]
+ yield jl, (0b111 if y_end else 0b000)
# python "yield" can be iterated. use this to make it clear how
print ("outer butterfly")
pprint_schedule_outer(schedule, n)
+ # for DCT half-swap LDs
+ # j schedule
+ SVSHAPE0 = SVSHAPE()
+ SVSHAPE0.lims = [xdim, 0b000101, zdim]
+ SVSHAPE0.mode = 0b01
+ SVSHAPE0.submode2 = 0
+ SVSHAPE0.skip = 0
+ SVSHAPE0.offset = 0 # experiment with different offset, here
+ SVSHAPE0.invxyz = [0,0,0] # inversion if desired
+
+ # expected results
+ levels = n.bit_length() - 1
+ avi = list(range(n))
+ ri = [reverse_bits(i, levels) for i in range(n)]
+ av = halfrev2(avi, False)
+ av = [av[ri[i]] for i in range(n)]
+
+
+ i0 = iterate_dct_inner_halfswap_loadstore(SVSHAPE0)
+ for idx, (jl) in enumerate(i0):
+ print ("inverse half-swap ld", idx, jl, av[idx])
+ if jl[1] == 0b111: # end
+ break
+
+
# run the demo
if __name__ == '__main__':
demo()
from openpower.sv.trans.svp64 import SVP64Asm
from openpower.consts import SVP64CROffs
from openpower.decoder.helpers import fp64toselectable
+from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
+ )
from copy import deepcopy
# (24, 0x040400000303)])
self._check_fpregs(sim, expected_fprs)
- def test_sv_load_store_bitreverse_remap(self):
+ def test_sv_load_store_bitreverse_remap_matrix(self):
""">>> lst = ["addi 1, 0, 0x0010",
"addi 2, 0, 0x0004",
"addi 3, 0, 0x0002",
"addi 7, 0, 0x303",
"addi 8, 0, 0x404",
"sv.stw 5.v, 0(1)",
- "svshape 4, 4, 4, 0, 0",
+ "svshape 4, 4, 2, 0, 0",
"svremap 31, 1, 2, 3, 0, 0, 0, 0",
"sv.lwzbr 12.v, 4(1), 2"]
and thus creates the butterfly needed for one iteration of FFT.
the RC (shift) is to be able to offset the LDs by Radix-2 spans
+
+ in this case however it is REMAPed via a Matrix Multiply Schedule,
+ which is set up as 4x2.
"""
lst = SVP64Asm(["addi 1, 0, 0x0010",
"addi 2, 0, 0x0000",
self.assertEqual(sim.gpr(18), SelectableInt(0x404, 64))
self.assertEqual(sim.gpr(19), SelectableInt(0x808, 64))
+ def test_sv_load_store_bitreverse_remap_halfswap(self):
+ """>>> lst = ["addi 1, 0, 0x0010",
+ "addi 2, 0, 0x0000",
+ "addi 4, 0, 0x001",
+ "addi 5, 0, 0x102",
+ "addi 6, 0, 0x203",
+ "addi 7, 0, 0x304",
+ "addi 8, 0, 0x405",
+ "addi 9, 0, 0x506",
+ "addi 10, 0, 0x607",
+ "addi 11, 0, 0x708",
+ "sv.stw 4.v, 0(1)",
+ "svshape 8, 1, 1, 6, 0",
+ "svremap 1, 0, 0, 0, 0, 0, 0, 1",
+ "sv.lwzbr 12.v, 4(1), 2"]
+
+ bitreverse LD is computed as:
+ for i in range(VL):
+ EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC
+
+ bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
+ produces 0 2 1 3 in binary 0b00 0b10 0b01 0b11
+
+ and thus creates the butterfly needed for one iteration of FFT.
+ the RC (shift) is to be able to offset the LDs by Radix-2 spans
+
+ on top of the bit-reversal is a REMAP for half-swaps for DCT
+ in-place.
+ """
+ lst = SVP64Asm(["addi 1, 0, 0x0010",
+ "addi 2, 0, 0x0000",
+ "addi 4, 0, 0x001",
+ "addi 5, 0, 0x102",
+ "addi 6, 0, 0x203",
+ "addi 7, 0, 0x304",
+ "addi 8, 0, 0x405",
+ "addi 9, 0, 0x506",
+ "addi 10, 0, 0x607",
+ "addi 11, 0, 0x708",
+ "sv.stw 4.v, 0(1)", # scalar r1 + 0 + wordlen*offs
+ "svshape 8, 1, 1, 6, 0",
+ "svremap 1, 0, 0, 0, 0, 0, 0, 1",
+ #"setvl 0, 0, 8, 0, 1, 1",
+ "sv.lwzbr 12.v, 4(1), 2",
+ #"sv.lwz 12.v, 0(1)" # bit-reversed
+ ])
+ lst = list(lst)
+
+ # SVSTATE (in this case, VL=8)
+ svstate = SVP64State()
+ svstate.vl = 8 # VL
+ svstate.maxvl = 8 # MAXVL
+ print ("SVSTATE", bin(svstate.asint()))
+
+ regs = [0] * 64
+
+ avi = [0x001, 0x102, 0x203, 0x304, 0x405, 0x506, 0x607, 0x708]
+ n = len(avi)
+ levels = n.bit_length() - 1
+ ri = list(range(n))
+ ri = [ri[reverse_bits(i, levels)] for i in range(n)]
+ av = halfrev2(avi, False)
+ av = [av[ri[i]] for i in range(n)]
+
+ with Program(lst, bigendian=False) as program:
+ sim = self.run_tst_program(program, svstate=svstate,
+ initial_regs=regs)
+ mem = sim.mem.dump(printout=False)
+ print ("Mem")
+ print (mem)
+
+ self.assertEqual(mem, [(16, 0x010200000001),
+ (24, 0x030400000203),
+ (32, 0x050600000405),
+ (40, 0x070800000607)])
+ # from STs
+ for i in range(len(avi)):
+ print ("st gpr", i, sim.gpr(i+4), hex(avi[i]))
+ self.assertEqual(sim.gpr(i+4), avi[i])
+ self.assertEqual(sim.gpr(5), SelectableInt(0x102, 64))
+ self.assertEqual(sim.gpr(6), SelectableInt(0x203, 64))
+ self.assertEqual(sim.gpr(7), SelectableInt(0x304, 64))
+ self.assertEqual(sim.gpr(8), SelectableInt(0x405, 64))
+ self.assertEqual(sim.gpr(9), SelectableInt(0x506, 64))
+ self.assertEqual(sim.gpr(10), SelectableInt(0x607, 64))
+ self.assertEqual(sim.gpr(11), SelectableInt(0x708, 64))
+ # combination of bit-reversed load with a DCT half-swap REMAP
+ # schedule
+ for i in range(len(avi)):
+ print ("ld gpr", i, sim.gpr(i+12), hex(av[i]))
+ self.assertEqual(sim.gpr(i+12), av[i])
+
def run_tst_program(self, prog, initial_regs=None,
svstate=None, initial_fprs=None):
if initial_regs is None: