###### popcount #######
with m.Case(InternalOp.OP_POPCNT):
- pc2 = array_of(32, 2)
- pc4 = array_of(16, 3)
- pc8 = array_of(8, 4)
- pc16 = array_of(4, 5)
- pc32 = array_of(2, 6)
- popcnt = Signal(64, reset_less=True)
- for i in range(32):
- stt, end = i*2, i*2+1
- comb += pc2[i].eq(Cat(a[stt], Const(0, 1)) +
- Cat(a[end], Const(0, 1)))
- for i in range(16):
- stt, end = i*2, i*2+1
- comb += pc4[i].eq(Cat(pc2[stt], Const(0, 1)) +
- Cat(pc2[end], Const(0, 1)))
- for i in range(8):
- stt, end = i*2, i*2+1
- comb += pc8[i].eq(Cat(pc4[stt], Const(0, 1)) +
- Cat(pc4[end], Const(0, 1)))
- for i in range(4):
- stt, end = i*2, i*2+1
- comb += pc16[i].eq(Cat(pc8[stt], Const(0, 1)) +
- Cat(pc8[end], Const(0, 1)))
- for i in range(2):
- stt, end = i*2, i*2+1
- comb += pc32[i].eq(Cat(pc16[stt], Const(0, 1)) +
- Cat(pc16[end], Const(0, 1)))
+ # popcount as a pairwise adder tree: level 0 is a itself, and every
+ # further level holds the sums of adjacent pairs of the level before.
+ # starting from a, perform successive addition-reductions
+ pc = [a]
+ # (number of entries, bits per entry) for each level of the tree.
+ # an entry that sums 2**k input bits can reach the value 2**k, so it
+ # needs k+1 bits: the final level must hold 64 and is 7 bits wide.
+ # NOTE(review): assumes array_of(count, width) returns `count` signals
+ # of `width` bits each - confirm against its definition.
+ work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+ for count, width in work:
+ pc.append(array_of(count, width))
+ pc8 = pc[3] # 8 counts, one per input byte, 4 bits each (popcntb)
+ pc32 = pc[5] # 2 counts, one per input word, 6 bits each (popcntw)
+ popcnt = pc[-1] # 1 count of the whole doubleword, 7 bits (popcntd)
+ # cascade-tree of adds
+ for idx, (count, _) in enumerate(work):
+ src, dst = pc[idx], pc[idx+1]
+ for i in range(count):
+ stt, end = i*2, i*2+1
+ # Cat(x, Const(0, 1)) appends a zero high bit to each operand
+ comb += dst[i].eq(Cat(src[stt], Const(0, 1)) +
+ Cat(src[end], Const(0, 1)))
+ # decode operation length
with m.If(self.i.ctx.op.data_len[2:4] == 0b00):
- # popcntb
+ # popcntb - pack 8x 4-bit answers into output
for i in range(8):
- comb += popcnt[i*8:i*8+4].eq(pc8[i])
+ comb += o[i*8:i*8+4].eq(pc8[i])
with m.Elif(self.i.ctx.op.data_len[3] == 0):
- # popcntw
+ # popcntw - pack 2x 6-bit answers into output
+ # (a word of all ones counts to 32, which needs bit 5)
for i in range(2):
- comb += popcnt[i*32:i*32+5].eq(pc32[i])
+ comb += o[i*32:i*32+6].eq(pc32[i])
with m.Else():
- comb += popcnt.eq(Cat(pc32[0], Const(0, 1)) +
- Cat(pc32[1], Const(0, 1)))
- comb += o.eq(popcnt)
+ # popcntd - put 1x 7-bit answer into output
+ comb += o.eq(popcnt[0])
###### parity #######
# TODO with m.Case(InternalOp.OP_PRTY):