pc = [a]
# first stage: 32x 2-bit sums (each adds a pair of 1-bit values), then 16x 3-bit, etc.
work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
- for l, bw in work:
+ for l, bw in work: # l=number of add-reductions, bw=bitwidth
pc.append(array_of(l, bw))
pc8 = pc[3] # array of 8 8-bit counts (popcntb)
pc32 = pc[5] # array of 2 32-bit counts (popcntw)
src, dst = pc[idx], pc[idx+1]
comb += dst[i].eq(Cat(src[stt], Const(0, 1)) +
Cat(src[end], Const(0, 1)))
- # decode operation length
+ # decode operation length (1-hot)
with m.If(data_len == 1):
- # popcntb - pack 8x 4-bit answers into output
+ # popcntb - pack 8x 4-bit answers into 8x 8-bit output fields
for i in range(8):
comb += o[i*8:(i+1)*8].eq(pc8[i])
with m.Elif(data_len == 4):
- # popcntw - pack 2x 5-bit answers into output
+ # popcntw - pack 2x 6-bit answers into 2x 32-bit output fields
for i in range(2):
comb += o[i*32:(i+1)*32].eq(pc32[i])
with m.Else():
- # popcntd - put 1x 6-bit answer into output
+ # popcntd - put 1x 7-bit answer into 64-bit output
comb += o.eq(popcnt[0])
return m