vpx_get_mb_ss_svp64_real:
.LFB0:
.cfi_startproc
- li sum, 0
- li ctr, 8
- mtctr ctr
- setvl 0,0,32,0,1,1 # Set VL to 64 elements
+ # Sum of squares of 8 x 32 = 256 sign-extended halfwords read from (in)
+ li sum, 0 # sum = 0, the running total of the squares
+ li ctr, 8 # Need 8 iterations of 32 elements
+ mtctr ctr # CTR = 8, decremented by bdnz at the bottom of the loop
+ setvl 0,0,32,0,1,1 # Set VL to 32 elements
.L2:
- # Load 32 ints from (in)
- sv.lha *src, 0(in)
+ sv.lha *src, 0(in) # Load 32 halfwords from (in), each sign-extended by lha
# equivalent to: for (i = 0; i < 32; i++) prod[i] = src[i] * src[i];
sv.mulld *prod, *src, *src
# equivalent to: for (i = 0; i < 32; i++) sum += prod[i];
sv.add/mr sum, *prod, sum
- addi in, in, 64
-# rldicl in,ctr,0,32
- bdnz .L2
- li in, 0
- addi in, sum, 0
+ addi in, in, 64 # Advance (in) by 64 bytes = 32 halfwords
+ bdnz .L2 # Decrement CTR and loop until it reaches zero
+ li in, 0 # NOTE(review): looks redundant, the next instruction overwrites (in)
+ addi in, sum, 0 # in = sum + 0, copies the total into (in), presumably the return register (r3) - confirm
blr
.long 0
.byte 0,0,0,0,0,0,0,0