From: Konstantinos Margaritis
Date: Wed, 21 Sep 2022 14:28:49 +0000 (+0000)
Subject: Initial SVP64 attempt to vpx_get4x4sse_cs_svp64_real()
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=96d50f2126f119abfc5b274f321557db1914f805;p=openpower-isa.git

Initial SVP64 attempt to vpx_get4x4sse_cs_svp64_real()
---

diff --git a/media/video/libvpx/vpx_get4x4sse_cs_svp64_real.s b/media/video/libvpx/vpx_get4x4sse_cs_svp64_real.s
index 1d7c6e5d..cac95eee 100644
--- a/media/video/libvpx/vpx_get4x4sse_cs_svp64_real.s
+++ b/media/video/libvpx/vpx_get4x4sse_cs_svp64_real.s
@@ -1,3 +1,24 @@
+# vpx_get4x4sse_cs_svp64_real: sum of squared differences of two 4x4 blocks.
+#
+# In:   r3 = src_ptr, r4 = src_stride, r5 = ref_ptr, r6 = ref_stride
+# Out:  r3 = sum over 16 elements of (src[i] - ref[i])^2
+# Uses: r7 and the vector register groups named below.
+# NOTE(review): elements are loaded as unsigned bytes, matching the lbzu
+# loads of the compiler-generated scalar loop -- confirm against the C prototype.
+# NOTE(review): all vectors are kept in r36..r99 so that r13 and the ELFv2
+# callee-saved r14-r31 are never written; registers above r31 are assumed
+# to be caller-saved -- TODO confirm.
+.set src_ptr, 3
+.set src_stride, 4
+.set ref_ptr, 5
+.set ref_stride, 6
+.set sum, 7
+.set ref, 36
+.set diff, 52
+.set prod, 68
+.set src, 84
+
+	.machine libresoc
 	.file	"vpx_get4x4sse_cs_svp64_real.c"
 	.abiversion 2
 	.section	".text"
@@ -7,29 +28,34 @@
 vpx_get4x4sse_cs_svp64_real:
 .LFB0:
 	.cfi_startproc
-	addi 5,5,-1
-	addi 3,3,3
-	li 12,4
-	li 8,0
-.L2:
-	addi 7,3,-4
-	mr 11,5
-	subf 9,7,3
-	mtctr 9
-.L3:
-	lbzu 9,1(7)
-	lbzu 10,1(11)
-	subf 9,10,9
-	mullw 9,9,9
-	add 9,9,8
-	extsw 8,9
-	bdnz .L3
-	addi 9,12,-1
-	add 5,5,6
-	add 3,3,4
-	rldicl. 12,9,0,32
-	bne 0,.L2
-	rldicl 3,8,0,32
+	li sum, 0	# Accumulator for the reduction below
+	# Load both 4x4 blocks row by row: 4 consecutive bytes per row, rows
+	# src_stride / ref_stride bytes apart, into 16 consecutive registers.
+	setvl 0,0,4,0,1,1	# Set VL to 4 elements
+	sv.lbz *src, 0(src_ptr)	# Row 0: 4 bytes from (src_ptr)
+	sv.lbz *ref, 0(ref_ptr)	# Row 0: 4 bytes from (ref_ptr)
+	add src_ptr, src_ptr, src_stride	# Advance src_ptr by src_stride
+	add ref_ptr, ref_ptr, ref_stride	# Advance ref_ptr by ref_stride
+	sv.lbz *{src + 4}, 0(src_ptr)	# Row 1
+	sv.lbz *{ref + 4}, 0(ref_ptr)
+	add src_ptr, src_ptr, src_stride
+	add ref_ptr, ref_ptr, ref_stride
+	sv.lbz *{src + 8}, 0(src_ptr)	# Row 2
+	sv.lbz *{ref + 8}, 0(ref_ptr)
+	add src_ptr, src_ptr, src_stride
+	add ref_ptr, ref_ptr, ref_stride
+	sv.lbz *{src + 12}, 0(src_ptr)	# Row 3
+	sv.lbz *{ref + 12}, 0(ref_ptr)
+
+	# now our values are in consecutive registers and we can set VL to 16 elements
+	setvl 0,0,16,0,1,1
+	# equivalent to: for (i = 0; i < 16; i++) diff[i] = src[i] - ref[i];
+	sv.sub *diff, *src, *ref
+	# equivalent to: for (i = 0; i < 16; i++) prod[i] = diff[i] * diff[i];
+	sv.mulld *prod, *diff, *diff
+	# equivalent to: for (i = 0; i < 16; i++) sum += prod[i];
+	sv.add/mr sum, *prod, sum
+	mr 3, sum	# Set r3 to sum
 	blr
 	.long 0
 	.byte 0,0,0,0,0,0,0,0