From: Konstantinos Margaritis Date: Fri, 23 Sep 2022 07:33:37 +0000 (+0000) Subject: add new SVP64 function X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4720370cebca592bb72ab53a7ab0cadbf4bcd876;p=openpower-isa.git add new SVP64 function --- diff --git a/media/video/libvpx/variance_svp64_real.c.in b/media/video/libvpx/variance_svp64_real.c.in new file mode 100644 index 00000000..642f27ae --- /dev/null +++ b/media/video/libvpx/variance_svp64_real.c.in @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdint.h> + +void variance_svp64_real(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int diff = src_ptr[j] - ref_ptr[j]; + *sum += diff; + *sse += diff * diff; + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} diff --git a/media/video/libvpx/variance_svp64_real.s b/media/video/libvpx/variance_svp64_real.s new file mode 100644 index 00000000..4242f68e --- /dev/null +++ b/media/video/libvpx/variance_svp64_real.s @@ -0,0 +1,76 @@ +.set src_ptr, 3 +.set src_stride, 4 +.set ref_ptr, 5 +.set ref_stride, 6 +.set width, 7 +.set height, 8 +.set sse_ptr, 9 +.set sum_ptr, 10 +.set sum, 11 +.set sse, 12 +.set ctr, 13 +.set src_col, 14 +.set ref_col, 15 +.set row, 16 +.set src, 20 +.set ref, 36 +.set diff, 52 +.set prod, 68 + + .machine libresoc + .file "variance_svp64_real.c" + .abiversion 2 + .section ".text" + .align 2 + .globl variance_svp64_real + .type variance_svp64_real, @function 
+variance_svp64_real: +.LFB0: + .cfi_startproc + # Set sum to zero + li sum, 0 # Set sum to zero + li sse, 0 # Set sse to zero + mr row, height # Set row to height + sldi src_stride, src_stride, 1 # strides are for 16-bit elements + sldi ref_stride, ref_stride, 1 # we need to increase by bytes + srdi ctr, width, 2 + mtctr ctr + setvl 0,0,4,0,1,1 # Set VL to 4 elements + +.L1: # outer loop: for (r=0; r < h; r++) + +.L2: # inner loop: for (c=0; c < w; c += 4) + # Load 4 elements from src_ptr and ref_ptr, at groups of 4 + mr src_col, src_ptr # Temporary variables + mr ref_col, ref_ptr + sv.lha *src, 0(src_col) # Load 4 ints from (src_ptr) + sv.lha *ref, 0(ref_col) # Load 4 ints from (ref_ptr) + addi src_col, src_col, 8 # Increment src, ref by 8 bytes + addi ref_col, ref_col, 8 + + # equivalent to: for (i = 0; i < 4; i++) diff[i] = ref[i] - src[i]; (subf computes RB - RA) + sv.subf *diff, *src, *ref + # equivalent to: for (i = 0; i < 4; i++) prod[i] = diff[i] * diff[i]; + sv.mulld *prod, *diff, *diff + # equivalent to: for (i = 0; i < 4; i++) sum += diff[i]; + sv.add/mr sum, *diff, sum + # equivalent to: for (i = 0; i < 4; i++) sse += prod[i]; + sv.add/mr sse, *prod, sse + + bdnz .L2 # Loop until CTR is zero + add src_ptr, src_ptr, src_stride # Advance src_ptr by src_stride + add ref_ptr, ref_ptr, ref_stride # Advance ref_ptr by ref_stride + + subi row, row, 1 # Subtract 1 from row + cmpwi cr1, row, 0 # Is row zero? 
+ bne cr1, .L1 # Go back to L1 if not done + std sum, 0(sum_ptr) # Set (sum_ptr) to sum + std sse, 0(sse_ptr) # Set (sse_ptr) to sse + blr + .long 0 + .byte 0,0,0,0,0,3,0,0 + .cfi_endproc +.LFE0: + .size variance_svp64_real,.-variance_svp64_real + .ident "GCC: (Debian 8.3.0-6) 8.3.0" + .section .note.GNU-stack,"",@progbits diff --git a/media/video/libvpx/variance_svp64_wrappers.c b/media/video/libvpx/variance_svp64_wrappers.c index ed459040..38828e10 100644 --- a/media/video/libvpx/variance_svp64_wrappers.c +++ b/media/video/libvpx/variance_svp64_wrappers.c @@ -4,6 +4,7 @@ #include "pypowersim_wrapper_common.h" #include "variance_svp64_wrappers.h" +#include "variance_ref.h" uint32_t vpx_get_mb_ss_svp64(const int16_t *src_ptr) { // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM @@ -66,8 +67,6 @@ uint32_t vpx_get_mb_ss_svp64(const int16_t *src_ptr) { uint32_t vpx_get4x4sse_cs_svp64(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { -// vpx_get4x4sse_cs_svp64_ref(src_ptr, src_stride, ref_ptr, ref_stride); - // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM // we have to memcpy from src_ptr to this pointer, the address was chosen arbitrarily uint64_t src_ptr_svp64 = 0x100000; @@ -145,20 +144,122 @@ uint32_t vpx_get4x4sse_cs_svp64(const uint8_t *src_ptr, int src_stride, // Return value return (uint32_t) val; +} + +void variance_svp64(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + + int sse2, sum2; + variance_c(src_ptr, src_stride, ref_ptr, ref_stride, w, h, &sse2, &sum2); + printf("src_ptr: %p, src_stride: %d, ref_ptr: %p, ref_stride: %d, w: %d, h: %d, sse_ptr: %p, sum_ptr: %p, sse2: %d, sum2: %d\n", + src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum, sse2, sum2); + // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM + // we 
have to memcpy from src_ptr to this pointer, the address was chosen arbitrarily + uint64_t src_ptr_svp64 = 0x100000; + uint64_t ref_ptr_svp64 = 0x200000; + uint64_t sse_ptr_svp64 = 0x300000; + uint64_t sum_ptr_svp64 = 0x300008; - int distortion = 0; - int r, c; + // Create the pypowersim_state + pypowersim_state_t *state = pypowersim_prepare(); - for (r = 0; r < 4; ++r) { - for (c = 0; c < 4; ++c) { - int diff = src_ptr[c] - ref_ptr[c]; - distortion += diff * diff; + // Change the relevant elements, mandatory: body + state->binary = PyBytes_FromStringAndSize((const char *)&variance_svp64_real, 1000); + // Set GPR #3 to the src_ptr + PyObject *src_address = PyLong_FromLongLong(src_ptr_svp64); + PyList_SetItem(state->initial_regs, 3, src_address); + // Load data into buffer from real memory + for (int r=0; r < 4; r++) { + PyObject *address = PyLong_FromLongLong(src_ptr_svp64); + uint64_t val = src_ptr[0]; + val |= (uint64_t)(src_ptr[1]) << 16; + val |= (uint64_t)(src_ptr[2]) << 32; + val |= (uint64_t)(src_ptr[3]) << 48; + PyObject *word = PyLong_FromLongLong(val); + PyDict_SetItem(state->initial_mem, address, word); + src_ptr += src_stride; + src_ptr_svp64 += 8; } - src_ptr += src_stride; - ref_ptr += ref_stride; - } + // Set GPR #4 to the src_stride + PyList_SetItem(state->initial_regs, 4, PyLong_FromLongLong(src_stride)); - return distortion; -} + // Set GPR #5 to the ref_ptr + PyObject *ref_address = PyLong_FromLongLong(ref_ptr_svp64); + PyList_SetItem(state->initial_regs, 5, ref_address); + // Load data into buffer from real memory + for (int r=0; r < 4; r++) { + PyObject *address = PyLong_FromLongLong(ref_ptr_svp64); + uint64_t val = ref_ptr[0]; + val |= (uint64_t)(ref_ptr[1]) << 16; + val |= (uint64_t)(ref_ptr[2]) << 32; + val |= (uint64_t)(ref_ptr[3]) << 48; + //printf("ref: %p -> %04x %04x %04x %04x, val: %016lx -> %p\n", ref_ptr, ref_ptr[0], ref_ptr[1], ref_ptr[2], ref_ptr[3], val, ref_ptr_svp64); + PyObject *word = PyLong_FromLongLong(val); + 
PyDict_SetItem(state->initial_mem, address, word); + ref_ptr += ref_stride; + ref_ptr_svp64 += 8; + } + + // Set GPR #6 to the ref_stride + PyList_SetItem(state->initial_regs, 6, PyLong_FromLongLong(ref_stride)); + // Set GPR #7 to the width + PyList_SetItem(state->initial_regs, 7, PyLong_FromLongLong(w)); + // Set GPR #8 to the height + PyList_SetItem(state->initial_regs, 8, PyLong_FromLongLong(h)); + // Set GPR #9 to the sse pointer + PyList_SetItem(state->initial_regs, 9, PyLong_FromLongLong(sse_ptr_svp64)); + // Set GPR #10 to the sum pointer + PyList_SetItem(state->initial_regs, 10, PyLong_FromLongLong(sum_ptr_svp64)); + + PyObject *sse_address = PyLong_FromLongLong(sse_ptr_svp64); + PyObject *sum_address = PyLong_FromLongLong(sum_ptr_svp64); + PyObject *word = PyLong_FromLongLong(0); + PyDict_SetItem(state->initial_mem, sse_address, word); + PyDict_SetItem(state->initial_mem, sum_address, word); + + // Prepare the arguments object for the call + pypowersim_prepareargs(state); + + // Call the function and get the resulting object + state->result_obj = PyObject_CallObject(state->simulator, state->args); + Py_DECREF(state->simulator); + Py_DECREF(state->args); + if (!state->result_obj) { + PyErr_Print(); + printf("Error invoking 'run_a_simulation'\n"); + } + // Get the GPRs from the result_obj + PyObject *final_regs = PyObject_GetAttrString(state->result_obj, "gpr"); + if (!final_regs) { + PyErr_Print(); + Py_DECREF(state->result_obj); + printf("Error getting final GPRs\n"); + } + + PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem"); + if (!memobj) { + PyErr_Print(); + Py_DECREF(state->result_obj); + printf("Error getting mem object\n"); + } + + PyObject *mem = PyObject_GetAttrString(memobj, "mem"); + if (!mem) { + PyErr_Print(); + Py_DECREF(state->result_obj); + printf("Error getting mem dict\n"); + } + + sse_address = PyLong_FromLongLong(sse_ptr_svp64/8); + sum_address = PyLong_FromLongLong(sum_ptr_svp64/8); + + PyObject *sse_val = 
PyDict_GetItem(mem, sse_address); + *sse = (uint32_t) PyLong_AsLongLong(sse_val); + + PyObject *sum_val = PyDict_GetItem(mem, sum_address); + *sum = (int32_t)PyLong_AsLongLong(sum_val); + printf("sse: %d/%08x, sum: %d/%08x\n", *sse, *sse, *sum, *sum); +} diff --git a/media/video/libvpx/variance_svp64_wrappers.h b/media/video/libvpx/variance_svp64_wrappers.h index 4441dcce..1783096e 100644 --- a/media/video/libvpx/variance_svp64_wrappers.h +++ b/media/video/libvpx/variance_svp64_wrappers.h @@ -5,3 +5,7 @@ uint32_t vpx_get_mb_ss_svp64_real(const int16_t *src_ptr); uint32_t vpx_get4x4sse_cs_svp64_real(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +void variance_svp64_real(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum); +