From: Konstantinos Margaritis <konstantinos.margaritis@vectorcamp.gr>
Date: Fri, 23 Sep 2022 07:33:37 +0000 (+0000)
Subject: add new SVP64 function
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4720370cebca592bb72ab53a7ab0cadbf4bcd876;p=openpower-isa.git

add new SVP64 function
---

diff --git a/media/video/libvpx/variance_svp64_real.c.in b/media/video/libvpx/variance_svp64_real.c.in
new file mode 100644
index 00000000..642f27ae
--- /dev/null
+++ b/media/video/libvpx/variance_svp64_real.c.in
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+
+void variance_svp64_real(const uint8_t *src_ptr, int src_stride,
+                         const uint8_t *ref_ptr, int ref_stride, int w, int h,
+                         uint32_t *sse, int *sum) {
+  /* *sum accumulates (src - ref); *sse accumulates (src - ref)^2. */
+  *sum = 0;
+  *sse = 0;
+
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const int delta = src_ptr[col] - ref_ptr[col];
+      *sum += delta;
+      *sse += delta * delta;
+    }
+
+    /* Step both planes down one row; each uses its own stride. */
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+}
diff --git a/media/video/libvpx/variance_svp64_real.s b/media/video/libvpx/variance_svp64_real.s
new file mode 100644
index 00000000..4242f68e
--- /dev/null
+++ b/media/video/libvpx/variance_svp64_real.s
@@ -0,0 +1,76 @@
+.set src_ptr, 3			# r3: address of the current src row
+.set src_stride, 4		# r4: src row stride (in elements on entry)
+.set ref_ptr, 5			# r5: address of the current ref row
+.set ref_stride, 6		# r6: ref row stride (in elements on entry)
+.set width, 7			# r7: block width in elements
+.set height, 8			# r8: block height in rows
+.set sse_ptr, 9			# r9: where the final sse is stored
+.set sum_ptr, 10		# r10: where the final sum is stored
+.set sum, 11			# r11: accumulator for the differences
+.set sse, 12			# r12: accumulator for the squared differences
+.set ctr, 13			# r13: width / 4, the value moved into CTR
+.set src_col, 14		# r14: address used by the src vector load
+.set ref_col, 15		# r15: address used by the ref vector load
+.set row, 16			# r16: rows left, counts down from height
+.set src, 20			# first GPR of the loaded src elements (VL = 4)
+.set ref, 36			# first GPR of the loaded ref elements
+.set diff, 52			# first GPR of the element differences
+.set prod, 68			# first GPR of the squared differences
+
+	.machine libresoc
+	.file	"variance_svp64_real.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.globl variance_svp64_real
+	.type	variance_svp64_real, @function
+variance_svp64_real:
+.LFB0:
+	.cfi_startproc
+	# In: r3=src r4=src_stride r5=ref r6=ref_stride r7=w r8=h r9=&sse r10=&sum
+	# Pixels are read as 16-bit elements; assumes w is a non-zero multiple of 4, h > 0.
+	# NOTE(review): r13-r16 and the vector GPRs are overwritten, never saved.
+	li sum, 0				# Set sum to zero
+	li sse, 0				# Set sse to zero
+	mr row, height				# Set row to height
+	sldi	src_stride, src_stride, 1	# strides are for 16-bit elements
+	sldi	ref_stride, ref_stride, 1	# we need to increase by bytes
+	srdi	ctr, width, 2			# Inner iterations per row: width / 4
+	setvl	0,0,4,0,1,1			# Set VL to 4 elements
+
+.L1:	# outer loop: for (r=0; r < h; r++)
+	mtctr	ctr				# bdnz leaves CTR at zero: reload per row
+	mr	src_col, src_ptr		# Column cursors restart at the row base
+	mr	ref_col, ref_ptr
+.L2:	# inner loop: for (c=0; c < w; c += 4)
+	sv.lha	*src, 0(src_col)		# Load 4 halfwords from (src_col)
+	sv.lha	*ref, 0(ref_col)		# Load 4 halfwords from (ref_col)
+	addi	src_col, src_col, 8		# Increment src, ref by 8 bytes
+	addi	ref_col, ref_col, 8
+
+	# equivalent to: for (i = 0; i < 4; i++) diff[i] = src[i] - ref[i];
+	sv.subf		*diff, *ref, *src	# subf RT,RA,RB computes RB - RA
+	# equivalent to: for (i = 0; i < 4; i++) prod[i] = diff[i] * diff[i];
+	sv.mulld 	*prod, *diff, *diff
+	# equivalent to: for (i = 0; i < 4; i++) sum += diff[i];
+	sv.add/mr	sum, *diff, sum
+	# equivalent to: for (i = 0; i < 4; i++) sse += prod[i];
+	sv.add/mr	sse, *prod, sse
+
+	bdnz .L2				# Loop until CTR is zero
+	add 	src_ptr, src_ptr, src_stride	# Advance src_ptr by src_stride
+	add 	ref_ptr, ref_ptr, ref_stride	# Advance ref_ptr by ref_stride
+
+	subi row, row, 1			# Subtract 1 from row
+	cmpwi cr1, row, 0			# Is row zero?
+	bne cr1, .L1				# Go back to L1 if not done
+	std sum, 0(sum_ptr)			# Store 64-bit sum to (sum_ptr)
+	std sse, 0(sse_ptr)			# Store 64-bit sse to (sse_ptr)
+	blr
+	.long 0
+	.byte 0,0,0,0,0,3,0,0
+	.cfi_endproc
+.LFE0:
+	.size	variance_svp64_real,.-variance_svp64_real
+	.ident	"GCC: (Debian 8.3.0-6) 8.3.0"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/media/video/libvpx/variance_svp64_wrappers.c b/media/video/libvpx/variance_svp64_wrappers.c
index ed459040..38828e10 100644
--- a/media/video/libvpx/variance_svp64_wrappers.c
+++ b/media/video/libvpx/variance_svp64_wrappers.c
@@ -4,6 +4,7 @@
 
 #include "pypowersim_wrapper_common.h"
 #include "variance_svp64_wrappers.h"
+#include "variance_ref.h"
 
 uint32_t vpx_get_mb_ss_svp64(const int16_t *src_ptr) {
     // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM
@@ -66,8 +67,6 @@ uint32_t vpx_get_mb_ss_svp64(const int16_t *src_ptr) {
 uint32_t vpx_get4x4sse_cs_svp64(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *ref_ptr, int ref_stride) {
 
-//    vpx_get4x4sse_cs_svp64_ref(src_ptr, src_stride, ref_ptr, ref_stride);
-
     // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM
     // we have to memcpy from src_ptr to this pointer, the address was chosen arbitrarily
     uint64_t src_ptr_svp64 = 0x100000;
@@ -145,20 +144,155 @@ uint32_t vpx_get4x4sse_cs_svp64(const uint8_t *src_ptr, int src_stride,
 
     // Return value
     return (uint32_t) val;
+}
+
+/* Runs the SVP64 variance kernel under pypowersim and stores its results in
+ * *sse and *sum.
+ *
+ * The w x h block is copied into simulator memory as densely packed 16-bit
+ * elements (4 pixels per 64-bit word), so the kernel is handed w as its row
+ * stride instead of the caller's strides. w must be a positive multiple of 4
+ * and h must be positive. *sse and *sum are left at 0 if anything fails. */
+void variance_svp64(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride, int w, int h,
+                    uint32_t *sse, int *sum) {
+
+    // Reference result from the plain C implementation, printed for comparison
+    uint32_t sse2;
+    int sum2;
+    variance_c(src_ptr, src_stride, ref_ptr, ref_stride, w, h, &sse2, &sum2);
+    printf("src_ptr: %p, src_stride: %d, ref_ptr: %p, ref_stride: %d, w: %d, h: %d, sse_ptr: %p, sum_ptr: %p, sse2: %u, sum2: %d\n",
+		    (const void *)src_ptr, src_stride, (const void *)ref_ptr, ref_stride, w, h,
+		    (void *)sse, (void *)sum, sse2, sum2);
+
+    *sse = 0;
+    *sum = 0;
+    // The kernel consumes 4 pixels per inner iteration and counts rows down to 0
+    if (w <= 0 || h <= 0 || (w % 4) != 0) {
+        printf("variance_svp64: unsupported block size %dx%d\n", w, h);
+        return;
+    }
+
+    // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM
+    // we have to memcpy from src_ptr to this pointer, the address was chosen arbitrarily
+    uint64_t src_ptr_svp64 = 0x100000;
+    uint64_t ref_ptr_svp64 = 0x200000;
+    uint64_t sse_ptr_svp64 = 0x300000;
+    uint64_t sum_ptr_svp64 = 0x300008;
 
-  int distortion = 0;
-  int r, c;
+    // Create the pypowersim_state
+    pypowersim_state_t *state = pypowersim_prepare();
 
-  for (r = 0; r < 4; ++r) {
-    for (c = 0; c < 4; ++c) {
-      int diff = src_ptr[c] - ref_ptr[c];
-      distortion += diff * diff;
+    // Change the relevant elements, mandatory: body
+    state->binary = PyBytes_FromStringAndSize((const char *)&variance_svp64_real, 1000);
+    // Set GPR #3 to the src_ptr (PyList_SetItem steals the reference)
+    PyList_SetItem(state->initial_regs, 3, PyLong_FromLongLong(src_ptr_svp64));
+    // Load data into buffer from real memory, 4 pixels per 64-bit word
+    for (int r = 0; r < h; r++) {
+      for (int c = 0; c < w; c += 4) {
+        PyObject *address = PyLong_FromLongLong(src_ptr_svp64);
+        uint64_t val = src_ptr[c];
+        val |= (uint64_t)(src_ptr[c + 1]) << 16;
+        val |= (uint64_t)(src_ptr[c + 2]) << 32;
+        val |= (uint64_t)(src_ptr[c + 3]) << 48;
+        PyObject *word = PyLong_FromLongLong(val);
+        PyDict_SetItem(state->initial_mem, address, word);
+        // PyDict_SetItem takes its own references, so release ours
+        Py_DECREF(address);
+        Py_DECREF(word);
+        src_ptr_svp64 += 8;
+      }
+      src_ptr += src_stride;
     }
 
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
+    // Set GPR #4 to the src row stride of the packed copy (w elements)
+    PyList_SetItem(state->initial_regs, 4, PyLong_FromLongLong(w));
 
-  return distortion;
-}
+    // Set GPR #5 to the ref_ptr
+    PyList_SetItem(state->initial_regs, 5, PyLong_FromLongLong(ref_ptr_svp64));
+    // Load data into buffer from real memory, same packing as src
+    for (int r = 0; r < h; r++) {
+      for (int c = 0; c < w; c += 4) {
+        PyObject *address = PyLong_FromLongLong(ref_ptr_svp64);
+        uint64_t val = ref_ptr[c];
+        val |= (uint64_t)(ref_ptr[c + 1]) << 16;
+        val |= (uint64_t)(ref_ptr[c + 2]) << 32;
+        val |= (uint64_t)(ref_ptr[c + 3]) << 48;
+        PyObject *word = PyLong_FromLongLong(val);
+        PyDict_SetItem(state->initial_mem, address, word);
+        Py_DECREF(address);
+        Py_DECREF(word);
+        ref_ptr_svp64 += 8;
+      }
+      ref_ptr += ref_stride;
+    }
+
+    // Set GPR #6 to the ref row stride of the packed copy (w elements)
+    PyList_SetItem(state->initial_regs, 6, PyLong_FromLongLong(w));
+    // Set GPR #7 to the width
+    PyList_SetItem(state->initial_regs, 7, PyLong_FromLongLong(w));
+    // Set GPR #8 to the height
+    PyList_SetItem(state->initial_regs, 8, PyLong_FromLongLong(h));
+    // Set GPR #9 to the sse pointer
+    PyList_SetItem(state->initial_regs, 9, PyLong_FromLongLong(sse_ptr_svp64));
+    // Set GPR #10 to the sum pointer
+    PyList_SetItem(state->initial_regs, 10, PyLong_FromLongLong(sum_ptr_svp64));
+
+    // Zero the two result doublewords in simulator memory
+    PyObject *sse_key = PyLong_FromLongLong(sse_ptr_svp64);
+    PyObject *sum_key = PyLong_FromLongLong(sum_ptr_svp64);
+    PyObject *zero = PyLong_FromLongLong(0);
+    PyDict_SetItem(state->initial_mem, sse_key, zero);
+    PyDict_SetItem(state->initial_mem, sum_key, zero);
+    Py_DECREF(sse_key);
+    Py_DECREF(sum_key);
+    Py_DECREF(zero);
+
+    // Prepare the arguments object for the call
+    pypowersim_prepareargs(state);
+
+    // Call the function and get the resulting object
+    state->result_obj = PyObject_CallObject(state->simulator, state->args);
+    Py_DECREF(state->simulator);
+    Py_DECREF(state->args);
+    if (!state->result_obj) {
+        PyErr_Print();
+        printf("Error invoking 'run_a_simulation'\n");
+        return;
+    }
 
+    // Get the memory object from the result_obj; stop on failure instead of using NULL
+    PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem");
+    if (!memobj) {
+        PyErr_Print();
+        Py_DECREF(state->result_obj);
+        printf("Error getting mem object\n");
+        return;
+    }
+
+    PyObject *mem = PyObject_GetAttrString(memobj, "mem");
+    Py_DECREF(memobj);
+    if (!mem) {
+        PyErr_Print();
+        Py_DECREF(state->result_obj);
+        printf("Error getting mem dict\n");
+        return;
+    }
+
+    // NOTE(review): result memory appears to be keyed by doubleword index, hence the / 8
+    PyObject *sse_address = PyLong_FromLongLong(sse_ptr_svp64/8);
+    PyObject *sum_address = PyLong_FromLongLong(sum_ptr_svp64/8);
+    // PyDict_GetItem returns borrowed references, or NULL when the key is absent
+    PyObject *sse_val = PyDict_GetItem(mem, sse_address);
+    PyObject *sum_val = PyDict_GetItem(mem, sum_address);
+    if (sse_val && sum_val) {
+        *sse = (uint32_t) PyLong_AsLongLong(sse_val);
+        *sum = (int32_t) PyLong_AsLongLong(sum_val);
+    } else {
+        printf("Error reading sse/sum from simulator memory\n");
+    }
+    Py_DECREF(sse_address);
+    Py_DECREF(sum_address);
+    Py_DECREF(mem);
+    printf("sse: %u/%08x, sum: %d/%08x\n", *sse, *sse, *sum, (unsigned)*sum);
+}
diff --git a/media/video/libvpx/variance_svp64_wrappers.h b/media/video/libvpx/variance_svp64_wrappers.h
index 4441dcce..1783096e 100644
--- a/media/video/libvpx/variance_svp64_wrappers.h
+++ b/media/video/libvpx/variance_svp64_wrappers.h
@@ -5,3 +5,7 @@ uint32_t vpx_get_mb_ss_svp64_real(const int16_t *src_ptr);
 uint32_t vpx_get4x4sse_cs_svp64_real(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride);
 
+void variance_svp64_real(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride, int w, int h,
+                    uint32_t *sse, int *sum);
+