--- /dev/null
+.set src_ptr, 3 # in: address of the current source row (advanced by src_stride per row)
+.set src_stride, 4 # in: source stride in 16-bit elements; doubled to bytes on entry
+.set ref_ptr, 5 # in: address of the current reference row (advanced by ref_stride per row)
+.set ref_stride, 6 # in: reference stride in 16-bit elements; doubled to bytes on entry
+.set width, 7 # in: block width; the kernel works in groups of 4 (width >> 2)
+.set height, 8 # in: block height; copied into row
+.set sse_ptr, 9 # in: address that receives the 64-bit sse result
+.set sum_ptr, 10 # in: address that receives the 64-bit sum result
+.set sum, 11 # accumulator: sum of the element differences
+.set sse, 12 # accumulator: sum of the squared differences
+.set ctr, 13 # width >> 2, the value moved into CTR for the column loop
+.set src_col, 14 # working source address used by the vector load, stepped 8 bytes per group
+.set ref_col, 15 # working reference address used by the vector load, stepped 8 bytes per group
+.set row, 16 # rows remaining; counts down from height to zero
+.set src, 20 # first GPR of the vector that receives the loaded source elements
+.set ref, 36 # first GPR of the vector that receives the loaded reference elements
+.set diff, 52 # first GPR of the vector holding the per-element differences
+.set prod, 68 # first GPR of the vector holding the per-element squared differences
+
+ .machine libresoc
+ .file "variance_svp64_real.c"
+ .abiversion 2
+ .section ".text"
+ .align 2
+ .globl variance_svp64_real
+ .type variance_svp64_real, @function
+variance_svp64_real:
+.LFB0:
+        .cfi_startproc
+        # ---------------------------------------------------------------
+        # variance_svp64_real: accumulate sum(src - ref) and
+        # sum((src - ref)^2) over a width x height block of 16-bit
+        # elements, four elements per SVP64 vector operation.
+        #
+        # In:   src_ptr/ref_ptr       addresses of the first rows
+        #       src_stride/ref_stride row strides in 16-bit elements
+        #       width, height         block size; width is consumed in
+        #                             groups of 4 (width >> 2 groups)
+        #       sse_ptr, sum_ptr      addresses of the 64-bit results
+        # Out:  (sum_ptr) = sum, (sse_ptr) = sse, both stored with std
+        # Clobbers: every register named by the .set aliases, CTR, CR1
+        # NOTE(review): r13-r16 and the vector registers are used without
+        # being saved; fine under the simulator, but confirm before
+        # calling this from ABI-conforming code.
+        # ---------------------------------------------------------------
+        li      sum, 0                          # sum = 0
+        li      sse, 0                          # sse = 0
+        mr      row, height                     # row = height
+        sldi    src_stride, src_stride, 1       # strides count 16-bit elements,
+        sldi    ref_stride, ref_stride, 1       # convert them to bytes
+        srdi    ctr, width, 2                   # ctr = number of 4-element groups per row
+        setvl   0,0,4,0,1,1                     # Set VL to 4 elements
+
+        # Degenerate blocks would make CTR or row wrap around; store zeros instead.
+        cmpwi   cr1, width, 4
+        blt     cr1, .L3                        # fewer than one full group per row
+        cmpwi   cr1, row, 0
+        ble     cr1, .L3                        # no rows to process
+
+.L1:    # outer loop: for (r = 0; r < h; r++)
+        mr      src_col, src_ptr                # restart the column cursors at the row start
+        mr      ref_col, ref_ptr
+        mtctr   ctr                             # CTR is consumed by bdnz, reload it per row
+
+.L2:    # inner loop: for (c = 0; c < w; c += 4)
+        sv.lha  *src, 0(src_col)                # Load 4 halfwords from (src_col)
+        sv.lha  *ref, 0(ref_col)                # Load 4 halfwords from (ref_col)
+        addi    src_col, src_col, 8             # Advance both cursors by 4 * 2 bytes
+        addi    ref_col, ref_col, 8
+
+        # for (i = 0; i < 4; i++) diff[i] = src[i] - ref[i];
+        # subf RT,RA,RB computes RB - RA, so ref goes in the RA slot.
+        sv.subf *diff, *ref, *src
+        # for (i = 0; i < 4; i++) prod[i] = diff[i] * diff[i];
+        sv.mulld *prod, *diff, *diff
+        # for (i = 0; i < 4; i++) sum += diff[i];
+        sv.add/mr sum, *diff, sum
+        # for (i = 0; i < 4; i++) sse += prod[i];
+        sv.add/mr sse, *prod, sse
+
+        bdnz    .L2                             # Loop until CTR is zero
+        add     src_ptr, src_ptr, src_stride    # Advance src_ptr to the next row
+        add     ref_ptr, ref_ptr, ref_stride    # Advance ref_ptr to the next row
+
+        subi    row, row, 1                     # One row done
+        cmpwi   cr1, row, 0                     # Any rows left?
+        bne     cr1, .L1                        # Go back to L1 if not done
+
+.L3:
+        std     sum, 0(sum_ptr)                 # Set (sum_ptr) to sum
+        std     sse, 0(sse_ptr)                 # Set (sse_ptr) to sse
+        blr
+        .long 0
+        .byte 0,0,0,0,0,3,0,0
+        .cfi_endproc
+.LFE0:
+        .size variance_svp64_real,.-variance_svp64_real
+ .ident "GCC: (Debian 8.3.0-6) 8.3.0"
+ .section .note.GNU-stack,"",@progbits
#include "pypowersim_wrapper_common.h"
#include "variance_svp64_wrappers.h"
+#include "variance_ref.h"
uint32_t vpx_get_mb_ss_svp64(const int16_t *src_ptr) {
// It cannot be the same pointer as the original function, as it is really a separate CPU/RAM
uint32_t vpx_get4x4sse_cs_svp64(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
-// vpx_get4x4sse_cs_svp64_ref(src_ptr, src_stride, ref_ptr, ref_stride);
-
// It cannot be the same pointer as the original function, as it is really a separate CPU/RAM
// we have to memcpy from src_ptr to this pointer, the address was chosen arbitrarily
uint64_t src_ptr_svp64 = 0x100000;
// Return value
return (uint32_t) val;
+}
+
+void variance_svp64(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride, int w, int h,
+                    uint32_t *sse, int *sum) {
+  // Runs variance_svp64_real inside the pypowersim simulator on a w x h block.
+  //
+  // src_ptr/ref_ptr: host pixel rows, src_stride/ref_stride bytes apart.
+  // w, h:            block size; only w / 4 full groups of 4 pixels are copied,
+  //                  matching the kernel's width >> 2 iteration count.
+  // sse, sum:        receive the low 32 bits of the simulated results. Both are
+  //                  set to 0 if the simulation or the result lookup fails.
+  //
+  // The reference variance_c() is also run and printed for comparison.
+  *sse = 0;
+  *sum = 0;
+
+  // NOTE(review): variance_c's prototype is not visible here; confirm that
+  // int* is the right type for its sse argument.
+  int sse2, sum2;
+  variance_c(src_ptr, src_stride, ref_ptr, ref_stride, w, h, &sse2, &sum2);
+  printf("src_ptr: %p, src_stride: %d, ref_ptr: %p, ref_stride: %d, w: %d, h: %d, sse_ptr: %p, sum_ptr: %p, sse2: %d, sum2: %d\n",
+         src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum, sse2, sum2);
+  // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM
+  // we have to memcpy from src_ptr to this pointer, the address was chosen arbitrarily
+  uint64_t src_ptr_svp64 = 0x100000;
+  uint64_t ref_ptr_svp64 = 0x200000;
+  uint64_t sse_ptr_svp64 = 0x300000;
+  uint64_t sum_ptr_svp64 = 0x300008;
- int distortion = 0;
- int r, c;
+  // Create the pypowersim_state
+  pypowersim_state_t *state = pypowersim_prepare();
- for (r = 0; r < 4; ++r) {
- for (c = 0; c < 4; ++c) {
- int diff = src_ptr[c] - ref_ptr[c];
- distortion += diff * diff;
+  // Change the relevant elements, mandatory: body
+  // NOTE(review): 1000 bytes is an arbitrary upper bound on the kernel size.
+  state->binary = PyBytes_FromStringAndSize((const char *)&variance_svp64_real, 1000);
+
+  // Each pixel is widened to 16 bits and four of them are packed into one
+  // 64-bit word, so a simulated row is `groups` words long and its stride,
+  // in the 16-bit elements the kernel expects, is groups * 4.
+  const int groups = w / 4;
+  const int packed_stride = groups * 4;
+
+  // Set GPR #3 to the src_ptr (PyList_SetItem takes ownership of the value)
+  PyList_SetItem(state->initial_regs, 3, PyLong_FromLongLong(src_ptr_svp64));
+  // Load data into buffer from real memory
+  uint64_t src_cursor = src_ptr_svp64;
+  for (int r = 0; r < h; r++) {
+    for (int g = 0; g < groups; g++) {
+      const uint8_t *px = src_ptr + 4 * g;
+      uint64_t val = px[0];
+      val |= (uint64_t)(px[1]) << 16;
+      val |= (uint64_t)(px[2]) << 32;
+      val |= (uint64_t)(px[3]) << 48;
+      PyObject *address = PyLong_FromLongLong(src_cursor);
+      PyObject *word = PyLong_FromLongLong(val);
+      if (address && word)
+        PyDict_SetItem(state->initial_mem, address, word);
+      // PyDict_SetItem does not steal references, release ours
+      Py_XDECREF(address);
+      Py_XDECREF(word);
+      src_cursor += 8;
     }
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
+    src_ptr += src_stride;
+  }
+  // Set GPR #4 to the stride of the packed copy, not of the host buffer
+  PyList_SetItem(state->initial_regs, 4, PyLong_FromLongLong(packed_stride));
- return distortion;
-}
+  // Set GPR #5 to the ref_ptr
+  PyList_SetItem(state->initial_regs, 5, PyLong_FromLongLong(ref_ptr_svp64));
+  // Load data into buffer from real memory
+  uint64_t ref_cursor = ref_ptr_svp64;
+  for (int r = 0; r < h; r++) {
+    for (int g = 0; g < groups; g++) {
+      const uint8_t *px = ref_ptr + 4 * g;
+      uint64_t val = px[0];
+      val |= (uint64_t)(px[1]) << 16;
+      val |= (uint64_t)(px[2]) << 32;
+      val |= (uint64_t)(px[3]) << 48;
+      PyObject *address = PyLong_FromLongLong(ref_cursor);
+      PyObject *word = PyLong_FromLongLong(val);
+      if (address && word)
+        PyDict_SetItem(state->initial_mem, address, word);
+      Py_XDECREF(address);
+      Py_XDECREF(word);
+      ref_cursor += 8;
+    }
+    ref_ptr += ref_stride;
+  }
+
+  // Set GPR #6 to the stride of the packed reference copy
+  PyList_SetItem(state->initial_regs, 6, PyLong_FromLongLong(packed_stride));
+  // Set GPR #7 to the width
+  PyList_SetItem(state->initial_regs, 7, PyLong_FromLongLong(w));
+  // Set GPR #8 to the height
+  PyList_SetItem(state->initial_regs, 8, PyLong_FromLongLong(h));
+  // Set GPR #9 to the sse pointer
+  PyList_SetItem(state->initial_regs, 9, PyLong_FromLongLong(sse_ptr_svp64));
+  // Set GPR #10 to the sum pointer
+  PyList_SetItem(state->initial_regs, 10, PyLong_FromLongLong(sum_ptr_svp64));
+
+  // Zero the two result slots in simulator memory
+  PyObject *sse_address = PyLong_FromLongLong(sse_ptr_svp64);
+  PyObject *sum_address = PyLong_FromLongLong(sum_ptr_svp64);
+  PyObject *word = PyLong_FromLongLong(0);
+  if (sse_address && sum_address && word) {
+    PyDict_SetItem(state->initial_mem, sse_address, word);
+    PyDict_SetItem(state->initial_mem, sum_address, word);
+  }
+  Py_XDECREF(sse_address);
+  Py_XDECREF(sum_address);
+  Py_XDECREF(word);
+
+  // Prepare the arguments object for the call
+  pypowersim_prepareargs(state);
+
+  // Call the function and get the resulting object
+  state->result_obj = PyObject_CallObject(state->simulator, state->args);
+  Py_DECREF(state->simulator);
+  Py_DECREF(state->args);
+  if (!state->result_obj) {
+    PyErr_Print();
+    printf("Error invoking 'run_a_simulation'\n");
+    return;
+  }
+
+  // The simulated memory is reached as result_obj.mem.mem
+  PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem");
+  if (!memobj) {
+    PyErr_Print();
+    Py_DECREF(state->result_obj);
+    state->result_obj = NULL;
+    printf("Error getting mem object\n");
+    return;
+  }
+
+  PyObject *mem = PyObject_GetAttrString(memobj, "mem");
+  Py_DECREF(memobj);
+  if (!mem) {
+    PyErr_Print();
+    Py_DECREF(state->result_obj);
+    state->result_obj = NULL;
+    printf("Error getting mem dict\n");
+    return;
+  }
+
+  // The result dict is looked up by 64-bit word index, hence address / 8
+  sse_address = PyLong_FromLongLong(sse_ptr_svp64/8);
+  sum_address = PyLong_FromLongLong(sum_ptr_svp64/8);
+
+  // PyDict_GetItem returns a borrowed reference, or NULL when the key is absent
+  PyObject *sse_val = sse_address ? PyDict_GetItem(mem, sse_address) : NULL;
+  PyObject *sum_val = sum_address ? PyDict_GetItem(mem, sum_address) : NULL;
+  if (sse_val && sum_val) {
+    // The kernel stores 64-bit words; a negative sum has its top bit set and
+    // would overflow PyLong_AsLongLong, so read the raw bits and truncate.
+    *sse = (uint32_t) PyLong_AsUnsignedLongLongMask(sse_val);
+    *sum = (int32_t) PyLong_AsUnsignedLongLongMask(sum_val);
+  } else {
+    printf("Error reading sse/sum from simulator memory\n");
+  }
+  Py_XDECREF(sse_address);
+  Py_XDECREF(sum_address);
+  Py_DECREF(mem);
+
+  printf("sse: %d/%08x, sum: %d/%08x\n", *sse, *sse, *sum, *sum);
+}