From 175318267a1bd4480c0efc001d102b9ab1caa849 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos.margaritis@vectorcamp.gr>
Date: Tue, 27 Sep 2022 10:04:49 +0000
Subject: [PATCH] Working version of VP8 DCT4x4 in SVP64

---
 media/video/libvpx/Makefile              |  34 ++--
 media/video/libvpx/include/vp8_rtcd.h    |  13 ++
 media/video/libvpx/vp8_dct4x4_real.c.in  |  51 ++++++
 media/video/libvpx/vp8_dct4x4_real.s     | 111 +++++++++++++
 media/video/libvpx/vp8_dct4x4_ref.c      |  66 ++++++++
 media/video/libvpx/vp8_dct4x4_wrappers.c |  85 ++++++++++
 media/video/libvpx/vp8_dct4x4_wrappers.h |   3 +
 media/video/libvpx/vp8_fdct4x4_test.cc   | 194 +++++++++++++++++++++++
 8 files changed, 544 insertions(+), 13 deletions(-)
 create mode 100644 media/video/libvpx/include/vp8_rtcd.h
 create mode 100644 media/video/libvpx/vp8_dct4x4_real.c.in
 create mode 100644 media/video/libvpx/vp8_dct4x4_real.s
 create mode 100644 media/video/libvpx/vp8_dct4x4_ref.c
 create mode 100644 media/video/libvpx/vp8_dct4x4_wrappers.c
 create mode 100644 media/video/libvpx/vp8_dct4x4_wrappers.h
 create mode 100644 media/video/libvpx/vp8_fdct4x4_test.cc

diff --git a/media/video/libvpx/Makefile b/media/video/libvpx/Makefile
index 00a1fdd2..9d16c33a 100644
--- a/media/video/libvpx/Makefile
+++ b/media/video/libvpx/Makefile
@@ -1,4 +1,5 @@
-TARGET=libvpx_variance_test
+VPXTARGET=libvpx_variance_test
+VP8TARGET=vp8_dct_test
 EXAMPLE=pypowersim_wrapper_example
 
 CC=gcc
@@ -7,27 +8,34 @@ AS=powerpc64le-linux-gnu-as
 OBJCOPY=powerpc64le-linux-gnu-objcopy
 CFLAGS= -Iinclude -O -g3 -I/usr/include/python3.7m
 CXXFLAGS= -Iinclude -O -g3
-ASFLAGS= -mlibresoc
+ASFLAGS= -mlibresoc -mregnames
 LDFLAGS=-lgtest -pthread -lpython3.7m
 
-BINFILES = vpx_get_mb_ss_svp64_real.bin vpx_get4x4sse_cs_svp64_real.bin 
-ASFILES  = vpx_get_mb_ss_svp64_real.s vpx_get4x4sse_cs_svp64_real.s variance_svp64_real.s
-CFILES   = variance_ref.c  variancefuncs_svp64.c  variance_svp64_wrappers.c  vpx_mem.c
-CPPFILES = test_libvpx.cc  variance_test.cc
-EXAMPLEC = pypowersim_wrapper_example.c
-EXAMPLEOBJ= ${EXAMPLEC:.c=.o}
-OBJFILES = $(CFILES:.c=.o) $(CPPFILES:.cc=.o) $(ASFILES:.s=.o)
+BINFILES     = vpx_get_mb_ss_svp64_real.bin vpx_get4x4sse_cs_svp64_real.bin variance_svp64_real.bin vp8_dct4x4_real.bin
+VP8_ASFILES  = vp8_dct4x4_real.s
+VPX_ASFILES  = vpx_get_mb_ss_svp64_real.s vpx_get4x4sse_cs_svp64_real.s variance_svp64_real.s
+VP8_CFILES   = vp8_dct4x4_ref.c vp8_dct4x4_wrappers.c
+VPX_CFILES   = variance_ref.c  variancefuncs_svp64.c  variance_svp64_wrappers.c  vpx_mem.c
+VP8_CPPFILES = test_libvpx.cc  vp8_fdct4x4_test.cc
+VPX_CPPFILES = test_libvpx.cc  variance_test.cc
+EXAMPLEC     = pypowersim_wrapper_example.c
+EXAMPLEOBJ   = ${EXAMPLEC:.c=.o}
+VP8_OBJFILES = $(VP8_ASFILES:.s=.o) $(VP8_CFILES:.c=.o) $(VP8_CPPFILES:.cc=.o)
+VPX_OBJFILES = $(VPX_ASFILES:.s=.o) $(VPX_CFILES:.c=.o) $(VPX_CPPFILES:.cc=.o)
 
 %.bin: %.o
 	${OBJCOPY} -I elf64-little -O binary $< $@ 
 
-${TARGET}: ${OBJFILES} ${BINFILES}
-	${CXX} -o ${TARGET} ${OBJFILES} ${LDFLAGS}
+${VP8TARGET}: ${VP8_OBJFILES}
+	${CXX} -o ${VP8TARGET} ${VP8_OBJFILES} ${LDFLAGS}
+
+${VPXTARGET}: ${VPX_OBJFILES}
+	${CXX} -o ${VPXTARGET} ${VPX_OBJFILES} ${LDFLAGS}
 
 ${EXAMPLE}: ${EXAMPLEOBJ}
 
-all: ${TARGET} ${EXAMPLE}
+all: ${VP8TARGET} ${VPXTARGET} ${EXAMPLE} ${BINFILES}
 
 .PHONY: clean
 clean:
-	rm -f ${TARGET} ${OBJFILES} ${BINFILES}
+	rm -f ${VP8TARGET} ${VPXTARGET} ${VP8_OBJFILES} ${VPX_OBJFILES} ${BINFILES} ${EXAMPLE} ${EXAMPLEOBJ}
diff --git a/media/video/libvpx/include/vp8_rtcd.h b/media/video/libvpx/include/vp8_rtcd.h
new file mode 100644
index 00000000..d883cdd3
--- /dev/null
+++ b/media/video/libvpx/include/vp8_rtcd.h
@@ -0,0 +1,26 @@
+/*
+ * Prototypes of the VP8 4x4 forward DCT entry points used by the SVP64
+ * test harness.  Both take a 4x4 block of int16_t samples in `input`,
+ * write 16 coefficients to `output`, and share the `pitch` argument that
+ * the C reference consumes as an input row stride of pitch / 2 elements.
+ */
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#include "vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Portable C reference implementation (vp8_dct4x4_ref.c). */
+void vp8_short_fdct4x4_c(int16_t *input, int16_t *output, int32_t pitch);
+
+/* Wrapper that runs the SVP64 kernel in the simulator (vp8_dct4x4_wrappers.c). */
+void vp8_short_fdct4x4_svp64(int16_t *input, int16_t *output, int32_t pitch);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  /* VP8_RTCD_H_ */
diff --git a/media/video/libvpx/vp8_dct4x4_real.c.in b/media/video/libvpx/vp8_dct4x4_real.c.in
new file mode 100644
index 00000000..0c26d91f
--- /dev/null
+++ b/media/video/libvpx/vp8_dct4x4_real.c.in
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+/*
+ * Scalar two-pass 4x4 forward DCT.
+ *
+ * input  - four rows of four shorts; the row pointer advances by
+ *          pitch / 2 shorts after each row.
+ * output - receives 16 shorts: pass 1 writes four values per row and
+ *          pass 2 then rewrites all 16 in place.
+ * pitch  - input row stride, consumed as pitch / 2 short elements.
+ */
+void vp8_short_fdct4x4_svp64(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+
+  /* Pass 1: one input row per iteration; sums/differences are scaled by 8. */
+  for (i = 0; i < 4; ++i) {
+    a1 = ((ip[0] + ip[3]) * 8);
+    b1 = ((ip[1] + ip[2]) * 8);
+    c1 = ((ip[1] - ip[2]) * 8);
+    d1 = ((ip[0] - ip[3]) * 8);
+
+    op[0] = a1 + b1;
+    op[2] = a1 - b1;
+
+    op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
+    op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
+
+    ip += pitch / 2;
+    op += 4;
+  }
+  /* Pass 2: one column of the pass-1 result per iteration (elements 0, 4, 8, 12). */
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; ++i) {
+    a1 = ip[0] + ip[12];
+    b1 = ip[4] + ip[8];
+    c1 = ip[4] - ip[8];
+    d1 = ip[0] - ip[12];
+
+    op[0] = (a1 + b1 + 7) >> 4;
+    op[8] = (a1 - b1 + 7) >> 4;
+
+    /* (d1 != 0) adds 1 to op[4] whenever d1 is non-zero. */
+    op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
+    op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
+
+    ip++;
+    op++;
+  }
+}
diff --git a/media/video/libvpx/vp8_dct4x4_real.s b/media/video/libvpx/vp8_dct4x4_real.s
new file mode 100644
index 00000000..34b59ce3
--- /dev/null
+++ b/media/video/libvpx/vp8_dct4x4_real.s
@@ -0,0 +1,133 @@
+# SVP64 (Libre-SOC) implementation of the VP8 4x4 forward DCT.  The scalar
+# equivalent is vp8_short_fdct4x4_c() in vp8_dct4x4_ref.c.
+#
+# Register use:
+#   r3  (in)      address of the 16 int16 input samples
+#   r4  (out)     address that receives the 16 int16 coefficients
+#   r5  (pitch)   named below but never read by this routine
+#   r10 (pred)    element mask used by the /m=r10 and /dm=r10 predicates
+#   r12 (c_51000) holds 51000 (li + sldi) for the final sv.add
+#   ip, t, t2, t3, op are the bases of the vector register groups
+# c_2217, c_5352, c_7500 and c_12000 are defined but unused; those values
+# appear as immediates in the instructions instead.
+#
+# NOTE(review): "ori pred, 0, mask" ORs the mask into the current value of
+# GPR 0 (ori has no "RA = 0 means zero" form), so the masks presumably rely
+# on r0 being zero on entry - confirm.
+
+.set in, 3
+.set out, 4
+.set pitch, 5
+.set c_2217, 6
+.set c_5352, 7
+.set c_7500, 9
+.set c_12000, 11
+.set c_51000, 12
+.set pred, 10
+.set ip, 16
+.set t, 32
+.set t2, 50 
+.set t3, 70
+.set op, 90
+
+	.machine libresoc
+	.file	"vp8_dct4x4_real.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.globl vp8_short_fdct4x4_svp64_real
+	.type	vp8_short_fdct4x4_svp64_real, @function
+vp8_short_fdct4x4_svp64_real:
+.LFB0:
+	.cfi_startproc
+	li			c_51000, 25500
+	sldi			c_51000, c_51000, 1		# c_51000 = 51000
+	setvl			0,0,16,0,1,1			# Set VL to 16 elements
+	sv.lha	 		*ip, 0(in)			# Load VL=16 halfwords from (in)
+
+	# row-wise DCT: each group of four mask bits / elements is one input row
+	ori			pred, 0, 0b0001000100010001
+	sv.add/dm=r10		*t, *ip, *ip+3			# a1 = ip[0] + ip[3]
+	sv.add/dm=r10		*t+1, *ip+1, *ip+2		# b1 = ip[1] + ip[2]
+	sv.subf/dm=r10		*t+2, *ip+2, *ip+1		# c1 = ip[1] - ip[2]
+	sv.subf/dm=r10		*t+3, *ip+3, *ip		# d1 = ip[0] - ip[3]
+	sv.mulli		*t, *t, 8			# a1 *= 8, b1 *= 8, c1 *= 8, d1 *= 8
+
+	sv.add/dm=r10		*op, *t, *t+1			# op[0] = a1 + b1;
+	sv.subf/dm=r10		*op+2, *t+1, *t			# op[2] = a1 - b1;
+
+	# Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352
+	ori			pred, 0, 0b1100110011001100
+	sv.mulli/m=r10		*t2, *t, 2217			# t2 has c1 * 2217, d1 * 2217
+	sv.mulli/m=r10		*t3, *t, 5352 			# t3 has c1 * 5352, d1 * 5352
+
+	ori			pred, 0, 0b0010001000100010
+	# op[1] = (c1 * 2217 + d1 * 5352 + 14500)
+	sv.add/m=r10		*op, *t2+1, *t3+2		# c1 * 2217 + d1 * 5352
+	sv.addi/m=r10		*op, *op, 14500			# + 14500
+	
+	ori			pred, 0, 0b0100010001000100
+	# op[3] = (d1 * 2217 - c1 * 5352 + 7500)
+	sv.subf/m=r10		*op+1, *t3, *t2+1		# - c1 * 5352 + d1 * 2217
+	sv.addi/m=r10		*op+1, *op+1, 7500		# + 7500
+
+	# rldicl with rotate 52 / mask 12 is a logical right shift by 12 (srdi).
+	# NOTE(review): the C reference uses an arithmetic >>; confirm equivalence.
+	ori			pred, 0, 0b1010101010101010
+	sv.rldicl/m=r10		*op, *op, 52, 12		# op[1] >>= 12, op[3] >>= 12
+
+	# column-wise DCT
+	ori			pred, 0, 0b0000000000001111
+	sv.add/m=r10		*t, *op, *op+12			# a1 = ip[0] + ip[12]
+	sv.add/m=r10		*t+4, *op+4, *op+8		# b1 = ip[4] + ip[8]
+	sv.subf/m=r10		*t+8, *op+8, *op+4		# c1 = ip[4] - ip[8]
+	sv.subf/m=r10		*t+12, *op+12, *op		# d1 = ip[0] - ip[12]
+
+	# op[0] = (a1 + b1 + 7) >> 4
+	sv.add/m=r10		*op, *t, *t+4			# op[0] = a1 + b1
+	sv.addi/m=r10		*op, *op, 7			# op[0] += 7
+
+	# op[8] = (a1 - b1 + 7) >> 4
+	sv.subf/m=r10		*op+8, *t+4, *t			# op[8] = a1 - b1
+	sv.addi/m=r10		*op+8, *op+8, 7			# op[8] += 7
+
+	ori			pred, 0, 0b0000111100001111
+	sv.rldicl/m=r10		*op, *op, 60, 4			# op[0] >>= 4, op[8] >>= 4
+
+	# Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352
+	ori			pred, 0, 0b1111111100000000
+	sv.mulli/m=r10		*t2, *t, 2217			# t2 has c1 * 2217, d1 * 2217
+	sv.mulli/m=r10		*t3, *t, 5352 			# t3 has c1 * 5352, d1 * 5352
+
+	# op[4] = ((c1 * 2217 + d1 * 5352 + 12000)
+	ori			pred, 0, 0b0000000011110000
+	sv.add/m=r10		*op, *t2+4, *t3+8		# c1 * 2217 + d1 * 5352
+	sv.addi/m=r10		*op, *op, 12000			# + 12000
+	
+	# op[12] = (d1 * 2217 - c1 * 5352 + 51000)
+	ori			pred, 0, 0b1111000000000000
+	sv.subf/m=r10		*op, *t3-4, *t2			# - c1 * 5352 + d1 * 2217
+	sv.add/m=r10		*op, *op, c_51000		# + 51000
+
+	ori			pred, 0, 0b1111000011110000
+	sv.rldicl/m=r10		*op, *op, 48, 16		# op[4] >>= 16, op[12] >>= 16
+
+	# op[4] += (d1 != 0)
+	#ori			pred, 0, 0b0000000011110000
+	setvl			0,0,4,0,1,1			# Set VL to 4 elements
+	# NOTE(review): the C reference tests d1 != 0; confirm that the operands
+	# below compare against 0 and not against the trailing 1.
+	sv.cmpi			*cr0, 0, *t+12, 1
+	sv.addi/m=ne		*op+4, *op+4, 1
+
+	# store to buffer
+	setvl			0,0,16,0,1,1			# Set VL to 16 elements
+	sv.sth			*op, 0(out)
+	blr
+	.long 0
+	.byte 0,0,0,0,128,1,0,1
+	.cfi_endproc
+.LFE0:
+	.size	vp8_short_fdct4x4_svp64_real,.-vp8_short_fdct4x4_svp64_real
+	.ident	"GCC: (Debian 8.3.0-6) 8.3.0"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/media/video/libvpx/vp8_dct4x4_ref.c b/media/video/libvpx/vp8_dct4x4_ref.c
new file mode 100644
index 00000000..0cb2b0a5
--- /dev/null
+++ b/media/video/libvpx/vp8_dct4x4_ref.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "vp8_rtcd.h"
+
+/*
+ * Reference C implementation of the VP8 4x4 forward DCT.
+ *
+ * input:  4x4 block of samples; row r starts at input + r * (pitch / 2).
+ * output: receives the 16 coefficients in row-major order.
+ * pitch:  input row stride, halved to step an int16_t pointer.
+ *
+ * Every intermediate value is also printed to stdout as a debugging trace.
+ */
+void vp8_short_fdct4x4_c(int16_t *input, int16_t *output, int32_t pitch) {
+  const int row_step = pitch / 2;
+  int row, col;
+
+  /* Horizontal pass: each input row produces one output row. */
+  for (row = 0; row < 4; ++row) {
+    const int16_t *src = input + row * row_step;
+    int16_t *dst = output + row * 4;
+    const int a1 = (src[0] + src[3]) * 8;
+    const int b1 = (src[1] + src[2]) * 8;
+    const int c1 = (src[1] - src[2]) * 8;
+    const int d1 = (src[0] - src[3]) * 8;
+    printf("a1 = %08x\tb1 = %08x\tc1 = %08x\td1 = %08x\n", a1, b1, c1, d1);
+
+    dst[0] = a1 + b1;
+    dst[2] = a1 - b1;
+    printf("op[0] = %04x\top[2] = %04x\n", (uint16_t)dst[0], (uint16_t)dst[2]);
+
+    dst[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
+    dst[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
+    printf("op[1] = %04x\top[3] = %04x\n", (uint16_t)dst[1], (uint16_t)dst[3]);
+  }
+
+  /* Vertical pass: transform each column of the intermediate result in place. */
+  for (col = 0; col < 4; ++col) {
+    int16_t *column = output + col;
+    const int a1 = column[0] + column[12];
+    const int b1 = column[4] + column[8];
+    const int c1 = column[4] - column[8];
+    const int d1 = column[0] - column[12];
+    printf("a1 = %08x\tb1 = %08x\tc1 = %08x\td1 = %08x\n", a1, b1, c1, d1);
+
+    column[0] = (a1 + b1 + 7) >> 4;
+    column[8] = (a1 - b1 + 7) >> 4;
+    printf("op[%d] = %08x\top[%d] = %08x\n", col, column[0], col + 8, column[8]);
+
+    column[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
+    column[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
+    printf("op[%d] = %04x\top[%d] = %04x\n", col + 4, (uint16_t)column[4], col + 12, (uint16_t)column[12]);
+  }
+}
diff --git a/media/video/libvpx/vp8_dct4x4_wrappers.c b/media/video/libvpx/vp8_dct4x4_wrappers.c
new file mode 100644
index 00000000..da92cf3c
--- /dev/null
+++ b/media/video/libvpx/vp8_dct4x4_wrappers.c
@@ -0,0 +1,120 @@
+#include <Python.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "pypowersim_wrapper_common.h"
+#include "vp8_dct4x4_wrappers.h"
+#include "vp8_rtcd.h"
+
+// Print the pending Python error (if any) followed by `message`, release the
+// simulator state and terminate the process.
+static void vp8_dct4x4_sim_fatal(pypowersim_state_t *state, const char *message) {
+    if (PyErr_Occurred()) {
+        PyErr_Print();
+    }
+    printf("%s\n", message);
+    pypowersim_finalize(state);
+    exit(1);
+}
+
+// Run the SVP64 4x4 forward DCT (vp8_short_fdct4x4_svp64_real) in the simulator.
+//
+// input:  4x4 block of samples; row r starts at input + r * (pitch / 2), the
+//         same addressing as vp8_short_fdct4x4_c.
+// output: receives the 16 coefficients read back from simulator memory.
+// pitch:  input row stride as understood by vp8_short_fdct4x4_c.
+//
+// The C reference is run on the same input and both results are printed.
+// Any simulator failure prints a message and exits the process.
+void vp8_short_fdct4x4_svp64(int16_t *input, int16_t *output, int32_t pitch) {
+    printf("pitch: %d\n", pitch);
+    int16_t output2[16];
+    vp8_short_fdct4x4_c(input, output2, pitch);
+
+    // The simulator is a separate CPU/RAM, so the block is copied to (and the
+    // result read from) addresses inside it; both were chosen arbitrarily.
+    const uint64_t input_svp64  = 0x100000;
+    const uint64_t output_svp64 = 0x200000;
+
+    // Create the pypowersim_state
+    pypowersim_state_t *state = pypowersim_prepare();
+
+    // Mandatory: the machine code to execute.
+    // NOTE(review): 1000 is a fixed byte count rather than the size of the
+    // routine, so whatever follows it in the host image is copied too.
+    state->binary = PyBytes_FromStringAndSize((const char *)&vp8_short_fdct4x4_svp64_real, 1000);
+
+    // GPR #3 = input pointer, GPR #4 = output pointer.
+    // PyList_SetItem takes over the reference to the new integer objects.
+    PyList_SetItem(state->initial_regs, 3, PyLong_FromUnsignedLongLong(input_svp64));
+    PyList_SetItem(state->initial_regs, 4, PyLong_FromUnsignedLongLong(output_svp64));
+
+    // Pack each row into one 64-bit word (element 0 in the low 16 bits) at
+    // consecutive simulator addresses.  Rows are fetched with the caller's
+    // pitch so the copy matches what the C reference reads.
+    const int row_stride = pitch / 2;
+    for (int row = 0; row < 4; ++row) {
+        const int16_t *src = input + row * row_stride;
+        uint64_t val = 0;
+        for (int col = 0; col < 4; ++col) {
+            val |= ((uint64_t)src[col] & 0xffff) << (16 * col);
+        }
+        PyObject *svp64_address = PyLong_FromUnsignedLongLong(input_svp64 + row * 8);
+        PyObject *word = PyLong_FromUnsignedLongLong(val);
+        if (!svp64_address || !word ||
+            PyDict_SetItem(state->initial_mem, svp64_address, word) != 0) {
+            vp8_dct4x4_sim_fatal(state, "Error loading input into simulator memory");
+        }
+        // PyDict_SetItem does not take over references, so drop ours.
+        Py_DECREF(svp64_address);
+        Py_DECREF(word);
+    }
+
+    // Prepare the arguments object for the call
+    pypowersim_prepareargs(state);
+
+    // Call the function and get the resulting object
+    state->result_obj = PyObject_CallObject(state->simulator, state->args);
+    if (!state->result_obj) {
+        vp8_dct4x4_sim_fatal(state, "Error invoking 'run_a_simulation'");
+    }
+
+    PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem");
+    if (!memobj) {
+        vp8_dct4x4_sim_fatal(state, "Error getting mem object");
+    }
+
+    PyObject *mem = PyObject_GetAttrString(memobj, "mem");
+    if (!mem) {
+        vp8_dct4x4_sim_fatal(state, "Error getting mem dict");
+    }
+
+    // Read the result back one 64-bit word (four coefficients) at a time; the
+    // result dictionary is indexed by address / 8.
+    for (int row = 0; row < 4; ++row) {
+        PyObject *svp64_address = PyLong_FromUnsignedLongLong((output_svp64 + row * 8) / 8);
+        // PyDict_GetItem returns a borrowed reference, or NULL if the key is missing.
+        PyObject *pyval = svp64_address ? PyDict_GetItem(mem, svp64_address) : NULL;
+        Py_XDECREF(svp64_address);
+        if (!pyval) {
+            vp8_dct4x4_sim_fatal(state, "Error reading output from simulator memory");
+        }
+        const uint64_t val = PyLong_AsUnsignedLongLong(pyval);
+        if (val == (uint64_t)-1 && PyErr_Occurred()) {
+            vp8_dct4x4_sim_fatal(state, "Error converting output word");
+        }
+        for (int col = 0; col < 4; ++col) {
+            output[row * 4 + col] = (uint16_t)(val >> (16 * col));
+        }
+    }
+    Py_DECREF(mem);
+    Py_DECREF(memobj);
+    // NOTE(review): `state` is not released on this path (as in the original);
+    // confirm whether pypowersim_finalize() may be called after every run.
+
+    for (int i=0; i < 16; i += 4) {
+      printf("output[%d] : %04x %04x %04x %04x\n", i, (uint16_t)output[i],  (uint16_t)output[i+1],  (uint16_t)output[i+2],  (uint16_t)output[i+3]);
+      printf("output2[%d]: %04x %04x %04x %04x\n", i, (uint16_t)output2[i], (uint16_t)output2[i+1], (uint16_t)output2[i+2], (uint16_t)output2[i+3]);
+    }
+}
diff --git a/media/video/libvpx/vp8_dct4x4_wrappers.h b/media/video/libvpx/vp8_dct4x4_wrappers.h
new file mode 100644
index 00000000..cef130ca
--- /dev/null
+++ b/media/video/libvpx/vp8_dct4x4_wrappers.h
@@ -0,0 +1,21 @@
+#ifndef VP8_DCT4X4_WRAPPERS_H_
+#define VP8_DCT4X4_WRAPPERS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Entry label of the hand-written SVP64 kernel in vp8_dct4x4_real.s.
+ * vp8_dct4x4_wrappers.c only takes its address to copy the machine code
+ * into the simulator; it does not call it directly.
+ */
+void vp8_short_fdct4x4_svp64_real(int16_t *input, int16_t *output, int32_t pitch);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* VP8_DCT4X4_WRAPPERS_H_ */
diff --git a/media/video/libvpx/vp8_fdct4x4_test.cc b/media/video/libvpx/vp8_fdct4x4_test.cc
new file mode 100644
index 00000000..7c3f7336
--- /dev/null
+++ b/media/video/libvpx/vp8_fdct4x4_test.cc
@@ -0,0 +1,210 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <gtest/gtest.h>
+
+#include "vpx_misc.h"
+#include "vp8_rtcd.h"
+#include "acm_random.h"
+#include "clear_system_state.h"
+#include "register_state_check.h"
+#include "vpx_integer.h"
+#include "vpx_mem.h"
+#include "mem.h"
+#include "vpx_timer.h"
+
+namespace {
+
+// Signature shared by the forward DCT implementations under test.
+typedef void (*FdctFunc)(int16_t *a, int16_t *b, int a_stride);
+
+// Fixed-point multipliers used by reference_idct4x4() (applied with >> 16).
+const int cospi8sqrt2minus1 = 20091;
+const int sinpi8sqrt2 = 35468;
+
+// Inverse 4x4 transform used to undo the forward DCT in the round-trip test.
+// The first loop combines elements 0, 4, 8 and 12 of each column; the second
+// works on each row of that result and rounds with (x + 4) >> 3.
+void reference_idct4x4(const int16_t *input, int16_t *output) {
+  const int16_t *ip = input;
+  int16_t *op = output;
+
+  for (int i = 0; i < 4; ++i) {
+    const int a1 = ip[0] + ip[8];
+    const int b1 = ip[0] - ip[8];
+    const int temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[12] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = a1 + d1;
+    op[12] = a1 - d1;
+    op[4] = b1 + c1;
+    op[8] = b1 - c1;
+    ++ip;
+    ++op;
+  }
+  ip = output;
+  op = output;
+  for (int i = 0; i < 4; ++i) {
+    const int a1 = ip[0] + ip[2];
+    const int b1 = ip[0] - ip[2];
+    const int temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[3] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+    ip += 4;
+    op += 4;
+  }
+}
+
+using libvpx_test::ACMRandom;
+
+// Fixture parameterised on the forward DCT under test; SetUp() stores the
+// function pointer and resets the random generator to its deterministic seed.
+class FdctTest : public ::testing::TestWithParam<FdctFunc> {
+ public:
+  virtual void SetUp() {
+    fdct_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  FdctFunc fdct_func_;
+  ACMRandom rnd_;
+};
+
+// Counts, per coefficient, how often the output is negative vs. positive over
+// count_test_block random blocks, for two input ranges.
+// NOTE(review): with count_test_block = 5 each difference is at most 5, so the
+// thresholds 10000 and 100000 below can never be reached and neither
+// expectation can fail as written.
+TEST_P(FdctTest, SignBiasCheck) {
+  int16_t test_input_block[16];
+  DECLARE_ALIGNED(16, int16_t, test_output_block[16]);
+  const int pitch = 8;
+  int count_sign_block[16][2];
+  const int count_test_block = 5;
+
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j) {
+      test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
+    }
+
+    fdct_func_(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0) {
+        ++count_sign_block[j][0];
+      } else if (test_output_block[j] > 0) {
+        ++count_sign_block[j][1];
+      }
+    }
+  }
+
+  bool bias_acceptable = true;
+  for (int j = 0; j < 16; ++j) {
+    bias_acceptable =
+        bias_acceptable &&
+        (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000);
+  }
+
+  EXPECT_EQ(true, bias_acceptable)
+      << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]";
+
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-15, 15].
+    for (int j = 0; j < 16; ++j) {
+      test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4);
+    }
+
+    fdct_func_(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0) {
+        ++count_sign_block[j][0];
+      } else if (test_output_block[j] > 0) {
+        ++count_sign_block[j][1];
+      }
+    }
+  }
+
+  bias_acceptable = true;
+  for (int j = 0; j < 16; ++j) {
+    bias_acceptable =
+        bias_acceptable &&
+        (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000);
+  }
+
+  EXPECT_EQ(true, bias_acceptable)
+      << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
+}
+
+// Runs the forward DCT followed by reference_idct4x4() on random blocks and
+// checks the squared per-sample error (max 1) and the summed error (at most
+// count_test_block).
+TEST_P(FdctTest, RoundTripErrorCheck) {
+  int max_error = 0;
+  double total_error = 0;
+  const int count_test_block = 5;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[16];
+    int16_t test_output_block[16];
+    DECLARE_ALIGNED(16, int16_t, test_temp_block[16]);
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j) {
+      test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
+    }
+
+    const int pitch = 8;
+    fdct_func_(test_input_block, test_temp_block, pitch);
+    reference_idct4x4(test_temp_block, test_output_block);
+
+    for (int j = 0; j < 16; ++j) {
+      const int diff = test_input_block[j] - test_output_block[j];
+      const int error = diff * diff;
+      if (max_error < error) max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1, max_error)
+      << "Error: FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
+}
+
+// Run both tests against the C reference and the SVP64 wrapper.
+INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c));
+
+INSTANTIATE_TEST_SUITE_P(SVP64, FdctTest, ::testing::Values(vp8_short_fdct4x4_svp64));
+
+}  // namespace
-- 
2.30.2