-TARGET=libvpx_variance_test
+VPXTARGET=libvpx_variance_test
+VP8TARGET=vp8_dct_test
EXAMPLE=pypowersim_wrapper_example
CC=gcc
OBJCOPY=powerpc64le-linux-gnu-objcopy
CFLAGS= -Iinclude -O -g3 -I/usr/include/python3.7m
CXXFLAGS= -Iinclude -O -g3
-ASFLAGS= -mlibresoc
+ASFLAGS= -mlibresoc -mregnames
LDFLAGS=-lgtest -pthread -lpython3.7m
-BINFILES = vpx_get_mb_ss_svp64_real.bin vpx_get4x4sse_cs_svp64_real.bin
-ASFILES = vpx_get_mb_ss_svp64_real.s vpx_get4x4sse_cs_svp64_real.s variance_svp64_real.s
-CFILES = variance_ref.c variancefuncs_svp64.c variance_svp64_wrappers.c vpx_mem.c
-CPPFILES = test_libvpx.cc variance_test.cc
-EXAMPLEC = pypowersim_wrapper_example.c
-EXAMPLEOBJ= ${EXAMPLEC:.c=.o}
-OBJFILES = $(CFILES:.c=.o) $(CPPFILES:.cc=.o) $(ASFILES:.s=.o)
+BINFILES = vpx_get_mb_ss_svp64_real.bin vpx_get4x4sse_cs_svp64_real.bin variance_svp64_real.bin vp8_dct4x4_real.bin
+VP8_ASFILES = vp8_dct4x4_real.s
+VPX_ASFILES = vpx_get_mb_ss_svp64_real.s vpx_get4x4sse_cs_svp64_real.s variance_svp64_real.s
+VP8_CFILES = vp8_dct4x4_ref.c vp8_dct4x4_wrappers.c
+VPX_CFILES = variance_ref.c variancefuncs_svp64.c variance_svp64_wrappers.c vpx_mem.c
+VP8_CPPFILES = test_libvpx.cc vp8_fdct4x4_test.cc
+VPX_CPPFILES = test_libvpx.cc variance_test.cc
+EXAMPLEC = pypowersim_wrapper_example.c
+EXAMPLEOBJ = ${EXAMPLEC:.c=.o}
+VP8_OBJFILES = $(VP8_ASFILES:.s=.o) $(VP8_CFILES:.c=.o) $(VP8_CPPFILES:.cc=.o)
+VPX_OBJFILES = $(VPX_ASFILES:.s=.o) $(VPX_CFILES:.c=.o) $(VPX_CPPFILES:.cc=.o)
%.bin: %.o
${OBJCOPY} -I elf64-little -O binary $< $@
-${TARGET}: ${OBJFILES} ${BINFILES}
- ${CXX} -o ${TARGET} ${OBJFILES} ${LDFLAGS}
+${VP8TARGET}: ${VP8_OBJFILES}
+ ${CXX} -o ${VP8TARGET} ${VP8_OBJFILES} ${LDFLAGS}
+
+${VPXTARGET}: ${VPX_OBJFILES}
+ ${CXX} -o ${VPXTARGET} ${VPX_OBJFILES} ${LDFLAGS}
${EXAMPLE}: ${EXAMPLEOBJ}
-all: ${TARGET} ${EXAMPLE}
+all: ${VP8TARGET} ${VPXTARGET} ${EXAMPLE} ${BINFILES}
.PHONY: clean
clean:
- rm -f ${TARGET} ${OBJFILES} ${BINFILES}
+ rm -f ${VP8TARGET} ${VPXTARGET} ${VP8_OBJFILES} ${VPX_OBJFILES} ${BINFILES} ${EXAMPLE} ${EXAMPLEOBJ}
--- /dev/null
+#include "vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_short_fdct4x4_c(int16_t *input, int16_t *output, int32_t pitch);
+void vp8_short_fdct4x4_svp64(int16_t *input, int16_t *output, int32_t pitch);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+void vp8_short_fdct4x4_svp64(short *input, short *output, int pitch) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; ++i) {
+ a1 = ((ip[0] + ip[3]) * 8);
+ b1 = ((ip[1] + ip[2]) * 8);
+ c1 = ((ip[1] - ip[2]) * 8);
+ d1 = ((ip[0] - ip[3]) * 8);
+
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
+
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
+
+ ip += pitch / 2;
+ op += 4;
+ }
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; ++i) {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = (a1 + b1 + 7) >> 4;
+ op[8] = (a1 - b1 + 7) >> 4;
+
+ op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
+
+ ip++;
+ op++;
+ }
+}
--- /dev/null
+.set in, 3
+.set out, 4
+.set pitch, 5
+.set c_2217, 6
+.set c_5352, 7
+.set c_7500, 9
+.set c_12000, 11
+.set c_51000, 12
+.set pred, 10
+.set ip, 16
+.set t, 32
+.set t2, 50
+.set t3, 70
+.set op, 90
+
+ .machine libresoc
+ .file "vp8_dct4x4_real.c"
+ .abiversion 2
+ .section ".text"
+ .align 2
+ .globl vp8_short_fdct4x4_svp64_real
+ .type vp8_short_fdct4x4_svp64_real, @function
+vp8_short_fdct4x4_svp64_real:
+.LFB0:
+ .cfi_startproc
+ li c_51000, 25500
+ sldi c_51000, c_51000, 1 # c_51000 = 51000
+ setvl 0,0,16,0,1,1 # Set VL to 16 elements
+ sv.lha *ip, 0(in) # Load 4 ints from (in)
+
+ ori pred, 0, 0b0001000100010001
+ sv.add/dm=r10 *t, *ip, *ip+3 # a1 = ip[0] + ip[3]
+ sv.add/dm=r10 *t+1, *ip+1, *ip+2 # b1 = ip[1] + ip[2]
+ sv.subf/dm=r10 *t+2, *ip+2, *ip+1 # c1 = ip[1] - ip[2]
+ sv.subf/dm=r10 *t+3, *ip+3, *ip # d1 = ip[0] - ip[3]
+ sv.mulli *t, *t, 8 # a1 *= 8, b1 *= 8, c1 *= 8, d1 *= 8
+
+ sv.add/dm=r10 *op, *t, *t+1 # op[0] = a1 + b1;
+ sv.subf/dm=r10 *op+2, *t+1, *t # op[2] = a1 - b1;
+
+ # Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352
+ ori pred, 0, 0b1100110011001100
+ sv.mulli/m=r10 *t2, *t, 2217 # t2 has c1 * 2217, d1 * 2217
+ sv.mulli/m=r10 *t3, *t, 5352 # t3 has c1 * 5352, d1 * 5352
+
+ ori pred, 0, 0b0010001000100010
+ # op[1] = (c1 * 2217 + d1 * 5352 + 14500)
+ sv.add/m=r10 *op, *t2+1, *t3+2 # c1 * 2217 + d1 * 5352
+ sv.addi/m=r10 *op, *op, 14500 # + 14500
+
+ ori pred, 0, 0b0100010001000100
+ # op[3] = (d1 * 2217 - c1 * 5352 + 7500)
+ sv.subf/m=r10 *op+1, *t3, *t2+1 # - c1 * 5352 + d1 * 2127
+ sv.addi/m=r10 *op+1, *op+1, 7500 # + 7500
+
+ ori pred, 0, 0b1010101010101010
+ sv.rldicl/m=r10 *op, *op, 52, 12 # op[1] >>= 12, op[3] >>= 12
+
+ # column-wise DCT
+ ori pred, 0, 0b0000000000001111
+ sv.add/m=r10 *t, *op, *op+12 # a1 = ip[0] + ip[12]
+ sv.add/m=r10 *t+4, *op+4, *op+8 # b1 = ip[4] + ip[8]
+ sv.subf/m=r10 *t+8, *op+8, *op+4 # c1 = ip[4] - ip[8]
+ sv.subf/m=r10 *t+12, *op+12, *op # d1 = ip[0] - ip[12]
+
+ # op[0] = (a1 + b1 + 7) >> 4
+ sv.add/m=r10 *op, *t, *t+4 # op[0] = a1 + b1
+ sv.addi/m=r10 *op, *op, 7 # op[0] += 7
+
+ # op[8] = (a1 - b1 + 7) >> 4
+ sv.subf/m=r10 *op+8, *t+4, *t # op[8] = a1 - b1
+ sv.addi/m=r10 *op+8, *op+8, 7 # op[8] += 7
+
+ ori pred, 0, 0b0000111100001111
+ sv.rldicl/m=r10 *op, *op, 60, 4 # op[0] >>= 4, op[8] >>= 4
+
+ # Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352
+ ori pred, 0, 0b1111111100000000
+ sv.mulli/m=r10 *t2, *t, 2217 # t2 has c1 * 2217, d1 * 2217
+ sv.mulli/m=r10 *t3, *t, 5352 # t3 has c1 * 5352, d1 * 5352
+
+ # op[4] = ((c1 * 2217 + d1 * 5352 + 12000)
+ ori pred, 0, 0b0000000011110000
+ sv.add/m=r10 *op, *t2+4, *t3+8 # c1 * 2217 + d1 * 5352
+ sv.addi/m=r10 *op, *op, 12000 # + 12000
+
+ # op[12] = (d1 * 2217 - c1 * 5352 + 51000)
+ ori pred, 0, 0b1111000000000000
+ sv.subf/m=r10 *op, *t3-4, *t2 # - c1 * 5352 + d1 * 2127
+ sv.add/m=r10 *op, *op, c_51000 # + 51000
+
+ ori pred, 0, 0b1111000011110000
+ sv.rldicl/m=r10 *op, *op, 48, 16 # op[4] >>= 16, op[12] >= 16
+
+ # op[4] += (d1 != 0)
+ #ori pred, 0, 0b0000000011110000
+ setvl 0,0,4,0,1,1 # Set VL to 16 elements
+ sv.cmpi *cr0, 0, *t+12, 1
+ sv.addi/m=ne *op+4, *op+4, 1
+
+ # store to buffer
+ setvl 0,0,16,0,1,1 # Set VL to 16 elements
+ sv.sth *op, 0(out)
+ blr
+ .long 0
+ .byte 0,0,0,0,128,1,0,1
+ .cfi_endproc
+.LFE0:
+ .size vp8_short_fdct4x4_svp64_real,.-vp8_short_fdct4x4_svp64_real
+ .ident "GCC: (Debian 8.3.0-6) 8.3.0"
+ .section .note.GNU-stack,"",@progbits
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "vp8_rtcd.h"
+
+void vp8_short_fdct4x4_c(int16_t *input, int16_t *output, int32_t pitch) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; ++i) {
+ a1 = ((ip[0] + ip[3]));
+ b1 = ((ip[1] + ip[2]));
+ c1 = ((ip[1] - ip[2]));
+ d1 = ((ip[0] - ip[3]));
+
+ a1 *= 8;
+ b1 *= 8;
+ c1 *= 8;
+ d1 *= 8;
+ printf("a1 = %08x\tb1 = %08x\tc1 = %08x\td1 = %08x\n", a1, b1, c1, d1);
+
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
+ printf("op[0] = %04x\top[2] = %04x\n", (uint16_t)op[0], (uint16_t)op[2]);
+
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
+ printf("op[1] = %04x\top[3] = %04x\n", (uint16_t)op[1], (uint16_t)op[3]);
+
+ ip += pitch / 2;
+ op += 4;
+ }
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; ++i) {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+ printf("a1 = %08x\tb1 = %08x\tc1 = %08x\td1 = %08x\n", a1, b1, c1, d1);
+
+ op[0] = (a1 + b1 + 7) >> 4;
+ op[8] = (a1 - b1 + 7) >> 4;
+ printf("op[%d] = %08x\top[%d] = %08x\n", i, op[0], i+8, op[8]);
+
+ op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
+ printf("op[%d] = %04x\top[%d] = %04x\n", i+4, (uint16_t)op[4], i+12, (uint16_t)op[12]);
+
+ ip++;
+ op++;
+ }
+}
--- /dev/null
+#include <Python.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "pypowersim_wrapper_common.h"
+#include "vp8_dct4x4_wrappers.h"
+#include "vp8_rtcd.h"
+
+void vp8_short_fdct4x4_svp64(int16_t *input, int16_t *output, int32_t pitch) {
+
+ printf("pitch: %d\n", pitch);
+ int16_t output2[16];
+ vp8_short_fdct4x4_c(input, output2, pitch);
+
+
+ // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM
+ // we have to memcpy from input to this pointer, the address was chosen arbitrarily
+ uint64_t input_svp64 = 0x100000;
+ uint64_t output_svp64 = 0x200000;
+
+ // Create the pypowersim_state
+ pypowersim_state_t *state = pypowersim_prepare();
+
+ // Change the relevant elements, mandatory: body
+ state->binary = PyBytes_FromStringAndSize((const char *)&vp8_short_fdct4x4_svp64_real, 1000);
+ // Set GPR #3 to the input pointer
+ PyObject *address = PyLong_FromUnsignedLongLong(input_svp64);
+ PyList_SetItem(state->initial_regs, 3, address);
+ // Load data into buffer from real memory
+ for (int i=0; i < 16; i += 4) {
+ PyObject *svp64_address = PyLong_FromUnsignedLongLong(input_svp64 + i*2);
+ uint64_t val = (uint64_t)(input[0]) & 0xffff;
+ val |= ((uint64_t)(input[1]) & 0xffff) << 16;
+ val |= ((uint64_t)(input[2]) & 0xffff) << 32;
+ val |= ((uint64_t)(input[3]) & 0xffff) << 48;
+ //printf("src: %p -> %04x %04x %04x %04x\t val: %016lx -> %p\n", input, (uint16_t)input[0], (uint16_t)input[1], (uint16_t)input[2], (uint16_t)input[3], val, input_svp64);
+ PyObject *word = PyLong_FromUnsignedLongLong(val);
+ PyDict_SetItem(state->initial_mem, svp64_address, word);
+ input += 4;
+ }
+ // Set GPR #4 to the output pointer
+ PyObject *out_address = PyLong_FromUnsignedLongLong(output_svp64);
+ PyList_SetItem(state->initial_regs, 4, out_address);
+
+ // Prepare the arguments object for the call
+ pypowersim_prepareargs(state);
+
+ // Call the function and get the resulting object
+ state->result_obj = PyObject_CallObject(state->simulator, state->args);
+ if (!state->result_obj) {
+ PyErr_Print();
+ printf("Error invoking 'run_a_simulation'\n");
+ pypowersim_finalize(state);
+ exit(1);
+ }
+
+ PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem");
+ if (!memobj) {
+ PyErr_Print();
+ Py_DECREF(state->result_obj);
+ printf("Error getting mem object\n");
+ }
+
+ PyObject *mem = PyObject_GetAttrString(memobj, "mem");
+ if (!mem) {
+ PyErr_Print();
+ Py_DECREF(state->result_obj);
+ printf("Error getting mem dict\n");
+ }
+ for (int i=0; i < 16; i += 4) {
+ PyObject *svp64_address = PyLong_FromUnsignedLongLong((output_svp64 + i*2)/8);
+ PyObject *pyval = PyDict_GetItem(mem, svp64_address);
+ uint64_t val = PyLong_AsUnsignedLongLong(pyval);
+ output[i + 0] = (uint16_t) val;
+ output[i + 1] = (uint16_t) (val >> 16);
+ output[i + 2] = (uint16_t) (val >> 32);
+ output[i + 3] = (uint16_t) (val >> 48);
+ //printf("output: %p -> %04x %04x %04x %04x\t val: %016lx -> %p\n", output, (uint16_t)output[i], (uint16_t)output[i + 1], (uint16_t)output[i + 2], (uint16_t)output[i + 3], val, output_svp64);
+ }
+
+ for (int i=0; i < 16; i += 4) {
+ printf("output[%d] : %04x %04x %04x %04x\n", i, (uint16_t)output[i], (uint16_t)output[i+1], (uint16_t)output[i+2], (uint16_t)output[i+3]);
+ printf("output2[%d]: %04x %04x %04x %04x\n", i, (uint16_t)output2[i], (uint16_t)output2[i+1], (uint16_t)output2[i+2], (uint16_t)output2[i+3]);
+ }
+}
--- /dev/null
+#include <stdint.h>
+
+void vp8_short_fdct4x4_svp64_real(int16_t *input, int16_t *output, int32_t pitch);
--- /dev/null
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <gtest/gtest.h>
+
+#include "vpx_misc.h"
+#include "vp8_rtcd.h"
+#include "acm_random.h"
+#include "clear_system_state.h"
+#include "register_state_check.h"
+#include "vpx_integer.h"
+#include "vpx_mem.h"
+#include "mem.h"
+#include "vpx_timer.h"
+
+namespace {
+
+typedef void (*FdctFunc)(int16_t *a, int16_t *b, int a_stride);
+
+const int cospi8sqrt2minus1 = 20091;
+const int sinpi8sqrt2 = 35468;
+
+void reference_idct4x4(const int16_t *input, int16_t *output) {
+ const int16_t *ip = input;
+ int16_t *op = output;
+
+ for (int i = 0; i < 4; ++i) {
+ const int a1 = ip[0] + ip[8];
+ const int b1 = ip[0] - ip[8];
+ const int temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+ const int c1 = temp1 - temp2;
+ const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+ const int temp4 = (ip[12] * sinpi8sqrt2) >> 16;
+ const int d1 = temp3 + temp4;
+ op[0] = a1 + d1;
+ op[12] = a1 - d1;
+ op[4] = b1 + c1;
+ op[8] = b1 - c1;
+ ++ip;
+ ++op;
+ }
+ ip = output;
+ op = output;
+ for (int i = 0; i < 4; ++i) {
+ const int a1 = ip[0] + ip[2];
+ const int b1 = ip[0] - ip[2];
+ const int temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+ const int c1 = temp1 - temp2;
+ const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ const int temp4 = (ip[3] * sinpi8sqrt2) >> 16;
+ const int d1 = temp3 + temp4;
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+ ip += 4;
+ op += 4;
+ }
+}
+
+using libvpx_test::ACMRandom;
+
+class FdctTest : public ::testing::TestWithParam<FdctFunc> {
+ public:
+ virtual void SetUp() {
+ fdct_func_ = GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ protected:
+ FdctFunc fdct_func_;
+ ACMRandom rnd_;
+};
+
+TEST_P(FdctTest, SignBiasCheck) {
+ int16_t test_input_block[16];
+ DECLARE_ALIGNED(16, int16_t, test_output_block[16]);
+ const int pitch = 8;
+ int count_sign_block[16][2];
+ const int count_test_block = 5;
+
+ memset(count_sign_block, 0, sizeof(count_sign_block));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < 16; ++j) {
+ test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
+ }
+
+ fdct_func_(test_input_block, test_output_block, pitch);
+
+ for (int j = 0; j < 16; ++j) {
+ if (test_output_block[j] < 0) {
+ ++count_sign_block[j][0];
+ } else if (test_output_block[j] > 0) {
+ ++count_sign_block[j][1];
+ }
+ }
+ }
+
+ bool bias_acceptable = true;
+ for (int j = 0; j < 16; ++j) {
+ bias_acceptable =
+ bias_acceptable &&
+ (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000);
+ }
+
+ EXPECT_EQ(true, bias_acceptable)
+ << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]";
+
+ memset(count_sign_block, 0, sizeof(count_sign_block));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-15, 15].
+ for (int j = 0; j < 16; ++j) {
+ test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4);
+ }
+
+ fdct_func_(test_input_block, test_output_block, pitch);
+
+ for (int j = 0; j < 16; ++j) {
+ if (test_output_block[j] < 0) {
+ ++count_sign_block[j][0];
+ } else if (test_output_block[j] > 0) {
+ ++count_sign_block[j][1];
+ }
+ }
+ }
+
+ bias_acceptable = true;
+ for (int j = 0; j < 16; ++j) {
+ bias_acceptable =
+ bias_acceptable &&
+ (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000);
+ }
+
+ EXPECT_EQ(true, bias_acceptable)
+ << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
+}
+
+TEST_P(FdctTest, RoundTripErrorCheck) {
+ int max_error = 0;
+ double total_error = 0;
+ const int count_test_block = 5;
+ for (int i = 0; i < count_test_block; ++i) {
+ int16_t test_input_block[16];
+ int16_t test_output_block[16];
+ DECLARE_ALIGNED(16, int16_t, test_temp_block[16]);
+
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < 16; ++j) {
+ test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
+ }
+
+ const int pitch = 8;
+ fdct_func_(test_input_block, test_temp_block, pitch);
+ reference_idct4x4(test_temp_block, test_output_block);
+
+ for (int j = 0; j < 16; ++j) {
+ const int diff = test_input_block[j] - test_output_block[j];
+ const int error = diff * diff;
+ if (max_error < error) max_error = error;
+ total_error += error;
+ }
+ }
+
+ EXPECT_GE(1, max_error)
+ << "Error: FDCT/IDCT has an individual roundtrip error > 1";
+
+ EXPECT_GE(count_test_block, total_error)
+ << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
+}
+
+INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c));
+
+INSTANTIATE_TEST_SUITE_P(SVP64, FdctTest, ::testing::Values(vp8_short_fdct4x4_svp64));
+
+} // namespace