From 175318267a1bd4480c0efc001d102b9ab1caa849 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 27 Sep 2022 10:04:49 +0000 Subject: [PATCH] Working version of VP8 DCT4x4 in SVP64 --- media/video/libvpx/Makefile | 34 ++-- media/video/libvpx/include/vp8_rtcd.h | 13 ++ media/video/libvpx/vp8_dct4x4_real.c.in | 51 ++++++ media/video/libvpx/vp8_dct4x4_real.s | 111 +++++++++++++ media/video/libvpx/vp8_dct4x4_ref.c | 66 ++++++++ media/video/libvpx/vp8_dct4x4_wrappers.c | 85 ++++++++++ media/video/libvpx/vp8_dct4x4_wrappers.h | 3 + media/video/libvpx/vp8_fdct4x4_test.cc | 194 +++++++++++++++++++++++ 8 files changed, 544 insertions(+), 13 deletions(-) create mode 100644 media/video/libvpx/include/vp8_rtcd.h create mode 100644 media/video/libvpx/vp8_dct4x4_real.c.in create mode 100644 media/video/libvpx/vp8_dct4x4_real.s create mode 100644 media/video/libvpx/vp8_dct4x4_ref.c create mode 100644 media/video/libvpx/vp8_dct4x4_wrappers.c create mode 100644 media/video/libvpx/vp8_dct4x4_wrappers.h create mode 100644 media/video/libvpx/vp8_fdct4x4_test.cc diff --git a/media/video/libvpx/Makefile b/media/video/libvpx/Makefile index 00a1fdd2..9d16c33a 100644 --- a/media/video/libvpx/Makefile +++ b/media/video/libvpx/Makefile @@ -1,4 +1,5 @@ -TARGET=libvpx_variance_test +VPXTARGET=libvpx_variance_test +VP8TARGET=vp8_dct_test EXAMPLE=pypowersim_wrapper_example CC=gcc @@ -7,27 +8,34 @@ AS=powerpc64le-linux-gnu-as OBJCOPY=powerpc64le-linux-gnu-objcopy CFLAGS= -Iinclude -O -g3 -I/usr/include/python3.7m CXXFLAGS= -Iinclude -O -g3 -ASFLAGS= -mlibresoc +ASFLAGS= -mlibresoc -mregnames LDFLAGS=-lgtest -pthread -lpython3.7m -BINFILES = vpx_get_mb_ss_svp64_real.bin vpx_get4x4sse_cs_svp64_real.bin -ASFILES = vpx_get_mb_ss_svp64_real.s vpx_get4x4sse_cs_svp64_real.s variance_svp64_real.s -CFILES = variance_ref.c variancefuncs_svp64.c variance_svp64_wrappers.c vpx_mem.c -CPPFILES = test_libvpx.cc variance_test.cc -EXAMPLEC = pypowersim_wrapper_example.c -EXAMPLEOBJ= ${EXAMPLEC:.c=.o} -OBJFILES = $(CFILES:.c=.o) $(CPPFILES:.cc=.o) $(ASFILES:.s=.o) +BINFILES = vpx_get_mb_ss_svp64_real.bin vpx_get4x4sse_cs_svp64_real.bin variance_svp64_real.bin vp8_dct4x4_real.bin +VP8_ASFILES = vp8_dct4x4_real.s +VPX_ASFILES = vpx_get_mb_ss_svp64_real.s vpx_get4x4sse_cs_svp64_real.s variance_svp64_real.s +VP8_CFILES = vp8_dct4x4_ref.c vp8_dct4x4_wrappers.c +VPX_CFILES = variance_ref.c variancefuncs_svp64.c variance_svp64_wrappers.c vpx_mem.c +VP8_CPPFILES = test_libvpx.cc vp8_fdct4x4_test.cc +VPX_CPPFILES = test_libvpx.cc variance_test.cc +EXAMPLEC = pypowersim_wrapper_example.c +EXAMPLEOBJ = ${EXAMPLEC:.c=.o} +VP8_OBJFILES = $(VP8_ASFILES:.s=.o) $(VP8_CFILES:.c=.o) $(VP8_CPPFILES:.cc=.o) +VPX_OBJFILES = $(VPX_ASFILES:.s=.o) $(VPX_CFILES:.c=.o) $(VPX_CPPFILES:.cc=.o) %.bin: %.o ${OBJCOPY} -I elf64-little -O binary $< $@ -${TARGET}: ${OBJFILES} ${BINFILES} - ${CXX} -o ${TARGET} ${OBJFILES} ${LDFLAGS} +${VP8TARGET}: ${VP8_OBJFILES} + ${CXX} -o ${VP8TARGET} ${VP8_OBJFILES} ${LDFLAGS} + +${VPXTARGET}: ${VPX_OBJFILES} + ${CXX} -o ${VPXTARGET} ${VPX_OBJFILES} ${LDFLAGS} ${EXAMPLE}: ${EXAMPLEOBJ} -all: ${TARGET} ${EXAMPLE} +all: ${VP8TARGET} ${VPXTARGET} ${EXAMPLE} ${BINFILES} .PHONY: clean clean: - rm -f ${TARGET} ${OBJFILES} ${BINFILES} + rm -f ${VP8TARGET} ${VPXTARGET} ${VP8_OBJFILES} ${VPX_OBJFILES} ${BINFILES} ${EXAMPLE} ${EXAMPLEOBJ} diff --git a/media/video/libvpx/include/vp8_rtcd.h b/media/video/libvpx/include/vp8_rtcd.h new file mode 100644 index 00000000..d883cdd3 --- /dev/null +++ b/media/video/libvpx/include/vp8_rtcd.h @@ -0,0 +1,13 @@ +#include "vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_short_fdct4x4_c(int16_t *input, int16_t *output, int32_t pitch); +void vp8_short_fdct4x4_svp64(int16_t *input, int16_t *output, int32_t pitch); + +#ifdef __cplusplus +} // extern "C" +#endif + diff --git a/media/video/libvpx/vp8_dct4x4_real.c.in b/media/video/libvpx/vp8_dct4x4_real.c.in new file mode 100644 index 00000000..0c26d91f --- /dev/null +++ b/media/video/libvpx/vp8_dct4x4_real.c.in @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +void vp8_short_fdct4x4_svp64(short *input, short *output, int pitch) { + int i; + int a1, b1, c1, d1; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; ++i) { + a1 = ((ip[0] + ip[3]) * 8); + b1 = ((ip[1] + ip[2]) * 8); + c1 = ((ip[1] - ip[2]) * 8); + d1 = ((ip[0] - ip[3]) * 8); + + op[0] = a1 + b1; + op[2] = a1 - b1; + + op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12; + op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12; + + ip += pitch / 2; + op += 4; + } + ip = output; + op = output; + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + op[0] = (a1 + b1 + 7) >> 4; + op[8] = (a1 - b1 + 7) >> 4; + + op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0); + op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16; + + ip++; + op++; + } +} diff --git a/media/video/libvpx/vp8_dct4x4_real.s b/media/video/libvpx/vp8_dct4x4_real.s new file mode 100644 index 00000000..34b59ce3 --- /dev/null +++ b/media/video/libvpx/vp8_dct4x4_real.s @@ -0,0 +1,111 @@ +.set in, 3 +.set out, 4 +.set pitch, 5 +.set c_2217, 6 +.set c_5352, 7 +.set c_7500, 9 +.set c_12000, 11 +.set c_51000, 12 +.set pred, 10 +.set ip, 16 +.set t, 32 +.set t2, 50 +.set t3, 70 +.set op, 90 + + .machine libresoc + .file "vp8_dct4x4_real.c" + .abiversion 2 + .section ".text" + .align 2 + .globl vp8_short_fdct4x4_svp64_real + .type vp8_short_fdct4x4_svp64_real, @function +vp8_short_fdct4x4_svp64_real: +.LFB0: + .cfi_startproc + li c_51000, 25500 + sldi c_51000, c_51000, 1 # c_51000 = 51000 + setvl 0,0,16,0,1,1 # Set VL to 16 elements + sv.lha *ip, 0(in) # Load 4 ints from (in) + + ori pred, 0, 0b0001000100010001 + sv.add/dm=r10 *t, *ip, *ip+3 # a1 = ip[0] + ip[3] + sv.add/dm=r10 *t+1, *ip+1, *ip+2 # b1 = ip[1] + ip[2] + sv.subf/dm=r10 *t+2, *ip+2, *ip+1 # c1 = ip[1] - ip[2] + sv.subf/dm=r10 *t+3, *ip+3, *ip # d1 = ip[0] - ip[3] + sv.mulli *t, *t, 8 # a1 *= 8, b1 *= 8, c1 *= 8, d1 *= 8 + + sv.add/dm=r10 *op, *t, *t+1 # op[0] = a1 + b1; + sv.subf/dm=r10 *op+2, *t+1, *t # op[2] = a1 - b1; + + # Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352 + ori pred, 0, 0b1100110011001100 + sv.mulli/m=r10 *t2, *t, 2217 # t2 has c1 * 2217, d1 * 2217 + sv.mulli/m=r10 *t3, *t, 5352 # t3 has c1 * 5352, d1 * 5352 + + ori pred, 0, 0b0010001000100010 + # op[1] = (c1 * 2217 + d1 * 5352 + 14500) + sv.add/m=r10 *op, *t2+1, *t3+2 # c1 * 2217 + d1 * 5352 + sv.addi/m=r10 *op, *op, 14500 # + 14500 + + ori pred, 0, 0b0100010001000100 + # op[3] = (d1 * 2217 - c1 * 5352 + 7500) + sv.subf/m=r10 *op+1, *t3, *t2+1 # - c1 * 5352 + d1 * 2127 + sv.addi/m=r10 *op+1, *op+1, 7500 # + 7500 + + ori pred, 0, 0b1010101010101010 + sv.rldicl/m=r10 *op, *op, 52, 12 # op[1] >>= 12, op[3] >>= 12 + + # column-wise DCT + ori pred, 0, 0b0000000000001111 + sv.add/m=r10 *t, *op, *op+12 # a1 = ip[0] + ip[12] + sv.add/m=r10 *t+4, *op+4, *op+8 # b1 = ip[4] + ip[8] + sv.subf/m=r10 *t+8, *op+8, *op+4 # c1 = ip[4] - ip[8] + sv.subf/m=r10 *t+12, *op+12, *op # d1 = ip[0] - ip[12] + + # op[0] = (a1 + b1 + 7) >> 4 + sv.add/m=r10 *op, *t, *t+4 # op[0] = a1 + b1 + sv.addi/m=r10 *op, *op, 7 # op[0] += 7 + + # op[8] = (a1 - b1 + 7) >> 4 + sv.subf/m=r10 *op+8, *t+4, *t # op[8] = a1 - b1 + sv.addi/m=r10 *op+8, *op+8, 7 # op[8] += 7 + + ori pred, 0, 0b0000111100001111 + sv.rldicl/m=r10 *op, *op, 60, 4 # op[0] >>= 4, op[8] >>= 4 + + # Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352 + ori pred, 0, 0b1111111100000000 + sv.mulli/m=r10 *t2, *t, 2217 # t2 has c1 * 2217, d1 * 2217 + sv.mulli/m=r10 *t3, *t, 5352 # t3 has c1 * 5352, d1 * 5352 + + # op[4] = ((c1 * 2217 + d1 * 5352 + 12000) + ori pred, 0, 0b0000000011110000 + sv.add/m=r10 *op, *t2+4, *t3+8 # c1 * 2217 + d1 * 5352 + sv.addi/m=r10 *op, *op, 12000 # + 12000 + + # op[12] = (d1 * 2217 - c1 * 5352 + 51000) + ori pred, 0, 0b1111000000000000 + sv.subf/m=r10 *op, *t3-4, *t2 # - c1 * 5352 + d1 * 2127 + sv.add/m=r10 *op, *op, c_51000 # + 51000 + + ori pred, 0, 0b1111000011110000 + sv.rldicl/m=r10 *op, *op, 48, 16 # op[4] >>= 16, op[12] >= 16 + + # op[4] += (d1 != 0) + #ori pred, 0, 0b0000000011110000 + setvl 0,0,4,0,1,1 # Set VL to 16 elements + sv.cmpi *cr0, 0, *t+12, 1 + sv.addi/m=ne *op+4, *op+4, 1 + + # store to buffer + setvl 0,0,16,0,1,1 # Set VL to 16 elements + sv.sth *op, 0(out) + blr + .long 0 + .byte 0,0,0,0,128,1,0,1 + .cfi_endproc +.LFE0: + .size vp8_short_fdct4x4_svp64_real,.-vp8_short_fdct4x4_svp64_real + .ident "GCC: (Debian 8.3.0-6) 8.3.0" + .section .note.GNU-stack,"",@progbits diff --git a/media/video/libvpx/vp8_dct4x4_ref.c b/media/video/libvpx/vp8_dct4x4_ref.c new file mode 100644 index 00000000..0cb2b0a5 --- /dev/null +++ b/media/video/libvpx/vp8_dct4x4_ref.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vp8_rtcd.h" + +void vp8_short_fdct4x4_c(int16_t *input, int16_t *output, int32_t pitch) { + int i; + int a1, b1, c1, d1; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; ++i) { + a1 = ((ip[0] + ip[3])); + b1 = ((ip[1] + ip[2])); + c1 = ((ip[1] - ip[2])); + d1 = ((ip[0] - ip[3])); + + a1 *= 8; + b1 *= 8; + c1 *= 8; + d1 *= 8; + printf("a1 = %08x\tb1 = %08x\tc1 = %08x\td1 = %08x\n", a1, b1, c1, d1); + + op[0] = a1 + b1; + op[2] = a1 - b1; + printf("op[0] = %04x\top[2] = %04x\n", (uint16_t)op[0], (uint16_t)op[2]); + + op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12; + op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12; + printf("op[1] = %04x\top[3] = %04x\n", (uint16_t)op[1], (uint16_t)op[3]); + + ip += pitch / 2; + op += 4; + } + ip = output; + op = output; + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + printf("a1 = %08x\tb1 = %08x\tc1 = %08x\td1 = %08x\n", a1, b1, c1, d1); + + op[0] = (a1 + b1 + 7) >> 4; + op[8] = (a1 - b1 + 7) >> 4; + printf("op[%d] = %08x\top[%d] = %08x\n", i, op[0], i+8, op[8]); + + op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0); + op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16; + printf("op[%d] = %04x\top[%d] = %04x\n", i+4, (uint16_t)op[4], i+12, (uint16_t)op[12]); + + ip++; + op++; + } +} diff --git a/media/video/libvpx/vp8_dct4x4_wrappers.c b/media/video/libvpx/vp8_dct4x4_wrappers.c new file mode 100644 index 00000000..da92cf3c --- /dev/null +++ b/media/video/libvpx/vp8_dct4x4_wrappers.c @@ -0,0 +1,85 @@ +#include +#include +#include + +#include "pypowersim_wrapper_common.h" +#include "vp8_dct4x4_wrappers.h" +#include "vp8_rtcd.h" + +void vp8_short_fdct4x4_svp64(int16_t *input, int16_t *output, int32_t pitch) { + + printf("pitch: %d\n", pitch); + int16_t output2[16]; + vp8_short_fdct4x4_c(input, output2, pitch); + + + // It cannot be the same pointer as the original function, as it is really a separate CPU/RAM + // we have to memcpy from input to this pointer, the address was chosen arbitrarily + uint64_t input_svp64 = 0x100000; + uint64_t output_svp64 = 0x200000; + + // Create the pypowersim_state + pypowersim_state_t *state = pypowersim_prepare(); + + // Change the relevant elements, mandatory: body + state->binary = PyBytes_FromStringAndSize((const char *)&vp8_short_fdct4x4_svp64_real, 1000); + // Set GPR #3 to the input pointer + PyObject *address = PyLong_FromUnsignedLongLong(input_svp64); + PyList_SetItem(state->initial_regs, 3, address); + // Load data into buffer from real memory + for (int i=0; i < 16; i += 4) { + PyObject *svp64_address = PyLong_FromUnsignedLongLong(input_svp64 + i*2); + uint64_t val = (uint64_t)(input[0]) & 0xffff; + val |= ((uint64_t)(input[1]) & 0xffff) << 16; + val |= ((uint64_t)(input[2]) & 0xffff) << 32; + val |= ((uint64_t)(input[3]) & 0xffff) << 48; + //printf("src: %p -> %04x %04x %04x %04x\t val: %016lx -> %p\n", input, (uint16_t)input[0], (uint16_t)input[1], (uint16_t)input[2], (uint16_t)input[3], val, input_svp64); + PyObject *word = PyLong_FromUnsignedLongLong(val); + PyDict_SetItem(state->initial_mem, svp64_address, word); + input += 4; + } + // Set GPR #4 to the output pointer + PyObject *out_address = PyLong_FromUnsignedLongLong(output_svp64); + PyList_SetItem(state->initial_regs, 4, out_address); + + // Prepare the arguments object for the call + pypowersim_prepareargs(state); + + // Call the function and get the resulting object + state->result_obj = PyObject_CallObject(state->simulator, state->args); + if (!state->result_obj) { + PyErr_Print(); + printf("Error invoking 'run_a_simulation'\n"); + pypowersim_finalize(state); + exit(1); + } + + PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem"); + if (!memobj) { + PyErr_Print(); + Py_DECREF(state->result_obj); + printf("Error getting mem object\n"); + } + + PyObject *mem = PyObject_GetAttrString(memobj, "mem"); + if (!mem) { + PyErr_Print(); + Py_DECREF(state->result_obj); + printf("Error getting mem dict\n"); + } + for (int i=0; i < 16; i += 4) { + PyObject *svp64_address = PyLong_FromUnsignedLongLong((output_svp64 + i*2)/8); + PyObject *pyval = PyDict_GetItem(mem, svp64_address); + uint64_t val = PyLong_AsUnsignedLongLong(pyval); + output[i + 0] = (uint16_t) val; + output[i + 1] = (uint16_t) (val >> 16); + output[i + 2] = (uint16_t) (val >> 32); + output[i + 3] = (uint16_t) (val >> 48); + //printf("output: %p -> %04x %04x %04x %04x\t val: %016lx -> %p\n", output, (uint16_t)output[i], (uint16_t)output[i + 1], (uint16_t)output[i + 2], (uint16_t)output[i + 3], val, output_svp64); + } + + for (int i=0; i < 16; i += 4) { + printf("output[%d] : %04x %04x %04x %04x\n", i, (uint16_t)output[i], (uint16_t)output[i+1], (uint16_t)output[i+2], (uint16_t)output[i+3]); + printf("output2[%d]: %04x %04x %04x %04x\n", i, (uint16_t)output2[i], (uint16_t)output2[i+1], (uint16_t)output2[i+2], (uint16_t)output2[i+3]); + } +} diff --git a/media/video/libvpx/vp8_dct4x4_wrappers.h b/media/video/libvpx/vp8_dct4x4_wrappers.h new file mode 100644 index 00000000..cef130ca --- /dev/null +++ b/media/video/libvpx/vp8_dct4x4_wrappers.h @@ -0,0 +1,3 @@ +#include + +void vp8_short_fdct4x4_svp64_real(int16_t *input, int16_t *output, int32_t pitch); diff --git a/media/video/libvpx/vp8_fdct4x4_test.cc b/media/video/libvpx/vp8_fdct4x4_test.cc new file mode 100644 index 00000000..7c3f7336 --- /dev/null +++ b/media/video/libvpx/vp8_fdct4x4_test.cc @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "vpx_misc.h" +#include "vp8_rtcd.h" +#include "acm_random.h" +#include "clear_system_state.h" +#include "register_state_check.h" +#include "vpx_integer.h" +#include "vpx_mem.h" +#include "mem.h" +#include "vpx_timer.h" + +namespace { + +typedef void (*FdctFunc)(int16_t *a, int16_t *b, int a_stride); + +const int cospi8sqrt2minus1 = 20091; +const int sinpi8sqrt2 = 35468; + +void reference_idct4x4(const int16_t *input, int16_t *output) { + const int16_t *ip = input; + int16_t *op = output; + + for (int i = 0; i < 4; ++i) { + const int a1 = ip[0] + ip[8]; + const int b1 = ip[0] - ip[8]; + const int temp1 = (ip[4] * sinpi8sqrt2) >> 16; + const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); + const int c1 = temp1 - temp2; + const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + const int temp4 = (ip[12] * sinpi8sqrt2) >> 16; + const int d1 = temp3 + temp4; + op[0] = a1 + d1; + op[12] = a1 - d1; + op[4] = b1 + c1; + op[8] = b1 - c1; + ++ip; + ++op; + } + ip = output; + op = output; + for (int i = 0; i < 4; ++i) { + const int a1 = ip[0] + ip[2]; + const int b1 = ip[0] - ip[2]; + const int temp1 = (ip[1] * sinpi8sqrt2) >> 16; + const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); + const int c1 = temp1 - temp2; + const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + const int temp4 = (ip[3] * sinpi8sqrt2) >> 16; + const int d1 = temp3 + temp4; + op[0] = (a1 + d1 + 4) >> 3; + op[3] = (a1 - d1 + 4) >> 3; + op[1] = (b1 + c1 + 4) >> 3; + op[2] = (b1 - c1 + 4) >> 3; + ip += 4; + op += 4; + } +} + +using libvpx_test::ACMRandom; + +class FdctTest : public ::testing::TestWithParam { + public: + virtual void SetUp() { + fdct_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + FdctFunc fdct_func_; + ACMRandom rnd_; +}; + +TEST_P(FdctTest, SignBiasCheck) { + int16_t test_input_block[16]; + DECLARE_ALIGNED(16, int16_t, test_output_block[16]); + const int pitch = 8; + int count_sign_block[16][2]; + const int count_test_block = 5; + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 16; ++j) { + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + fdct_func_(test_input_block, test_output_block, pitch); + + for (int j = 0; j < 16; ++j) { + if (test_output_block[j] < 0) { + ++count_sign_block[j][0]; + } else if (test_output_block[j] > 0) { + ++count_sign_block[j][1]; + } + } + } + + bool bias_acceptable = true; + for (int j = 0; j < 16; ++j) { + bias_acceptable = + bias_acceptable && + (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000); + } + + EXPECT_EQ(true, bias_acceptable) + << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]"; + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-15, 15]. + for (int j = 0; j < 16; ++j) { + test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4); + } + + fdct_func_(test_input_block, test_output_block, pitch); + + for (int j = 0; j < 16; ++j) { + if (test_output_block[j] < 0) { + ++count_sign_block[j][0]; + } else if (test_output_block[j] > 0) { + ++count_sign_block[j][1]; + } + } + } + + bias_acceptable = true; + for (int j = 0; j < 16; ++j) { + bias_acceptable = + bias_acceptable && + (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000); + } + + EXPECT_EQ(true, bias_acceptable) + << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; +} + +TEST_P(FdctTest, RoundTripErrorCheck) { + int max_error = 0; + double total_error = 0; + const int count_test_block = 5; + for (int i = 0; i < count_test_block; ++i) { + int16_t test_input_block[16]; + int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_temp_block[16]); + + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 16; ++j) { + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + const int pitch = 8; + fdct_func_(test_input_block, test_temp_block, pitch); + reference_idct4x4(test_temp_block, test_output_block); + + for (int j = 0; j < 16; ++j) { + const int diff = test_input_block[j] - test_output_block[j]; + const int error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + + EXPECT_GE(1, max_error) + << "Error: FDCT/IDCT has an individual roundtrip error > 1"; + + EXPECT_GE(count_test_block, total_error) + << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; +} + +INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); + +INSTANTIATE_TEST_SUITE_P(SVP64, FdctTest, ::testing::Values(vp8_short_fdct4x4_svp64)); + +} // namespace -- 2.30.2