source "package/xpdf/Config.in"
source "package/xstroke/Config.in"
source "package/xvkbd/Config.in"
-
-comment "Video libraries/codecs and applications"
-source "package/mplayer/Config.in"
-source "package/vlc/Config.in"
endmenu
menu "Compressors / decompressors"
+++ /dev/null
-config BR2_PACKAGE_MPLAYER
- bool "mplayer"
- select BR2_PACKAGE_LIBMAD
- help
- MPlayer is a movie player which runs on many systems and supports
- many different file formats.
-
- http://www.mplayerhq.hu/
+++ /dev/null
- cfg-common.h | 4 +
- cfg-mencoder.h | 4 +
- cfg-mplayer.h | 4 +
- configure | 13 +-
- libaf/af_format.c | 7 +
- libavcodec/Makefile | 7 +
- libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
- libavcodec/avr32/fdct.S | 541 ++++++++
- libavcodec/avr32/h264idct.S | 451 +++++++
- libavcodec/avr32/idct.S | 829 ++++++++++++
- libavcodec/avr32/mc.S | 434 ++++++
- libavcodec/avr32/pico.h | 260 ++++
- libavcodec/bitstream.h | 77 +-
- libavcodec/dsputil.c | 3 +
- libavcodec/h264.c | 15 +
- libavutil/common.h | 16 +
- libavutil/internal.h | 9 +
- libfaad2/common.h | 2 +-
- libmpcodecs/ad_libmad.c | 5 +
- libswscale/pico-avr32.h | 137 ++
- libswscale/swscale_internal.h | 2 +-
- libswscale/yuv2rgb.c | 14 +
- libswscale/yuv2rgb_avr32.c | 416 ++++++
- libvo/vo_fbdev2.c | 101 ++-
- version.sh | 2 +-
- 25 files changed, 6011 insertions(+), 20 deletions(-)
- create mode 100644 libavcodec/avr32/dsputil_avr32.c
- create mode 100644 libavcodec/avr32/fdct.S
- create mode 100644 libavcodec/avr32/h264idct.S
- create mode 100644 libavcodec/avr32/idct.S
- create mode 100644 libavcodec/avr32/mc.S
- create mode 100644 libavcodec/avr32/pico.h
- create mode 100644 libswscale/pico-avr32.h
- create mode 100644 libswscale/yuv2rgb_avr32.c
-
-diff --git a/cfg-common.h b/cfg-common.h
-index 780df38..7d878a8 100644
---- a/cfg-common.h
-+++ b/cfg-common.h
-@@ -235,6 +235,10 @@
- {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
- {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
-
-+#ifdef ARCH_AVR32
-+ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
-+ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
-+#endif
- // draw by slices or whole frame (useful with libmpeg2/libavcodec)
- {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
- {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
-diff --git a/cfg-mencoder.h b/cfg-mencoder.h
-index 411b748..addf791 100644
---- a/cfg-mencoder.h
-+++ b/cfg-mencoder.h
-@@ -5,6 +5,10 @@
-
- #include "cfg-common.h"
-
-+#ifdef ARCH_AVR32
-+extern int avr32_use_pico;
-+#endif
-+
- #ifdef USE_FAKE_MONO
- extern int fakemono; // defined in dec_audio.c
- #endif
-diff --git a/cfg-mplayer.h b/cfg-mplayer.h
-index 62b6eac..31499c2 100644
---- a/cfg-mplayer.h
-+++ b/cfg-mplayer.h
-@@ -4,6 +4,10 @@
-
- #include "cfg-common.h"
-
-+#ifdef ARCH_AVR32
-+extern int avr32_use_pico;
-+#endif
-+
- extern int noconsolecontrols;
-
- #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
-diff --git a/configure b/configure
-index 29002c8..56c6fe4 100755
---- a/configure
-+++ b/configure
-@@ -1203,6 +1203,15 @@ EOF
- _optimizing="$proc"
- ;;
-
-+ avr32)
-+ _def_arch='#define ARCH_AVR32'
-+ _target_arch='TARGET_ARCH_AVR32 = yes'
-+ iproc='avr32'
-+ proc=''
-+ _march=''
-+ _mcpu=''
-+ _optimizing=''
-+ ;;
- arm|armv4l|armv5tel)
- _def_arch='#define ARCH_ARMV4L 1'
- _target_arch='TARGET_ARCH_ARMV4L = yes'
-@@ -1533,7 +1542,7 @@ echores $_named_asm_args
- # Checking for CFLAGS
- _stripbinaries=yes
- if test "$_profile" != "" || test "$_debug" != "" ; then
-- CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
-+ CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
- if test "$_cc_major" -ge "3" ; then
- CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
- fi
-@@ -3794,7 +3803,7 @@ fi
-
-
- echocheck "X11 headers presence"
-- for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
-+ for I in `echo $_inc_extra | sed s/-I//g`; do
- if test -f "$I/X11/Xlib.h" ; then
- _inc_x11="-I$I"
- _x11_headers="yes"
-diff --git a/libaf/af_format.c b/libaf/af_format.c
-index e5b7cc9..5d7ea6d 100644
---- a/libaf/af_format.c
-+++ b/libaf/af_format.c
-@@ -20,7 +20,14 @@
- // Integer to float conversion through lrintf()
- #ifdef HAVE_LRINTF
- #include <math.h>
-+
-+#ifdef ARCH_AVR32
-+#define lrintf(x) rint(x)
-+#define llrint(x) (long long)rint(x)
-+#else
- long int lrintf(float);
-+#endif
-+
- #else
- #define lrintf(x) ((int)(x))
- #endif
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 17b6c45..8e1dc96 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \
-
- sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
-
-+# avr32 specific stuff
-+ifeq ($(TARGET_ARCH_AVR32),yes)
-+ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
-+OBJS += avr32/dsputil_avr32.o
-+endif
-+
- # sun mediaLib specific stuff
- OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \
-
-@@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
- clean::
- rm -f \
- i386/*.o i386/*~ \
-+ avr32/*.o avr32/*~ \
- armv4l/*.o armv4l/*~ \
- mlib/*.o mlib/*~ \
- alpha/*.o alpha/*~ \
-diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
-new file mode 100644
-index 0000000..200284d
---- /dev/null
-+++ b/libavcodec/avr32/dsputil_avr32.c
-@@ -0,0 +1,2678 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+
-+#include "../dsputil.h"
-+#include "pico.h"
-+
-+int avr32_use_pico = 1;
-+
-+//#define CHECK_DSP_FUNCS_AGAINST_C
-+
-+#ifdef CHECK_DSP_FUNCS_AGAINST_C
-+#define DSP_FUNC_NAME(name) test_ ## name
-+#else
-+#define DSP_FUNC_NAME(name) name
-+#endif
-+
-+union doubleword {
-+ int64_t doubleword;
-+ struct {
-+ int32_t top;
-+ int32_t bottom;
-+ } words;
-+};
-+
-+#undef LD16
-+#undef LD32
-+#undef LD64
-+
-+#define LD16(a) (*((uint16_t*)(a)))
-+#define LD32(a) (*((uint32_t*)(a)))
-+#define LD64(a) (*((uint64_t*)(a)))
-+#define LD64_UNALIGNED(a) \
-+ ({ union doubleword __tmp__; \
-+ __tmp__.words.top = LD32(a); \
-+ __tmp__.words.bottom = LD32(a + 4); \
-+ __tmp__.doubleword; })
-+
-+#undef ST32
-+#undef ST16
-+
-+#define ST16(a, b) *((uint16_t*)(a)) = (b)
-+#define ST32(a, b) *((uint32_t*)(a)) = (b)
-+
-+#undef rnd_avg32
-+#define rnd_avg32(a, b) \
-+ ({ uint32_t __tmp__;\
-+ asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
-+ __tmp__;})
-+
-+void idct_avr32(DCTELEM *data);
-+void fdct_avr32(DCTELEM *data);
-+
-+void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
-+void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
-+
-+void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
-+void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
-+
-+#define extern_dspfunc(PFX, NUM) \
-+ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
-+ void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
-+ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
-+ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
-+
-+extern_dspfunc(put, 8);
-+extern_dspfunc(put_no_rnd, 8);
-+extern_dspfunc(avg, 8);
-+extern_dspfunc(avg_no_rnd, 8);
-+#undef extern_dspfunc
-+
-+#ifdef CHECK_DSP_FUNCS_AGAINST_C
-+#define extern_dspfunc(PFX, NUM) \
-+ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
-+ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
-+ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
-+ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
-+
-+extern_dspfunc(put, 4);
-+extern_dspfunc(put_no_rnd, 4);
-+extern_dspfunc(put, 8);
-+extern_dspfunc(put_no_rnd, 8);
-+extern_dspfunc(put, 16);
-+extern_dspfunc(put_no_rnd, 16);
-+extern_dspfunc(avg, 8);
-+extern_dspfunc(avg_no_rnd, 8);
-+extern_dspfunc(avg, 16);
-+extern_dspfunc(avg_no_rnd, 16);
-+
-+
-+#undef extern_dspfunc
-+#define extern_dspfunc(PFX, NUM) \
-+void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
-+void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \
-+
-+extern_dspfunc(put_h264_qpel, 16);
-+extern_dspfunc(put_h264_qpel, 8);
-+extern_dspfunc(put_h264_qpel, 4);
-+extern_dspfunc(avg_h264_qpel, 16);
-+extern_dspfunc(avg_h264_qpel, 8);
-+extern_dspfunc(avg_h264_qpel, 4);
-+
-+#undef extern_dspfunc
-+
-+void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
-+void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
-+void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
-+
-+void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
-+void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
-+void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
-+
-+
-+void dump_block8(uint8_t *block, int line_size, int h);
-+void dump_block4(uint8_t *block, int line_size, int h);
-+void dump_block(uint8_t *block, int line_size, int h, int w);
-+
-+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
-+ int h, char *name, int max_dev);
-+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
-+ int h, char *name, int max_dev);
-+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
-+ int h, int width, char *name, int max_dev);
-+
-+#define PIXOP2( OPNAME, OP ) \
-+void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ OP(*((uint32_t*)(block )), LD32(pixels ));\
-+ pixels+=line_size;\
-+ block +=line_size;\
-+ }\
-+}\
-+void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-+ int src_stride1, int src_stride2, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ uint32_t a,b;\
-+ a= LD32(&src1[i*src_stride1 ]);\
-+ b= LD32(&src2[i*src_stride2 ]);\
-+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
-+ a= LD32(&src1[i*src_stride1+4]);\
-+ b= LD32(&src2[i*src_stride2+4]);\
-+ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
-+ }\
-+}\
-+\
-+void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-+ int src_stride1, int src_stride2, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ uint32_t a,b;\
-+ a= LD32(&src1[i*src_stride1 ]);\
-+ b= LD32(&src2[i*src_stride2 ]);\
-+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
-+ }\
-+}\
-+\
-+void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-+ int src_stride1, int src_stride2, int h){\
-+ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
-+ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
-+}\
-+
-+#else
-+#define PIXOP2( OPNAME, OP ) \
-+static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ OP(*((uint32_t*)(block )), LD32(pixels ));\
-+ pixels+=line_size;\
-+ block +=line_size;\
-+ }\
-+}\
-+static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ OP(*((uint32_t*)(block )), LD32(pixels ));\
-+ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
-+ pixels+=line_size;\
-+ block +=line_size;\
-+ }\
-+}\
-+static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ OP(*((uint32_t*)(block )), LD32(pixels ));\
-+ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
-+ OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
-+ OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
-+ pixels+=line_size;\
-+ block +=line_size;\
-+ }\
-+}\
-+static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-+ int src_stride1, int src_stride2, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ uint32_t a,b;\
-+ a= LD32(&src1[i*src_stride1 ]);\
-+ b= LD32(&src2[i*src_stride2 ]);\
-+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
-+ a= LD32(&src1[i*src_stride1+4]);\
-+ b= LD32(&src2[i*src_stride2+4]);\
-+ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
-+ }\
-+}\
-+\
-+static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-+ int src_stride1, int src_stride2, int h){\
-+ int i;\
-+ for(i=0; i<h; i++){\
-+ uint32_t a,b;\
-+ a= LD32(&src1[i*src_stride1 ]);\
-+ b= LD32(&src2[i*src_stride2 ]);\
-+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
-+ }\
-+}\
-+\
-+static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-+ int src_stride1, int src_stride2, int h){\
-+ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
-+ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
-+}\
-+
-+#endif
-+
-+#define op_avg(a, b) a = rnd_avg32(a, b)
-+#define op_put(a, b) a = b
-+
-+PIXOP2(avg, op_avg)
-+PIXOP2(put, op_put)
-+#undef op_avg
-+#undef op_put
-+
-+
-+
-+static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-+{
-+ int i;
-+ for(i=0; i<h; i++)
-+ {
-+ ST32(dst , LD32(src ));
-+ dst+=dstStride;
-+ src+=srcStride;
-+ }
-+}
-+
-+static void clear_blocks_avr32(DCTELEM *blocks)
-+{
-+ int n = 12;
-+ uint64_t tmp1, tmp2;
-+ blocks += 6*64;
-+ asm volatile ( "mov\t%1, 0\n"
-+ "mov\t%m1, 0\n"
-+ "mov\t%2, 0\n"
-+ "mov\t%m2, 0\n"
-+ "0:\n"
-+ "stm\t--%3, %1, %m1, %2, %m2\n"
-+ "stm\t--%3, %1, %m1, %2, %m2\n"
-+ "stm\t--%3, %1, %m1, %2, %m2\n"
-+ "stm\t--%3, %1, %m1, %2, %m2\n"
-+ "sub\t%0, 1\n"
-+ "brne\t0b\n"
-+ : "+r"(n), "=&r"(tmp1), "=&r"(tmp2),
-+ "+r"(blocks));
-+}
-+
-+
-+static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-+{
-+ int i;
-+ for(i=0; i<h; i++)
-+ {
-+ ST32(dst , LD32(src ));
-+ ST32(dst+4 , LD32(src+4 ));
-+ dst+=dstStride;
-+ src+=srcStride;
-+ }
-+}
-+
-+static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-+{
-+ int i;
-+ for(i=0; i<h; i++)
-+ {
-+ ST32(dst , LD32(src ));
-+ ST32(dst+4 , LD32(src+4 ));
-+ ST32(dst+8 , LD32(src+8 ));
-+ ST32(dst+12, LD32(src+12));
-+ dst+=dstStride;
-+ src+=srcStride;
-+ }
-+}
-+
-+
-+static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-+ const int A=(8-x)*(8-y);
-+ const int B=( x)*(8-y);
-+ const int C=(8-x)*( y);
-+ const int D=( x)*( y);
-+ int i;
-+
-+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF0_B, 32);
-+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF1_B, 0);
-+ PICO_PUT_W(PICO_COEFF2_A, 0);
-+ PICO_PUT_W(PICO_COEFF2_B, 0);
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(6)
-+ | PICO_OFFSET_FRAC_BITS(6));
-+
-+ for(i=0; i<h; i++)
-+ {
-+
-+ int src0 = LD32(src);
-+ int src1 = LD32(src + stride);
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
-+ src += stride;
-+ ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0));
-+ dst += stride;
-+ }
-+}
-+
-+
-+static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-+ const int A=(8-x)*(8-y);\
-+ const int B=( x)*(8-y);
-+ const int C=(8-x)*( y);
-+ const int D=( x)*( y);
-+ int i;
-+
-+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF0_B, 32);
-+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF1_B, 0);
-+ PICO_PUT_W(PICO_COEFF2_A, 0);
-+ PICO_PUT_W(PICO_COEFF2_B, 0);
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(6)
-+ | PICO_OFFSET_FRAC_BITS(6));
-+
-+ for(i=0; i<h; i++)
-+ {
-+ /*
-+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
-+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
-+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
-+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
-+ dst+= stride;
-+ src+= stride;
-+ */
-+
-+ int src0 = LD32(src);
-+ int src1 = (((int)src[4] << 24) | (int)src[stride]);
-+ int src2 = LD32(src + stride + 1);
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
-+ src += stride;
-+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
-+
-+ dst += stride;
-+ }
-+}
-+
-+static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-+ const int A=(8-x)*(8-y);
-+ const int B=( x)*(8-y);
-+ const int C=(8-x)*( y);
-+ const int D=( x)*( y);
-+ int i;
-+
-+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF0_B, 32);
-+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF1_B, 0);
-+ PICO_PUT_W(PICO_COEFF2_A, 0);
-+ PICO_PUT_W(PICO_COEFF2_B, 0);
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(6)
-+ | PICO_OFFSET_FRAC_BITS(6));
-+
-+ for(i=0; i<h; i++)
-+ {
-+ /*
-+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
-+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
-+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
-+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
-+ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
-+ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
-+ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
-+ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
-+ dst+= stride;
-+ src+= stride;
-+ */
-+ int src0 = LD32(src);
-+ int src1 = (((int)src[4] << 24) | (int)src[stride]);
-+ int src2 = LD32(src + stride + 1);
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
-+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
-+
-+ src0 = LD32(src + 4);
-+ src1 = (src[8] << 24) | src[stride + 4];
-+ src2 = LD32(src + stride + 5);
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
-+ src += stride;
-+ ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
-+
-+ dst += stride;
-+ }
-+}
-+
-+
-+static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-+ const int A=(8-x)*(8-y);
-+ const int B=( x)*(8-y);
-+ const int C=(8-x)*( y);
-+ const int D=( x)*( y);
-+ int i;
-+
-+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF0_B, 32);
-+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF1_B, 0);
-+ PICO_PUT_W(PICO_COEFF2_A, 0);
-+ PICO_PUT_W(PICO_COEFF2_B, 0);
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(6)
-+ | PICO_OFFSET_FRAC_BITS(6));
-+
-+ for(i=0; i<h; i++)
-+ {
-+ int src0 = LD32(src);
-+ int src1 = LD32(src + stride);
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
-+ src += stride;
-+ ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0)));
-+ dst += stride;
-+ }
-+}
-+
-+
-+static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-+ const int A=(8-x)*(8-y);\
-+ const int B=( x)*(8-y);
-+ const int C=(8-x)*( y);
-+ const int D=( x)*( y);
-+ int i;
-+
-+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF0_B, 32);
-+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF1_B, 0);
-+ PICO_PUT_W(PICO_COEFF2_A, 0);
-+ PICO_PUT_W(PICO_COEFF2_B, 0);
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(6)
-+ | PICO_OFFSET_FRAC_BITS(6));
-+
-+ for(i=0; i<h; i++)
-+ {
-+ /*
-+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
-+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
-+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
-+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
-+ dst+= stride;
-+ src+= stride;
-+ */
-+
-+ int src0 = *((int *)src);
-+ int src1 = (int)((src[4] << 24) | src[stride]);
-+ int src2 = *((int *)(src + stride + 1));
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
-+ src += stride;
-+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
-+ dst += stride;
-+ }
-+}
-+
-+static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-+ const int A=(8-x)*(8-y);
-+ const int B=( x)*(8-y);
-+ const int C=(8-x)*( y);
-+ const int D=( x)*( y);
-+ int i;
-+
-+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF0_B, 32);
-+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
-+ PICO_PUT_W(PICO_COEFF1_B, 0);
-+ PICO_PUT_W(PICO_COEFF2_A, 0);
-+ PICO_PUT_W(PICO_COEFF2_B, 0);
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(6)
-+ | PICO_OFFSET_FRAC_BITS(6));
-+
-+ for(i=0; i<h; i++)
-+ {
-+ /*
-+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
-+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
-+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
-+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
-+ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
-+ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
-+ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
-+ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
-+ dst+= stride;
-+ src+= stride;
-+ */
-+ int src0 = *((int *)src);
-+ int src1 = (volatile int)((src[4] << 24) | src[stride]);
-+ int src2 = *((int *)(src + stride + 1));
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
-+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
-+
-+ src0 = *((int *)(src + 4));
-+ src1 = (int)((src[8] << 24) | src[stride + 4]);
-+ src2 = *((int *)(src + stride + 5));
-+
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
-+ src += stride;
-+ ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
-+ dst += stride;
-+ }
-+}
-+
-+static struct pico_config_t h264_qpel4_h_lowpass_config = {
-+ .input_mode = PICO_HOR_FILTER_MODE,
-+ .output_mode = PICO_PLANAR_MODE,
-+ .coeff_frac_bits = 5,
-+ .offset_frac_bits = 5,
-+ .coeff0_0 = 1,
-+ .coeff0_1 = -5,
-+ .coeff0_2 = 20,
-+ .coeff0_3 = 16,
-+ .coeff1_0 = 20,
-+ .coeff1_1 = -5,
-+ .coeff1_2 = 1,
-+ .coeff1_3 = 0,
-+ .coeff2_0 = 0,
-+ .coeff2_1 = 0,
-+ .coeff2_2 = 0,
-+ .coeff2_3 = 0
-+};
-+
-+
-+
-+static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ const int h=4;
-+ int i;
-+
-+ set_pico_config(&h264_qpel4_h_lowpass_config);
-+
-+ for(i=0; i<h; i++){
-+
-+ /*
-+ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
-+ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
-+ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
-+ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
-+ dst+=dstStride;\
-+ src+=srcStride;\ */
-+ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
-+ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
-+ src += srcStride;
-+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
-+ dst += dstStride;
-+ }
-+}
-+
-+static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ const int h=4;
-+ int i;
-+
-+ set_pico_config(&h264_qpel4_h_lowpass_config);
-+
-+ for(i=0; i<h; i++){
-+
-+ /*
-+ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
-+ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
-+ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
-+ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
-+ dst+=dstStride;\
-+ src+=srcStride;\ */
-+
-+ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
-+ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
-+ src += srcStride;
-+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
-+ dst += dstStride;
-+ }
-+}
-+
-+static struct pico_config_t h264_qpel4_v_lowpass_config1 = {
-+ .input_mode = PICO_VERT_FILTER_MODE,
-+ .output_mode = PICO_PACKED_MODE,
-+ .coeff_frac_bits = 5,
-+ .offset_frac_bits = 5,
-+ .coeff0_0 = 1,
-+ .coeff0_1 = -5,
-+ .coeff0_2 = 20,
-+ .coeff0_3 = 16,
-+ .coeff1_0 = 1,
-+ .coeff1_1 = -5,
-+ .coeff1_2 = 20,
-+ .coeff1_3 = 16,
-+ .coeff2_0 = 1,
-+ .coeff2_1 = -5,
-+ .coeff2_2 = 20,
-+ .coeff2_3 = 16
-+};
-+
-+
-+
-+static struct pico_config_t h264_qpel4_v_lowpass_config2 = {
-+ .input_mode = PICO_VERT_FILTER_MODE,
-+ .output_mode = PICO_PLANAR_MODE,
-+ .coeff_frac_bits = 5,
-+ .offset_frac_bits = 5,
-+ .coeff0_0 = 1,
-+ .coeff0_1 = -5,
-+ .coeff0_2 = 20,
-+ .coeff0_3 = 16,
-+ .coeff1_0 = 20,
-+ .coeff1_1 = -5,
-+ .coeff1_2 = 1,
-+ .coeff1_3 = 0,
-+ .coeff2_0 = 0,
-+ .coeff2_1 = 0,
-+ .coeff2_2 = 0,
-+ .coeff2_3 = 0
-+};
-+
-+static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* Stores a 4x4 block at dst (row pitch dstStride) computed on the PICO coprocessor from nine 4-byte source rows, src - 2*srcStride .. src + 6*srcStride. */
-+ /* Disabled scalar loop; it appears to be the reference arithmetic for this routine (column taps 1, -5, 20, 20, -5, 1):
-+ const int w=4;
-+ uint8_t *cm = cropTbl + MAX_NEG_CROP;
-+ int i;
-+ for(i=0; i<w; i++)
-+ {
-+ const int srcB= src[-2*srcStride];\
-+ const int srcA= src[-1*srcStride];\
-+ const int src0= src[0 *srcStride];\
-+ const int src1= src[1 *srcStride];\
-+ const int src2= src[2 *srcStride];\
-+ const int src3= src[3 *srcStride];\
-+ const int src4= src[4 *srcStride];\
-+ const int src5= src[5 *srcStride];\
-+ const int src6= src[6 *srcStride];\
-+ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-+ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-+ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
-+ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
-+ dst++;\
-+ src++;\
-+ */
-+
-+ set_pico_config(&h264_qpel4_v_lowpass_config1); /* coprocessor setup for the first pass */
-+
-+ {
-+ int srcB= LD32(src - 2*srcStride); /* row two above src; each variable holds one 32-bit row word */
-+ int srcA= LD32(src - 1*srcStride);
-+ int src0= LD32(src + 0 *srcStride);
-+ int src1= LD32(src + 1 *srcStride);
-+ int src2= LD32(src + 2 *srcStride);
-+ int src3= LD32(src + 3 *srcStride);
-+ int src4= LD32(src + 4 *srcStride);
-+ int src5= LD32(src + 5 *srcStride);
-+ int src6= LD32(src + 6 *srcStride);
-+ /* Each output row below feeds six consecutive row words in two operations: three rows, then the next three in mirrored register order with PICO_USE_ACC (presumably accumulating onto the first). */
-+ /* First compute the leftmost three columns */
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 0 (inputs srcB..src3) */
-+ dst += dstStride;
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_MVRC_W(PICO_INPIX1, src3);
-+ PICO_MVRC_W(PICO_INPIX0, src4);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 1 (inputs srcA..src4) */
-+ dst += dstStride;
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_MVRC_W(PICO_INPIX1, src4);
-+ PICO_MVRC_W(PICO_INPIX0, src5);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 2 (inputs src0..src5) */
-+ dst += dstStride;
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src4);
-+ PICO_MVRC_W(PICO_INPIX1, src5);
-+ PICO_MVRC_W(PICO_INPIX0, src6);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 3 (inputs src1..src6); dst is left pointing at row 3 */
-+ /* Now compute the last column */
-+ /* View of one 32-bit word as four 8-bit fields; which field is which byte depends on the compiler's bit-field layout for the target. */
-+ union wordbytes {
-+ int word;
-+ struct {
-+ unsigned int t:8;
-+ unsigned int u:8;
-+ unsigned int l:8;
-+ unsigned int b:8;
-+ } bytes; } tmp1, tmp2, tmp3;
-+
-+ /* Assigning an int to an 8-bit field keeps only its low 8 bits, so one pixel of each row word is gathered here; field b is left unset in all three words. */
-+ tmp1.bytes.t = srcB;
-+ tmp1.bytes.u = src1;
-+ tmp1.bytes.l = src4;
-+
-+ tmp2.bytes.t = srcA;
-+ tmp2.bytes.u = src2;
-+ tmp2.bytes.l = src5;
-+
-+ tmp3.bytes.t = src0;
-+ tmp3.bytes.u = src3;
-+ tmp3.bytes.l = src6;
-+
-+ PICO_MVRC_W(PICO_INPIX0, tmp1.word);
-+ PICO_MVRC_W(PICO_INPIX1, tmp2.word);
-+ PICO_MVRC_W(PICO_INPIX2, tmp3.word);
-+ set_pico_config(&h264_qpel4_v_lowpass_config2); /* second setup, used only for the packed column */
-+
-+
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
-+
-+ PICO_MVCR_W(tmp1.word, PICO_OUTPIX0); /* tmp1.word is overwritten with the four column results */
-+ dst[3] = (char)(tmp1.bytes.b); /* dst points at row 3 here, so this is row 3, byte 3 */
-+ dst[3 - dstStride] = (char)(tmp1.bytes.l); /* row 2 */
-+ dst[3 - 2*dstStride] = (char)(tmp1.bytes.u); /* row 1 */
-+ dst[3 - 3*dstStride] = (char)(tmp1.bytes.t); /* row 0 */
-+
-+ }
-+ /*}
-+
-+
-+ }*/
-+}
-+
-+static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* Same coprocessor sequence as the put variant, but the 4x4 result goes to a local buffer and is then merged into dst with rnd_avg32. */
-+ /* Disabled scalar loop; it appears to be the reference arithmetic for this routine (column taps 1, -5, 20, 20, -5, 1):
-+ const int w=4;
-+ uint8_t *cm = cropTbl + MAX_NEG_CROP;
-+ int i;
-+ for(i=0; i<w; i++)
-+ {
-+ const int srcB= src[-2*srcStride];\
-+ const int srcA= src[-1*srcStride];\
-+ const int src0= src[0 *srcStride];\
-+ const int src1= src[1 *srcStride];\
-+ const int src2= src[2 *srcStride];\
-+ const int src3= src[3 *srcStride];\
-+ const int src4= src[4 *srcStride];\
-+ const int src5= src[5 *srcStride];\
-+ const int src6= src[6 *srcStride];\
-+ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-+ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-+ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
-+ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
-+ dst++;\
-+ src++;\
-+ */
-+ uint8_t tmp_block[4*4]; /* filtered 4x4 block, 4 bytes per row */
-+
-+ set_pico_config(&h264_qpel4_v_lowpass_config1); /* coprocessor setup for the first pass */
-+
-+ {
-+ int srcB= LD32(src - 2*srcStride); /* row two above src; each variable holds one 32-bit row word */
-+ int srcA= LD32(src - 1*srcStride);
-+ int src0= LD32(src + 0 *srcStride);
-+ int src1= LD32(src + 1 *srcStride);
-+ int src2= LD32(src + 2 *srcStride);
-+ int src3= LD32(src + 3 *srcStride);
-+ int src4= LD32(src + 4 *srcStride);
-+ int src5= LD32(src + 5 *srcStride);
-+ int src6= LD32(src + 6 *srcStride);
-+ /* Each output row below feeds six consecutive row words in two operations: three rows, then the next three in mirrored register order with PICO_USE_ACC (presumably accumulating onto the first). */
-+ /* First compute the leftmost three columns */
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0)); /* buffer row 0 (inputs srcB..src3) */
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_MVRC_W(PICO_INPIX1, src3);
-+ PICO_MVRC_W(PICO_INPIX0, src4);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0)); /* buffer row 1 (inputs srcA..src4) */
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_MVRC_W(PICO_INPIX1, src4);
-+ PICO_MVRC_W(PICO_INPIX0, src5);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0)); /* buffer row 2 (inputs src0..src5) */
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(0, 0, 0, 3, 6);
-+ PICO_MVRC_W(PICO_INPIX2, src4);
-+ PICO_MVRC_W(PICO_INPIX1, src5);
-+ PICO_MVRC_W(PICO_INPIX0, src6);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
-+ ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0)); /* buffer row 3 (inputs src1..src6) */
-+ /* Now compute the last column */
-+ /* View of one 32-bit word as four 8-bit fields; which field is which byte depends on the compiler's bit-field layout for the target. */
-+ union wordbytes {
-+ int word;
-+ struct {
-+ unsigned int t:8;
-+ unsigned int u:8;
-+ unsigned int l:8;
-+ unsigned int b:8;
-+ } bytes; } tmp1, tmp2, tmp3;
-+
-+ /* Assigning an int to an 8-bit field keeps only its low 8 bits, so one pixel of each row word is gathered here; field b is left unset in all three words. */
-+ tmp1.bytes.t = srcB;
-+ tmp1.bytes.u = src1;
-+ tmp1.bytes.l = src4;
-+
-+ tmp2.bytes.t = srcA;
-+ tmp2.bytes.u = src2;
-+ tmp2.bytes.l = src5;
-+
-+ tmp3.bytes.t = src0;
-+ tmp3.bytes.u = src3;
-+ tmp3.bytes.l = src6;
-+
-+ PICO_MVRC_W(PICO_INPIX0, tmp1.word);
-+ PICO_MVRC_W(PICO_INPIX1, tmp2.word);
-+ PICO_MVRC_W(PICO_INPIX2, tmp3.word);
-+ set_pico_config(&h264_qpel4_v_lowpass_config2); /* second setup, used only for the packed column */
-+
-+
-+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
-+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
-+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
-+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
-+
-+ PICO_MVCR_W(tmp1.word, PICO_OUTPIX0); /* tmp1.word is overwritten with the four column results */
-+ tmp_block[3 + 3*4] = (char)(tmp1.bytes.b); /* byte 3 of buffer row 3 */
-+ tmp_block[3 + 2*4] = (char)(tmp1.bytes.l); /* byte 3 of buffer row 2 */
-+ tmp_block[3 + 1*4] = (char)(tmp1.bytes.u); /* byte 3 of buffer row 1 */
-+ tmp_block[3] = (char)(tmp1.bytes.t); /* byte 3 of buffer row 0 */
-+
-+ /* Compute the average: srcB, srcA, src0, src1 are reused for the four existing dst rows and src2..src5 for the four buffer rows */
-+ srcB= LD32(dst);
-+ srcA= LD32(dst + dstStride);
-+ src0= LD32(dst + dstStride*2);
-+ src1= LD32(dst + dstStride*3);
-+
-+ src2= LD32(tmp_block);
-+ src3= LD32(tmp_block + 4);
-+ src4= LD32(tmp_block + 8);
-+ src5= LD32(tmp_block + 12);
-+
-+ ST32(dst, rnd_avg32(srcB, src2));
-+ ST32(dst + dstStride, rnd_avg32(srcA, src3));
-+ ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
-+ ST32(dst + 3*dstStride, rnd_avg32(src1, src5));
-+ }
-+}
-+
-+/* Coprocessor setup loaded at the start of both 4x4 hv lowpass routines:
-+ horizontal-filter input mode, packed output, 10 fractional bits.
-+ The three coefficient rows are 1x, -5x and 20x the triple (1, -5, 20).
-+ The 512 in the fourth slot of row 0 equals half of 2^10; presumably it is
-+ the rounding offset - confirm against the PICO documentation. */
-+static struct pico_config_t h264_qpel4_hv_lowpass_config = {
-+ .input_mode = PICO_HOR_FILTER_MODE,
-+ .output_mode = PICO_PACKED_MODE,
-+ .coeff_frac_bits = 10, .offset_frac_bits = 10,
-+ .coeff0_0 = 1, .coeff0_1 = -5, .coeff0_2 = 20, .coeff0_3 = 512,
-+ .coeff1_0 = -5, .coeff1_1 = 25, .coeff1_2 = -100, .coeff1_3 = 0,
-+ .coeff2_0 = 20, .coeff2_1 = -100, .coeff2_2 = 400, .coeff2_3 = 0
-+};
-+
-+static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* Two-pass 4x4 hv lowpass on the PICO coprocessor: pass 1 saves intermediate VMU outputs to tmp_block, pass 2 reloads them, keeps accumulating and stores two pixels per row per iteration. */
-+ int32_t tmp_block[48]; /* 2 iterations x 8 saves x 3 words */
-+ int32_t *tmp = tmp_block;
-+ int i;
-+
-+ set_pico_config(&h264_qpel4_hv_lowpass_config); /* pass 1 setup: horizontal-filter input mode, packed output */
-+
-+ src -= 2; /* pass 1 starts two pixels to the left of src */
-+ for ( i = 0; i < 2; i++ ){
-+ int srcB= LD32(src - 2*srcStride);
-+ int srcA= LD32(src - 1*srcStride);
-+ int src0= LD32(src + 0 *srcStride);
-+ int src1= LD32(src + 1 *srcStride);
-+ int src2= LD32(src + 2 *srcStride);
-+ int src3= LD32(src + 3 *srcStride);
-+ int src4= LD32(src + 4 *srcStride);
-+ int src5= LD32(src + 5 *srcStride);
-+ int src6= LD32(src + 6 *srcStride);
-+ /* Each of the eight groups below issues operations and then saves the three VMU output registers (3 words) at tmp. */
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(0, 0, 0, 4, 8);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_OP(0, 0, 4, 8, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_MVRC_W(PICO_INPIX1, src3);
-+ PICO_MVRC_W(PICO_INPIX0, src4);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_MVRC_W(PICO_INPIX0, src2);
-+ PICO_OP(0, 0, 4, 8, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_MVRC_W(PICO_INPIX1, src4);
-+ PICO_MVRC_W(PICO_INPIX0, src5);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(0, 0, 4, 8, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src4);
-+ PICO_MVRC_W(PICO_INPIX1, src5);
-+ PICO_MVRC_W(PICO_INPIX0, src6);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+ src += 2;
-+ }
-+
-+ src -= 1; /* NOTE(review): net position is the original src + 1 (-2, +2, +2, -1) - confirm this offset is intended */
-+ tmp -= 48; /* rewind to the start of tmp_block */
-+
-+ /* Pass 2: only PICO_CONFIG is rewritten here - vertical-filter input mode, planar output, same 10 fractional bits. */
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(10)
-+ | PICO_OFFSET_FRAC_BITS(10));
-+
-+ for ( i = 0; i < 2; i++ ){
-+ int srcB= LD32(src - 2*srcStride);
-+ int srcA= LD32(src - 1*srcStride);
-+ int src0= LD32(src + 0 *srcStride);
-+ int src1= LD32(src + 1 *srcStride);
-+ int src2= LD32(src + 2 *srcStride);
-+ int src3= LD32(src + 3 *srcStride);
-+ int src4= LD32(src + 4 *srcStride);
-+ int src5= LD32(src + 5 *srcStride);
-+ int src6= LD32(src + 6 *srcStride);
-+
-+ /* Each group reloads three saved words from tmp into the VMU output registers (tmp presumably advances, per the _INC suffix) and continues with PICO_USE_ACC. */
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_MVRC_W(PICO_INPIX1, src3);
-+ PICO_MVRC_W(PICO_INPIX0, src4);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
-+
-+ ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); /* upper 16 bits of OUTPIX0 -> two pixels of row 0 */
-+ ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); /* lower 16 bits -> two pixels of row 1 */
-+
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_MVRC_W(PICO_INPIX1, src4);
-+ PICO_MVRC_W(PICO_INPIX0, src5);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src4);
-+ PICO_MVRC_W(PICO_INPIX1, src5);
-+ PICO_MVRC_W(PICO_INPIX0, src6);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
-+
-+ ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); /* upper 16 bits -> two pixels of row 2 */
-+ ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); /* lower 16 bits -> two pixels of row 3 */
-+
-+ dst += 2; /* next pair of output columns */
-+ src += 2;
-+ }
-+}
-+
-+
-+
-+
-+static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* Same two-pass coprocessor sequence as the put variant; the only difference is that each 2-pixel result is merged with the existing dst pixels through rnd_avg32 before being stored. */
-+ int32_t tmp_block[48]; /* 2 iterations x 8 saves x 3 words */
-+ int32_t *tmp = tmp_block;
-+ int i;
-+
-+ set_pico_config(&h264_qpel4_hv_lowpass_config); /* pass 1 setup: horizontal-filter input mode, packed output */
-+
-+ src -= 2; /* pass 1 starts two pixels to the left of src */
-+ for ( i = 0; i < 2; i++ ){
-+ int srcB= LD32(src - 2*srcStride);
-+ int srcA= LD32(src - 1*srcStride);
-+ int src0= LD32(src + 0 *srcStride);
-+ int src1= LD32(src + 1 *srcStride);
-+ int src2= LD32(src + 2 *srcStride);
-+ int src3= LD32(src + 3 *srcStride);
-+ int src4= LD32(src + 4 *srcStride);
-+ int src5= LD32(src + 5 *srcStride);
-+ int src6= LD32(src + 6 *srcStride);
-+ /* Each of the eight groups below issues operations and then saves the three VMU output registers (3 words) at tmp. */
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(0, 0, 0, 4, 8);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_OP(0, 0, 4, 8, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_MVRC_W(PICO_INPIX1, src3);
-+ PICO_MVRC_W(PICO_INPIX0, src4);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_MVRC_W(PICO_INPIX0, src2);
-+ PICO_OP(0, 0, 4, 8, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_MVRC_W(PICO_INPIX1, src4);
-+ PICO_MVRC_W(PICO_INPIX0, src5);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(0, 0, 4, 8, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src4);
-+ PICO_MVRC_W(PICO_INPIX1, src5);
-+ PICO_MVRC_W(PICO_INPIX0, src6);
-+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+
-+ PICO_OP(0, 0, 1, 5, 9);
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
-+ PICO_STCM_W(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ tmp += 3;
-+ src += 2;
-+ }
-+
-+ src -= 1; /* NOTE(review): net position is the original src + 1 (-2, +2, +2, -1) - confirm this offset is intended */
-+ tmp -= 48; /* rewind to the start of tmp_block */
-+
-+ /* Pass 2: only PICO_CONFIG is rewritten here - vertical-filter input mode, planar output, same 10 fractional bits. */
-+ PICO_PUT_W(PICO_CONFIG,
-+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
-+ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
-+ | PICO_COEFF_FRAC_BITS(10)
-+ | PICO_OFFSET_FRAC_BITS(10));
-+
-+ for ( i = 0; i < 2; i++ ){
-+ int srcB= LD32(src - 2*srcStride);
-+ int srcA= LD32(src - 1*srcStride);
-+ int src0= LD32(src + 0 *srcStride);
-+ int src1= LD32(src + 1 *srcStride);
-+ int src2= LD32(src + 2 *srcStride);
-+ int src3= LD32(src + 3 *srcStride);
-+ int src4= LD32(src + 4 *srcStride);
-+ int src5= LD32(src + 5 *srcStride);
-+ int src6= LD32(src + 6 *srcStride);
-+ /* Each group reloads three saved words from tmp into the VMU output registers (tmp presumably advances, per the _INC suffix) and continues with PICO_USE_ACC. */
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX0, src3);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, srcB);
-+ PICO_MVRC_W(PICO_INPIX1, srcA);
-+ PICO_MVRC_W(PICO_INPIX2, src0);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_MVRC_W(PICO_INPIX1, src3);
-+ PICO_MVRC_W(PICO_INPIX0, src4);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, srcA);
-+ PICO_MVRC_W(PICO_INPIX1, src0);
-+ PICO_MVRC_W(PICO_INPIX2, src1);
-+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
-+
-+ ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); /* row 0: existing pixels merged with the upper 16 bits of OUTPIX0 */
-+ ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0))); /* row 1: the full word is passed; ST16 presumably keeps only the low 16 bits - TODO confirm */
-+
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_MVRC_W(PICO_INPIX1, src4);
-+ PICO_MVRC_W(PICO_INPIX0, src5);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, src0);
-+ PICO_MVRC_W(PICO_INPIX1, src1);
-+ PICO_MVRC_W(PICO_INPIX2, src2);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
-+ PICO_MVRC_W(PICO_INPIX2, src4);
-+ PICO_MVRC_W(PICO_INPIX1, src5);
-+ PICO_MVRC_W(PICO_INPIX0, src6);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
-+
-+ PICO_LDCM_W_INC(tmp,
-+ PICO_REGVECT_VMU0_OUT,
-+ PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT);
-+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
-+ PICO_MVRC_W(PICO_INPIX0, src1);
-+ PICO_MVRC_W(PICO_INPIX1, src2);
-+ PICO_MVRC_W(PICO_INPIX2, src3);
-+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
-+
-+ ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); /* row 2: existing pixels merged with the upper 16 bits */
-+ ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0))); /* row 3: full word passed, as for row 1 */
-+
-+ dst += 2; /* next pair of output columns */
-+ src += 2;
-+ }
-+}
-+
-+
-+static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 8x8 vertical lowpass (put) built from four 4x4 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 4*dstStride;
-+ uint8_t *const src_lower = src + 4*srcStride;
-+
-+ put_h264_qpel4_v_lowpass_pico(dst, src, dstStride, srcStride);
-+ put_h264_qpel4_v_lowpass_pico(dst + 4, src + 4, dstStride, srcStride);
-+ put_h264_qpel4_v_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ put_h264_qpel4_v_lowpass_pico(dst_lower + 4, src_lower + 4, dstStride, srcStride);
-+}
-+
-+static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 8x8 vertical lowpass (avg) built from four 4x4 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 4*dstStride;
-+ uint8_t *const src_lower = src + 4*srcStride;
-+
-+ avg_h264_qpel4_v_lowpass_pico(dst, src, dstStride, srcStride);
-+ avg_h264_qpel4_v_lowpass_pico(dst + 4, src + 4, dstStride, srcStride);
-+ avg_h264_qpel4_v_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ avg_h264_qpel4_v_lowpass_pico(dst_lower + 4, src_lower + 4, dstStride, srcStride);
-+}
-+
-+static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 8x8 horizontal lowpass (put) built from four 4x4 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 4*dstStride;
-+ uint8_t *const src_lower = src + 4*srcStride;
-+
-+ put_h264_qpel4_h_lowpass_pico(dst, src, dstStride, srcStride);
-+ put_h264_qpel4_h_lowpass_pico(dst + 4, src + 4, dstStride, srcStride);
-+ put_h264_qpel4_h_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ put_h264_qpel4_h_lowpass_pico(dst_lower + 4, src_lower + 4, dstStride, srcStride);
-+}
-+
-+static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 8x8 horizontal lowpass (avg) built from four 4x4 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 4*dstStride;
-+ uint8_t *const src_lower = src + 4*srcStride;
-+
-+ avg_h264_qpel4_h_lowpass_pico(dst, src, dstStride, srcStride);
-+ avg_h264_qpel4_h_lowpass_pico(dst + 4, src + 4, dstStride, srcStride);
-+ avg_h264_qpel4_h_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ avg_h264_qpel4_h_lowpass_pico(dst_lower + 4, src_lower + 4, dstStride, srcStride);
-+}
-+
-+static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 8x8 hv lowpass (put) built from four 4x4 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 4*dstStride;
-+ uint8_t *const src_lower = src + 4*srcStride;
-+
-+ put_h264_qpel4_hv_lowpass_pico(dst, src, dstStride, srcStride);
-+ put_h264_qpel4_hv_lowpass_pico(dst + 4, src + 4, dstStride, srcStride);
-+ put_h264_qpel4_hv_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ put_h264_qpel4_hv_lowpass_pico(dst_lower + 4, src_lower + 4, dstStride, srcStride);
-+}
-+
-+static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 8x8 hv lowpass (avg) built from four 4x4 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 4*dstStride;
-+ uint8_t *const src_lower = src + 4*srcStride;
-+
-+ avg_h264_qpel4_hv_lowpass_pico(dst, src, dstStride, srcStride);
-+ avg_h264_qpel4_hv_lowpass_pico(dst + 4, src + 4, dstStride, srcStride);
-+ avg_h264_qpel4_hv_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ avg_h264_qpel4_hv_lowpass_pico(dst_lower + 4, src_lower + 4, dstStride, srcStride);
-+}
-+
-+static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 16x16 vertical lowpass (put) built from four 8x8 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 8*dstStride;
-+ uint8_t *const src_lower = src + 8*srcStride;
-+
-+ put_h264_qpel8_v_lowpass_pico(dst, src, dstStride, srcStride);
-+ put_h264_qpel8_v_lowpass_pico(dst + 8, src + 8, dstStride, srcStride);
-+ put_h264_qpel8_v_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ put_h264_qpel8_v_lowpass_pico(dst_lower + 8, src_lower + 8, dstStride, srcStride);
-+}
-+
-+static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 16x16 vertical lowpass (avg) built from four 8x8 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 8*dstStride;
-+ uint8_t *const src_lower = src + 8*srcStride;
-+
-+ avg_h264_qpel8_v_lowpass_pico(dst, src, dstStride, srcStride);
-+ avg_h264_qpel8_v_lowpass_pico(dst + 8, src + 8, dstStride, srcStride);
-+ avg_h264_qpel8_v_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ avg_h264_qpel8_v_lowpass_pico(dst_lower + 8, src_lower + 8, dstStride, srcStride);
-+}
-+
-+static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 16x16 horizontal lowpass (put) built from four 8x8 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 8*dstStride;
-+ uint8_t *const src_lower = src + 8*srcStride;
-+
-+ put_h264_qpel8_h_lowpass_pico(dst, src, dstStride, srcStride);
-+ put_h264_qpel8_h_lowpass_pico(dst + 8, src + 8, dstStride, srcStride);
-+ put_h264_qpel8_h_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ put_h264_qpel8_h_lowpass_pico(dst_lower + 8, src_lower + 8, dstStride, srcStride);
-+}
-+
-+static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 16x16 horizontal lowpass (avg) built from four 8x8 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 8*dstStride;
-+ uint8_t *const src_lower = src + 8*srcStride;
-+
-+ avg_h264_qpel8_h_lowpass_pico(dst, src, dstStride, srcStride);
-+ avg_h264_qpel8_h_lowpass_pico(dst + 8, src + 8, dstStride, srcStride);
-+ avg_h264_qpel8_h_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ avg_h264_qpel8_h_lowpass_pico(dst_lower + 8, src_lower + 8, dstStride, srcStride);
-+}
-+
-+static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 16x16 hv lowpass (put) built from four 8x8 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 8*dstStride;
-+ uint8_t *const src_lower = src + 8*srcStride;
-+
-+ put_h264_qpel8_hv_lowpass_pico(dst, src, dstStride, srcStride);
-+ put_h264_qpel8_hv_lowpass_pico(dst + 8, src + 8, dstStride, srcStride);
-+ put_h264_qpel8_hv_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ put_h264_qpel8_hv_lowpass_pico(dst_lower + 8, src_lower + 8, dstStride, srcStride);
-+}
-+
-+static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
-+ /* 16x16 hv lowpass (avg) built from four 8x8 calls, in the order top-left, top-right, bottom-left, bottom-right. */
-+ uint8_t *const dst_lower = dst + 8*dstStride;
-+ uint8_t *const src_lower = src + 8*srcStride;
-+
-+ avg_h264_qpel8_hv_lowpass_pico(dst, src, dstStride, srcStride);
-+ avg_h264_qpel8_hv_lowpass_pico(dst + 8, src + 8, dstStride, srcStride);
-+ avg_h264_qpel8_hv_lowpass_pico(dst_lower, src_lower, dstStride, srcStride);
-+ avg_h264_qpel8_hv_lowpass_pico(dst_lower + 8, src_lower + 8, dstStride, srcStride);
-+}
-+
-+
-+#define H264_MC(OPNAME, SIZE) /* Generates the sixteen OPNAME##h264_qpel##SIZE##_mcXY_pico functions from the _h/_v/_hv lowpass routines; presumably X and Y are the horizontal and vertical quarter-pel offsets - TODO confirm. */ \
-+static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
-+ OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE); /* mc00: no filtering, delegates to the generic C pixels routine */\
-+}\
-+/* mc10: horizontal lowpass into a SIZE-pitch scratch block, then the _l2 helper merges it with src (presumably an average of its two inputs). */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t half[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
-+}\
-+/* mc20: horizontal lowpass written straight to dst. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
-+}\
-+/* mc30: as mc10, but merged with src+1 (one pixel to the right). */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t half[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
-+}\
-+/* mc01: copies SIZE+5 rows, starting two rows above src, into full (pitch SIZE); full_mid is the row matching src. Vertical lowpass, then merged with full_mid. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t half[SIZE*SIZE];\
-+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc02: vertical lowpass of the copied block written straight to dst. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
-+ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
-+}\
-+/* mc03: as mc01, but merged with full_mid+SIZE (one row further down). */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t half[SIZE*SIZE];\
-+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc11: merges the horizontal lowpass at src with the vertical lowpass at src. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t halfH[SIZE*SIZE];\
-+ uint8_t halfV[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
-+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc31: as mc11, but the vertical lowpass is taken one pixel to the right (copy starts at src - stride*2 + 1). */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t halfH[SIZE*SIZE];\
-+ uint8_t halfV[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
-+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc13: as mc11, but the horizontal lowpass is taken one row down (src + stride). */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t halfH[SIZE*SIZE];\
-+ uint8_t halfV[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
-+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc33: horizontal lowpass one row down merged with vertical lowpass one pixel to the right. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t halfH[SIZE*SIZE];\
-+ uint8_t halfV[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
-+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc22: hv lowpass written straight to dst. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
-+}\
-+/* mc21: merges the horizontal lowpass at src with the hv lowpass. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t halfH[SIZE*SIZE];\
-+ uint8_t halfHV[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
-+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc23: as mc21, but the horizontal lowpass is taken one row down (src + stride). */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t halfH[SIZE*SIZE];\
-+ uint8_t halfHV[SIZE*SIZE];\
-+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
-+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc12: merges the vertical lowpass at src with the hv lowpass. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t halfV[SIZE*SIZE];\
-+ uint8_t halfHV[SIZE*SIZE];\
-+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
-+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
-+}\
-+/* mc32: as mc12, but the vertical lowpass is taken one pixel to the right. */\
-+static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
-+ uint8_t full[SIZE*(SIZE+5)];\
-+ uint8_t * const full_mid= full + SIZE*2;\
-+ uint8_t halfV[SIZE*SIZE];\
-+ uint8_t halfHV[SIZE*SIZE];\
-+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
-+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
-+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
-+ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
-+}\
-+
-+H264_MC(put_, 4) /* instantiated for put_ and avg_ at block sizes 4, 8 and 16 */
-+H264_MC(put_, 8)
-+H264_MC(put_, 16)
-+H264_MC(avg_, 4)
-+H264_MC(avg_, 8)
-+H264_MC(avg_, 16)
-+
-+
-+/* dspfunc16(PFX) defines PFX_pixels16{,_h,_v,_hv}_avr32; each one calls the matching 8-pixel routine twice, at offsets 0 and 8. */
-+#define dspfunc16(PFX) \
-+ void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
-+ PFX ## _pixels8_avr32(dst, pixels, line_size, h);\
-+ PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\
-+ }\
-+ void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
-+ PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\
-+ PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\
-+ }\
-+ void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
-+ PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\
-+ PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\
-+ }\
-+ void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
-+ PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\
-+ PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\
-+ }\
-+
-+/* Instantiate the 16-pixel wrappers for the four operation prefixes. */
-+dspfunc16(put)
-+dspfunc16(put_no_rnd)
-+dspfunc16(avg)
-+dspfunc16(avg_no_rnd)
-+#undef dspfunc16
-+/* Sums the bytes of 16 rows of 16 pixels starting at pix; consecutive rows are line_size bytes apart. */
-+static int pix_sum_avr32(uint8_t * pix, int line_size)
-+{
-+ int s, i;
-+ /* s carries two packed halfword partial sums across iterations, so it must be a read-write ("+r") operand. */
-+ s = 0;
-+ for (i = 0; i < 16; i++) {
-+ int tmp1,tmp2,tmp3,tmp4,tmp5;
-+ __asm__ volatile ( "ld.w\t%0, %6[0]\n\t"
-+ "ld.w\t%1, %6[4]\n\t"
-+ "ld.w\t%2, %6[8]\n\t"
-+ "ld.w\t%3, %6[12]\n\t"
-+ "punpckub.h\t%4, %0:t\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ "punpckub.h\t%4, %0:b\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ "punpckub.h\t%4, %1:t\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ "punpckub.h\t%4, %1:b\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ "punpckub.h\t%4, %2:t\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ "punpckub.h\t%4, %2:b\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ "punpckub.h\t%4, %3:t\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ "punpckub.h\t%4, %3:b\n\t"
-+ "padd.h\t%5, %5, %4\n\t"
-+ : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"+r"(s)
-+ : "r"(pix) : "memory");
-+ pix += line_size;
-+ }
-+ __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "+r" (s) );
-+ /* The addhh.w above folds the top and bottom halfword sums into the final total. */
-+ return s;
-+}
-+
-+
-+//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
-+//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-+//#define H264_WEIGHT(W,H) \
-+//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
-+// int attribute_unused x, y; \
-+// offset <<= log2_denom; \
-+// if(log2_denom) offset += 1<<(log2_denom-1); \
-+// for(y=0; y<H; y++, block += stride){ \
-+// uint32_t tmp0, tmp1;
-+// if(W==2) { \
-+// asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \
-+// "ld.ub\t%[tmp1], %[block][1]\n" \
-+// "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \
-+// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
-+// "asr\t%[tmp0], %[log2_denom]\n" \
-+// "asr\t%[tmp1], %[log2_denom]\n" \
-+// "satu\t%[tmp0] >> 0, 8\n" \
-+// "satu\t%[tmp1] >> 0, 8\n" \
-+// "st.b\t%[block][0], %[tmp0]\n" \
-+// "st.b\t%[block][1], %[tmp1]\n" \
-+// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
-+// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
-+// } else if ( W==4 ) { \
-+// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \
-+// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \
-+// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \
-+// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \
-+// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
-+// "asr\t%[tmp0], %[log2_denom]\n" \
-+// "asr\t%[tmp1], %[log2_denom]\n" \
-+// "satu\t%[tmp0] >> 0, 8\n" \
-+// "satu\t%[tmp1] >> 0, 8\n" \
-+// "st.b\t%[block][0], %[tmp0]\n" \
-+// "st.b\t%[block][1], %[tmp1]\n" \
-+// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
-+// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
-+//
-+//
-+//
-+// if(W==4) continue; \
-+// op_scale1(4); \
-+// op_scale1(5); \
-+// op_scale1(6); \
-+// op_scale1(7); \
-+// if(W==8) continue; \
-+// op_scale1(8); \
-+// op_scale1(9); \
-+// op_scale1(10); \
-+// op_scale1(11); \
-+// op_scale1(12); \
-+// op_scale1(13); \
-+// op_scale1(14); \
-+// op_scale1(15); \
-+// } \
-+//} \
-+//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
-+// int attribute_unused x, y; \
-+// int offset = (offsets + offsetd + 1) >> 1; \
-+// offset = ((offset << 1) + 1) << log2_denom; \
-+// for(y=0; y<H; y++, dst += stride, src += stride){ \
-+// op_scale2(0); \
-+// op_scale2(1); \
-+// if(W==2) continue; \
-+// op_scale2(2); \
-+// op_scale2(3); \
-+// if(W==4) continue; \
-+// op_scale2(4); \
-+// op_scale2(5); \
-+// op_scale2(6); \
-+// op_scale2(7); \
-+// if(W==8) continue; \
-+// op_scale2(8); \
-+// op_scale2(9); \
-+// op_scale2(10); \
-+// op_scale2(11); \
-+// op_scale2(12); \
-+// op_scale2(13); \
-+// op_scale2(14); \
-+// op_scale2(15); \
-+// } \
-+//}
-+
-+
-+
-+/* Per byte: returns zero where the absolute difference between <a> and <b> is
-+ not less than <compare>, and the non-zero value compare - |a - b| elsewhere. */
-+#define PABS_DIFF_LESS_THAN( a, b, compare) \
-+ ({ uint32_t __tmp__, __tmp2__, __mask__; \
-+ asm ( \
-+ /* Check ABS( a - b ) < compare */ \
-+ "psubs.ub\t%[tmp], %[opa], %[opb]\n" \
-+ "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \
-+ "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \
-+ /* This produces 0 for all bytes where the comparison is not true */ \
-+ "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \
-+ : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \
-+ : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \
-+ __mask__; })
-+
-+/*
-+ Set all bytes containing zero in <value> to 255 and the rest to zero.
-+
-+ Add with saturation 254 to all bytes making all bytes different from
-+ zero become 255. Then add one without saturation to make all bytes
-+ originally containing zero 255 and the rest 0. */
-+#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
-+ ({ uint32_t __tmp__; \
-+ asm ( \
-+ "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \
-+ "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \
-+ : [tmp] "=&r"(__tmp__) /* early-clobber: written before all_ones is read */ \
-+ : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
-+ __tmp__; })
-+/* Emits a single packw.sh on <upper> and <lower> and yields its result (presumably both words saturated to signed halfwords and packed as upper:lower - TODO confirm against the AVR32 manual). */
-+#define PACKW_SH(upper, lower) \
-+ ({ uint32_t __tmp__; \
-+ asm ( \
-+ "packw.sh\t%[tmp], %[u], %[l]\n" \
-+ : [tmp] "=r"(__tmp__) \
-+ : [u] "r"(upper), [l] "r"(lower) ); \
-+ __tmp__; })
-+/* Packs the halfwords of <upper> and <lower> into four bytes with unsigned saturation (packsh.ub), as the macro name says; packsh.sb would clamp values above 127. */
-+#define PACKSH_UB(upper, lower) \
-+ ({ uint32_t __tmp__; \
-+ asm ( \
-+ "packsh.ub\t%[tmp], %[u], %[l]\n" \
-+ : [tmp] "=r"(__tmp__) \
-+ : [u] "r"(upper), [l] "r"(lower) ); \
-+ __tmp__; })
-+/* Packed-instruction version of the H.264 vertical luma loop filter; the scalar C code it mirrors is quoted in the comments inside the body. */
-+static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-+{
-+ int i;
-+ /* NOTE(review): not installed by dsputil_init_avr32 (the assignment there is commented out); looks unfinished - confirm before enabling. */
-+ if ( alpha == 0 )
-+ return;
-+ /* Presumably replicates alpha and beta into all four bytes of a word for the packed byte compares below - TODO confirm. */
-+ alpha = PACKW_SH(alpha, alpha);
-+ alpha = PACKSH_UB(alpha, alpha);
-+ beta = PACKW_SH(beta, beta);
-+ beta = PACKSH_UB(beta, beta);
-+
-+ for( i = 0; i < 4; i++ ) {
-+ uint32_t p0, p1, p2, q0, q1, q2;
-+ uint32_t mask, mask2;
-+ uint32_t tmp, tmp2, tmp3, tmp4;
-+
-+ if( tc0[i] < 0 ) {
-+ pix += 4;
-+ continue;
-+ }
-+
-+/* for( d = 0; d < 4; d++ ) {
-+ const int p0 = pix[-1*stride];
-+ const int p1 = pix[-2*stride];
-+ const int p2 = pix[-3*stride];
-+ const int q0 = pix[0];
-+ const int q1 = pix[1*stride];
-+ const int q2 = pix[2*stride];
-+
-+ if( ABS( p0 - q0 ) < alpha &&
-+ ABS( p1 - p0 ) < beta &&
-+ ABS( q1 - q0 ) < beta ) { */
-+
-+ p0 = LD32(pix - stride);
-+ p1 = LD32(pix - 2*stride);
-+ q0 = LD32(pix);
-+ q1 = LD32(pix + stride);
-+
-+ /* Check which of the columns should be filtered, if any. */
-+ mask = PABS_DIFF_LESS_THAN(p0, q0, alpha);
-+ mask |= PABS_DIFF_LESS_THAN(p1, p0, beta);
-+ mask |= PABS_DIFF_LESS_THAN(q1, q0, beta);
-+ /* NOTE(review): the three tests are OR-ed although the reference code above requires all of them; also this continue does not advance pix - confirm. */
-+ if ( !mask )
-+ continue;
-+
-+ mask = SET_ALL_BITS_IN_ZERO_BYTES(mask);
-+
-+
-+ int tc = PACKW_SH(tc0[i], tc0[i]);
-+ int tc0_p = tc;
-+ int tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
-+
-+ /*
-+ int i_delta;
-+ if( ABS( p2 - p0 ) < beta ) {
-+ pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
-+ tc++;
-+ }*/
-+
-+ p2 = LD32(pix - 3*stride);
-+ mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
-+
-+ if ( mask2 ){
-+ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
-+ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
-+ "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
-+ "punpckub.h\t%[tmp2], %[tmp]:t\n"
-+ "punpckub.h\t%[tmp], %[tmp]:b\n"
-+ "punpckub.h\t%[tmp3], %[p1]:t\n"
-+ "punpckub.h\t%[tmp4], %[p1]:b\n"
-+ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
-+ "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
-+ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
-+ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
-+ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
-+ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
-+ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
-+ "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
-+ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
-+ "andn\t%[tmp], %[mask2]\n"
-+ "and\t%[tmp2], %[q1], %[mask2]\n" /* NOTE(review): merges q1, but this branch rewrites the p1 row - confirm */
-+ "or\t%[tmp], %[tmp2]\n"
-+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
-+ [tmp4]"=&r"(tmp4)
-+ : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
-+ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
-+ ST32(pix - 2*stride, tmp);
-+ tc += 0x00010001;
-+ }
-+
-+
-+ q2 = LD32(pix + 2*stride);
-+
-+ /*
-+ if( ABS( q2 - q0 ) < beta ) {
-+ pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
-+ tc++;
-+ }
-+ */
-+ mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
-+
-+ if ( mask2 ){
-+ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
-+ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
-+ "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
-+ "punpckub.h\t%[tmp2], %[tmp]:t\n"
-+ "punpckub.h\t%[tmp], %[tmp]:b\n"
-+ "punpckub.h\t%[tmp3], %[q1]:t\n"
-+ "punpckub.h\t%[tmp4], %[q1]:b\n"
-+ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
-+ "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
-+ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
-+ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
-+ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
-+ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
-+ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
-+ "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
-+ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
-+ "andn\t%[tmp], %[mask2]\n"
-+ "and\t%[tmp2], %[q1], %[mask2]\n"
-+ "or\t%[tmp], %[tmp2]\n"
-+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
-+ [tmp4]"=&r"(tmp4)
-+ : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
-+ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
-+ ST32(pix + stride, tmp);
-+ tc += 0x00010001;
-+ }
-+
-+ uint32_t old_p0 = p0;
-+ uint32_t old_q0 = q0;
-+
-+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-+ pix[-stride] = clip_uint8( p0 + i_delta );
-+ pix[0] = clip_uint8( q0 - i_delta ); */
-+
-+ asm (
-+ /* Check if the two upper pixels should be filtered */
-+ "lsr\t%[tmp], %[inv_mask], 16\n"
-+ "breq\t0f\n"
-+
-+ "punpckub.h\t%[tmp], %[p1]:t\n"
-+ "punpckub.h\t%[tmp2], %[q1]:t\n"
-+
-+ /* p1 - q1 */
-+ "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
-+
-+ "punpckub.h\t%[tmp3], %[q0]:t\n"
-+ "punpckub.h\t%[tmp4], %[p0]:t\n"
-+
-+ /* q0 - p0 */
-+ "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
-+
-+ /* (q0 - p0) << 2 */
-+ "plsl.h\t%[tmp2], %[tmp2], 2\n"
-+
-+ /* ((q0 - p0) << 2) + (p1 - q1) */
-+ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
-+
-+ "mov\t%[tmp], 0x00040004\n"
-+ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
-+ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
-+
-+ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
-+ "pasr.h\t%[tmp2], %[tmp2], 3\n"
-+
-+ "mov\t%[tmp], 0\n"
-+ "psub.h\t%[tmp], %[tmp], %[tc]\n"
-+
-+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
-+ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
-+ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
-+
-+
-+ /* pix[-stride] = clip_uint8( p0 + i_delta ); */
-+ "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
-+
-+
-+ /* pix[0] = clip_uint8( q0 - i_delta ); */
-+ "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
-+
-+ /* Check if the two lower pixels should be filtered */
-+ "lsl\t%[tmp2], %[inv_mask], 16\n"
-+ "breq\t1f\n"
-+
-+ "0:\n"
-+ "punpckub.h\t%[p1], %[p1]:b\n"
-+ "punpckub.h\t%[q1], %[q1]:b\n"
-+
-+ /* p1 - q1 */
-+ "psub.h\t%[p1], %[p1], %[q1]\n"
-+
-+ "punpckub.h\t%[q0], %[q0]:b\n"
-+ "punpckub.h\t%[p0], %[p0]:b\n"
-+
-+ /* q0 - p0 */
-+ "psub.h\t%[tmp2], %[q0], %[p0]\n"
-+
-+ /* (q0 - p0) << 2 */
-+ "plsl.h\t%[tmp2], %[tmp2], 2\n"
-+
-+ /* ((q0 - p0) << 2) + (p1 - q1) */
-+ "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
-+
-+ "mov\t%[q1], 0x00040004\n"
-+ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
-+ "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
-+
-+ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
-+ "pasr.h\t%[tmp2], %[tmp2], 3\n"
-+
-+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
-+ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
-+ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
-+
-+ /* pix[-stride] = clip_uint8( p0 + i_delta ); */
-+ "padd.h\t%[p0], %[p0], %[tmp2]\n"
-+
-+ /* pix[0] = clip_uint8( q0 - i_delta ); */
-+ "psub.h\t%[q0], %[q0], %[tmp2]\n"
-+
-+ "1:\n"
-+ "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
-+ "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n" /* NOTE(review): low half packs tmp4 (the p0 result); %[q0] may have been intended - confirm */
-+ /* NOTE(review): q0/q1/p0/p1 are declared write-only ("=&r") below although the asm reads them - confirm the constraints. */
-+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
-+ [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1)
-+ : [tc]"r"(tc), [inv_mask]"r"(~mask));
-+
-+ ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
-+ ST32(pix, (mask & old_q0) | (q0 & ~mask));
-+ /* NOTE(review): pix is advanced by 1 only after the loop (below), not by 4 per filtered group - confirm. */
-+ }
-+ pix += 1;
-+}
-+
-+
-+
-+
-+#ifdef CHECK_DSP_FUNCS_AGAINST_C
-+/* Logs the first 8 bytes of each of h rows of block (rows line_size bytes apart) in decimal at AV_LOG_ERROR level. */
-+void dump_block8(uint8_t *block, int line_size, int h){
-+ int i, j;
-+
-+ for ( i = 0; i < h ; i++ ){
-+ av_log(NULL, AV_LOG_ERROR, "\t");
-+ for ( j = 0; j < 8 ; j++ ){
-+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
-+ }
-+ av_log(NULL, AV_LOG_ERROR, "\n");
-+ }
-+}
-+/* Logs the first 4 bytes of each of h rows of block (rows line_size bytes apart) in decimal at AV_LOG_ERROR level. */
-+void dump_block4(uint8_t *block, int line_size, int h){
-+ int i, j;
-+
-+ for ( i = 0; i < h ; i++ ){
-+ av_log(NULL, AV_LOG_ERROR, "\t");
-+ for ( j = 0; j < 4 ; j++ ){
-+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
-+ }
-+ av_log(NULL, AV_LOG_ERROR, "\n");
-+ }
-+}
-+/* Logs the first w bytes of each of h rows of block (rows line_size bytes apart) in decimal at AV_LOG_ERROR level. */
-+void dump_block(uint8_t *block, int line_size, int h, int w){
-+ int i, j;
-+
-+ for ( i = 0; i < h ; i++ ){
-+ av_log(NULL, AV_LOG_ERROR, "\t");
-+ for ( j = 0; j < w ; j++ ){
-+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
-+ }
-+ av_log(NULL, AV_LOG_ERROR, "\n");
-+ }
-+}
-+/* Compares an 8-column, h-row region of test against correct; on the first pixel differing by more than max_dev it logs both blocks and calls exit(1). */
-+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
-+ int h, char *name, int max_dev){
-+ int i,j;
-+ for ( i = 0; i < 8 ; i++ ){
-+ for ( j = 0; j < h ; j++ ){
-+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
-+ diff = diff < 0 ? -diff : diff;
-+ if ( diff > max_dev ){
-+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
-+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
-+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
-+ dump_block8(test, line_size_test, h);
-+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
-+ dump_block8(correct, line_size_correct, h);
-+ exit(1);
-+ }
-+ }
-+ }
-+}
-+/* Compares a 4-column, h-row region of test against correct; on the first pixel differing by more than max_dev it logs both 4-wide blocks and calls exit(1). */
-+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
-+ int h, char *name, int max_dev){
-+ int i,j;
-+ for ( i = 0; i < 4 ; i++ ){
-+ for ( j = 0; j < h ; j++ ){
-+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
-+ diff = diff < 0 ? -diff : diff;
-+ if ( diff > max_dev ){
-+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
-+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
-+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
-+ dump_block4(test, line_size_test, h);
-+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
-+ dump_block4(correct, line_size_correct, h);
-+ exit(1);
-+ }
-+ }
-+ }
-+}
-+/* Compares a width-column, h-row region of test against correct; on the first pixel differing by more than max_dev it logs both blocks and calls exit(1). */
-+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
-+ int h, int width, char *name, int max_dev){
-+ int i,j;
-+ for ( i = 0; i < width ; i++ ){
-+ for ( j = 0; j < h ; j++ ){
-+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
-+ diff = diff < 0 ? -diff : diff;
-+ if ( diff > max_dev ){
-+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
-+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
-+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
-+ dump_block(test, line_size_test, h, width);
-+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
-+ dump_block(correct, line_size_correct, h, width);
-+ exit(1);
-+ }
-+ }
-+ }
-+}
-+/* Logs the 64 entries of block as 8 rows of 8 hexadecimal values at AV_LOG_ERROR level. */
-+void dump_dct_block(DCTELEM *block){
-+ int i, j;
-+
-+ for ( i = 0; i < 8 ; i++ ){
-+ av_log(NULL, AV_LOG_ERROR, "\t");
-+ for ( j = 0; j < 8 ; j++ ){
-+ av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]);
-+ }
-+ av_log(NULL, AV_LOG_ERROR, "\n");
-+ }
-+}
-+/* Self-check: runs idct_avr32 on block and simple_idct on a transposed copy; logs both and calls exit(1) if any of the 64 values differ. */
-+void test_idct_avr32(DCTELEM *block){
-+ DCTELEM testBlock[64];
-+ int i, j;
-+
-+ /* Copy transposed block to testBlock */
-+ for ( i = 0; i < 8 ; i++ ){
-+ for ( j = 0; j < 8 ; j++ ){
-+ testBlock[i + 8*j] = block[j + i*8];
-+ }
-+ }
-+ /* Pass the array itself (it decays to DCTELEM *), not its address, so the argument has the element-pointer type. */
-+ idct_avr32(block);
-+ simple_idct(testBlock);
-+
-+ for ( i = 0; i < 64 ; i++ ){
-+ if ( block[i] != testBlock[i] ){
-+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n");
-+ dump_dct_block(block);
-+ av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n");
-+ dump_dct_block(testBlock);
-+ exit(1);
-+ }
-+ }
-+}
-+/* Self-check: idct_put_avr32 writes to dest, simple_idct_put writes a transposed copy of block to a packed 8x8 buffer; results may differ by at most 1. */
-+void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){
-+ uint8_t testBlock[64];
-+ DCTELEM blockCopy[64];
-+ int i, j;
-+
-+ /* Copy transposed block to blockCopy */
-+ for ( i = 0; i < 8 ; i++ ){
-+ for ( j = 0; j < 8 ; j++ ){
-+ blockCopy[i + 8*j] = block[j + i*8];
-+ }
-+ }
-+ /* testBlock is passed as the array (decays to uint8_t *), not as a pointer to the whole array. */
-+ idct_put_avr32(dest, line_size, block);
-+ simple_idct_put(testBlock, 8, blockCopy);
-+
-+ check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
-+}
-+
-+/* Self-check: idct_add_avr32 updates dest, simple_idct_add updates a packed copy of dest using a transposed copy of block; results may differ by at most 1. */
-+void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){
-+ uint8_t testBlock[64];
-+ DCTELEM blockCopy[64];
-+ int i, j;
-+
-+ /* Copy dest to testBlock */
-+ for ( i = 0; i < 8 ; i++ ){
-+ for ( j = 0; j < 8 ; j++ ){
-+ testBlock[i + 8*j] = dest[i + j*line_size];
-+ }
-+ }
-+
-+ /* Copy transposed block to blockCopy */
-+ for ( i = 0; i < 8 ; i++ ){
-+ for ( j = 0; j < 8 ; j++ ){
-+ blockCopy[i + 8*j] = block[j + i*8];
-+ }
-+ }
-+ /* testBlock is passed as the array (decays to uint8_t *), not as a pointer to the whole array. */
-+ idct_add_avr32(dest, line_size, block);
-+ simple_idct_add(testBlock, 8, blockCopy);
-+
-+ check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
-+}
-+/* Self-check for the 4x4 H.264 IDCT-add: ff_h264_idct_add_c updates dest, h264_idct_add_avr32 updates a packed copy, and the results must match exactly. */
-+void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
-+ uint8_t testBlock[16];
-+ DCTELEM blockCopy[16];
-+ int i, j;
-+
-+ /* Copy dest to testBlock */
-+ for ( i = 0; i < 4 ; i++ ){
-+ for ( j = 0; j < 4 ; j++ ){
-+ testBlock[i + 4*j] = dest[i + j*stride];
-+ }
-+ }
-+
-+ /* Copy source block to blockCopy (straight copy, no transpose) */
-+ for ( i = 0; i < 16 ; i++ ){
-+ blockCopy[i] = block[i];
-+ }
-+
-+ ff_h264_idct_add_c(dest, block, stride);
-+
-+ h264_idct_add_avr32(testBlock, blockCopy, 4);
-+
-+ check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0);
-+}
-+/* Self-check for the 8x8 H.264 IDCT-add: ff_h264_idct8_add_c updates dest, h264_idct8_add_avr32 updates a packed copy, and the results must match exactly. */
-+void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
-+ uint8_t testBlock[8*8];
-+ DCTELEM blockCopy[8*8];
-+ int i, j;
-+
-+ /* Copy dest to testBlock */
-+ for ( i = 0; i < 8 ; i++ ){
-+ for ( j = 0; j < 8 ; j++ ){
-+ testBlock[i + 8*j] = dest[i + j*stride];
-+ }
-+ }
-+
-+ /* Copy source block to blockCopy */
-+ for ( i = 0; i < 8*8 ; i++ ){
-+ blockCopy[i] = block[i];
-+ }
-+
-+ ff_h264_idct8_add_c(dest, block, stride);
-+ h264_idct8_add_avr32(testBlock, blockCopy, 8);
-+
-+ check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0);
-+}
-+/* Runs <test> on the real buffers and <correct> on a packed copy of the input, then compares the first 8 columns of both outputs with check_block8 (no deviation allowed). */
-+void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block,
-+ const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
-+ uint8_t *testBlock, *testBlock2;
-+ int i, j;
-+ int input_v_size = h + in_v_size;
-+ int input_h_size = 8 + in_h_size;
-+ /* NOTE(review): the scratch width is fixed at 8 (+in_h_size) although this helper is also instantiated for the 16-pixel functions - confirm. */
-+ testBlock = alloca(input_h_size*input_v_size);
-+ testBlock2 = alloca(input_h_size*input_v_size);
-+
-+ for ( i = 0; i < input_h_size ; i++ ){
-+ for ( j = 0; j < input_v_size ; j++ ){
-+ testBlock[i + input_h_size*j] = pixels[i + j*line_size];
-+ }
-+ }
-+ /* NOTE(review): testBlock2 is left uninitialised, which matters if <correct> reads its destination (the avg variants presumably do) - confirm. */
-+ test(block, pixels, line_size, h);
-+ correct(testBlock2, testBlock, input_h_size, h);
-+
-+ check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
-+
-+}
-+/* Compares a chroma MC routine <test> against <correct>: copies (w+1)x(h+1) source pixels and the w x h destination into scratch buffers, runs both, and requires identical w x h results. */
-+void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
-+ uint8_t *src, int stride, int h, int w, int x, int y, char *name){
-+ uint8_t *testBlock, *testBlock2;
-+ int i, j;
-+ int input_v_size = h + 1;
-+ int input_h_size = ((w + 1) + 3) & ~3;
-+
-+ testBlock = alloca(input_h_size*input_v_size);
-+ testBlock2 = alloca(input_h_size*input_v_size);
-+
-+ for ( i = 0; i < w + 1 ; i++ ){
-+ for ( j = 0; j < h + 1 ; j++ ){
-+ testBlock[i + input_h_size*j] = src[i + j*stride];
-+ }
-+ }
-+
-+ for ( i = 0; i < w ; i++ ){
-+ for ( j = 0; j < h ; j++ ){
-+ testBlock2[i + input_h_size*j] = dst[i + j*stride];
-+ }
-+ }
-+
-+ test(dst, src, stride, h, x, y);
-+ correct(testBlock2, testBlock, input_h_size, h, x, y);
-+
-+ check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0);
-+
-+}
-+/* Compares a qpel MC routine <test> against <correct>: <correct> writes to the real dst, <test> runs on scratch copies (source copied with a 4-pixel border on every side), and the size x size results must match exactly. */
-+void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
-+ uint8_t *src, int stride, int size, char *name){
-+ uint8_t *testBlock, *testBlock2;
-+ int i, j;
-+ int test_stride = size + 8;
-+
-+ testBlock = alloca(test_stride*(size+8)) + 4 + test_stride*4;
-+ testBlock2 = alloca(test_stride*size);
-+
-+ for ( i = -4; i < size+4 ; i++ ){
-+ for ( j = -4; j < size+4 ; j++ ){
-+ testBlock[i + test_stride*j] = src[i + j*stride];
-+ }
-+ }
-+
-+ for ( i = 0; i < size ; i++ ){
-+ for ( j = 0; j < size ; j++ ){
-+ testBlock2[i + test_stride*j] = dst[i + j*stride];
-+ }
-+ }
-+
-+ correct(dst, src, stride);
-+ test(testBlock2, testBlock, test_stride);
-+
-+ check_block(testBlock2, dst, test_stride, stride, size, size, name, 0);
-+
-+}
-+
-+
-+#define test_pixels_funcs(PFX, NUM ) \
-+void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
-+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
-+ block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
-+void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
-+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
-+ block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
-+void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
-+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
-+ block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
-+void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
-+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
-+ block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
-+
-+test_pixels_funcs(put, 8);
-+test_pixels_funcs(put_no_rnd, 8);
-+test_pixels_funcs(put, 16);
-+test_pixels_funcs(put_no_rnd, 16);
-+
-+test_pixels_funcs(avg, 8);
-+test_pixels_funcs(avg_no_rnd, 8);
-+test_pixels_funcs(avg, 16);
-+test_pixels_funcs(avg_no_rnd, 16);
-+
-+#define test_h264_chroma_mc_funcs(PFX, NUM ) \
-+void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
-+ test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
-+ dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
-+
-+test_h264_chroma_mc_funcs(put, 2);
-+test_h264_chroma_mc_funcs(put, 4);
-+test_h264_chroma_mc_funcs(put, 8);
-+test_h264_chroma_mc_funcs(avg, 2);
-+test_h264_chroma_mc_funcs(avg, 4);
-+test_h264_chroma_mc_funcs(avg, 8);
-+
-+#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \
-+void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
-+ test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
-+ dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); }
-+
-+#define test_qpel_mc_funcs(PFX, NUM) \
-+ test_qpel_mc_funcs_type(PFX, NUM, mc00);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc10);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc20);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc30);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc01);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc11);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc21);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc31);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc02);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc12);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc22);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc32);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc03);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc13);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc23);\
-+ test_qpel_mc_funcs_type(PFX, NUM, mc33)
-+
-+test_qpel_mc_funcs(put_h264_qpel, 4);
-+test_qpel_mc_funcs(put_h264_qpel, 8);
-+test_qpel_mc_funcs(put_h264_qpel, 16);
-+test_qpel_mc_funcs(avg_h264_qpel, 4);
-+test_qpel_mc_funcs(avg_h264_qpel, 8);
-+test_qpel_mc_funcs(avg_h264_qpel, 16);
-+
-+
-+#define dspfunc(PFX, IDX, NUM) \
-+ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
-+ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
-+ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
-+ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
-+ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
-+ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
-+ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
-+
-+#endif
-+/* Installs the AVR32-specific routines into the DSPContext c. */
-+void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
-+{
-+ /* The H.264 chroma MC table entries below are compiled in but disabled by the constant 0 condition. */
-+ /* H264 */
-+
-+ if ( 0 /*avr32_use_pico*/ ){
-+ c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
-+ c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
-+ c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
-+
-+ c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
-+ c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
-+ c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
-+ }
-+
-+#define dspfunc(PFX, IDX, NUM) \
-+ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
-+ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
-+ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
-+ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
-+ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
-+ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
-+ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
-+ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
-+ /* The H.264 qpel tables are only overridden when avr32_use_pico is set. */
-+ if ( avr32_use_pico ){
-+ dspfunc(put_h264_qpel, 0, 16);
-+ dspfunc(put_h264_qpel, 1, 8);
-+ dspfunc(put_h264_qpel, 2, 4);
-+ dspfunc(avg_h264_qpel, 0, 16);
-+ dspfunc(avg_h264_qpel, 1, 8);
-+ dspfunc(avg_h264_qpel, 2, 4);
-+ }
-+ /* Installed unconditionally: IDCT variants, H.264 IDCT adds, FDCT and clear_blocks. */
-+ c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
-+ c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
-+ c->idct = DSP_FUNC_NAME(idct_avr32);
-+ c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
-+ c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
-+
-+ /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/
-+
-+ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
-+
-+ c->fdct = fdct_avr32;
-+
-+ c->clear_blocks = clear_blocks_avr32;
-+ /* Redefine dspfunc to fill the four pixels_tab slots with the plain, _h, _v and _hv AVR32 routines. */
-+#undef dspfunc
-+#define dspfunc(PFX, IDX, NUM) \
-+ c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \
-+ c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \
-+ c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \
-+ c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
-+
-+ dspfunc(put, 0, 16);
-+ dspfunc(put_no_rnd, 0, 16);
-+ dspfunc(put, 1, 8);
-+ dspfunc(put_no_rnd, 1, 8);
-+
-+ dspfunc(avg, 1, 8);
-+ dspfunc(avg_no_rnd, 1, 8);
-+ dspfunc(avg, 0, 16);
-+ dspfunc(avg_no_rnd, 0, 16);
-+#undef dspfunc
-+
-+}
-+
-+
-+
-+#if 0
-+int main(int argc, char *argv[]){
-+
-+
-+}
-+#endif
-+
-diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S
-new file mode 100644
-index 0000000..be45b86
---- /dev/null
-+++ b/libavcodec/avr32/fdct.S
-@@ -0,0 +1,541 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+
-+//**********************************************************
-+//* 2-D fDCT, Based on: *
-+//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
-+//* Fast 1-D DCT Algorithms with 11 Multiplications", *
-+//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal *
-+//* Processing 1989 (ICASSP '89), pp. 988-991. *
-+//* *
-+//* Fixed point implementation optimized for the AVR-II *
-+//* instruction set. If a table is used for the            *
-+//* coefficients we can load them two at a time with a     *
-+//* single word load, reducing the number of loads.        *
-+//* *
-+//* *
-+//**********************************************************
-+
-+
-+/* This routine is a slow-but-accurate integer implementation of the
-+ * forward DCT (Discrete Cosine Transform). Taken from the IJG software
-+ *
-+ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
-+ * on each column. Direct algorithms are also available, but they are
-+ * much more complex and seem not to be any faster when reduced to code.
-+ *
-+ * This implementation is based on an algorithm described in
-+ * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
-+ * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
-+ * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
-+ * The primary algorithm described there uses 11 multiplies and 29 adds.
-+ * We use their alternate method with 12 multiplies and 32 adds.
-+ * The advantage of this method is that no data path contains more than one
-+ * multiplication; this allows a very simple and accurate implementation in
-+ * scaled fixed-point arithmetic, with a minimal number of shifts.
-+ *
-+ * The poop on this scaling stuff is as follows:
-+ *
-+ * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
-+ * larger than the true DCT outputs. The final outputs are therefore
-+ * a factor of N larger than desired; since N=8 this can be cured by
-+ * a simple right shift at the end of the algorithm. The advantage of
-+ * this arrangement is that we save two multiplications per 1-D DCT,
-+ * because the y0 and y4 outputs need not be divided by sqrt(N).
-+ * In the IJG code, this factor of 8 is removed by the quantization step
-+ * (in jcdctmgr.c); here pass 2 removes it with an extra 3-bit right shift.
-+ *
-+ * We have to do addition and subtraction of the integer inputs, which
-+ * is no problem, and multiplication by fractional constants, which is
-+ * a problem to do in integer arithmetic. We multiply all the constants
-+ * by CONST_SCALE and convert them to integer constants (thus retaining
-+ * CONST_BITS bits of precision in the constants). After doing a
-+ * multiplication we have to divide the product by CONST_SCALE, with proper
-+ * rounding, to produce the correct output. This division can be done
-+ * cheaply as a right shift of CONST_BITS bits. We postpone shifting
-+ * as long as possible so that partial sums can be added together with
-+ * full fractional precision.
-+ *
-+ * The outputs of the first pass are scaled up by PASS1_BITS bits so that
-+ * they are represented to better-than-integral precision. These outputs
-+ * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
-+ * with the recommended scaling. (For 12-bit sample data, the intermediate
-+ * array is INT32 anyway.)
-+ *
-+ * To avoid overflow of the 32-bit intermediate results in pass 2, we must
-+ * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis
-+ * shows that the values given below are the most effective.
-+ *
-+ * We can gain a little more speed, with a further compromise in accuracy,
-+ * by omitting the addition in a descaling shift. This yields an incorrectly
-+ * rounded result half the time...
-+ */
-+
-+ .global fdct_avr32
-+
-+
-+
-+#define CONST_BITS 13
-+#define PASS1_BITS 2
-+
-+#define FIX_0_298631336 2446 /* FIX(0.298631336) */
-+#define FIX_0_390180644 3196 /* FIX(0.390180644) */
-+#define FIX_0_541196100 4433 /* FIX(0.541196100) */
-+#define FIX_0_765366865 6270 /* FIX(0.765366865) */
-+#define FIX_0_899976223 7373 /* FIX(0.899976223) */
-+#define FIX_1_175875602 9633 /* FIX(1.175875602) */
-+#define FIX_1_501321110 12299 /* FIX(1.501321110) */
-+#define FIX_1_847759065 15137 /* FIX(1.847759065) */
-+#define FIX_1_961570560 16069 /* FIX(1.961570560) */
-+#define FIX_2_053119869 16819 /* FIX(2.053119869) */
-+#define FIX_2_562915447 20995 /* FIX(2.562915447) */
-+#define FIX_3_072711026 25172 /* FIX(3.072711026) */
-+
-+
-+/*
-+ * Perform an integer forward DCT on one block of samples.
-+ */
-+
-+//void
-+//fdct_int32(short *const block)
-+//{
-+// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-+// int tmp10, tmp11, tmp12, tmp13;
-+// int z1, z2, z3, z4, z5;
-+// short *blkptr;
-+// int *dataptr;
-+// int data[64];
-+// int i;
-+//
-+// /* Pass 1: process rows. */
-+// /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-+// /* furthermore, we scale the results by 2**PASS1_BITS. */
-+//
-+// dataptr = data;
-+// blkptr = block;
-+
-+ .text
-+fdct_avr32:
-+ pushm r0-r3, r4-r7, lr
-+#define loop_ctr r0
-+#define blkptr r12
-+#define x0 r1
-+#define x1 r2
-+#define x2 r3
-+#define x3 r4
-+#define x4 r5
-+#define x5 r6
-+#define x6 r7
-+#define x7 r8
-+#define tmp0 r5
-+#define tmp7 r2
-+#define tmp1 r3
-+#define tmp6 r4
-+#define tmp2 r9
-+#define tmp5 r8
-+#define tmp3 r7
-+#define tmp4 r6
-+
-+
-+ mov loop_ctr, 8
-+// for (i = 0; i < 8; i++) {
-+ROW_LOOP:
-+
-+ ldm blkptr, r1, r2, r3, r4
-+
-+// tmp2 = blkptr[2] + blkptr[5];
-+// tmp3 = blkptr[3] + blkptr[4];
-+ paddx.h r5, r3, r2
-+// tmp5 = blkptr[2] - blkptr[5];
-+// tmp4 = blkptr[3] - blkptr[4];
-+ psubx.h r6, r3, r2
-+// tmp0 = blkptr[0] + blkptr[7];
-+// tmp1 = blkptr[1] + blkptr[6];
-+ paddx.h r2, r4, r1
-+// tmp7 = blkptr[0] - blkptr[7];
-+// tmp6 = blkptr[1] - blkptr[6];
-+ psubx.h r3, r4, r1
-+
-+// /* Even part per LL&M figure 1 --- note that published figure is faulty;
-+// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
-+// */
-+
-+#define tmp10 r1
-+#define tmp13 r5
-+#define tmp11 r7
-+#define tmp12 r3
-+#define z1 r9
-+
-+// tmp10 = tmp0 + tmp3;
-+// tmp13 = tmp0 - tmp3;
-+ paddsub.h r1, r2:t, r5:b
-+// tmp11 = tmp1 + tmp2;
-+// tmp12 = tmp1 - tmp2;
-+ paddsub.h r4, r2:b, r5:t
-+
-+
-+// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
-+// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
-+ paddsub.h r7, r1:t, r4:t
-+ ld.w r10, pc[const_table - .]
-+ plsl.h r7, r7, PASS1_BITS
-+
-+// z1 = (tmp12 + tmp13) * FIX_0_541196100;
-+ addhh.w r8, r4:b, r1:b
-+ mulhh.w r8, r8:b, r10:t
-+
-+// dataptr[2] =
-+// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
-+// dataptr[6] =
-+// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
-+ mulhh.w r9, r1:b, r10:b
-+ ld.w r10, pc[const_table - . + 4]
-+ add r1, r8, r9
-+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
-+
-+ mulhh.w r9, r4:b, r10:t
-+ add r4, r8, r9
-+ satrnds r4 >> (CONST_BITS - PASS1_BITS), 31
-+
-+
-+// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-+// * cK represents cos(K*pi/16).
-+// * i0..i3 in the paper are tmp4..tmp7 here.
-+// */
-+
-+#define z2 r5
-+#define z3 r6
-+#define z4 r7
-+#define z5 r8
-+
-+// z4 = tmp5 + tmp7;
-+// z3 = tmp4 + tmp6;
-+ padd.h r2, r6, r3
-+// z2 = tmp5 + tmp6;
-+// z1 = tmp4 + tmp7;
-+ paddx.h r5, r6, r3
-+
-+ lddpc r9, pc[const_table - . + 8]
-+// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
-+ addhh.w r8, r2:t, r2:b
-+ mulhh.w r8, r8:b, r10:b
-+ lddpc r10, pc[const_table - . + 12]
-+
-+
-+// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
-+ mulhh.w r11, r6:b, r9:t
-+
-+// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
-+ mulhh.w r6, r6:t, r9:b
-+
-+// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
-+ lddpc r9, pc[const_table - . + 20]
-+ mulhh.w lr, r3:b, r10:t
-+
-+// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
-+ mulhh.w r3, r3:t, r10:b
-+
-+// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
-+ mulhh.w r10, r2:b, r9:t
-+
-+// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
-+ mulhh.w r2, r2:t, r9:b
-+ lddpc r9, pc[const_table - . + 16]
-+// z3 += z5;
-+// z4 += z5;
-+ add r10, r8
-+ add r2, r8
-+
-+// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
-+ mulhh.w r8, r5:b, r9:t
-+
-+// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
-+ mulhh.w r5, r5:t, r9:b
-+
-+// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
-+ add r11, r8
-+ add r11, r10
-+ satrnds r11 >> (CONST_BITS - PASS1_BITS), 31
-+
-+// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
-+ add r6, r5
-+
-+ sthh.w blkptr[6*2], r4:b, r11:b
-+ add r6, r2
-+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
-+
-+// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
-+ add lr, r5
-+ sthh.w blkptr[4*2], r7:b, r6:b
-+ add lr, r10
-+ satrnds lr >> (CONST_BITS - PASS1_BITS), 31
-+
-+// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
-+ add r3, r8
-+ sthh.w blkptr[2*2], r1:b, lr:b
-+ add r3, r2
-+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
-+
-+
-+
-+// dataptr += 8; /* advance pointer to next row */
-+// blkptr += 8;
-+ sthh.w blkptr[0], r7:t, r3:b
-+ sub blkptr, -16
-+ sub loop_ctr, 1
-+ brne ROW_LOOP
-+
-+// }
-+
-+ /* Pass 2: process columns.
-+ * We remove the PASS1_BITS scaling and, unlike the IJG original, also the
-+ * overall factor of 8 (every descale below shifts by an extra 3 bits).
-+ */
-+
-+// dataptr = data;
-+ sub blkptr, 128
-+
-+ mov loop_ctr, 4
-+// for (i = 0; i < 8; i++) {   (two packed columns per pass => 4 iterations)
-+COLOUMN_LOOP:
-+ ld.w r1, blkptr[0]
-+ ld.w r2, blkptr[1*8*2]
-+ ld.w r3, blkptr[2*8*2]
-+ ld.w r4, blkptr[3*8*2]
-+ ld.w r5, blkptr[4*8*2]
-+ ld.w r6, blkptr[5*8*2]
-+ ld.w r7, blkptr[6*8*2]
-+ ld.w r8, blkptr[7*8*2]
-+
-+// tmp0 = blkptr[0] + blkptr[7*8];
-+ padds.sh r9, r1, r8
-+// tmp7 = blkptr[0] - blkptr[7*8];
-+ psubs.sh r1, r1, r8
-+// tmp1 = blkptr[1*8] + blkptr[6*8];
-+ padds.sh r8, r2, r7
-+// tmp6 = blkptr[1*8] - blkptr[6*8];
-+ psubs.sh r2, r2, r7
-+// tmp2 = blkptr[2*8] + blkptr[5*8];
-+ padds.sh r7, r3, r6
-+// tmp5 = blkptr[2*8] - blkptr[5*8];
-+ psubs.sh r3, r3, r6
-+// tmp3 = blkptr[3*8] + blkptr[4*8];
-+ padds.sh r6, r4, r5
-+// tmp4 = blkptr[3*8] - blkptr[4*8];
-+ psubs.sh r4, r4, r5
-+
-+// /* even part per ll&m figure 1 --- note that published figure is faulty;
-+// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
-+// */
-+//
-+// tmp10 = tmp0 + tmp3;
-+ padds.sh r5, r9, r6
-+// tmp13 = tmp0 - tmp3;
-+ psubs.sh r9, r9, r6
-+// tmp11 = tmp1 + tmp2;
-+ padds.sh r6, r8, r7
-+// tmp12 = tmp1 - tmp2;
-+ psubs.sh r8, r8, r7
-+
-+// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
-+// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
-+//Might get an overflow here
-+ padds.sh r7, r5, r6
-+ psubs.sh r5, r5, r6
-+
-+ //Rounding
-+ mov lr, (1 << (PASS1_BITS + 2))
-+ orh lr, hi(1 << (16 + PASS1_BITS + 2))
-+ padds.sh r7, r7, lr
-+ padds.sh r5, r5, lr
-+
-+ pasr.h r7, r7, PASS1_BITS + 3
-+ pasr.h r5, r5, PASS1_BITS + 3
-+ st.w r12[0], r7
-+ st.w r12[4*8*2], r5
-+
-+ lddpc r10, const_table2
-+
-+
-+// z1 = (tmp12 + tmp13) * FIX_0_541196100;
-+ padds.sh r5, r8, r9
-+ mulhh.w r6, r5:t, r10:t
-+ mulhh.w r7, r5:b, r10:t
-+
-+// dataptr[16] =
-+// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
-+ lddpc r11, const_table2 + 4
-+ mulhh.w lr, r9:t, r10:b
-+ mulhh.w r9, r9:b, r10:b
-+ add lr, r6
-+ add r9, r7
-+ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
-+ satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ sthh.w r12[2*8*2], lr:b, r9:b
-+
-+// dataptr[48] =
-+// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
-+ mulhh.w lr, r8:t, r11:t
-+ mulhh.w r8, r8:b, r11:t
-+ add lr, r6
-+ add r8, r7
-+ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
-+ satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ sthh.w r12[6*8*2], lr:b, r8:b
-+
-+// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-+// * cK represents cos(K*pi/16).
-+// * i0..i3 in the paper are tmp4..tmp7 here.
-+// */
-+//
-+// z2 = tmp5 + tmp6;
-+// z3 = tmp4 + tmp6;
-+// z4 = tmp5 + tmp7;
-+ padds.sh r5, r3, r2
-+ padds.sh r6, r4, r2
-+ padds.sh r7, r3, r1
-+
-+// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
-+ padds.sh r8, r6, r7
-+ mulhh.w r9, r8:t, r11:b
-+ mulhh.w r8, r8:b, r11:b
-+
-+// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
-+// z3 += z5;
-+ lddpc r11, const_table2 + 8
-+ mulhh.w r10, r6:t, r11:t
-+ mulhh.w r6, r6:b, r11:t
-+ add r10, r9
-+ add r6, r8
-+
-+// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
-+// z4 += z5;
-+ mulhh.w lr, r7:t, r11:b
-+ mulhh.w r7, r7:b, r11:b
-+ lddpc r11, const_table2 + 12
-+ st.w --sp,r0
-+ add lr, r9
-+ add r7, r8
-+
-+// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
-+ mulhh.w r0, r2:t, r11:t
-+ machh.w r0, r5:t, r11:b
-+ mulhh.w r2, r2:b, r11:t
-+ machh.w r2, r5:b, r11:b
-+
-+// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
-+// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
-+ add r0, r10
-+ lddpc r11, const_table2 + 16
-+ add r2, r6
-+ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ sthh.w r12[3*8*2], r0:b, r2:b
-+// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
-+ mulhh.w r0, r3:t, r11:t
-+ machh.w r0, r5:t, r11:b
-+ mulhh.w r2, r3:b, r11:t
-+ machh.w r2, r5:b, r11:b
-+ add r0, lr
-+ lddpc r11, const_table2 + 20
-+ add r2, r7
-+
-+// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
-+ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ sthh.w r12[5*8*2], r0:b, r2:b
-+
-+
-+// z1 = tmp4 + tmp7;
-+ padds.sh r2, r4, r1
-+
-+// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
-+ mulhh.w r3, r4:t, r11:t
-+ machh.w r3, r2:t, r11:b
-+ mulhh.w r4, r4:b, r11:t
-+ machh.w r4, r2:b, r11:b
-+ add r3, r10
-+ lddpc r11, const_table2 + 24
-+ add r4, r6
-+
-+// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
-+// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
-+ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ sthh.w r12[7*8*2], r3:b, r4:b
-+
-+
-+// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
-+ mulhh.w r3, r1:t, r11:t
-+ machh.w r3, r2:t, r11:b
-+ mulhh.w r4, r1:b, r11:t
-+ machh.w r4, r2:b, r11:b
-+ add r3, lr
-+ add r4, r7
-+
-+// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
-+ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
-+ sthh.w r12[1*8*2], r3:b, r4:b
-+ ld.w r0, sp++
-+
-+// dataptr++; /* advance pointer to next column */
-+ sub blkptr, -4
-+ sub loop_ctr, 1
-+ brne COLOUMN_LOOP
-+
-+// }
-+
-+ popm r0-r3, r4-r7, pc
-+
-+// /* descale */
-+// for (i = 0; i < 64; i++)
-+// block[i] = (short int) DESCALE(data[i], 3);
-+
-+
-+//}
-+
-+
-+ .align 2
-+const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
-+ .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110
-+ .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644
-+
-+const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
-+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447
-+ .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223
-+ .short FIX_1_501321110, -FIX_0_899976223
-+
-+
-+
-+
-diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S
-new file mode 100644
-index 0000000..4b23e2d
---- /dev/null
-+++ b/libavcodec/avr32/h264idct.S
-@@ -0,0 +1,451 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+
-+ .global h264_idct_add_avr32
-+
-+ /* Macro for performing the 1-D transform on one row line.
-+
-+ The register 'w01' should contain the first two pixels,
-+ and the register 'w23' should contain the last two pixels
-+ in the line. The resulting line is placed back in w01 and w23
-+ so that { w01, w23 } = { x0, x1, x3, x2 }.
-+ 'tmp' and 'tmp2' should be scratchpad registers. */
-+ .macro transform_row w01, w23, tmp, tmp2
-+ add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */
-+ sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */
-+ bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
-+ pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
-+ paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */
-+ padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
-+ psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
-+ .endm
-+
-+ /* Macro for performing the 1-D transform on two columns.
-+
-+ The registers w0, w1, w2, w3 should each contain two
-+ packed samples from the two columns to transform.
-+ tmp and tmp2 are scratchpad registers.
-+
-+ The resulting transformed columns are placed in the
-+ same positions as the input columns.
-+ */
-+ .macro transform_2columns w0, w1, w2, w3, tmp, tmp2
-+ padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */
-+ psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */
-+ pasr.h \w2, \w1, 1 /* w2 = w1/2 */
-+ pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */
-+ psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */
-+ padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
-+ padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */
-+ psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */
-+ padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
-+ psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
-+ /* Scale down result. */
-+ pasr.h \w0, \w0, 6
-+ pasr.h \w1, \w1, 6
-+ pasr.h \w2, \w2, 6
-+ pasr.h \w3, \w3, 6
-+ .endm
-+
-+/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
-+
-+h264_idct_add_avr32:
-+
-+ stm --sp,r0-r3,r4-r7, lr
-+
-+ /* Setup rounding factor. */
-+ mov r0, (1 << 5)
-+ lsl r0, 16
-+
-+ /* Load block */
-+ ldm r11,r2-r9
-+ /* r9 = { w00, w01 },
-+ r8 = { w02, w03 },
-+ r7 = { w10, w11 },
-+ r6 = { w12, w13 },
-+ r5 = { w20, w21 },
-+ r4 = { w22, w23 },
-+ r3 = { w30, w31 },
-+ r2 = { w32, w33 } */
-+
-+
-+ /* Add the rounding factor to w00. */
-+ add r9, r0
-+
-+ /* Transform rows */
-+ transform_row r9, r8, r0, r1
-+ transform_row r7, r6, r0, r1
-+ transform_row r5, r4, r0, r1
-+ transform_row r3, r2, r0, r1
-+
-+ /* Transform columns */
-+ transform_2columns r9, r7, r5, r3, r0, r1
-+ transform_2columns r8, r6, r4, r2, r0, r1
-+
-+ /* Load predicted pixels.*/
-+ ld.w lr, r12[0]
-+ ld.w r11, r12[r10]
-+
-+ /* Unpack to halfwords. */
-+ punpckub.h r0, lr:t
-+ punpckub.h r1, lr:b
-+
-+ /* Add with transformed row. */
-+ padd.h r0, r0, r9
-+ paddx.h r1, r1, r8
-+ /* Pack and saturate back to 8-bit pixels. */
-+ packsh.ub r0, r0, r1
-+
-+ /* Unpack to halfwords. */
-+ punpckub.h lr, r11:t
-+ punpckub.h r11, r11:b
-+
-+ /* Add with transformed row. */
-+ padd.h lr, lr, r7
-+ paddx.h r11, r11, r6
-+ /* Pack and saturate back to 8-bit pixels. */
-+ packsh.ub r1, lr, r11
-+
-+ /* Store back to frame. */
-+ st.w r12[0], r0
-+ st.w r12[r10], r1
-+
-+ add r12, r12, r10 << 1
-+
-+ /* Load predicted pixels.*/
-+ ld.w lr, r12[0]
-+ ld.w r11, r12[r10]
-+
-+ /* Unpack to halfwords. */
-+ punpckub.h r0, lr:t
-+ punpckub.h r1, lr:b
-+
-+ /* Add with transformed row. */
-+ padd.h r0, r0, r5
-+ paddx.h r1, r1, r4
-+ /* Pack and saturate back to 8-bit pixels. */
-+ packsh.ub r0, r0, r1
-+
-+ /* Unpack to halfwords. */
-+ punpckub.h lr, r11:t
-+ punpckub.h r11, r11:b
-+
-+ /* Add with transformed row. */
-+ padd.h lr, lr, r3
-+ paddx.h r11, r11, r2
-+ /* Pack and saturate back to 8-bit pixels. */
-+ packsh.ub r1, lr, r11
-+
-+ /* Store back to frame. */
-+ st.w r12[0], r0
-+ st.w r12[r10], r1
-+
-+ ldm sp++,r0-r3,r4-r7, pc
-+
-+
-+ .global h264_idct8_add_avr32
-+//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
-+
-+h264_idct8_add_avr32:
-+ stm --sp,r0-r3,r4-r7, lr
-+
-+ /* Push dst and stride on stack */
-+ stm --sp,r10,r12
-+
-+// int i;
-+// DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
-+// uint8_t *cm = cropTbl + MAX_NEG_CROP;
-+
-+// block[0] += 32;
-+
-+
-+// for( i = 0; i < 8; i++ )
-+// {
-+ mov lr, 4
-+0:
-+ ld.w r7, r11[0*(8*2)]
-+ ld.w r6, r11[1*(8*2)]
-+ ld.w r5, r11[2*(8*2)]
-+ ld.w r4, r11[3*(8*2)]
-+ ld.w r3, r11[4*(8*2)]
-+ ld.w r2, r11[5*(8*2)]
-+ ld.w r1, r11[6*(8*2)]
-+ ld.w r0, r11[7*(8*2)]
-+
-+/*
-+
-+ const int a0 = src[0][i] + src[4][i];
-+ const int a2 = src[0][i] - src[4][i];
-+ const int a4 = (src[2][i]>>1) - src[6][i];
-+ const int a6 = (src[6][i]>>1) + src[2][i];
-+*/
-+ padd.h r8, r7, r3 /* r8 = a0 */
-+ psub.h r7, r7, r3 /* r7 = a2 */
-+ pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */
-+ pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */
-+ psub.h r3, r3, r1 /* r3 = a4 */
-+ padd.h r9, r9, r5 /* r9 = a6 */
-+
-+/*
-+ const int b0 = a0 + a6;
-+ const int b2 = a2 + a4;
-+ const int b4 = a2 - a4;
-+ const int b6 = a0 - a6;
-+*/
-+ padd.h r1, r8, r9 /* r1 = b0 */
-+ psub.h r8, r8, r9 /* r8 = b6 */
-+ padd.h r5, r7, r3 /* r5 = b2 */
-+ psub.h r7, r7, r3 /* r7 = b4 */
-+
-+/*
-+ const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
-+ const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
-+ const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
-+ const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
-+*/
-+ pasr.h r3, r0, 1
-+ padd.h r3, r3, r0
-+ psub.h r3, r2, r3
-+ psub.h r3, r3, r4 /* r3 = a1 */
-+
-+ pasr.h r9, r4, 1
-+ padd.h r9, r9, r4
-+ psub.h r9, r0, r9
-+ padd.h r9, r6, r9 /* r9 = a3 */
-+
-+ pasr.h r10, r2, 1
-+ padd.h r10, r10, r2
-+ padd.h r10, r10, r0
-+ psub.h r10, r10, r6 /* r10 = a5 */
-+
-+ pasr.h r0, r6, 1
-+ padd.h r0, r0, r6
-+ padd.h r0, r0, r2
-+ padd.h r0, r0, r4 /* r0 = a7 */
-+/*
-+ const int b1 = (a7>>2) + a1;
-+ const int b3 = a3 + (a5>>2);
-+ const int b5 = (a3>>2) - a5;
-+ const int b7 = a7 - (a1>>2);
-+*/
-+ pasr.h r2, r0, 2
-+ padd.h r2, r2, r3 /* r2 = b1 */
-+ pasr.h r3, r3, 2
-+ psub.h r3, r0, r3 /* r3 = b7 */
-+
-+ pasr.h r0, r10, 2
-+ padd.h r0, r0, r9 /* r0 = b3 */
-+ pasr.h r9, r9, 2
-+ psub.h r9, r9, r10 /* r9 = b5 */
-+
-+
-+/*
-+ src[0][i] = b0 + b7;
-+ src[7][i] = b0 - b7;
-+ src[1][i] = b2 + b5;
-+ src[6][i] = b2 - b5;
-+ src[2][i] = b4 + b3;
-+ src[5][i] = b4 - b3;
-+ src[3][i] = b6 + b1;
-+ src[4][i] = b6 - b1; */
-+
-+ padd.h r4, r1, r3
-+ psub.h r1, r1, r3
-+ st.w r11[0*(8*2)], r4
-+ st.w r11[7*(8*2)], r1
-+
-+ padd.h r3, r5, r9
-+ psub.h r5, r5, r9
-+ st.w r11[1*(8*2)], r3
-+ st.w r11[6*(8*2)], r5
-+
-+ padd.h r9, r7, r0
-+ psub.h r7, r7, r0
-+ st.w r11[2*(8*2)], r9
-+ st.w r11[5*(8*2)], r7
-+
-+ padd.h r0, r8, r2
-+ psub.h r8, r8, r2
-+ st.w r11[3*(8*2)], r0
-+ st.w r11[4*(8*2)], r8
-+
-+ sub r11, -4
-+ sub lr, 1
-+ brne 0b
-+
-+// }
-+
-+ lddsp r12, sp[0] /* r12 = dst */
-+ sub r11, 4*4
-+ ldm r11++, r4-r7
-+ mov lr, 8
-+ /* dst and stride were pushed on the stack at function entry */
-+
-+1:
-+// for( i = 0; i < 8; i++ )
-+// {
-+
-+ /* r7 = {src[i][0], src[i][1]}
-+ r6 = {src[i][2], src[i][3]}
-+ r5 = {src[i][4], src[i][5]}
-+ r4 = {src[i][6], src[i][7]} */
-+
-+/*
-+ const int a0 = src[i][0] + src[i][4];
-+ const int a2 = src[i][0] - src[i][4];
-+ const int a4 = (src[i][2]>>1) - src[i][6];
-+ const int a6 = (src[i][6]>>1) + src[i][2];
-+*/
-+ pasr.h r8, r6, 1
-+ pasr.h r9, r4, 1
-+ addhh.w r0, r7:t, r5:t /* r0 = a0 */
-+ subhh.w r1, r7:t, r5:t /* r1 = a2 */
-+ subhh.w r2, r8:t, r4:t /* r2 = a4 */
-+ addhh.w r3, r9:t, r6:t /* r3 = a6 */
-+
-+/*
-+ const int b0 = a0 + a6;
-+ const int b2 = a2 + a4;
-+ const int b4 = a2 - a4;
-+ const int b6 = a0 - a6;
-+*/
-+ add r10, r0, r3 /* r10 = b0 */
-+ sub r0, r3 /* r0 = b6 */
-+ add r3, r1, r2 /* r3 = b2 */
-+ sub r1, r2 /* r1 = b4 */
-+/*
-+
-+
-+ const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
-+ const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
-+ const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
-+ const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */
-+ addhh.w r8, r8:b, r6:b
-+ addhh.w r2, r4:b, r7:b
-+ sub r2, r8 /* r2 = a3 */
-+
-+ addhh.w r9, r9:b, r4:b
-+ subhh.w r8, r5:b, r6:b
-+ sub r8, r9 /* r8 = a1 */
-+
-+ pasr.h r9, r7, 1
-+ addhh.w r9, r9:b, r7:b
-+ addhh.w r6, r5:b, r6:b
-+ add r6, r9 /* r6 = a7 */
-+
-+ pasr.h r9, r5, 1
-+ addhh.w r9, r9:b, r5:b
-+ subhh.w r5, r4:b, r7:b
-+ add r5, r9 /* r5 = a5 */
-+
-+/* const int b1 = (a7>>2) + a1;
-+ const int b3 = (a5>>2) + a3;
-+ const int b5 = (a3>>2) - a5;
-+ const int b7 = -(a1>>2) + a7 ; */
-+ asr r4, r6, 2
-+ add r4, r8 /* r4 = b1 */
-+ asr r8, 2
-+ rsub r8, r6 /* r8 = b7 */
-+
-+ asr r6, r5, 2
-+ add r6, r2 /* r6 = b3 */
-+ asr r2, 2
-+ sub r2, r5 /* r2 = b5 */
-+
-+/*
-+ dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
-+ dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
-+ dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
-+ dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
-+ dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
-+ dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
-+ dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
-+ dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
-+*/
-+ add r5, r10, r8
-+ satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */
-+ sub r10, r8
-+ satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */
-+ add r8, r3, r2
-+ satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */
-+ sub r3, r2
-+ satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */
-+
-+ add r2, r1, r6
-+ satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */
-+ sub r1, r6
-+ satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */
-+
-+ add r6, r0, r4
-+ satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */
-+ sub r0, r4
-+ satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */
-+
-+ ld.w r4, r12[0]
-+
-+ packw.sh r8, r5, r8
-+ packw.sh r7, r2, r6
-+ ld.w r9, r12[4]
-+ packw.sh r6, r0, r1
-+ packw.sh r5, r3, r10
-+
-+ punpckub.h r10, r4:t
-+ punpckub.h r4, r4:b
-+ punpckub.h r3, r9:t
-+ punpckub.h r9, r9:b
-+
-+ padd.h r8, r8, r10
-+ padd.h r7, r7, r4
-+ padd.h r6, r6, r3
-+ padd.h r5, r5, r9
-+
-+ lddsp r10, sp[4] /* r10 = stride */
-+ packsh.ub r0, r8, r7
-+ packsh.ub r1, r6, r5
-+
-+ st.w r12[0], r0
-+ st.w r12[4], r1
-+
-+ ldm r11++, r4-r7
-+ add r12, r10 /* dst += stride */
-+
-+ sub lr, 1
-+ brne 1b
-+
-+ sub sp, -8
-+ ldm sp++,r0-r3,r4-r7, pc
-+
-+
-+
-+// }
-+//}
-diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S
-new file mode 100644
-index 0000000..e7551ec
---- /dev/null
-+++ b/libavcodec/avr32/idct.S
-@@ -0,0 +1,829 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+
-+ .global idct_add_avr32
-+ .global idct_put_avr32
-+ .global idct_avr32
-+
-+
-+#define CONST_BITS 13
-+#define PASS1_BITS 2
-+
-+#define ONE ((INT32) 1)
-+
-+#define CONST_SCALE (ONE << CONST_BITS)
-+
-+#define LINE_SIZE 32
-+
-+#define FIX_0_298631336 (2446) /* FIX(0.298631336) */
-+#define FIX_0_390180644 (3196) /* FIX(0.390180644) */
-+#define FIX_0_541196100 (4433) /* FIX(0.541196100) */
-+#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
-+#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
-+#define FIX_1_175875602 (9633) /* FIX(1.175875602) */
-+#define FIX_1_501321110 (12299)/* FIX(1.501321110) */
-+#define FIX_1_847759065 (15137)/* FIX(1.847759065) */
-+#define FIX_1_961570560 (16069)/* FIX(1.961570560) */
-+#define FIX_2_053119869 (16819)/* FIX(2.053119869) */
-+#define FIX_2_562915447 (20995)/* FIX(2.562915447) */
-+#define FIX_3_072711026 (25172)/* FIX(3.072711026) */
-+
-+
-+#define loop_cnt r11
-+
-+ .text
-+
-+idct_add_avr32:
-+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
-+
-+ // Give room for some variables on the stack
-+ sub sp, 8
-+ stdsp SP[0], r12 // rfp
-+ stdsp SP[4], r11 // iinc
-+
-+ mov loop_cnt, 8 //Initialize loop counter
-+
-+FOR_ROW:
-+
-+ ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block
-+ mov r6, 0
-+#ifdef USE_PREFETCH
-+ pref r10[LINE_SIZE] //Prefetch next line
-+#endif
-+ or r4, r2, r3 << 16
-+ or r4, r1 //Check if all DCT-coeffisients except the DC is zero
-+ or r4, r0
-+ brne AC_ROW //If there are non-zero AC coeffisients perform row-transform
-+
-+ paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5
-+ plsl.h r5, r5, PASS1_BITS
-+ mov r4, r5
-+ st.d r10++, r4
-+ st.d r10++, r4
-+
-+ sub loop_cnt, 1 //Decrement loop counter
-+ brne FOR_ROW //Perform loop one more time if loop_cnt is not zero
-+
-+ bral COLOUMN_TRANSFORM //Perform coloumn transform after row transform is computed
-+
-+
-+AC_ROW:
-+
-+
-+ ld.w r12, pc[coef_table - .]
-+ ld.w r9, pc[coef_table - . + 4]
-+
-+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
-+ mulhh.w r5, r4:t, r12:t
-+ mulhh.w r6, r0:t, r12:b
-+ ld.w r12, pc[coef_table - . + 8]
-+ mulhh.w r7, r2:t, r9:t
-+ add r6, r5 // tmp2
-+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
-+ add r7, r5 // tmp3
-+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
-+
-+ paddsub.h r5, r3:t, r1:t
-+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
-+
-+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
-+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
-+
-+
-+ addhh.w lr, r3:b, r1:b // lr = z4
-+ addhh.w r5, r4:b, lr:b
-+ mulhh.w r5, r5:b, r9:b // r5 = z5
-+
-+ ld.w r9, pc[coef_table - . + 12]
-+ mulhh.w r4, r4:b, r12:t // r4 = z3
-+ mulhh.w lr, lr:b, r12:b // lr = z4
-+
-+ add r4, r5
-+ add lr, r5
-+
-+ addhh.w r5, r2:b, r1:b // r5 = z2
-+ addhh.w r8, r3:b, r0:b // r8 = z1
-+
-+
-+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
-+ ld.w r12, pc[coef_table - . + 16]
-+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
-+ ld.w r9, pc[coef_table - . + 20]
-+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
-+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
-+ mulhh.w r8, r8:b, r9:t // r8 = z1
-+ mulhh.w r5, r5:b, r9:b // r5 = z2
-+
-+
-+ add r0, r8
-+ add r0, r4
-+ add r1, r5
-+ add r1, lr
-+ add r2, r5
-+ add r2, r4
-+ add r3, r8
-+ add r3, lr
-+
-+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
-+
-+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
-+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
-+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
-+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
-+
-+ sthh.w r10[0], r4:t, r5:t
-+ sthh.w r10[4], r3:t, r2:t
-+ sthh.w r10[8], r2:b, r3:b
-+ sthh.w r10[12], r5:b, r4:b
-+
-+
-+
-+ sub r10, -16
-+ sub loop_cnt, 1
-+ brne FOR_ROW, e
-+
-+COLOUMN_TRANSFORM:
-+
-+ sub r10, 128 //Rewind r10 by the 8*16 bytes the row pass advanced it (start of DCT block)
-+
-+
-+ mov loop_cnt, 8
-+FOR_COLOUMN:
-+ ldins.h r3:t,r10[0] // r3:t = dataptr[0]
-+ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
-+ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
-+ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
-+ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
-+ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
-+ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
-+ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
-+
-+ or r4, r1, r3 << 16 // The shift drops r3:t (dataptr[0]) from the test
-+ or r4, r2
-+ or r4, r0
-+ brne AC_COLOUMN //If there are non-zero AC coefficients perform the full column transform
-+
-+ lddsp r12, SP[0] // rfp
-+ lddsp r9, SP[4] // iinc
-+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9 // r3 = dataptr[0] descaled by PASS1_BITS + 3 with rounding (r3:b is zero here)
-+ ld.d r0, r12[0] // r1:r0 = the 8 bytes currently stored at rfp
-+ sub r10, -2 // Increment the dataptr
-+ bfins r3, r3, 16, 16 // Copy the low halfword into r3:t so both halves hold the value
-+ punpckub.h r2, r1:t // For each pair of bytes: widen to halfwords,
-+ padd.h r2, r2, r3 // add the DC value,
-+ punpckub.h r1, r1:b
-+ padd.h r1, r1, r3
-+ packsh.ub r1, r2, r1 // and pack back to unsigned bytes
-+ punpckub.h r2, r0:t
-+ padd.h r2, r2, r3
-+ punpckub.h r0, r0:b
-+ padd.h r0, r0, r3
-+ packsh.ub r0, r2, r0
-+ st.d r12[0], r0 // Write the 8 updated bytes back to rfp
-+ add r12, r9 // increment rfp
-+ stdsp SP[0], r12
-+
-+ sub loop_cnt, 1//Decrement loop counter
-+ brne FOR_COLOUMN//Perform loop one more time if loop_cnt is not zero
-+
-+ sub sp, -8 //Release the two stack slots (rfp, iinc)
-+ popm r0-r3, r4-r7, pc//Pop back registers and PC
-+
-+AC_COLOUMN:
-+
-+ ld.w r12, pc[coef_table - .]
-+ ld.w r9, pc[coef_table - . + 4]
-+
-+ addhh.w r4, r2:t, r2:b
-+ mulhh.w r4, r4:b, r12:t // r4 = z1
-+ mulhh.w r5, r2:b, r12:b
-+ ld.w r12, pc[coef_table - . + 8]
-+ mulhh.w r6, r2:t, r9:t
-+ add r5, r4 // r5 = tmp2
-+ add r6, r4 // r6 = tmp3
-+
-+ addhh.w r7, r3:t, r3:b
-+ subhh.w r8, r3:t, r3:b
-+
-+ lsl r7, CONST_BITS
-+ lsl r8, CONST_BITS
-+
-+ add r2, r7, r6 // r2 = tmp10
-+ sub r3, r7, r6 // r3 = tmp13
-+ add r4, r8, r5 // r4 = tmp11
-+ sub r5, r8, r5 // r5 = tmp12
-+
-+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
-+ addhh.w r7, r6:t, r6:b
-+ mulhh.w r7, r7:b, r9:b // r7 = z5
-+
-+ ld.w r9, pc[coef_table - . + 12]
-+ mulhh.w r8, r6:b, r12:t // r8 = z3
-+ mulhh.w r6, r6:t, r12:b // r6 = z4
-+
-+ add r8, r7
-+ add r6, r7
-+
-+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
-+
-+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
-+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
-+ ld.w r9, pc[coef_table - . + 16]
-+ add r12, r8
-+ add r0, r6
-+
-+ ld.w lr, pc[coef_table - . + 20]
-+ machh.w r8, r1:b, r9:t // r8 = tmp2
-+ machh.w r6, r1:t, r9:b // r6 = tmp3
-+ mulhh.w r9, r7:b, lr:t // r9 = z1
-+ mulhh.w r7, r7:t, lr:b // r7 = z2
-+
-+
-+ add r12, r9
-+ add r0, r7
-+ add r8, r7
-+ add r6, r9
-+
-+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
-+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
-+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
-+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
-+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
-+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
-+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
-+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
-+
-+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
-+
-+ packw.sh r1, r1, r6
-+ packw.sh r8, r8, r0
-+ packw.sh r3, r3, r5
-+ packw.sh r4, r4, r2
-+
-+ lddsp r12, SP[0] // rfp
-+ lddsp r9, SP[4] // iinc
-+ ld.d r6, r12[0]
-+ sub r10, -2 // Increment the dataptr
-+ punpckub.h r0, r7:t
-+ padd.h r1, r1, r0
-+ punpckub.h r0, r7:b
-+ padd.h r8, r8, r0
-+ packsh.ub r7, r1, r8
-+ punpckub.h r0, r6:t
-+ padd.h r3, r3, r0
-+ punpckub.h r0, r6:b
-+ padd.h r4, r4, r0
-+ packsh.ub r6, r3, r4
-+ st.d r12[0], r6
-+ add r12, r9 // increment rfp
-+ stdsp SP[0], r12
-+
-+ sub loop_cnt, 1 //Decrement loop counter
-+ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
-+
-+ sub sp, -8
-+ popm r0-r3, r4-r7, pc //Pop back registers and PC
-+
-+
-+
-+//Coeffisient Table:
-+ .align 2
-+coef_table:
-+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
-+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
-+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
-+
-+
-+idct_put_avr32:
-+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
-+
-+ //; Give room for some variables on the stack
-+ sub sp, 8
-+ stdsp SP[0], r12 // rfp
-+ stdsp SP[4], r11 // iinc
-+
-+ mov loop_cnt, 8 //Initialize loop counter
-+
-+0:
-+ // Row pass: each iteration handles one row of 8 coefficients (16 bytes at r10)
-+ ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block
-+ mov r6, 0 //Zero operand for the paddsub.h below
-+#ifdef USE_PREFETCH
-+ pref r10[LINE_SIZE] //Prefetch next line
-+#endif
-+ or r4, r2, r3 << 16 //The shift drops r3:t (the DC term) from the test
-+ or r4, r1 //Check if all DCT coefficients except the DC are zero
-+ or r4, r0
-+ brne 1f //If there are non-zero AC coefficients perform the full row transform
-+
-+ paddsub.h r5, r3:t, r6:b //Copy the DC coefficient (r3:t) into both halves of r5 (r6 is zero)
-+ plsl.h r5, r5, PASS1_BITS //Shift both halfwords left by PASS1_BITS
-+ mov r4, r5
-+ st.d r10++, r4 //Fill the whole row with the scaled DC value:
-+ st.d r10++, r4 //two 8-byte stores, leaving r10 at the next row
-+
-+ sub loop_cnt, 1 //Decrement loop counter
-+ brne 0b //Perform loop one more time if loop_cnt is not zero
-+
-+ bral 2f //Perform column transform after row transform is computed
-+
-+1:
-+
-+ ld.w r12, pc[coef_table_copy - .]
-+ ld.w r9, pc[coef_table_copy - . + 4]
-+
-+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
-+ mulhh.w r5, r4:t, r12:t
-+ mulhh.w r6, r0:t, r12:b
-+ ld.w r12, pc[coef_table_copy - . + 8]
-+ mulhh.w r7, r2:t, r9:t
-+ add r6, r5 // tmp2
-+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
-+ add r7, r5 // tmp3
-+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
-+
-+ paddsub.h r5, r3:t, r1:t
-+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
-+
-+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
-+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
-+
-+
-+
-+ addhh.w lr, r3:b, r1:b // lr = z4
-+ addhh.w r5, r4:b, lr:b
-+ mulhh.w r5, r5:b, r9:b // r5 = z5
-+
-+ ld.w r9, pc[coef_table_copy - . + 12]
-+ mulhh.w r4, r4:b, r12:t // r4 = z3
-+ mulhh.w lr, lr:b, r12:b // lr = z4
-+
-+ add r4, r5
-+ add lr, r5
-+
-+ addhh.w r5, r2:b, r1:b // r5 = z2
-+ addhh.w r8, r3:b, r0:b // r8 = z1
-+
-+
-+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
-+ ld.w r12, pc[coef_table_copy - . + 16]
-+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
-+ ld.w r9, pc[coef_table_copy - . + 20]
-+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
-+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
-+ mulhh.w r8, r8:b, r9:t // r8 = z1
-+ mulhh.w r5, r5:b, r9:b // r5 = z2
-+
-+
-+ add r0, r8
-+ add r0, r4
-+ add r1, r5
-+ add r1, lr
-+ add r2, r5
-+ add r2, r4
-+ add r3, r8
-+ add r3, lr
-+
-+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
-+
-+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
-+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
-+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
-+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
-+
-+ sthh.w r10[0], r4:t, r5:t
-+ sthh.w r10[4], r3:t, r2:t
-+ sthh.w r10[8], r2:b, r3:b
-+ sthh.w r10[12], r5:b, r4:b
-+
-+
-+
-+ sub r10, -16
-+ sub loop_cnt, 1
-+ brne 0b
-+
-+2:
-+
-+ sub r10, 128 //Rewind r10 by the 8*16 bytes the row pass advanced it (start of DCT block)
-+
-+ mov loop_cnt, 8
-+
-+0:
-+ ldins.h r3:t,r10[0] // r3:t = dataptr[0]
-+ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
-+ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
-+ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
-+ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
-+ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
-+ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
-+ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
-+
-+ or r4, r1, r3 << 16 // The shift drops r3:t (dataptr[0]) from the test
-+ or r4, r2
-+ or r4, r0
-+ brne 1f //If there are non-zero AC coefficients perform the full column transform
-+
-+ lddsp r12, SP[0] // rfp
-+ lddsp r9, SP[4] // iinc
-+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 // r3 = dataptr[0] descaled by PASS1_BITS + 3 with rounding (r3:b is zero here)
-+ packw.sh r3, r3, r3 // Pack the word into both halfwords of r3
-+ packsh.ub r3, r3, r3 // Pack the halfwords into four unsigned bytes
-+ mov r2, r3
-+ st.d r12[0], r2 // Store the same byte value to all 8 bytes at rfp
-+ add r12, r9 // increment rfp
-+ sub r10, -2 // Increment the dataptr
-+ stdsp SP[0], r12
-+
-+ sub loop_cnt, 1//Decrement loop counter
-+ brne 0b //Perform loop one more time if loop_cnt is not zero
-+
-+ sub sp, -8 //Release the two stack slots (rfp, iinc)
-+ popm r0-r3, r4-r7, pc//Pop back registers and PC
-+
-+1:
-+
-+ ld.w r12, pc[coef_table_copy - .]
-+ ld.w r9, pc[coef_table_copy - . + 4]
-+
-+ addhh.w r4, r2:t, r2:b
-+ mulhh.w r4, r4:b, r12:t // r4 = z1
-+ mulhh.w r5, r2:b, r12:b
-+ ld.w r12, pc[coef_table_copy - . + 8]
-+ mulhh.w r6, r2:t, r9:t
-+ add r5, r4 // r5 = tmp2
-+ add r6, r4 // r6 = tmp3
-+
-+ addhh.w r7, r3:t, r3:b
-+ subhh.w r8, r3:t, r3:b
-+
-+ lsl r7, CONST_BITS
-+ lsl r8, CONST_BITS
-+
-+ add r2, r7, r6 // r2 = tmp10
-+ sub r3, r7, r6 // r3 = tmp13
-+ add r4, r8, r5 // r4 = tmp11
-+ sub r5, r8, r5 // r5 = tmp12
-+
-+
-+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
-+ addhh.w r7, r6:t, r6:b
-+ mulhh.w r7, r7:b, r9:b // r7 = z5
-+
-+ ld.w r9, pc[coef_table_copy - . + 12]
-+ mulhh.w r8, r6:b, r12:t // r8 = z3
-+ mulhh.w r6, r6:t, r12:b // r6 = z4
-+
-+ add r8, r7
-+ add r6, r7
-+
-+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
-+
-+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
-+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
-+ ld.w r9, pc[coef_table_copy - . + 16]
-+ add r12, r8
-+ add r0, r6
-+
-+ ld.w lr, pc[coef_table_copy - . + 20]
-+ machh.w r8, r1:b, r9:t // r8 = tmp2
-+ machh.w r6, r1:t, r9:b // r6 = tmp3
-+ mulhh.w r9, r7:b, lr:t // r9 = z1
-+ mulhh.w r7, r7:t, lr:b // r7 = z2
-+
-+
-+ add r12, r9
-+ add r0, r7
-+ add r8, r7
-+ add r6, r9
-+
-+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
-+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
-+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
-+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
-+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
-+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
-+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
-+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
-+
-+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
-+
-+ packw.sh r1, r1, r6
-+ packw.sh r8, r8, r0
-+ packw.sh r3, r3, r5
-+ packw.sh r4, r4, r2
-+
-+ packsh.ub r1, r1, r8
-+ packsh.ub r0, r3, r4
-+ lddsp r12, SP[0] // rfp
-+ lddsp r9, SP[4] // iinc
-+ st.d r12[0], r0
-+ sub r10, -2 // Increment the dataptr
-+ add r12, r9 // increment rfp
-+ stdsp SP[0], r12
-+
-+ sub loop_cnt, 1 //Decrement loop counter
-+ brne 0b //Perform loop one more time if loop_cnt is not zero
-+
-+ sub sp, -8
-+ popm r0-r3, r4-r7, pc //Pop back registers and PC
-+
-+
-+
-+ .align 2
-+coef_table_copy:
-+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
-+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
-+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
-+
-+
-+idct_avr32:
-+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
-+
-+ // Give room for a temporary 8x8 block of halfwords on the stack
-+ sub sp, 8*8*2
-+ // NOTE(review): sp doubles as the write cursor below, so rows already written sit below sp - confirm nothing can use the stack asynchronously
-+ mov loop_cnt, 8 //Initialize loop counter
-+
-+0:
-+ // Row pass: read one row of 8 coefficients through r12, write the result at sp
-+ ldm r12++, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block, r12 += 16
-+ mov r6, 0 //Zero operand for the paddsub.h below
-+#ifdef USE_PREFETCH
-+ pref r12[LINE_SIZE] //Prefetch next line
-+#endif
-+ or r4, r2, r3 << 16 //The shift drops r3:t (the DC term) from the test
-+ or r4, r1 //Check if all DCT coefficients except the DC are zero
-+ or r4, r0
-+ brne 1f //If there are non-zero AC coefficients perform the full row transform
-+
-+ paddsub.h r5, r3:t, r6:b //Copy the DC coefficient (r3:t) into both halves of r5 (r6 is zero)
-+ plsl.h r5, r5, PASS1_BITS //Shift both halfwords left by PASS1_BITS
-+ mov r4, r5
-+ st.d sp++, r4 //Fill the whole temporary row with the scaled DC value:
-+ st.d sp++, r4 //two 8-byte stores, leaving sp at the next temporary row
-+
-+ sub loop_cnt, 1 //Decrement loop counter
-+ brne 0b //Perform loop one more time if loop_cnt is not zero
-+
-+ bral 2f //Perform column transform after row transform is computed
-+
-+1:
-+
-+ ld.w r10, pc[coef_table_idct - .]
-+ ld.w r9, pc[coef_table_idct - . + 4]
-+
-+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
-+ mulhh.w r5, r4:t, r10:t
-+ mulhh.w r6, r0:t, r10:b
-+ ld.w r10, pc[coef_table_idct - . + 8]
-+ mulhh.w r7, r2:t, r9:t
-+ add r6, r5 // tmp2
-+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
-+ add r7, r5 // tmp3
-+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
-+
-+ paddsub.h r5, r3:t, r1:t
-+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
-+
-+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
-+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
-+
-+
-+
-+ addhh.w lr, r3:b, r1:b // lr = z4
-+ addhh.w r5, r4:b, lr:b
-+ mulhh.w r5, r5:b, r9:b // r5 = z5
-+
-+ ld.w r9, pc[coef_table_idct - . + 12]
-+ mulhh.w r4, r4:b, r10:t // r4 = z3
-+ mulhh.w lr, lr:b, r10:b // lr = z4
-+
-+ add r4, r5
-+ add lr, r5
-+
-+ addhh.w r5, r2:b, r1:b // r5 = z2
-+ addhh.w r8, r3:b, r0:b // r8 = z1
-+
-+
-+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
-+ ld.w r10, pc[coef_table_idct - . + 16]
-+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
-+ ld.w r9, pc[coef_table_idct - . + 20]
-+ mulhh.w r2, r2:b, r10:t // r2 = tmp2
-+ mulhh.w r3, r3:b, r10:b // r3 = tmp3
-+ mulhh.w r8, r8:b, r9:t // r8 = z1
-+ mulhh.w r5, r5:b, r9:b // r5 = z2
-+
-+
-+ add r0, r8
-+ add r0, r4
-+ add r1, r5
-+ add r1, lr
-+ add r2, r5
-+ add r2, r4
-+ add r3, r8
-+ add r3, lr
-+
-+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
-+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
-+
-+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
-+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
-+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
-+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
-+
-+ sthh.w sp[0], r4:t, r5:t
-+ sthh.w sp[4], r3:t, r2:t
-+ sthh.w sp[8], r2:b, r3:b
-+ sthh.w sp[12], r5:b, r4:b
-+
-+
-+
-+ sub sp, -16
-+ sub loop_cnt, 1
-+ brne 0b
-+
-+2:
-+ // Column pass: sp is the read cursor into the temporary block, r12 the write cursor into the DCT block
-+ sub sp, 8*8*2 //Rewind sp to the start of the temporary block
-+ sub r12, 8*8*2 //Rewind r12 to the start of the DCT block
-+
-+ mov loop_cnt, 8
-+
-+0:
-+ ldins.h r3:t,sp[0] // r3:t = dataptr[0]
-+ ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1]
-+ ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2]
-+ ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5]
-+ ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4]
-+ ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3]
-+ ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6]
-+ ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7]
-+
-+ or r4, r1, r3 << 16 // The shift drops r3:t (dataptr[0]) from the test
-+ or r4, r2
-+ or r4, r0
-+ brne 1f //If there are non-zero AC coefficients perform the full column transform
-+
-+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 // r3 = dataptr[0] descaled by PASS1_BITS + 3 with rounding (r3:b is zero here)
-+ packw.sh r3, r3, r3 // Pack the word into both halfwords of r3
-+ mov r2, r3
-+ st.d r12++, r2 // Write the value to 8 consecutive halfwords at r12:
-+ st.d r12++, r2 // two 8-byte stores, r12 += 16
-+ sub sp, -2 // Increment the dataptr
-+
-+ sub loop_cnt, 1//Decrement loop counter
-+ brne 0b //Perform loop one more time if loop_cnt is not zero
-+
-+ sub sp, -(8*8*2 - 8*2) //Release the temporary block; the loop already advanced sp by 8*2 bytes
-+ popm r0-r3, r4-r7, pc//Pop back registers and PC
-+
-+1:
-+ // Full column transform; constants come from coef_table_idct
-+ ld.w r10, pc[coef_table_idct - .]
-+ ld.w r9, pc[coef_table_idct - . + 4]
-+
-+ addhh.w r4, r2:t, r2:b
-+ mulhh.w r4, r4:b, r10:t // r4 = z1
-+ mulhh.w r5, r2:b, r10:b
-+ ld.w r10, pc[coef_table_idct - . + 8]
-+ mulhh.w r6, r2:t, r9:t
-+ add r5, r4 // r5 = tmp2
-+ add r6, r4 // r6 = tmp3
-+
-+ addhh.w r7, r3:t, r3:b
-+ subhh.w r8, r3:t, r3:b
-+
-+ lsl r7, CONST_BITS
-+ lsl r8, CONST_BITS
-+
-+ add r2, r7, r6 // r2 = tmp10
-+ sub r3, r7, r6 // r3 = tmp13
-+ add r4, r8, r5 // r4 = tmp11
-+ sub r5, r8, r5 // r5 = tmp12
-+
-+
-+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
-+ addhh.w r7, r6:t, r6:b
-+ mulhh.w r7, r7:b, r9:b // r7 = z5
-+
-+ ld.w r9, pc[coef_table_idct - . + 12]
-+ mulhh.w r8, r6:b, r10:t // r8 = z3
-+ mulhh.w r6, r6:t, r10:b // r6 = z4
-+
-+ add r8, r7
-+ add r6, r7
-+
-+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
-+
-+ mulhh.w r10, r0:b, r9:t // r10 = tmp0
-+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
-+ ld.w r9, pc[coef_table_idct - . + 16]
-+ add r10, r8
-+ add r0, r6
-+
-+ ld.w lr, pc[coef_table_idct - . + 20]
-+ machh.w r8, r1:b, r9:t // r8 = tmp2
-+ machh.w r6, r1:t, r9:b // r6 = tmp3
-+ mulhh.w r9, r7:b, lr:t // r9 = z1
-+ mulhh.w r7, r7:t, lr:b // r7 = z2
-+
-+
-+ add r10, r9
-+ add r0, r7
-+ add r8, r7
-+ add r6, r9
-+
-+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
-+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
-+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
-+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
-+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
-+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
-+ add r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
-+ sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
-+
-+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
-+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
-+
-+ packw.sh r7, r1, r6 // r7 = results 0 and 1 packed as halfwords
-+ packw.sh r6, r8, r0 // r6 = results 2 and 3
-+ packw.sh r5, r3, r5 // r5 = results 4 and 5
-+ packw.sh r4, r4, r2 // r4 = results 6 and 7
-+
-+ stm r12, r4-r7 // Store the 8 halfword results (16 bytes) at r12
-+ sub sp, -2 // Increment the dataptr
-+ sub r12, -16 // Advance the output pointer past the 16 bytes just written
-+
-+ sub loop_cnt, 1 //Decrement loop counter
-+ brne 0b //Perform loop one more time if loop_cnt is not zero
-+
-+ sub sp, -(8*8*2 - 8*2) //Release the temporary block; the loop already advanced sp by 8*2 bytes
-+ popm r0-r3, r4-r7, pc //Pop back registers and PC
-+
-+
-+
-+ .align 2
-+coef_table_idct:
-+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
-+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
-+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
-+
-diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S
-new file mode 100644
-index 0000000..07a002d
---- /dev/null
-+++ b/libavcodec/avr32/mc.S
-@@ -0,0 +1,434 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+
-+
-+ /* ANDs \reg with the per-byte constant in r8 (r8 >> 1 when \round is set). With the
-+ r8 values loaded by pixels8_hv, both cases keep only bit 0 of each byte in \reg */
-+ .macro packedmask1 reg, round
-+ .if \round
-+ and \reg, \reg, r8 >> 1
-+ .else
-+ and \reg, r8
-+ .endif
-+ .endm
-+
-+ /* Macro for 8 pixel wide horizontal and vertical interpolation functions; \round picks the rounding constant, \put = 0 averages the result with the destination */
-+ .macro pixels8_hv round, put
-+
-+
-+ pushm r0-r7, lr
-+
-+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
-+
-+ /* Rounding immediate */
-+ .if \round
-+ mov r8, lo(0x02020202)
-+ orh r8, hi(0x02020202)
-+ .else
-+ mov r8, lo(0x01010101)
-+ orh r8, hi(0x01010101)
-+ .endif
-+ mov r7, 2 // Outer loop count: two passes, each covering a 4 pixel wide strip
-+
-+ /* Pixel naming convention :
-+
-+ |-----------------------------------------------------|
-+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
-+ |----d00---d01---d02---d03---d04---d05---d06---d07----|
-+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
-+ |-----------------------------------------------------|
-+ */
-+1:
-+ ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 }
-+ ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 }
-+ mov lr, r9 // lr = row counter for this strip
-+ eor r2, r0, r1
-+ packedmask1 r2, \round
-+ add r2, r8
-+
-+ paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
-+
-+ add r11, r10 // pixels += line_size
-+ ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 }
-+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
-+0:
-+ eor r5, r1, r3
-+ packedmask1 r5, \round
-+ add r2, r5
-+
-+ paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
-+ eor r6, r0, r1
-+ packedmask1 r6, \round
-+ add r2, r2, r6 << 1
-+
-+ ld.w r3, r11[r10] // r3 = bytes 0..3 of the source row below s1x
-+ add r11, r10 // pixels += line_size
-+ ld.w r4, r11[1] // r4 = bytes 1..4 of that same row
-+
-+ paddh.ub r0, r0, r1
-+ plsr.b r2, r2, 2
-+ padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 }
-+
-+ /* Next row: start its correction term in r2 */
-+ .if \put
-+ eor r2, r3, r4
-+ packedmask1 r2, \round
-+ add r2, r8
-+ .else
-+ ld.w r6, r12[0] // r6 = current destination pixels
-+ eor r2, r3, r4
-+ packedmask1 r2, \round
-+ add r2, r8
-+ pavg.ub r0, r0, r6 // average the result with the destination
-+ .endif
-+ st.w r12[0], r0 // Put data into the block
-+
-+ add r5, r2
-+ paddh.ub r0, r3, r4 // r0 = horizontal averages of the row loaded into r3/r4 above
-+
-+ eor r6, r0, r1
-+ packedmask1 r6, \round
-+ add r5, r5, r6 << 1
-+
-+ .if \put
-+ paddh.ub r1, r0, r1
-+ plsr.b r5, r5, 2
-+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
-+ .else
-+ ld.w r3, r12[r10] // r3 = destination pixels of the second output row
-+ paddh.ub r1, r0, r1
-+ plsr.b r5, r5, 2
-+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
-+ pavg.ub r1, r1, r3 // average the result with the destination
-+ .endif
-+
-+ st.w r12[r10], r1 // Put data into the block
-+
-+
-+ ld.w r1, r11[r10] // r1 = bytes 0..3 of the following source row
-+ add r11, r10 // pixels += line_size
-+ ld.w r3, r11[1] // r3 = bytes 1..4 of that same row
-+ add r12, r12, r10 << 1 // block += 2*line_size
-+ sub lr, 2 // two output rows are produced per iteration
-+ brne 0b
-+
-+ mul r0, r10, r9 // r0 = line_size * h
-+ rsub r0, r0, 4 // r0 = 4 - (line_size * h)
-+ add r11, r0
-+ sub r11, r10 // pixels += 4 - (line_size * (h+1))
-+ add r12, r0 // block += 4 - (line_size * (h))
-+ sub r7, 1
-+ brne 1b
-+
-+ popm r0-r7, pc
-+ .endm
-+
-+
-+ /* Macro for 8 pixel wide vertical interpolation functions */
-+
-+ .macro pixels8_v round, put
-+ pushm r4-r7,lr
-+ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
-+
-+ /*
-+ Pixel Naming Convention :
-+ |-----------------------------------------------|
-+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
-+ |-d00---d01---d02---d03---d04---d05---d06---d07-|
-+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
-+ |-----------------------------------------------|
-+ */
-+ ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 }
-+ ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4
-+ ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 }
-+ ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 }
-+ sub r10, 4 // stride -= 4
-+ add r11, r11, r10 << 1 // src += 2*stride
-+ sub r11, -4 // src += 4
-+
-+0:
-+ .if \round
-+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
-+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
-+ .else
-+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
-+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
-+ .endif
-+
-+ .if \put
-+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
-+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
-+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
-+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
-+ .else
-+ ld.w lr, r12[0]
-+ ld.w r7, r12[4]
-+ pavg.ub r5, r5, lr
-+ pavg.ub r4, r4, r7
-+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
-+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
-+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
-+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
-+ .endif
-+ add r11, r10 // src += stride
-+#ifdef USE_PREFETCH
-+ pref r11[0]
-+#endif
-+ add r12, r10 // dst += stride
-+
-+ .if \round
-+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
-+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
-+ .else
-+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
-+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
-+ .endif
-+ .if \put
-+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
-+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
-+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
-+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
-+ .else
-+ ld.w r8, r12[0]
-+ ld.w r6, r12[4]
-+ pavg.ub r5, r5, r8
-+ pavg.ub r4, r4, r6
-+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
-+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
-+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
-+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
-+ .endif
-+
-+ add r11, r10 // src += stride
-+#ifdef USE_PREFETCH
-+ pref r11[0]
-+#endif
-+ add r12, r10 // dst += stride
-+ sub r9, 2
-+ brne 0b
-+
-+ popm r4-r7,pc
-+ .endm
-+
-+ /* Macro for 8 pixel wide horizontal interpolation functions */
-+
-+ .macro pixels8_h round, put
-+ pushm r4-r7, lr
-+
-+ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
-+ /*
-+ Pixel Naming Convention:
-+ |--------------------------------------------------------------------|
-+ | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
-+ |------|-------|-------|-------|-------|-------|-------|-------|-----|
-+ | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
-+ |--------------------------------------------------------------------|
-+ */
-+
-+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
-+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
-+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
-+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
-+ add r11, r10 // src += stride
-+
-+0:
-+ .if \round
-+ pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
-+ pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
-+ .else
-+ paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
-+ paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
-+ .endif
-+ .if \put
-+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
-+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
-+ .else
-+ ld.w r8, r12[0]
-+ ld.w r6, r12[4]
-+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
-+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
-+ pavg.ub lr, lr, r8
-+ pavg.ub r7, r7, r6
-+ .endif
-+ st.w r12[0], lr // dst = { d00, d01, d02, d03 }
-+ st.w r12[4], r7 // dst = { d04, d05, d06, d07 }
-+ ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 }
-+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
-+ add r11, r10 // src += stride
-+#ifdef USE_PREFETCH
-+ pref r11[0]
-+#endif
-+ add r12, r10 // dst += stride
-+
-+ .if \round
-+ pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
-+ pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
-+ .else
-+ paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
-+ paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
-+ .endif
-+ .if \put
-+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
-+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
-+ .else
-+ ld.w r7, r12[0]
-+ ld.w r6, r12[4]
-+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
-+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
-+ pavg.ub r5, r5, r7
-+ pavg.ub r4, r4, r6
-+ .endif
-+ st.w r12[0], r5 // dst = { d00, d01, d02, d03 }
-+ st.w r12[4], r4 // dst = { d04, d05, d06, d07 }
-+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
-+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
-+ add r11, r10 // src += stride
-+#ifdef USE_PREFETCH
-+ pref r11[0]
-+#endif
-+ add r12, r10 // dst += stride
-+ sub r9, 2
-+ brne 0b
-+
-+ popm r4-r7, pc
-+ .endm
-+
-+ /* Macro for 8 pixel wide copy functions; \put = 1 stores the source, \put = 0 averages it with the destination first */
-+ .macro pixels8 put
-+ stm --sp, r3-r7,lr
-+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
-+ mov lr, r9 // lr = row counter (r9 is reused as a data register below)
-+ sub r3, r10, 2 // stride2 = stride - 2
-+0:
-+ .if \put
-+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
-+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
-+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
-+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
-+ .else
-+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
-+ ld.d r4, r12[0] // r5:r4 = 8 destination bytes of the first row
-+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
-+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
-+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
-+ pavg.ub r6, r6, r4
-+ pavg.ub r7, r7, r5
-+ ld.d r4, r12[r10] // r5:r4 = 8 destination bytes of the second row
-+ .endif
-+ st.d r12, r6 // *dst = first row (8 bytes from r7:r6)
-+ add r11, r11, r3 << 1 // src += stride2 * 2; with the post-increment above the net step is 2*stride
-+ .ifeq \put
-+ pavg.ub r8, r8, r4
-+ pavg.ub r9, r9, r5
-+ .endif
-+ st.d r12[r10 << 0], r8 // *(dst + stride) = second row (8 bytes from r9:r8)
-+ add r12, r12, r10 << 1 // dst += 2*stride
-+ sub lr, 2 // two rows are handled per iteration
-+ brne 0b
-+ ldm sp++, r3-r7,pc
-+
-+ .endm
-+
-+ .global put_no_rnd_pixels8_hv_avr32
-+ .text
-+put_no_rnd_pixels8_hv_avr32:
-+ pixels8_hv 0, 1
-+
-+ .global put_pixels8_hv_avr32
-+ .text
-+put_pixels8_hv_avr32:
-+ pixels8_hv 1, 1
-+
-+ .global avg_no_rnd_pixels8_hv_avr32
-+ .text
-+avg_no_rnd_pixels8_hv_avr32:
-+ pixels8_hv 0, 0
-+
-+ .global avg_pixels8_hv_avr32
-+ .text
-+avg_pixels8_hv_avr32:
-+ pixels8_hv 1, 0
-+
-+ .global put_no_rnd_pixels8_v_avr32
-+ .text
-+put_no_rnd_pixels8_v_avr32:
-+ pixels8_v 0, 1
-+
-+ .global put_pixels8_v_avr32
-+ .text
-+put_pixels8_v_avr32:
-+ pixels8_v 1, 1
-+
-+ .global avg_no_rnd_pixels8_v_avr32
-+ .text
-+avg_no_rnd_pixels8_v_avr32:
-+ pixels8_v 0, 0
-+
-+ .global avg_pixels8_v_avr32
-+ .text
-+avg_pixels8_v_avr32:
-+ pixels8_v 1, 0
-+
-+ .global put_no_rnd_pixels8_h_avr32
-+ .text
-+put_no_rnd_pixels8_h_avr32:
-+ pixels8_h 0, 1
-+
-+ .global put_pixels8_h_avr32
-+ .text
-+put_pixels8_h_avr32:
-+ pixels8_h 1, 1
-+
-+ .global avg_no_rnd_pixels8_h_avr32
-+ .text
-+avg_no_rnd_pixels8_h_avr32:
-+ pixels8_h 0, 0
-+
-+ .global avg_pixels8_h_avr32
-+ .text
-+avg_pixels8_h_avr32:
-+ pixels8_h 1, 0
-+
-+ .global put_pixels8_avr32
-+ .global put_no_rnd_pixels8_avr32
-+ .text
-+put_pixels8_avr32:
-+put_no_rnd_pixels8_avr32:
-+ pixels8 1
-+
-+ .global avg_no_rnd_pixels8_avr32
-+ .global avg_pixels8_avr32
-+ .text
-+avg_pixels8_avr32:
-+avg_no_rnd_pixels8_avr32:
-+ pixels8 0
-diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h
-new file mode 100644
-index 0000000..32201ba
---- /dev/null
-+++ b/libavcodec/avr32/pico.h
-@@ -0,0 +1,260 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+#ifndef __PICO_H__
-+#define __PICO_H__
-+
-+
-+
-+/* Coprocessor Number */
-+#define PICO_CPNO 1
-+
-+/* Pixel Coprocessor Register file */
-+#define PICO_REGVECT_INPIX2 cr0
-+#define PICO_REGVECT_INPIX1 cr1
-+#define PICO_REGVECT_INPIX0 cr2
-+#define PICO_REGVECT_OUTPIX2 cr3
-+#define PICO_REGVECT_OUTPIX1 cr4
-+#define PICO_REGVECT_OUTPIX0 cr5
-+#define PICO_REGVECT_COEFF0_A cr6
-+#define PICO_REGVECT_COEFF0_B cr7
-+#define PICO_REGVECT_COEFF1_A cr8
-+#define PICO_REGVECT_COEFF1_B cr9
-+#define PICO_REGVECT_COEFF2_A cr10
-+#define PICO_REGVECT_COEFF2_B cr11
-+#define PICO_REGVECT_VMU0_OUT cr12
-+#define PICO_REGVECT_VMU1_OUT cr13
-+#define PICO_REGVECT_VMU2_OUT cr14
-+#define PICO_REGVECT_CONFIG cr15
-+
-+#define PICO_INPIX2 0
-+#define PICO_INPIX1 1
-+#define PICO_INPIX0 2
-+#define PICO_OUTPIX2 3
-+#define PICO_OUTPIX1 4
-+#define PICO_OUTPIX0 5
-+#define PICO_COEFF0_A 6
-+#define PICO_COEFF0_B 7
-+#define PICO_COEFF1_A 8
-+#define PICO_COEFF1_B 9
-+#define PICO_COEFF2_A 10
-+#define PICO_COEFF2_B 11
-+#define PICO_VMU0_OUT 12
-+#define PICO_VMU1_OUT 13
-+#define PICO_VMU2_OUT 14
-+#define PICO_CONFIG 15
-+
-+/* Config Register */
-+#define PICO_COEFF_FRAC_BITS_OFFSET 0
-+#define PICO_COEFF_FRAC_BITS_SIZE 4
-+#define PICO_OFFSET_FRAC_BITS_OFFSET 4
-+#define PICO_OFFSET_FRAC_BITS_SIZE 4
-+#define PICO_INPUT_MODE_OFFSET 8
-+#define PICO_INPUT_MODE_SIZE 2
-+#define PICO_OUTPUT_MODE_OFFSET 10
-+#define PICO_OUTPUT_MODE_SIZE 1
-+/* Memory image passed to set_pico_config()/get_pico_config(): the config-word bit-fields, then the three VMU outputs, then twelve 16-bit coefficients. NOTE(review): the field order presumably mirrors how ldcm.d/stcm.d lay out cr6..cr15 on this target - confirm. */
-+struct pico_config_t {
-+ unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE; /* unnamed padding for the bits above output_mode */
-+ unsigned int output_mode : PICO_OUTPUT_MODE_SIZE;
-+ unsigned int input_mode : PICO_INPUT_MODE_SIZE;
-+ unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE;
-+ unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;
-+ int vmu2_out;
-+ int vmu1_out;
-+ int vmu0_out;
-+ short coeff2_2;
-+ short coeff2_3;
-+ short coeff2_0;
-+ short coeff2_1;
-+ short coeff1_2;
-+ short coeff1_3;
-+ short coeff1_0;
-+ short coeff1_1;
-+ short coeff0_2;
-+ short coeff0_3;
-+ short coeff0_0;
-+ short coeff0_1;
-+};
-+
-+/* Field builders: shift a value into its position in the config word. The argument is parenthesized so that expressions such as (a | b) are shifted as a whole. */
-+#define PICO_COEFF_FRAC_BITS(x) ((x) << PICO_COEFF_FRAC_BITS_OFFSET)
-+#define PICO_OFFSET_FRAC_BITS(x) ((x) << PICO_OFFSET_FRAC_BITS_OFFSET)
-+#define PICO_INPUT_MODE(x) ((x) << PICO_INPUT_MODE_OFFSET)
-+#define PICO_OUTPUT_MODE(x) ((x) << PICO_OUTPUT_MODE_OFFSET)
-+/* Field extractors: shift the named field down to bit 0 and mask it to its declared width. */
-+#define GET_PICO_COEFF_FRAC_BITS(x) (((x) >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
-+#define GET_PICO_OFFSET_FRAC_BITS(x) (((x) >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
-+#define GET_PICO_INPUT_MODE(x) (((x) >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1))
-+#define GET_PICO_OUTPUT_MODE(x) (((x) >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1))
-+
-+enum pico_input_mode { PICO_TRANSFORMATION_MODE,
-+ PICO_HOR_FILTER_MODE,
-+ PICO_VERT_FILTER_MODE };
-+
-+enum pico_output_mode { PICO_PACKED_MODE,
-+ PICO_PLANAR_MODE };
-+
-+/* Bits in coefficients */
-+#define PICO_COEFF_BITS 12
-+
-+/* Operation bits */
-+#define PICO_MATRIX (0)
-+#define PICO_USE_ACC (1 << 2)
-+#define PICO_SINGLE_VECTOR (1 << 3)
-+
-+
-+#define __str(x...) #x
-+#define __xstr(x...) __str(x)
-+
-+#define PICO_PUT_W(pico_reg, x) \
-+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
-+#define PICO_GET_W(pico_reg) \
-+ __builtin_mvcr_w(PICO_CPNO, pico_reg)
-+/* Single-word coprocessor moves via inline asm. Both are volatile, matching the _D variants, so a read of a coprocessor register is never merged with an earlier read or dropped. pico_reg is the register number that gets pasted after "cr". */
-+#define PICO_MVCR_W(x, pico_reg) \
-+ asm volatile ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
-+
-+#define PICO_MVRC_W(pico_reg, x) \
-+ asm volatile ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
-+
-+#define PICO_PUT_D(pico_reg, x) \
-+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
-+#define PICO_GET_D(pico_reg) \
-+ __builtin_mvcr_d(PICO_CPNO, pico_reg)
-+
-+#define PICO_MVCR_D(x, pico_reg) \
-+ asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
-+#define PICO_MVRC_D(pico_reg, x) \
-+ asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
-+/* Multi-register coprocessor store (stcm) and load (ldcm) through ptr. Each asm carries a "memory" clobber: the instruction reads or writes the pointed-to buffer, which the compiler cannot see from the register operand alone. */
-+#define PICO_STCM_W(ptr, pico_regs...) \
-+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+#define PICO_STCM_D(ptr, pico_regs...) \
-+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+
-+#define PICO_STCM_W_DEC(ptr, pico_regs...) \
-+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+#define PICO_STCM_D_DEC(ptr, pico_regs...) \
-+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+
-+#define PICO_LDCM_W(ptr, pico_regs...) \
-+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+#define PICO_LDCM_D(ptr, pico_regs...) \
-+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+
-+#define PICO_LDCM_W_INC(ptr, pico_regs...) \
-+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+#define PICO_LDCM_D_INC(ptr, pico_regs...) \
-+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+
-+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
-+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
-+/* Load the ten pico registers COEFF0_A..CONFIG from the memory at config with a single PICO_LDCM_D (ldcm.d). */
-+static inline void set_pico_config(struct pico_config_t *config){
-+ PICO_LDCM_D(config,
-+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
-+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
-+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
-+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
-+}
-+/* Store the ten pico registers COEFF0_A..CONFIG to the memory at config with a single PICO_STCM_D (stcm.d). */
-+static inline void get_pico_config(struct pico_config_t *config){
-+ PICO_STCM_D(config,
-+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
-+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
-+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
-+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
-+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
-+}
-+/* Debug helper: read the pico registers back with get_pico_config() and print every field through av_log at AV_LOG_INFO. */
-+static inline void dump_pico_config(void){
-+ struct pico_config_t pico_config;
-+ const char *input_mode, *output_mode; /* only ever point at string literals */
-+ get_pico_config(&pico_config);
-+
-+
-+ av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
-+ av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
-+ av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
-+
-+ switch ( pico_config.input_mode ){
-+ case PICO_TRANSFORMATION_MODE:
-+ input_mode = "Transformation Mode";
-+ break;
-+ case PICO_HOR_FILTER_MODE:
-+ input_mode = "Horizontal Filter Mode";
-+ break;
-+ case PICO_VERT_FILTER_MODE:
-+ input_mode = "Vertical Filter Mode";
-+ break;
-+ default:
-+ input_mode = "Unknown Mode!!";
-+ break;
-+ }
-+ av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
-+
-+ switch ( pico_config.output_mode ){
-+ case PICO_PLANAR_MODE:
-+ output_mode = "Planar Mode";
-+ break;
-+ case PICO_PACKED_MODE:
-+ output_mode = "Packed Mode";
-+ break;
-+ default:
-+ output_mode = "Unknown Mode!!";
-+ break;
-+ }
-+
-+ av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
-+ /* coeffN_0..coeffN_2 are scaled by coeff_frac_bits, coeffN_3 by offset_frac_bits */
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
-+
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
-+
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
-+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
-+}
-+
-+
-+
-+#endif
-+
-diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
-index 26b4f8d..1f8fabf 100644
---- a/libavcodec/bitstream.h
-+++ b/libavcodec/bitstream.h
-@@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM {
- #endif
-
- /* used to avoid missaligned exceptions on some archs (alpha, ...) */
--#if defined(ARCH_X86) || defined(ARCH_X86_64)
-+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32)
- # define unaligned16(a) (*(const uint16_t*)(a))
- # define unaligned32(a) (*(const uint32_t*)(a))
- # define unaligned64(a) (*(const uint64_t*)(a))
-@@ -813,6 +813,44 @@ void free_vlc(VLC *vlc);
- * if the vlc code is invalid and max_depth>1 than the number of bits removed
- * is undefined
- */
-+/* ARCH_AVR32 variant below: the first two table levels fetch both fields of an entry with one unaligned32() load through a union; the third level indexes the table directly. NOTE(review): this assumes two VLC_TYPE values occupy exactly 32 bits - confirm. */
-+#if defined(ARCH_AVR32)
-+#define GET_VLC(code, name, gb, table, bits, max_depth)\
-+{\
-+ int n, index, nb_bits;\
-+ union { VLC_TYPE vlc[2];\
-+ uint32_t u32; } table_elem;\
-+\
-+ index= SHOW_UBITS(name, gb, bits);\
-+ table_elem.u32 = unaligned32(&table[index]); \
-+ code = table_elem.vlc[0];\
-+ n = table_elem.vlc[1];\
-+\
-+ if(max_depth > 1 && n < 0 ){\
-+ LAST_SKIP_BITS(name, gb, bits)\
-+ UPDATE_CACHE(name, gb)\
-+\
-+ nb_bits = -n;\
-+\
-+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
-+ table_elem.u32 = unaligned32(&table[index]); \
-+ code = table_elem.vlc[0];\
-+ n = table_elem.vlc[1];\
-+ if(max_depth > 2 && n < 0){\
-+ LAST_SKIP_BITS(name, gb, nb_bits)\
-+ UPDATE_CACHE(name, gb)\
-+\
-+ nb_bits = -n;\
-+\
-+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
-+ code = table[index][0];\
-+ n = table[index][1];\
-+ }\
-+ }\
-+ SKIP_BITS(name, gb, n)\
-+}
-+
-+#else
- #define GET_VLC(code, name, gb, table, bits, max_depth)\
- {\
- int n, index, nb_bits;\
-@@ -821,7 +859,7 @@ void free_vlc(VLC *vlc);
- code = table[index][0];\
- n = table[index][1];\
- \
-- if(max_depth > 1 && n < 0){\
-+ if(max_depth > 1 && n < 0 ){\
- LAST_SKIP_BITS(name, gb, bits)\
- UPDATE_CACHE(name, gb)\
- \
-@@ -843,7 +881,38 @@ void free_vlc(VLC *vlc);
- }\
- SKIP_BITS(name, gb, n)\
- }
-+#endif
-
-+#if defined(ARCH_AVR32) /* variant that fetches a whole RL_VLC_ELEM with one unaligned32() load through a union; NOTE(review): assumes sizeof(RL_VLC_ELEM) is 4 - confirm */
-+#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
-+{\
-+ int n, index, nb_bits;\
-+ union { RL_VLC_ELEM vlc;\
-+ uint32_t u32; } table_elem;\
-+\
-+ index= SHOW_UBITS(name, gb, bits);\
-+ table_elem.u32 = unaligned32(&table[index]); \
-+ level = table_elem.vlc.level;\
-+ n = table_elem.vlc.len;\
-+\
-+ if(max_depth > 1 && n < 0 ){\
-+ SKIP_BITS(name, gb, bits)\
-+ if(need_update){\
-+ UPDATE_CACHE(name, gb)\
-+ }\
-+\
-+ nb_bits = -n;\
-+\
-+ index= SHOW_UBITS(name, gb, nb_bits) + level;\
-+ table_elem.u32 = unaligned32(&table[index]); \
-+ level = table_elem.vlc.level;\
-+ n = table_elem.vlc.len;\
-+ }\
-+ run= table_elem.vlc.run; /* taken from whichever entry was loaded last */ \
-+ SKIP_BITS(name, gb, n)\
-+}
-+
-+#else
- #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
- {\
- int n, index, nb_bits;\
-@@ -852,7 +921,7 @@ void free_vlc(VLC *vlc);
- level = table[index].level;\
- n = table[index].len;\
- \
-- if(max_depth > 1 && n < 0){\
-+ if(max_depth > 1 && n < 0 ){\
- SKIP_BITS(name, gb, bits)\
- if(need_update){\
- UPDATE_CACHE(name, gb)\
-@@ -867,7 +936,7 @@ void free_vlc(VLC *vlc);
- run= table[index].run;\
- SKIP_BITS(name, gb, n)\
- }
--
-+#endif
-
- /**
- * parses a vlc code, faster then get_vlc()
-diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
-index 56c42b9..8fc10c6 100644
---- a/libavcodec/dsputil.c
-+++ b/libavcodec/dsputil.c
-@@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
- #ifdef ARCH_BFIN
- dsputil_init_bfin(c,avctx);
- #endif
-+#ifdef ARCH_AVR32
-+ dsputil_init_avr32(c,avctx);
-+#endif
-
- for(i=0; i<64; i++){
- if(!c->put_2tap_qpel_pixels_tab[0][i])
-diff --git a/libavcodec/h264.c b/libavcodec/h264.c
-index 865e80a..8f7c3f1 100644
---- a/libavcodec/h264.c
-+++ b/libavcodec/h264.c
-@@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){
-
- static void init_dequant8_coeff_table(H264Context *h){
- int i,q,x;
-+#ifdef ARCH_AVR32
-+ const int transpose = 0;
-+#else
- const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
-+#endif
-+
- h->dequant8_coeff[0] = h->dequant8_buffer[0];
- h->dequant8_coeff[1] = h->dequant8_buffer[1];
-
-@@ -3281,7 +3286,13 @@ static void init_dequant8_coeff_table(H264Context *h){
-
- static void init_dequant4_coeff_table(H264Context *h){
- int i,j,q,x;
-+ // Yes this is ugly as hell....
-+#ifdef ARCH_AVR32
-+ const int transpose = 0;
-+#else
- const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
-+#endif
-+
- for(i=0; i<6; i++ ){
- h->dequant4_coeff[i] = h->dequant4_buffer[i];
- for(j=0; j<i; j++){
-@@ -4663,7 +4674,11 @@ static int decode_slice_header(H264Context *h){
- if (MPV_common_init(s) < 0)
- return -1;
-
-+#ifdef ARCH_AVR32
-+ if ( 1 ){
-+#else
- if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
-+#endif
- memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
- memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
- }else{
-diff --git a/libavutil/common.h b/libavutil/common.h
-index 3ae5971..7e52b90 100644
---- a/libavutil/common.h
-+++ b/libavutil/common.h
-@@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c)
- * @param amax maximum value of the clip range
- * @return cliped value
- */
-+#if defined(ARCH_AVR32) /* branch-free clip using the min/max instructions */
-+#define clip(a, amin, amax) \
-+ ({ int __tmp__; /* operands are converted to int, as the parameters of the C version are */ \
-+ asm ("min\t%0, %1, %2\n" \
-+ "max\t%0, %0, %3\n" \
-+ : "=&r"(__tmp__) : "r"((int)(a)), "r"((int)(amax)), "r"((int)(amin))); \
-+ __tmp__; })
-+#else
- static inline int clip(int a, int amin, int amax)
- {
- if (a < amin) return amin;
- else if (a > amax) return amax;
- else return a;
- }
-+#endif
-
- /**
- * clip a signed integer value into the 0-255 range
- * @param a value to clip
- * @return cliped value
- */
-+#if defined(ARCH_AVR32) /* macro replacement for the C clip_uint8() below, done with one satu instruction */
-+#define clip_uint8(a) \
-+ ({ int __tmp__ = a; /* evaluates a once and converts it to int */ \
-+ asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \
-+ __tmp__; })
-+#else
- static inline uint8_t clip_uint8(int a)
- {
- if (a&(~255)) return (-a)>>31;
- else return a;
- }
-+#endif
-
- /* math */
- int64_t ff_gcd(int64_t a, int64_t b);
-diff --git a/libavutil/internal.h b/libavutil/internal.h
-index 285d304..a8b0718 100644
---- a/libavutil/internal.h
-+++ b/libavutil/internal.h
-@@ -210,6 +210,15 @@ if((y)<(x)){\
- }\
- }
-
-+/* XXX: Hack for uclibc which declares lrintf but does not implement it... */
-+#ifdef ARCH_AVR32
-+#undef HAVE_LRINTF
-+#define HAVE_LRINTF 1 /* keeps the "#ifndef HAVE_LRINTF" fallback below from being compiled */
-+#define lrintf(x) ((long)rint(x)) /* cast so the expansion has lrintf's integer type, not double */
-+#define llrint(x) ((long long)rint(x))
-+#endif
-+
-+
- #ifndef HAVE_LRINTF
- /* XXX: add ISOC specific test to avoid specific BSD testing. */
- /* better than nothing implementation. */
-diff --git a/libfaad2/common.h b/libfaad2/common.h
-index f809042..6c5fb21 100644
---- a/libfaad2/common.h
-+++ b/libfaad2/common.h
-@@ -67,7 +67,7 @@ extern "C" {
- /* Use if target platform has address generators with autoincrement */
- //#define PREFER_POINTERS
-
--#if defined(_WIN32_WCE) || defined(__arm__)
-+#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__)
- #define FIXED_POINT
- #endif
-
-diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c
-index 076359a..51b77fe 100644
---- a/libmpcodecs/ad_libmad.c
-+++ b/libmpcodecs/ad_libmad.c
-@@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){
- sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2;
- sh->samplerate=this->frame.header.samplerate;
- sh->i_bps=this->frame.header.bitrate/8;
-+#ifdef WORDS_BIGENDIAN
-+ sh->sample_format = AF_FORMAT_S16_BE;
-+#else
-+ sh->sample_format = AF_FORMAT_S16_LE;
-+#endif
- sh->samplesize=2;
-
- return 1;
-diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h
-new file mode 100644
-index 0000000..7ac6200
---- /dev/null
-+++ b/libswscale/pico-avr32.h
-@@ -0,0 +1,137 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+#ifndef __PICO_H__
-+#define __PICO_H__
-+
-+/* Coprocessor Number */
-+#define PICO_CPNO 1
-+
-+/* Pixel Coprocessor Register file */
-+#define PICO_REGVECT_INPIX2 cr0
-+#define PICO_REGVECT_INPIX1 cr1
-+#define PICO_REGVECT_INPIX0 cr2
-+#define PICO_REGVECT_OUTPIX2 cr3
-+#define PICO_REGVECT_OUTPIX1 cr4
-+#define PICO_REGVECT_OUTPIX0 cr5
-+#define PICO_REGVECT_COEFF0_A cr6
-+#define PICO_REGVECT_COEFF0_B cr7
-+#define PICO_REGVECT_COEFF1_A cr8
-+#define PICO_REGVECT_COEFF1_B cr9
-+#define PICO_REGVECT_COEFF2_A cr10
-+#define PICO_REGVECT_COEFF2_B cr11
-+#define PICO_REGVECT_VMU0_OUT cr12
-+#define PICO_REGVECT_VMU1_OUT cr13
-+#define PICO_REGVECT_VMU2_OUT cr14
-+#define PICO_REGVECT_CONFIG cr15
-+
-+#define PICO_INPIX2 0
-+#define PICO_INPIX1 1
-+#define PICO_INPIX0 2
-+#define PICO_OUTPIX2 3
-+#define PICO_OUTPIX1 4
-+#define PICO_OUTPIX0 5
-+#define PICO_COEFF0_A 6
-+#define PICO_COEFF0_B 7
-+#define PICO_COEFF1_A 8
-+#define PICO_COEFF1_B 9
-+#define PICO_COEFF2_A 10
-+#define PICO_COEFF2_B 11
-+#define PICO_VMU0_OUT 12
-+#define PICO_VMU1_OUT 13
-+#define PICO_VMU2_OUT 14
-+#define PICO_CONFIG 15
-+
-+/* Config Register */
-+#define PICO_COEFF_FRAC_BITS 0
-+#define PICO_COEFF_FRAC_BITS_WIDTH 4
-+#define PICO_OFFSET_FRAC_BITS 4
-+#define PICO_OFFSET_FRAC_BITS_WIDTH 4
-+#define PICO_INPUT_MODE 8
-+#define PICO_INPUT_MODE_WIDTH 2
-+#define PICO_OUTPUT_MODE 10
-+
-+#define PICO_TRANSFORMATION_MODE 0
-+#define PICO_HOR_FILTER_MODE 1
-+#define PICO_VERT_FILTER_MODE 2
-+
-+#define PICO_PLANAR_MODE 1
-+#define PICO_PACKED_MODE 0
-+
-+/* Bits in coefficients */
-+#define PICO_COEFF_BITS 12
-+
-+/* Operation bits */
-+#define PICO_USE_ACC (1 << 2)
-+#define PICO_SINGLE_VECTOR (1 << 3)
-+
-+
-+#define __str(x...) #x
-+#define __xstr(x...) __str(x)
-+
-+#define PICO_PUT_W(pico_reg, x) \
-+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
-+#define PICO_GET_W(pico_reg) \
-+ __builtin_mvcr_w(PICO_CPNO, pico_reg)
-+
-+#define PICO_PUT_D(pico_reg, x) \
-+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
-+#define PICO_GET_D(pico_reg) \
-+ __builtin_mvcr_d(PICO_CPNO, pico_reg)
-+
-+/* Multi-register coprocessor store (stcm) and load (ldcm) through ptr. Each asm carries a "memory" clobber: the instruction reads or writes the pointed-to buffer, which the compiler cannot see from the register operand alone. */
-+#define PICO_STCM_W(ptr, pico_regs...) \
-+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+#define PICO_STCM_D(ptr, pico_regs...) \
-+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+
-+#define PICO_STCM_W_DEC(ptr, pico_regs...) \
-+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+#define PICO_STCM_D_DEC(ptr, pico_regs...) \
-+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+
-+#define PICO_LDCM_W(ptr, pico_regs...) \
-+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+#define PICO_LDCM_D(ptr, pico_regs...) \
-+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
-+
-+#define PICO_LDCM_W_INC(ptr, pico_regs...) \
-+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+#define PICO_LDCM_D_INC(ptr, pico_regs...) \
-+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory");
-+
-+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
-+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
-+
-+
-+#endif
-+
-diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
-index ecd28f5..3221d0c 100644
---- a/libswscale/swscale_internal.h
-+++ b/libswscale/swscale_internal.h
-@@ -173,7 +173,7 @@ typedef struct SwsContext{
- SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
- int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
-
--char *sws_format_name(int format);
-+char *sws_format_name(enum PixelFormat format);
-
- //FIXME replace this with something faster
- #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \
-diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
-index 71759bc..fa83985 100644
---- a/libswscale/yuv2rgb.c
-+++ b/libswscale/yuv2rgb.c
-@@ -44,6 +44,10 @@
- #include "yuv2rgb_mlib.c"
- #endif
-
-+#ifdef ARCH_AVR32
-+#include "yuv2rgb_avr32.c"
-+#endif
-+
- #define DITHER1XBPP // only for mmx
-
- const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
-@@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
- if(t) return t;
- }
- #endif
-+#ifdef ARCH_AVR32
-+ {
-+ SwsFunc t= yuv2rgb_init_avr32(c);
-+ if(t) return t;
-+ }
-+#endif
- #ifdef HAVE_ALTIVEC
- if (c->flags & SWS_CPU_CAPS_ALTIVEC)
- {
-@@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
- //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
- oy -= 256*brightness;
-
-+#ifdef ARCH_AVR32
-+ yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
-+#endif
-+
- for (i = 0; i < 1024; i++) {
- int j;
-
-diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c
-new file mode 100644
-index 0000000..4a8341e
---- /dev/null
-+++ b/libswscale/yuv2rgb_avr32.c
-@@ -0,0 +1,416 @@
-+/*
-+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above
-+ * copyright notice, this list of conditions and the following
-+ * disclaimer in the documentation and/or other materials provided
-+ * with the distribution.
-+ *
-+ * 3. The name of ATMEL may not be used to endorse or promote products
-+ * derived from this software without specific prior written
-+ * permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
-+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
-+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-+ * DAMAGE.
-+ */
-+#include "pico-avr32.h"
-+
-+/* RGB(uv_part): sets the locals r, g and b from c->table_rV / table_gU + table_gV / table_bU, indexed by one part of V or U. uv_part is pasted in as the part selector of the indexed load ("t", "u", "l", "b" at the call sites). Expects c, r, g, b, U and V to be in scope. */
-+#define RGB(uv_part) \
-+ __asm__ volatile ( \
-+ "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \
-+ "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \
-+ "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \
-+ "add\t%1, %0\n\t" /* g += tmp */\
-+ "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \
-+ : "=&r" (r), "=&r" (g), "=&r" (b) \
-+ : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
-+ "r" (&c->table_rV[0]), "r" (V), "r" (U));
-+
-+/* YUV2RGB1/2(dst, src, y, idx): convert the two luma bytes src[2*idx] and src[2*idx+1] through the r, g, b byte tables and store them as R,G,B,R,G,B at dst[6*idx .. 6*idx+5]. Uses the locals r, g, b and tmp of the expansion site; idx must be a compile-time constant ("i" constraint). */
-+#undef YUV2RGB1
-+#define YUV2RGB1(dst, src, y, idx) \
-+ { int tmp2; __asm__ volatile ( \
-+ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (r) */ \
-+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
-+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2] = tmp (b) */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst[6*idx + 3] = tmp (r) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \
-+ "st.b\t%7[6*%8 + 5], %1" /* dst[6*idx + 5] = tmp (b) */ \
-+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
-+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
-+/* YUV2RGB2 has the same body as YUV2RGB1. */
-+#undef YUV2RGB2
-+#define YUV2RGB2(dst, src, y, idx) \
-+ { int tmp2; __asm__ volatile ( \
-+ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (r) */ \
-+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
-+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2] = tmp (b) */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst[6*idx + 3] = tmp (r) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \
-+ "st.b\t%7[6*%8 + 5], %1" /* dst[6*idx + 5] = tmp (b) */ \
-+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
-+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
-+
-+/* YUV2BGR1/2(dst, src, y, idx): same lookups as the RGB macros, but the bytes are stored as B,G,R,B,G,R at dst[6*idx .. 6*idx+5]. Uses the locals r, g, b and tmp of the expansion site; idx must be a compile-time constant ("i" constraint). */
-+#undef YUV2BGR1
-+#define YUV2BGR1(dst, src, y, idx) \
-+ { int tmp2; __asm__ volatile ( \
-+ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2] = tmp (r) */ \
-+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
-+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (b) */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst[6*idx + 5] = tmp (r) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \
-+ "st.b\t%7[6*%8 + 3], %1" /* dst[6*idx + 3] = tmp (b) */ \
-+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
-+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
-+/* YUV2BGR2 has the same body as YUV2BGR1. */
-+#undef YUV2BGR2
-+#define YUV2BGR2(dst, src, y, idx) \
-+ { int tmp2; __asm__ volatile ( \
-+ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2] = tmp (r) */ \
-+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
-+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (b) */ \
-+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
-+ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
-+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst[6*idx + 5] = tmp (r) */ \
-+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
-+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \
-+ "st.b\t%7[6*%8 + 3], %1" /* dst[6*idx + 3] = tmp (b) */ \
-+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
-+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
-+
-+
-+/* Convert a slice of planar YUV to packed 24-bit BGR, two rows and eight pixels per inner iteration. Returns srcSliceH. Note: doubles srcStride[1] and srcStride[2] in place when the source is PIX_FMT_YUV422P. */
-+int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-+ int srcSliceH, uint8_t* dst[], int dstStride[]){
-+ int y;
-+
-+ if(c->srcFormat == PIX_FMT_YUV422P){
-+ srcStride[1] *= 2;
-+ srcStride[2] *= 2;
-+ }
-+
-+ /* each pass handles rows y and y+1, which share one row of chroma */
-+ for(y=0; y<srcSliceH; y+=2){
-+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
-+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
-+ uint8_t *r, *g, *b; /* byte-table bases set by RGB(); uint8_t* to match the ld.ub lookups and yuv2rgb24_avr32 */
-+ uint8_t *py_1= src[0] + y*srcStride[0];
-+ uint8_t *py_2= py_1 + srcStride[0];
-+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
-+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
-+ unsigned int h_size= c->dstW>>3;
-+ while (h_size--) {
-+ uint32_t U, V, Y1, Y2, tmp;
-+ U = ((uint32_t*)pu)[0];
-+ V = ((uint32_t*)pv)[0];
-+
-+ RGB("t")
-+ YUV2BGR1(dst_1, py_1, Y1, 0)
-+ YUV2BGR1(dst_2, py_2, Y2, 0)
-+
-+ RGB("u")
-+ YUV2BGR2(dst_1, py_1, Y1, 1)
-+ YUV2BGR2(dst_2, py_2, Y2, 1)
-+
-+ RGB("l")
-+ YUV2BGR1(dst_1, py_1, Y1, 2)
-+ YUV2BGR1(dst_2, py_2, Y2, 2)
-+
-+ RGB("b")
-+ YUV2BGR2(dst_1, py_1, Y1, 3)
-+ YUV2BGR2(dst_2, py_2, Y2, 3)
-+
-+
-+ /* advance: 4 chroma bytes, 8 luma bytes and 24 output bytes per row */
-+ pu += 4;
-+ pv += 4;
-+ py_1 += 8;
-+ py_2 += 8;
-+ dst_1 += 24;
-+ dst_2 += 24;
-+ }
-+ }
-+ return srcSliceH;
-+}
-+
-+
-+
-+static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-+ int srcSliceH, uint8_t* dst[], int dstStride[]){
-+ int y;
-+
-+ if(c->srcFormat == PIX_FMT_YUV422P){
-+ srcStride[1] *= 2;
-+ srcStride[2] *= 2;
-+ }
-+ for(y=0; y<srcSliceH; y+=2){
-+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
-+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
-+ uint8_t *r, *g, *b;
-+ uint8_t *py_1= src[0] + y*srcStride[0];
-+ uint8_t *py_2= py_1 + srcStride[0];
-+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
-+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
-+ unsigned int h_size= c->dstW>>3;
-+ while (h_size--) {
-+ uint32_t U, V, Y1, Y2, tmp;
-+ U = ((uint32_t*)pu)[0];
-+ V = ((uint32_t*)pv)[0];
-+
-+ RGB("t")
-+ YUV2RGB1(dst_1, py_1, Y1, 0)
-+ YUV2RGB1(dst_2, py_2, Y2, 0)
-+
-+ RGB("u")
-+ YUV2RGB2(dst_1, py_1, Y1, 1)
-+ YUV2RGB2(dst_2, py_2, Y2, 1)
-+
-+ RGB("l")
-+ YUV2RGB1(dst_1, py_1, Y1, 2)
-+ YUV2RGB1(dst_2, py_2, Y2, 2)
-+
-+ RGB("b")
-+ YUV2RGB2(dst_1, py_1, Y1, 3)
-+ YUV2RGB2(dst_2, py_2, Y2, 3)
-+
-+ pu += 4;
-+ pv += 4;
-+ py_1 += 8;
-+ py_2 += 8;
-+ dst_1 += 24;
-+ dst_2 += 24;
-+ }
-+ }
-+ return srcSliceH;
-+}
-+
-+#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits)
-+#define COEFF_FRAC_BITS 9
-+#define OFFSET_FRAC_BITS 2
-+
-+/* Coefficients used in the pico */
-+static struct {
-+ short coeff2_2;
-+ short coeff2_3;
-+ short coeff2_0;
-+ short coeff2_1;
-+ short coeff1_2;
-+ short coeff1_3;
-+ short coeff1_0;
-+ short coeff1_1;
-+ short coeff0_2;
-+ short coeff0_3;
-+ short coeff0_0;
-+ short coeff0_1;
-+} pico_coeff;
-+
-+
-+static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-+ int srcSliceH, uint8_t* dst[], int dstStride[]){
-+ int y;
-+ static int first_time = 1;
-+
-+ /* Initialize pico */
-+ PICO_LDCM_D(&pico_coeff,
-+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
-+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
-+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);
-+
-+ PICO_PUT_W(PICO_CONFIG,
-+ (PICO_PACKED_MODE << PICO_OUTPUT_MODE
-+ | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE
-+ | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
-+ | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
-+
-+
-+ if(c->srcFormat == PIX_FMT_YUV422P){
-+ srcStride[1] *= 2;
-+ srcStride[2] *= 2;
-+ }
-+
-+ for(y=0; y<srcSliceH; y+=2){
-+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
-+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
-+ uint8_t *r, *g, *b;
-+ uint8_t *py_1= src[0] + y*srcStride[0];
-+ uint8_t *py_2= py_1 + srcStride[0];
-+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
-+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
-+ unsigned int h_size= c->dstW>>3;
-+ int *py_1_int = (int *)py_1;
-+ int *py_2_int = (int *)py_2;
-+ int *pu_int = (int *)pu;
-+ int *pv_int = (int *)pv;
-+ while (h_size--) {
-+ PICO_PUT_W(PICO_INPIX0, *py_1_int++);
-+ PICO_PUT_W(PICO_INPIX1, *pu_int++);
-+ PICO_PUT_W(PICO_INPIX2, *pv_int++);
-+ PICO_OP(0, 0, 0, 4, 8);
-+ PICO_OP(0, 1, 1, 4, 8);
-+ PICO_OP(0, 2, 2, 5, 9);
-+ PICO_OP(0, 3, 3, 5, 9);
-+ PICO_PUT_W(PICO_INPIX0, *py_1_int++);
-+ PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
-+ PICO_OP(0, 0, 0, 6, 10);
-+ PICO_OP(0, 1, 1, 6, 10);
-+ PICO_OP(0, 2, 2, 7, 11);
-+ PICO_OP(0, 3, 3, 7, 11);
-+ PICO_PUT_W(PICO_INPIX0, *py_2_int++);
-+ PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
-+
-+ PICO_OP(0, 0, 0, 4, 8);
-+ PICO_OP(0, 1, 1, 4, 8);
-+ PICO_OP(0, 2, 2, 5, 9);
-+ PICO_OP(0, 3, 3, 5, 9);
-+ PICO_PUT_W(PICO_INPIX0, *py_2_int++);
-+ PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
-+ PICO_OP(0, 0, 0, 6, 10);
-+ PICO_OP(0, 1, 1, 6, 10);
-+ PICO_OP(0, 2, 2, 7, 11);
-+ PICO_OP(0, 3, 3, 7, 11);
-+ PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
-+
-+ dst_1 += 24;
-+ dst_2 += 24;
-+ }
-+ }
-+ return srcSliceH;
-+}
-+
-+extern int avr32_use_pico;
-+
-+SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
-+ switch(c->dstFormat){
-+ case PIX_FMT_BGR24:
-+ {
-+ if ( avr32_use_pico ){
-+ MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n");
-+ return yuv2bgr24_avr32_pico;
-+ } else {
-+ MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n");
-+ return yuv2bgr24_avr32;
-+ }
-+ }
-+ break;
-+ case PIX_FMT_RGB24:
-+ {
-+ if ( avr32_use_pico ){
-+ MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n");
-+ return yuv2bgr24_avr32_pico;
-+ } else {
-+ MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n");
-+ return yuv2rgb24_avr32;
-+ }
-+ }
-+ }
-+ return NULL;
-+}
-+
-+
-+int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
-+ const int isRgb = (c->dstFormat == PIX_FMT_RGB24);
-+
-+ int64_t crv = inv_table[0];
-+ int64_t cbu = inv_table[1];
-+ int64_t cgu = -inv_table[2];
-+ int64_t cgv = -inv_table[3];
-+ int64_t cy = 1<<16;
-+ int64_t oy = 0;
-+
-+ if(!fullRange){
-+ cy= (cy*255) / 219;
-+ oy= 16<<16;
-+ }
-+
-+ cy = (cy *contrast )>>16;
-+ crv= (crv*contrast * saturation)>>32;
-+ cbu= (cbu*contrast * saturation)>>32;
-+ cgu= (cgu*contrast * saturation)>>32;
-+ cgv= (cgv*contrast * saturation)>>32;
-+
-+ oy -= 256*brightness;
-+
-+ pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */
-+ pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */
-+ pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */
-+ pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS)
-+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */
-+
-+ if ( isRgb ){
-+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
-+ pico_coeff.coeff0_1 = 0; /* R <- U */
-+ pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
-+ pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
-+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
-+
-+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
-+ pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
-+ pico_coeff.coeff2_2 = 0; /* B <- V */
-+ pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
-+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */
-+ } else {
-+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
-+ pico_coeff.coeff2_1 = 0; /* R <- U */
-+ pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
-+ pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
-+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
-+
-+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
-+ pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
-+ pico_coeff.coeff0_2 = 0; /* B <- V */
-+ pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
-+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */
-+ }
-+
-+}
-+
-+
-+#undef RGB
-diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c
-index 053c193..7017770 100644
---- a/libvo/vo_fbdev2.c
-+++ b/libvo/vo_fbdev2.c
-@@ -22,6 +22,9 @@
- #include "sub.h"
- #include "mp_msg.h"
-
-+/* Draw directly to framebuffer */
-+#define USE_CONVERT2FB
-+
- static vo_info_t info = {
- "Framebuffer Device",
- "fbdev2",
-@@ -178,6 +181,15 @@ static int fb_preinit(int reset)
- }
- fb_orig_vinfo = fb_vinfo;
-
-+ /* Reset panning offset */
-+ fb_vinfo.yoffset = 0;
-+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
-+ mp_msg(MSGT_VO, MSGL_ERR,
-+ "[fbdev2] FBIOPAN_DISPLAY failed: %s\n",
-+ strerror(errno));
-+ return 0;
-+ }
-+
- fb_bpp = fb_vinfo.bits_per_pixel;
-
- /* 16 and 15 bpp is reported as 16 bpp */
-@@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width,
- mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno));
- return 1;
- }
-+#else
-+ if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2)
-+ && fb_vinfo.yoffset == 0)
-+ center += fb_line_len * fb_vinfo.yres;
- #endif
- if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres);
-
-@@ -299,14 +315,22 @@ static int query_format(uint32_t format)
- {
- // open the device, etc.
- if (fb_preinit(0)) return 0;
-- if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) {
-+ if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) {
- int fb_target_bpp = format & 0xff;
- set_bpp(&fb_vinfo, fb_target_bpp);
- fb_vinfo.xres_virtual = fb_vinfo.xres;
-- fb_vinfo.yres_virtual = fb_vinfo.yres;
-+ fb_vinfo.yres_virtual = fb_vinfo.yres * 2;
- if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
-- mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno));
-- return 0;
-+ mp_msg(MSGT_VO, MSGL_WARN,
-+ "[fbdev2] Can't double virtual y resolution: %s\n",
-+ strerror(errno));
-+ fb_vinfo.yres_virtual = fb_vinfo.yres;
-+ if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
-+ mp_msg(MSGT_VO, MSGL_ERR,
-+ "[fbdev2] Can't put VSCREENINFO: %s\n",
-+ strerror(errno));
-+ return -1;
-+ }
- }
- fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
- fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length +
-@@ -367,16 +391,67 @@ static void check_events(void)
-
- static void flip_page(void)
- {
--#ifndef USE_CONVERT2FB
- int i, out_offset = 0, in_offset = 0;
-
-- for (i = 0; i < in_height; i++) {
-- memcpy(center + out_offset, next_frame + in_offset,
-- in_width * fb_pixel_size);
-- out_offset += fb_line_len;
-- in_offset += in_width * fb_pixel_size;
-- }
-+#ifndef USE_CONVERT2FB
-+ if (1) {
-+#else
-+ if (fb_vinfo.yres_virtual == fb_vinfo.yres) {
- #endif
-+ for (i = 0; i < in_height; i++) {
-+ memcpy(center + out_offset, next_frame + in_offset,
-+ in_width * fb_pixel_size);
-+ out_offset += fb_line_len;
-+ in_offset += in_width * fb_pixel_size;
-+ }
-+ } else {
-+ if (fb_vinfo.yoffset == 0) {
-+ fb_vinfo.yoffset += fb_vinfo.yres;
-+ center -= fb_line_len * fb_vinfo.yres;
-+ } else {
-+ fb_vinfo.yoffset = 0;
-+ center += fb_line_len * fb_vinfo.yres;
-+ }
-+
-+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
-+ mp_msg(MSGT_VO, MSGL_ERR,
-+ "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n",
-+ strerror(errno));
-+ }
-+ }
-+}
-+
-+static uint32_t get_image(mp_image_t *mpi)
-+{
-+ if(mpi->flags&MP_IMGFLAG_READABLE)
-+ return VO_FALSE; // slow video ram
-+ if(mpi->type==MP_IMGTYPE_STATIC)
-+ return VO_FALSE; // it is not static
-+
-+ if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) {
-+ // we're lucky or codec accepts stride => ok, let's go!
-+
-+ //YUY2 and RGB formats
-+ mpi->planes[0] = center;
-+ mpi->width = in_width;
-+ mpi->stride[0] = fb_line_len;
-+
-+ // center image
-+
-+ mpi->flags |= MP_IMGFLAG_DIRECT;
-+
-+ return VO_TRUE;
-+ }
-+
-+ return VO_FALSE;
-+}
-+
-+static uint32_t put_image(mp_image_t *mpi)
-+{
-+ // already out?
-+ if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK)))
-+ return VO_TRUE;
-+ return VO_FALSE;
- }
-
- static void uninit(void)
-@@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...)
- switch (request) {
- case VOCTRL_QUERY_FORMAT:
- return query_format(*((uint32_t*)data));
-+ case VOCTRL_GET_IMAGE:
-+ return get_image(data);
-+ case VOCTRL_DRAW_IMAGE:
-+ return put_image(data);
- }
- return VO_NOTIMPL;
- }
-diff --git a/version.sh b/version.sh
-index 44b5c5d..cf22a68 100755
---- a/version.sh
-+++ b/version.sh
-@@ -1,2 +1,2 @@
- #!/bin/sh
--echo "#define VERSION \"1.0rc1-$1\"" > version.h
-+echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h
+++ /dev/null
-diff -urN MPlayer-1.0rc1-0rig/stream/stream_dvb.c MPlayer-1.0rc1/stream/stream_dvb.c
---- MPlayer-1.0rc1-0rig/stream/stream_dvb.c 2006-10-23 00:32:25.000000000 +0200
-+++ MPlayer-1.0rc1/stream/stream_dvb.c 2007-09-25 08:37:54.000000000 +0200
-@@ -37,9 +37,7 @@
- #include <sys/poll.h>
- #include <unistd.h>
- #include <fcntl.h>
--#include <string.h>
- #include <errno.h>
--#include <fcntl.h>
-
- #include "stream.h"
- #include "libmpdemux/demuxer.h"
-@@ -168,7 +166,7 @@
- if((line[0] == '#') || (strlen(line) == 0))
- continue;
-
-- colon = index(line, ':');
-+ colon = strchr(line, ':');
- if(colon)
- {
- k = colon - line;
+++ /dev/null
-#############################################################
-#
-# mplayer
-#
-#############################################################
-MPLAYER_VERSION:=1.0rc1
-MPLAYER_SOURCE:=MPlayer-$(MPLAYER_VERSION).tar.bz2
-MPLAYER_SITE:=http://www7.mplayerhq.hu/MPlayer/releases
-MPLAYER_DIR:=$(BUILD_DIR)/MPlayer-$(MPLAYER_VERSION)
-MPLAYER_CAT:=$(BZCAT)
-MPLAYER_BINARY:=mplayer
-MPLAYER_TARGET_BINARY:=usr/bin/$(MPLAYER_BINARY)
-
-ifeq ($(BR2_ENDIAN),"BIG")
-MPLAYER_ENDIAN:=--enable-big-endian
-else
-MPLAYER_ENDIAN:=--disable-big-endian
-endif
-
-$(DL_DIR)/$(MPLAYER_SOURCE):
- $(WGET) -P $(DL_DIR) $(MPLAYER_SITE)/$(MPLAYER_SOURCE)
-
-$(MPLAYER_DIR)/.unpacked: $(DL_DIR)/$(MPLAYER_SOURCE)
- $(MPLAYER_CAT) $(DL_DIR)/$(MPLAYER_SOURCE) | tar -C $(BUILD_DIR) $(TAR_OPTIONS) -
- toolchain/patch-kernel.sh $(MPLAYER_DIR) package/mplayer/ mplayer-$(MPLAYER_VERSION)\*.patch\*
- $(CONFIG_UPDATE) $(MPLAYER_DIR)
- touch $@
-
-$(MPLAYER_DIR)/.configured: $(MPLAYER_DIR)/.unpacked
- (cd $(MPLAYER_DIR); rm -rf config.cache; \
- $(TARGET_CONFIGURE_OPTS) \
- $(TARGET_CONFIGURE_ARGS) \
- CFLAGS="$(TARGET_CFLAGS)" \
- LDFLAGS="$(TARGET_LDFLAGS)" \
- ./configure \
- --prefix=/usr \
- --confdir=/etc \
- --target=$(GNU_TARGET_NAME) \
- --host-cc=$(HOSTCC) \
- --cc=$(TARGET_CC) \
- --as=$(TARGET_CROSS)as \
- --with-extraincdir=$(STAGING_DIR)/usr/include \
- --with-extralibdir=$(STAGING_DIR)/lib \
- --enable-mad \
- --enable-fbdev \
- $(MPLAYER_ENDIAN) \
- --disable-mpdvdkit \
- --disable-tv \
- --enable-dynamic-plugins \
- )
- touch $@
-
-$(MPLAYER_DIR)/$(MPLAYER_BINARY): $(MPLAYER_DIR)/.configured
- $(MAKE) -C $(MPLAYER_DIR)
- touch -c $@
-
-$(TARGET_DIR)/$(MPLAYER_TARGET_BINARY): $(MPLAYER_DIR)/$(MPLAYER_BINARY)
- $(INSTALL) -m 0755 -D $(MPLAYER_DIR)/$(MPLAYER_BINARY) $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
- -$(STRIPCMD) $(STRIP_STRIP_UNNEEDED) $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
- touch -c $@
-
-mplayer: uclibc libmad $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
-
-mplayer-source: $(DL_DIR)/$(MPLAYER_SOURCE)
-
-mplayer-unpacked: $(MPLAYER_DIR)/.unpacked
-
-mplayer-clean:
- rm -f $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
- -$(MAKE) -C $(MPLAYER_DIR) clean
-
-mplayer-dirclean:
- rm -rf $(MPLAYER_DIR)
-#############################################################
-#
-# Toplevel Makefile options
-#
-#############################################################
-ifeq ($(strip $(BR2_PACKAGE_MPLAYER)),y)
-TARGETS+=mplayer
-endif
source "package/multimedia/libvorbis/Config.in"
source "package/multimedia/madplay/Config.in"
source "package/multimedia/mpg123/Config.in"
+source "package/multimedia/mplayer/Config.in"
source "package/multimedia/speex/Config.in"
source "package/multimedia/festival/Config.in"
+source "package/multimedia/vlc/Config.in"
endmenu
--- /dev/null
+config BR2_PACKAGE_MPLAYER
+ bool "mplayer"
+ select BR2_PACKAGE_LIBMAD
+ help
+ MPlayer is a movie player which runs on many systems and supports
+ many different file formats.
+
+ http://www.mplayerhq.hu/
--- /dev/null
+ cfg-common.h | 4 +
+ cfg-mencoder.h | 4 +
+ cfg-mplayer.h | 4 +
+ configure | 13 +-
+ libaf/af_format.c | 7 +
+ libavcodec/Makefile | 7 +
+ libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
+ libavcodec/avr32/fdct.S | 541 ++++++++
+ libavcodec/avr32/h264idct.S | 451 +++++++
+ libavcodec/avr32/idct.S | 829 ++++++++++++
+ libavcodec/avr32/mc.S | 434 ++++++
+ libavcodec/avr32/pico.h | 260 ++++
+ libavcodec/bitstream.h | 77 +-
+ libavcodec/dsputil.c | 3 +
+ libavcodec/h264.c | 15 +
+ libavutil/common.h | 16 +
+ libavutil/internal.h | 9 +
+ libfaad2/common.h | 2 +-
+ libmpcodecs/ad_libmad.c | 5 +
+ libswscale/pico-avr32.h | 137 ++
+ libswscale/swscale_internal.h | 2 +-
+ libswscale/yuv2rgb.c | 14 +
+ libswscale/yuv2rgb_avr32.c | 416 ++++++
+ libvo/vo_fbdev2.c | 101 ++-
+ version.sh | 2 +-
+ 25 files changed, 6011 insertions(+), 20 deletions(-)
+ create mode 100644 libavcodec/avr32/dsputil_avr32.c
+ create mode 100644 libavcodec/avr32/fdct.S
+ create mode 100644 libavcodec/avr32/h264idct.S
+ create mode 100644 libavcodec/avr32/idct.S
+ create mode 100644 libavcodec/avr32/mc.S
+ create mode 100644 libavcodec/avr32/pico.h
+ create mode 100644 libswscale/pico-avr32.h
+ create mode 100644 libswscale/yuv2rgb_avr32.c
+
+diff --git a/cfg-common.h b/cfg-common.h
+index 780df38..7d878a8 100644
+--- a/cfg-common.h
++++ b/cfg-common.h
+@@ -235,6 +235,10 @@
+ {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
+ {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+
++#ifdef ARCH_AVR32
++ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
++ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
++#endif
+ // draw by slices or whole frame (useful with libmpeg2/libavcodec)
+ {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+diff --git a/cfg-mencoder.h b/cfg-mencoder.h
+index 411b748..addf791 100644
+--- a/cfg-mencoder.h
++++ b/cfg-mencoder.h
+@@ -5,6 +5,10 @@
+
+ #include "cfg-common.h"
+
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ #ifdef USE_FAKE_MONO
+ extern int fakemono; // defined in dec_audio.c
+ #endif
+diff --git a/cfg-mplayer.h b/cfg-mplayer.h
+index 62b6eac..31499c2 100644
+--- a/cfg-mplayer.h
++++ b/cfg-mplayer.h
+@@ -4,6 +4,10 @@
+
+ #include "cfg-common.h"
+
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ extern int noconsolecontrols;
+
+ #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
+diff --git a/configure b/configure
+index 29002c8..56c6fe4 100755
+--- a/configure
++++ b/configure
+@@ -1203,6 +1203,15 @@ EOF
+ _optimizing="$proc"
+ ;;
+
++ avr32)
++ _def_arch='#define ARCH_AVR32'
++ _target_arch='TARGET_ARCH_AVR32 = yes'
++ iproc='avr32'
++ proc=''
++ _march=''
++ _mcpu=''
++ _optimizing=''
++ ;;
+ arm|armv4l|armv5tel)
+ _def_arch='#define ARCH_ARMV4L 1'
+ _target_arch='TARGET_ARCH_ARMV4L = yes'
+@@ -1533,7 +1542,7 @@ echores $_named_asm_args
+ # Checking for CFLAGS
+ _stripbinaries=yes
+ if test "$_profile" != "" || test "$_debug" != "" ; then
+- CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
++ CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
+ if test "$_cc_major" -ge "3" ; then
+ CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
+ fi
+@@ -3794,7 +3803,7 @@ fi
+
+
+ echocheck "X11 headers presence"
+- for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
++ for I in `echo $_inc_extra | sed s/-I//g`; do
+ if test -f "$I/X11/Xlib.h" ; then
+ _inc_x11="-I$I"
+ _x11_headers="yes"
+diff --git a/libaf/af_format.c b/libaf/af_format.c
+index e5b7cc9..5d7ea6d 100644
+--- a/libaf/af_format.c
++++ b/libaf/af_format.c
+@@ -20,7 +20,14 @@
+ // Integer to float conversion through lrintf()
+ #ifdef HAVE_LRINTF
+ #include <math.h>
++
++#ifdef ARCH_AVR32
++#define lrintf(x) rint(x)
++#define llrint(x) (long long)rint(x)
++#else
+ long int lrintf(float);
++#endif
++
+ #else
+ #define lrintf(x) ((int)(x))
+ #endif
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 17b6c45..8e1dc96 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \
+
+ sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
+
++# avr32 specific stuff
++ifeq ($(TARGET_ARCH_AVR32),yes)
++ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
++OBJS += avr32/dsputil_avr32.o
++endif
++
+ # sun mediaLib specific stuff
+ OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \
+
+@@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
+ clean::
+ rm -f \
+ i386/*.o i386/*~ \
++ avr32/*.o avr32/*~ \
+ armv4l/*.o armv4l/*~ \
+ mlib/*.o mlib/*~ \
+ alpha/*.o alpha/*~ \
+diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
+new file mode 100644
+index 0000000..200284d
+--- /dev/null
++++ b/libavcodec/avr32/dsputil_avr32.c
+@@ -0,0 +1,2678 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++#include "../dsputil.h"
++#include "pico.h"
++
++int avr32_use_pico = 1;
++
++//#define CHECK_DSP_FUNCS_AGAINST_C
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define DSP_FUNC_NAME(name) test_ ## name
++#else
++#define DSP_FUNC_NAME(name) name
++#endif
++
++union doubleword {
++ int64_t doubleword;
++ struct {
++ int32_t top;
++ int32_t bottom;
++ } words;
++};
++
++#undef LD16
++#undef LD32
++#undef LD64
++
++#define LD16(a) (*((uint16_t*)(a)))
++#define LD32(a) (*((uint32_t*)(a)))
++#define LD64(a) (*((uint64_t*)(a)))
++#define LD64_UNALIGNED(a) /* 64-bit value assembled from two 32-bit loads, at (a) and (a) + 4 */ \
++    ({ union doubleword __tmp__; \
++       __tmp__.words.top = LD32(a); \
++       __tmp__.words.bottom = LD32((a) + 4); /* (a) parenthesized so expression arguments group correctly */ \
++       __tmp__.doubleword; })
++
++#undef ST32
++#undef ST16
++
++#define ST16(a, b) *((uint16_t*)(a)) = (b)
++#define ST32(a, b) *((uint32_t*)(a)) = (b)
++
++#undef rnd_avg32
++#define rnd_avg32(a, b) \
++ ({ uint32_t __tmp__;\
++ asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
++ __tmp__;})
++
++void idct_avr32(DCTELEM *data);
++void fdct_avr32(DCTELEM *data);
++
++void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++
++void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++
++#define extern_dspfunc(PFX, NUM) \
++ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++#undef extern_dspfunc
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define extern_dspfunc(PFX, NUM) \
++ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 4);
++extern_dspfunc(put_no_rnd, 4);
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(put, 16);
++extern_dspfunc(put_no_rnd, 16);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++extern_dspfunc(avg, 16);
++extern_dspfunc(avg_no_rnd, 16);
++
++
++#undef extern_dspfunc
++#define extern_dspfunc(PFX, NUM) \
++void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \
++
++extern_dspfunc(put_h264_qpel, 16);
++extern_dspfunc(put_h264_qpel, 8);
++extern_dspfunc(put_h264_qpel, 4);
++extern_dspfunc(avg_h264_qpel, 16);
++extern_dspfunc(avg_h264_qpel, 8);
++extern_dspfunc(avg_h264_qpel, 4);
++
++#undef extern_dspfunc
++
++void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++
++void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++
++
++void dump_block8(uint8_t *block, int line_size, int h);
++void dump_block4(uint8_t *block, int line_size, int h);
++void dump_block(uint8_t *block, int line_size, int h, int w);
++
++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, char *name, int max_dev);
++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, char *name, int max_dev);
++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, int width, char *name, int max_dev);
++
++#define PIXOP2( OPNAME, OP ) \
++void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ a= LD32(&src1[i*src_stride1+4]);\
++ b= LD32(&src2[i*src_stride2+4]);\
++ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++ }\
++}\
++\
++void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ }\
++}\
++\
++void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
++ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#else
++#define PIXOP2( OPNAME, OP ) \
++static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++ OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
++ OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ a= LD32(&src1[i*src_stride1+4]);\
++ b= LD32(&src2[i*src_stride2+4]);\
++ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++ }\
++}\
++\
++static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ }\
++}\
++\
++static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
++ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#endif
++
++#define op_avg(a, b) a = rnd_avg32(a, b)
++#define op_put(a, b) a = b
++
++PIXOP2(avg, op_avg)
++PIXOP2(put, op_put)
++#undef op_avg
++#undef op_put
++
++
++
++/* Copy h rows of 4 bytes from src to dst; each pointer advances by its own stride. */
++static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++    int rows_left;
++    for (rows_left = h; rows_left > 0; rows_left--) {
++        ST32(dst, LD32(src));   /* one 32-bit word per row */
++        dst += dstStride;
++        src += srcStride;
++    }
++}
++
++/* Zero the 6*64 DCTELEM entries that start at blocks, using multi-register stores. */
++static void clear_blocks_avr32(DCTELEM *blocks)
++{
++    int n = 12;            /* 12 passes x 4 stm x 4 registers; covers 6*64 entries assuming 16-bit DCTELEM and 32-bit registers - TODO confirm */
++    uint64_t tmp1, tmp2;   /* two 64-bit register pairs, zeroed inside the asm */
++    blocks += 6*64;        /* start one past the end; every stm pre-decrements the pointer */
++    asm volatile ( "mov\t%1, 0\n"
++                   "mov\t%m1, 0\n"
++                   "mov\t%2, 0\n"
++                   "mov\t%m2, 0\n"
++                   "0:\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "sub\t%0, 1\n"
++                   "brne\t0b\n"
++                   : "+r"(n), "=&r"(tmp1), "=&r"(tmp2), "+r"(blocks) : : "memory"); /* clobber: the asm writes memory through blocks */
++}
++
++
++/* Copy h rows of 8 bytes from src to dst as two 32-bit words per row. */
++static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++    int rows_left;
++    for (rows_left = h; rows_left > 0; rows_left--) {
++        ST32(dst,     LD32(src));
++        ST32(dst + 4, LD32(src + 4));
++        dst += dstStride;
++        src += srcStride;
++    }
++}
++
++/* Copy h rows of 16 bytes from src to dst as four 32-bit words per row. */
++static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++    int rows_left;
++    for (rows_left = h; rows_left > 0; rows_left--) {
++        ST32(dst,      LD32(src));
++        ST32(dst + 4,  LD32(src + 4));
++        ST32(dst + 8,  LD32(src + 8));
++        ST32(dst + 12, LD32(src + 12));
++        dst += dstStride;
++        src += srcStride;
++    }
++}
++
++
++static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++ const int A=(8-x)*(8-y); /* A..D: weights of the four neighbouring source pixels; they always sum to 64 */
++ const int B=( x)*(8-y);
++ const int C=(8-x)*( y);
++ const int D=( x)*( y);
++ int i;
++ /* Writes two weighted pixels per row for h rows; the arithmetic itself is done through the PICO_* macros. */
++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the upper half-word, B in the lower */
++ PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (32 = one half at 6 fractional bits) -- TODO confirm */
++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the upper half-word, D in the lower */
++ PICO_PUT_W(PICO_COEFF1_B, 0);
++ PICO_PUT_W(PICO_COEFF2_A, 0);
++ PICO_PUT_W(PICO_COEFF2_B, 0);
++ PICO_PUT_W(PICO_CONFIG,
++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++ | PICO_COEFF_FRAC_BITS(6)
++ | PICO_OFFSET_FRAC_BITS(6));
++ /* src and dst both advance by the same stride after every row. */
++ for(i=0; i<h; i++)
++ {
++ /* One 32-bit load from the current row and one from the row below it. */
++ int src0 = LD32(src);
++ int src1 = LD32(src + stride);
++ /* Hand both words to the PICO input registers, then issue the two operations. */
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++ src += stride;
++ ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0)); /* only the low 16 bits of OUTPIX0 are stored */
++ dst += stride;
++ }
++}
++
++
++static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++    const int A=(8-x)*(8-y); /* A..D: weights of the four neighbouring source pixels; they always sum to 64 */
++    const int B=(  x)*(8-y);
++    const int C=(8-x)*(  y);
++    const int D=(  x)*(  y);
++    int i;
++    /* Writes four weighted pixels per row for h rows; the arithmetic itself is done through the PICO_* macros. */
++    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the upper half-word, B in the lower */
++    PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (32 = one half at 6 fractional bits) -- TODO confirm */
++    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the upper half-word, D in the lower */
++    PICO_PUT_W(PICO_COEFF1_B, 0);
++    PICO_PUT_W(PICO_COEFF2_A, 0);
++    PICO_PUT_W(PICO_COEFF2_B, 0);
++    PICO_PUT_W(PICO_CONFIG,
++               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++               | PICO_COEFF_FRAC_BITS(6)
++               | PICO_OFFSET_FRAC_BITS(6));
++    /* src and dst both advance by the same stride after every row. */
++    for(i=0; i<h; i++)
++    {
++        /*
++          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++          dst+= stride;
++          src+= stride;
++        */
++        /* src0 = row pixels 0..3; src1 = pixel 4 (top byte) and next-row pixel 0 (low byte); src2 = next-row pixels 1..4. */
++        int src0 = LD32(src);
++        int src1 = (((int)src[4] << 24) | (int)src[stride]);
++        int src2 = LD32(src + stride + 1);
++        /* Hand the three words to the PICO input registers, then issue one operation per output pixel. */
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        src += stride;
++        ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++
++        dst += stride;
++    }
++}
++
++static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++ const int A=(8-x)*(8-y); /* A..D: weights of the four neighbouring source pixels; they always sum to 64 */
++ const int B=( x)*(8-y);
++ const int C=(8-x)*( y);
++ const int D=( x)*( y);
++ int i;
++ /* Writes eight weighted pixels per row for h rows, as two groups of four; arithmetic goes through the PICO_* macros. */
++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the upper half-word, B in the lower */
++ PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (32 = one half at 6 fractional bits) -- TODO confirm */
++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the upper half-word, D in the lower */
++ PICO_PUT_W(PICO_COEFF1_B, 0);
++ PICO_PUT_W(PICO_COEFF2_A, 0);
++ PICO_PUT_W(PICO_COEFF2_B, 0);
++ PICO_PUT_W(PICO_CONFIG,
++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++ | PICO_COEFF_FRAC_BITS(6)
++ | PICO_OFFSET_FRAC_BITS(6));
++ /* src and dst both advance by the same stride after every row. */
++ for(i=0; i<h; i++)
++ {
++ /*
++ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++ dst+= stride;
++ src+= stride;
++ */
++ int src0 = LD32(src); /* left half: row pixels 0..3 */
++ int src1 = (((int)src[4] << 24) | (int)src[stride]); /* pixel 4 in the top byte, next-row pixel 0 in the low byte */
++ int src2 = LD32(src + stride + 1); /* next-row pixels 1..4 */
++ /* Left half: load the PICO input registers and issue one operation per output pixel. */
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++ /* Right half: the same packing, shifted four pixels to the right. */
++ src0 = LD32(src + 4);
++ src1 = (src[8] << 24) | src[stride + 4];
++ src2 = LD32(src + stride + 5);
++
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++ src += stride;
++ ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
++
++ dst += stride;
++ }
++}
++
++
++static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++ const int A=(8-x)*(8-y); /* A..D: weights of the four neighbouring source pixels; they always sum to 64 */
++ const int B=( x)*(8-y);
++ const int C=(8-x)*( y);
++ const int D=( x)*( y);
++ int i;
++ /* Same per-row computation as put_h264_chroma_mc2_pico, but the result is combined with dst through rnd_avg32(). */
++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the upper half-word, B in the lower */
++ PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (32 = one half at 6 fractional bits) -- TODO confirm */
++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the upper half-word, D in the lower */
++ PICO_PUT_W(PICO_COEFF1_B, 0);
++ PICO_PUT_W(PICO_COEFF2_A, 0);
++ PICO_PUT_W(PICO_COEFF2_B, 0);
++ PICO_PUT_W(PICO_CONFIG,
++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++ | PICO_COEFF_FRAC_BITS(6)
++ | PICO_OFFSET_FRAC_BITS(6));
++ /* src and dst both advance by the same stride after every row. */
++ for(i=0; i<h; i++)
++ {
++ int src0 = LD32(src); /* four bytes of the current row */
++ int src1 = LD32(src + stride); /* four bytes of the row below */
++ /* Hand both words to the PICO input registers, then issue the two operations. */
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++ src += stride;
++ ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0))); /* read-modify-write of two dst bytes */
++ dst += stride;
++ }
++}
++
++
++static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++    const int A=(8-x)*(8-y); /* A..D: weights of the four neighbouring source pixels; they always sum to 64 */
++    const int B=(  x)*(8-y);
++    const int C=(8-x)*(  y);
++    const int D=(  x)*(  y);
++    int i;
++    /* Same per-row computation as put_h264_chroma_mc4_pico, but the result is combined with dst through rnd_avg32(). */
++    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the upper half-word, B in the lower */
++    PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (32 = one half at 6 fractional bits) -- TODO confirm */
++    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the upper half-word, D in the lower */
++    PICO_PUT_W(PICO_COEFF1_B, 0);
++    PICO_PUT_W(PICO_COEFF2_A, 0);
++    PICO_PUT_W(PICO_COEFF2_B, 0);
++    PICO_PUT_W(PICO_CONFIG,
++               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++               | PICO_COEFF_FRAC_BITS(6)
++               | PICO_OFFSET_FRAC_BITS(6));
++    /* src and dst both advance by the same stride after every row. */
++    for(i=0; i<h; i++)
++    {
++        /*
++          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++          dst+= stride;
++          src+= stride;
++        */
++        /* src is only byte-aligned, so the words are fetched with LD32 exactly as in the put variant. */
++        int src0 = LD32(src);
++        int src1 = (int)((src[4] << 24) | src[stride]); /* pixel 4 in the top byte, next-row pixel 0 in the low byte */
++        int src2 = LD32(src + stride + 1);
++        /* Hand the three words to the PICO input registers, then issue one operation per output pixel. */
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        src += stride;
++        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
++        dst += stride;
++    }
++}
++
++static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++    const int A=(8-x)*(8-y); /* A..D: weights of the four neighbouring source pixels; they always sum to 64 */
++    const int B=(  x)*(8-y);
++    const int C=(8-x)*(  y);
++    const int D=(  x)*(  y);
++    int i;
++    /* Same per-row computation as put_h264_chroma_mc8_pico, but each half is combined with dst through rnd_avg32(). */
++    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the upper half-word, B in the lower */
++    PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (32 = one half at 6 fractional bits) -- TODO confirm */
++    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the upper half-word, D in the lower */
++    PICO_PUT_W(PICO_COEFF1_B, 0);
++    PICO_PUT_W(PICO_COEFF2_A, 0);
++    PICO_PUT_W(PICO_COEFF2_B, 0);
++    PICO_PUT_W(PICO_CONFIG,
++               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++               | PICO_COEFF_FRAC_BITS(6)
++               | PICO_OFFSET_FRAC_BITS(6));
++    /* src and dst both advance by the same stride after every row. */
++    for(i=0; i<h; i++)
++    {
++        /*
++          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++          OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++          OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++          OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++          OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++          dst+= stride;
++          src+= stride;
++        */
++        int src0 = LD32(src); /* src is only byte-aligned: fetch through LD32 as the put variant does */
++        int src1 = (int)((src[4] << 24) | src[stride]); /* pixel 4 in the top byte, next-row pixel 0 in the low byte */
++        int src2 = LD32(src + stride + 1);
++        /* Left half: load the PICO input registers and issue one operation per output pixel. */
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
++        /* Right half: the same packing, shifted four pixels to the right. */
++        src0 = LD32(src + 4);
++        src1 = (int)((src[8] << 24) | src[stride + 4]);
++        src2 = LD32(src + stride + 5);
++
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        src += stride;
++        ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
++        dst += stride;
++    }
++}
++
++static struct pico_config_t h264_qpel4_h_lowpass_config = { /* PICO setup passed to set_pico_config() by put/avg_h264_qpel4_h_lowpass_pico */
++ .input_mode = PICO_HOR_FILTER_MODE,
++ .output_mode = PICO_PLANAR_MODE,
++ .coeff_frac_bits = 5,
++ .offset_frac_bits = 5,
++ .coeff0_0 = 1, /* coeff0_0..2 = 1, -5, 20: first three taps of the (1,-5,20,20,-5,1) filter */
++ .coeff0_1 = -5,
++ .coeff0_2 = 20,
++ .coeff0_3 = 16, /* presumably the rounding offset (16 = one half at 5 fractional bits) -- TODO confirm */
++ .coeff1_0 = 20, /* coeff1_0..2 = 20, -5, 1: last three taps of the filter */
++ .coeff1_1 = -5,
++ .coeff1_2 = 1,
++ .coeff1_3 = 0,
++ .coeff2_0 = 0, /* third coefficient set is all zero */
++ .coeff2_1 = 0,
++ .coeff2_2 = 0,
++ .coeff2_3 = 0
++};
++
++
++
++static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ const int h=4; /* a 4x4 block: four rows, four output pixels per row */
++ int i;
++ /* Horizontal 6-tap filtering of four rows; results overwrite dst. The commented code below is the scalar reference. */
++ set_pico_config(&h264_qpel4_h_lowpass_config);
++
++ for(i=0; i<h; i++){
++
++ /*
++ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++ dst+=dstStride;\
++ src+=srcStride;\ */
++ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); /* bytes src[-2..1] */
++ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); /* eight bytes starting at src[2], loaded from INPIX2 on -- register pairing of the _D move: TODO confirm */
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); /* one operation per output pixel 0..3 */
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++ src += srcStride;
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* four result bytes written at once */
++ dst += dstStride;
++ }
++}
++
++static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ const int h=4; /* a 4x4 block: four rows, four output pixels per row */
++ int i;
++ /* Same filtering as put_h264_qpel4_h_lowpass_pico, but each row is combined with dst through rnd_avg32(). */
++ set_pico_config(&h264_qpel4_h_lowpass_config);
++
++ for(i=0; i<h; i++){
++
++ /*
++ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++ dst+=dstStride;\
++ src+=srcStride;\ */
++
++ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); /* bytes src[-2..1] */
++ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); /* eight bytes starting at src[2], loaded from INPIX2 on -- register pairing of the _D move: TODO confirm */
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); /* one operation per output pixel 0..3 */
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++ src += srcStride;
++ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); /* read-modify-write of four dst bytes */
++ dst += dstStride;
++ }
++}
++
++static struct pico_config_t h264_qpel4_v_lowpass_config1 = { /* first-pass setup of put/avg_h264_qpel4_v_lowpass_pico */
++ .input_mode = PICO_VERT_FILTER_MODE,
++ .output_mode = PICO_PACKED_MODE,
++ .coeff_frac_bits = 5,
++ .offset_frac_bits = 5,
++ .coeff0_0 = 1, /* all three coefficient sets carry the same values: 1, -5, 20 and 16 */
++ .coeff0_1 = -5,
++ .coeff0_2 = 20,
++ .coeff0_3 = 16, /* presumably the rounding offset (16 = one half at 5 fractional bits) -- TODO confirm */
++ .coeff1_0 = 1,
++ .coeff1_1 = -5,
++ .coeff1_2 = 20,
++ .coeff1_3 = 16,
++ .coeff2_0 = 1,
++ .coeff2_1 = -5,
++ .coeff2_2 = 20,
++ .coeff2_3 = 16
++};
++
++
++
++static struct pico_config_t h264_qpel4_v_lowpass_config2 = { /* second-pass ("last column") setup of put/avg_h264_qpel4_v_lowpass_pico */
++ .input_mode = PICO_VERT_FILTER_MODE,
++ .output_mode = PICO_PLANAR_MODE,
++ .coeff_frac_bits = 5,
++ .offset_frac_bits = 5,
++ .coeff0_0 = 1, /* coeff0_0..2 = 1, -5, 20: first three taps of the (1,-5,20,20,-5,1) filter */
++ .coeff0_1 = -5,
++ .coeff0_2 = 20,
++ .coeff0_3 = 16, /* presumably the rounding offset (16 = one half at 5 fractional bits) -- TODO confirm */
++ .coeff1_0 = 20, /* coeff1_0..2 = 20, -5, 1: last three taps of the filter */
++ .coeff1_1 = -5,
++ .coeff1_2 = 1,
++ .coeff1_3 = 0,
++ .coeff2_0 = 0, /* third coefficient set is all zero */
++ .coeff2_1 = 0,
++ .coeff2_2 = 0,
++ .coeff2_3 = 0
++};
++
++static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ /* Vertical 6-tap filtering of a 4x4 block; results overwrite dst. The commented code below is the scalar reference. */
++ /*
++ const int w=4;
++ uint8_t *cm = cropTbl + MAX_NEG_CROP;
++ int i;
++ for(i=0; i<w; i++)
++ {
++ const int srcB= src[-2*srcStride];\
++ const int srcA= src[-1*srcStride];\
++ const int src0= src[0 *srcStride];\
++ const int src1= src[1 *srcStride];\
++ const int src2= src[2 *srcStride];\
++ const int src3= src[3 *srcStride];\
++ const int src4= src[4 *srcStride];\
++ const int src5= src[5 *srcStride];\
++ const int src6= src[6 *srcStride];\
++ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
++ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
++ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
++ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
++ dst++;\
++ src++;\
++ */
++
++ set_pico_config(&h264_qpel4_v_lowpass_config1);
++
++ {
++ int srcB= LD32(src - 2*srcStride); /* nine source rows, four bytes each: two above the block up to three below it */
++ int srcA= LD32(src - 1*srcStride);
++ int src0= LD32(src + 0 *srcStride);
++ int src1= LD32(src + 1 *srcStride);
++ int src2= LD32(src + 2 *srcStride);
++ int src3= LD32(src + 3 *srcStride);
++ int src4= LD32(src + 4 *srcStride);
++ int src5= LD32(src + 5 *srcStride);
++ int src6= LD32(src + 6 *srcStride);
++ /* Per output row: three rows go in, PICO_OP runs, the next three rows go in reversed register order, PICO_OP accumulates. */
++ /* First compute the leftmost three columns */
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 0 */
++ dst += dstStride;
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_MVRC_W(PICO_INPIX1, src3);
++ PICO_MVRC_W(PICO_INPIX0, src4);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 1 */
++ dst += dstStride;
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_MVRC_W(PICO_INPIX1, src4);
++ PICO_MVRC_W(PICO_INPIX0, src5);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 2 */
++ dst += dstStride;
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src4);
++ PICO_MVRC_W(PICO_INPIX1, src5);
++ PICO_MVRC_W(PICO_INPIX0, src6);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* output row 3; dst now stays on this row */
++ /* Now compute the last column */
++
++ union wordbytes {
++ int word;
++ struct {
++ unsigned int t:8;
++ unsigned int u:8;
++ unsigned int l:8;
++ unsigned int b:8;
++ } bytes; } tmp1, tmp2, tmp3;
++ /* Each assignment below keeps only the low 8 bits of a row word; which byte of .word a field maps to depends on the ABI's bit-field layout. */
++ /* NOTE(review): bytes.b is never assigned before .word is read -- confirm the PICO operations below do not depend on it. */
++ tmp1.bytes.t = srcB;
++ tmp1.bytes.u = src1;
++ tmp1.bytes.l = src4;
++
++ tmp2.bytes.t = srcA;
++ tmp2.bytes.u = src2;
++ tmp2.bytes.l = src5;
++
++ tmp3.bytes.t = src0;
++ tmp3.bytes.u = src3;
++ tmp3.bytes.l = src6;
++
++ PICO_MVRC_W(PICO_INPIX0, tmp1.word);
++ PICO_MVRC_W(PICO_INPIX1, tmp2.word);
++ PICO_MVRC_W(PICO_INPIX2, tmp3.word);
++ set_pico_config(&h264_qpel4_v_lowpass_config2);
++
++ /* One operation per output pixel of the last column. */
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++ /* dst was advanced three rows above, so negative row offsets address output rows 2, 1 and 0. */
++ PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
++ dst[3] = (char)(tmp1.bytes.b);
++ dst[3 - dstStride] = (char)(tmp1.bytes.l);
++ dst[3 - 2*dstStride] = (char)(tmp1.bytes.u);
++ dst[3 - 3*dstStride] = (char)(tmp1.bytes.t);
++
++ }
++ /*}
++
++
++ }*/
++}
++
++static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ /* Vertical 6-tap filtering of a 4x4 block into a scratch buffer, which is then combined with dst through rnd_avg32(). */
++ /*
++ const int w=4;
++ uint8_t *cm = cropTbl + MAX_NEG_CROP;
++ int i;
++ for(i=0; i<w; i++)
++ {
++ const int srcB= src[-2*srcStride];\
++ const int srcA= src[-1*srcStride];\
++ const int src0= src[0 *srcStride];\
++ const int src1= src[1 *srcStride];\
++ const int src2= src[2 *srcStride];\
++ const int src3= src[3 *srcStride];\
++ const int src4= src[4 *srcStride];\
++ const int src5= src[5 *srcStride];\
++ const int src6= src[6 *srcStride];\
++ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
++ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
++ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
++ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
++ dst++;\
++ src++;\
++ */
++ uint8_t tmp_block[4*4]; /* filtered 4x4 result, rows stored 4 bytes apart */
++
++ set_pico_config(&h264_qpel4_v_lowpass_config1);
++
++ {
++ int srcB= LD32(src - 2*srcStride); /* nine source rows, four bytes each: two above the block up to three below it */
++ int srcA= LD32(src - 1*srcStride);
++ int src0= LD32(src + 0 *srcStride);
++ int src1= LD32(src + 1 *srcStride);
++ int src2= LD32(src + 2 *srcStride);
++ int src3= LD32(src + 3 *srcStride);
++ int src4= LD32(src + 4 *srcStride);
++ int src5= LD32(src + 5 *srcStride);
++ int src6= LD32(src + 6 *srcStride);
++ /* Per output row: three rows go in, PICO_OP runs, the next three rows go in reversed register order, PICO_OP accumulates. */
++ /* First compute the leftmost three columns */
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0)); /* scratch row 0 */
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_MVRC_W(PICO_INPIX1, src3);
++ PICO_MVRC_W(PICO_INPIX0, src4);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0)); /* scratch row 1 */
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_MVRC_W(PICO_INPIX1, src4);
++ PICO_MVRC_W(PICO_INPIX0, src5);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0)); /* scratch row 2 */
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src4);
++ PICO_MVRC_W(PICO_INPIX1, src5);
++ PICO_MVRC_W(PICO_INPIX0, src6);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0)); /* scratch row 3 */
++ /* Now compute the last column */
++
++ union wordbytes {
++ int word;
++ struct {
++ unsigned int t:8;
++ unsigned int u:8;
++ unsigned int l:8;
++ unsigned int b:8;
++ } bytes; } tmp1, tmp2, tmp3;
++ /* Each assignment below keeps only the low 8 bits of a row word; which byte of .word a field maps to depends on the ABI's bit-field layout. */
++ /* NOTE(review): bytes.b is never assigned before .word is read -- confirm the PICO operations below do not depend on it. */
++ tmp1.bytes.t = srcB;
++ tmp1.bytes.u = src1;
++ tmp1.bytes.l = src4;
++
++ tmp2.bytes.t = srcA;
++ tmp2.bytes.u = src2;
++ tmp2.bytes.l = src5;
++
++ tmp3.bytes.t = src0;
++ tmp3.bytes.u = src3;
++ tmp3.bytes.l = src6;
++
++ PICO_MVRC_W(PICO_INPIX0, tmp1.word);
++ PICO_MVRC_W(PICO_INPIX1, tmp2.word);
++ PICO_MVRC_W(PICO_INPIX2, tmp3.word);
++ set_pico_config(&h264_qpel4_v_lowpass_config2);
++
++ /* One operation per output pixel of the last column. */
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++ /* Patch column 3 of each scratch row with the bytes of the second-pass result. */
++ PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
++ tmp_block[3 + 3*4] = (char)(tmp1.bytes.b);
++ tmp_block[3 + 2*4] = (char)(tmp1.bytes.l);
++ tmp_block[3 + 1*4] = (char)(tmp1.bytes.u);
++ tmp_block[3] = (char)(tmp1.bytes.t);
++
++ /* Compute the average */
++ srcB= LD32(dst); /* srcB, srcA, src0, src1 are reused for the four existing dst rows */
++ srcA= LD32(dst + dstStride);
++ src0= LD32(dst + dstStride*2);
++ src1= LD32(dst + dstStride*3);
++
++ src2= LD32(tmp_block); /* src2..src5 are reused for the four filtered rows */
++ src3= LD32(tmp_block + 4);
++ src4= LD32(tmp_block + 8);
++ src5= LD32(tmp_block + 12);
++
++ ST32(dst, rnd_avg32(srcB, src2));
++ ST32(dst + dstStride, rnd_avg32(srcA, src3));
++ ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
++ ST32(dst + 3*dstStride, rnd_avg32(src1, src5));
++ }
++}
++
++static struct pico_config_t h264_qpel4_hv_lowpass_config = { /* first-pass setup passed to set_pico_config() by put/avg_h264_qpel4_hv_lowpass_pico */
++ .input_mode = PICO_HOR_FILTER_MODE,
++ .output_mode = PICO_PACKED_MODE,
++ .coeff_frac_bits = 10,
++ .offset_frac_bits = 10,
++ .coeff0_0 = 1, /* coeff0_0..2 = (1, -5, 20) */
++ .coeff0_1 = -5,
++ .coeff0_2 = 20,
++ .coeff0_3 = 512, /* presumably the rounding offset (512 = one half at 10 fractional bits) -- TODO confirm */
++ .coeff1_0 = -5, /* coeff1_0..2 = -5 * (1, -5, 20) */
++ .coeff1_1 = 25,
++ .coeff1_2 = -100,
++ .coeff1_3 = 0,
++ .coeff2_0 = 20, /* coeff2_0..2 = 20 * (1, -5, 20) */
++ .coeff2_1 = -100,
++ .coeff2_2 = 400,
++ .coeff2_3 = 0
++};
++
++static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ /* Two-pass filtering of a 4x4 block: pass 1 saves PICO register triples to tmp_block, pass 2 reloads them and writes dst. */
++ int32_t tmp_block[48]; /* 2 iterations x 8 stores x 3 words */
++ int32_t *tmp = tmp_block;
++ int i;
++
++ set_pico_config(&h264_qpel4_hv_lowpass_config);
++
++ src -= 2;
++ for ( i = 0; i < 2; i++ ){
++ int srcB= LD32(src - 2*srcStride); /* nine source rows, four bytes each */
++ int srcA= LD32(src - 1*srcStride);
++ int src0= LD32(src + 0 *srcStride);
++ int src1= LD32(src + 1 *srcStride);
++ int src2= LD32(src + 2 *srcStride);
++ int src3= LD32(src + 3 *srcStride);
++ int src4= LD32(src + 4 *srcStride);
++ int src5= LD32(src + 5 *srcStride);
++ int src6= LD32(src + 6 *srcStride);
++ /* Each group below ends by storing the three VMUx_OUT registers to tmp and advancing tmp by three words. */
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(0, 0, 0, 4, 8);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_OP(0, 0, 4, 8, 0);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_MVRC_W(PICO_INPIX1, src3);
++ PICO_MVRC_W(PICO_INPIX0, src4);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_MVRC_W(PICO_INPIX0, src2);
++ PICO_OP(0, 0, 4, 8, 0);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_MVRC_W(PICO_INPIX1, src4);
++ PICO_MVRC_W(PICO_INPIX0, src5);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(0, 0, 4, 8, 0);
++ PICO_MVRC_W(PICO_INPIX2, src4);
++ PICO_MVRC_W(PICO_INPIX1, src5);
++ PICO_MVRC_W(PICO_INPIX0, src6);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++ src += 2;
++ }
++ /* After the loop src is 2 past its entry value and tmp is 48 words in; reposition both for the second pass. */
++ src -= 1;
++ tmp -= 48;
++
++ /* Only the mode bits of PICO_CONFIG are rewritten here: vertical filter input, planar output, 10 fractional bits. */
++ PICO_PUT_W(PICO_CONFIG,
++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
++ | PICO_COEFF_FRAC_BITS(10)
++ | PICO_OFFSET_FRAC_BITS(10));
++
++ for ( i = 0; i < 2; i++ ){
++ int srcB= LD32(src - 2*srcStride); /* nine source rows again, from the repositioned src */
++ int srcA= LD32(src - 1*srcStride);
++ int src0= LD32(src + 0 *srcStride);
++ int src1= LD32(src + 1 *srcStride);
++ int src2= LD32(src + 2 *srcStride);
++ int src3= LD32(src + 3 *srcStride);
++ int src4= LD32(src + 4 *srcStride);
++ int src5= LD32(src + 5 *srcStride);
++ int src6= LD32(src + 6 *srcStride);
++
++ /* Each group reloads one saved register triple from tmp (presumably post-incrementing tmp -- TODO confirm) and keeps accumulating. */
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_MVRC_W(PICO_INPIX1, src3);
++ PICO_MVRC_W(PICO_INPIX0, src4);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++ /* Upper half of OUTPIX0 goes to output row 0, lower half to output row 1 (two pixels each). */
++ ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
++ ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
++
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_MVRC_W(PICO_INPIX1, src4);
++ PICO_MVRC_W(PICO_INPIX0, src5);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src4);
++ PICO_MVRC_W(PICO_INPIX1, src5);
++ PICO_MVRC_W(PICO_INPIX0, src6);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++ /* Upper half of OUTPIX0 goes to output row 2, lower half to output row 3 (two pixels each). */
++ ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
++ ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
++
++ dst += 2; /* each iteration produces a 2-column by 4-row piece of the block */
++ src += 2;
++ }
++}
++
++
++
++
++static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++
++ int32_t tmp_block[48];
++ int32_t *tmp = tmp_block;
++ int i;
++
++ set_pico_config(&h264_qpel4_hv_lowpass_config);
++
++ src -= 2;
++ for ( i = 0; i < 2; i++ ){
++ int srcB= LD32(src - 2*srcStride);
++ int srcA= LD32(src - 1*srcStride);
++ int src0= LD32(src + 0 *srcStride);
++ int src1= LD32(src + 1 *srcStride);
++ int src2= LD32(src + 2 *srcStride);
++ int src3= LD32(src + 3 *srcStride);
++ int src4= LD32(src + 4 *srcStride);
++ int src5= LD32(src + 5 *srcStride);
++ int src6= LD32(src + 6 *srcStride);
++
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(0, 0, 0, 4, 8);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_OP(0, 0, 4, 8, 0);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_MVRC_W(PICO_INPIX1, src3);
++ PICO_MVRC_W(PICO_INPIX0, src4);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_MVRC_W(PICO_INPIX0, src2);
++ PICO_OP(0, 0, 4, 8, 0);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_MVRC_W(PICO_INPIX1, src4);
++ PICO_MVRC_W(PICO_INPIX0, src5);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(0, 0, 4, 8, 0);
++ PICO_MVRC_W(PICO_INPIX2, src4);
++ PICO_MVRC_W(PICO_INPIX1, src5);
++ PICO_MVRC_W(PICO_INPIX0, src6);
++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++
++ PICO_OP(0, 0, 1, 5, 9);
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++ PICO_STCM_W(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ tmp += 3;
++ src += 2;
++ }
++
++ src -= 1;
++ tmp -= 48;
++
++
++ PICO_PUT_W(PICO_CONFIG,
++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
++ | PICO_COEFF_FRAC_BITS(10)
++ | PICO_OFFSET_FRAC_BITS(10));
++
++ for ( i = 0; i < 2; i++ ){
++ int srcB= LD32(src - 2*srcStride);
++ int srcA= LD32(src - 1*srcStride);
++ int src0= LD32(src + 0 *srcStride);
++ int src1= LD32(src + 1 *srcStride);
++ int src2= LD32(src + 2 *srcStride);
++ int src3= LD32(src + 3 *srcStride);
++ int src4= LD32(src + 4 *srcStride);
++ int src5= LD32(src + 5 *srcStride);
++ int src6= LD32(src + 6 *srcStride);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_MVRC_W(PICO_INPIX1, src3);
++ PICO_MVRC_W(PICO_INPIX0, src4);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++
++ ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
++ ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0)));
++
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_MVRC_W(PICO_INPIX1, src4);
++ PICO_MVRC_W(PICO_INPIX0, src5);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++ PICO_MVRC_W(PICO_INPIX2, src4);
++ PICO_MVRC_W(PICO_INPIX1, src5);
++ PICO_MVRC_W(PICO_INPIX0, src6);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++ PICO_LDCM_W_INC(tmp,
++ PICO_REGVECT_VMU0_OUT,
++ PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT);
++ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++
++ ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
++ ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0)));
++
++ dst += 2;
++ src += 2;
++ }
++}
++
++
++static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 4*dstStride; /* rows 4..7 of the destination */
++ uint8_t *src_low = src + 4*srcStride; /* matching source rows */
++ put_h264_qpel4_v_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ put_h264_qpel4_v_lowpass_pico(dst + 4, src + 4, dstStride, srcStride); /* upper-right quadrant */
++ put_h264_qpel4_v_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ put_h264_qpel4_v_lowpass_pico(dst_low + 4, src_low + 4, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 4*dstStride; /* rows 4..7 of the destination */
++ uint8_t *src_low = src + 4*srcStride; /* matching source rows */
++ avg_h264_qpel4_v_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ avg_h264_qpel4_v_lowpass_pico(dst + 4, src + 4, dstStride, srcStride); /* upper-right quadrant */
++ avg_h264_qpel4_v_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ avg_h264_qpel4_v_lowpass_pico(dst_low + 4, src_low + 4, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 4*dstStride; /* rows 4..7 of the destination */
++ uint8_t *src_low = src + 4*srcStride; /* matching source rows */
++ put_h264_qpel4_h_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ put_h264_qpel4_h_lowpass_pico(dst + 4, src + 4, dstStride, srcStride); /* upper-right quadrant */
++ put_h264_qpel4_h_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ put_h264_qpel4_h_lowpass_pico(dst_low + 4, src_low + 4, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 4*dstStride; /* rows 4..7 of the destination */
++ uint8_t *src_low = src + 4*srcStride; /* matching source rows */
++ avg_h264_qpel4_h_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ avg_h264_qpel4_h_lowpass_pico(dst + 4, src + 4, dstStride, srcStride); /* upper-right quadrant */
++ avg_h264_qpel4_h_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ avg_h264_qpel4_h_lowpass_pico(dst_low + 4, src_low + 4, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 4*dstStride; /* rows 4..7 of the destination */
++ uint8_t *src_low = src + 4*srcStride; /* matching source rows */
++ put_h264_qpel4_hv_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ put_h264_qpel4_hv_lowpass_pico(dst + 4, src + 4, dstStride, srcStride); /* upper-right quadrant */
++ put_h264_qpel4_hv_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ put_h264_qpel4_hv_lowpass_pico(dst_low + 4, src_low + 4, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 4*dstStride; /* rows 4..7 of the destination */
++ uint8_t *src_low = src + 4*srcStride; /* matching source rows */
++ avg_h264_qpel4_hv_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ avg_h264_qpel4_hv_lowpass_pico(dst + 4, src + 4, dstStride, srcStride); /* upper-right quadrant */
++ avg_h264_qpel4_hv_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ avg_h264_qpel4_hv_lowpass_pico(dst_low + 4, src_low + 4, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 8*dstStride; /* rows 8..15 of the destination */
++ uint8_t *src_low = src + 8*srcStride; /* matching source rows */
++ put_h264_qpel8_v_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ put_h264_qpel8_v_lowpass_pico(dst + 8, src + 8, dstStride, srcStride); /* upper-right quadrant */
++ put_h264_qpel8_v_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ put_h264_qpel8_v_lowpass_pico(dst_low + 8, src_low + 8, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 8*dstStride; /* rows 8..15 of the destination */
++ uint8_t *src_low = src + 8*srcStride; /* matching source rows */
++ avg_h264_qpel8_v_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ avg_h264_qpel8_v_lowpass_pico(dst + 8, src + 8, dstStride, srcStride); /* upper-right quadrant */
++ avg_h264_qpel8_v_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ avg_h264_qpel8_v_lowpass_pico(dst_low + 8, src_low + 8, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 8*dstStride; /* rows 8..15 of the destination */
++ uint8_t *src_low = src + 8*srcStride; /* matching source rows */
++ put_h264_qpel8_h_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ put_h264_qpel8_h_lowpass_pico(dst + 8, src + 8, dstStride, srcStride); /* upper-right quadrant */
++ put_h264_qpel8_h_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ put_h264_qpel8_h_lowpass_pico(dst_low + 8, src_low + 8, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 8*dstStride; /* rows 8..15 of the destination */
++ uint8_t *src_low = src + 8*srcStride; /* matching source rows */
++ avg_h264_qpel8_h_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ avg_h264_qpel8_h_lowpass_pico(dst + 8, src + 8, dstStride, srcStride); /* upper-right quadrant */
++ avg_h264_qpel8_h_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ avg_h264_qpel8_h_lowpass_pico(dst_low + 8, src_low + 8, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 8*dstStride; /* rows 8..15 of the destination */
++ uint8_t *src_low = src + 8*srcStride; /* matching source rows */
++ put_h264_qpel8_hv_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ put_h264_qpel8_hv_lowpass_pico(dst + 8, src + 8, dstStride, srcStride); /* upper-right quadrant */
++ put_h264_qpel8_hv_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ put_h264_qpel8_hv_lowpass_pico(dst_low + 8, src_low + 8, dstStride, srcStride); /* lower-right quadrant */
++}
++
++static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ uint8_t *dst_low = dst + 8*dstStride; /* rows 8..15 of the destination */
++ uint8_t *src_low = src + 8*srcStride; /* matching source rows */
++ avg_h264_qpel8_hv_lowpass_pico(dst, src, dstStride, srcStride); /* upper-left quadrant */
++ avg_h264_qpel8_hv_lowpass_pico(dst + 8, src + 8, dstStride, srcStride); /* upper-right quadrant */
++ avg_h264_qpel8_hv_lowpass_pico(dst_low, src_low, dstStride, srcStride); /* lower-left quadrant */
++ avg_h264_qpel8_hv_lowpass_pico(dst_low + 8, src_low + 8, dstStride, srcStride); /* lower-right quadrant */
++}
++
++
++#define H264_MC(OPNAME, SIZE) /* emits the sixteen OPNAME h264_qpel SIZE _mcXY_pico functions; mc00 forwards to the plain pixels routine */ \
++static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
++ OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
++}\
++/* mc10: horizontal lowpass into half, then _l2 of src and half (presumably a two-source average - confirm) */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t half[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
++}\
++/* mc20: horizontal lowpass written straight to dst */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
++ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
++}\
++/* mc30: as mc10, but _l2 pairs the lowpass result with src+1 */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t half[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
++}\
++/* mc01: copies SIZE+5 rows starting two rows above src into full, vertical lowpass, then _l2 with full_mid */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t half[SIZE*SIZE];\
++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc02: vertical lowpass of the copied rows written straight to dst */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
++ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
++}\
++/* mc03: as mc01, but _l2 pairs the lowpass result with the next row (full_mid+SIZE) */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t half[SIZE*SIZE];\
++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc11: _l2 of the horizontal lowpass at src and the vertical lowpass at src */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t halfH[SIZE*SIZE];\
++ uint8_t halfV[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc31: as mc11, but the vertical lowpass input is taken one column to the right (src+1) */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t halfH[SIZE*SIZE];\
++ uint8_t halfV[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
++ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc13: as mc11, but the horizontal lowpass input is taken one row down (src + stride) */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t halfH[SIZE*SIZE];\
++ uint8_t halfV[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc33: horizontal lowpass one row down and vertical lowpass one column to the right */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t halfH[SIZE*SIZE];\
++ uint8_t halfV[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
++ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc22: combined hv lowpass written straight to dst */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
++ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
++}\
++/* mc21: _l2 of the horizontal lowpass and the hv lowpass, both taken at src */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t halfH[SIZE*SIZE];\
++ uint8_t halfHV[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc23: as mc21, but the horizontal lowpass input is taken one row down (src + stride) */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t halfH[SIZE*SIZE];\
++ uint8_t halfHV[SIZE*SIZE];\
++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc12: _l2 of the vertical lowpass and the hv lowpass, both taken at src */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t halfV[SIZE*SIZE];\
++ uint8_t halfHV[SIZE*SIZE];\
++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc32: as mc12, but the vertical lowpass input is taken one column to the right (src+1) */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
++ uint8_t full[SIZE*(SIZE+5)];\
++ uint8_t * const full_mid= full + SIZE*2;\
++ uint8_t halfV[SIZE*SIZE];\
++ uint8_t halfHV[SIZE*SIZE];\
++ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++
++H264_MC(put_, 4) /* instantiate the put_ and avg_ families for block sizes 4, 8 and 16 */
++H264_MC(put_, 8)
++H264_MC(put_, 16)
++H264_MC(avg_, 4)
++H264_MC(avg_, 8)
++H264_MC(avg_, 16)
++
++
++
++#define dspfunc16(PFX) /* emits PFX_pixels16{,_h,_v,_hv}_avr32, each built from two 8-wide calls (left half, then dst+8/pixels+8) */ \
++ void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
++ PFX ## _pixels8_avr32(dst, pixels, line_size, h);\
++ PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\
++ }\
++ void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
++ PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\
++ PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\
++ }\
++ void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
++ PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\
++ PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\
++ }\
++ void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
++ PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\
++ PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\
++ }\
++
++/* Instantiate the 16-wide wrappers for each prefix; the generator macro is dropped afterwards. */
++dspfunc16(put)
++dspfunc16(put_no_rnd)
++dspfunc16(avg)
++dspfunc16(avg_no_rnd)
++#undef dspfunc16
++
++static int pix_sum_avr32(uint8_t * pix, int line_size)
++{
++ int s, i;
++ /* Sums 16 rows of 16 bytes; s carries two packed 16-bit partial sums until the final fold. */
++ s = 0;
++ for (i = 0; i < 16; i++) {
++ int tmp1,tmp2,tmp3,tmp4,tmp5;
++ __asm__ volatile ( "ld.w\t%0, %6[0]\n\t"
++ "ld.w\t%1, %6[4]\n\t"
++ "ld.w\t%2, %6[8]\n\t"
++ "ld.w\t%3, %6[12]\n\t"
++ "punpckub.h\t%4, %0:t\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ "punpckub.h\t%4, %0:b\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ "punpckub.h\t%4, %1:t\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ "punpckub.h\t%4, %1:b\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ "punpckub.h\t%4, %2:t\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ "punpckub.h\t%4, %2:b\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ "punpckub.h\t%4, %3:t\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ "punpckub.h\t%4, %3:b\n\t"
++ "padd.h\t%5, %5, %4\n\t"
++ : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"+r"(s)
++ : "r"(pix) : "memory");
++ pix += line_size;
++ }
++ __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "+r" (s) );
++ /* s is read as well as written by both asm statements, hence "+r"; "memory" covers the loads through pix. */
++ return s;
++}
++
++
++//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
++//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
++//#define H264_WEIGHT(W,H) \
++//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
++// int attribute_unused x, y; \
++// offset <<= log2_denom; \
++// if(log2_denom) offset += 1<<(log2_denom-1); \
++// for(y=0; y<H; y++, block += stride){ \
++// uint32_t tmp0, tmp1;
++// if(W==2) { \
++// asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \
++// "ld.ub\t%[tmp1], %[block][1]\n" \
++// "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \
++// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
++// "asr\t%[tmp0], %[log2_denom]\n" \
++// "asr\t%[tmp1], %[log2_denom]\n" \
++// "satu\t%[tmp0] >> 0, 8\n" \
++// "satu\t%[tmp1] >> 0, 8\n" \
++// "st.b\t%[block][0], %[tmp0]\n" \
++// "st.b\t%[block][1], %[tmp1]\n" \
++// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
++// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
++// } else if ( W==4 ) { \
++// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \
++// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \
++// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \
++// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \
++// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
++// "asr\t%[tmp0], %[log2_denom]\n" \
++// "asr\t%[tmp1], %[log2_denom]\n" \
++// "satu\t%[tmp0] >> 0, 8\n" \
++// "satu\t%[tmp1] >> 0, 8\n" \
++// "st.b\t%[block][0], %[tmp0]\n" \
++// "st.b\t%[block][1], %[tmp1]\n" \
++// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
++// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
++//
++//
++//
++// if(W==4) continue; \
++// op_scale1(4); \
++// op_scale1(5); \
++// op_scale1(6); \
++// op_scale1(7); \
++// if(W==8) continue; \
++// op_scale1(8); \
++// op_scale1(9); \
++// op_scale1(10); \
++// op_scale1(11); \
++// op_scale1(12); \
++// op_scale1(13); \
++// op_scale1(14); \
++// op_scale1(15); \
++// } \
++//} \
++//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
++// int attribute_unused x, y; \
++// int offset = (offsets + offsetd + 1) >> 1; \
++// offset = ((offset << 1) + 1) << log2_denom; \
++// for(y=0; y<H; y++, dst += stride, src += stride){ \
++// op_scale2(0); \
++// op_scale2(1); \
++// if(W==2) continue; \
++// op_scale2(2); \
++// op_scale2(3); \
++// if(W==4) continue; \
++// op_scale2(4); \
++// op_scale2(5); \
++// op_scale2(6); \
++// op_scale2(7); \
++// if(W==8) continue; \
++// op_scale2(8); \
++// op_scale2(9); \
++// op_scale2(10); \
++// op_scale2(11); \
++// op_scale2(12); \
++// op_scale2(13); \
++// op_scale2(14); \
++// op_scale2(15); \
++// } \
++//}
++
++
++
++/* Returns zero in each byte where the absolute difference between <a> and <b>
++ is not less than <compare> */
++#define PABS_DIFF_LESS_THAN( a, b, compare) \
++ ({ uint32_t __a_minus_b__, __b_minus_a__, __headroom__; \
++ asm ( \
++ /* Saturating a - b and b - a: one of the two is zero in every byte */ \
++ "psubs.ub\t%[d_ab], %[opa], %[opb]\n" \
++ "psubs.ub\t%[d_ba], %[opb], %[opa]\n" \
++ "or\t%[d_ab], %[d_ba]\n" /* so OR-ing them gives ABS ( a - b ) */ \
++ /* compare - ABS ( a - b ), saturated: zero in every byte where the test fails */ \
++ "psubs.ub\t%[res], %[cmp], %[d_ab]\n" \
++ : [d_ab] "=&r"(__a_minus_b__), [d_ba] "=&r"(__b_minus_a__), [res] "=&r"(__headroom__) \
++ : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \
++ __headroom__; })
++
++/*
++ Set all bytes containing zero in <value> to 255 and the rest to zero.
++
++ Add with saturation 254 to all bytes making all bytes different from
++ zero become 255. Then add one without saturation to make all bytes
++ originally containing zero 255 and the rest 0. */
++#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
++ ({ uint32_t __tmp__; \
++ asm ( \
++ "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" /* 0 -> 254, anything else saturates to 255 */ \
++ "padd.b\t%[tmp], %[tmp], %[all_ones]\n" /* 254 -> 255, 255 wraps to 0 */ \
++ : [tmp] "=&r"(__tmp__) /* early-clobber: tmp is written before all_ones is read */ \
++ : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
++ __tmp__; })
++
++#define PACKW_SH(upper, lower) /* one packw.sh: packs the two words into one register, upper first */ \
++ ({ uint32_t __packed__; \
++ asm ( \
++ "packw.sh\t%[res], %[u], %[l]\n" \
++ : [res] "=r"(__packed__) \
++ : [u] "r"(upper), [l] "r"(lower) ); \
++ __packed__; })
++
++#define PACKSH_UB(upper, lower) /* packs four halfwords into four unsigned-saturated bytes */ \
++ ({ uint32_t __tmp__; \
++ asm ( \
++ "packsh.ub\t%[tmp], %[u], %[l]\n" /* .ub, not .sb: thresholds above 127 must not clamp to 0x7f */ \
++ : [tmp] "=r"(__tmp__) \
++ : [u] "r"(upper), [l] "r"(lower) ); \
++ __tmp__; })
++
++static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
++{
++ int i;
++ /* Packed variant (four pixels per 32-bit word) of the C filter quoted in the comments below. */
++ if ( alpha == 0 )
++ return;
++ /* Replicate alpha and beta into every byte lane so they can be compared per pixel. */
++ alpha = PACKW_SH(alpha, alpha);
++ alpha = PACKSH_UB(alpha, alpha);
++ beta = PACKW_SH(beta, beta);
++ beta = PACKSH_UB(beta, beta);
++
++ for( i = 0; i < 4; i++ ) {
++ uint32_t p0, p1, p2, q0, q1, q2;
++ uint32_t mask, mask2;
++ uint32_t tmp, tmp2, tmp3, tmp4;
++
++ if( tc0[i] < 0 ) {
++ pix += 4;
++ continue;
++ }
++
++/* for( d = 0; d < 4; d++ ) {
++ const int p0 = pix[-1*stride];
++ const int p1 = pix[-2*stride];
++ const int p2 = pix[-3*stride];
++ const int q0 = pix[0];
++ const int q1 = pix[1*stride];
++ const int q2 = pix[2*stride];
++
++ if( ABS( p0 - q0 ) < alpha &&
++ ABS( p1 - p0 ) < beta &&
++ ABS( q1 - q0 ) < beta ) { */
++
++ p0 = LD32(pix - stride);
++ p1 = LD32(pix - 2*stride);
++ q0 = LD32(pix);
++ q1 = LD32(pix + stride);
++
++ /* Check which of the columns should be filtered, if any. */
++ mask = PABS_DIFF_LESS_THAN(p0, q0, alpha);
++ mask |= PABS_DIFF_LESS_THAN(p1, p0, beta);
++ mask |= PABS_DIFF_LESS_THAN(q1, q0, beta);
++ /* NOTE(review): the three tests are OR-ed here, while the reference above requires all of them (&&) - confirm. */
++ if ( !mask )
++ continue;
++ /* NOTE(review): this continue does not advance pix, unlike the tc0[i] < 0 path above - confirm. */
++ mask = SET_ALL_BITS_IN_ZERO_BYTES(mask);
++ /* From here on mask is 0xFF in the byte lanes that keep their original pixel. */
++
++ int tc = PACKW_SH(tc0[i], tc0[i]);
++ int tc0_p = tc;
++ int tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
++
++ /*
++ int i_delta;
++ if( ABS( p2 - p0 ) < beta ) {
++ pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
++ tc++;
++ }*/
++
++ p2 = LD32(pix - 3*stride);
++ mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
++
++ if ( mask2 ){
++ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
++ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
++ "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
++ "punpckub.h\t%[tmp2], %[tmp]:t\n"
++ "punpckub.h\t%[tmp], %[tmp]:b\n"
++ "punpckub.h\t%[tmp3], %[p1]:t\n"
++ "punpckub.h\t%[tmp4], %[p1]:b\n"
++ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++ "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
++ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
++ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
++ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
++ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
++ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++ "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
++ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
++ "andn\t%[tmp], %[mask2]\n"
++ "and\t%[tmp2], %[q1], %[mask2]\n"
++ "or\t%[tmp], %[tmp2]\n"
++ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
++ [tmp4]"=&r"(tmp4)
++ : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
++ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
++ ST32(pix - 2*stride, tmp);
++ tc += 0x00010001;
++ }
++ /* NOTE(review): the blend above refills unfiltered lanes from q1 although the store targets the p1 row - confirm. */
++
++ q2 = LD32(pix + 2*stride);
++
++ /*
++ if( ABS( q2 - q0 ) < beta ) {
++ pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
++ tc++;
++ }
++ */
++ mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
++
++ if ( mask2 ){
++ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
++ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
++ "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
++ "punpckub.h\t%[tmp2], %[tmp]:t\n"
++ "punpckub.h\t%[tmp], %[tmp]:b\n"
++ "punpckub.h\t%[tmp3], %[q1]:t\n"
++ "punpckub.h\t%[tmp4], %[q1]:b\n"
++ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++ "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
++ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
++ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
++ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
++ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
++ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++ "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
++ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
++ "andn\t%[tmp], %[mask2]\n"
++ "and\t%[tmp2], %[q1], %[mask2]\n"
++ "or\t%[tmp], %[tmp2]\n"
++ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
++ [tmp4]"=&r"(tmp4)
++ : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
++ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
++ ST32(pix + stride, tmp);
++ tc += 0x00010001;
++ }
++
++ uint32_t old_p0 = p0;
++ uint32_t old_q0 = q0;
++
++ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
++ pix[-stride] = clip_uint8( p0 + i_delta );
++ pix[0] = clip_uint8( q0 - i_delta ); */
++ /* NOTE(review): p0/p1/q0/q1 are "=&r" outputs yet read below; a jump to 0: skips the setup of tmp, tmp3, tmp4; the last packsh.ub takes tmp4 where q0 is expected - confirm. */
++ asm (
++ /* Check if the two upper pixels should be filtered */
++ "lsr\t%[tmp], %[inv_mask], 16\n"
++ "breq\t0f\n"
++
++ "punpckub.h\t%[tmp], %[p1]:t\n"
++ "punpckub.h\t%[tmp2], %[q1]:t\n"
++
++ /* p1 - q1 */
++ "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
++
++ "punpckub.h\t%[tmp3], %[q0]:t\n"
++ "punpckub.h\t%[tmp4], %[p0]:t\n"
++
++ /* q0 - p0 */
++ "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
++
++ /* (q0 - p0) << 2 */
++ "plsl.h\t%[tmp2], %[tmp2], 2\n"
++
++ /* ((q0 - p0) << 2) + (p1 - q1) */
++ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
++
++ "mov\t%[tmp], 0x00040004\n"
++ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
++ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
++
++ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
++ "pasr.h\t%[tmp2], %[tmp2], 3\n"
++
++ "mov\t%[tmp], 0\n"
++ "psub.h\t%[tmp], %[tmp], %[tc]\n"
++
++ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
++ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
++ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
++
++
++ /* pix[-stride] = clip_uint8( p0 + i_delta ); */
++ "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
++
++
++ /* pix[0] = clip_uint8( q0 - i_delta ); */
++ "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
++
++ /* Check if the two lower pixels should be filtered */
++ "lsl\t%[tmp2], %[inv_mask], 16\n"
++ "breq\t1f\n"
++
++ "0:\n"
++ "punpckub.h\t%[p1], %[p1]:b\n"
++ "punpckub.h\t%[q1], %[q1]:b\n"
++
++ /* p1 - q1 */
++ "psub.h\t%[p1], %[p1], %[q1]\n"
++
++ "punpckub.h\t%[q0], %[q0]:b\n"
++ "punpckub.h\t%[p0], %[p0]:b\n"
++
++ /* q0 - p0 */
++ "psub.h\t%[tmp2], %[q0], %[p0]\n"
++
++ /* (q0 - p0) << 2 */
++ "plsl.h\t%[tmp2], %[tmp2], 2\n"
++
++ /* ((q0 - p0) << 2) + (p1 - q1) */
++ "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
++
++ "mov\t%[q1], 0x00040004\n"
++ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
++ "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
++
++ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
++ "pasr.h\t%[tmp2], %[tmp2], 3\n"
++
++ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
++ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
++ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
++
++ /* pix[-stride] = clip_uint8( p0 + i_delta ); */
++ "padd.h\t%[p0], %[p0], %[tmp2]\n"
++
++ /* pix[0] = clip_uint8( q0 - i_delta ); */
++ "psub.h\t%[q0], %[q0], %[tmp2]\n"
++
++ "1:\n"
++ "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
++ "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n"
++
++ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
++ [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1)
++ : [tc]"r"(tc), [inv_mask]"r"(~mask));
++
++ ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
++ ST32(pix, (mask & old_q0) | (q0 & ~mask));
++ /* NOTE(review): the pix increment below sits after the loop's closing brace and adds 1 rather than 4 - confirm. */
++ }
++ pix += 1;
++}
++
++
++
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++
++void dump_block8(uint8_t *block, int line_size, int h){
++ int row, col; /* row walks the h lines, col the 8 pixels of a line */
++ /* One log line per block row: a tab, then the pixel values in decimal. */
++ for ( row = 0; row < h; row++ ){
++ const uint8_t *line = block + row*line_size;
++ av_log(NULL, AV_LOG_ERROR, "\t");
++ for ( col = 0; col < 8; col++ )
++ av_log(NULL, AV_LOG_ERROR, "%d ", line[col]);
++ av_log(NULL, AV_LOG_ERROR, "\n");
++ }
++}
++
++void dump_block4(uint8_t *block, int line_size, int h){
++ int row, col; /* row walks the h lines, col the 4 pixels of a line */
++ /* One log line per block row: a tab, then the pixel values in decimal. */
++ for ( row = 0; row < h; row++ ){
++ const uint8_t *line = block + row*line_size;
++ av_log(NULL, AV_LOG_ERROR, "\t");
++ for ( col = 0; col < 4; col++ )
++ av_log(NULL, AV_LOG_ERROR, "%d ", line[col]);
++ av_log(NULL, AV_LOG_ERROR, "\n");
++ }
++}
++
++void dump_block(uint8_t *block, int line_size, int h, int w){
++ int row, col; /* row walks the h lines, col the w pixels of a line */
++ /* One log line per block row: a tab, then the pixel values in decimal. */
++ for ( row = 0; row < h; row++ ){
++ const uint8_t *line = block + row*line_size;
++ av_log(NULL, AV_LOG_ERROR, "\t");
++ for ( col = 0; col < w; col++ )
++ av_log(NULL, AV_LOG_ERROR, "%d ", line[col]);
++ av_log(NULL, AV_LOG_ERROR, "\n");
++ }
++}
++
++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, char *name, int max_dev){
++ int x, y; /* x: column 0..7, y: row 0..h-1; the scan goes column by column */
++ for ( x = 0; x < 8; x++ ){
++ for ( y = 0; y < h; y++ ){
++ const int got = test[x + line_size_test*y];
++ const int want = correct[x + line_size_correct*y];
++ const int dev = got < want ? want - got : got - want;
++ if ( dev <= max_dev ) continue; /* within tolerance */
++ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
++ x, y, got, want);
++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
++ dump_block8(test, line_size_test, h);
++ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
++ dump_block8(correct, line_size_correct, h);
++ exit(1); /* the first out-of-tolerance pixel is fatal */
++ }
++ }
++}
++
++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, char *name, int max_dev){
++ int x, y; /* x: column 0..3, y: row 0..h-1; the scan goes column by column */
++ for ( x = 0; x < 4; x++ ){
++ for ( y = 0; y < h; y++ ){
++ const int got = test[x + line_size_test*y];
++ const int want = correct[x + line_size_correct*y];
++ const int dev = got < want ? want - got : got - want;
++ if ( dev <= max_dev ) continue; /* within tolerance */
++ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
++ x, y, got, want);
++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
++ dump_block4(test, line_size_test, h); /* 4-wide dump: dump_block8 would read past each row */
++ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
++ dump_block4(correct, line_size_correct, h);
++ exit(1); /* the first out-of-tolerance pixel is fatal */
++ }
++ }
++}
++
++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, int width, char *name, int max_dev){
++ int x, y; /* x: column 0..width-1, y: row 0..h-1; the scan goes column by column */
++ for ( x = 0; x < width; x++ ){
++ for ( y = 0; y < h; y++ ){
++ const int got = test[x + line_size_test*y];
++ const int want = correct[x + line_size_correct*y];
++ const int dev = got < want ? want - got : got - want;
++ if ( dev <= max_dev ) continue; /* within tolerance */
++ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
++ x, y, got, want);
++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
++ dump_block(test, line_size_test, h, width);
++ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
++ dump_block(correct, line_size_correct, h, width);
++ exit(1); /* the first out-of-tolerance pixel is fatal */
++ }
++ }
++}
++
++void dump_dct_block(DCTELEM *block){
++ int row, col;
++ /* Logs the 64 coefficients as eight tab-indented rows of eight hexadecimal values. */
++ for ( row = 0; row < 8; row++ ){
++ const DCTELEM *line = block + row*8;
++ av_log(NULL, AV_LOG_ERROR, "\t");
++ for ( col = 0; col < 8; col++ )
++ av_log(NULL, AV_LOG_ERROR, "0x%x ", line[col]);
++ av_log(NULL, AV_LOG_ERROR, "\n");
++ }
++}
++
++void test_idct_avr32(DCTELEM *block){
++ DCTELEM testBlock[64];
++ int i, j;
++ /* Runs idct_avr32 on block in place and compares every coefficient with simple_idct. */
++ /* Copy transposed block to testBlock */
++ for ( i = 0; i < 8 ; i++ ){
++ for ( j = 0; j < 8 ; j++ ){
++ testBlock[i + 8*j] = block[j + i*8];
++ }
++ }
++ /* testBlock is passed as a plain element pointer; &testBlock had pointer-to-array type. */
++ idct_avr32(block);
++ simple_idct(testBlock);
++ /* Any differing coefficient is fatal: dump both blocks and exit. */
++ for ( i = 0; i < 64 ; i++ ){
++ if ( block[i] != testBlock[i] ){
++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n");
++ dump_dct_block(block);
++ av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n");
++ dump_dct_block(testBlock);
++ exit(1);
++ }
++ }
++}
++
++void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){ /* cross-check idct_put_avr32 against simple_idct_put */
++  uint8_t testBlock[64]; /* reference output, row pitch 8 */
++  DCTELEM blockCopy[64];
++  int i, j;
++
++  /* Copy transposed block to blockCopy */
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      blockCopy[i + 8*j] = block[j + i*8];
++    }
++  }
++
++  idct_put_avr32(dest, line_size, block);
++  simple_idct_put(testBlock, 8, blockCopy); /* array decays to uint8_t *; &testBlock was a pointer-to-array */
++  /* A deviation of 1 per pixel is tolerated (last argument). */
++  check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
++}
++
++
++void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){ /* cross-check idct_add_avr32 against simple_idct_add */
++  uint8_t testBlock[64]; /* private copy of the destination, row pitch 8 */
++  DCTELEM blockCopy[64];
++  int i, j;
++
++  /* Copy dest to testBlock */
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      testBlock[i + 8*j] = dest[i + j*line_size];
++    }
++  }
++
++  /* Copy transposed block to blockCopy */
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      blockCopy[i + 8*j] = block[j + i*8];
++    }
++  }
++
++  idct_add_avr32(dest, line_size, block);
++  simple_idct_add(testBlock, 8, blockCopy); /* array decays to uint8_t *; &testBlock was a pointer-to-array */
++  /* A deviation of 1 per pixel is tolerated (last argument). */
++  check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
++}
++
++void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ /* cross-check the 4x4 AVR32 IDCT against ff_h264_idct_add_c */
++  uint8_t testBlock[16]; /* private destination for the AVR32 routine, row pitch 4 */
++  DCTELEM blockCopy[16];
++  int i, j;
++
++  /* Copy dest to testBlock */
++  for ( i = 0; i < 4 ; i++ ){
++    for ( j = 0; j < 4 ; j++ ){
++      testBlock[i + 4*j] = dest[i + j*stride];
++    }
++  }
++
++  /* Copy source block to blockCopy (straight copy, no transpose) */
++  for ( i = 0; i < 16 ; i++ ){
++    blockCopy[i] = block[i];
++  }
++
++  ff_h264_idct_add_c(dest, block, stride); /* C reference writes the caller's buffer */
++
++  h264_idct_add_avr32(testBlock, blockCopy, 4); /* routine under test writes the copy */
++  /* AVR32 output goes first ("test"), the C result second ("correct"), so the error log is labelled correctly. */
++  check_block(testBlock, dest, 4, stride, 4, 4, "h264_idct_add", 0);
++}
++
++void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ /* cross-check the 8x8 AVR32 IDCT against ff_h264_idct8_add_c */
++  uint8_t testBlock[8*8]; /* private destination for the AVR32 routine, row pitch 8 */
++  DCTELEM blockCopy[8*8];
++  int i, j;
++
++  /* Copy dest to testBlock */
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      testBlock[i + 8*j] = dest[i + j*stride];
++    }
++  }
++
++  /* Copy source block to blockCopy */
++  for ( i = 0; i < 8*8 ; i++ ){
++    blockCopy[i] = block[i];
++  }
++
++  ff_h264_idct8_add_c(dest, block, stride); /* C reference writes the caller's buffer */
++  h264_idct8_add_avr32(testBlock, blockCopy, 8); /* routine under test writes the copy */
++  /* AVR32 output goes first ("test"), the C result second ("correct"), so the error log is labelled correctly. */
++  check_block(testBlock, dest, 8, stride, 8, 8, "h264_idct8_add", 0);
++}
++
++void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block,
++                            const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
++  uint8_t *testBlock, *testBlock2;
++  int i, j;
++  int input_v_size = h + in_v_size; /* rows copied from pixels: h plus in_v_size extra */
++  int input_h_size = 8 + in_h_size; /* columns copied from pixels: 8 plus in_h_size extra */
++  /* Runs 'test' on the caller's buffers and 'correct' on compact copies, then compares the 8 x h results exactly. */
++  testBlock = alloca(input_h_size*input_v_size);
++  testBlock2 = alloca(input_h_size*input_v_size);
++  /* Snapshot the source window; also seed the reference destination so routines that read it (avg) see the same data. */
++  for ( i = 0; i < input_h_size ; i++ ){
++    for ( j = 0; j < input_v_size ; j++ ){
++      testBlock[i + input_h_size*j] = pixels[i + j*line_size];
++      if ( i < 8 && j < h ) testBlock2[i + input_h_size*j] = block[i + j*line_size];
++    }
++  }
++
++  test(block, pixels, line_size, h);
++  correct(testBlock2, testBlock, input_h_size, h);
++  /* NOTE(review): buffers and check are 8 pixels wide, yet test_pixels_funcs also instantiates this helper with NUM=16 - confirm. */
++  check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
++}
++
++void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
++                               uint8_t *src, int stride, int h, int w, int x, int y, char *name){
++  /* Runs 'test' on the caller's buffers and 'correct' on private copies,
++   * then requires the two w x h results to match exactly. */
++  const int rows  = h + 1;
++  const int pitch = ((w + 1) + 3) & ~3; /* w+1 rounded up to a multiple of 4 */
++  uint8_t *src_copy = alloca(pitch*rows);
++  uint8_t *ref_dst  = alloca(pitch*rows);
++  int col, row;
++
++  /* Snapshot the (w+1) x (h+1) source window and, inside it, the
++   * current w x h destination contents. */
++  for ( row = 0; row < rows ; row++ ){
++    for ( col = 0; col <= w ; col++ ){
++      src_copy[col + pitch*row] = src[col + row*stride];
++      if ( col < w && row < h )
++        ref_dst[col + pitch*row] = dst[col + row*stride];
++    }
++  }
++
++  /* Candidate writes dst in place; the reference writes the copy. */
++  test(dst, src, stride, h, x, y);
++  correct(ref_dst, src_copy, pitch, h, x, y);
++
++  /* Zero tolerance: every pixel must be identical. */
++  check_block(dst, ref_dst, stride, pitch, h, w, name, 0);
++
++}
++
++void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
++ uint8_t *src, int stride, int size, char *name){ /* compare 'test' against 'correct' on a size x size block; exact match required */
++ uint8_t *testBlock, *testBlock2;
++ int i, j;
++ int test_stride = size + 8;
++ /* testBlock points 4 rows and 4 columns into its allocation so the negative indices used below stay inside it. */
++ testBlock = alloca(test_stride*(size+8)) + 4 + test_stride*4;
++ testBlock2 = alloca(test_stride*size);
++ /* Copy the source block together with a 4-pixel border on every side. */
++ for ( i = -4; i < size+4 ; i++ ){
++ for ( j = -4; j < size+4 ; j++ ){
++ testBlock[i + test_stride*j] = src[i + j*stride];
++ }
++ }
++ /* Seed the private destination with the current contents of dst. */
++ for ( i = 0; i < size ; i++ ){
++ for ( j = 0; j < size ; j++ ){
++ testBlock2[i + test_stride*j] = dst[i + j*stride];
++ }
++ }
++ /* The reference runs on the caller's buffers, the routine under test on the copies. */
++ correct(dst, src, stride);
++ test(testBlock2, testBlock, test_stride);
++
++ check_block(testBlock2, dst, test_stride, stride, size, size, name, 0);
++
++}
++
++/* test_pixels_funcs(PFX, NUM): defines test_<PFX>_pixels<NUM>{,_h,_v,_hv}_avr32, each forwarding the AVR32 routine and its _c/_x2_c/_y2_c/_xy2_c counterpart to test_put_pixels_funcs8. */
++#define test_pixels_funcs(PFX, NUM ) \
++void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
++ block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
++void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
++ block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
++void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
++ block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
++void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
++ block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
++
++test_pixels_funcs(put, 8);
++test_pixels_funcs(put_no_rnd, 8);
++test_pixels_funcs(put, 16);
++test_pixels_funcs(put_no_rnd, 16);
++
++test_pixels_funcs(avg, 8);
++test_pixels_funcs(avg_no_rnd, 8);
++test_pixels_funcs(avg, 16);
++test_pixels_funcs(avg_no_rnd, 16);
++/* Macro sharing its name with the helper function: a macro is not re-expanded inside its own expansion, so the 10-argument call in the body reaches the function. */
++#define test_h264_chroma_mc_funcs(PFX, NUM ) \
++void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
++ test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
++ dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
++
++test_h264_chroma_mc_funcs(put, 2);
++test_h264_chroma_mc_funcs(put, 4);
++test_h264_chroma_mc_funcs(put, 8);
++test_h264_chroma_mc_funcs(avg, 2);
++test_h264_chroma_mc_funcs(avg, 4);
++test_h264_chroma_mc_funcs(avg, 8);
++/* test_qpel_mc_funcs_type(PFX, NUM, TYPE): defines test_<PFX><NUM>_<TYPE>_pico, forwarding the _pico routine and its _c counterpart to the test_qpel_mc_funcs helper with size NUM. */
++#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \
++void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
++ test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
++ dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); }
++/* test_qpel_mc_funcs(PFX, NUM): emits the wrapper above for all 16 mcXY positions (X, Y in 0..3). */
++#define test_qpel_mc_funcs(PFX, NUM) \
++ test_qpel_mc_funcs_type(PFX, NUM, mc00);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc10);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc20);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc30);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc01);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc11);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc21);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc31);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc02);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc12);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc22);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc32);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc03);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc13);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc23);\
++ test_qpel_mc_funcs_type(PFX, NUM, mc33)
++
++test_qpel_mc_funcs(put_h264_qpel, 4);
++test_qpel_mc_funcs(put_h264_qpel, 8);
++test_qpel_mc_funcs(put_h264_qpel, 16);
++test_qpel_mc_funcs(avg_h264_qpel, 4);
++test_qpel_mc_funcs(avg_h264_qpel, 8);
++test_qpel_mc_funcs(avg_h264_qpel, 16);
++
++/* dspfunc(PFX, IDX, NUM): stores the 16 <PFX><NUM>_mcXY_pico routines, each wrapped in DSP_FUNC_NAME, into c->PFX_pixels_tab[IDX][0..15]; an identical definition appears again in dsputil_init_avr32. */
++#define dspfunc(PFX, IDX, NUM) \
++ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
++ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
++ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
++ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
++ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
++ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
++ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
++
++#endif
++
++void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
++{
++ /* Installs the AVR32 routines into the function tables of *c; avctx is not used in this function. */
++ /* H264 */
++ /* The chroma MC assignments below are never executed: the condition is hard-coded to 0 instead of avr32_use_pico. */
++ if ( 0 /*avr32_use_pico*/ ){
++ c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
++ c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
++ c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
++
++ c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
++ c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
++ c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
++ }
++ /* dspfunc, first form: fill the 16 mcXY entries of c->PFX_pixels_tab[IDX] with the <PFX><NUM>_mcXY_pico routines. */
++#define dspfunc(PFX, IDX, NUM) \
++ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
++ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
++ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
++ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
++ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
++ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
++ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
++ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
++ /* The quarter-pel tables are only overridden when avr32_use_pico is set. */
++ if ( avr32_use_pico ){
++ dspfunc(put_h264_qpel, 0, 16);
++ dspfunc(put_h264_qpel, 1, 8);
++ dspfunc(put_h264_qpel, 2, 4);
++ dspfunc(avg_h264_qpel, 0, 16);
++ dspfunc(avg_h264_qpel, 1, 8);
++ dspfunc(avg_h264_qpel, 2, 4);
++ }
++ /* The IDCT entry points are installed unconditionally. */
++ c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
++ c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
++ c->idct = DSP_FUNC_NAME(idct_avr32);
++ c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
++ c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
++
++ /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/
++
++ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
++
++ c->fdct = fdct_avr32;
++
++ c->clear_blocks = clear_blocks_avr32;
++ /* dspfunc, second form: fill entries 0..3 of c->PFX_pixels_tab[IDX] with the plain, _h, _v and _hv pixel routines. */
++#undef dspfunc
++#define dspfunc(PFX, IDX, NUM) \
++ c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \
++ c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \
++ c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \
++ c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
++ /* Index 0 receives the 16-pixel routines, index 1 the 8-pixel ones. */
++ dspfunc(put, 0, 16);
++ dspfunc(put_no_rnd, 0, 16);
++ dspfunc(put, 1, 8);
++ dspfunc(put_no_rnd, 1, 8);
++
++ dspfunc(avg, 1, 8);
++ dspfunc(avg_no_rnd, 1, 8);
++ dspfunc(avg, 0, 16);
++ dspfunc(avg_no_rnd, 0, 16);
++#undef dspfunc
++
++}
++
++
++
++#if 0
++int main(int argc, char *argv[]){
++ /* Empty placeholder entry point; excluded from the build by the surrounding #if 0. */
++
++}
++#endif
++
+diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S
+new file mode 100644
+index 0000000..be45b86
+--- /dev/null
++++ b/libavcodec/avr32/fdct.S
+@@ -0,0 +1,541 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++//**********************************************************
++//* 2-D fDCT, Based on: *
++//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
++//* Fast 1-D DCT Algorithms with 11 Multiplications", *
++//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal *
++//* Processing 1989 (ICASSP '89), pp. 988-991. *
++//* *
++//* Fixed point implementation optimized for the AVR-II *
++//* instruction set. If a table is used for the *
++//* coefficients we can load two and two of them from *
++//* memory with a single word load. This will give a *
++//* reduction in the number of load instructions. *
++//* *
++//**********************************************************
++
++
++/* This routine is a slow-but-accurate integer implementation of the
++ * forward DCT (Discrete Cosine Transform). Taken from the IJG software
++ *
++ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
++ * on each column. Direct algorithms are also available, but they are
++ * much more complex and seem not to be any faster when reduced to code.
++ *
++ * This implementation is based on an algorithm described in
++ * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
++ * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
++ * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
++ * The primary algorithm described there uses 11 multiplies and 29 adds.
++ * We use their alternate method with 12 multiplies and 32 adds.
++ * The advantage of this method is that no data path contains more than one
++ * multiplication; this allows a very simple and accurate implementation in
++ * scaled fixed-point arithmetic, with a minimal number of shifts.
++ *
++ * The poop on this scaling stuff is as follows:
++ *
++ * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
++ * larger than the true DCT outputs. The final outputs are therefore
++ * a factor of N larger than desired; since N=8 this can be cured by
++ * a simple right shift at the end of the algorithm. The advantage of
++ * this arrangement is that we save two multiplications per 1-D DCT,
++ * because the y0 and y4 outputs need not be divided by sqrt(N).
++ * In the IJG code, this factor of 8 is removed by the quantization step
++ * (in jcdctmgr.c); here it is removed by the pass 2 descaling shifts.
++ *
++ * We have to do addition and subtraction of the integer inputs, which
++ * is no problem, and multiplication by fractional constants, which is
++ * a problem to do in integer arithmetic. We multiply all the constants
++ * by CONST_SCALE and convert them to integer constants (thus retaining
++ * CONST_BITS bits of precision in the constants). After doing a
++ * multiplication we have to divide the product by CONST_SCALE, with proper
++ * rounding, to produce the correct output. This division can be done
++ * cheaply as a right shift of CONST_BITS bits. We postpone shifting
++ * as long as possible so that partial sums can be added together with
++ * full fractional precision.
++ *
++ * The outputs of the first pass are scaled up by PASS1_BITS bits so that
++ * they are represented to better-than-integral precision. These outputs
++ * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
++ * with the recommended scaling. (For 12-bit sample data, the intermediate
++ * array is INT32 anyway.)
++ *
++ * To avoid overflow of the 32-bit intermediate results in pass 2, we must
++ * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis
++ * shows that the values given below are the most effective.
++ *
++ * We can gain a little more speed, with a further compromise in accuracy,
++ * by omitting the addition in a descaling shift. This yields an incorrectly
++ * rounded result half the time...
++ */
++
++ .global fdct_avr32
++
++
++
++#define CONST_BITS 13
++#define PASS1_BITS 2
++
++#define FIX_0_298631336 2446 /* FIX(0.298631336) */
++#define FIX_0_390180644 3196 /* FIX(0.390180644) */
++#define FIX_0_541196100 4433 /* FIX(0.541196100) */
++#define FIX_0_765366865 6270 /* FIX(0.765366865) */
++#define FIX_0_899976223 7373 /* FIX(0.899976223) */
++#define FIX_1_175875602 9633 /* FIX(1.175875602) */
++#define FIX_1_501321110 12299 /* FIX(1.501321110) */
++#define FIX_1_847759065 15137 /* FIX(1.847759065) */
++#define FIX_1_961570560 16069 /* FIX(1.961570560) */
++#define FIX_2_053119869 16819 /* FIX(2.053119869) */
++#define FIX_2_562915447 20995 /* FIX(2.562915447) */
++#define FIX_3_072711026 25172 /* FIX(3.072711026) */
++
++
++/*
++ * Perform an integer forward DCT on one block of samples.
++ */
++
++//void
++//fdct_int32(short *const block)
++//{
++// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
++// int tmp10, tmp11, tmp12, tmp13;
++// int z1, z2, z3, z4, z5;
++// short *blkptr;
++// int *dataptr;
++// int data[64];
++// int i;
++//
++// /* Pass 1: process rows. */
++// /* Note results are scaled up by sqrt(8) compared to a true DCT; */
++// /* furthermore, we scale the results by 2**PASS1_BITS. */
++//
++// dataptr = data;
++// blkptr = block;
++
++ .text
++fdct_avr32:
++ pushm r0-r3, r4-r7, lr
++#define loop_ctr r0
++#define blkptr r12
++#define x0 r1
++#define x1 r2
++#define x2 r3
++#define x3 r4
++#define x4 r5
++#define x5 r6
++#define x6 r7
++#define x7 r8
++#define tmp0 r5
++#define tmp7 r2
++#define tmp1 r3
++#define tmp6 r4
++#define tmp2 r9
++#define tmp5 r8
++#define tmp3 r7
++#define tmp4 r6
++
++
++ mov loop_ctr, 8
++// for (i = 0; i < 8; i++) {
++ROW_LOOP:
++
++ ldm blkptr, r1, r2, r3, r4
++
++// tmp2 = blkptr[2] + blkptr[5];
++// tmp3 = blkptr[3] + blkptr[4];
++ paddx.h r5, r3, r2
++// tmp5 = blkptr[2] - blkptr[5];
++// tmp4 = blkptr[3] - blkptr[4];
++ psubx.h r6, r3, r2
++// tmp0 = blkptr[0] + blkptr[7];
++// tmp1 = blkptr[1] + blkptr[6];
++ paddx.h r2, r4, r1
++// tmp7 = blkptr[0] - blkptr[7];
++// tmp6 = blkptr[1] - blkptr[6];
++ psubx.h r3, r4, r1
++
++// /* Even part per LL&M figure 1 --- note that published figure is faulty;
++// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
++// */
++
++#define tmp10 r1
++#define tmp13 r5
++#define tmp11 r7
++#define tmp12 r3
++#define z1 r9
++
++// tmp10 = tmp0 + tmp3;
++// tmp13 = tmp0 - tmp3;
++ paddsub.h r1, r2:t, r5:b
++// tmp11 = tmp1 + tmp2;
++// tmp12 = tmp1 - tmp2;
++ paddsub.h r4, r2:b, r5:t
++
++
++// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
++// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
++ paddsub.h r7, r1:t, r4:t
++ ld.w r10, pc[const_table - .]
++ plsl.h r7, r7, PASS1_BITS
++
++// z1 = (tmp12 + tmp13) * FIX_0_541196100;
++ addhh.w r8, r4:b, r1:b
++ mulhh.w r8, r8:b, r10:t
++
++// dataptr[2] =
++// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
++// dataptr[6] =
++// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
++ mulhh.w r9, r1:b, r10:b
++ ld.w r10, pc[const_table - . + 4]
++ add r1, r8, r9
++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
++
++ mulhh.w r9, r4:b, r10:t
++ add r4, r8, r9
++ satrnds r4 >> (CONST_BITS - PASS1_BITS), 31
++
++
++// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
++// * cK represents cos(K*pi/16).
++// * i0..i3 in the paper are tmp4..tmp7 here.
++// */
++
++#define z2 r5
++#define z3 r6
++#define z4 r7
++#define z5 r8
++
++// z4 = tmp5 + tmp7;
++// z3 = tmp4 + tmp6;
++ padd.h r2, r6, r3
++// z2 = tmp5 + tmp6;
++// z1 = tmp4 + tmp7;
++ paddx.h r5, r6, r3
++
++ lddpc r9, pc[const_table - . + 8]
++// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
++ addhh.w r8, r2:t, r2:b
++ mulhh.w r8, r8:b, r10:b
++ lddpc r10, pc[const_table - . + 12]
++
++
++// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
++ mulhh.w r11, r6:b, r9:t
++
++// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
++ mulhh.w r6, r6:t, r9:b
++
++// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
++ lddpc r9, pc[const_table - . + 20]
++ mulhh.w lr, r3:b, r10:t
++
++// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
++ mulhh.w r3, r3:t, r10:b
++
++// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
++ mulhh.w r10, r2:b, r9:t
++
++// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
++ mulhh.w r2, r2:t, r9:b
++ lddpc r9, pc[const_table - . + 16]
++// z3 += z5;
++// z4 += z5;
++ add r10, r8
++ add r2, r8
++
++// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
++ mulhh.w r8, r5:b, r9:t
++
++// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
++ mulhh.w r5, r5:t, r9:b
++
++// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
++ add r11, r8
++ add r11, r10
++ satrnds r11 >> (CONST_BITS - PASS1_BITS), 31
++
++// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
++ add r6, r5
++
++ sthh.w blkptr[6*2], r4:b, r11:b
++ add r6, r2
++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
++
++// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
++ add lr, r5
++ sthh.w blkptr[4*2], r7:b, r6:b
++ add lr, r10
++ satrnds lr >> (CONST_BITS - PASS1_BITS), 31
++
++// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
++ add r3, r8
++ sthh.w blkptr[2*2], r1:b, lr:b
++ add r3, r2
++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
++
++
++
++// dataptr += 8; /* advance pointer to next row */
++// blkptr += 8;
++ sthh.w blkptr[0], r7:t, r3:b
++ sub blkptr, -16
++ sub loop_ctr, 1
++ brne ROW_LOOP
++
++// }
++
++ /* Pass 2: process columns.
++ * We remove the PASS1_BITS scaling and, through the extra 3 bits in
++ * every descaling shift below, the overall factor of 8 as well.
++ */
++
++// dataptr = data;
++ sub blkptr, 128
++
++ mov loop_ctr, 4
++// for (i = 0; i < 8; i += 2) { /* two packed columns per iteration */
++COLOUMN_LOOP:
++ ld.w r1, blkptr[0]
++ ld.w r2, blkptr[1*8*2]
++ ld.w r3, blkptr[2*8*2]
++ ld.w r4, blkptr[3*8*2]
++ ld.w r5, blkptr[4*8*2]
++ ld.w r6, blkptr[5*8*2]
++ ld.w r7, blkptr[6*8*2]
++ ld.w r8, blkptr[7*8*2]
++
++// tmp0 = blkptr[0] + blkptr[7*8];
++ padds.sh r9, r1, r8
++// tmp7 = blkptr[0] - blkptr[7*8];
++ psubs.sh r1, r1, r8
++// tmp1 = blkptr[1*8] + blkptr[6*8];
++ padds.sh r8, r2, r7
++// tmp6 = blkptr[1*8] - blkptr[6*8];
++ psubs.sh r2, r2, r7
++// tmp2 = blkptr[2*8] + blkptr[5*8];
++ padds.sh r7, r3, r6
++// tmp5 = blkptr[2*8] - blkptr[5*8];
++ psubs.sh r3, r3, r6
++// tmp3 = blkptr[3*8] + blkptr[4*8];
++ padds.sh r6, r4, r5
++// tmp4 = blkptr[3*8] - blkptr[4*8];
++ psubs.sh r4, r4, r5
++
++// /* even part per ll&m figure 1 --- note that published figure is faulty;
++// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
++// */
++//
++// tmp10 = tmp0 + tmp3;
++ padds.sh r5, r9, r6
++// tmp13 = tmp0 - tmp3;
++ psubs.sh r9, r9, r6
++// tmp11 = tmp1 + tmp2;
++ padds.sh r6, r8, r7
++// tmp12 = tmp1 - tmp2;
++ psubs.sh r8, r8, r7
++
++// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
++// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
++//Might get an overflow here
++ padds.sh r7, r5, r6
++ psubs.sh r5, r5, r6
++
++ //Rounding
++ mov lr, (1 << (PASS1_BITS + 2))
++ orh lr, hi(1 << (16 + PASS1_BITS + 2))
++ padds.sh r7, r7, lr
++ padds.sh r5, r5, lr
++
++ pasr.h r7, r7, PASS1_BITS + 3
++ pasr.h r5, r5, PASS1_BITS + 3
++ st.w r12[0], r7
++ st.w r12[4*8*2], r5
++
++ lddpc r10, const_table2
++
++
++// z1 = (tmp12 + tmp13) * FIX_0_541196100;
++ padds.sh r5, r8, r9
++ mulhh.w r6, r5:t, r10:t
++ mulhh.w r7, r5:b, r10:t
++
++// dataptr[16] =
++// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
++ lddpc r11, const_table2 + 4
++ mulhh.w lr, r9:t, r10:b
++ mulhh.w r9, r9:b, r10:b
++ add lr, r6
++ add r9, r7
++ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
++ satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31
++ sthh.w r12[2*8*2], lr:b, r9:b
++
++// dataptr[48] =
++// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
++ mulhh.w lr, r8:t, r11:t
++ mulhh.w r8, r8:b, r11:t
++ add lr, r6
++ add r8, r7
++ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
++ satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31
++ sthh.w r12[6*8*2], lr:b, r8:b
++
++// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
++// * cK represents cos(K*pi/16).
++// * i0..i3 in the paper are tmp4..tmp7 here.
++// */
++//
++// z2 = tmp5 + tmp6;
++// z3 = tmp4 + tmp6;
++// z4 = tmp5 + tmp7;
++ padds.sh r5, r3, r2
++ padds.sh r6, r4, r2
++ padds.sh r7, r3, r1
++
++// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
++ padds.sh r8, r6, r7
++ mulhh.w r9, r8:t, r11:b
++ mulhh.w r8, r8:b, r11:b
++
++// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
++// z3 += z5;
++ lddpc r11, const_table2 + 8
++ mulhh.w r10, r6:t, r11:t
++ mulhh.w r6, r6:b, r11:t
++ add r10, r9
++ add r6, r8
++
++// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
++// z4 += z5;
++ mulhh.w lr, r7:t, r11:b
++ mulhh.w r7, r7:b, r11:b
++ lddpc r11, const_table2 + 12
++ st.w --sp,r0
++ add lr, r9
++ add r7, r8
++
++// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
++ mulhh.w r0, r2:t, r11:t
++ machh.w r0, r5:t, r11:b
++ mulhh.w r2, r2:b, r11:t
++ machh.w r2, r5:b, r11:b
++
++// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
++// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
++ add r0, r10
++ lddpc r11, const_table2 + 16
++ add r2, r6
++ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
++ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
++ sthh.w r12[3*8*2], r0:b, r2:b
++// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
++ mulhh.w r0, r3:t, r11:t
++ machh.w r0, r5:t, r11:b
++ mulhh.w r2, r3:b, r11:t
++ machh.w r2, r5:b, r11:b
++ add r0, lr
++ lddpc r11, const_table2 + 20
++ add r2, r7
++
++// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
++ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
++ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
++ sthh.w r12[5*8*2], r0:b, r2:b
++
++
++// z1 = tmp4 + tmp7;
++ padds.sh r2, r4, r1
++
++// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
++ mulhh.w r3, r4:t, r11:t
++ machh.w r3, r2:t, r11:b
++ mulhh.w r4, r4:b, r11:t
++ machh.w r4, r2:b, r11:b
++ add r3, r10
++ lddpc r11, const_table2 + 24
++ add r4, r6
++
++// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
++// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
++ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
++ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
++ sthh.w r12[7*8*2], r3:b, r4:b
++
++
++// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
++ mulhh.w r3, r1:t, r11:t
++ machh.w r3, r2:t, r11:b
++ mulhh.w r4, r1:b, r11:t
++ machh.w r4, r2:b, r11:b
++ add r3, lr
++ add r4, r7
++
++// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
++ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
++ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
++ sthh.w r12[1*8*2], r3:b, r4:b
++ ld.w r0, sp++
++
++// dataptr++; /* advance pointer to next column */
++ sub blkptr, -4
++ sub loop_ctr, 1
++ brne COLOUMN_LOOP
++
++// }
++
++ popm r0-r3, r4-r7, pc
++
++// /* descale */
++// for (i = 0; i < 64; i++)
++// block[i] = (short int) DESCALE(data[i], 3);
++
++
++//}
++
++
++ .align 2
++const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
++ .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110
++ .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644
++
++const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
++ .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447
++ .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223
++ .short FIX_1_501321110, -FIX_0_899976223
++
++
++
++
+diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S
+new file mode 100644
+index 0000000..4b23e2d
+--- /dev/null
++++ b/libavcodec/avr32/h264idct.S
+@@ -0,0 +1,451 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++ .global h264_idct_add_avr32
++
++ /* Macro for performing the 1-D transform on one row line.
++
++ The register 'w01' should contain the first two pixels,
++ and the register 'w23' should contain the last two pixels
++ in the line. The resulting line is placed back in w01 and w23
++ so that { w01, w23 } = { x0, x1, x3, x2 }.
++ 'tmp' and 'tmp2' should be scratchpad registers. */
++ .macro transform_row w01, w23, tmp, tmp2
++ add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */
++ sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */
++ bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
++ pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
++ paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */
++ padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
++ psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
++ .endm
++
++ /* Macro for performing the 1-D transform on two columns.
++
++ The registers w0, w1, w2, w3 should each contain two
++ packed samples from the two columns to transform.
++ tmp and tmp2 are scratchpad registers.
++
++ The resulting transformed columns are placed in the
++ same positions as the input columns.
++ */
++ .macro transform_2columns w0, w1, w2, w3, tmp, tmp2
++ padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */
++ psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */
++ pasr.h \w2, \w1, 1 /* w2 = w1/2 */
++ pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */
++ psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */
++ padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
++ padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */
++ psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */
++ padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
++ psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
++ /* Scale down result: arithmetic shift right by 6 on every packed halfword. */
++ pasr.h \w0, \w0, 6
++ pasr.h \w1, \w1, 6
++ pasr.h \w2, \w2, 6
++ pasr.h \w3, \w3, 6
++ .endm
++
++/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
++/* Register use below: r12 = dst, r11 = block, r10 = stride. */
++h264_idct_add_avr32:
++
++ stm --sp,r0-r3,r4-r7, lr
++
++ /* Setup rounding factor. */
++ mov r0, (1 << 5)
++ lsl r0, 16
++
++ /* Load block */
++ ldm r11,r2-r9
++ /* r9 = { w00, w01 },
++ r8 = { w02, w03 },
++ r7 = { w10, w11 },
++ r6 = { w12, w13 },
++ r5 = { w20, w21 },
++ r4 = { w22, w23 },
++ r3 = { w30, w31 },
++ r2 = { w32, w33 } */
++
++
++ /* Add the rounding factor to w00. */
++ add r9, r0
++
++ /* Transform rows */
++ transform_row r9, r8, r0, r1
++ transform_row r7, r6, r0, r1
++ transform_row r5, r4, r0, r1
++ transform_row r3, r2, r0, r1
++
++ /* Transform columns */
++ transform_2columns r9, r7, r5, r3, r0, r1
++ transform_2columns r8, r6, r4, r2, r0, r1
++
++ /* Load predicted pixels.*/
++ ld.w lr, r12[0]
++ ld.w r11, r12[r10]
++
++ /* Unpack to halfwords. */
++ punpckub.h r0, lr:t
++ punpckub.h r1, lr:b
++
++ /* Add with transformed row. */
++ padd.h r0, r0, r9
++ paddx.h r1, r1, r8 /* crossed add; presumably undoes the { x3, x2 } swap left by transform_row */
++ /* Pack and saturate back to 8-bit pixels. */
++ packsh.ub r0, r0, r1
++
++ /* Unpack to halfwords. */
++ punpckub.h lr, r11:t
++ punpckub.h r11, r11:b
++
++ /* Add with transformed row. */
++ padd.h lr, lr, r7
++ paddx.h r11, r11, r6
++ /* Pack and saturate back to 8-bit pixels. */
++ packsh.ub r1, lr, r11
++
++ /* Store back to frame. */
++ st.w r12[0], r0
++ st.w r12[r10], r1
++
++ add r12, r12, r10 << 1 /* dst += 2 * stride: move on to rows 2 and 3 */
++
++ /* Load predicted pixels.*/
++ ld.w lr, r12[0]
++ ld.w r11, r12[r10]
++
++ /* Unpack to halfwords. */
++ punpckub.h r0, lr:t
++ punpckub.h r1, lr:b
++
++ /* Add with transformed row. */
++ padd.h r0, r0, r5
++ paddx.h r1, r1, r4
++ /* Pack and saturate back to 8-bit pixels. */
++ packsh.ub r0, r0, r1
++
++ /* Unpack to halfwords. */
++ punpckub.h lr, r11:t
++ punpckub.h r11, r11:b
++
++ /* Add with transformed row. */
++ padd.h lr, lr, r3
++ paddx.h r11, r11, r2
++ /* Pack and saturate back to 8-bit pixels. */
++ packsh.ub r1, lr, r11
++
++ /* Store back to frame. */
++ st.w r12[0], r0
++ st.w r12[r10], r1
++
++ ldm sp++,r0-r3,r4-r7, pc
++
++
++ .global h264_idct8_add_avr32
++//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
++// Register use below: r12 = dst, r11 = block, r10 = stride.
++h264_idct8_add_avr32:
++ stm --sp,r0-r3,r4-r7, lr
++
++ /* Push dst and stride on stack */
++ stm --sp,r10,r12
++
++// int i;
++// DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
++// uint8_t *cm = cropTbl + MAX_NEG_CROP;
++
++// block[0] += 32;
++
++
++// for( i = 0; i < 8; i++ )
++// {
++ mov lr, 4 /* 4 iterations: each handles two columns packed in one word */
++0:
++ ld.w r7, r11[0*(8*2)]
++ ld.w r6, r11[1*(8*2)]
++ ld.w r5, r11[2*(8*2)]
++ ld.w r4, r11[3*(8*2)]
++ ld.w r3, r11[4*(8*2)]
++ ld.w r2, r11[5*(8*2)]
++ ld.w r1, r11[6*(8*2)]
++ ld.w r0, r11[7*(8*2)]
++
++/*
++
++ const int a0 = src[0][i] + src[4][i];
++ const int a2 = src[0][i] - src[4][i];
++ const int a4 = (src[2][i]>>1) - src[6][i];
++ const int a6 = (src[6][i]>>1) + src[2][i];
++*/
++ padd.h r8, r7, r3 /* r8 = a0 */
++ psub.h r7, r7, r3 /* r7 = a2 */
++ pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */
++ pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */
++ psub.h r3, r3, r1 /* r3 = a4 */
++ padd.h r9, r9, r5 /* r9 = a6 */
++
++/*
++ const int b0 = a0 + a6;
++ const int b2 = a2 + a4;
++ const int b4 = a2 - a4;
++ const int b6 = a0 - a6;
++*/
++ padd.h r1, r8, r9 /* r1 = b0 */
++ psub.h r8, r8, r9 /* r8 = b6 */
++ padd.h r5, r7, r3 /* r5 = b2 */
++ psub.h r7, r7, r3 /* r7 = b4 */
++
++/*
++ const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
++ const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
++ const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
++ const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
++*/
++ pasr.h r3, r0, 1
++ padd.h r3, r3, r0
++ psub.h r3, r2, r3
++ psub.h r3, r3, r4 /* r3 = a1 */
++
++ pasr.h r9, r4, 1
++ padd.h r9, r9, r4
++ psub.h r9, r0, r9
++ padd.h r9, r6, r9 /* r9 = a3 */
++
++ pasr.h r10, r2, 1
++ padd.h r10, r10, r2
++ padd.h r10, r10, r0
++ psub.h r10, r10, r6 /* r10 = a5 */
++
++ pasr.h r0, r6, 1
++ padd.h r0, r0, r6
++ padd.h r0, r0, r2
++ padd.h r0, r0, r4 /* r0 = a7 */
++/*
++ const int b1 = (a7>>2) + a1;
++ const int b3 = a3 + (a5>>2);
++ const int b5 = (a3>>2) - a5;
++ const int b7 = a7 - (a1>>2);
++*/
++ pasr.h r2, r0, 2
++ padd.h r2, r2, r3 /* r2 = b1 */
++ pasr.h r3, r3, 2
++ psub.h r3, r0, r3 /* r3 = b7 */
++
++ pasr.h r0, r10, 2
++ padd.h r0, r0, r9 /* r0 = b3 */
++ pasr.h r9, r9, 2
++ psub.h r9, r9, r10 /* r9 = b5 */
++
++
++/*
++ src[0][i] = b0 + b7;
++ src[7][i] = b0 - b7;
++ src[1][i] = b2 + b5;
++ src[6][i] = b2 - b5;
++ src[2][i] = b4 + b3;
++ src[5][i] = b4 - b3;
++ src[3][i] = b6 + b1;
++ src[4][i] = b6 - b1; */
++
++ padd.h r4, r1, r3
++ psub.h r1, r1, r3
++ st.w r11[0*(8*2)], r4
++ st.w r11[7*(8*2)], r1
++
++ padd.h r3, r5, r9
++ psub.h r5, r5, r9
++ st.w r11[1*(8*2)], r3
++ st.w r11[6*(8*2)], r5
++
++ padd.h r9, r7, r0
++ psub.h r7, r7, r0
++ st.w r11[2*(8*2)], r9
++ st.w r11[5*(8*2)], r7
++
++ padd.h r0, r8, r2
++ psub.h r8, r8, r2
++ st.w r11[3*(8*2)], r0
++ st.w r11[4*(8*2)], r8
++
++ sub r11, -4 /* advance to the next pair of columns */
++ sub lr, 1
++ brne 0b
++
++// }
++
++ lddsp r12, sp[0] /* r12 = dst */
++ sub r11, 4*4 /* rewind r11 to the start of the block */
++ ldm r11++, r4-r7 /* load the first row (layout described below) */
++ mov lr, 8 /* 8 rows */
++ /* Second pass: one row per iteration, result is added to dst. */
++
++1:
++// for( i = 0; i < 8; i++ )
++// {
++
++ /* r7 = {src[i][0], src[i][1]}
++ r6 = {src[i][2], src[i][3]}
++ r5 = {src[i][4], src[i][5]}
++ r4 = {src[i][6], src[i][7]} */
++
++/*
++ const int a0 = src[i][0] + src[i][4];
++ const int a2 = src[i][0] - src[i][4];
++ const int a4 = (src[i][2]>>1) - src[i][6];
++ const int a6 = (src[i][6]>>1) + src[i][2];
++*/
++ pasr.h r8, r6, 1
++ pasr.h r9, r4, 1
++ addhh.w r0, r7:t, r5:t /* r0 = a0 */
++ subhh.w r1, r7:t, r5:t /* r1 = a2 */
++ subhh.w r2, r8:t, r4:t /* r2 = a4 */
++ addhh.w r3, r9:t, r6:t /* r3 = a6 */
++
++/*
++ const int b0 = a0 + a6;
++ const int b2 = a2 + a4;
++ const int b4 = a2 - a4;
++ const int b6 = a0 - a6;
++*/
++ add r10, r0, r3 /* r10 = b0 */
++ sub r0, r3 /* r0 = b6 */
++ add r3, r1, r2 /* r3 = b2 */
++ sub r1, r2 /* r1 = b4 */
++/*
++
++
++ const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
++ const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
++ const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
++ const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */
++ addhh.w r8, r8:b, r6:b
++ addhh.w r2, r4:b, r7:b
++ sub r2, r8 /* r2 = a3 */
++
++ addhh.w r9, r9:b, r4:b
++ subhh.w r8, r5:b, r6:b
++ sub r8, r9 /* r8 = a1 */
++
++ pasr.h r9, r7, 1
++ addhh.w r9, r9:b, r7:b
++ addhh.w r6, r5:b, r6:b
++ add r6, r9 /* r6 = a7 */
++
++ pasr.h r9, r5, 1
++ addhh.w r9, r9:b, r5:b
++ subhh.w r5, r4:b, r7:b
++ add r5, r9 /* r5 = a5 */
++
++/* const int b1 = (a7>>2) + a1;
++ const int b3 = (a5>>2) + a3;
++ const int b5 = (a3>>2) - a5;
++ const int b7 = -(a1>>2) + a7 ; */
++ asr r4, r6, 2
++ add r4, r8 /* r4 = b1 */
++ asr r8, 2
++ rsub r8, r6 /* r8 = b7 */
++
++ asr r6, r5, 2
++ add r6, r2 /* r6 = b3 */
++ asr r2, 2
++ sub r2, r5 /* r2 = b5 */
++
++/*
++ dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
++ dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
++ dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
++ dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
++ dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
++ dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
++ dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
++ dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
++*/
++ add r5, r10, r8
++ satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */
++ sub r10, r8
++ satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */
++ add r8, r3, r2
++ satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */
++ sub r3, r2
++ satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */
++
++ add r2, r1, r6
++ satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */
++ sub r1, r6
++ satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */
++
++ add r6, r0, r4
++ satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */
++ sub r0, r4
++ satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */
++
++ ld.w r4, r12[0] /* first four predicted pixels of this row */
++
++ packw.sh r8, r5, r8
++ packw.sh r7, r2, r6
++ ld.w r9, r12[4] /* last four predicted pixels of this row */
++ packw.sh r6, r0, r1
++ packw.sh r5, r3, r10
++
++ punpckub.h r10, r4:t
++ punpckub.h r4, r4:b
++ punpckub.h r3, r9:t
++ punpckub.h r9, r9:b
++
++ padd.h r8, r8, r10
++ padd.h r7, r7, r4
++ padd.h r6, r6, r3
++ padd.h r5, r5, r9
++
++ lddsp r10, sp[4] /* r10 = stride */
++ packsh.ub r0, r8, r7
++ packsh.ub r1, r6, r5
++
++ st.w r12[0], r0
++ st.w r12[4], r1
++
++ ldm r11++, r4-r7 /* load the next row for the following iteration */
++ add r12, r10 /* dst += stride */
++
++ sub lr, 1
++ brne 1b
++
++ sub sp, -8 /* drop the saved dst and stride */
++ ldm sp++,r0-r3,r4-r7, pc
++
++
++
++// }
++//}
+diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S
+new file mode 100644
+index 0000000..e7551ec
+--- /dev/null
++++ b/libavcodec/avr32/idct.S
+@@ -0,0 +1,829 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++ .global idct_add_avr32
++ .global idct_put_avr32
++ .global idct_avr32
++
++/* Fixed-point setup: each FIX_* value below is the constant scaled by 2^CONST_BITS and rounded. */
++#define CONST_BITS 13
++#define PASS1_BITS 2
++/* ONE and CONST_SCALE are not referenced anywhere else in this file. */
++#define ONE ((INT32) 1)
++
++#define CONST_SCALE (ONE << CONST_BITS)
++/* LINE_SIZE is the byte offset used by the optional 'pref' prefetch (USE_PREFETCH). */
++#define LINE_SIZE 32
++
++#define FIX_0_298631336 (2446) /* FIX(0.298631336) */
++#define FIX_0_390180644 (3196) /* FIX(0.390180644) */
++#define FIX_0_541196100 (4433) /* FIX(0.541196100) */
++#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
++#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
++#define FIX_1_175875602 (9633) /* FIX(1.175875602) */
++#define FIX_1_501321110 (12299)/* FIX(1.501321110) */
++#define FIX_1_847759065 (15137)/* FIX(1.847759065) */
++#define FIX_1_961570560 (16069)/* FIX(1.961570560) */
++#define FIX_2_053119869 (16819)/* FIX(2.053119869) */
++#define FIX_2_562915447 (20995)/* FIX(2.562915447) */
++#define FIX_3_072711026 (25172)/* FIX(3.072711026) */
++
++/* Row/column loop counter used by every routine below. */
++#define loop_cnt r11
++
++ .text
++
++idct_add_avr32:
++ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
++ // Register use below: r12 = rfp (pixels that are read, added to and written back), r11 = iinc, r10 = DCT block
++ // Give room for some variables on the stack
++ sub sp, 8
++ stdsp SP[0], r12 // rfp
++ stdsp SP[4], r11 // iinc
++
++ mov loop_cnt, 8 //Initialize loop counter
++
++FOR_ROW:
++
++ ldm r10, r0, r1, r2, r3 //Load 8 DCT-coefficients from the current row in the DCT-block
++ mov r6, 0
++#ifdef USE_PREFETCH
++ pref r10[LINE_SIZE] //Prefetch next line
++#endif
++ or r4, r2, r3 << 16
++ or r4, r1 //Check if all DCT-coefficients except the DC are zero
++ or r4, r0
++ brne AC_ROW //If there are non-zero AC coefficients perform row-transform
++
++ paddsub.h r5, r3:t, r6:b //Copy the DC-coeff (r3:t) into both halves of r5 (r6 is zero)
++ plsl.h r5, r5, PASS1_BITS
++ mov r4, r5
++ st.d r10++, r4
++ st.d r10++, r4
++
++ sub loop_cnt, 1 //Decrement loop counter
++ brne FOR_ROW //Perform loop one more time if loop_cnt is not zero
++
++ bral COLOUMN_TRANSFORM //Perform column transform after row transform is computed
++
++
++AC_ROW:
++
++
++ ld.w r12, pc[coef_table - .]
++ ld.w r9, pc[coef_table - . + 4]
++
++ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
++ mulhh.w r5, r4:t, r12:t
++ mulhh.w r6, r0:t, r12:b
++ ld.w r12, pc[coef_table - . + 8]
++ mulhh.w r7, r2:t, r9:t
++ add r6, r5 // tmp2
++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
++ add r7, r5 // tmp3
++ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
++
++ paddsub.h r5, r3:t, r1:t
++ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
++
++ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
++ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
++
++
++ addhh.w lr, r3:b, r1:b // lr = z4
++ addhh.w r5, r4:b, lr:b
++ mulhh.w r5, r5:b, r9:b // r5 = z5
++
++ ld.w r9, pc[coef_table - . + 12]
++ mulhh.w r4, r4:b, r12:t // r4 = z3
++ mulhh.w lr, lr:b, r12:b // lr = z4
++
++ add r4, r5
++ add lr, r5
++
++ addhh.w r5, r2:b, r1:b // r5 = z2
++ addhh.w r8, r3:b, r0:b // r8 = z1
++
++
++ mulhh.w r0, r0:b, r9:t // r0 = tmp0
++ ld.w r12, pc[coef_table - . + 16]
++ mulhh.w r1, r1:b, r9:b // r1 = tmp1
++ ld.w r9, pc[coef_table - . + 20]
++ mulhh.w r2, r2:b, r12:t // r2 = tmp2
++ mulhh.w r3, r3:b, r12:b // r3 = tmp3
++ mulhh.w r8, r8:b, r9:t // r8 = z1
++ mulhh.w r5, r5:b, r9:b // r5 = z2
++
++
++ add r0, r8
++ add r0, r4
++ add r1, r5
++ add r1, lr
++ add r2, r5
++ add r2, r4
++ add r3, r8
++ add r3, lr
++
++ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
++
++ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
++ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
++ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
++ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
++
++ sthh.w r10[0], r4:t, r5:t
++ sthh.w r10[4], r3:t, r2:t
++ sthh.w r10[8], r2:b, r3:b
++ sthh.w r10[12], r5:b, r4:b
++
++
++
++ sub r10, -16
++ sub loop_cnt, 1
++ brne FOR_ROW, e // NOTE(review): trailing ', e' is unusual; idct_put_avr32 uses a plain 'brne 0b' here - confirm
++
++COLOUMN_TRANSFORM:
++
++ sub r10, 128 //Set pointer to start of DCT block
++
++
++ mov loop_cnt, 8
++FOR_COLOUMN:
++ ldins.h r3:t,r10[0] // r3:t = dataptr[0]
++ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
++ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
++ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
++ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
++ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
++ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
++ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
++
++ or r4, r1, r3 << 16
++ or r4, r2
++ or r4, r0
++ brne AC_COLOUMN //If there are non-zero AC coefficients perform column-transform
++
++ lddsp r12, SP[0] // rfp
++ lddsp r9, SP[4] // iinc
++ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9
++ ld.d r0, r12[0]
++ sub r10, -2 // Increment the dataptr
++ bfins r3, r3, 16, 16
++ punpckub.h r2, r1:t
++ padd.h r2, r2, r3
++ punpckub.h r1, r1:b
++ padd.h r1, r1, r3
++ packsh.ub r1, r2, r1
++ punpckub.h r2, r0:t
++ padd.h r2, r2, r3
++ punpckub.h r0, r0:b
++ padd.h r0, r0, r3
++ packsh.ub r0, r2, r0
++ st.d r12[0], r0
++ add r12, r9 // increment rfp
++ stdsp SP[0], r12
++
++ sub loop_cnt, 1//Decrement loop counter
++ brne FOR_COLOUMN//Perform loop one more time if loop_cnt is not zero
++
++ sub sp, -8
++ popm r0-r3, r4-r7, pc//Pop back registers and PC
++
++AC_COLOUMN:
++
++ ld.w r12, pc[coef_table - .]
++ ld.w r9, pc[coef_table - . + 4]
++
++ addhh.w r4, r2:t, r2:b
++ mulhh.w r4, r4:b, r12:t // r4 = z1
++ mulhh.w r5, r2:b, r12:b
++ ld.w r12, pc[coef_table - . + 8]
++ mulhh.w r6, r2:t, r9:t
++ add r5, r4 // r5 = tmp2
++ add r6, r4 // r6 = tmp3
++
++ addhh.w r7, r3:t, r3:b
++ subhh.w r8, r3:t, r3:b
++
++ lsl r7, CONST_BITS
++ lsl r8, CONST_BITS
++
++ add r2, r7, r6 // r2 = tmp10
++ sub r3, r7, r6 // r3 = tmp13
++ add r4, r8, r5 // r4 = tmp11
++ sub r5, r8, r5 // r5 = tmp12
++
++ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
++ addhh.w r7, r6:t, r6:b
++ mulhh.w r7, r7:b, r9:b // r7 = z5
++
++ ld.w r9, pc[coef_table - . + 12]
++ mulhh.w r8, r6:b, r12:t // r8 = z3
++ mulhh.w r6, r6:t, r12:b // r6 = z4
++
++ add r8, r7
++ add r6, r7
++
++ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
++
++ mulhh.w r12, r0:b, r9:t // r12 = tmp0
++ mulhh.w r0, r0:t, r9:b // r0 = tmp1
++ ld.w r9, pc[coef_table - . + 16]
++ add r12, r8
++ add r0, r6
++
++ ld.w lr, pc[coef_table - . + 20]
++ machh.w r8, r1:b, r9:t // r8 = tmp2
++ machh.w r6, r1:t, r9:b // r6 = tmp3
++ mulhh.w r9, r7:b, lr:t // r9 = z1
++ mulhh.w r7, r7:t, lr:b // r7 = z2
++
++
++ add r12, r9
++ add r0, r7
++ add r8, r7
++ add r6, r9
++
++ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
++ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
++ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
++ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
++ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
++ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
++ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
++ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
++
++ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
++
++ packw.sh r1, r1, r6
++ packw.sh r8, r8, r0
++ packw.sh r3, r3, r5
++ packw.sh r4, r4, r2
++
++ lddsp r12, SP[0] // rfp
++ lddsp r9, SP[4] // iinc
++ ld.d r6, r12[0]
++ sub r10, -2 // Increment the dataptr
++ punpckub.h r0, r7:t
++ padd.h r1, r1, r0
++ punpckub.h r0, r7:b
++ padd.h r8, r8, r0
++ packsh.ub r7, r1, r8
++ punpckub.h r0, r6:t
++ padd.h r3, r3, r0
++ punpckub.h r0, r6:b
++ padd.h r4, r4, r0
++ packsh.ub r6, r3, r4
++ st.d r12[0], r6
++ add r12, r9 // increment rfp
++ stdsp SP[0], r12
++
++ sub loop_cnt, 1 //Decrement loop counter
++ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
++
++ sub sp, -8
++ popm r0-r3, r4-r7, pc //Pop back registers and PC
++
++
++
++//Coefficient table: 12 halfword constants, read two at a time by the pc-relative ld.w loads in idct_add_avr32
++ .align 2
++coef_table:
++ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
++ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
++ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
++
++
++idct_put_avr32:
++ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
++ // Register use below: r12 = rfp (where the result pixels are stored), r11 = iinc, r10 = DCT block
++ //; Give room for some variables on the stack
++ sub sp, 8
++ stdsp SP[0], r12 // rfp
++ stdsp SP[4], r11 // iinc
++
++ mov loop_cnt, 8 //Initialize loop counter
++
++0:
++
++ ldm r10, r0, r1, r2, r3 //Load 8 DCT-coefficients from the current row in the DCT-block
++ mov r6, 0
++#ifdef USE_PREFETCH
++ pref r10[LINE_SIZE] //Prefetch next line
++#endif
++ or r4, r2, r3 << 16
++ or r4, r1 //Check if all DCT-coefficients except the DC are zero
++ or r4, r0
++ brne 1f //If there are non-zero AC coefficients perform row-transform
++
++ paddsub.h r5, r3:t, r6:b //Copy the DC-coeff (r3:t) into both halves of r5 (r6 is zero)
++ plsl.h r5, r5, PASS1_BITS
++ mov r4, r5
++ st.d r10++, r4
++ st.d r10++, r4
++
++ sub loop_cnt, 1 //Decrement loop counter
++ brne 0b //Perform loop one more time if loop_cnt is not zero
++
++ bral 2f //Perform column transform after row transform is computed
++
++1:
++
++ ld.w r12, pc[coef_table_copy - .]
++ ld.w r9, pc[coef_table_copy - . + 4]
++
++ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
++ mulhh.w r5, r4:t, r12:t
++ mulhh.w r6, r0:t, r12:b
++ ld.w r12, pc[coef_table_copy - . + 8]
++ mulhh.w r7, r2:t, r9:t
++ add r6, r5 // tmp2
++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
++ add r7, r5 // tmp3
++ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
++
++ paddsub.h r5, r3:t, r1:t
++ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
++
++ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
++ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
++
++
++
++ addhh.w lr, r3:b, r1:b // lr = z4
++ addhh.w r5, r4:b, lr:b
++ mulhh.w r5, r5:b, r9:b // r5 = z5
++
++ ld.w r9, pc[coef_table_copy - . + 12]
++ mulhh.w r4, r4:b, r12:t // r4 = z3
++ mulhh.w lr, lr:b, r12:b // lr = z4
++
++ add r4, r5
++ add lr, r5
++
++ addhh.w r5, r2:b, r1:b // r5 = z2
++ addhh.w r8, r3:b, r0:b // r8 = z1
++
++
++ mulhh.w r0, r0:b, r9:t // r0 = tmp0
++ ld.w r12, pc[coef_table_copy - . + 16]
++ mulhh.w r1, r1:b, r9:b // r1 = tmp1
++ ld.w r9, pc[coef_table_copy - . + 20]
++ mulhh.w r2, r2:b, r12:t // r2 = tmp2
++ mulhh.w r3, r3:b, r12:b // r3 = tmp3
++ mulhh.w r8, r8:b, r9:t // r8 = z1
++ mulhh.w r5, r5:b, r9:b // r5 = z2
++
++
++ add r0, r8
++ add r0, r4
++ add r1, r5
++ add r1, lr
++ add r2, r5
++ add r2, r4
++ add r3, r8
++ add r3, lr
++
++ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
++
++ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
++ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
++ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
++ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
++
++ sthh.w r10[0], r4:t, r5:t
++ sthh.w r10[4], r3:t, r2:t
++ sthh.w r10[8], r2:b, r3:b
++ sthh.w r10[12], r5:b, r4:b
++
++
++
++ sub r10, -16
++ sub loop_cnt, 1
++ brne 0b
++
++2:
++
++ sub r10, 128 //Set pointer to start of DCT block
++
++ mov loop_cnt, 8
++
++0:
++ ldins.h r3:t,r10[0] // r3:t = dataptr[0]
++ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
++ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
++ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
++ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
++ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
++ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
++ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
++
++ or r4, r1, r3 << 16
++ or r4, r2
++ or r4, r0
++ brne 1f //If there are non-zero AC coefficients perform column-transform
++
++ lddsp r12, SP[0] // rfp
++ lddsp r9, SP[4] // iinc
++ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
++ packw.sh r3, r3, r3
++ packsh.ub r3, r3, r3
++ mov r2, r3
++ st.d r12[0], r2
++ add r12, r9 // increment rfp
++ sub r10, -2 // Increment the dataptr
++ stdsp SP[0], r12
++
++ sub loop_cnt, 1//Decrement loop counter
++ brne 0b //Perform loop one more time if loop_cnt is not zero
++
++ sub sp, -8
++ popm r0-r3, r4-r7, pc//Pop back registers and PC
++
++1:
++
++ ld.w r12, pc[coef_table_copy - .]
++ ld.w r9, pc[coef_table_copy - . + 4]
++
++ addhh.w r4, r2:t, r2:b
++ mulhh.w r4, r4:b, r12:t // r4 = z1
++ mulhh.w r5, r2:b, r12:b
++ ld.w r12, pc[coef_table_copy - . + 8]
++ mulhh.w r6, r2:t, r9:t
++ add r5, r4 // r5 = tmp2
++ add r6, r4 // r6 = tmp3
++
++ addhh.w r7, r3:t, r3:b
++ subhh.w r8, r3:t, r3:b
++
++ lsl r7, CONST_BITS
++ lsl r8, CONST_BITS
++
++ add r2, r7, r6 // r2 = tmp10
++ sub r3, r7, r6 // r3 = tmp13
++ add r4, r8, r5 // r4 = tmp11
++ sub r5, r8, r5 // r5 = tmp12
++
++
++ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
++ addhh.w r7, r6:t, r6:b
++ mulhh.w r7, r7:b, r9:b // r7 = z5
++
++ ld.w r9, pc[coef_table_copy - . + 12]
++ mulhh.w r8, r6:b, r12:t // r8 = z3
++ mulhh.w r6, r6:t, r12:b // r6 = z4
++
++ add r8, r7
++ add r6, r7
++
++ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
++
++ mulhh.w r12, r0:b, r9:t // r12 = tmp0
++ mulhh.w r0, r0:t, r9:b // r0 = tmp1
++ ld.w r9, pc[coef_table_copy - . + 16]
++ add r12, r8
++ add r0, r6
++
++ ld.w lr, pc[coef_table_copy - . + 20]
++ machh.w r8, r1:b, r9:t // r8 = tmp2
++ machh.w r6, r1:t, r9:b // r6 = tmp3
++ mulhh.w r9, r7:b, lr:t // r9 = z1
++ mulhh.w r7, r7:t, lr:b // r7 = z2
++
++
++ add r12, r9
++ add r0, r7
++ add r8, r7
++ add r6, r9
++
++ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
++ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
++ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
++ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
++ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
++ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
++ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
++ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
++
++ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
++
++ packw.sh r1, r1, r6
++ packw.sh r8, r8, r0
++ packw.sh r3, r3, r5
++ packw.sh r4, r4, r2
++
++ packsh.ub r1, r1, r8
++ packsh.ub r0, r3, r4
++ lddsp r12, SP[0] // rfp
++ lddsp r9, SP[4] // iinc
++ st.d r12[0], r0
++ sub r10, -2 // Increment the dataptr
++ add r12, r9 // increment rfp
++ stdsp SP[0], r12
++
++ sub loop_cnt, 1 //Decrement loop counter
++ brne 0b //Perform loop one more time if loop_cnt is not zero
++
++ sub sp, -8
++ popm r0-r3, r4-r7, pc //Pop back registers and PC
++
++
++
++ .align 2 // Same twelve constants as coef_table; this copy is the one idct_put_avr32 loads from
++coef_table_copy:
++ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
++ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
++ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
++
++
++idct_avr32:
++ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
++ // In-place IDCT: r12 = DCT block (read by the row pass, rewritten by the column pass); sp walks a temporary block
++ //; Give room for a temporary block on the stack
++ sub sp, 8*8*2
++
++ mov loop_cnt, 8 //Initialize loop counter
++
++0:
++
++ ldm r12++, r0, r1, r2, r3 //Load 8 DCT-coefficients from the current row in the DCT-block
++ mov r6, 0
++#ifdef USE_PREFETCH
++ pref r12[LINE_SIZE] //Prefetch next line
++#endif
++ or r4, r2, r3 << 16
++ or r4, r1 //Check if all DCT-coefficients except the DC are zero
++ or r4, r0
++ brne 1f //If there are non-zero AC coefficients perform row-transform
++
++ paddsub.h r5, r3:t, r6:b //Copy the DC-coeff (r3:t) into both halves of r5 (r6 is zero)
++ plsl.h r5, r5, PASS1_BITS
++ mov r4, r5
++ st.d sp++, r4
++ st.d sp++, r4
++
++ sub loop_cnt, 1 //Decrement loop counter
++ brne 0b //Perform loop one more time if loop_cnt is not zero
++
++ bral 2f //Perform column transform after row transform is computed
++
++1:
++
++ ld.w r10, pc[coef_table_idct - .]
++ ld.w r9, pc[coef_table_idct - . + 4]
++
++ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
++ mulhh.w r5, r4:t, r10:t
++ mulhh.w r6, r0:t, r10:b
++ ld.w r10, pc[coef_table_idct - . + 8]
++ mulhh.w r7, r2:t, r9:t
++ add r6, r5 // tmp2
++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
++ add r7, r5 // tmp3
++ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
++
++ paddsub.h r5, r3:t, r1:t
++ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
++
++ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
++ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
++
++
++
++ addhh.w lr, r3:b, r1:b // lr = z4
++ addhh.w r5, r4:b, lr:b
++ mulhh.w r5, r5:b, r9:b // r5 = z5
++
++ ld.w r9, pc[coef_table_idct - . + 12]
++ mulhh.w r4, r4:b, r10:t // r4 = z3
++ mulhh.w lr, lr:b, r10:b // lr = z4
++
++ add r4, r5
++ add lr, r5
++
++ addhh.w r5, r2:b, r1:b // r5 = z2
++ addhh.w r8, r3:b, r0:b // r8 = z1
++
++
++ mulhh.w r0, r0:b, r9:t // r0 = tmp0
++ ld.w r10, pc[coef_table_idct - . + 16]
++ mulhh.w r1, r1:b, r9:b // r1 = tmp1
++ ld.w r9, pc[coef_table_idct - . + 20]
++ mulhh.w r2, r2:b, r10:t // r2 = tmp2
++ mulhh.w r3, r3:b, r10:b // r3 = tmp3
++ mulhh.w r8, r8:b, r9:t // r8 = z1
++ mulhh.w r5, r5:b, r9:b // r5 = z2
++
++
++ add r0, r8
++ add r0, r4
++ add r1, r5
++ add r1, lr
++ add r2, r5
++ add r2, r4
++ add r3, r8
++ add r3, lr
++
++ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
++
++ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
++ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
++ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
++ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
++
++ sthh.w sp[0], r4:t, r5:t
++ sthh.w sp[4], r3:t, r2:t
++ sthh.w sp[8], r2:b, r3:b
++ sthh.w sp[12], r5:b, r4:b
++
++
++
++ sub sp, -16
++ sub loop_cnt, 1
++ brne 0b
++
++2:
++
++ sub sp, 8*8*2 //Rewind sp to the start of the temporary block
++ sub r12, 8*8*2 //Set pointer to start of DCT block
++
++ mov loop_cnt, 8
++
++0:
++ ldins.h r3:t,sp[0] // r3:t = dataptr[0]
++ ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1]
++ ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2]
++ ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5]
++ ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4]
++ ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3]
++ ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6]
++ ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7]
++
++ or r4, r1, r3 << 16
++ or r4, r2
++ or r4, r0
++ brne 1f //If there are non-zero AC coefficients perform column-transform
++
++ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
++ packw.sh r3, r3, r3
++ mov r2, r3
++ st.d r12++, r2
++ st.d r12++, r2
++ sub sp, -2 // Increment the dataptr
++
++ sub loop_cnt, 1//Decrement loop counter
++ brne 0b //Perform loop one more time if loop_cnt is not zero
++
++ sub sp, -(8*8*2 - 8*2) // The 8 column steps already advanced sp by 8*2 bytes; release the rest of the block
++ popm r0-r3, r4-r7, pc//Pop back registers and PC
++
++1:
++
++ ld.w r10, pc[coef_table_idct - .]
++ ld.w r9, pc[coef_table_idct - . + 4]
++
++ addhh.w r4, r2:t, r2:b
++ mulhh.w r4, r4:b, r10:t // r4 = z1
++ mulhh.w r5, r2:b, r10:b
++ ld.w r10, pc[coef_table_idct - . + 8]
++ mulhh.w r6, r2:t, r9:t
++ add r5, r4 // r5 = tmp2
++ add r6, r4 // r6 = tmp3
++
++ addhh.w r7, r3:t, r3:b
++ subhh.w r8, r3:t, r3:b
++
++ lsl r7, CONST_BITS
++ lsl r8, CONST_BITS
++
++ add r2, r7, r6 // r2 = tmp10
++ sub r3, r7, r6 // r3 = tmp13
++ add r4, r8, r5 // r4 = tmp11
++ sub r5, r8, r5 // r5 = tmp12
++
++
++ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
++ addhh.w r7, r6:t, r6:b
++ mulhh.w r7, r7:b, r9:b // r7 = z5
++
++ ld.w r9, pc[coef_table_idct - . + 12]
++ mulhh.w r8, r6:b, r10:t // r8 = z3
++ mulhh.w r6, r6:t, r10:b // r6 = z4
++
++ add r8, r7
++ add r6, r7
++
++ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
++
++ mulhh.w r10, r0:b, r9:t // r10 = tmp0
++ mulhh.w r0, r0:t, r9:b // r0 = tmp1
++ ld.w r9, pc[coef_table_idct - . + 16]
++ add r10, r8
++ add r0, r6
++
++ ld.w lr, pc[coef_table_idct - . + 20]
++ machh.w r8, r1:b, r9:t // r8 = tmp2
++ machh.w r6, r1:t, r9:b // r6 = tmp3
++ mulhh.w r9, r7:b, lr:t // r9 = z1
++ mulhh.w r7, r7:t, lr:b // r7 = z2
++
++
++ add r10, r9
++ add r0, r7
++ add r8, r7
++ add r6, r9
++
++ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
++ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
++ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
++ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
++ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
++ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
++ add r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
++ sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
++
++ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
++ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
++
++ packw.sh r7, r1, r6
++ packw.sh r6, r8, r0
++ packw.sh r5, r3, r5
++ packw.sh r4, r4, r2
++
++ stm r12, r4-r7 // Write the 8 results of this column pass back to the block
++ sub sp, -2 // Increment the dataptr
++ sub r12, -16
++
++ sub loop_cnt, 1 //Decrement loop counter
++ brne 0b //Perform loop one more time if loop_cnt is not zero
++
++ sub sp, -(8*8*2 - 8*2) // The 8 column steps already advanced sp by 8*2 bytes; release the rest of the block
++ popm r0-r3, r4-r7, pc //Pop back registers and PC
++
++
++
++ .align 2
++coef_table_idct:
++ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
++ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
++ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
++
+diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S
+new file mode 100644
+index 0000000..07a002d
+--- /dev/null
++++ b/libavcodec/avr32/mc.S
+@@ -0,0 +1,434 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++
++ /* packedmask1 reg, round: keep only bit 0 of every byte of the packed word in reg.
++ The mask comes from r8: r8 >> 1 when round is non-zero, r8 itself otherwise (pixels8_hv loads r8 with 0x02020202 or 0x01010101). */
++ .macro packedmask1 reg, round
++ .if \round
++ and \reg, \reg, r8 >> 1
++ .else
++ and \reg, r8
++ .endif
++ .endm
++
++ /* Macro for 8 pixel wide horizontal and vertical (2x2) interpolation functions; the block is processed as two 4 pixel wide columns */
++ .macro pixels8_hv round, put
++
++
++ pushm r0-r7, lr
++
++ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
++
++ /* Rounding constant added to every byte sum: 2 when round is set, 1 otherwise */
++ .if \round
++ mov r8, lo(0x02020202)
++ orh r8, hi(0x02020202)
++ .else
++ mov r8, lo(0x01010101)
++ orh r8, hi(0x01010101)
++ .endif
++ mov r7, 2 // r7 = number of 4 pixel wide columns to process
++
++ /* Pixel naming convention :
++
++ |-----------------------------------------------------|
++ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
++ |----d00---d01---d02---d03---d04---d05---d06---d07----|
++ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
++ |-----------------------------------------------------|
++ */
++1:
++ ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 }
++ ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 }
++ mov lr, r9 // lr = row counter for this column
++ eor r2, r0, r1
++ packedmask1 r2, \round
++ add r2, r8
++
++ paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++
++ add r11, r10 // pixels += line_size
++ ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 }
++ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
++0:
++ eor r5, r1, r3
++ packedmask1 r5, \round
++ add r2, r5
++
++ paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
++ eor r6, r0, r1
++ packedmask1 r6, \round
++ add r2, r2, r6 << 1
++
++ ld.w r3, r11[r10] // r3 = next source row, bytes 0..3
++ add r11, r10 // pixels += line_size
++ ld.w r4, r11[1] // r4 = next source row, bytes 1..4
++
++ paddh.ub r0, r0, r1
++ plsr.b r2, r2, 2
++ padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 }
++
++ /* Next row */
++ .if \put
++ eor r2, r3, r4
++ packedmask1 r2, \round
++ add r2, r8
++ .else
++ ld.w r6, r12[0]
++ eor r2, r3, r4
++ packedmask1 r2, \round
++ add r2, r8
++ pavg.ub r0, r0, r6
++ .endif
++ st.w r12[0], r0 // Put data into the block
++
++ add r5, r2
++ paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++
++ eor r6, r0, r1
++ packedmask1 r6, \round
++ add r5, r5, r6 << 1
++
++ .if \put
++ paddh.ub r1, r0, r1
++ plsr.b r5, r5, 2
++ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
++ .else
++ ld.w r3, r12[r10]
++ paddh.ub r1, r0, r1
++ plsr.b r5, r5, 2
++ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
++ pavg.ub r1, r1, r3
++ .endif
++
++ st.w r12[r10], r1 // Put data into the block
++
++
++ ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 }
++ add r11, r10 // pixels += line_size
++ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
++ add r12, r12, r10 << 1 // block += 2*line_size
++ sub lr, 2 // two output rows per iteration
++ brne 0b
++
++ mul r0, r10, r9 // r0 = line_size * h
++ rsub r0, r0, 4 // r0 = 4 - (line_size * h)
++ add r11, r0
++ sub r11, r10 // pixels += 4 - (line_size * (h+1))
++ add r12, r0 // block += 4 - (line_size * (h))
++ sub r7, 1 // next 4 pixel wide column
++ brne 1b
++
++ popm r0-r7, pc
++ .endm
++
++
++ /* Macro for 8 pixel wide vertical interpolation functions (two output rows per loop iteration) */
++
++ .macro pixels8_v round, put
++ pushm r4-r7,lr
++ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
++
++ /*
++ Pixel Naming Convention :
++ |-----------------------------------------------|
++ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
++ |-d00---d01---d02---d03---d04---d05---d06---d07-|
++ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
++ |-----------------------------------------------|
++ */
++ ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 }
++ ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4
++ ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 }
++ ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 }
++ sub r10, 4 // stride -= 4
++ add r11, r11, r10 << 1 // src += 2*stride
++ sub r11, -4 // src += 4
++
++0:
++ .if \round
++ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++ .else
++ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++ .endif
++
++ .if \put
++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
++ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
++ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
++ .else
++ ld.w lr, r12[0] // lr = current dst bytes 0..3 (lr is reloaded from src below)
++ ld.w r7, r12[4] // r7 = current dst bytes 4..7 (r7 is reloaded from src below)
++ pavg.ub r5, r5, lr
++ pavg.ub r4, r4, r7
++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
++ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
++ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
++ .endif
++ add r11, r10 // src += stride
++#ifdef USE_PREFETCH
++ pref r11[0]
++#endif
++ add r12, r10 // dst += stride
++
++ .if \round
++ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++ .else
++ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++ .endif
++ .if \put
++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
++ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
++ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
++ .else
++ ld.w r8, r12[0] // r8 = current dst bytes 0..3 (r8 is reloaded from src below)
++ ld.w r6, r12[4] // r6 = current dst bytes 4..7 (r6 is reloaded from src below)
++ pavg.ub r5, r5, r8
++ pavg.ub r4, r4, r6
++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
++ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
++ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
++ .endif
++
++ add r11, r10 // src += stride
++#ifdef USE_PREFETCH
++ pref r11[0]
++#endif
++ add r12, r10 // dst += stride
++ sub r9, 2 // h -= 2
++ brne 0b
++
++ popm r4-r7,pc
++ .endm
++
++ /* Macro for 8 pixel wide horizontal interpolation functions (two output rows per loop iteration) */
++
++ .macro pixels8_h round, put
++ pushm r4-r7, lr
++
++ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
++ /*
++ Pixel Naming Convention:
++ |--------------------------------------------------------------------|
++ | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
++ |------|-------|-------|-------|-------|-------|-------|-------|-----|
++ | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
++ |--------------------------------------------------------------------|
++ */
++
++ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
++ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
++ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
++ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
++ add r11, r10 // src += stride
++
++0:
++ .if \round
++ pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++ pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++ .else
++ paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++ paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++ .endif
++ .if \put
++ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
++ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
++ .else
++ ld.w r8, r12[0] // r8 = current dst bytes 0..3 (r8 is reloaded from src below)
++ ld.w r6, r12[4] // r6 = current dst bytes 4..7 (r6 is reloaded from src below)
++ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
++ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
++ pavg.ub lr, lr, r8
++ pavg.ub r7, r7, r6
++ .endif
++ st.w r12[0], lr // dst = { d00, d01, d02, d03 }
++ st.w r12[4], r7 // dst = { d04, d05, d06, d07 }
++ ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 }
++ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
++ add r11, r10 // src += stride
++#ifdef USE_PREFETCH
++ pref r11[0]
++#endif
++ add r12, r10 // dst += stride
++
++ .if \round
++ pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++ pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++ .else
++ paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++ paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++ .endif
++ .if \put
++ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
++ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
++ .else
++ ld.w r7, r12[0] // r7 = current dst bytes 0..3 (r7 is reloaded from src below)
++ ld.w r6, r12[4] // r6 = current dst bytes 4..7 (r6 is reloaded from src below)
++ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
++ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
++ pavg.ub r5, r5, r7
++ pavg.ub r4, r4, r6
++ .endif
++ st.w r12[0], r5 // dst = { d00, d01, d02, d03 }
++ st.w r12[4], r4 // dst = { d04, d05, d06, d07 }
++ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
++ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
++ add r11, r10 // src += stride
++#ifdef USE_PREFETCH
++ pref r11[0]
++#endif
++ add r12, r10 // dst += stride
++ sub r9, 2 // h -= 2
++ brne 0b
++
++ popm r4-r7, pc
++ .endm
++
++ /* Macro for 8 pixel wide copy (put != 0) or average-with-destination (put == 0) functions; two rows per loop iteration */
++ .macro pixels8 put
++ stm --sp, r3-r7,lr
++ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
++ mov lr, r9 // lr = row counter (r9 is reused as a data register below)
++ sub r3, r10, 2 // stride2 = stride - 2
++0:
++ .if \put
++ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
++ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
++ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
++ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
++ .else
++ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
++ ld.d r4, r12[0] // r5:r4 = 8 dst bytes of the first row
++ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
++ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
++ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
++ pavg.ub r6, r6, r4
++ pavg.ub r7, r7, r5
++ ld.d r4, r12[r10] // r5:r4 = 8 dst bytes of the second row
++ .endif
++ st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 }
++ add r11, r11, r3 << 1 // src += stride2 * 2
++ .ifeq \put
++ pavg.ub r8, r8, r4
++ pavg.ub r9, r9, r5
++ .endif
++ st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 }
++ add r12, r12, r10 << 1 // dst += 2*stride
++ sub lr, 2 // two rows done
++ brne 0b
++ ldm sp++, r3-r7,pc
++
++ .endm
++ /* Exported entry points. Macro arguments are (round, put) for the _hv/_v/_h macros and (put) for pixels8; put = 0 builds the avg_* variants. */
++ .global put_no_rnd_pixels8_hv_avr32
++ .text
++put_no_rnd_pixels8_hv_avr32:
++ pixels8_hv 0, 1
++
++ .global put_pixels8_hv_avr32
++ .text
++put_pixels8_hv_avr32:
++ pixels8_hv 1, 1
++
++ .global avg_no_rnd_pixels8_hv_avr32
++ .text
++avg_no_rnd_pixels8_hv_avr32:
++ pixels8_hv 0, 0
++
++ .global avg_pixels8_hv_avr32
++ .text
++avg_pixels8_hv_avr32:
++ pixels8_hv 1, 0
++
++ .global put_no_rnd_pixels8_v_avr32
++ .text
++put_no_rnd_pixels8_v_avr32:
++ pixels8_v 0, 1
++
++ .global put_pixels8_v_avr32
++ .text
++put_pixels8_v_avr32:
++ pixels8_v 1, 1
++
++ .global avg_no_rnd_pixels8_v_avr32
++ .text
++avg_no_rnd_pixels8_v_avr32:
++ pixels8_v 0, 0
++
++ .global avg_pixels8_v_avr32
++ .text
++avg_pixels8_v_avr32:
++ pixels8_v 1, 0
++
++ .global put_no_rnd_pixels8_h_avr32
++ .text
++put_no_rnd_pixels8_h_avr32:
++ pixels8_h 0, 1
++
++ .global put_pixels8_h_avr32
++ .text
++put_pixels8_h_avr32:
++ pixels8_h 1, 1
++
++ .global avg_no_rnd_pixels8_h_avr32
++ .text
++avg_no_rnd_pixels8_h_avr32:
++ pixels8_h 0, 0
++
++ .global avg_pixels8_h_avr32
++ .text
++avg_pixels8_h_avr32:
++ pixels8_h 1, 0
++
++ .global put_pixels8_avr32
++ .global put_no_rnd_pixels8_avr32
++ .text
++put_pixels8_avr32:
++put_no_rnd_pixels8_avr32:
++ pixels8 1
++
++ .global avg_no_rnd_pixels8_avr32
++ .global avg_pixels8_avr32
++ .text
++avg_pixels8_avr32:
++avg_no_rnd_pixels8_avr32:
++ pixels8 0
+diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h
+new file mode 100644
+index 0000000..32201ba
+--- /dev/null
++++ b/libavcodec/avr32/pico.h
+@@ -0,0 +1,260 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++#ifndef __PICO_H__
++#define __PICO_H__
++
++
++
++/* Coprocessor Number */
++#define PICO_CPNO 1
++
++/* Pixel Coprocessor Register file: symbolic names for coprocessor registers cr0..cr15 */
++#define PICO_REGVECT_INPIX2 cr0
++#define PICO_REGVECT_INPIX1 cr1
++#define PICO_REGVECT_INPIX0 cr2
++#define PICO_REGVECT_OUTPIX2 cr3
++#define PICO_REGVECT_OUTPIX1 cr4
++#define PICO_REGVECT_OUTPIX0 cr5
++#define PICO_REGVECT_COEFF0_A cr6
++#define PICO_REGVECT_COEFF0_B cr7
++#define PICO_REGVECT_COEFF1_A cr8
++#define PICO_REGVECT_COEFF1_B cr9
++#define PICO_REGVECT_COEFF2_A cr10
++#define PICO_REGVECT_COEFF2_B cr11
++#define PICO_REGVECT_VMU0_OUT cr12
++#define PICO_REGVECT_VMU1_OUT cr13
++#define PICO_REGVECT_VMU2_OUT cr14
++#define PICO_REGVECT_CONFIG cr15
++/* The same registers as plain register numbers: PICO_x is n where PICO_REGVECT_x is crn */
++#define PICO_INPIX2 0
++#define PICO_INPIX1 1
++#define PICO_INPIX0 2
++#define PICO_OUTPIX2 3
++#define PICO_OUTPIX1 4
++#define PICO_OUTPIX0 5
++#define PICO_COEFF0_A 6
++#define PICO_COEFF0_B 7
++#define PICO_COEFF1_A 8
++#define PICO_COEFF1_B 9
++#define PICO_COEFF2_A 10
++#define PICO_COEFF2_B 11
++#define PICO_VMU0_OUT 12
++#define PICO_VMU1_OUT 13
++#define PICO_VMU2_OUT 14
++#define PICO_CONFIG 15
++
++/* Config Register: bit offset and width (in bits) of each field */
++#define PICO_COEFF_FRAC_BITS_OFFSET 0
++#define PICO_COEFF_FRAC_BITS_SIZE 4
++#define PICO_OFFSET_FRAC_BITS_OFFSET 4
++#define PICO_OFFSET_FRAC_BITS_SIZE 4
++#define PICO_INPUT_MODE_OFFSET 8
++#define PICO_INPUT_MODE_SIZE 2
++#define PICO_OUTPUT_MODE_OFFSET 10
++#define PICO_OUTPUT_MODE_SIZE 1
++
++struct pico_config_t { /* memory image passed to ldcm.d/stcm.d by set_pico_config()/get_pico_config() */
++ unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE; /* 21 unnamed padding bits (32 - 10 - 1) */
++ unsigned int output_mode : PICO_OUTPUT_MODE_SIZE;
++ unsigned int input_mode : PICO_INPUT_MODE_SIZE;
++ unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE;
++ unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;
++ int vmu2_out; /* NOTE(review): field order presumably mirrors the CONFIG, VMUx_OUT, COEFFx register transfer order - confirm against the PICO documentation */
++ int vmu1_out;
++ int vmu0_out;
++ short coeff2_2;
++ short coeff2_3;
++ short coeff2_0;
++ short coeff2_1;
++ short coeff1_2;
++ short coeff1_3;
++ short coeff1_0;
++ short coeff1_1;
++ short coeff0_2;
++ short coeff0_3;
++ short coeff0_0;
++ short coeff0_1;
++};
++
++/* Build (PICO_x) and extract (GET_PICO_x) config register fields; the argument is parenthesized so expressions such as a | b expand correctly. */
++#define PICO_COEFF_FRAC_BITS(x) ((x) << PICO_COEFF_FRAC_BITS_OFFSET)
++#define PICO_OFFSET_FRAC_BITS(x) ((x) << PICO_OFFSET_FRAC_BITS_OFFSET)
++#define PICO_INPUT_MODE(x) ((x) << PICO_INPUT_MODE_OFFSET)
++#define PICO_OUTPUT_MODE(x) ((x) << PICO_OUTPUT_MODE_OFFSET)
++
++#define GET_PICO_COEFF_FRAC_BITS(x) (((x) >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
++#define GET_PICO_OFFSET_FRAC_BITS(x) (((x) >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
++#define GET_PICO_INPUT_MODE(x) (((x) >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1))
++#define GET_PICO_OUTPUT_MODE(x) (((x) >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1))
++/* Values of the input_mode and output_mode config fields, as decoded by dump_pico_config() */
++enum pico_input_mode { PICO_TRANSFORMATION_MODE, /* = 0 */
++ PICO_HOR_FILTER_MODE, /* = 1 */
++ PICO_VERT_FILTER_MODE }; /* = 2 */
++
++enum pico_output_mode { PICO_PACKED_MODE, /* = 0 */
++ PICO_PLANAR_MODE }; /* = 1 */
++
++/* Bits in coefficients */
++#define PICO_COEFF_BITS 12
++
++/* Operation bits (presumably the op argument of PICO_OP() - confirm against callers) */
++#define PICO_MATRIX (0)
++#define PICO_USE_ACC (1 << 2)
++#define PICO_SINGLE_VECTOR (1 << 3)
++
++
++#define __str(x...) #x /* stringify the arguments as written */
++#define __xstr(x...) __str(x) /* macro-expand the arguments first, then stringify */
++/* Single-register moves: PUT/GET use compiler builtins, MVRC/MVCR emit the instruction via inline asm. The PUT, MVCR and MVRC forms already end in ';'. */
++#define PICO_PUT_W(pico_reg, x) \
++ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
++#define PICO_GET_W(pico_reg) \
++ __builtin_mvcr_w(PICO_CPNO, pico_reg)
++
++#define PICO_MVCR_W(x, pico_reg) \
++ asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
++
++#define PICO_MVRC_W(pico_reg, x) \
++ asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
++
++#define PICO_PUT_D(pico_reg, x) \
++ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
++#define PICO_GET_D(pico_reg) \
++ __builtin_mvcr_d(PICO_CPNO, pico_reg)
++
++#define PICO_MVCR_D(x, pico_reg) \
++ asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
++#define PICO_MVRC_D(pico_reg, x) \
++ asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
++/* Multi-register store (STCM) / load (LDCM) through ptr. They access memory not named in the operands, hence the "memory" clobber. Each macro already ends in ';'. */
++#define PICO_STCM_W(ptr, pico_regs...) \
++ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++#define PICO_STCM_D(ptr, pico_regs...) \
++ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++
++#define PICO_STCM_W_DEC(ptr, pico_regs...) \
++ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) : : "memory");
++#define PICO_STCM_D_DEC(ptr, pico_regs...) \
++ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) : : "memory");
++
++#define PICO_LDCM_W(ptr, pico_regs...) \
++ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++#define PICO_LDCM_D(ptr, pico_regs...) \
++ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++
++#define PICO_LDCM_W_INC(ptr, pico_regs...) \
++ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) : : "memory");
++#define PICO_LDCM_D_INC(ptr, pico_regs...) \
++ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) : : "memory");
++/* Issue a coprocessor operation through __builtin_cop(); op and dst_addr are or'ed into its last argument. Already ends in ';'. */
++#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
++ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
++/* Load coprocessor registers cr6..cr15 (COEFF0_A .. CONFIG) from *config with one ldcm.d. */
++static inline void set_pico_config(struct pico_config_t *config){
++ PICO_LDCM_D(config,
++ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
++ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
++ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
++ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
++}
++/* Store coprocessor registers cr6..cr15 (COEFF0_A .. CONFIG) into *config with one stcm.d. */
++static inline void get_pico_config(struct pico_config_t *config){
++ PICO_STCM_D(config,
++ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
++ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
++ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
++ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
++ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
++}
++/* Read the PICO configuration back with get_pico_config() and print every field via av_log(); coeffN_3 is scaled by offset_frac_bits, all other coefficients by coeff_frac_bits. */
++static inline void dump_pico_config(void){
++ struct pico_config_t pico_config;
++ const char *input_mode, *output_mode;
++ get_pico_config(&pico_config);
++
++
++ av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
++ av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
++ av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
++
++ switch ( pico_config.input_mode ){
++ case PICO_TRANSFORMATION_MODE:
++ input_mode = "Transformation Mode";
++ break;
++ case PICO_HOR_FILTER_MODE:
++ input_mode = "Horisontal Filter Mode";
++ break;
++ case PICO_VERT_FILTER_MODE:
++ input_mode = "Vertical Filter Mode";
++ break;
++ default:
++ input_mode = "Unknown Mode!!";
++ break;
++ }
++ av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
++
++ switch ( pico_config.output_mode ){
++ case PICO_PLANAR_MODE:
++ output_mode = "Planar Mode";
++ break;
++ case PICO_PACKED_MODE:
++ output_mode = "Packed Mode";
++ break;
++ default:
++ output_mode = "Unknown Mode!!";
++ break;
++ }
++
++ av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
++
++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
++
++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
++
++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
++}
++
++
++
++#endif
++
+diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
+index 26b4f8d..1f8fabf 100644
+--- a/libavcodec/bitstream.h
++++ b/libavcodec/bitstream.h
+@@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM {
+ #endif
+
+ /* used to avoid missaligned exceptions on some archs (alpha, ...) */
+-#if defined(ARCH_X86) || defined(ARCH_X86_64)
++#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32)
+ # define unaligned16(a) (*(const uint16_t*)(a))
+ # define unaligned32(a) (*(const uint32_t*)(a))
+ # define unaligned64(a) (*(const uint64_t*)(a))
+@@ -813,6 +813,44 @@ void free_vlc(VLC *vlc);
+ * if the vlc code is invalid and max_depth>1 than the number of bits removed
+ * is undefined
+ */
++/* AVR32 variant: for the first two table levels the {code, n} pair is fetched with one unaligned32() load and unpacked through a union; the third level still indexes the table directly. */
++#if defined(ARCH_AVR32)
++#define GET_VLC(code, name, gb, table, bits, max_depth)\
++{\
++ int n, index, nb_bits;\
++ union { VLC_TYPE vlc[2];\
++ uint32_t u32; } table_elem;\
++\
++ index= SHOW_UBITS(name, gb, bits);\
++ table_elem.u32 = unaligned32(&table[index]); \
++ code = table_elem.vlc[0];\
++ n = table_elem.vlc[1];\
++\
++ if(max_depth > 1 && n < 0 ){\
++ LAST_SKIP_BITS(name, gb, bits)\
++ UPDATE_CACHE(name, gb)\
++\
++ nb_bits = -n;\
++\
++ index= SHOW_UBITS(name, gb, nb_bits) + code;\
++ table_elem.u32 = unaligned32(&table[index]); \
++ code = table_elem.vlc[0];\
++ n = table_elem.vlc[1];\
++ if(max_depth > 2 && n < 0){\
++ LAST_SKIP_BITS(name, gb, nb_bits)\
++ UPDATE_CACHE(name, gb)\
++\
++ nb_bits = -n;\
++\
++ index= SHOW_UBITS(name, gb, nb_bits) + code;\
++ code = table[index][0];\
++ n = table[index][1];\
++ }\
++ }\
++ SKIP_BITS(name, gb, n)\
++}
++
++#else
+ #define GET_VLC(code, name, gb, table, bits, max_depth)\
+ {\
+ int n, index, nb_bits;\
+@@ -821,7 +859,7 @@ void free_vlc(VLC *vlc);
+ code = table[index][0];\
+ n = table[index][1];\
+ \
+- if(max_depth > 1 && n < 0){\
++ if(max_depth > 1 && n < 0 ){\
+ LAST_SKIP_BITS(name, gb, bits)\
+ UPDATE_CACHE(name, gb)\
+ \
+@@ -843,7 +881,38 @@ void free_vlc(VLC *vlc);
+ }\
+ SKIP_BITS(name, gb, n)\
+ }
++#endif
+
++#if defined(ARCH_AVR32) /* AVR32 variant: each RL_VLC_ELEM is fetched with one unaligned32() load and unpacked through a union */
++#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
++{\
++ int n, index, nb_bits;\
++ union { RL_VLC_ELEM vlc;\
++ uint32_t u32; } table_elem;\
++\
++ index= SHOW_UBITS(name, gb, bits);\
++ table_elem.u32 = unaligned32(&table[index]); \
++ level = table_elem.vlc.level;\
++ n = table_elem.vlc.len;\
++\
++ if(max_depth > 1 && n < 0 ){\
++ SKIP_BITS(name, gb, bits)\
++ if(need_update){\
++ UPDATE_CACHE(name, gb)\
++ }\
++\
++ nb_bits = -n;\
++\
++ index= SHOW_UBITS(name, gb, nb_bits) + level;\
++ table_elem.u32 = unaligned32(&table[index]); \
++ level = table_elem.vlc.level;\
++ n = table_elem.vlc.len;\
++ }\
++ run= table_elem.vlc.run;\
++ SKIP_BITS(name, gb, n)\
++}
++
++#else
+ #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
+ {\
+ int n, index, nb_bits;\
+@@ -852,7 +921,7 @@ void free_vlc(VLC *vlc);
+ level = table[index].level;\
+ n = table[index].len;\
+ \
+- if(max_depth > 1 && n < 0){\
++ if(max_depth > 1 && n < 0 ){\
+ SKIP_BITS(name, gb, bits)\
+ if(need_update){\
+ UPDATE_CACHE(name, gb)\
+@@ -867,7 +936,7 @@ void free_vlc(VLC *vlc);
+ run= table[index].run;\
+ SKIP_BITS(name, gb, n)\
+ }
+-
++#endif
+
+ /**
+ * parses a vlc code, faster then get_vlc()
+diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
+index 56c42b9..8fc10c6 100644
+--- a/libavcodec/dsputil.c
++++ b/libavcodec/dsputil.c
+@@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
+ #ifdef ARCH_BFIN
+ dsputil_init_bfin(c,avctx);
+ #endif
++#ifdef ARCH_AVR32
++ dsputil_init_avr32(c,avctx);
++#endif
+
+ for(i=0; i<64; i++){
+ if(!c->put_2tap_qpel_pixels_tab[0][i])
+diff --git a/libavcodec/h264.c b/libavcodec/h264.c
+index 865e80a..8f7c3f1 100644
+--- a/libavcodec/h264.c
++++ b/libavcodec/h264.c
+@@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){
+
+ static void init_dequant8_coeff_table(H264Context *h){
+ int i,q,x;
++#ifdef ARCH_AVR32
++ const int transpose = 0;
++#else
+ const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
++#endif
++
+ h->dequant8_coeff[0] = h->dequant8_buffer[0];
+ h->dequant8_coeff[1] = h->dequant8_buffer[1];
+
+@@ -3281,7 +3286,13 @@ static void init_dequant8_coeff_table(H264Context *h){
+
+ static void init_dequant4_coeff_table(H264Context *h){
+ int i,j,q,x;
++ // Yes this is ugly as hell....
++#ifdef ARCH_AVR32
++ const int transpose = 0;
++#else
+ const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
++#endif
++
+ for(i=0; i<6; i++ ){
+ h->dequant4_coeff[i] = h->dequant4_buffer[i];
+ for(j=0; j<i; j++){
+@@ -4663,7 +4674,11 @@ static int decode_slice_header(H264Context *h){
+ if (MPV_common_init(s) < 0)
+ return -1;
+
++#ifdef ARCH_AVR32
++ if ( 1 ){
++#else
+ if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
++#endif
+ memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
+ memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
+ }else{
+diff --git a/libavutil/common.h b/libavutil/common.h
+index 3ae5971..7e52b90 100644
+--- a/libavutil/common.h
++++ b/libavutil/common.h
+@@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c)
+ * @param amax maximum value of the clip range
+ * @return cliped value
+ */
++#if defined(ARCH_AVR32)
++#define clip(a, amin, amax) /* AVR32: max(min(a, amax), amin) using the min and max instructions */ \
++ ({ int __tmp__; \
++ asm ("min\t%0, %1, %2\n" \
++ "max\t%0, %0, %3\n" \
++ : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \
++ __tmp__; })
++#else
+ static inline int clip(int a, int amin, int amax)
+ {
+ if (a < amin) return amin;
+ else if (a > amax) return amax;
+ else return a;
+ }
++#endif
+
+ /**
+ * clip a signed integer value into the 0-255 range
+ * @param a value to clip
+ * @return cliped value
+ */
++#if defined(ARCH_AVR32)
++#define clip_uint8(a) /* AVR32: one satu instruction (no shift, 8 bits); the expression has type int */ \
++ ({ int __tmp__ = a; \
++ asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \
++ __tmp__; })
++#else
+ static inline uint8_t clip_uint8(int a)
+ {
+ if (a&(~255)) return (-a)>>31;
+ else return a;
+ }
++#endif
+
+ /* math */
+ int64_t ff_gcd(int64_t a, int64_t b);
+diff --git a/libavutil/internal.h b/libavutil/internal.h
+index 285d304..a8b0718 100644
+--- a/libavutil/internal.h
++++ b/libavutil/internal.h
+@@ -210,6 +210,15 @@ if((y)<(x)){\
+ }\
+ }
+
++/* XXX: Hack for uclibc which declares lrintf but does not implement it: map lrintf()/llrint() onto rint() and force HAVE_LRINTF so the fallback definitions below are skipped. */
++#ifdef ARCH_AVR32
++#undef HAVE_LRINTF
++#define HAVE_LRINTF 1
++#define lrintf(x) rint(x)
++#define llrint(x) (long long)rint(x)
++#endif
++
++
+ #ifndef HAVE_LRINTF
+ /* XXX: add ISOC specific test to avoid specific BSD testing. */
+ /* better than nothing implementation. */
+diff --git a/libfaad2/common.h b/libfaad2/common.h
+index f809042..6c5fb21 100644
+--- a/libfaad2/common.h
++++ b/libfaad2/common.h
+@@ -67,7 +67,7 @@ extern "C" {
+ /* Use if target platform has address generators with autoincrement */
+ //#define PREFER_POINTERS
+
+-#if defined(_WIN32_WCE) || defined(__arm__)
++#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__)
+ #define FIXED_POINT
+ #endif
+
+diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c
+index 076359a..51b77fe 100644
+--- a/libmpcodecs/ad_libmad.c
++++ b/libmpcodecs/ad_libmad.c
+@@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){
+ sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2;
+ sh->samplerate=this->frame.header.samplerate;
+ sh->i_bps=this->frame.header.bitrate/8;
++#ifdef WORDS_BIGENDIAN
++ sh->sample_format = AF_FORMAT_S16_BE;
++#else
++ sh->sample_format = AF_FORMAT_S16_LE;
++#endif
+ sh->samplesize=2;
+
+ return 1;
+diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h
+new file mode 100644
+index 0000000..7ac6200
+--- /dev/null
++++ b/libswscale/pico-avr32.h
+@@ -0,0 +1,137 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++#ifndef __PICO_H__
++#define __PICO_H__
++
++/* Coprocessor Number */
++#define PICO_CPNO 1
++
++/* Pixel Coprocessor Register file */
++#define PICO_REGVECT_INPIX2 cr0
++#define PICO_REGVECT_INPIX1 cr1
++#define PICO_REGVECT_INPIX0 cr2
++#define PICO_REGVECT_OUTPIX2 cr3
++#define PICO_REGVECT_OUTPIX1 cr4
++#define PICO_REGVECT_OUTPIX0 cr5
++#define PICO_REGVECT_COEFF0_A cr6
++#define PICO_REGVECT_COEFF0_B cr7
++#define PICO_REGVECT_COEFF1_A cr8
++#define PICO_REGVECT_COEFF1_B cr9
++#define PICO_REGVECT_COEFF2_A cr10
++#define PICO_REGVECT_COEFF2_B cr11
++#define PICO_REGVECT_VMU0_OUT cr12
++#define PICO_REGVECT_VMU1_OUT cr13
++#define PICO_REGVECT_VMU2_OUT cr14
++#define PICO_REGVECT_CONFIG cr15
++
++#define PICO_INPIX2 0
++#define PICO_INPIX1 1
++#define PICO_INPIX0 2
++#define PICO_OUTPIX2 3
++#define PICO_OUTPIX1 4
++#define PICO_OUTPIX0 5
++#define PICO_COEFF0_A 6
++#define PICO_COEFF0_B 7
++#define PICO_COEFF1_A 8
++#define PICO_COEFF1_B 9
++#define PICO_COEFF2_A 10
++#define PICO_COEFF2_B 11
++#define PICO_VMU0_OUT 12
++#define PICO_VMU1_OUT 13
++#define PICO_VMU2_OUT 14
++#define PICO_CONFIG 15
++
++/* Config Register */
++#define PICO_COEFF_FRAC_BITS 0
++#define PICO_COEFF_FRAC_BITS_WIDTH 4
++#define PICO_OFFSET_FRAC_BITS 4
++#define PICO_OFFSET_FRAC_BITS_WIDTH 4
++#define PICO_INPUT_MODE 8
++#define PICO_INPUT_MODE_WIDTH 2
++#define PICO_OUTPUT_MODE 10
++
++#define PICO_TRANSFORMATION_MODE 0
++#define PICO_HOR_FILTER_MODE 1
++#define PICO_VERT_FILTER_MODE 2
++
++#define PICO_PLANAR_MODE 1
++#define PICO_PACKED_MODE 0
++
++/* Bits in coefficients */
++#define PICO_COEFF_BITS 12
++
++/* Operation bits */
++#define PICO_USE_ACC (1 << 2)
++#define PICO_SINGLE_VECTOR (1 << 3)
++
++
++#define __str(x...) #x
++#define __xstr(x...) __str(x)
++
++#define PICO_PUT_W(pico_reg, x) \
++ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
++#define PICO_GET_W(pico_reg) \
++ __builtin_mvcr_w(PICO_CPNO, pico_reg)
++
++#define PICO_PUT_D(pico_reg, x) \
++ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
++#define PICO_GET_D(pico_reg) \
++ __builtin_mvcr_d(PICO_CPNO, pico_reg)
++
++/* Store PICO registers to memory at ptr. The "memory" clobber tells GCC that the asm writes memory it cannot see through the operands. */
++#define PICO_STCM_W(ptr, pico_regs...) \
++  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++#define PICO_STCM_D(ptr, pico_regs...) \
++  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++/* Same stores using the "--%0" addressing form; ptr is an in/out operand and is updated by the instruction. */
++#define PICO_STCM_W_DEC(ptr, pico_regs...) \
++  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory");
++#define PICO_STCM_D_DEC(ptr, pico_regs...) \
++  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory");
++/* Load PICO registers from memory at ptr. The "memory" clobber makes GCC complete earlier stores to that memory before the asm runs. */
++#define PICO_LDCM_W(ptr, pico_regs...) \
++  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++#define PICO_LDCM_D(ptr, pico_regs...) \
++  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory");
++/* Same loads using the "%0++" addressing form; ptr is an in/out operand and is updated by the instruction. */
++#define PICO_LDCM_W_INC(ptr, pico_regs...) \
++  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory");
++#define PICO_LDCM_D_INC(ptr, pico_regs...) \
++  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory");
++
++#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
++ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
++
++
++#endif
++
+diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
+index ecd28f5..3221d0c 100644
+--- a/libswscale/swscale_internal.h
++++ b/libswscale/swscale_internal.h
+@@ -173,7 +173,7 @@ typedef struct SwsContext{
+ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
+ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
+
+-char *sws_format_name(int format);
++char *sws_format_name(enum PixelFormat format);
+
+ //FIXME replace this with something faster
+ #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \
+diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
+index 71759bc..fa83985 100644
+--- a/libswscale/yuv2rgb.c
++++ b/libswscale/yuv2rgb.c
+@@ -44,6 +44,10 @@
+ #include "yuv2rgb_mlib.c"
+ #endif
+
++#ifdef ARCH_AVR32
++#include "yuv2rgb_avr32.c"
++#endif
++
+ #define DITHER1XBPP // only for mmx
+
+ const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
+@@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
+ if(t) return t;
+ }
+ #endif
++#ifdef ARCH_AVR32
++ {
++ SwsFunc t= yuv2rgb_init_avr32(c);
++ if(t) return t;
++ }
++#endif
+ #ifdef HAVE_ALTIVEC
+ if (c->flags & SWS_CPU_CAPS_ALTIVEC)
+ {
+@@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
+ //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
+ oy -= 256*brightness;
+
++#ifdef ARCH_AVR32
++ yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
++#endif
++
+ for (i = 0; i < 1024; i++) {
+ int j;
+
+diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c
+new file mode 100644
+index 0000000..4a8341e
+--- /dev/null
++++ b/libswscale/yuv2rgb_avr32.c
+@@ -0,0 +1,416 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++#include "pico-avr32.h"
++
++/* RGB(uv_part): load the table entries for one chroma sample into the enclosing r, g, b variables; uv_part is the byte-selector suffix applied to the packed U and V words. */
++#define RGB(uv_part) \
++    __asm__ volatile ( \
++        "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t"  /* r (used as scratch) = c->table_gV[V] */ \
++        "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t"  /* g = c->table_gU[U] */ \
++        "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t"  /* b = c->table_bU[U] */ \
++        "add\t%1, %0\n\t"                        /* g += scratch */\
++        "ld.w\t%0, %6[%7:" uv_part " << 2]"      /* r = c->table_rV[V] */ \
++        : "=&r" (r), "=&r" (g), "=&r" (b) \
++        : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
++          "r" (&c->table_rV[0]), "r" (V), "r" (U));
++
++/* YUV2RGB1: convert luma bytes src[2*idx] and src[2*idx+1] into two packed R,G,B pixels at dst[6*idx .. 6*idx+5] using the enclosing r/g/b table pointers; y and tmp are scratch. */
++#undef YUV2RGB1
++#define YUV2RGB1(dst, src, y, idx) \
++    { int tmp2; __asm__ volatile ( \
++        "ld.ub\t%0, %3[2*%8]\n\t"       /* y    = src[2*idx]          */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 0], %1\n\t"    /* dst[6*idx + 0] = tmp  (R)  */ \
++        "st.b\t%7[6*%8 + 1], %2\n\t"    /* dst[6*idx + 1] = tmp2 (G)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "ld.ub\t%0, %3[2*%8 + 1]\n\t"   /* y    = src[2*idx + 1]      */ \
++        "st.b\t%7[6*%8 + 2], %1\n\t"    /* dst[6*idx + 2] = tmp  (B)  */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 3], %1\n\t"    /* dst[6*idx + 3] = tmp  (R)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "st.b\t%7[6*%8 + 4], %2\n\t"    /* dst[6*idx + 4] = tmp2 (G)  */ \
++        "st.b\t%7[6*%8 + 5], %1"        /* dst[6*idx + 5] = tmp  (B)  */ \
++        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory"); }
++/* YUV2RGB2: same instruction sequence as YUV2RGB1 - two luma bytes at src[2*idx] become two packed R,G,B pixels at dst[6*idx .. 6*idx+5]; y and tmp are scratch. */
++#undef YUV2RGB2
++#define YUV2RGB2(dst, src, y, idx) \
++    { int tmp2; __asm__ volatile ( \
++        "ld.ub\t%0, %3[2*%8]\n\t"       /* y    = src[2*idx]          */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 0], %1\n\t"    /* dst[6*idx + 0] = tmp  (R)  */ \
++        "st.b\t%7[6*%8 + 1], %2\n\t"    /* dst[6*idx + 1] = tmp2 (G)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "ld.ub\t%0, %3[2*%8 + 1]\n\t"   /* y    = src[2*idx + 1]      */ \
++        "st.b\t%7[6*%8 + 2], %1\n\t"    /* dst[6*idx + 2] = tmp  (B)  */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 3], %1\n\t"    /* dst[6*idx + 3] = tmp  (R)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "st.b\t%7[6*%8 + 4], %2\n\t"    /* dst[6*idx + 4] = tmp2 (G)  */ \
++        "st.b\t%7[6*%8 + 5], %1"        /* dst[6*idx + 5] = tmp  (B)  */ \
++        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory"); }
++
++/* YUV2BGR1: convert luma bytes src[2*idx] and src[2*idx+1] into two packed B,G,R pixels at dst[6*idx .. 6*idx+5] using the enclosing r/g/b table pointers; y and tmp are scratch. */
++#undef YUV2BGR1
++#define YUV2BGR1(dst, src, y, idx) \
++    { int tmp2; __asm__ volatile ( \
++        "ld.ub\t%0, %3[2*%8]\n\t"       /* y    = src[2*idx]          */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 2], %1\n\t"    /* dst[6*idx + 2] = tmp  (R)  */ \
++        "st.b\t%7[6*%8 + 1], %2\n\t"    /* dst[6*idx + 1] = tmp2 (G)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "ld.ub\t%0, %3[2*%8 + 1]\n\t"   /* y    = src[2*idx + 1]      */ \
++        "st.b\t%7[6*%8 + 0], %1\n\t"    /* dst[6*idx + 0] = tmp  (B)  */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 5], %1\n\t"    /* dst[6*idx + 5] = tmp  (R)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "st.b\t%7[6*%8 + 4], %2\n\t"    /* dst[6*idx + 4] = tmp2 (G)  */ \
++        "st.b\t%7[6*%8 + 3], %1"        /* dst[6*idx + 3] = tmp  (B)  */ \
++        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory"); }
++/* YUV2BGR2: same instruction sequence as YUV2BGR1 - two luma bytes at src[2*idx] become two packed B,G,R pixels at dst[6*idx .. 6*idx+5]; y and tmp are scratch. */
++#undef YUV2BGR2
++#define YUV2BGR2(dst, src, y, idx) \
++    { int tmp2; __asm__ volatile ( \
++        "ld.ub\t%0, %3[2*%8]\n\t"       /* y    = src[2*idx]          */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 2], %1\n\t"    /* dst[6*idx + 2] = tmp  (R)  */ \
++        "st.b\t%7[6*%8 + 1], %2\n\t"    /* dst[6*idx + 1] = tmp2 (G)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "ld.ub\t%0, %3[2*%8 + 1]\n\t"   /* y    = src[2*idx + 1]      */ \
++        "st.b\t%7[6*%8 + 0], %1\n\t"    /* dst[6*idx + 0] = tmp  (B)  */ \
++        "ld.ub\t%1, %4[%0]\n\t"         /* tmp  = r[y]                */ \
++        "ld.ub\t%2, %5[%0]\n\t"         /* tmp2 = g[y]                */ \
++        "st.b\t%7[6*%8 + 5], %1\n\t"    /* dst[6*idx + 5] = tmp  (R)  */ \
++        "ld.ub\t%1, %6[%0]\n\t"         /* tmp  = b[y]                */ \
++        "st.b\t%7[6*%8 + 4], %2\n\t"    /* dst[6*idx + 4] = tmp2 (G)  */ \
++        "st.b\t%7[6*%8 + 3], %1"        /* dst[6*idx + 3] = tmp  (B)  */ \
++        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory"); }
++
++
++/* Table-driven planar YUV -> packed BGR24 converter: writes rows srcSliceY.. of dst[0], two rows and eight pixels per inner pass; returns srcSliceH. NOTE: doubles the caller's srcStride[1..2] in place for PIX_FMT_YUV422P. */
++static int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
++                           int srcSliceH, uint8_t* dst[], int dstStride[]){
++    int y;
++
++    if(c->srcFormat == PIX_FMT_YUV422P){
++        srcStride[1] *= 2;
++        srcStride[2] *= 2;
++    }
++
++    /* Each pass handles two luma rows that share chroma row (y>>1). */
++    for(y=0; y<srcSliceH; y+=2){
++        uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
++        uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
++        uint8_t *r, *g, *b; /* table pointers filled in by RGB(), indexed by luma in YUV2BGRn() */
++        uint8_t *py_1= src[0] + y*srcStride[0];
++        uint8_t *py_2= py_1 + srcStride[0];
++        uint8_t *pu= src[1] + (y>>1)*srcStride[1];
++        uint8_t *pv= src[2] + (y>>1)*srcStride[2];
++        unsigned int h_size= c->dstW>>3;
++        while (h_size--) {
++            uint32_t U, V, Y1, Y2, tmp;
++            U = ((uint32_t*)pu)[0];
++            V = ((uint32_t*)pv)[0];
++
++            RGB("t")
++            YUV2BGR1(dst_1, py_1, Y1, 0)
++            YUV2BGR1(dst_2, py_2, Y2, 0)
++
++            RGB("u")
++            YUV2BGR2(dst_1, py_1, Y1, 1)
++            YUV2BGR2(dst_2, py_2, Y2, 1)
++
++            RGB("l")
++            YUV2BGR1(dst_1, py_1, Y1, 2)
++            YUV2BGR1(dst_2, py_2, Y2, 2)
++
++            RGB("b")
++            YUV2BGR2(dst_1, py_1, Y1, 3)
++            YUV2BGR2(dst_2, py_2, Y2, 3)
++
++
++            /* Consumed 4 U, 4 V and 8 Y bytes per row; produced 8 pixels (24 bytes) per row. */
++            pu += 4;
++            pv += 4;
++            py_1 += 8;
++            py_2 += 8;
++            dst_1 += 24;
++            dst_2 += 24;
++        }
++    }
++    return srcSliceH;
++}
++
++
++/* Table-driven planar YUV -> packed RGB24 converter: writes rows srcSliceY.. of dst[0], two rows and eight pixels per inner pass; returns srcSliceH. NOTE: doubles the caller's srcStride[1..2] in place for PIX_FMT_YUV422P. */
++static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
++                           int srcSliceH, uint8_t* dst[], int dstStride[]){
++    int y;
++
++    if(c->srcFormat == PIX_FMT_YUV422P){
++        srcStride[1] *= 2;
++        srcStride[2] *= 2;
++    }
++    for(y=0; y<srcSliceH; y+=2){
++        uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
++        uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
++        uint8_t *r, *g, *b;
++        uint8_t *py_1= src[0] + y*srcStride[0];
++        uint8_t *py_2= py_1 + srcStride[0];
++        uint8_t *pu= src[1] + (y>>1)*srcStride[1];
++        uint8_t *pv= src[2] + (y>>1)*srcStride[2];
++        unsigned int h_size= c->dstW>>3;
++        while (h_size--) {
++            uint32_t U, V, Y1, Y2, tmp;
++            U = ((uint32_t*)pu)[0];
++            V = ((uint32_t*)pv)[0];
++            /* One RGB() per chroma byte of the packed U/V words, then two luma samples on each of the two rows. */
++            RGB("t")
++            YUV2RGB1(dst_1, py_1, Y1, 0)
++            YUV2RGB1(dst_2, py_2, Y2, 0)
++
++            RGB("u")
++            YUV2RGB2(dst_1, py_1, Y1, 1)
++            YUV2RGB2(dst_2, py_2, Y2, 1)
++
++            RGB("l")
++            YUV2RGB1(dst_1, py_1, Y1, 2)
++            YUV2RGB1(dst_2, py_2, Y2, 2)
++
++            RGB("b")
++            YUV2RGB2(dst_1, py_1, Y1, 3)
++            YUV2RGB2(dst_2, py_2, Y2, 3)
++            /* Consumed 4 U, 4 V and 8 Y bytes per row; produced 8 pixels (24 bytes) per row. */
++            pu += 4;
++            pv += 4;
++            py_1 += 8;
++            py_2 += 8;
++            dst_1 += 24;
++            dst_2 += 24;
++        }
++    }
++    return srcSliceH;
++}
++/* SCALE(x, bits): drop `bits` fractional bits from x, adding half an LSB first so the result is rounded rather than truncated. */
++#define SCALE(x, bits) (((x) + (1 << ((bits) - 1))) >> (bits))
++#define COEFF_FRAC_BITS 9
++#define OFFSET_FRAC_BITS 2
++
++/* Coefficient block handed to PICO_LDCM_D for the COEFF0_A..COEFF2_B registers; filled in by yuv2rgb_c_init_tables_avr32(). NOTE(review): field order presumably mirrors the register layout - confirm before reordering. */
++static struct {
++    short coeff2_2;
++    short coeff2_3;
++    short coeff2_0;
++    short coeff2_1;
++    short coeff1_2;
++    short coeff1_3;
++    short coeff1_0;
++    short coeff1_1;
++    short coeff0_2;
++    short coeff0_3;
++    short coeff0_0;
++    short coeff0_1;
++} pico_coeff;
++
++/* PICO-coprocessor planar YUV -> packed 24-bit converter: loads pico_coeff and the config word, then converts two rows per outer pass; returns srcSliceH. NOTE: doubles the caller's srcStride[1..2] in place for PIX_FMT_YUV422P. */
++static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
++                                int srcSliceH, uint8_t* dst[], int dstStride[]){
++    int y;
++    /* The coefficient and config registers are reloaded on every call. */
++
++    /* Initialize pico */
++    PICO_LDCM_D(&pico_coeff,
++                PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
++                PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
++                PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);
++
++    PICO_PUT_W(PICO_CONFIG,
++               (PICO_PACKED_MODE << PICO_OUTPUT_MODE
++                | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE
++                | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
++                | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
++
++
++    if(c->srcFormat == PIX_FMT_YUV422P){
++        srcStride[1] *= 2;
++        srcStride[2] *= 2;
++    }
++
++    for(y=0; y<srcSliceH; y+=2){
++        uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
++        uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
++        /* Inputs are fed to the PICO one 32-bit word (four samples) at a time. */
++        uint8_t *py_1= src[0] + y*srcStride[0];
++        uint8_t *py_2= py_1 + srcStride[0];
++        uint8_t *pu= src[1] + (y>>1)*srcStride[1];
++        uint8_t *pv= src[2] + (y>>1)*srcStride[2];
++        unsigned int h_size= c->dstW>>3;
++        int *py_1_int = (int *)py_1;
++        int *py_2_int = (int *)py_2;
++        int *pu_int = (int *)pu;
++        int *pv_int = (int *)pv;
++        while (h_size--) {
++            PICO_PUT_W(PICO_INPIX0, *py_1_int++);
++            PICO_PUT_W(PICO_INPIX1, *pu_int++);
++            PICO_PUT_W(PICO_INPIX2, *pv_int++);
++            PICO_OP(0, 0, 0, 4, 8);
++            PICO_OP(0, 1, 1, 4, 8);
++            PICO_OP(0, 2, 2, 5, 9);
++            PICO_OP(0, 3, 3, 5, 9);
++            PICO_PUT_W(PICO_INPIX0, *py_1_int++); /* queue the next luma word before storing the results */
++            PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++            PICO_OP(0, 0, 0, 6, 10);
++            PICO_OP(0, 1, 1, 6, 10);
++            PICO_OP(0, 2, 2, 7, 11);
++            PICO_OP(0, 3, 3, 7, 11);
++            PICO_PUT_W(PICO_INPIX0, *py_2_int++); /* switch to the second row; the U/V words stay loaded */
++            PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++
++            PICO_OP(0, 0, 0, 4, 8);
++            PICO_OP(0, 1, 1, 4, 8);
++            PICO_OP(0, 2, 2, 5, 9);
++            PICO_OP(0, 3, 3, 5, 9);
++            PICO_PUT_W(PICO_INPIX0, *py_2_int++);
++            PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++            PICO_OP(0, 0, 0, 6, 10);
++            PICO_OP(0, 1, 1, 6, 10);
++            PICO_OP(0, 2, 2, 7, 11);
++            PICO_OP(0, 3, 3, 7, 11);
++            PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++
++            dst_1 += 24;
++            dst_2 += 24;
++        }
++    }
++    return srcSliceH;
++}
++
++extern int avr32_use_pico;
++/* Pick the AVR32 converter for c->dstFormat: PICO or table-driven, depending on avr32_use_pico; returns NULL for any format other than BGR24/RGB24. */
++SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
++    switch(c->dstFormat){
++    case PIX_FMT_BGR24:
++        {
++            if ( avr32_use_pico ){
++                MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n"); /* NOTE(review): informational message logged through MSG_ERR */
++                return yuv2bgr24_avr32_pico;
++            } else {
++                MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n");
++                return yuv2bgr24_avr32;
++            }
++        }
++        break;
++    case PIX_FMT_RGB24:
++        {
++            if ( avr32_use_pico ){
++                MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n");
++                return yuv2bgr24_avr32_pico; /* same routine: yuv2rgb_c_init_tables_avr32() swaps the R and B coefficient rows for RGB24 */
++            } else {
++                MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n");
++                return yuv2rgb24_avr32;
++            }
++        }
++    }
++    return NULL;
++}
++
++/* Fill pico_coeff with the fixed-point YUV->RGB matrix and offsets derived from inv_table, fullRange, contrast and saturation; the R and B rows are swapped according to c->dstFormat. Always returns 0. */
++int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
++    const int isRgb = (c->dstFormat == PIX_FMT_RGB24);
++
++    int64_t crv =  inv_table[0];
++    int64_t cbu =  inv_table[1];
++    int64_t cgu = -inv_table[2];
++    int64_t cgv = -inv_table[3];
++    int64_t cy  = 1<<16;
++    int64_t oy  = 0;
++
++    if(!fullRange){
++        cy= (cy*255) / 219;
++        oy= 16<<16;
++    }
++
++    cy = (cy *contrast             )>>16;
++    crv= (crv*contrast * saturation)>>32;
++    cbu= (cbu*contrast * saturation)>>32;
++    cgu= (cgu*contrast * saturation)>>32;
++    cgv= (cgv*contrast * saturation)>>32;
++
++    oy -= 256*brightness; /* NOTE(review): oy is computed but not used by the coefficients below */
++
++    pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS);  /* G <- Y */
++    pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */
++    pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */
++    pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS)
++                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */
++
++    if ( isRgb ){
++        pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS);  /* R <- Y */
++        pico_coeff.coeff0_1 = 0;                                /* R <- U */
++        pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
++        pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
++                               + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
++
++        pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS);  /* B <- Y */
++        pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
++        pico_coeff.coeff2_2 = 0;                                /* B <- V */
++        pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
++                               + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */
++    } else {
++        pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS);  /* R <- Y */
++        pico_coeff.coeff2_1 = 0;                                /* R <- U */
++        pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
++        pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
++                               + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
++
++        pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS);  /* B <- Y */
++        pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
++        pico_coeff.coeff0_2 = 0;                                /* B <- V */
++        pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
++                               + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */
++    }
++    return 0; /* declared int: return a defined value instead of falling off the end */
++}
++
++
++#undef RGB
+diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c
+index 053c193..7017770 100644
+--- a/libvo/vo_fbdev2.c
++++ b/libvo/vo_fbdev2.c
+@@ -22,6 +22,9 @@
+ #include "sub.h"
+ #include "mp_msg.h"
+
++/* Draw directly to framebuffer */
++#define USE_CONVERT2FB
++
+ static vo_info_t info = {
+ "Framebuffer Device",
+ "fbdev2",
+@@ -178,6 +181,15 @@ static int fb_preinit(int reset)
+ }
+ fb_orig_vinfo = fb_vinfo;
+
++ /* Reset panning offset */
++ fb_vinfo.yoffset = 0;
++ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
++ mp_msg(MSGT_VO, MSGL_ERR,
++ "[fbdev2] FBIOPAN_DISPLAY failed: %s\n",
++ strerror(errno));
++ return 0;
++ }
++
+ fb_bpp = fb_vinfo.bits_per_pixel;
+
+ /* 16 and 15 bpp is reported as 16 bpp */
+@@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width,
+ mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno));
+ return 1;
+ }
++#else
++ if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2)
++ && fb_vinfo.yoffset == 0)
++ center += fb_line_len * fb_vinfo.yres;
+ #endif
+ if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres);
+
+@@ -299,14 +315,22 @@ static int query_format(uint32_t format)
+ {
+ // open the device, etc.
+ if (fb_preinit(0)) return 0;
+- if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) {
++ if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) {
+ int fb_target_bpp = format & 0xff;
+ set_bpp(&fb_vinfo, fb_target_bpp);
+ fb_vinfo.xres_virtual = fb_vinfo.xres;
+- fb_vinfo.yres_virtual = fb_vinfo.yres;
++ fb_vinfo.yres_virtual = fb_vinfo.yres * 2;
+ if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
+- mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno));
+- return 0;
++ mp_msg(MSGT_VO, MSGL_WARN,
++ "[fbdev2] Can't double virtual y resolution: %s\n",
++ strerror(errno));
++ fb_vinfo.yres_virtual = fb_vinfo.yres;
++			if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
++				mp_msg(MSGT_VO, MSGL_ERR,
++				       "[fbdev2] Can't put VSCREENINFO: %s\n",
++				       strerror(errno));
++				return 0; /* 0 = format not supported, as on the other failure paths; -1 is non-zero and would read as supported */
++			}
+ }
+ fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
+ fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length +
+@@ -367,16 +391,67 @@ static void check_events(void)
+
+ static void flip_page(void)
+ {
+-#ifndef USE_CONVERT2FB
+ int i, out_offset = 0, in_offset = 0;
+
+- for (i = 0; i < in_height; i++) {
+- memcpy(center + out_offset, next_frame + in_offset,
+- in_width * fb_pixel_size);
+- out_offset += fb_line_len;
+- in_offset += in_width * fb_pixel_size;
+- }
++#ifndef USE_CONVERT2FB
++ if (1) {
++#else
++ if (fb_vinfo.yres_virtual == fb_vinfo.yres) {
+ #endif
++ for (i = 0; i < in_height; i++) {
++ memcpy(center + out_offset, next_frame + in_offset,
++ in_width * fb_pixel_size);
++ out_offset += fb_line_len;
++ in_offset += in_width * fb_pixel_size;
++ }
++ } else {
++ if (fb_vinfo.yoffset == 0) {
++ fb_vinfo.yoffset += fb_vinfo.yres;
++ center -= fb_line_len * fb_vinfo.yres;
++ } else {
++ fb_vinfo.yoffset = 0;
++ center += fb_line_len * fb_vinfo.yres;
++ }
++
++ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
++ mp_msg(MSGT_VO, MSGL_ERR,
++ "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n",
++ strerror(errno));
++ }
++ }
++}
++
++static uint32_t get_image(mp_image_t *mpi)
++{
++ if(mpi->flags&MP_IMGFLAG_READABLE)
++ return VO_FALSE; // slow video ram
++ if(mpi->type==MP_IMGTYPE_STATIC)
++ return VO_FALSE; // it is not static
++
++ if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) {
++ // we're lucky or codec accepts stride => ok, let's go!
++
++ //YUY2 and RGB formats
++ mpi->planes[0] = center;
++ mpi->width = in_width;
++ mpi->stride[0] = fb_line_len;
++
++ // center image
++
++ mpi->flags |= MP_IMGFLAG_DIRECT;
++
++ return VO_TRUE;
++ }
++
++ return VO_FALSE;
++}
++
++static uint32_t put_image(mp_image_t *mpi)
++{
++ // already out?
++ if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK)))
++ return VO_TRUE;
++ return VO_FALSE;
+ }
+
+ static void uninit(void)
+@@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...)
+ switch (request) {
+ case VOCTRL_QUERY_FORMAT:
+ return query_format(*((uint32_t*)data));
++ case VOCTRL_GET_IMAGE:
++ return get_image(data);
++ case VOCTRL_DRAW_IMAGE:
++ return put_image(data);
+ }
+ return VO_NOTIMPL;
+ }
+diff --git a/version.sh b/version.sh
+index 44b5c5d..cf22a68 100755
+--- a/version.sh
++++ b/version.sh
+@@ -1,2 +1,2 @@
+ #!/bin/sh
+-echo "#define VERSION \"1.0rc1-$1\"" > version.h
++echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h
--- /dev/null
+diff -urN MPlayer-1.0rc1-0rig/stream/stream_dvb.c MPlayer-1.0rc1/stream/stream_dvb.c
+--- MPlayer-1.0rc1-0rig/stream/stream_dvb.c 2006-10-23 00:32:25.000000000 +0200
++++ MPlayer-1.0rc1/stream/stream_dvb.c 2007-09-25 08:37:54.000000000 +0200
+@@ -37,9 +37,7 @@
+ #include <sys/poll.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+-#include <string.h>
+ #include <errno.h>
+-#include <fcntl.h>
+
+ #include "stream.h"
+ #include "libmpdemux/demuxer.h"
+@@ -168,7 +166,7 @@
+ if((line[0] == '#') || (strlen(line) == 0))
+ continue;
+
+- colon = index(line, ':');
++ colon = strchr(line, ':');
+ if(colon)
+ {
+ k = colon - line;
--- /dev/null
+#############################################################
+#
+# mplayer
+#
+#############################################################
+MPLAYER_VERSION:=1.0rc1
+MPLAYER_SOURCE:=MPlayer-$(MPLAYER_VERSION).tar.bz2
+MPLAYER_SITE:=http://www7.mplayerhq.hu/MPlayer/releases
+MPLAYER_DIR:=$(BUILD_DIR)/MPlayer-$(MPLAYER_VERSION)
+MPLAYER_CAT:=$(BZCAT)
+MPLAYER_BINARY:=mplayer
+MPLAYER_TARGET_BINARY:=usr/bin/$(MPLAYER_BINARY)
+
+ifeq ($(BR2_ENDIAN),"BIG")
+MPLAYER_ENDIAN:=--enable-big-endian
+else
+MPLAYER_ENDIAN:=--disable-big-endian
+endif
+
+# Download the source tarball into $(DL_DIR).
+$(DL_DIR)/$(MPLAYER_SOURCE):
+	$(WGET) -P $(DL_DIR) $(MPLAYER_SITE)/$(MPLAYER_SOURCE)
+
+# Unpack into $(BUILD_DIR), apply the mplayer-$(MPLAYER_VERSION)*.patch* files
+# from package/multimedia/mplayer/, run $(CONFIG_UPDATE), then drop a stamp file.
+$(MPLAYER_DIR)/.unpacked: $(DL_DIR)/$(MPLAYER_SOURCE)
+	$(MPLAYER_CAT) $(DL_DIR)/$(MPLAYER_SOURCE) | tar -C $(BUILD_DIR) $(TAR_OPTIONS) -
+	toolchain/patch-kernel.sh $(MPLAYER_DIR) package/multimedia/mplayer/ mplayer-$(MPLAYER_VERSION)\*.patch\*
+	$(CONFIG_UPDATE) $(MPLAYER_DIR)
+	touch $@
+
+# Run MPlayer's own ./configure for the target toolchain; the whole
+# invocation is one subshell command so the cd applies to ./configure.
+$(MPLAYER_DIR)/.configured: $(MPLAYER_DIR)/.unpacked
+	(cd $(MPLAYER_DIR); rm -rf config.cache; \
+		$(TARGET_CONFIGURE_OPTS) \
+		$(TARGET_CONFIGURE_ARGS) \
+		CFLAGS="$(TARGET_CFLAGS)" \
+		LDFLAGS="$(TARGET_LDFLAGS)" \
+		./configure \
+		--prefix=/usr \
+		--confdir=/etc \
+		--target=$(GNU_TARGET_NAME) \
+		--host-cc=$(HOSTCC) \
+		--cc=$(TARGET_CC) \
+		--as=$(TARGET_CROSS)as \
+		--with-extraincdir=$(STAGING_DIR)/usr/include \
+		--with-extralibdir=$(STAGING_DIR)/lib \
+		--enable-mad \
+		--enable-fbdev \
+		$(MPLAYER_ENDIAN) \
+		--disable-mpdvdkit \
+		--disable-tv \
+		--enable-dynamic-plugins \
+	)
+	touch $@
+
+# Build in the source tree; touch -c refreshes the binary's timestamp
+# without creating the file if the build did not produce it.
+$(MPLAYER_DIR)/$(MPLAYER_BINARY): $(MPLAYER_DIR)/.configured
+	$(MAKE) -C $(MPLAYER_DIR)
+	touch -c $@
+
+# Install the binary into the target filesystem and strip it; the leading
+# '-' lets the build continue if stripping fails.
+$(TARGET_DIR)/$(MPLAYER_TARGET_BINARY): $(MPLAYER_DIR)/$(MPLAYER_BINARY)
+	$(INSTALL) -m 0755 -D $(MPLAYER_DIR)/$(MPLAYER_BINARY) $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
+	-$(STRIPCMD) $(STRIP_STRIP_UNNEEDED) $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
+	touch -c $@
+
+# Convenience targets. None of them names a file, so declare them phony:
+# a stray file called e.g. "mplayer-clean" must not make the target look
+# up to date.
+.PHONY: mplayer mplayer-source mplayer-unpacked mplayer-clean mplayer-dirclean
+
+# Build and install mplayer after its uclibc and libmad prerequisites.
+mplayer: uclibc libmad $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
+
+# Only download the source tarball.
+mplayer-source: $(DL_DIR)/$(MPLAYER_SOURCE)
+
+# Download, unpack and patch the sources.
+mplayer-unpacked: $(MPLAYER_DIR)/.unpacked
+
+# Remove the installed binary and clean the build tree; the leading '-'
+# ignores a failing "make clean" (e.g. when the tree was never configured).
+mplayer-clean:
+	rm -f $(TARGET_DIR)/$(MPLAYER_TARGET_BINARY)
+	-$(MAKE) -C $(MPLAYER_DIR) clean
+
+# Remove the whole build directory.
+mplayer-dirclean:
+	rm -rf $(MPLAYER_DIR)
+#############################################################
+#
+# Toplevel Makefile options
+#
+#############################################################
+ifeq ($(strip $(BR2_PACKAGE_MPLAYER)),y)
+TARGETS+=mplayer
+endif
--- /dev/null
+config BR2_PACKAGE_VLC
+ bool "vlc"
+ help
+ a free cross-platform media player
+
+ http://download.videolan.org/pub/videolan/vlc/0.8.6e
--- /dev/null
+#############################################################
+#
+# vlc
+#
+#############################################################
+# Upstream release to build; the tarball name and download directory are
+# both derived from it so a version bump only touches one line.
+VLC_VERSION = 0.8.6e
+VLC_SOURCE = vlc-$(VLC_VERSION).tar.bz2
+VLC_SITE = http://download.videolan.org/pub/videolan/vlc/$(VLC_VERSION)
+VLC_AUTORECONF = NO
+VLC_INSTALL_STAGING = NO
+VLC_INSTALL_TARGET = YES
+
+VLC_CONF_OPT =
+
+VLC_DEPENDENCIES = uclibc
+
+# Instantiate the AUTOTARGETS template for package/multimedia/vlc.
+$(eval $(call AUTOTARGETS,package/multimedia,vlc))
+
+++ /dev/null
-config BR2_PACKAGE_VLC
- bool "vlc"
- help
- a free cross-platform media player
-
- http://download.videolan.org/pub/videolan/vlc/0.8.6e
+++ /dev/null
-#############################################################
-#
-# vlc
-#
-#############################################################
-VLC_VERSION = 0.8.6e
-VLC_SOURCE = vlc-$(VLC_VERSION).tar.bz2
-VLC_SITE = http://download.videolan.org/pub/videolan/vlc/0.8.6e
-VLC_AUTORECONF = NO
-VLC_INSTALL_STAGING = NO
-VLC_INSTALL_TARGET = YES
-
-VLC_CONF_OPT =
-
-VLC_DEPENDENCIES = uclibc
-
-$(eval $(call AUTOTARGETS,package,vlc))
-