#ifdef PIPE_ARCH_X86
-/* for 1/sqrt() \r
- * \r
- * This costs about 100fps (close to 10%) in gears:\r
- */\r
-#define HIGH_PRECISION 1
+/* Higher precision for 1/sqrt().
+ *
+ * This costs about 100fps (close to 10%) in gears.
+ */
+#define HIGH_PRECISION 1
#define FOR_EACH_CHANNEL( CHAN )\
*
* See: http://softwarecommunity.intel.com/articles/eng/1818.htm
*/
- {\r
- struct x86_reg dst = make_xmm( xmm_dst );\r
- struct x86_reg src = make_xmm( xmm_src );\r
- struct x86_reg tmp0 = make_xmm( 2 );\r
- struct x86_reg tmp1 = make_xmm( 3 );\r
-\r
- assert( xmm_dst != xmm_src );\r
- assert( xmm_dst != 2 && xmm_dst != 3 );\r
- assert( xmm_src != 2 && xmm_src != 3 );\r
-\r
- sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );\r
- sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );\r
- sse_rsqrtps( func, tmp1, src );\r
- sse_mulps( func, src, tmp1 );\r
- sse_mulps( func, dst, tmp1 );\r
- sse_mulps( func, src, tmp1 );\r
- sse_subps( func, tmp0, src );\r
- sse_mulps( func, dst, tmp0 );\r
+ { /* Emit dst = HALF * r * (THREE - a * r * r), where a = src on entry and r = rsqrtps(a); assumes the sse_* emitters take the destination operand first -- TODO confirm */
+ struct x86_reg dst = make_xmm( xmm_dst );
+ struct x86_reg src = make_xmm( xmm_src );
+ struct x86_reg tmp0 = make_xmm( 2 ); /* scratch: xmm register 2 */
+ struct x86_reg tmp1 = make_xmm( 3 ); /* scratch: xmm register 3 */
+
+ assert( xmm_dst != xmm_src ); /* dst is loaded with a constant before src is read, so they must be distinct */
+ assert( xmm_dst != 2 && xmm_dst != 3 ); /* dst must not alias the scratch registers */
+ assert( xmm_src != 2 && xmm_src != 3 ); /* src must not alias the scratch registers */
+
+ sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); /* dst = HALF constant (presumably 0.5 -- confirm) */
+ sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = THREE constant (presumably 3.0 -- confirm) */
+ sse_rsqrtps( func, tmp1, src ); /* tmp1 = r, the rsqrtps estimate for a */
+ sse_mulps( func, src, tmp1 ); /* src = a * r (src is overwritten from here on) */
+ sse_mulps( func, dst, tmp1 ); /* dst = HALF * r */
+ sse_mulps( func, src, tmp1 ); /* src = a * r * r */
+ sse_subps( func, tmp0, src ); /* tmp0 = THREE - a * r * r */
+ sse_mulps( func, dst, tmp0 ); /* dst = HALF * r * (THREE - a * r * r) */
}
#else
/* On Intel CPUs at least, this is only accurate to 12 bits -- not