tgsi: correct handling of return value from util_vsnprintf

[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c

index f79170b9d65964de0ea312373e470fe5891a413e..ba2bfdef0627a3b82ddfc5a5936c34d83dd2164d 100644 (file)
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -27,12 +27,14 @@
  
  #include "pipe/p_config.h"
  
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
  
-#include "pipe/p_debug.h"
+#include "util/u_debug.h"
  #include "pipe/p_shader_tokens.h"
  #include "util/u_math.h"
+#if defined(PIPE_ARCH_SSE)
  #include "util/u_sse.h"
+#endif
  #include "tgsi/tgsi_parse.h"
  #include "tgsi/tgsi_util.h"
  #include "tgsi_exec.h"
@@ -72,6 +74,9 @@
  
  #define TEMP_R0   TGSI_EXEC_TEMP_R0
  #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
+#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
+
  
  /**
   * X86 utility functions.
@@ -233,6 +238,9 @@ emit_const(
     int indirectIndex )
  {
     if (indirect) {
+      /* 'vec' is the offset from the address register's value.
+       * We're loading CONST[ADDR+vec] into an xmm register.
+       */
        struct x86_reg r0 = get_input_base();
        struct x86_reg r1 = get_output_base();
        uint i;
@@ -243,18 +251,40 @@ emit_const(
        x86_push( func, r0 );
        x86_push( func, r1 );
  
+      /*
+       * Loop over the four pixels or vertices in the quad.
+       * Get the value of the address (offset) register for pixel/vertex[i],
+       * add it to the src offset and index into the constant buffer.
+       * Note that we're working on SOA data.
+       * If any of the pixel/vertex execution channels are unused their
+       * values will be garbage.  It's very important that we don't use
+       * those garbage values as indexes into the constant buffer since
+       * that'll cause segfaults.
+       * The solution is to bitwise-AND the offset with the execution mask
+       * register whose values are either 0 or ~0.
+       * The caller must set up the execution mask register to indicate
+       * which channels are valid/alive before running the shader.
+       * The execution mask will also figure into loops and conditionals
+       * someday.
+       */
        for (i = 0; i < QUAD_SIZE; i++) {
-         x86_lea( func, r0, get_const( vec, chan ) );
+         /* r1 = address register[i] */
           x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+         /* r0 = execution mask[i] */
+         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
+         /* r1 = r1 & r0 */
+         x86_and( func, r1, r0 );
+         /* r0 = address of CONST[vec].chan (lea), the base for the indexed load */
+         x86_lea( func, r0, get_const( vec, chan ) );
  
-         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
            */
           x86_add( func, r1, r1 );
           x86_add( func, r1, r1 );
           x86_add( func, r1, r1 );
           x86_add( func, r1, r1 );
  
-         x86_add( func, r0, r1 );
+         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
           x86_mov( func, r1, x86_deref( r0 ) );
           x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
        }
@@ -268,6 +298,7 @@ emit_const(
           get_temp( TEMP_R0, CHAN_X ) );
     }
     else {
+      /* 'vec' is the index into the constant register file, i.e. CONST[vec] */
        assert( vec >= 0 );
  
        sse_movss(
@@ -496,7 +527,7 @@ emit_func_call_dst(
     void (PIPE_CDECL *code)() )
  {
     struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   unsigned i, n, xmm;
+   unsigned i, n;
     unsigned xmm_mask;
     
     /* Bitmask of the xmm registers to save */
@@ -532,7 +563,7 @@ emit_func_call_dst(
           sse_movups(
              func,
              x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
-            make_xmm( xmm ) );
+            make_xmm( i ) );
           ++n;
        }
     
@@ -550,7 +581,7 @@ emit_func_call_dst(
        if(xmm_mask & (1 << i)) {
           sse_movups(
              func,
-            make_xmm( xmm ),
+            make_xmm( i ),
              x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
           ++n;
        }
@@ -598,6 +629,9 @@ emit_func_call_dst_src(
        code );
  }
  
+
+#if defined(PIPE_ARCH_SSE)
+
  /*
   * Fast SSE2 implementation of special math functions.
   */
@@ -649,6 +683,7 @@ exp2f4(__m128 x)
     return _mm_mul_ps(expipart, expfpart);
  }
  
+
  /**
   * See http://www.devmaster.net/forums/showthread.php?p=43580
   */
@@ -691,12 +726,16 @@ log2f4(__m128 x)
     return _mm_add_ps(logmant, exp);
  }
  
+
  static INLINE __m128
  powf4(__m128 x, __m128 y)
  {
     return exp2f4(_mm_mul_ps(log2f4(x), y));
  }
  
+#endif /* PIPE_ARCH_SSE */
+
+
  
  /**
   * Low-level instruction translators.
@@ -751,13 +790,20 @@ emit_cos(
  }
  
  static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
  __attribute__((force_align_arg_pointer))
  #endif
  ex24f(
     float *store )
  {
+#if defined(PIPE_ARCH_SSE)  /* intrinsics path: exp2f4() is only compiled under PIPE_ARCH_SSE */
     _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else  /* scalar fallback: one util_fast_exp2() call per channel, in place */
+   store[0] = util_fast_exp2( store[0] );
+   store[1] = util_fast_exp2( store[1] );
+   store[2] = util_fast_exp2( store[2] );
+   store[3] = util_fast_exp2( store[3] );
+#endif /* PIPE_ARCH_SSE */
  }
  
  static void
@@ -784,6 +830,17 @@ emit_f2it(
        make_xmm( xmm ) );
  }
  
+/**
+ * Emit an in-place conversion of register 'xmm' via sse2_cvtdq2ps()
+ * (presumably the CVTDQ2PS instruction: packed int32 -> packed float).
+ * Same register is passed as both operands, mirroring emit_f2it().
+ */
+static void
+emit_i2f( struct x86_function *func, unsigned xmm )
+{
+   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
+}
+
  static void PIPE_CDECL
  flr4f(
     float *store )
@@ -831,13 +888,20 @@ emit_frc(
  }
  
  static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
  __attribute__((force_align_arg_pointer))
  #endif
  lg24f(
     float *store )
  {
+#if defined(PIPE_ARCH_SSE)  /* intrinsics path: log2f4() is only compiled under PIPE_ARCH_SSE */
     _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else  /* scalar fallback: one util_fast_log2() call per channel, in place */
+   store[0] = util_fast_log2( store[0] );
+   store[1] = util_fast_log2( store[1] );
+   store[2] = util_fast_log2( store[2] );
+   store[3] = util_fast_log2( store[3] );
+#endif /* PIPE_ARCH_SSE */
  }
  
  static void
@@ -890,19 +954,19 @@ emit_neg(
  }
  
  static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
  __attribute__((force_align_arg_pointer))
  #endif
  pow4f(
     float *store )
  {
-#if 1
+#if defined(PIPE_ARCH_SSE)  /* intrinsics path: powf4() is only compiled under PIPE_ARCH_SSE */
     _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
  #else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
+   store[0] = util_fast_pow( store[0], store[4] );  /* scalar fallback: bases in store[0..3], */
+   store[1] = util_fast_pow( store[1], store[5] );  /* exponents in store[4..7]; results      */
+   store[2] = util_fast_pow( store[2], store[6] );  /* overwrite the bases in place           */
+   store[3] = util_fast_pow( store[3], store[7] );
  #endif
  }
  
@@ -937,6 +1001,29 @@ emit_rcp (
        make_xmm( xmm_src ) );
  }
  
+/* Round the four floats in store[] in place as floorf(x + 0.5f). */
+static void PIPE_CDECL
+rnd4f( float *store )
+{
+   unsigned k;
+   for (k = 0; k < 4; k++) {
+      store[k] = floorf( store[k] + 0.5f );
+   }
+}
+
+/**
+ * Emit a call-out to rnd4f() on behalf of register xmm_dst.
+ * The call goes through emit_func_call_dst(), which receives xmm_save
+ * unchanged.
+ * NOTE(review): xmm_save looks like a mask of xmm regs to preserve
+ * across the call -- confirm against emit_func_call_dst().
+ */
+static void
+emit_rnd( struct x86_function *func, unsigned xmm_save, unsigned xmm_dst )
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, rnd4f );
+}
+
  static void
  emit_rsqrt(
     struct x86_function *func,
@@ -996,6 +1083,29 @@ emit_setsign(
           TGSI_EXEC_TEMP_80000000_C ) );
  }
  
+/* Replace each of the four floats in store[] by -1, 0 or +1 (NaN -> 0). */
+static void PIPE_CDECL
+sgn4f( float *store )
+{
+   unsigned k;
+   for (k = 0; k < 4; k++) {
+      store[k] = (float) ((store[k] > 0.0f) - (store[k] < 0.0f));
+   }
+}
+
+/**
+ * Emit a call-out to sgn4f() on behalf of register xmm_dst.
+ * The call goes through emit_func_call_dst(), which receives xmm_save
+ * unchanged.
+ * NOTE(review): xmm_save looks like a mask of xmm regs to preserve
+ * across the call -- confirm against emit_func_call_dst().
+ */
+static void
+emit_sgn( struct x86_function *func, unsigned xmm_save, unsigned xmm_dst )
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, sgn4f );
+}
+
  static void PIPE_CDECL
  sin4f(
     float *store )
@@ -1356,6 +1466,31 @@ emit_cmp(
     }
  }
  
+
+/**
+ * Report whether any source or destination operand of 'inst' addresses
+ * the temporary register file indirectly.  Returns TRUE or FALSE.
+ */
+static boolean
+indirect_temp_reference(const struct tgsi_full_instruction *inst)
+{
+   uint n;
+   /* source operands */
+   for (n = 0; n < inst->Instruction.NumSrcRegs; n++) {
+      const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[n];
+      if (src->SrcRegister.Indirect && src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+         return TRUE;
+   }
+   /* destination operands */
+   for (n = 0; n < inst->Instruction.NumDstRegs; n++) {
+      const struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[n];
+      if (dst->DstRegister.Indirect && dst->DstRegister.File == TGSI_FILE_TEMPORARY)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+
  static int
  emit_instruction(
     struct x86_function *func,
@@ -1363,10 +1498,15 @@ emit_instruction(
  {
     unsigned chan_index;
  
+   /* we can't handle indirect addressing into temp register file yet */
+   if (indirect_temp_reference(inst))
+      return FALSE;
+
     switch (inst->Instruction.Opcode) {
     case TGSI_OPCODE_ARL:
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr(func, 0, 0);
           emit_f2it( func, 0 );
           STORE( func, *inst, 0, 0, chan_index );
        }
@@ -1443,7 +1583,7 @@ emit_instruction(
                 func,
                 make_xmm( 2 ),
                 make_xmm( 0 ),
-               cc_LessThanEqual );
+               cc_LessThan );
              sse_andps(
                 func,
                 make_xmm( 2 ),
@@ -1465,6 +1605,7 @@ emit_instruction(
     case TGSI_OPCODE_RSQ:
     /* TGSI_OPCODE_RECIPSQRT */
        FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_abs( func, 0 );
        emit_rsqrt( func, 1, 0 );
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           STORE( func, *inst, 1, 0, chan_index );
@@ -1702,7 +1843,18 @@ emit_instruction(
  
     case TGSI_OPCODE_DOT2ADD:
     /* TGSI_OPCODE_DP2A */
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
        break;
  
     case TGSI_OPCODE_INDEX:
@@ -1736,7 +1888,11 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_ROUND:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_rnd( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_EXPBASE2:
@@ -1976,7 +2132,12 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_ARR:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_rnd( func, 0, 0 );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_BRA:
@@ -1995,7 +2156,12 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_SSG:
-      return 0;
+   /* TGSI_OPCODE_SGN */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_sgn( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_CMP:
@@ -2036,7 +2202,90 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_NRM:
-      return 0;
+      /* fall-through */
+   case TGSI_OPCODE_NRM4:
+      /* 3 or 4-component normalization */
+      {
+         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+
+         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
+             (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
+
+            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
+
+            /* xmm4 = src.x */
+            /* xmm0 = src.x * src.x */
+            FETCH(func, *inst, 0, 0, CHAN_X);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+               emit_MOV(func, 4, 0);
+            }
+            emit_mul(func, 0, 0);
+
+            /* xmm5 = src.y */
+            /* xmm0 = xmm0 + src.y * src.y */
+            FETCH(func, *inst, 1, 0, CHAN_Y);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+               emit_MOV(func, 5, 1);
+            }
+            emit_mul(func, 1, 1);
+            emit_add(func, 0, 1);
+
+            /* xmm6 = src.z */
+            /* xmm0 = xmm0 + src.z * src.z */
+            FETCH(func, *inst, 1, 0, CHAN_Z);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+               emit_MOV(func, 6, 1);
+            }
+            emit_mul(func, 1, 1);
+            emit_add(func, 0, 1);
+
+            if (dims == 4) {
+               /* xmm7 = src.w */
+               /* xmm0 = xmm0 + src.w * src.w */
+               FETCH(func, *inst, 1, 0, CHAN_W);
+               if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+                  emit_MOV(func, 7, 1);
+               }
+               emit_mul(func, 1, 1);
+               emit_add(func, 0, 1);
+            }
+
+            /* xmm1 = 1 / sqrt(xmm0) */
+            emit_rsqrt(func, 1, 0);
+
+            /* dst.x = xmm1 * src.x */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+               emit_mul(func, 4, 1);
+               STORE(func, *inst, 4, 0, CHAN_X);
+            }
+
+            /* dst.y = xmm1 * src.y */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+               emit_mul(func, 5, 1);
+               STORE(func, *inst, 5, 0, CHAN_Y);
+            }
+
+            /* dst.z = xmm1 * src.z */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+               emit_mul(func, 6, 1);
+               STORE(func, *inst, 6, 0, CHAN_Z);
+            }
+
+            /* dst.w = xmm1 * src.w (xmm7 holds src.w only when W is enabled) */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
+               emit_mul(func, 7, 1);
+               STORE(func, *inst, 7, 0, CHAN_W);
+            }
+         }
+
+         /* dst0.w = 1.0 */
+         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
+            emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
+            STORE(func, *inst, 0, 0, CHAN_W);
+         }
+      }
        break;
  
     case TGSI_OPCODE_DIV:
@@ -2044,7 +2293,16 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_DP2:
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
        break;
  
     case TGSI_OPCODE_TXL:
@@ -2104,7 +2362,12 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_TRUNC:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         emit_i2f( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_SHL:
@@ -2490,7 +2753,7 @@ tgsi_emit_sse2(
        case TGSI_TOKEN_TYPE_IMMEDIATE:
           /* simply copy the immediate values into the next immediates[] slot */
           {
-            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
              uint i;
              assert(size <= 4);
              assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);