Merge remote branch 'origin/master' into radeon-rewrite

[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_exec.c
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c

index 0756d7db1760fd20a59a7f11557d6d825766dc3a..ab641efb6032b07082167a7c2545463d210b9385 100644 (file)
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -133,7 +133,7 @@ tgsi_exec_machine_bind_shader(
     struct tgsi_exec_machine *mach,
     const struct tgsi_token *tokens,
     uint numSamplers,
-   struct tgsi_sampler *samplers)
+   struct tgsi_sampler **samplers)
  {
     uint k;
     struct tgsi_parse_context parse;
@@ -202,7 +202,7 @@ tgsi_exec_machine_bind_shader(
  
        case TGSI_TOKEN_TYPE_IMMEDIATE:
           {
-            uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
              assert( size % 4 == 0 );
              assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
  
@@ -320,6 +320,7 @@ micro_add(
     dst->f[3] = src0->f[3] + src1->f[3];
  }
  
+#if 0
  static void
  micro_iadd(
     union tgsi_exec_channel *dst,
@@ -331,6 +332,7 @@ micro_iadd(
     dst->i[2] = src0->i[2] + src1->i[2];
     dst->i[3] = src0->i[3] + src1->i[3];
  }
+#endif
  
  static void
  micro_and(
@@ -408,6 +410,7 @@ micro_div(
     }
  }
  
+#if 0
  static void
  micro_udiv(
     union tgsi_exec_channel *dst,
@@ -419,6 +422,7 @@ micro_udiv(
     dst->u[2] = src0->u[2] / src1->u[2];
     dst->u[3] = src0->u[3] / src1->u[3];
  }
+#endif
  
  static void
  micro_eq(
@@ -434,6 +438,7 @@ micro_eq(
     dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
  }
  
+#if 0
  static void
  micro_ieq(
     union tgsi_exec_channel *dst,
@@ -447,6 +452,7 @@ micro_ieq(
     dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
     dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
  }
+#endif
  
  static void
  micro_exp2(
@@ -466,6 +472,7 @@ micro_exp2(
  #endif
  }
  
+#if 0
  static void
  micro_f2ut(
     union tgsi_exec_channel *dst,
@@ -476,6 +483,7 @@ micro_f2ut(
     dst->u[2] = (uint) src->f[2];
     dst->u[3] = (uint) src->f[3];
  }
+#endif
  
  static void
  micro_flr(
@@ -570,6 +578,7 @@ micro_lt(
     dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
  }
  
+#if 0
  static void
  micro_ilt(
     union tgsi_exec_channel *dst,
@@ -583,7 +592,9 @@ micro_ilt(
     dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
     dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
  }
+#endif
  
+#if 0
  static void
  micro_ult(
     union tgsi_exec_channel *dst,
@@ -597,6 +608,7 @@ micro_ult(
     dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
     dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
  }
+#endif
  
  static void
  micro_max(
@@ -610,6 +622,7 @@ micro_max(
     dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
  }
  
+#if 0
  static void
  micro_imax(
     union tgsi_exec_channel *dst,
@@ -621,7 +634,9 @@ micro_imax(
     dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
     dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
  }
+#endif
  
+#if 0
  static void
  micro_umax(
     union tgsi_exec_channel *dst,
@@ -633,6 +648,7 @@ micro_umax(
     dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
     dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
  }
+#endif
  
  static void
  micro_min(
@@ -646,6 +662,7 @@ micro_min(
     dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
  }
  
+#if 0
  static void
  micro_imin(
     union tgsi_exec_channel *dst,
@@ -657,7 +674,9 @@ micro_imin(
     dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
     dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
  }
+#endif
  
+#if 0
  static void
  micro_umin(
     union tgsi_exec_channel *dst,
@@ -669,7 +688,9 @@ micro_umin(
     dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
     dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
  }
+#endif
  
+#if 0
  static void
  micro_umod(
     union tgsi_exec_channel *dst,
@@ -681,6 +702,7 @@ micro_umod(
     dst->u[2] = src0->u[2] % src1->u[2];
     dst->u[3] = src0->u[3] % src1->u[3];
  }
+#endif
  
  static void
  micro_mul(
@@ -694,6 +716,7 @@ micro_mul(
     dst->f[3] = src0->f[3] * src1->f[3];
  }
  
+#if 0
  static void
  micro_imul(
     union tgsi_exec_channel *dst,
@@ -705,7 +728,9 @@ micro_imul(
     dst->i[2] = src0->i[2] * src1->i[2];
     dst->i[3] = src0->i[3] * src1->i[3];
  }
+#endif
  
+#if 0
  static void
  micro_imul64(
     union tgsi_exec_channel *dst0,
@@ -722,7 +747,9 @@ micro_imul64(
     dst0->i[2] = 0;
     dst0->i[3] = 0;
  }
+#endif
  
+#if 0
  static void
  micro_umul64(
     union tgsi_exec_channel *dst0,
@@ -739,7 +766,10 @@ micro_umul64(
     dst0->u[2] = 0;
     dst0->u[3] = 0;
  }
+#endif
+
  
+#if 0
  static void
  micro_movc(
     union tgsi_exec_channel *dst,
@@ -752,6 +782,7 @@ micro_movc(
     dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
     dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
  }
+#endif
  
  static void
  micro_neg(
@@ -764,6 +795,7 @@ micro_neg(
     dst->f[3] = -src->f[3];
  }
  
+#if 0
  static void
  micro_ineg(
     union tgsi_exec_channel *dst,
@@ -774,6 +806,7 @@ micro_ineg(
     dst->i[2] = -src->i[2];
     dst->i[3] = -src->i[3];
  }
+#endif
  
  static void
  micro_not(
@@ -874,6 +907,7 @@ micro_trunc(
     dst->f[3] = (float) (int) src0->f[3];
  }
  
+#if 0
  static void
  micro_ushr(
     union tgsi_exec_channel *dst,
@@ -885,6 +919,7 @@ micro_ushr(
     dst->u[2] = src0->u[2] >> src1->u[2];
     dst->u[3] = src0->u[3] >> src1->u[3];
  }
+#endif
  
  static void
  micro_sin(
@@ -919,6 +954,7 @@ micro_sub(
     dst->f[3] = src0->f[3] - src1->f[3];
  }
  
+#if 0
  static void
  micro_u2f(
     union tgsi_exec_channel *dst,
@@ -929,6 +965,7 @@ micro_u2f(
     dst->f[2] = (float) src->u[2];
     dst->f[3] = (float) src->u[3];
  }
+#endif
  
  static void
  micro_xor(
@@ -958,14 +995,22 @@ fetch_src_file_channel(
        switch( file ) {
        case TGSI_FILE_CONSTANT:
           assert(mach->Consts);
-         assert(index->i[0] >= 0);
-         assert(index->i[1] >= 0);
-         assert(index->i[2] >= 0);
-         assert(index->i[3] >= 0);
-         chan->f[0] = mach->Consts[index->i[0]][swizzle];
-         chan->f[1] = mach->Consts[index->i[1]][swizzle];
-         chan->f[2] = mach->Consts[index->i[2]][swizzle];
-         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+         if (index->i[0] < 0)
+            chan->f[0] = 0.0f;
+         else
+            chan->f[0] = mach->Consts[index->i[0]][swizzle];
+         if (index->i[1] < 0)
+            chan->f[1] = 0.0f;
+         else
+            chan->f[1] = mach->Consts[index->i[1]][swizzle];
+         if (index->i[2] < 0)
+            chan->f[2] = 0.0f;
+         else
+            chan->f[2] = mach->Consts[index->i[2]][swizzle];
+         if (index->i[3] < 0)
+            chan->f[3] = 0.0f;
+         else
+            chan->f[3] = mach->Consts[index->i[3]][swizzle];
           break;
  
        case TGSI_FILE_INPUT:
@@ -1037,11 +1082,28 @@ fetch_source(
     union tgsi_exec_channel index;
     uint swizzle;
  
+   /* We start with a direct index into a register file.
+    *
+    *    file[1],
+    *    where:
+    *       file = SrcRegister.File
+    *       [1] = SrcRegister.Index
+    */
     index.i[0] =
     index.i[1] =
     index.i[2] =
     index.i[3] = reg->SrcRegister.Index;
  
+   /* There is an extra source register that indirectly subscripts
+    * a register file. The direct index now becomes an offset
+    * that is being added to the indirect register.
+    *
+    *    file[ind[2].x+1],
+    *    where:
+    *       ind = SrcRegisterInd.File
+    *       [2] = SrcRegisterInd.Index
+    *       .x = SrcRegisterInd.SwizzleX
+    */
     if (reg->SrcRegister.Indirect) {
        union tgsi_exec_channel index2;
        union tgsi_exec_channel indir_index;
@@ -1078,19 +1140,31 @@ fetch_source(
        }
     }
  
-   if( reg->SrcRegister.Dimension ) {
-      switch( reg->SrcRegister.File ) {
+   /* There is an extra source register that is a second
+    * subscript to a register file. Effectively it means that
+    * the register file is actually a 2D array of registers.
+    *
+    *    file[1][3] == file[1*sizeof(file[1])+3],
+    *    where:
+    *       [3] = SrcRegisterDim.Index
+    */
+   if (reg->SrcRegister.Dimension) {
+      /* The size of the first-order array depends on the register file type.
+       * We need to multiply the index into the first array to get an effective,
+       * "flat" index that points to the beginning of the second-order array.
+       */
+      switch (reg->SrcRegister.File) {
        case TGSI_FILE_INPUT:
-         index.i[0] *= 17;
-         index.i[1] *= 17;
-         index.i[2] *= 17;
-         index.i[3] *= 17;
+         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
           break;
        case TGSI_FILE_CONSTANT:
-         index.i[0] *= 4096;
-         index.i[1] *= 4096;
-         index.i[2] *= 4096;
-         index.i[3] *= 4096;
+         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
+         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
+         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
+         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
           break;
        default:
           assert( 0 );
@@ -1101,6 +1175,17 @@ fetch_source(
        index.i[2] += reg->SrcRegisterDim.Index;
        index.i[3] += reg->SrcRegisterDim.Index;
  
+      /* Again, the second subscript index can be addressed indirectly
+       * identically to the first one.
+       * Nothing stops us from indirectly addressing the indirect register,
+       * but there is no need for that, so we won't exercise it.
+       *
+       *    file[1][ind[4].y+3],
+       *    where:
+       *       ind = SrcRegisterDimInd.File
+       *       [4] = SrcRegisterDimInd.Index
+       *       .y = SrcRegisterDimInd.SwizzleX
+       */
        if (reg->SrcRegisterDim.Indirect) {
           union tgsi_exec_channel index2;
           union tgsi_exec_channel indir_index;
@@ -1133,6 +1218,11 @@ fetch_source(
                 index.i[i] = 0;
           }
        }
+
+      /* If by any chance there was a need for a 3D array of register
+       * files, we would have to check whether SrcRegisterDim is followed
+       * by a dimension register and continue the saga.
+       */
     }
  
     swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
@@ -1482,7 +1572,7 @@ exec_kilp(struct tgsi_exec_machine *mach,
  
  
  /*
- * Fetch a texel using STR texture coordinates.
+ * Fetch four texture samples using STR texture coordinates.
   */
  static void
  fetch_texel( struct tgsi_sampler *sampler,
@@ -1516,7 +1606,7 @@ exec_tex(struct tgsi_exec_machine *mach,
           boolean projected)
  {
     const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
-   union tgsi_exec_channel r[8];
+   union tgsi_exec_channel r[4];
     uint chan_index;
     float lodBias;
  
@@ -1539,7 +1629,7 @@ exec_tex(struct tgsi_exec_machine *mach,
        else
           lodBias = 0.0;
  
-      fetch_texel(&mach->Samplers[unit],
+      fetch_texel(mach->Samplers[unit],
                    &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
                    &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
        break;
@@ -1565,7 +1655,7 @@ exec_tex(struct tgsi_exec_machine *mach,
        else
           lodBias = 0.0;
  
-      fetch_texel(&mach->Samplers[unit],
+      fetch_texel(mach->Samplers[unit],
                    &r[0], &r[1], &r[2], lodBias,  /* inputs */
                    &r[0], &r[1], &r[2], &r[3]);  /* outputs */
        break;
@@ -1591,7 +1681,7 @@ exec_tex(struct tgsi_exec_machine *mach,
        else
           lodBias = 0.0;
  
-      fetch_texel(&mach->Samplers[unit],
+      fetch_texel(mach->Samplers[unit],
                    &r[0], &r[1], &r[2], lodBias,
                    &r[0], &r[1], &r[2], &r[3]);
        break;
@@ -1701,6 +1791,7 @@ exec_declaration(
              break;
  
           default:
+            eval = NULL;
              assert( 0 );
           }
  
@@ -1743,7 +1834,7 @@ exec_instruction(
     case TGSI_OPCODE_ARL:
        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH( &r[0], 0, chan_index );
-         micro_trunc( &r[0], &r[0] );
+         micro_flr( &r[0], &r[0] );
           STORE( &r[0], 0, chan_index );
        }
        break;
@@ -2033,7 +2124,21 @@ exec_instruction(
  
     case TGSI_OPCODE_DOT2ADD:
        /* TGSI_OPCODE_DP2A */
-      assert (0);
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[2], 2, CHAN_X );
+      micro_add( &r[0], &r[0], &r[2] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_INDEX:
@@ -2480,7 +2585,8 @@ exec_instruction(
           micro_mul( &dot, &r[2], &r[2] );
           micro_add( &tmp, &tmp, &dot );
  
-         /* tmp = 1 / tmp */
+         /* tmp = 1 / sqrt(tmp) */
+         micro_sqrt( &tmp, &tmp );
           micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
  
           /* note: w channel is undefined */
@@ -2513,7 +2619,8 @@ exec_instruction(
           micro_mul( &dot, &r[3], &r[3] );
           micro_add( &tmp, &tmp, &dot );
  
-         /* tmp = 1 / tmp */
+         /* tmp = 1 / sqrt(tmp) */
+         micro_sqrt( &tmp, &tmp );
           micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
  
           FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {