freedreno/ir3: fix register usage calculations

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_tgsi_soa.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c

index d2cb0a0975f45ad3f9d1eb74f6232fa819ec395b..3b2097f8521bac5cf1c88edfeb5e3c9e1d879884 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -725,7 +725,7 @@ static void lp_exec_default(struct lp_exec_mask *mask,
         * default (or could do switch analysis at switch start time instead).
         */
        unsigned opcode = bld_base->instructions[bld_base->pc - 1].Instruction.Opcode;
-      boolean ft_into = (opcode != TGSI_OPCODE_BRK ||
+      boolean ft_into = (opcode != TGSI_OPCODE_BRK &&
                           opcode != TGSI_OPCODE_SWITCH);
        /*
         * If it is not last statement and there was no fallthrough into it,
@@ -952,7 +952,7 @@ build_gather(struct lp_build_context *bld,
     LLVMBuilderRef builder = bld->gallivm->builder;
     LLVMValueRef res = bld->undef;
     unsigned i;
-   LLVMValueRef temp_ptr;
+   LLVMValueRef temp_ptr = NULL;
  
     if (overflow_mask) {
        temp_ptr = lp_build_alloca(
@@ -1213,7 +1213,6 @@ emit_fetch_constant(
     LLVMBuilderRef builder = gallivm->builder;
     struct lp_build_context *uint_bld = &bld_base->uint_bld;
     unsigned dimension = 0;
-   LLVMValueRef dimension_index;
     LLVMValueRef consts_ptr;
     LLVMValueRef num_consts;
     LLVMValueRef res;
@@ -1227,11 +1226,8 @@ emit_fetch_constant(
        assert(dimension < LP_MAX_TGSI_CONST_BUFFERS);
     }
  
-   dimension_index = lp_build_const_int32(gallivm, dimension);
-   consts_ptr =
-      lp_build_array_get(gallivm, bld->consts_ptr, dimension_index);
-   num_consts =
-      lp_build_array_get(gallivm, bld->const_sizes_ptr, dimension_index);
+   consts_ptr = bld->consts[dimension];
+   num_consts = bld->consts_sizes[dimension];
  
     if (reg->Register.Indirect) {
        LLVMValueRef indirect_index;
@@ -1412,12 +1408,24 @@ emit_fetch_gs_input(
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
     struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   const struct tgsi_shader_info *info = bld->bld_base.info;
     LLVMBuilderRef builder = gallivm->builder;
     LLVMValueRef attrib_index = NULL;
     LLVMValueRef vertex_index = NULL;
     LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle);
     LLVMValueRef res;
  
+   if (info->input_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_PRIMID) {
+      /* This is really a system value not a regular input */
+      assert(!reg->Register.Indirect);
+      assert(!reg->Dimension.Indirect);
+      res = bld->system_values.prim_id;
+      if (stype != TGSI_TYPE_UNSIGNED && stype != TGSI_TYPE_SIGNED) {
+         res = LLVMBuildBitCast(builder, res, bld_base->base.vec_type, "");
+      }
+      return res;
+   }
+
     if (reg->Register.Indirect) {
        attrib_index = get_indirect_index(bld,
                                          reg->Register.File,
@@ -1530,6 +1538,16 @@ emit_fetch_system_value(
        atype = TGSI_TYPE_UNSIGNED;
        break;
  
+   case TGSI_SEMANTIC_VERTEXID_NOBASE:
+      res = bld->system_values.vertex_id_nobase;
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_BASEVERTEX:
+      res = bld->system_values.basevertex;
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
     case TGSI_SEMANTIC_PRIMID:
        res = bld->system_values.prim_id;
        atype = TGSI_TYPE_UNSIGNED;
@@ -1942,9 +1960,10 @@ static void
  emit_tex( struct lp_build_tgsi_soa_context *bld,
            const struct tgsi_full_instruction *inst,
            enum lp_build_tex_modifier modifier,
-          LLVMValueRef *texel)
+          LLVMValueRef *texel,
+          unsigned sampler_reg)
  {
-   unsigned unit;
+   unsigned unit = inst->Src[sampler_reg].Register.Index;
     LLVMValueRef lod_bias, explicit_lod;
     LLVMValueRef oow = NULL;
     LLVMValueRef coords[5];
@@ -2014,7 +2033,16 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
        num_derivs = 3;
        break;
     case TGSI_TEXTURE_CUBE_ARRAY:
+      num_offsets = 2;
+      num_derivs = 3;
+      layer_coord = 3;
+      break;
     case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+      num_offsets = 2;
+      num_derivs = 3;
+      layer_coord = 3;
+      shadow_coord = 4; /* shadow coord special different reg */
+      break;
     case TGSI_TEXTURE_2D_MSAA:
     case TGSI_TEXTURE_2D_ARRAY_MSAA:
     default:
@@ -2025,7 +2053,15 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
     /* Note lod and especially projected are illegal in a LOT of cases */
     if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
         modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-      LLVMValueRef lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+      LLVMValueRef lod;
+      if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+          inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
+         /* note that shadow cube array with bias/explicit lod does not exist */
+         lod = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
+      }
+      else {
+         lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+      }
        if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
           lod_bias = lod;
           explicit_lod = NULL;
@@ -2057,13 +2093,23 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
  
     /* Layer coord always goes into 3rd slot, except for cube map arrays */
     if (layer_coord) {
-      coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      if (layer_coord == 3) {
+         coords[3] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      }
+      else {
+         coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      }
        if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
           coords[2] = lp_build_mul(&bld->bld_base.base, coords[2], oow);
     }
     /* Shadow coord occupies always 5th slot. */
     if (shadow_coord) {
-      coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 0, shadow_coord);
+      if (shadow_coord == 4) {
+         coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
+      }
+      else {
+         coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 0, shadow_coord);
+      }
        if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
           coords[4] = lp_build_mul(&bld->bld_base.base, coords[4], oow);
     }
@@ -2075,7 +2121,6 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
           derivs.ddy[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 2, dim);
        }
        deriv_ptr = &derivs;
-      unit = inst->Src[3].Register.Index;
        /*
         * could also check all src regs if constant but I doubt such
         * cases exist in practice.
@@ -2091,8 +2136,6 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
        else {
           lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
        }
-   } else {
-      unit = inst->Src[1].Register.Index;
     }
  
     /* some advanced gather instructions (txgo) would require 4 offsets */
@@ -2300,7 +2343,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
     unsigned unit, target;
     LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
     LLVMValueRef explicit_lod = NULL;
-   LLVMValueRef coords[3];
+   LLVMValueRef coords[5];
     LLVMValueRef offsets[3] = { NULL };
     enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR;
     unsigned dims, i;
@@ -2334,9 +2377,11 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
        break;
     case TGSI_TEXTURE_2D:
     case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_2D_MSAA:
        dims = 2;
        break;
     case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
        layer_coord = 2;
        dims = 2;
        break;
@@ -2348,16 +2393,20 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
        return;
     }
  
-   /* always have lod except for buffers ? */
-   if (target != TGSI_TEXTURE_BUFFER) {
+   /* always have lod except for buffers and msaa targets ? */
+   if (target != TGSI_TEXTURE_BUFFER &&
+       target != TGSI_TEXTURE_2D_MSAA &&
+       target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
        explicit_lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
        lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
     }
+   /* XXX: for real msaa support, the w component would be the sample index. */
  
     for (i = 0; i < dims; i++) {
        coords[i] = lp_build_emit_fetch(&bld->bld_base, inst, 0, i);
     }
-   for (i = dims; i < 3; i++) {
+   /* never use more than 3 coords here but emit_fetch_texel copies all 5 anyway */
+   for (i = dims; i < 5; i++) {
        coords[i] = coord_undef;
     }
     if (layer_coord)
@@ -2449,7 +2498,7 @@ emit_size_query( struct lp_build_tgsi_soa_context *bld,
                                   bld->bld_base.base.gallivm,
                                   bld->bld_base.int_bld.type,
                                   unit, pipe_target,
-                                 is_sviewinfo,
+                                 TRUE,
                                   lod_property,
                                   explicit_lod,
                                   sizes_out);
@@ -2457,7 +2506,7 @@ emit_size_query( struct lp_build_tgsi_soa_context *bld,
  
  static boolean
  near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
-                  int pc)
+                   int pc)
  {
     int i;
  
@@ -2465,27 +2514,38 @@ near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
        unsigned opcode;
  
        if (pc + i >= bld->bld_base.info->num_instructions)
-        return TRUE;
+         return TRUE;
  
        opcode = bld->bld_base.instructions[pc + i].Instruction.Opcode;
  
        if (opcode == TGSI_OPCODE_END)
-        return TRUE;
+         return TRUE;
  
        if (opcode == TGSI_OPCODE_TEX ||
-         opcode == TGSI_OPCODE_TXP ||
-         opcode == TGSI_OPCODE_TXD ||
-         opcode == TGSI_OPCODE_TXB ||
-         opcode == TGSI_OPCODE_TXL ||
-         opcode == TGSI_OPCODE_TXF ||
-         opcode == TGSI_OPCODE_TXQ ||
-         opcode == TGSI_OPCODE_CAL ||
-         opcode == TGSI_OPCODE_CALLNZ ||
-         opcode == TGSI_OPCODE_IF ||
-          opcode == TGSI_OPCODE_UIF ||
-         opcode == TGSI_OPCODE_BGNLOOP ||
-         opcode == TGSI_OPCODE_SWITCH)
-        return FALSE;
+         opcode == TGSI_OPCODE_TXP ||
+         opcode == TGSI_OPCODE_TXD ||
+         opcode == TGSI_OPCODE_TXB ||
+         opcode == TGSI_OPCODE_TXL ||
+         opcode == TGSI_OPCODE_TXF ||
+         opcode == TGSI_OPCODE_TXQ ||
+         opcode == TGSI_OPCODE_TEX2 ||
+         opcode == TGSI_OPCODE_TXB2 ||
+         opcode == TGSI_OPCODE_TXL2 ||
+         opcode == TGSI_OPCODE_SAMPLE ||
+         opcode == TGSI_OPCODE_SAMPLE_B ||
+         opcode == TGSI_OPCODE_SAMPLE_C ||
+         opcode == TGSI_OPCODE_SAMPLE_C_LZ ||
+         opcode == TGSI_OPCODE_SAMPLE_D ||
+         opcode == TGSI_OPCODE_SAMPLE_I ||
+         opcode == TGSI_OPCODE_SAMPLE_L ||
+         opcode == TGSI_OPCODE_SVIEWINFO ||
+         opcode == TGSI_OPCODE_CAL ||
+         opcode == TGSI_OPCODE_CALLNZ ||
+         opcode == TGSI_OPCODE_IF ||
+         opcode == TGSI_OPCODE_UIF ||
+         opcode == TGSI_OPCODE_BGNLOOP ||
+         opcode == TGSI_OPCODE_SWITCH)
+         return FALSE;
     }
  
     return TRUE;
@@ -2677,56 +2737,86 @@ lp_emit_declaration_soa(
     const unsigned last = decl->Range.Last;
     unsigned idx, i;
  
-   for (idx = first; idx <= last; ++idx) {
-      assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
-      switch (decl->Declaration.File) {
-      case TGSI_FILE_TEMPORARY:
-         if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
-            assert(idx < LP_MAX_INLINED_TEMPS);
+   assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+      if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
+         assert(last < LP_MAX_INLINED_TEMPS);
+         for (idx = first; idx <= last; ++idx) {
              for (i = 0; i < TGSI_NUM_CHANNELS; i++)
                 bld->temps[idx][i] = lp_build_alloca(gallivm, vec_type, "temp");
           }
-         break;
+      }
+      break;
  
-      case TGSI_FILE_OUTPUT:
-         if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
+   case TGSI_FILE_OUTPUT:
+      if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
+         for (idx = first; idx <= last; ++idx) {
              for (i = 0; i < TGSI_NUM_CHANNELS; i++)
                 bld->outputs[idx][i] = lp_build_alloca(gallivm,
                                                        vec_type, "output");
           }
-         break;
+      }
+      break;
  
-      case TGSI_FILE_ADDRESS:
-        /* ADDR registers are only allocated with an integer LLVM IR type,
-         * as they are guaranteed to always have integers.
-         * XXX: Not sure if this exception is worthwhile (or the whole idea of
-         * an ADDR register for that matter).
-         */
+   case TGSI_FILE_ADDRESS:
+      /* ADDR registers are only allocated with an integer LLVM IR type,
+       * as they are guaranteed to always have integers.
+       * XXX: Not sure if this exception is worthwhile (or the whole idea of
+       * an ADDR register for that matter).
+       */
+      assert(last < LP_MAX_TGSI_ADDRS);
+      for (idx = first; idx <= last; ++idx) {
           assert(idx < LP_MAX_TGSI_ADDRS);
           for (i = 0; i < TGSI_NUM_CHANNELS; i++)
              bld->addr[idx][i] = lp_build_alloca(gallivm, bld_base->base.int_vec_type, "addr");
-         break;
+      }
+      break;
  
-      case TGSI_FILE_PREDICATE:
-         assert(idx < LP_MAX_TGSI_PREDS);
+   case TGSI_FILE_PREDICATE:
+      assert(last < LP_MAX_TGSI_PREDS);
+      for (idx = first; idx <= last; ++idx) {
           for (i = 0; i < TGSI_NUM_CHANNELS; i++)
              bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
                                                   "predicate");
-         break;
+      }
+      break;
  
-      case TGSI_FILE_SAMPLER_VIEW:
-         /*
-          * The target stored here MUST match whatever there actually
-          * is in the set sampler views (what about return type?).
-          */
-         assert(idx < PIPE_MAX_SHADER_SAMPLER_VIEWS);
+   case TGSI_FILE_SAMPLER_VIEW:
+      /*
+       * The target stored here MUST match whatever there actually
+       * is in the set sampler views (what about return type?).
+       */
+      assert(last < PIPE_MAX_SHADER_SAMPLER_VIEWS);
+      for (idx = first; idx <= last; ++idx) {
           bld->sv[idx] = decl->SamplerView;
-         break;
-
-      default:
-         /* don't need to declare other vars */
-         break;
        }
+      break;
+
+   case TGSI_FILE_CONSTANT:
+   {
+      /*
+       * We could trivially fetch the per-buffer pointer when fetching the
+       * constant, relying on llvm to figure out it's always the same pointer
+       * anyway. However, doing so results in a huge (more than factor of 10)
+       * slowdown in llvm compilation times for some (but not all) shaders
+       * (more specifically, the IR optimization spends way more time in
+       * DominatorTree::dominates). At least with llvm versions 3.1, 3.3.
+       */
+      unsigned idx2D = decl->Dim.Index2D;
+      LLVMValueRef index2D = lp_build_const_int32(gallivm, idx2D);
+      assert(idx2D < LP_MAX_TGSI_CONST_BUFFERS);
+      bld->consts[idx2D] =
+         lp_build_array_get(gallivm, bld->consts_ptr, index2D);
+      bld->consts_sizes[idx2D] =
+         lp_build_array_get(gallivm, bld->const_sizes_ptr, index2D);
+   }
+      break;
+
+   default:
+      /* don't need to declare other vars */
+      break;
     }
  }
  
@@ -2862,7 +2952,20 @@ tex_emit(
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE, emit_data->output);
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 1);
+}
+
+static void
+tex2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 2);
  }
  
  static void
@@ -2874,7 +2977,19 @@ txb_emit(
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
     emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
-            emit_data->output);
+            emit_data->output, 1);
+}
+
+static void
+txb2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
+            emit_data->output, 2);
  }
  
  static void
@@ -2886,7 +3001,7 @@ txd_emit(
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
     emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
-            emit_data->output);
+            emit_data->output, 3);
  }
  
  static void
@@ -2898,7 +3013,19 @@ txl_emit(
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
     emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
-            emit_data->output);
+            emit_data->output, 1);
+}
+
+static void
+txl2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
+            emit_data->output, 2);
  }
  
  static void
@@ -2910,7 +3037,7 @@ txp_emit(
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
     emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_PROJECTED,
-            emit_data->output);
+            emit_data->output, 1);
  }
  
  static void
@@ -3390,99 +3517,6 @@ cont_emit(
     lp_exec_continue(&bld->exec_mask);
  }
  
-/* XXX: Refactor and move it to lp_bld_tgsi_action.c
- *
- * XXX: What do the comments about xmm registers mean?  Maybe they are left over
- * from old code, but there is no garauntee that LLVM will use those registers
- * for this code.
- *
- * XXX: There should be no calls to lp_build_emit_fetch in this function.  This
- * should be handled by the emit_data->fetch_args function. */
-static void
-nrm_emit(
-   const struct lp_build_tgsi_action * action,
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   LLVMValueRef tmp0, tmp1;
-   LLVMValueRef tmp4 = NULL;
-   LLVMValueRef tmp5 = NULL;
-   LLVMValueRef tmp6 = NULL;
-   LLVMValueRef tmp7 = NULL;
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-
-   uint dims = (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
-
-  if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X) ||
-      TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y) ||
-      TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z) ||
-      (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W) && dims == 4)) {
-
-      /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
-
-      /* xmm4 = src.x */
-      /* xmm0 = src.x * src.x */
-      tmp0 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_X);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X)) {
-         tmp4 = tmp0;
-      }
-      tmp0 = lp_build_mul( &bld->bld_base.base, tmp0, tmp0);
-
-      /* xmm5 = src.y */
-      /* xmm0 = xmm0 + src.y * src.y */
-      tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_Y);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y)) {
-         tmp5 = tmp1;
-      }
-      tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-      tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
-
-      /* xmm6 = src.z */
-      /* xmm0 = xmm0 + src.z * src.z */
-      tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_Z);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z)) {
-         tmp6 = tmp1;
-      }
-      tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-      tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
-
-      if (dims == 4) {
-         /* xmm7 = src.w */
-         /* xmm0 = xmm0 + src.w * src.w */
-         tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-         if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W)) {
-            tmp7 = tmp1;
-         }
-         tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-         tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
-      }
-      /* xmm1 = 1 / sqrt(xmm0) */
-      tmp1 = lp_build_rsqrt( &bld->bld_base.base, tmp0);
-       /* dst.x = xmm1 * src.x */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X)) {
-         emit_data->output[TGSI_CHAN_X] = lp_build_mul( &bld->bld_base.base, tmp4, tmp1);
-      }
-      /* dst.y = xmm1 * src.y */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y)) {
-         emit_data->output[TGSI_CHAN_Y] = lp_build_mul( &bld->bld_base.base, tmp5, tmp1);
-      }
-
-      /* dst.z = xmm1 * src.z */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z)) {
-         emit_data->output[TGSI_CHAN_Z] = lp_build_mul( &bld->bld_base.base, tmp6, tmp1);
-      }
-      /* dst.w = xmm1 * src.w */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X) && dims == 4) {
-         emit_data->output[TGSI_CHAN_W] = lp_build_mul( &bld->bld_base.base, tmp7, tmp1);
-      }
-   }
-
-   /* dst.w = 1.0 */
-   if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W) && dims == 3) {
-       emit_data->output[TGSI_CHAN_W] = bld->bld_base.base.one;
-   }
-}
-
  static void emit_prologue(struct lp_build_tgsi_context * bld_base)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -3569,7 +3603,8 @@ static void emit_prologue(struct lp_build_tgsi_context * bld_base)
     if (DEBUG_EXECUTION) {
        lp_build_printf(gallivm, "\n");
        emit_dump_file(bld, TGSI_FILE_CONSTANT);
-      emit_dump_file(bld, TGSI_FILE_INPUT);
+      if (!bld->gs_iface)
+         emit_dump_file(bld, TGSI_FILE_INPUT);
     }
  }
  
@@ -3707,8 +3742,6 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
     bld.bld_base.op_actions[TGSI_OPCODE_UIF].emit = uif_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_KILL_IF].emit = kill_if_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_KILL].emit = kill_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_NRM].emit = nrm_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_NRM4].emit = nrm_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_RET].emit = ret_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_SWITCH].emit = switch_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TEX].emit = tex_emit;
@@ -3718,6 +3751,9 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
     bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TEX2].emit = tex2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXB2].emit = txb2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXL2].emit = txl2_emit;
     /* DX10 sampling ops */
     bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE].emit = sample_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_B].emit = sample_b_emit;
@@ -3734,8 +3770,8 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
         * were forgetting so we're using MAX_VERTEX_VARYING from
         * that spec even though we could debug_assert if it's not
         * set, but that's a lot uglier. */
-      uint max_output_vertices = 32;
-      uint i = 0;
+      uint max_output_vertices;
+
        /* inputs are always indirect with gs */
        bld.indirect_files |= (1 << TGSI_FILE_INPUT);
        bld.gs_iface = gs_iface;
@@ -3743,12 +3779,11 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
        bld.bld_base.op_actions[TGSI_OPCODE_EMIT].emit = emit_vertex;
        bld.bld_base.op_actions[TGSI_OPCODE_ENDPRIM].emit = end_primitive;
  
-      for (i = 0; i < info->num_properties; ++i) {
-         if (info->properties[i].name ==
-             TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
-            max_output_vertices = info->properties[i].data[0];
-         }
-      }
+      max_output_vertices =
+            info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+      if (!max_output_vertices)
+         max_output_vertices = 32;
+
        bld.max_output_vertices_vec =
           lp_build_const_int_vec(gallivm, bld.bld_base.int_bld.type,
                                  max_output_vertices);