mesa: s/CALLOC/calloc/
[mesa.git] / src / mesa / state_tracker / st_glsl_to_tgsi.cpp
index cd4db255f54dff7877269e78e3403cd0ecfad94a..e270cf490b5d7a7d12b9ea46a3818ca9a3acfa9a 100644 (file)
@@ -356,7 +356,7 @@ public:
 
    /** List of immediate_storage */
    exec_list immediates;
-   int num_immediates;
+   unsigned num_immediates;
 
    /** List of function_entry */
    exec_list function_signatures;
@@ -413,7 +413,6 @@ public:
 
    bool process_move_condition(ir_rvalue *ir);
 
-   void remove_output_reads(gl_register_file type);
    void simplify_cmp(void);
 
    void rename_temp_register(int index, int new_index);
@@ -664,6 +663,9 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
       case3(SLT, ISLT, USLT);
       
       case2iu(ISHR, USHR);
+
+      case2fi(SSG, ISSG);
+      case3(ABS, IABS, IABS);
       
       default: break;
    }
@@ -1402,8 +1404,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
    case ir_unop_neg:
-      assert(result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_INT);
-      if (result_dst.type == GLSL_TYPE_INT)
+      if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
          emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
       else {
          op[0].negate = ~op[0].negate;
@@ -1411,7 +1412,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
    case ir_unop_abs:
-      assert(result_dst.type == GLSL_TYPE_FLOAT);
       emit(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
       break;
    case ir_unop_sign:
@@ -1448,9 +1448,29 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       emit(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
       break;
    case ir_unop_dFdy:
-      op[0].negate = ~op[0].negate;
-      emit(ir, TGSI_OPCODE_DDY, result_dst, op[0]);
+   {
+      /* The X component contains 1 or -1 depending on whether the framebuffer
+       * is a FBO or the window system buffer, respectively.
+       * It is then multiplied with the source operand of DDY.
+       */
+      static const gl_state_index transform_y_state[STATE_LENGTH]
+         = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
+
+      unsigned transform_y_index =
+         _mesa_add_state_reference(this->prog->Parameters,
+                                   transform_y_state);
+
+      st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
+                                          transform_y_index,
+                                          glsl_type::vec4_type);
+      transform_y.swizzle = SWIZZLE_XXXX;
+
+      st_src_reg temp = get_temp(glsl_type::vec4_type);
+
+      emit(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
+      emit(ir, TGSI_OPCODE_DDY, result_dst, temp);
       break;
+   }
 
    case ir_unop_noise: {
       /* At some point, a motivated person could add a better
@@ -1761,6 +1781,18 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       else
          emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
+   case ir_unop_f2u:
+      if (native_integers)
+         emit(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
+      else
+         emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
+      break;
+   case ir_unop_bitcast_f2i:
+   case ir_unop_bitcast_f2u:
+   case ir_unop_bitcast_i2f:
+   case ir_unop_bitcast_u2f:
+      result_src = op[0];
+      break;
    case ir_unop_f2b:
       emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
       break;
@@ -1774,13 +1806,14 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_ceil:
-      op[0].negate = ~op[0].negate;
-      emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
-      result_src.negate = ~result_src.negate;
+      emit(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
       break;
    case ir_unop_floor:
       emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
       break;
+   case ir_unop_round_even:
+      emit(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
+      break;
    case ir_unop_fract:
       emit(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
       break;
@@ -1807,33 +1840,37 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
    case ir_binop_lshift:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_SHL, result_dst, op[0]);
+         emit(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_rshift:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_ISHR, result_dst, op[0]);
+         emit(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_and:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0]);
+         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_xor:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0]);
+         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_or:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_OR, result_dst, op[0]);
+         emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
          break;
       }
-   case ir_unop_round_even:
+
       assert(!"GLSL 1.30 features unsupported");
       break;
 
+   case ir_binop_ubo_load:
+      assert(!"not yet supported");
+      break;
+
    case ir_quadop_vector:
       /* This operation should have already been handled.
        */
@@ -1921,7 +1958,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
          assert(var->location != -1);
          entry = new(mem_ctx) variable_storage(var,
                                                PROGRAM_OUTPUT,
-                                               var->location);
+                                               var->location + var->index);
          break;
       case ir_var_system_value:
          entry = new(mem_ctx) variable_storage(var,
@@ -2372,7 +2409,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
       gl_type = native_integers ? GL_BOOL : GL_FLOAT;
       for (i = 0; i < ir->type->vector_elements; i++) {
          if (native_integers)
-            values[i].b = ir->value.b[i];
+            values[i].u = ir->value.b[i] ? ~0 : 0;
          else
             values[i].f = ir->value.b[i];
       }
@@ -2435,7 +2472,7 @@ void
 glsl_to_tgsi_visitor::visit(ir_call *ir)
 {
    glsl_to_tgsi_instruction *call_inst;
-   ir_function_signature *sig = ir->get_callee();
+   ir_function_signature *sig = ir->callee;
    function_entry *entry = get_function_signature(sig);
    int i;
 
@@ -2642,8 +2679,9 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       ir->shadow_comparitor->accept(this);
 
       /* XXX This will need to be updated for cubemap array samplers. */
-      if (sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_2D &&
-          sampler_type->sampler_array) {
+      if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_2D &&
+          sampler_type->sampler_array) ||
+         sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
          coord_dst.writemask = WRITEMASK_W;
       } else {
          coord_dst.writemask = WRITEMASK_Z;
@@ -2744,8 +2782,6 @@ glsl_to_tgsi_visitor::visit(ir_return *ir)
 void
 glsl_to_tgsi_visitor::visit(ir_discard *ir)
 {
-   struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;
-
    if (ir->condition) {
       ir->condition->accept(this);
       this->result.negate = ~this->result.negate;
@@ -2753,8 +2789,6 @@ glsl_to_tgsi_visitor::visit(ir_discard *ir)
    } else {
       emit(ir, TGSI_OPCODE_KILP);
    }
-
-   fp->UsesKill = GL_TRUE;
 }
 
 void
@@ -2807,9 +2841,16 @@ glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
    num_immediates = 0;
    current_function = NULL;
    num_address_regs = 0;
+   samplers_used = 0;
    indirect_addr_temps = false;
    indirect_addr_consts = false;
+   glsl_version = 0;
+   native_integers = false;
    mem_ctx = ralloc_context(NULL);
+   ctx = NULL;
+   prog = NULL;
+   shader_program = NULL;
+   options = NULL;
 }
 
 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
@@ -2838,8 +2879,6 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
       if (is_tex_instruction(inst->op)) {
          v->samplers_used |= 1 << inst->sampler;
 
-         prog->SamplerTargets[inst->sampler] =
-            (gl_texture_index)inst->tex_target;
          if (inst->tex_shadow) {
             prog->ShadowSamplers |= 1 << inst->sampler;
          }
@@ -2847,7 +2886,9 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
    }
    
    prog->SamplersUsed = v->samplers_used;
-   _mesa_update_shader_textures_used(prog);
+
+   if (v->shader_program != NULL)
+      _mesa_update_shader_textures_used(v->shader_program, prog);
 }
 
 static void
@@ -2872,13 +2913,15 @@ set_uniform_initializer(struct gl_context *ctx, void *mem_ctx,
       return;
    }
 
-   int loc = _mesa_get_uniform_location(ctx, shader_program, name);
-
-   if (loc == -1) {
+   unsigned offset;
+   unsigned index = _mesa_get_uniform_location(ctx, shader_program, name,
+                                              &offset);
+   if (offset == GL_INVALID_INDEX) {
       fail_link(shader_program,
                "Couldn't find uniform for initializer %s\n", name);
       return;
    }
+   int loc = _mesa_uniform_merge_location_offset(index, offset);
 
    for (unsigned int i = 0; i < (type->is_array() ? type->length : 1); i++) {
       ir_constant *element;
@@ -2920,89 +2963,6 @@ set_uniform_initializer(struct gl_context *ctx, void *mem_ctx,
    }
 }
 
-/*
- * Scan/rewrite program to remove reads of custom (output) registers.
- * The passed type has to be either PROGRAM_OUTPUT or PROGRAM_VARYING
- * (for vertex shaders).
- * In GLSL shaders, varying vars can be read and written.
- * On some hardware, trying to read an output register causes trouble.
- * So, rewrite the program to use a temporary register in this case.
- * 
- * Based on _mesa_remove_output_reads from programopt.c.
- */
-void
-glsl_to_tgsi_visitor::remove_output_reads(gl_register_file type)
-{
-   GLuint i;
-   GLint outputMap[VERT_RESULT_MAX];
-   GLint outputTypes[VERT_RESULT_MAX];
-   GLuint numVaryingReads = 0;
-   GLboolean *usedTemps;
-   GLuint firstTemp = 0;
-
-   usedTemps = new GLboolean[MAX_TEMPS];
-   if (!usedTemps) {
-      return;
-   }
-   _mesa_find_used_registers(prog, PROGRAM_TEMPORARY,
-                             usedTemps, MAX_TEMPS);
-
-   assert(type == PROGRAM_VARYING || type == PROGRAM_OUTPUT);
-   assert(prog->Target == GL_VERTEX_PROGRAM_ARB || type != PROGRAM_VARYING);
-
-   for (i = 0; i < VERT_RESULT_MAX; i++)
-      outputMap[i] = -1;
-
-   /* look for instructions which read from varying vars */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
-      const GLuint numSrc = num_inst_src_regs(inst->op);
-      GLuint j;
-      for (j = 0; j < numSrc; j++) {
-         if (inst->src[j].file == type) {
-            /* replace the read with a temp reg */
-            const GLuint var = inst->src[j].index;
-            if (outputMap[var] == -1) {
-               numVaryingReads++;
-               outputMap[var] = _mesa_find_free_register(usedTemps,
-                                                         MAX_TEMPS,
-                                                         firstTemp);
-               outputTypes[var] = inst->src[j].type;
-               firstTemp = outputMap[var] + 1;
-            }
-            inst->src[j].file = PROGRAM_TEMPORARY;
-            inst->src[j].index = outputMap[var];
-         }
-      }
-   }
-
-   delete [] usedTemps;
-
-   if (numVaryingReads == 0)
-      return; /* nothing to be done */
-
-   /* look for instructions which write to the varying vars identified above */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
-      if (inst->dst.file == type && outputMap[inst->dst.index] >= 0) {
-         /* change inst to write to the temp reg, instead of the varying */
-         inst->dst.file = PROGRAM_TEMPORARY;
-         inst->dst.index = outputMap[inst->dst.index];
-      }
-   }
-   
-   /* insert new MOV instructions at the end */
-   for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (outputMap[i] >= 0) {
-         /* MOV VAR[i], TEMP[tmp]; */
-         st_src_reg src = st_src_reg(PROGRAM_TEMPORARY, outputMap[i], outputTypes[i]);
-         st_dst_reg dst = st_dst_reg(type, WRITEMASK_XYZW, outputTypes[i]);
-         dst.index = i;
-         this->emit(NULL, TGSI_OPCODE_MOV, dst, src);
-      }
-   }
-}
-
 /**
  * Returns the mask of channels (bitmask of WRITEMASK_X,Y,Z,W) which
  * are read from the given src in this instruction
@@ -3056,7 +3016,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
    if (!tempWrites) {
       return;
    }
-   memset(tempWrites, 0, sizeof(tempWrites));
+   memset(tempWrites, 0, sizeof(unsigned) * MAX_TEMPS);
    memset(outputWrites, 0, sizeof(outputWrites));
 
    foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -3505,18 +3465,23 @@ glsl_to_tgsi_visitor::eliminate_dead_code_advanced(void)
       switch (inst->op) {
       case TGSI_OPCODE_BGNLOOP:
       case TGSI_OPCODE_ENDLOOP:
+      case TGSI_OPCODE_CONT:
+      case TGSI_OPCODE_BRK:
          /* End of a basic block, clear the write array entirely.
-          * FIXME: This keeps us from killing dead code when the writes are
+          *
+          * This keeps us from killing dead code when the writes are
           * on either side of a loop, even when the register isn't touched
-          * inside the loop.
+          * inside the loop.  However, glsl_to_tgsi_visitor doesn't seem to emit
+          * dead code of this type, so it shouldn't make a difference as long as
+          * the dead code elimination pass in the GLSL compiler does its job.
           */
          memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
          break;
 
       case TGSI_OPCODE_ENDIF:
       case TGSI_OPCODE_ELSE:
-         /* Promote the recorded level all channels written inside the preceding
-          * if or else block to the level above the if/else block.
+         /* Promote the recorded level of all channels written inside the
+          * preceding if or else block to the level above the if/else block.
           */
          for (int r = 0; r < this->next_temp; r++) {
             for (int c = 0; c < 4; c++) {
@@ -3708,6 +3673,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
    /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
    v->ctx = original->ctx;
    v->prog = prog;
+   v->shader_program = NULL;
    v->glsl_version = original->glsl_version;
    v->native_integers = original->native_integers;
    v->options = original->options;
@@ -3717,6 +3683,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
    v->indirect_addr_temps = original->indirect_addr_temps;
    v->indirect_addr_consts = original->indirect_addr_consts;
    memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
+   v->num_immediates = original->num_immediates;
 
    /*
     * Get initial pixel color from the texture.
@@ -3786,6 +3753,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
     * new visitor. */
    foreach_iter(exec_list_iterator, iter, original->instructions) {
       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
+      glsl_to_tgsi_instruction *newinst;
       st_src_reg src_regs[3];
 
       if (inst->dst.file == PROGRAM_OUTPUT)
@@ -3803,7 +3771,8 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst->tex_target = inst->tex_target;
    }
 
    /* Make modifications to fragment program info. */
@@ -3835,6 +3804,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
    v->ctx = original->ctx;
    v->prog = prog;
+   v->shader_program = NULL;
    v->glsl_version = original->glsl_version;
    v->native_integers = original->native_integers;
    v->options = original->options;
@@ -3844,6 +3814,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    v->indirect_addr_temps = original->indirect_addr_temps;
    v->indirect_addr_consts = original->indirect_addr_consts;
    memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
+   v->num_immediates = original->num_immediates;
 
    /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
    coord = st_src_reg(PROGRAM_INPUT, FRAG_ATTRIB_TEX0, glsl_type::vec2_type);
@@ -3867,6 +3838,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
     * new visitor. */
    foreach_iter(exec_list_iterator, iter, original->instructions) {
       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
+      glsl_to_tgsi_instruction *newinst;
       st_src_reg src_regs[3];
 
       if (inst->dst.file == PROGRAM_OUTPUT)
@@ -3878,7 +3850,8 @@ get_bitmap_visitor(struct st_fragment_program *fp,
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst->tex_target = inst->tex_target;
    }
 
    /* Make modifications to fragment program info. */
@@ -3908,12 +3881,6 @@ struct st_translate {
    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
    struct ureg_src systemValues[SYSTEM_VALUE_MAX];
 
-   /* Extra info for handling point size clamping in vertex shader */
-   struct ureg_dst pointSizeResult; /**< Actual point size output register */
-   struct ureg_src pointSizeConst;  /**< Point size range constant register */
-   GLint pointSizeOutIndex;         /**< Temp point size output register */
-   GLboolean prevInstWrotePointSize;
-
    const GLuint *inputMapping;
    const GLuint *outputMapping;
 
@@ -4030,14 +3997,11 @@ dst_register(struct st_translate *t,
 
    case PROGRAM_TEMPORARY:
       if (ureg_dst_is_undef(t->temps[index]))
-         t->temps[index] = ureg_DECL_temporary(t->ureg);
+         t->temps[index] = ureg_DECL_local_temporary(t->ureg);
 
       return t->temps[index];
 
    case PROGRAM_OUTPUT:
-      if (t->procType == TGSI_PROCESSOR_VERTEX && index == VERT_RESULT_PSIZ)
-         t->prevInstWrotePointSize = GL_TRUE;
-
       if (t->procType == TGSI_PROCESSOR_VERTEX)
          assert(index < VERT_RESULT_MAX);
       else if (t->procType == TGSI_PROCESSOR_FRAGMENT)
@@ -4064,7 +4028,7 @@ dst_register(struct st_translate *t,
 static struct ureg_src
 src_register(struct st_translate *t,
              gl_register_file file,
-             GLuint index)
+             GLint index)
 {
    switch(file) {
    case PROGRAM_UNDEFINED:
@@ -4072,9 +4036,9 @@ src_register(struct st_translate *t,
 
    case PROGRAM_TEMPORARY:
       assert(index >= 0);
-      assert(index < Elements(t->temps));
+      assert(index < (int) Elements(t->temps));
       if (ureg_dst_is_undef(t->temps[index]))
-         t->temps[index] = ureg_DECL_temporary(t->ureg);
+         t->temps[index] = ureg_DECL_local_temporary(t->ureg);
       return ureg_src(t->temps[index]);
 
    case PROGRAM_NAMED_PARAM:
@@ -4105,7 +4069,7 @@ src_register(struct st_translate *t,
       return ureg_src(t->address[index]);
 
    case PROGRAM_SYSTEM_VALUE:
-      assert(index < Elements(t->systemValues));
+      assert(index < (int) Elements(t->systemValues));
       return t->systemValues[index];
 
    default:
@@ -4120,7 +4084,7 @@ src_register(struct st_translate *t,
 static struct ureg_dst
 translate_dst(struct st_translate *t,
               const st_dst_reg *dst_reg,
-              bool saturate)
+              bool saturate, bool clamp_color)
 {
    struct ureg_dst dst = dst_register(t, 
                                       dst_reg->file,
@@ -4130,6 +4094,27 @@ translate_dst(struct st_translate *t,
    
    if (saturate)
       dst = ureg_saturate(dst);
+   else if (clamp_color && dst_reg->file == PROGRAM_OUTPUT) {
+      /* Clamp colors for ARB_color_buffer_float. */
+      switch (t->procType) {
+      case TGSI_PROCESSOR_VERTEX:
+         /* XXX if the geometry shader is present, this must be done there
+          * instead of here. */
+         if (dst_reg->index == VERT_RESULT_COL0 ||
+             dst_reg->index == VERT_RESULT_COL1 ||
+             dst_reg->index == VERT_RESULT_BFC0 ||
+             dst_reg->index == VERT_RESULT_BFC1) {
+            dst = ureg_saturate(dst);
+         }
+         break;
+
+      case TGSI_PROCESSOR_FRAGMENT:
+         if (dst_reg->index >= FRAG_RESULT_COLOR) {
+            dst = ureg_saturate(dst);
+         }
+         break;
+      }
+   }
 
    if (dst_reg->reladdr != NULL)
       dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
@@ -4193,13 +4178,15 @@ translate_tex_offset(struct st_translate *t,
    offset.SwizzleX = in_offset->SwizzleX;
    offset.SwizzleY = in_offset->SwizzleY;
    offset.SwizzleZ = in_offset->SwizzleZ;
+   offset.Padding = 0;
 
    return offset;
 }
 
 static void
 compile_tgsi_instruction(struct st_translate *t,
-                         const glsl_to_tgsi_instruction *inst)
+                         const glsl_to_tgsi_instruction *inst,
+                         bool clamp_dst_color_output)
 {
    struct ureg_program *ureg = t->ureg;
    GLuint i;
@@ -4216,7 +4203,8 @@ compile_tgsi_instruction(struct st_translate *t,
    if (num_dst) 
       dst[0] = translate_dst(t, 
                              &inst->dst,
-                             inst->saturate);
+                             inst->saturate,
+                             clamp_dst_color_output);
 
    for (i = 0; i < num_src; i++) 
       src[i] = translate_src(t, &inst->src[i]);
@@ -4249,7 +4237,7 @@ compile_tgsi_instruction(struct st_translate *t,
       ureg_tex_insn(ureg,
                     inst->op,
                     dst, num_dst, 
-                    translate_texture_target(inst->tex_target, inst->tex_shadow),
+                    st_translate_texture_target(inst->tex_target, inst->tex_shadow),
                     texoffsets, inst->tex_offset_num_offset,
                     src, num_src);
       return;
@@ -4308,7 +4296,7 @@ emit_wpos_adjustment( struct st_translate *t,
           * or not, which is determined by testing against the inversion
           * state variable used below, which will be either +1 or -1.
           */
-         struct ureg_dst adj_temp = ureg_DECL_temporary(ureg);
+         struct ureg_dst adj_temp = ureg_DECL_local_temporary(ureg);
 
          ureg_CMP(ureg, adj_temp,
                   ureg_scalar(wpostrans, invert ? 2 : 0),
@@ -4518,11 +4506,13 @@ st_translate_program(
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
    const GLuint interpMode[],
+   const GLboolean is_centroid[],
    GLuint numOutputs,
    const GLuint outputMapping[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[],
-   boolean passthrough_edgeflags)
+   boolean passthrough_edgeflags,
+   boolean clamp_color)
 {
    struct st_translate *t;
    unsigned i;
@@ -4543,37 +4533,26 @@ st_translate_program(
    t->inputMapping = inputMapping;
    t->outputMapping = outputMapping;
    t->ureg = ureg;
-   t->pointSizeOutIndex = -1;
-   t->prevInstWrotePointSize = GL_FALSE;
+
+   if (program->shader_program) {
+      for (i = 0; i < program->shader_program->NumUserUniformStorage; i++) {
+         struct gl_uniform_storage *const storage =
+               &program->shader_program->UniformStorage[i];
+
+         _mesa_uniform_detach_all_driver_storage(storage);
+      }
+   }
 
    /*
     * Declare input attributes.
     */
    if (procType == TGSI_PROCESSOR_FRAGMENT) {
       for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_fs_input(ureg,
-                                           inputSemanticName[i],
-                                           inputSemanticIndex[i],
-                                           interpMode[i]);
-      }
-
-      if (program->shader_program->FragDepthLayout != FRAG_DEPTH_LAYOUT_NONE) {
-         switch (program->shader_program->FragDepthLayout) {
-         case FRAG_DEPTH_LAYOUT_ANY:
-            ureg_property_fs_depth_layout(ureg, TGSI_FS_DEPTH_LAYOUT_ANY);
-            break;
-         case FRAG_DEPTH_LAYOUT_GREATER:
-            ureg_property_fs_depth_layout(ureg, TGSI_FS_DEPTH_LAYOUT_GREATER);
-            break;
-         case FRAG_DEPTH_LAYOUT_LESS:
-            ureg_property_fs_depth_layout(ureg, TGSI_FS_DEPTH_LAYOUT_LESS);
-            break;
-         case FRAG_DEPTH_LAYOUT_UNCHANGED:
-            ureg_property_fs_depth_layout(ureg, TGSI_FS_DEPTH_LAYOUT_UNCHANGED);
-            break;
-         default:
-            assert(0);
-         }
+         t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
+                                                        inputSemanticName[i],
+                                                        inputSemanticIndex[i],
+                                                        interpMode[i], 0,
+                                                        is_centroid[i]);
       }
 
       if (proginfo->InputsRead & FRAG_BIT_WPOS) {
@@ -4640,25 +4619,6 @@ st_translate_program(
          t->outputs[i] = ureg_DECL_output(ureg,
                                           outputSemanticName[i],
                                           outputSemanticIndex[i]);
-         if ((outputSemanticName[i] == TGSI_SEMANTIC_PSIZE) && proginfo->Id) {
-            /* Writing to the point size result register requires special
-             * handling to implement clamping.
-             */
-            static const gl_state_index pointSizeClampState[STATE_LENGTH]
-               = { STATE_INTERNAL, STATE_POINT_SIZE_IMPL_CLAMP, (gl_state_index)0, (gl_state_index)0, (gl_state_index)0 };
-               /* XXX: note we are modifying the incoming shader here!  Need to
-               * do this before emitting the constant decls below, or this
-               * will be missed.
-               */
-            unsigned pointSizeClampConst =
-               _mesa_add_state_reference(proginfo->Parameters,
-                                         pointSizeClampState);
-            struct ureg_dst psizregtemp = ureg_DECL_temporary(ureg);
-            t->pointSizeConst = ureg_DECL_constant(ureg, pointSizeClampConst);
-            t->pointSizeResult = t->outputs[i];
-            t->pointSizeOutIndex = i;
-            t->outputs[i] = psizregtemp;
-         }
       }
       if (passthrough_edgeflags)
          emit_edgeflags(t);
@@ -4680,6 +4640,25 @@ st_translate_program(
          if (sysInputs & (1 << i)) {
             unsigned semName = mesa_sysval_to_semantic[i];
             t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
+            if (semName == TGSI_SEMANTIC_INSTANCEID ||
+                semName == TGSI_SEMANTIC_VERTEXID) {
+               /* From Gallium perspective, these system values are always
+                * integer, and require native integer support.  However, if
+                * native integer is supported on the vertex stage but not the
+                * pixel stage (e.g., i915g + draw), Mesa will generate IR that
+                * assumes these system values are floats. To resolve the
+                * inconsistency, we insert a U2F.
+                */
+               struct st_context *st = st_context(ctx);
+               struct pipe_screen *pscreen = st->pipe->screen;
+               assert(procType == TGSI_PROCESSOR_VERTEX);
+               assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
+               if (!ctx->Const.NativeIntegers) {
+                  struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
+                  ureg_U2F( t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X), t->systemValues[i]);
+                  t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
+               }
+            }
             numSys++;
             sysInputs &= ~(1 << i);
          }
@@ -4693,7 +4672,7 @@ st_translate_program(
        */
       for (i = 0; i < (unsigned)program->next_temp; i++) {
          /* XXX use TGSI_FILE_TEMPORARY_ARRAY when it's supported by ureg */
-         t->temps[i] = ureg_DECL_temporary(t->ureg);
+         t->temps[i] = ureg_DECL_local_temporary(t->ureg);
       }
    }
 
@@ -4701,7 +4680,8 @@ st_translate_program(
     * so we put all the translated regs in t->constants.
     */
    if (proginfo->Parameters) {
-      t->constants = (struct ureg_src *)CALLOC(proginfo->Parameters->NumParameters * sizeof(t->constants[0]));
+      t->constants = (struct ureg_src *)
+         calloc(proginfo->Parameters->NumParameters, sizeof(t->constants[0]));
       if (t->constants == NULL) {
          ret = PIPE_ERROR_OUT_OF_MEMORY;
          goto out;
@@ -4740,7 +4720,8 @@ st_translate_program(
    
    /* Emit immediate values.
     */
-   t->immediates = (struct ureg_src *)CALLOC(program->num_immediates * sizeof(struct ureg_src));
+   t->immediates = (struct ureg_src *)
+      calloc(program->num_immediates, sizeof(struct ureg_src));
    if (t->immediates == NULL) {
       ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out;
@@ -4748,8 +4729,10 @@ st_translate_program(
    i = 0;
    foreach_iter(exec_list_iterator, iter, program->immediates) {
       immediate_storage *imm = (immediate_storage *)iter.get();
+      assert(i < program->num_immediates);
       t->immediates[i++] = emit_immediate(t, imm->values, imm->type, imm->size);
    }
+   assert(i == program->num_immediates);
 
    /* texture samplers */
    for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
@@ -4762,26 +4745,8 @@ st_translate_program(
     */
    foreach_iter(exec_list_iterator, iter, program->instructions) {
       set_insn_start(t, ureg_get_instruction_number(ureg));
-      compile_tgsi_instruction(t, (glsl_to_tgsi_instruction *)iter.get());
-
-      if (t->prevInstWrotePointSize && proginfo->Id) {
-         /* The previous instruction wrote to the (fake) vertex point size
-          * result register.  Now we need to clamp that value to the min/max
-          * point size range, putting the result into the real point size
-          * register.
-          * Note that we can't do this easily at the end of program due to
-          * possible early return.
-          */
-         set_insn_start(t, ureg_get_instruction_number(ureg));
-         ureg_MAX(t->ureg,
-                  ureg_writemask(t->outputs[t->pointSizeOutIndex], WRITEMASK_X),
-                  ureg_src(t->outputs[t->pointSizeOutIndex]),
-                  ureg_swizzle(t->pointSizeConst, 1,1,1,1));
-         ureg_MIN(t->ureg, ureg_writemask(t->pointSizeResult, WRITEMASK_X),
-                  ureg_src(t->outputs[t->pointSizeOutIndex]),
-                  ureg_swizzle(t->pointSizeConst, 2,2,2,2));
-      }
-      t->prevInstWrotePointSize = GL_FALSE;
+      compile_tgsi_instruction(t, (glsl_to_tgsi_instruction *)iter.get(),
+                               clamp_color);
    }
 
    /* Fix up all emitted labels:
@@ -4791,6 +4756,20 @@ st_translate_program(
                        t->insn[t->labels[i].branch_target]);
    }
 
+   if (program->shader_program) {
+      /* This has to be done last.  Any operation that can cause
+       * prog->ParameterValues to get reallocated (e.g., anything that adds a
+       * program constant) has to happen before creating this linkage.
+       */
+      for (unsigned i = 0; i < MESA_SHADER_TYPES; i++) {
+         if (program->shader_program->_LinkedShaders[i] == NULL)
+            continue;
+
+         _mesa_associate_uniform_storage(ctx, program->shader_program,
+               program->shader_program->_LinkedShaders[i]->Program->Parameters);
+      }
+   }
+
 out:
    if (t) {
       FREE(t->insn);
@@ -4816,12 +4795,10 @@ out:
 static struct gl_program *
 get_mesa_program(struct gl_context *ctx,
                  struct gl_shader_program *shader_program,
-                struct gl_shader *shader)
+                 struct gl_shader *shader)
 {
-   glsl_to_tgsi_visitor* v = new glsl_to_tgsi_visitor();
+   glsl_to_tgsi_visitor* v;
    struct gl_program *prog;
-   struct pipe_screen * screen = st_context(ctx)->pipe->screen;
-   unsigned pipe_shader_type;
    GLenum target;
    const char *target_string;
    bool progress;
@@ -4832,17 +4809,14 @@ get_mesa_program(struct gl_context *ctx,
    case GL_VERTEX_SHADER:
       target = GL_VERTEX_PROGRAM_ARB;
       target_string = "vertex";
-      pipe_shader_type = PIPE_SHADER_VERTEX;
       break;
    case GL_FRAGMENT_SHADER:
       target = GL_FRAGMENT_PROGRAM_ARB;
       target_string = "fragment";
-      pipe_shader_type = PIPE_SHADER_FRAGMENT;
       break;
    case GL_GEOMETRY_SHADER:
       target = GL_GEOMETRY_PROGRAM_NV;
       target_string = "geometry";
-      pipe_shader_type = PIPE_SHADER_GEOMETRY;
       break;
    default:
       assert(!"should not be reached");
@@ -4855,6 +4829,7 @@ get_mesa_program(struct gl_context *ctx,
    if (!prog)
       return NULL;
    prog->Parameters = _mesa_new_parameter_list();
+   v = new glsl_to_tgsi_visitor();
    v->ctx = ctx;
    v->prog = prog;
    v->shader_program = shader_program;
@@ -4865,6 +4840,9 @@ get_mesa_program(struct gl_context *ctx,
    _mesa_generate_parameters_list_for_uniforms(shader_program, shader,
                                               prog->Parameters);
 
+   /* Remove reads from output registers. */
+   lower_output_reads(shader->ir);
+
    /* Emit intermediate IR for main(). */
    visit_exec_list(shader->ir, v);
 
@@ -4911,14 +4889,6 @@ get_mesa_program(struct gl_context *ctx,
    }
 #endif
 
-   if (!screen->get_shader_param(screen, pipe_shader_type,
-                                 PIPE_SHADER_CAP_OUTPUT_READ)) {
-      /* Remove reads to output registers, and to varyings in vertex shaders. */
-      v->remove_output_reads(PROGRAM_OUTPUT);
-      if (target == GL_VERTEX_PROGRAM_ARB)
-         v->remove_output_reads(PROGRAM_VARYING);
-   }
-   
    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
    v->simplify_cmp();
    v->copy_propagate();
@@ -5040,13 +5010,18 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
             &ctx->ShaderCompilerOptions[_mesa_shader_type_to_index(prog->_LinkedShaders[i]->Type)];
 
       do {
+         unsigned what_to_lower = MOD_TO_FRACT | DIV_TO_MUL_RCP |
+            EXP_TO_EXP2 | LOG_TO_LOG2;
+         if (options->EmitNoPow)
+            what_to_lower |= POW_TO_EXP2;
+         if (!ctx->Const.NativeIntegers)
+            what_to_lower |= INT_DIV_TO_MUL_RCP;
+
          progress = false;
 
          /* Lowering */
          do_mat_op_to_vec(ir);
-         lower_instructions(ir, (MOD_TO_FRACT | DIV_TO_MUL_RCP | EXP_TO_EXP2
-                                | LOG_TO_LOG2 | INT_DIV_TO_MUL_RCP
-                                | ((options->EmitNoPow) ? POW_TO_EXP2 : 0)));
+         lower_instructions(ir, what_to_lower);
 
          progress = do_lower_jumps(ir, true, true, options->EmitNoMainReturn, options->EmitNoCont, options->EmitNoLoops) || progress;
 
@@ -5114,4 +5089,28 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
    return GL_TRUE;
 }
 
+void /* Copies the shader program's LinkedTransformFeedback data into *so. */
+st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
+                                const GLuint outputMapping[], /* indexed by OutputRegister */
+                                struct pipe_stream_output_info *so) /* filled in place */
+{
+   unsigned i;
+   struct gl_transform_feedback_info *info =
+      &glsl_to_tgsi->shader_program->LinkedTransformFeedback;
+   /* Per-output fields; the register index is remapped via outputMapping[]. */
+   for (i = 0; i < info->NumOutputs; i++) { /* NOTE(review): no bound check against so->output[] size here -- confirm */
+      so->output[i].register_index =
+         outputMapping[info->Outputs[i].OutputRegister];
+      so->output[i].start_component = info->Outputs[i].ComponentOffset;
+      so->output[i].num_components = info->Outputs[i].NumComponents;
+      so->output[i].output_buffer = info->Outputs[i].OutputBuffer;
+      so->output[i].dst_offset = info->Outputs[i].DstOffset;
+   }
+   /* Strides are copied for all PIPE_MAX_SO_BUFFERS slots. */
+   for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+      so->stride[i] = info->BufferStride[i];
+   }
+   so->num_outputs = info->NumOutputs; /* same count the first loop iterated over */
+}
+
 } /* extern "C" */