nv50/ir/ra: Fix traversal before the beginning of the active list in buildRIG.

[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c

index 925028700cdf5936315684f4f11978a5850593d6..10810bf3e07ef049c7007663d029e550a0533034 100644 (file)
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -1,5 +1,5 @@
  /*
- * Copyright 2010 Chrsitoph Bumiller
+ * Copyright 2010 Christoph Bumiller
   *
   * Permission is hereby granted, free of charge, to any person obtaining a
   * copy of this software and associated documentation files (the "Software"),
@@ -20,628 +20,426 @@
   * SOFTWARE.
   */
  
-/* #define NV50_PROGRAM_DEBUG */
-
  #include "nv50_program.h"
-#include "nv50_pc.h"
  #include "nv50_context.h"
  
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_dump.h"
+#include "codegen/nv50_ir_driver.h"
  
  static INLINE unsigned
  bitcount4(const uint32_t val)
  {
-   static const unsigned cnt[16]
+   static const uint8_t cnt[16]
     = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
     return cnt[val & 0xf];
  }
  
-static unsigned
-nv50_tgsi_src_mask(const struct tgsi_full_instruction *inst, int c)
-{
-   unsigned mask = inst->Dst[0].Register.WriteMask;
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_COS:
-   case TGSI_OPCODE_SIN:
-      return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
-   case TGSI_OPCODE_DP3:
-      return 0x7;
-   case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DPH:
-   case TGSI_OPCODE_KIL: /* WriteMask ignored */
-      return 0xf;
-   case TGSI_OPCODE_DST:
-      return mask & (c ? 0xa : 0x6);
-   case TGSI_OPCODE_EX2:
-   case TGSI_OPCODE_EXP:
-   case TGSI_OPCODE_LG2:
-   case TGSI_OPCODE_LOG:
-   case TGSI_OPCODE_POW:
-   case TGSI_OPCODE_RCP:
-   case TGSI_OPCODE_RSQ:
-   case TGSI_OPCODE_SCS:
-      return 0x1;
-   case TGSI_OPCODE_IF:
-      return 0x1;
-   case TGSI_OPCODE_LIT:
-      return 0xb;
-   case TGSI_OPCODE_TEX:
-   case TGSI_OPCODE_TXB:
-   case TGSI_OPCODE_TXL:
-   case TGSI_OPCODE_TXP:
-   {
-      const struct tgsi_instruction_texture *tex;
-
-      assert(inst->Instruction.Texture);
-      tex = &inst->Texture;
-
-      mask = 0x7;
-      if (inst->Instruction.Opcode != TGSI_OPCODE_TEX &&
-          inst->Instruction.Opcode != TGSI_OPCODE_TXD)
-         mask |= 0x8; /* bias, lod or proj */
-
-      switch (tex->Texture) {
-      case TGSI_TEXTURE_1D:
-         mask &= 0x9;
-         break;
-      case TGSI_TEXTURE_SHADOW1D:
-         mask &= 0x5;
-         break;
-      case TGSI_TEXTURE_2D:
-         mask &= 0xb;
-         break;
-      default:
-         break;
-      }
-   }
-          return mask;
-   case TGSI_OPCODE_XPD:
-   {
-      unsigned x = 0;
-      if (mask & 1) x |= 0x6;
-      if (mask & 2) x |= 0x5;
-      if (mask & 4) x |= 0x3;
-      return x;
-   }
-   default:
-      break;
-   }
-
-   return mask;
-}
-
-static void
-nv50_indirect_inputs(struct nv50_translation_info *ti, int id)
+static int
+nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
  {
-   int i, c;
+   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
+   unsigned i, n, c;
  
-   for (i = 0; i < PIPE_MAX_SHADER_INPUTS; ++i)
-      for (c = 0; c < 4; ++c)
-         ti->input_access[i][c] = id;
+   n = 0;
+   for (i = 0; i < info->numInputs; ++i) {
+      prog->in[i].id = i;
+      prog->in[i].sn = info->in[i].sn;
+      prog->in[i].si = info->in[i].si;
+      prog->in[i].hw = n;
+      prog->in[i].mask = info->in[i].mask;
  
-   ti->indirect_inputs = TRUE;
-}
+      prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
  
-static void
-nv50_indirect_outputs(struct nv50_translation_info *ti, int id)
-{
-   int i, c;
-
-   for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i)
        for (c = 0; c < 4; ++c)
-         ti->output_access[i][c] = id;
-
-   ti->indirect_outputs = TRUE;
-}
-
-static void
-prog_inst(struct nv50_translation_info *ti,
-          const struct tgsi_full_instruction *inst, int id)
-{
-   const struct tgsi_dst_register *dst;
-   const struct tgsi_src_register *src;
-   int s, c, k;
-   unsigned mask;
-
-   if (inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) {
-      ti->subr[ti->subr_nr].pos = id - 1;
-      ti->subr[ti->subr_nr].id = ti->subr_nr + 1; /* id 0 is main program */
-      ++ti->subr_nr;
-   }
-
-   if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
-      dst = &inst->Dst[0].Register;
-
-      for (c = 0; c < 4; ++c) {
-         if (dst->Indirect)
-            nv50_indirect_outputs(ti, id);
-         if (!(dst->WriteMask & (1 << c)))
-            continue;
-         ti->output_access[dst->Index][c] = id;
-      }
-
-      if (inst->Instruction.Opcode == TGSI_OPCODE_MOV &&
-          inst->Src[0].Register.File == TGSI_FILE_INPUT &&
-          dst->Index == ti->edgeflag_out)
-         ti->p->vp.edgeflag = inst->Src[0].Register.Index;
+         if (info->in[i].mask & (1 << c))
+            info->in[i].slot[c] = n++;
     }
+   prog->in_nr = info->numInputs;
  
-   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
-      src = &inst->Src[s].Register;
-      if (src->File != TGSI_FILE_INPUT)
+   for (i = 0; i < info->numSysVals; ++i) {
+      switch (info->sv[i].sn) {
+      case TGSI_SEMANTIC_INSTANCEID:
+         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
           continue;
-      mask = nv50_tgsi_src_mask(inst, s);
-
-      if (inst->Src[s].Register.Indirect)
-         nv50_indirect_inputs(ti, id);
-
-      for (c = 0; c < 4; ++c) {
-         if (!(mask & (1 << c)))
-            continue;
-         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c);
-         if (k <= TGSI_SWIZZLE_W)
-            ti->input_access[src->Index][k] = id;
-      }
-   }
-}
-
-/* Probably should introduce something like struct tgsi_function_declaration
- * instead of trying to guess inputs/outputs.
- */
-static void
-prog_subroutine_inst(struct nv50_subroutine *subr,
-                     const struct tgsi_full_instruction *inst)
-{
-   const struct tgsi_dst_register *dst;
-   const struct tgsi_src_register *src;
-   int s, c, k;
-   unsigned mask;
-
-   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
-      src = &inst->Src[s].Register;
-      if (src->File != TGSI_FILE_TEMPORARY)
+      case TGSI_SEMANTIC_VERTEXID:
+         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
+         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12;
           continue;
-      mask = nv50_tgsi_src_mask(inst, s);
-
-      assert(!inst->Src[s].Register.Indirect);
-
-      for (c = 0; c < 4; ++c) {
-         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c);
-
-         if ((mask & (1 << c)) && k < TGSI_SWIZZLE_W)
-            if (!(subr->retv[src->Index / 32][k] & (1 << (src->Index % 32))))
-               subr->argv[src->Index / 32][k] |= 1 << (src->Index % 32);
-      }
-   }
-
-   if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
-      dst = &inst->Dst[0].Register;
-
-      for (c = 0; c < 4; ++c)
-         if (dst->WriteMask & (1 << c))
-            subr->retv[dst->Index / 32][c] |= 1 << (dst->Index % 32);
-   }
-}
-
-static void
-prog_immediate(struct nv50_translation_info *ti,
-               const struct tgsi_full_immediate *imm)
-{
-   int c;
-   unsigned n = ti->immd32_nr++;
-
-   assert(ti->immd32_nr <= ti->scan.immediate_count);
-
-   for (c = 0; c < 4; ++c)
-      ti->immd32[n * 4 + c] = imm->u[c].Uint;
-
-   ti->immd32_ty[n] = imm->Immediate.DataType;
-}
-
-static INLINE unsigned
-translate_interpolate(const struct tgsi_full_declaration *decl)
-{
-   unsigned mode;
-
-   if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_CONSTANT)
-      mode = NV50_INTERP_FLAT;
-   else
-   if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
-      mode = 0;
-   else
-      mode = NV50_INTERP_LINEAR;
-
-   if (decl->Declaration.Centroid)
-      mode |= NV50_INTERP_CENTROID;
-
-   return mode;
-}
-
-static void
-prog_decl(struct nv50_translation_info *ti,
-          const struct tgsi_full_declaration *decl)
-{
-   unsigned i, first, last, sn = 0, si = 0;
-
-   first = decl->Range.First;
-   last = decl->Range.Last;
-
-   if (decl->Declaration.Semantic) {
-      sn = decl->Semantic.Name;
-      si = decl->Semantic.Index;
-   }
-
-   switch (decl->Declaration.File) {
-   case TGSI_FILE_INPUT:
-      for (i = first; i <= last; ++i)
-         ti->interp_mode[i] = translate_interpolate(decl);
-
-      if (!decl->Declaration.Semantic)
-         break;
-
-      for (i = first; i <= last; ++i) {
-         ti->p->in[i].sn = sn;
-         ti->p->in[i].si = si;
-      }
-
-      switch (sn) {
-      case TGSI_SEMANTIC_FACE:
-         break;
-      case TGSI_SEMANTIC_COLOR:
-         if (ti->p->type == PIPE_SHADER_FRAGMENT)
-            ti->p->vp.bfc[si] = first;
-         break;
-      }
-      break;
-   case TGSI_FILE_OUTPUT:
-      if (!decl->Declaration.Semantic)
+      default:
           break;
-
-      for (i = first; i <= last; ++i) {
-         ti->p->out[i].sn = sn;
-         ti->p->out[i].si = si;
        }
+   }
  
-      switch (sn) {
-      case TGSI_SEMANTIC_BCOLOR:
-         ti->p->vp.bfc[si] = first;
-         break;
+   /*
+    * Corner case: VP has no inputs, but we will still need to submit data to
+    * draw it. HW will shout at us and won't draw anything if we don't enable
+    * any input, so let's just pretend it's the first one.
+    */
+   if (prog->vp.attrs[0] == 0 &&
+       prog->vp.attrs[1] == 0 &&
+       prog->vp.attrs[2] == 0)
+      prog->vp.attrs[0] |= 0xf;
+
+   /* VertexID before InstanceID */
+   if (info->io.vertexId < info->numSysVals)
+      info->sv[info->io.vertexId].slot[0] = n++;
+   if (info->io.instanceId < info->numSysVals)
+      info->sv[info->io.instanceId].slot[0] = n++;
+
+   n = 0;
+   for (i = 0; i < info->numOutputs; ++i) {
+      switch (info->out[i].sn) {
        case TGSI_SEMANTIC_PSIZE:
-         ti->p->vp.psiz = first;
-         break;
-      case TGSI_SEMANTIC_EDGEFLAG:
-         ti->edgeflag_out = first;
-         break;
-      default:
-         break;
-      }
-      break;
-   case TGSI_FILE_SYSTEM_VALUE:
-      switch (decl->Semantic.Name) {
-      case TGSI_SEMANTIC_FACE:
-         break;
-      case TGSI_SEMANTIC_INSTANCEID:
+         prog->vp.psiz = i;
           break;
-      case TGSI_SEMANTIC_PRIMID:
+      case TGSI_SEMANTIC_CLIPDIST:
+         prog->vp.clpd[info->out[i].si] = n;
           break;
-         /*
-      case TGSI_SEMANTIC_PRIMIDIN:
+      case TGSI_SEMANTIC_EDGEFLAG:
+         prog->vp.edgeflag = i;
           break;
-      case TGSI_SEMANTIC_VERTEXID:
+      case TGSI_SEMANTIC_BCOLOR:
+         prog->vp.bfc[info->out[i].si] = i;
           break;
-         */
        default:
           break;
        }
-      break;
-   case TGSI_FILE_CONSTANT:
-      ti->p->parm_size = MAX2(ti->p->parm_size, (last + 1) * 16);
-      break;
-   case TGSI_FILE_ADDRESS:
-   case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_TEMPORARY:
-      break;
-   default:
-      assert(0);
-      break;
-   }
-}
-
-static int
-nv50_vertprog_prepare(struct nv50_translation_info *ti)
-{
-   struct nv50_program *p = ti->p;
-   int i, c;
-   unsigned num_inputs = 0;
-
-   ti->input_file = NV_FILE_MEM_S;
-   ti->output_file = NV_FILE_OUT;
-
-   for (i = 0; i <= ti->scan.file_max[TGSI_FILE_INPUT]; ++i) {
-      p->in[i].id = i;
-      p->in[i].hw = num_inputs;
-
-      for (c = 0; c < 4; ++c) {
-         if (!ti->input_access[i][c])
-            continue;
-         ti->input_map[i][c] = num_inputs++;
-         p->vp.attrs[(4 * i + c) / 32] |= 1 << ((i * 4 + c) % 32);
-      }
-   }
-
-   for (i = 0; i <= ti->scan.file_max[TGSI_FILE_OUTPUT]; ++i) {
-      p->out[i].id = i;
-      p->out[i].hw = p->max_out;
+      prog->out[i].id = i;
+      prog->out[i].sn = info->out[i].sn;
+      prog->out[i].si = info->out[i].si;
+      prog->out[i].hw = n;
+      prog->out[i].mask = info->out[i].mask;
  
-      for (c = 0; c < 4; ++c) {
-         if (!ti->output_access[i][c])
-            continue;
-         ti->output_map[i][c] = p->max_out++;
-         p->out[i].mask |= 1 << c;
-      }
+      for (c = 0; c < 4; ++c)
+         if (info->out[i].mask & (1 << c))
+            info->out[i].slot[c] = n++;
     }
+   prog->out_nr = info->numOutputs;
+   prog->max_out = n;
  
-   if (p->vp.psiz < 0x40)
-      p->vp.psiz = p->out[p->vp.psiz].hw;
+   if (prog->vp.psiz < info->numOutputs)
+      prog->vp.psiz = prog->out[prog->vp.psiz].hw;
  
     return 0;
  }
  
  static int
-nv50_fragprog_prepare(struct nv50_translation_info *ti)
+nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
  {
-   struct nv50_program *p = ti->p;
-   int i, j, c;
-   unsigned nvary, nintp, depr;
-   unsigned n = 0, m = 0, skip = 0;
-   ubyte sn[16], si[16];
-
-   /* FP flags */
-
-   if (ti->scan.writes_z) {
-      p->fp.flags[1] = 0x11;
-      p->fp.flags[0] |= NV50TCL_FP_CONTROL_EXPORTS_Z;
-   }
-
-   if (ti->scan.uses_kill)
-      p->fp.flags[0] |= NV50TCL_FP_CONTROL_USES_KIL;
-
-   /* FP inputs */
-
-   ti->input_file = NV_FILE_MEM_V;
-   ti->output_file = NV_FILE_GPR;
-
-   /* count non-flat inputs, save semantic info */
-   for (i = 0; i < p->in_nr; ++i) {
-      m += (ti->interp_mode[i] & NV50_INTERP_FLAT) ? 0 : 1;
-      sn[i] = p->in[i].sn;
-      si[i] = p->in[i].si;
+   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
+   unsigned i, n, m, c;
+   unsigned nvary;
+   unsigned nflat;
+   unsigned nintp = 0;
+
+   /* count recorded non-flat inputs */
+   for (m = 0, i = 0; i < info->numInputs; ++i) {
+      switch (info->in[i].sn) {
+      case TGSI_SEMANTIC_POSITION:
+      case TGSI_SEMANTIC_FACE:
+         continue;
+      default:
+         m += info->in[i].flat ? 0 : 1;
+         break;
+      }
     }
+   /* careful: id may be != i in info->in[prog->in[i].id] */
  
-   /* reorder p->in[] so that non-flat inputs are first and
-    * kick out special inputs that don't use VP/GP_RESULT_MAP
+   /* Fill prog->in[] so that non-flat inputs are first and
+    * kick out special inputs that don't use the RESULT_MAP.
      */
-   nintp = 0;
-   for (i = 0; i < p->in_nr; ++i) {
-      if (sn[i] == TGSI_SEMANTIC_POSITION) {
-         for (c = 0; c < 4; ++c) {
-            ti->input_map[i][c] = nintp;
-            if (ti->input_access[i][c]) {
-               p->fp.interp |= 1 << (24 + c);
-               ++nintp;
-            }
-         }
-         skip++;
-         continue;
+   for (n = 0, i = 0; i < info->numInputs; ++i) {
+      if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
+         prog->fp.interp |= info->in[i].mask << 24;
+         for (c = 0; c < 4; ++c)
+            if (info->in[i].mask & (1 << c))
+               info->in[i].slot[c] = nintp++;
        } else
-      if (sn[i] == TGSI_SEMANTIC_FACE) {
-         ti->input_map[i][0] = 255;
-         skip++;
-         continue;
-      }
+      if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
+         info->in[i].slot[0] = 255;
+      } else {
+         unsigned j = info->in[i].flat ? m++ : n++;
  
-      j = (ti->interp_mode[i] & NV50_INTERP_FLAT) ? m++ : n++;
+         if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
+            prog->vp.bfc[info->in[i].si] = j;
  
-      if (sn[i] == TGSI_SEMANTIC_COLOR)
-         p->vp.bfc[si[i]] = j;
-          
-      p->in[j].linear = (ti->interp_mode[i] & NV50_INTERP_LINEAR) ? 1 : 0;
-      p->in[j].id = i;
-      p->in[j].sn = sn[i];
-      p->in[j].si = si[i];
-   }
-   assert(n <= m);
-   p->in_nr -= skip;
+         prog->in[j].id = i;
+         prog->in[j].mask = info->in[i].mask;
+         prog->in[j].sn = info->in[i].sn;
+         prog->in[j].si = info->in[i].si;
+         prog->in[j].linear = info->in[i].linear;
  
-   if (!(p->fp.interp & (8 << 24))) {
-      p->fp.interp |= (8 << 24);
+         prog->in_nr++;
+      }
+   }
+   if (!(prog->fp.interp & (8 << 24))) {
        ++nintp;
+      prog->fp.interp |= 8 << 24;
     }
  
-   p->fp.colors = (1 << 24) | 4; /* CLAMP, FFC0_ID = 4 */
+   for (i = 0; i < prog->in_nr; ++i) {
+      int j = prog->in[i].id;
  
-   for (i = 0; i < p->in_nr; ++i) {
-      int j = p->in[i].id;
-      p->in[i].hw = nintp;
-
-      for (c = 0; c < 4; ++c) {
-         if (!ti->input_access[j][c])
-            continue;
-         p->in[i].mask |= 1 << c;
-         ti->input_map[j][c] = nintp++;
-      }
-      /* count color inputs */
-      if (i == p->vp.bfc[0] || i == p->vp.bfc[1])
-         p->fp.colors += bitcount4(p->in[i].mask) << 16;
+      prog->in[i].hw = nintp;
+      for (c = 0; c < 4; ++c)
+         if (prog->in[i].mask & (1 << c))
+            info->in[j].slot[c] = nintp++;
     }
-   nintp -= bitcount4(p->fp.interp >> 24); /* subtract position inputs */
-   nvary = nintp;
-   if (n < m)
-      nvary -= p->in[n].hw;
+   /* (n == m) if m never increased, i.e. no flat inputs */
+   nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
+   nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
+   nvary = nintp - nflat;
  
-   p->fp.interp |= nvary << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_SHIFT;
-   p->fp.interp |= nintp << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_SHIFT;
+   prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
+   prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
+
+   /* put front/back colors right after HPOS */
+   prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
+   for (i = 0; i < 2; ++i)
+      if (prog->vp.bfc[i] < 0xff)
+         prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
  
     /* FP outputs */
  
-   if (p->out_nr > (1 + (ti->scan.writes_z ? 1 : 0)))
-      p->fp.flags[0] |= NV50TCL_FP_CONTROL_MULTIPLE_RESULTS;
+   if (info->prop.fp.numColourResults > 1)
+      prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
+
+   for (i = 0; i < info->numOutputs; ++i) {
+      prog->out[i].id = i;
+      prog->out[i].sn = info->out[i].sn;
+      prog->out[i].si = info->out[i].si;
+      prog->out[i].mask = info->out[i].mask;
  
-   depr = p->out_nr;
-   for (i = 0; i < p->out_nr; ++i) {
-      p->out[i].id = i;
-      if (p->out[i].sn == TGSI_SEMANTIC_POSITION) {
-         depr = i;
+      if (i == info->io.fragDepth || i == info->io.sampleMask)
           continue;
-      }
-      p->out[i].hw = p->max_out;
-      p->out[i].mask = 0xf;
+      prog->out[i].hw = info->out[i].si * 4;
  
        for (c = 0; c < 4; ++c)
-         ti->output_map[i][c] = p->max_out++;
-   }
-   if (depr < p->out_nr) {
-      p->out[depr].mask = 0x4;
-      p->out[depr].hw = ti->output_map[depr][2] = p->max_out++;
+         info->out[i].slot[c] = prog->out[i].hw + c;
+
+      prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
     }
  
+   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
+      info->out[info->io.sampleMask].slot[0] = prog->max_out++;
+
+   if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
+      info->out[info->io.fragDepth].slot[2] = prog->max_out++;
+
+   if (!prog->max_out)
+      prog->max_out = 4;
+
     return 0;
  }
  
  static int
-nv50_geomprog_prepare(struct nv50_translation_info *ti)
+nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
  {
-   ti->input_file = NV_FILE_MEM_S;
-   ti->output_file = NV_FILE_OUT;
-
-   assert(0);
-   return 1;
+   switch (info->type) {
+   case PIPE_SHADER_VERTEX:
+      return nv50_vertprog_assign_slots(info);
+   case PIPE_SHADER_GEOMETRY:
+      return nv50_vertprog_assign_slots(info);
+   case PIPE_SHADER_FRAGMENT:
+      return nv50_fragprog_assign_slots(info);
+   default:
+      return -1;
+   }
  }
  
-static int
-nv50_prog_scan(struct nv50_translation_info *ti)
+static struct nv50_stream_output_state *
+nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
+                                  const struct pipe_stream_output_info *pso)
  {
-   struct nv50_program *p = ti->p;
-   struct tgsi_parse_context parse;
-   int ret, i;
+   struct nv50_stream_output_state *so;
+   unsigned b, i, c;
+   unsigned base[4];
+
+   so = MALLOC_STRUCT(nv50_stream_output_state);
+   if (!so)
+      return NULL;
+   memset(so->map, 0xff, sizeof(so->map));
+
+   for (b = 0; b < 4; ++b)
+      so->num_attribs[b] = 0;
+   for (i = 0; i < pso->num_outputs; ++i) {
+      unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
+      b = pso->output[i].output_buffer;
+      assert(b < 4);
+      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
+   }
  
-   p->vp.edgeflag = 0x40;
-   p->vp.psiz = 0x40;
-   p->vp.bfc[0] = 0x40;
-   p->vp.bfc[1] = 0x40;
-   p->gp.primid = 0x80;
+   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
  
-   tgsi_scan_shader(p->pipe.tokens, &ti->scan);
+   so->stride[0] = pso->stride[0] * 4;
+   base[0] = 0;
+   for (b = 1; b < 4; ++b) {
+      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
+      so->stride[b] = so->num_attribs[b] * 4;
+      if (so->num_attribs[b])
+         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
+      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
+   }
+   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
+      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
+      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
+   }
  
-#ifdef NV50_PROGRAM_DEBUG
-   tgsi_dump(p->pipe.tokens, 0);
-#endif
+   so->map_size = base[3] + so->num_attribs[3];
  
-   ti->subr =
-      CALLOC(ti->scan.opcode_count[TGSI_OPCODE_BGNSUB], sizeof(ti->subr[0]));
+   for (i = 0; i < pso->num_outputs; ++i) {
+      const unsigned s = pso->output[i].start_component;
+      const unsigned p = pso->output[i].dst_offset;
+      const unsigned r = pso->output[i].register_index;
+      b = pso->output[i].output_buffer;
  
-   ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16);
-   ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte));
+      for (c = 0; c < pso->output[i].num_components; ++c)
+         so->map[base[b] + p + c] = info->out[r].slot[s + c];
+   }
  
-   ti->insns = MALLOC(ti->scan.num_instructions * sizeof(ti->insns[0]));
+   return so;
+}
  
-   tgsi_parse_init(&parse, p->pipe.tokens);
-   while (!tgsi_parse_end_of_tokens(&parse)) {
-      tgsi_parse_token(&parse);
+boolean
+nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
+{
+   struct nv50_ir_prog_info *info;
+   int ret;
+   const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
+
+   info = CALLOC_STRUCT(nv50_ir_prog_info);
+   if (!info)
+      return FALSE;
+
+   info->type = prog->type;
+   info->target = chipset;
+   info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
+   info->bin.source = (void *)prog->pipe.tokens;
+
+   info->io.ucpBinding = 15;
+   info->io.ucpBase = 0;
+   info->io.genUserClip = prog->vp.clpd_nr;
+
+   info->assignSlots = nv50_program_assign_varying_slots;
+
+   prog->vp.bfc[0] = 0xff;
+   prog->vp.bfc[1] = 0xff;
+   prog->vp.edgeflag = 0xff;
+   prog->vp.clpd[0] = map_undef;
+   prog->vp.clpd[1] = map_undef;
+   prog->vp.psiz = map_undef;
+   prog->gp.primid = 0x80;
+
+   info->driverPriv = prog;
+
+#ifdef DEBUG
+   info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
+   info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
+#else
+   info->optLevel = 3;
+#endif
  
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         prog_immediate(ti, &parse.FullToken.FullImmediate);
-         break;
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         prog_decl(ti, &parse.FullToken.FullDeclaration);
-         break;
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         ti->insns[ti->inst_nr] = parse.FullToken.FullInstruction;
-         prog_inst(ti, &parse.FullToken.FullInstruction, ++ti->inst_nr);
-         break;
-      }
+   ret = nv50_ir_generate_code(info);
+   if (ret) {
+      NOUVEAU_ERR("shader translation failed: %i\n", ret);
+      goto out;
     }
-
-   /* Scan to determine which registers are inputs/outputs of a subroutine. */
-   for (i = 0; i < ti->subr_nr; ++i) {
-      int pc = ti->subr[i].id;
-      while (ti->insns[pc].Instruction.Opcode != TGSI_OPCODE_ENDSUB)
-         prog_subroutine_inst(&ti->subr[i], &ti->insns[pc++]);
+   FREE(info->bin.syms);
+
+   prog->code = info->bin.code;
+   prog->code_size = info->bin.codeSize;
+   prog->fixups = info->bin.relocData;
+   prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
+   prog->tls_space = info->bin.tlsSpace;
+
+   if (prog->type == PIPE_SHADER_FRAGMENT) {
+      if (info->prop.fp.writesDepth) {
+         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
+         prog->fp.flags[1] = 0x11;
+      }
+      if (info->prop.fp.usesDiscard)
+         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
     }
  
-   p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1;
-   p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1;
+   if (prog->pipe.stream_output.num_outputs)
+      prog->so = nv50_program_create_strmout_state(info,
+                                                   &prog->pipe.stream_output);
  
-   switch (p->type) {
-   case PIPE_SHADER_VERTEX:
-      ret = nv50_vertprog_prepare(ti);
-      break;
-   case PIPE_SHADER_FRAGMENT:
-      ret = nv50_fragprog_prepare(ti);
-      break;
-   case PIPE_SHADER_GEOMETRY:
-      ret = nv50_geomprog_prepare(ti);
-      break;
-   default:
-      assert(!"unsupported program type");
-      ret = -1;
-      break;
-   }
-
-   assert(!ret);
-   return ret;
+out:
+   FREE(info);
+   return !ret;
  }
  
  boolean
-nv50_program_tx(struct nv50_program *p)
+nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
  {
-   struct nv50_translation_info *ti;
+   struct nouveau_heap *heap;
     int ret;
+   uint32_t size = align(prog->code_size, 0x40);
  
-   ti = CALLOC_STRUCT(nv50_translation_info);
-   ti->p = p;
-
-   ti->edgeflag_out = PIPE_MAX_SHADER_OUTPUTS;
-
-   ret = nv50_prog_scan(ti);
-   if (ret) {
-      NOUVEAU_ERR("unsupported shader program\n");
-      goto out;
+   switch (prog->type) {
+   case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
+   case PIPE_SHADER_GEOMETRY: heap = nv50->screen->fp_code_heap; break;
+   case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
+   default:
+      assert(!"invalid program type");
+      return FALSE;
     }
  
-   ret = nv50_generate_code(ti);
+   ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
     if (ret) {
-      NOUVEAU_ERR("error during shader translation\n");
-      goto out;
+      /* Out of space: evict everything to compactify the code segment, hoping
+       * the working set is much smaller and drifts slowly. Improve me !
+       */
+      while (heap->next) {
+         struct nv50_program *evict = heap->next->priv;
+         if (evict)
+            nouveau_heap_free(&evict->mem);
+      }
+      debug_printf("WARNING: out of code space, evicting all shaders.\n");
+      ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
+      if (ret) {
+         NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
+         return FALSE;
+      }
     }
+   prog->code_base = prog->mem->start;
  
-out:
-   if (ti->immd32)
-      FREE(ti->immd32);
-   if (ti->immd32_ty)
-      FREE(ti->immd32_ty);
-   if (ti->insns)
-      FREE(ti->insns);
-   if (ti->subr)
-      FREE(ti->subr);
-   FREE(ti);
-   return ret ? FALSE : TRUE;
+   ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
+   if (ret < 0)
+      return FALSE;
+   if (ret > 0)
+      nv50->state.new_tls_space = TRUE;
+
+   if (prog->fixups)
+      nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
+
+   nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
+                       (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
+                       NOUVEAU_BO_VRAM, prog->code_size, prog->code);
+
+   BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
+   PUSH_DATA (nv50->base.pushbuf, 0);
+
+   return TRUE;
  }
  
  void
  nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
  {
-   nouveau_bo_ref(NULL, &p->bo);
+   const struct pipe_shader_state pipe = p->pipe;
+   const ubyte type = p->type;
+
+   if (p->mem)
+      nouveau_heap_free(&p->mem);
+
+   FREE(p->code);
+
+   FREE(p->fixups);
  
-   so_ref(NULL, &p->so);
+   FREE(p->so);
  
-   if (p->code)
-      FREE(p->code);
+   memset(p, 0, sizeof(*p));
  
-   p->translated = FALSE;
+   p->pipe = pipe;
+   p->type = type;
  }