nv50: negate sources directly where supported
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
index 877ead39ce86ddbfd0acffbfd38a6c3e70b829ae..aada285f2c006785178fc6ce7e01908845693b11 100644 (file)
@@ -85,6 +85,9 @@ struct nv50_reg {
 
        int hw;
        int neg;
+
+       int rhw; /* result hw for FP outputs, or interpolant index */
+       int acc; /* instruction where this reg is last read (first insn == 1) */
 };
 
 struct nv50_pc {
@@ -108,12 +111,23 @@ struct nv50_pc {
 
        struct nv50_reg *temp_temp[16];
        unsigned temp_temp_nr;
+
+       unsigned interp_mode[32];
+       /* perspective interpolation registers */
+       struct nv50_reg *iv_p;
+       struct nv50_reg *iv_c;
+
+       /* current instruction and total number of insns */
+       unsigned insn_cur;
+       unsigned insn_nr;
+
+       boolean allow32;
 };
 
 static void
 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 {
-       int i;
+       int i = 0;
 
        if (reg->type == P_RESULT) {
                if (pc->p->cfg.high_result < (reg->hw + 1))
@@ -131,7 +145,22 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
                return;
        }
 
-       for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
+       if (reg->rhw != -1) {
+               /* try to allocate temporary with index rhw first */
+               if (!(pc->r_temp[reg->rhw])) {
+                       pc->r_temp[reg->rhw] = reg;
+                       reg->hw = reg->rhw;
+                       if (pc->p->cfg.high_temp < (reg->rhw + 1))
+                               pc->p->cfg.high_temp = reg->rhw + 1;
+                       return;
+               }
+               /* make sure we don't get things like $r0 needs to go
+                * in $r1 and $r1 in $r0
+                */
+               i = pc->result_nr * 4;
+       }
+
+       for (; i < NV50_SU_MAX_TEMP; i++) {
                if (!(pc->r_temp[i])) {
                        pc->r_temp[i] = reg;
                        reg->hw = i;
@@ -159,6 +188,7 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
                        r->type = P_TEMP;
                        r->index = -1;
                        r->hw = i;
+                       r->rhw = -1;
                        pc->r_temp[i] = r;
                        return r;
                }
@@ -168,6 +198,38 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
        return NULL;
 }
 
+/* Hand the hw slot of the anonymous temporary src (index == -1, hw assigned)
+ * over to dst, drop dst's previous r_temp entry if it had one, and free src.
+ */
+static void
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+       assert(src->index == -1 && src->hw != -1);
+
+       if (dst->hw != -1) /* dst already owns a slot: give it up first */
+               pc->r_temp[dst->hw] = NULL;
+       pc->r_temp[src->hw] = dst; /* src's slot is now owned by dst */
+       dst->hw = src->hw;
+
+       FREE(src); /* src must not be used by the caller afterwards */
+}
+
+/* Clear the r_temp slot held by P_TEMP register r; r is freed if index == -1. */
+static void
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)
+{
+       assert(r->type == P_TEMP);
+       if (r->hw == -1) /* no hw assigned: nothing to release */
+               return;
+
+       assert(pc->r_temp[r->hw] == r);
+       pc->r_temp[r->hw] = NULL;
+
+       r->acc = 0; /* reset the recorded last-read instruction */
+       if (r->index == -1) /* anonymous temporary: the struct itself goes away */
+               FREE(r);
+}
+
 static void
 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
 {
@@ -250,7 +312,13 @@ alloc_immd(struct nv50_pc *pc, float f)
        struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
        unsigned hw;
 
-       hw = ctor_immd(pc, f, 0, 0, 0) * 4;
+       for (hw = 0; hw < pc->immd_nr * 4; hw++)
+               if (pc->immd_buf[hw] == f)
+                       break;
+
+       if (hw == pc->immd_nr * 4)
+               hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
+
        r->type = P_IMMD;
        r->hw = hw;
        r->index = -1;
@@ -341,7 +409,8 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
 static INLINE void
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 {
-       unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
+       float f = pc->immd_buf[imm->hw];
+       unsigned val = fui(imm->neg ? -f : f);
 
        set_long(pc, e);
        /*XXX: can't be predicated - bits overlap.. catch cases where both
@@ -354,20 +423,35 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
        e->inst[1] |= (val >> 6) << 2;
 }
 
+
+#define INTERP_LINEAR          0 /* no mode bit set */
+#define INTERP_FLAT                    1 /* emit_interp sets bit 8 of inst[0]; the other two bits are then ignored */
+#define INTERP_PERSPECTIVE     2 /* emit_interp sets bit 25 of inst[0] and encodes iv->hw at bit 9 */
+#define INTERP_CENTROID                4 /* emit_interp sets bit 24 of inst[0] */
+
+/* interpolant index has been stored in dst->rhw */
 static void
-emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
-           struct nv50_reg *src, struct nv50_reg *iv)
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
+               unsigned mode)
 {
+       assert(dst->rhw != -1);
        struct nv50_program_exec *e = exec(pc);
 
        e->inst[0] |= 0x80000000;
        set_dst(pc, dst, e);
-       alloc_reg(pc, src);
-       e->inst[0] |= (src->hw << 16);
-       if (iv) {
-               e->inst[0] |= (1 << 25);
-               alloc_reg(pc, iv);
-               e->inst[0] |= (iv->hw << 9);
+       e->inst[0] |= (dst->rhw << 16);
+
+       if (mode & INTERP_FLAT) {
+               e->inst[0] |= (1 << 8);
+       } else {
+               if (mode & INTERP_PERSPECTIVE) {
+                       e->inst[0] |= (1 << 25);
+                       alloc_reg(pc, iv);
+                       e->inst[0] |= (iv->hw << 9);
+               }
+
+               if (mode & INTERP_CENTROID)
+                       e->inst[0] |= (1 << 24);
        }
 
        emit(pc, e);
@@ -378,22 +462,12 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
         struct nv50_program_exec *e)
 {
        set_long(pc, e);
-#if 1
-       e->inst[1] |= (1 << 22);
-#else
-       if (src->type == P_IMMD) {
-               e->inst[1] |= (NV50_CB_PMISC << 22);
-       } else {
-               if (pc->p->type == PIPE_SHADER_VERTEX)
-                       e->inst[1] |= (NV50_CB_PVP << 22);
-               else
-                       e->inst[1] |= (NV50_CB_PFP << 22);
-       }
-#endif
 
        e->param.index = src->hw;
        e->param.shift = s;
        e->param.mask = m << (s % 32);
+
+       e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
 }
 
 static void
@@ -405,12 +479,11 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 
        set_dst(pc, dst, e);
 
-       if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
+       if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
                set_immd(pc, src, e);
                /*XXX: 32-bit, but steals part of "half" reg space - need to
                 *     catch and handle this case if/when we do half-regs
                 */
-               e->inst[0] |= 0x00008000;
        } else
        if (src->type == P_IMMD || src->type == P_CONST) {
                set_long(pc, e);
@@ -426,14 +499,13 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
                e->inst[0] |= (src->hw << 9);
        }
 
-       /* We really should support "half" instructions here at some point,
-        * but I don't feel confident enough about them yet.
-        */
-       set_long(pc, e);
        if (is_long(e) && !is_immd(e)) {
                e->inst[1] |= 0x04000000; /* 32-bit */
-               e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
-       }
+               e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
+               if (!(e->inst[1] & 0x20000000))
+                       e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
+       } else
+               e->inst[0] |= 0x00008000;
 
        emit(pc, e);
 }
@@ -549,12 +621,26 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
        struct nv50_program_exec *e = exec(pc);
 
        e->inst[0] |= 0xc0000000;
-       set_long(pc, e);
+
+       if (!pc->allow32)
+               set_long(pc, e);
 
        check_swap_src_0_1(pc, &src0, &src1);
        set_dst(pc, dst, e);
        set_src_0(pc, src0, e);
-       set_src_1(pc, src1, e);
+       if (src1->type == P_IMMD && !is_long(e)) {
+               if (src0->neg)
+                       e->inst[0] |= 0x00008000;
+               set_immd(pc, src1, e);
+       } else {
+               set_src_1(pc, src1, e);
+               if (src0->neg ^ src1->neg) {
+                       if (is_long(e))
+                               e->inst[1] |= 0x08000000;
+                       else
+                               e->inst[0] |= 0x00008000;
+               }
+       }
 
        emit(pc, e);
 }
@@ -568,10 +654,19 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
        e->inst[0] |= 0xb0000000;
 
        check_swap_src_0_1(pc, &src0, &src1);
+
+       if (!pc->allow32 || src0->neg || src1->neg) {
+               set_long(pc, e);
+               e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
+       }
+
        set_dst(pc, dst, e);
        set_src_0(pc, src0, e);
-       if (is_long(e))
+       if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
                set_src_2(pc, src1, e);
+       else
+       if (src1->type == P_IMMD)
+               set_immd(pc, src1, e);
        else
                set_src_1(pc, src1, e);
 
@@ -596,25 +691,13 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
        emit(pc, e);
 }
 
-static void
+static INLINE void
 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
          struct nv50_reg *src1)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] |= 0xb0000000;
-
-       set_long(pc, e);
-       if (check_swap_src_0_1(pc, &src0, &src1))
-               e->inst[1] |= 0x04000000;
-       else
-               e->inst[1] |= 0x08000000;
-
-       set_dst(pc, dst, e);
-       set_src_0(pc, src0, e);
-       set_src_2(pc, src1, e);
-
-       emit(pc, e);
+       src1->neg ^= 1; /* src0 - src1 is emitted as an ADD with src1's negate flag flipped */
+       emit_add(pc, dst, src0, src1);
+       src1->neg ^= 1; /* restore the flag: src1 belongs to the caller */
 }
 
 static void
@@ -631,26 +714,21 @@ emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
        set_src_1(pc, src1, e);
        set_src_2(pc, src2, e);
 
+       if (src0->neg ^ src1->neg)
+               e->inst[1] |= 0x04000000;
+       if (src2->neg)
+               e->inst[1] |= 0x08000000;
+
        emit(pc, e);
 }
 
-static void
+static INLINE void
 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
          struct nv50_reg *src1, struct nv50_reg *src2)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] |= 0xe0000000;
-       set_long(pc, e);
-       e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
-
-       check_swap_src_0_1(pc, &src0, &src1);
-       set_dst(pc, dst, e);
-       set_src_0(pc, src0, e);
-       set_src_1(pc, src1, e);
-       set_src_2(pc, src2, e);
-
-       emit(pc, e);
+       src2->neg ^= 1; /* src0 * src1 - src2 is emitted as a MAD with src2's negate flag flipped */
+       emit_mad(pc, dst, src0, src1, src2);
+       src2->neg ^= 1; /* restore the flag: src2 belongs to the caller */
 }
 
 static void
@@ -701,6 +779,48 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
        emit(pc, e);
 }
 
+#define CVTOP_RN       0x01 /* CVTOP_*: "cop" argument of emit_cvt, placed at bit 16 of inst[1] */
+#define CVTOP_FLOOR    0x03
+#define CVTOP_CEIL     0x05
+#define CVTOP_TRUNC    0x07
+#define CVTOP_SAT      0x08
+#define CVTOP_ABS      0x10
+
+#define CVT_F32_F32 0xc4 /* CVT_*: "fmt" argument of emit_cvt, placed at bit 24 of inst[1] */
+#define CVT_F32_S32 0x44
+#define CVT_F32_U32 0x64
+#define CVT_S32_F32 0x8c
+#define CVT_S32_S32 0x0c
+#define CVT_F32_F32_ROP 0xcc
+
+static void /* emit one long CVT instruction: dst = cvt(src) using operation cop and format fmt */
+emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+        int wp, unsigned cop, unsigned fmt)
+{
+       struct nv50_program_exec *e;
+
+       e = exec(pc);
+       set_long(pc, e);
+
+       e->inst[0] |= 0xa0000000;
+       e->inst[1] |= 0x00004000;
+       e->inst[1] |= (cop << 16);
+       e->inst[1] |= (fmt << 24);
+       set_src_0(pc, src, e);
+
+       if (wp >= 0) /* a negative wp means: do not write a predicate */
+               set_pred_wr(pc, 1, wp, e);
+
+       if (dst)
+               set_dst(pc, dst, e);
+       else { /* dst == NULL: fixed bits are encoded instead of a register (presumably "discard result" — TODO confirm) */
+               e->inst[0] |= 0x000001fc;
+               e->inst[1] |= 0x00000008;
+       }
+
+       emit(pc, e);
+}
+
 static void
 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
         struct nv50_reg *src0, struct nv50_reg *src1)
@@ -744,22 +864,10 @@ emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
                free_temp(pc, dst);
 }
 
-static void
+static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] = 0xa0000000; /* cvt */
-       set_long(pc, e);
-       e->inst[1] |= (6 << 29); /* cvt */
-       e->inst[1] |= 0x08000000; /* integer mode */
-       e->inst[1] |= 0x04000000; /* 32 bit */
-       e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
-       e->inst[1] |= (1 << 14); /* src .f32 */
-       set_dst(pc, dst, e);
-       set_src_0(pc, src, e);
-
-       emit(pc, e);
+       emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP); /* wp = -1: no predicate write */
 }
 
 static void
@@ -776,21 +884,10 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
        free_temp(pc, temp);
 }
 
-static void
+static INLINE void
 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] = 0xa0000000; /* cvt */
-       set_long(pc, e);
-       e->inst[1] |= (6 << 29); /* cvt */
-       e->inst[1] |= 0x04000000; /* 32 bit */
-       e->inst[1] |= (1 << 14); /* src .f32 */
-       e->inst[1] |= ((1 << 6) << 14); /* .abs */
-       set_dst(pc, dst, e);
-       set_src_0(pc, src, e);
-
-       emit(pc, e);
+       emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32); /* wp = -1: no predicate write */
 }
 
 static void
@@ -802,18 +899,12 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
        struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
        struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
        struct nv50_reg *tmp[4];
+       boolean allow32 = pc->allow32;
 
-       if (mask & (1 << 0))
-               emit_mov(pc, dst[0], one);
-
-       if (mask & (1 << 3))
-               emit_mov(pc, dst[3], one);
+       pc->allow32 = FALSE;
 
        if (mask & (3 << 1)) {
-               if (mask & (1 << 1))
-                       tmp[0] = dst[1];
-               else
-                       tmp[0] = temp_temp(pc);
+               tmp[0] = alloc_temp(pc, NULL);
                emit_minmax(pc, 4, tmp[0], src[0], zero);
        }
 
@@ -832,6 +923,21 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
                set_pred(pc, 3, 0, pc->p->exec_tail);
        }
 
+       if (mask & (1 << 1))
+               assimilate_temp(pc, dst[1], tmp[0]);
+       else
+       if (mask & (1 << 2))
+               free_temp(pc, tmp[0]);
+
+       pc->allow32 = allow32;
+
+       /* do this last, in case src[i,j] == dst[0,3] */
+       if (mask & (1 << 0))
+               emit_mov(pc, dst[0], one);
+
+       if (mask & (1 << 3))
+               emit_mov(pc, dst[3], one);
+
        FREE(pos128);
        FREE(neg128);
        FREE(zero);
@@ -866,6 +972,8 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
        e->inst[1] = 0xc4014788;
        set_src_0(pc, src, e);
        set_pred_wr(pc, 1, r_pred, e);
+       if (src->neg)
+               e->inst[1] |= 0x20000000;
        emit(pc, e);
 
        /* This is probably KILP */
@@ -876,6 +984,180 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
        emit(pc, e);
 }
 
+static void /* fetch from texture `unit` into four temporaries, then copy the components selected by mask to dst */
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+        struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
+{
+       struct nv50_reg *temp, *t[4];
+       struct nv50_program_exec *e;
+
+       unsigned c, mode, dim; /* dim: number of coordinate components loaded into t[] */
+
+       switch (type) {
+       case TGSI_TEXTURE_1D:
+               dim = 1;
+               break;
+       case TGSI_TEXTURE_UNKNOWN:
+       case TGSI_TEXTURE_2D:
+       case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
+       case TGSI_TEXTURE_RECT:
+               dim = 2;
+               break;
+       case TGSI_TEXTURE_3D:
+       case TGSI_TEXTURE_CUBE:
+       case TGSI_TEXTURE_SHADOW2D:
+       case TGSI_TEXTURE_SHADOWRECT: /* XXX */
+               dim = 3;
+               break;
+       default:
+               assert(0);
+               return; /* NDEBUG builds: never continue with dim uninitialized; nothing is allocated or emitted yet */
+       }
+
+       alloc_temp4(pc, t, 0);
+
+       if (proj) {
+               if (src[0]->type == P_TEMP && src[0]->rhw != -1) { /* source is an interpolant: redo the interpolation */
+                       mode = pc->interp_mode[src[0]->index];
+
+                       t[3]->rhw = src[3]->rhw;
+                       emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); /* iv may be NULL: PERSPECTIVE is not set */
+                       emit_flop(pc, 0, t[3], t[3]);
+
+                       for (c = 0; c < dim; c++) {
+                               t[c]->rhw = src[c]->rhw;
+                               emit_interp(pc, t[c], t[3],
+                                           (mode | INTERP_PERSPECTIVE));
+                       }
+               } else { /* generic path: t[3] = flop0(src[3]), then scale each coordinate by it */
+                       emit_flop(pc, 0, t[3], src[3]);
+                       for (c = 0; c < dim; c++)
+                               emit_mul(pc, t[c], src[c], t[3]);
+
+                       /* XXX: for some reason the blob sometimes uses MAD:
+                        * emit_mad(pc, t[c], src[0][c], t[3], t[3])
+                        * pc->p->exec_tail->inst[1] |= 0x080fc000;
+                        */
+               }
+       } else {
+               if (type == TGSI_TEXTURE_CUBE) { /* scale all three coordinates by flop0 of the combined components */
+                       temp = temp_temp(pc);
+                       emit_minmax(pc, 4, temp, src[0], src[1]);
+                       emit_minmax(pc, 4, temp, temp, src[2]);
+                       emit_flop(pc, 0, temp, temp);
+                       for (c = 0; c < 3; c++)
+                               emit_mul(pc, t[c], src[c], temp);
+               } else {
+                       for (c = 0; c < dim; c++)
+                               emit_mov(pc, t[c], src[c]);
+               }
+       }
+
+       e = exec(pc);
+       set_long(pc, e);
+       e->inst[0] |= 0xf0000000;
+       e->inst[1] |= 0x00000004;
+       set_dst(pc, t[0], e);
+       e->inst[0] |= (unit << 9);
+
+       if (dim == 2)
+               e->inst[0] |= 0x00400000;
+       else
+       if (dim == 3)
+               e->inst[0] |= 0x00800000;
+
+       e->inst[0] |= (mask & 0x3) << 25; /* write mask is split: low two bits in inst[0] ... */
+       e->inst[1] |= (mask & 0xc) << 12; /* ... high two bits in inst[1] */
+
+       emit(pc, e);
+
+#if 1
+       if (mask & 1) emit_mov(pc, dst[0], t[0]);
+       if (mask & 2) emit_mov(pc, dst[1], t[1]);
+       if (mask & 4) emit_mov(pc, dst[2], t[2]);
+       if (mask & 8) emit_mov(pc, dst[3], t[3]);
+
+       free_temp4(pc, t);
+#else
+       /* XXX: if e.g. MUL is used directly after TEX, it would still use
+        * the texture coordinates, not the fetched values: latency ? */
+
+       for (c = 0; c < 4; c++) {
+               if (mask & (1 << c))
+                       assimilate_temp(pc, dst[c], t[c]);
+               else
+                       free_temp(pc, t[c]);
+       }
+#endif
+}
+
+static void /* turn the short (single-word) instruction e into its long form, moving bits from inst[0] to inst[1] */
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+       unsigned q = 0, m = ~0; /* q: bits OR'ed into inst[1]; m: mask AND'ed onto inst[0] */
+
+       assert(!is_long(e));
+
+       switch (e->inst[0] >> 28) { /* dispatch on the top nibble of inst[0] */
+       case 0x1:
+               /* MOV */
+               q = 0x0403c000;
+               m = 0xffff7fff;
+               break;
+       case 0x8:
+               /* INTERP */
+               m = ~0x02000000;
+               if (e->inst[0] & 0x02000000)
+                       q = 0x00020000;
+               break;
+       case 0x9:
+               /* RCP */
+               break;
+       case 0xB:
+               /* ADD */
+               m = ~(127 << 16);
+               q = ((e->inst[0] & (~m)) >> 2); /* 7-bit field at inst[0] bits 16-22 moves to inst[1] bits 14-20 */
+               break;
+       case 0xC:
+               /* MUL */
+               m = ~0x00008000;
+               q = ((e->inst[0] & (~m)) << 12); /* inst[0] bit 15 moves to inst[1] bit 27 */
+               break;
+       case 0xE:
+               /* MAD (if src2 == dst) */
+               q = ((e->inst[0] & 0x1fc) << 12); /* inst[0] bits 2-8 are copied (not cleared) to inst[1] bits 14-20 */
+               break;
+       default:
+               assert(0);
+               break;
+       }
+
+       set_long(pc, e);
+       pc->p->exec_size++; /* presumably accounts for the added second word — TODO confirm */
+
+       e->inst[0] &= m;
+       e->inst[1] |= q;
+}
+
+static boolean /* TRUE if source operand i of insn may carry a negate flag instead of needing an explicit negate */
+negate_supported(const struct tgsi_full_instruction *insn, int i)
+{
+       switch (insn->Instruction.Opcode) {
+       case TGSI_OPCODE_DP3:
+       case TGSI_OPCODE_DP4:
+       case TGSI_OPCODE_MUL:
+       case TGSI_OPCODE_KIL:
+       case TGSI_OPCODE_ADD:
+       case TGSI_OPCODE_SUB:
+       case TGSI_OPCODE_MAD:
+               return TRUE; /* every source operand of these opcodes */
+       case TGSI_OPCODE_POW:
+               return (i == 1) ? TRUE : FALSE; /* POW: only source operand 1 */
+       default:
+               return FALSE;
+       }
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -894,7 +1176,8 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 }
 
 static struct nv50_reg *
-tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
+tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
+        boolean neg)
 {
        struct nv50_reg *r = NULL;
        struct nv50_reg *temp;
@@ -949,14 +1232,21 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
                r = temp;
                break;
        case TGSI_UTIL_SIGN_TOGGLE:
-               temp = temp_temp(pc);
-               emit_neg(pc, temp, r);
-               r = temp;
+               if (neg)
+                       r->neg = 1;
+               else {
+                       temp = temp_temp(pc);
+                       emit_neg(pc, temp, r);
+                       r = temp;
+               }
                break;
        case TGSI_UTIL_SIGN_SET:
                temp = temp_temp(pc);
                emit_abs(pc, temp, r);
-               emit_neg(pc, temp, r);
+               if (neg)
+                       temp->neg = 1;
+               else
+                       emit_neg(pc, temp, temp);
                r = temp;
                break;
        default:
@@ -967,12 +1257,40 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
        return r;
 }
 
+/* Returns TRUE if the instruction can overwrite sources before they're read; always FALSE when Saturate is set. */
+static boolean
+direct2dest_op(const struct tgsi_full_instruction *insn)
+{
+       if (insn->Instruction.Saturate)
+               return FALSE;
+
+       switch (insn->Instruction.Opcode) {
+       case TGSI_OPCODE_COS:
+       case TGSI_OPCODE_DP3:
+       case TGSI_OPCODE_DP4:
+       case TGSI_OPCODE_DPH:
+       case TGSI_OPCODE_KIL:
+       case TGSI_OPCODE_LIT:
+       case TGSI_OPCODE_POW:
+       case TGSI_OPCODE_RCP:
+       case TGSI_OPCODE_RSQ:
+       case TGSI_OPCODE_SCS:
+       case TGSI_OPCODE_SIN:
+       case TGSI_OPCODE_TEX:
+       case TGSI_OPCODE_TXP:
+               return FALSE; /* opcodes listed here are reported as safe */
+       default:
+               return TRUE; /* everything else may clobber a source it still needs */
+       }
+}
+
 static boolean
 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
        const struct tgsi_full_instruction *inst = &tok->FullInstruction;
        struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
        unsigned mask, sat, unit;
+       boolean assimilate = FALSE;
        int i, c;
 
        mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
@@ -996,7 +1314,8 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                        unit = fs->SrcRegister.Index;
 
                for (c = 0; c < 4; c++)
-                       src[i][c] = tgsi_src(pc, c, fs);
+                       src[i][c] = tgsi_src(pc, c, fs,
+                                            negate_supported(inst, i));
        }
 
        if (sat) {
@@ -1004,6 +1323,25 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                        rdst[c] = dst[c];
                        dst[c] = temp_temp(pc);
                }
+       } else
+       if (direct2dest_op(inst)) {
+               for (c = 0; c < 4; c++) {
+                       if (!dst[c] || dst[c]->type != P_TEMP)
+                               continue;
+
+                       for (i = c + 1; i < 4; i++) {
+                               if (dst[c] == src[0][i] ||
+                                   dst[c] == src[1][i] ||
+                                   dst[c] == src[2][i])
+                                       break;
+                       }
+                       if (i == 4)
+                               continue;
+
+                       assimilate = TRUE;
+                       rdst[c] = dst[c];
+                       dst[c] = alloc_temp(pc, NULL);
+               }
        }
 
        switch (inst->Instruction.Opcode) {
@@ -1111,6 +1449,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                emit_kil(pc, src[0][1]);
                emit_kil(pc, src[0][2]);
                emit_kil(pc, src[0][3]);
+               pc->p->cfg.fp.regs[2] |= 0x00100000;
                break;
        case TGSI_OPCODE_LIT:
                emit_lit(pc, &dst[0], mask, &src[0][0]);
@@ -1178,14 +1517,14 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                }
                break;
        case TGSI_OPCODE_RCP:
-               for (c = 0; c < 4; c++) {
+               for (c = 3; c >= 0; c--) {
                        if (!(mask & (1 << c)))
                                continue;
                        emit_flop(pc, 0, dst[c], src[0][0]);
                }
                break;
        case TGSI_OPCODE_RSQ:
-               for (c = 0; c < 4; c++) {
+               for (c = 3; c >= 0; c--) {
                        if (!(mask & (1 << c)))
                                continue;
                        emit_flop(pc, 2, dst[c], src[0][0]);
@@ -1235,30 +1574,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                }
                break;
        case TGSI_OPCODE_TEX:
+               emit_tex(pc, dst, mask, src[0], unit,
+                        inst->InstructionExtTexture.Texture, FALSE);
+               break;
        case TGSI_OPCODE_TXP:
-       {
-               struct nv50_reg *t[4];
-               struct nv50_program_exec *e;
-
-               alloc_temp4(pc, t, 0);
-               emit_mov(pc, t[0], src[0][0]);
-               emit_mov(pc, t[1], src[0][1]);
-
-               e = exec(pc);
-               e->inst[0] = 0xf6400000;
-               e->inst[0] |= (unit << 9);
-               set_long(pc, e);
-               e->inst[1] |= 0x0000c004;
-               set_dst(pc, t[0], e);
-               emit(pc, e);
-
-               if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]);
-               if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]);
-               if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]);
-               if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]);
-
-               free_temp4(pc, t);
-       }
+               emit_tex(pc, dst, mask, src[0], unit,
+                        inst->InstructionExtTexture.Texture, TRUE);
                break;
        case TGSI_OPCODE_XPD:
                temp = temp_temp(pc);
@@ -1286,22 +1607,15 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 
        if (sat) {
                for (c = 0; c < 4; c++) {
-                       struct nv50_program_exec *e;
-
                        if (!(mask & (1 << c)))
                                continue;
-                       e = exec(pc);
-
-                       e->inst[0] = 0xa0000000; /* cvt */
-                       set_long(pc, e);
-                       e->inst[1] |= (6 << 29); /* cvt */
-                       e->inst[1] |= 0x04000000; /* 32 bit */
-                       e->inst[1] |= (1 << 14); /* src .f32 */
-                       e->inst[1] |= ((1 << 5) << 14); /* .sat */
-                       set_dst(pc, rdst[c], e);
-                       set_src_0(pc, dst[c], e);
-                       emit(pc, e);
+                       emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
+                                CVT_F32_F32);
                }
+       } else if (assimilate) {
+               for (c = 0; c < 4; c++)
+                       if (rdst[c])
+                               assimilate_temp(pc, rdst[c], dst[c]);
        }
 
        for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
@@ -1310,6 +1624,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                                continue;
                        if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
                                FREE(src[i][c]);
+                       else
+                       if (src[i][c]->acc == pc->insn_cur)
+                               release_hw(pc, src[i][c]);
                }
        }
 
@@ -1317,12 +1634,169 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
        return TRUE;
 }
 
+/* Replace *mask (bit c set = source component c is read) for opcodes whose
+ * read set is fixed by the opcode; other opcodes leave *mask untouched.
+ * Used in tx_prep so only the interpolants that are needed get loaded. */
+static void
+insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
+{
+       const struct tgsi_instruction_ext_texture *tex;
+
+       switch (insn->Instruction.Opcode) {
+       case TGSI_OPCODE_DP3:
+               *mask = 0x7; /* components 0-2 */
+               break;
+       case TGSI_OPCODE_DP4:
+       case TGSI_OPCODE_DPH:
+               *mask = 0xF; /* all four components */
+               break;
+       case TGSI_OPCODE_LIT:
+               *mask = 0xB; /* components 0, 1 and 3 */
+               break;
+       case TGSI_OPCODE_RCP:
+       case TGSI_OPCODE_RSQ:
+               *mask = 0x1; /* component 0 only */
+               break;
+       case TGSI_OPCODE_TEX:
+       case TGSI_OPCODE_TXP:
+               assert(insn->Instruction.Extended);
+               tex = &insn->InstructionExtTexture;
+
+               *mask = 0x7; /* default for every target other than 1D and 2D */
+               if (tex->Texture == TGSI_TEXTURE_1D)
+                       *mask = 0x1;
+               else
+               if (tex->Texture == TGSI_TEXTURE_2D)
+                       *mask = 0x3;
+
+               if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
+                       *mask |= 0x8; /* TXP additionally reads component 3 */
+               break;
+       default:
+               break;
+       }
+}
+
+static void /* record pc->insn_nr for every temp component written and every temp/input component read by tok */
+prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
+                 unsigned *r_usage[2])
+{
+       const struct tgsi_full_instruction *insn;
+       const struct tgsi_full_src_register *src;
+       const struct tgsi_dst_register *dst;
+
+       unsigned i, c, k, n, mask, *acc_p;
+
+       insn = &tok->FullInstruction;
+       dst = &insn->FullDstRegisters[0].DstRegister;
+       mask = dst->WriteMask;
+
+       if (!r_usage[0]) /* tables are allocated on first use; NOTE(review): CALLOC results are not checked */
+               r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
+       if (!r_usage[1])
+               r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+
+       if (dst->File == TGSI_FILE_TEMPORARY) {
+               for (c = 0; c < 4; c++) {
+                       if (!(mask & (1 << c)))
+                               continue;
+                       r_usage[0][dst->Index * 4 + c] = pc->insn_nr; /* a write also counts as an access */
+               }
+       }
+
+       for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+               src = &insn->FullSrcRegisters[i];
+
+               switch (src->SrcRegister.File) {
+               case TGSI_FILE_TEMPORARY:
+                       acc_p = r_usage[0]; /* r_usage[0]: temporaries */
+                       break;
+               case TGSI_FILE_INPUT:
+                       acc_p = r_usage[1]; /* r_usage[1]: inputs (attributes) */
+                       break;
+               default:
+                       continue; /* other register files are not tracked */
+               }
+
+               insn_adjust_mask(insn, &mask); /* switch from the write mask to the components the opcode reads */
+
+               for (c = 0; c < 4; c++) {
+                       if (!(mask & (1 << c)))
+                               continue;
+
+                       k = tgsi_util_get_full_src_register_extswizzle(src, c);
+                       switch (k) {
+                       case TGSI_EXTSWIZZLE_X:
+                       case TGSI_EXTSWIZZLE_Y:
+                       case TGSI_EXTSWIZZLE_Z:
+                       case TGSI_EXTSWIZZLE_W:
+                               n = src->SrcRegister.Index * 4 + k; /* the swizzled component is the one actually read */
+                               acc_p[n] = pc->insn_nr;
+                               break;
+                       default:
+                               break; /* any other swizzle value reads no register component */
+                       }
+               }
+       }
+}
+
+static unsigned /* emit interpolation for the components of attribute i; returns the mask of components loaded */
+load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
+              int *aid, int *p_oid)
+{
+       struct nv50_reg *iv;
+       int oid, c, n;
+       unsigned mask = 0;
+
+       iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+
+       for (c = 0, n = i * 4; c < 4; c++, n++) {
+               oid = (*p_oid)++; /* *p_oid advances for every component, loaded or not */
+               pc->attr[n].type = P_TEMP;
+               pc->attr[n].index = i;
+
+               if (pc->attr[n].acc == acc[n]) /* recorded access unchanged: skip (presumably an unused component — TODO confirm) */
+                       continue;
+               mask |= (1 << c);
+
+               pc->attr[n].acc = acc[n];
+               pc->attr[n].rhw = pc->attr[n].hw = -1; /* rhw == -1 so alloc_reg takes any free temporary */
+               alloc_reg(pc, &pc->attr[n]);
+
+               pc->attr[n].rhw = (*aid)++; /* emit_interp reads the interpolant index from rhw */
+               emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+
+               pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4)); /* four 8-bit map entries per word */
+               (*mid)++;
+               pc->p->cfg.fp.regs[1] += 0x00010001;
+       }
+
+       return mask;
+}
+
 static boolean
 nv50_program_tx_prep(struct nv50_pc *pc)
 {
        struct tgsi_parse_context p;
        boolean ret = FALSE;
        unsigned i, c;
+       unsigned fcol, bcol, fcrd, depr;
+
+       /* count (centroid) perspective interpolations */
+       unsigned centroid_loads = 0;
+       unsigned perspect_loads = 0;
+
+       /* track register access for temps and attrs */
+       unsigned *r_usage[2];
+       r_usage[0] = NULL;
+       r_usage[1] = NULL;
+
+       depr = fcol = bcol = fcrd = 0xffff;
+
+       if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+               pc->p->cfg.fp.regs[0] = 0x01000404;
+               pc->p->cfg.fp.regs[1] = 0x00000400;
+       }
 
        tgsi_parse_init(&p, pc->p->pipe.tokens);
        while (!tgsi_parse_end_of_tokens(&p)) {
@@ -1344,9 +1818,10 @@ nv50_program_tx_prep(struct nv50_pc *pc)
                case TGSI_TOKEN_TYPE_DECLARATION:
                {
                        const struct tgsi_full_declaration *d;
-                       unsigned last;
+                       unsigned last, first, mode;
 
                        d = &p.FullToken.FullDeclaration;
+                       first = d->DeclarationRange.First;
                        last = d->DeclarationRange.Last;
 
                        switch (d->Declaration.File) {
@@ -1357,10 +1832,69 @@ nv50_program_tx_prep(struct nv50_pc *pc)
                        case TGSI_FILE_OUTPUT:
                                if (pc->result_nr < (last + 1))
                                        pc->result_nr = last + 1;
+
+                               if (!d->Declaration.Semantic)
+                                       break;
+
+                               switch (d->Semantic.SemanticName) {
+                               case TGSI_SEMANTIC_POSITION:
+                                       depr = first;
+                                       pc->p->cfg.fp.regs[2] |= 0x00000100;
+                                       pc->p->cfg.fp.regs[3] |= 0x00000011;
+                                       break;
+                               default:
+                                       break;
+                               }
+
                                break;
                        case TGSI_FILE_INPUT:
+                       {
                                if (pc->attr_nr < (last + 1))
                                        pc->attr_nr = last + 1;
+
+                               if (pc->p->type != PIPE_SHADER_FRAGMENT)
+                                       break;
+
+                               switch (d->Declaration.Interpolate) {
+                               case TGSI_INTERPOLATE_CONSTANT:
+                                       mode = INTERP_FLAT;
+                                       break;
+                               case TGSI_INTERPOLATE_PERSPECTIVE:
+                                       mode = INTERP_PERSPECTIVE;
+                                       break;
+                               default:
+                                       mode = INTERP_LINEAR;
+                                       break;
+                               }
+
+                               if (d->Declaration.Semantic) {
+                                       switch (d->Semantic.SemanticName) {
+                                       case TGSI_SEMANTIC_POSITION:
+                                               fcrd = first;
+                                               break;
+                                       case TGSI_SEMANTIC_COLOR:
+                                               fcol = first;
+                                               mode = INTERP_PERSPECTIVE;
+                                               break;
+                                       case TGSI_SEMANTIC_BCOLOR:
+                                               bcol = first;
+                                               mode = INTERP_PERSPECTIVE;
+                                               break;
+                                       }
+                               }
+
+                               if (d->Declaration.Centroid) {
+                                       mode |= INTERP_CENTROID;
+                                       if (mode & INTERP_PERSPECTIVE)
+                                               centroid_loads++;
+                               } else
+                               if (mode & INTERP_PERSPECTIVE)
+                                       perspect_loads++;
+
+                               assert(last < 32);
+                               for (i = first; i <= last; i++)
+                                       pc->interp_mode[i] = mode;
+                       }
                                break;
                        case TGSI_FILE_CONSTANT:
                                if (pc->param_nr < (last + 1))
@@ -1376,6 +1910,8 @@ nv50_program_tx_prep(struct nv50_pc *pc)
                }
                        break;
                case TGSI_TOKEN_TYPE_INSTRUCTION:
+                       pc->insn_nr++;
+                       prep_inspect_insn(pc, tok, r_usage);
                        break;
                default:
                        break;
@@ -1391,56 +1927,95 @@ nv50_program_tx_prep(struct nv50_pc *pc)
                        for (c = 0; c < 4; c++) {
                                pc->temp[i*4+c].type = P_TEMP;
                                pc->temp[i*4+c].hw = -1;
+                               pc->temp[i*4+c].rhw = -1;
                                pc->temp[i*4+c].index = i;
+                               pc->temp[i*4+c].acc = r_usage[0][i*4+c];
                        }
                }
        }
 
        if (pc->attr_nr) {
-               struct nv50_reg *iv = NULL;
-               int aid = 0;
+               int oid = 4, mid = 4, aid = 0;
+               /* oid = VP output id
+                * aid = FP attribute/interpolant id
+                * mid = VP output mapping field ID
+                */
 
                pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
                if (!pc->attr)
                        goto out_err;
 
                if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-                       iv = alloc_temp(pc, NULL);
-                       emit_interp(pc, iv, iv, NULL);
-                       emit_flop(pc, 0, iv, iv);
-                       aid++;
-               }
+                       /* position should be loaded first */
+                       if (fcrd != 0xffff) {
+                               unsigned mask;
+                               mid = 0;
+                               mask = load_fp_attrib(pc, fcrd, r_usage[1],
+                                                     &mid, &aid, &oid);
+                               oid = 0;
+                               pc->p->cfg.fp.regs[1] |= (mask << 24);
+                               pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
+                       }
+                       pc->p->cfg.fp.map[0] += 0x03020100;
 
-               for (i = 0; i < pc->attr_nr; i++) {
-                       struct nv50_reg *a = &pc->attr[i*4];
+                       /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
 
-                       for (c = 0; c < 4; c++) {
-                               if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-                                       struct nv50_reg *at =
-                                               alloc_temp(pc, NULL);
-                                       pc->attr[i*4+c].type = at->type;
-                                       pc->attr[i*4+c].hw = at->hw;
-                                       pc->attr[i*4+c].index = at->index;
+                       if (perspect_loads) {
+                               pc->iv_p = alloc_temp(pc, NULL);
+
+                               if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
+                                       pc->p->cfg.fp.regs[1] |= 0x08000000;
+                                       pc->iv_p->rhw = aid++;
+                                       emit_interp(pc, pc->iv_p, NULL,
+                                                   INTERP_LINEAR);
+                                       emit_flop(pc, 0, pc->iv_p, pc->iv_p);
                                } else {
-                                       pc->p->cfg.vp.attr[aid/32] |=
-                                               (1 << (aid % 32));
-                                       pc->attr[i*4+c].type = P_ATTR;
-                                       pc->attr[i*4+c].hw = aid++;
-                                       pc->attr[i*4+c].index = i;
+                                       pc->iv_p->rhw = aid - 1;
+                                       emit_flop(pc, 0, pc->iv_p,
+                                                 &pc->attr[fcrd * 4 + 3]);
                                }
                        }
 
-                       if (pc->p->type != PIPE_SHADER_FRAGMENT)
-                               continue;
+                       if (centroid_loads) {
+                               pc->iv_c = alloc_temp(pc, NULL);
+                               pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
+                               emit_interp(pc, pc->iv_c, NULL,
+                                           INTERP_CENTROID);
+                               emit_flop(pc, 0, pc->iv_c, pc->iv_c);
+                               pc->p->cfg.fp.regs[1] |= 0x08000000;
+                       }
 
-                       emit_interp(pc, &a[0], &a[0], iv);
-                       emit_interp(pc, &a[1], &a[1], iv);
-                       emit_interp(pc, &a[2], &a[2], iv);
-                       emit_interp(pc, &a[3], &a[3], iv);
-               }
+                       for (c = 0; c < 4; c++) {
+                               /* I don't know what these values do, but
+                                * let's set them like the blob does:
+                                */
+                               if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
+                                       pc->p->cfg.fp.regs[0] += 0x00010000;
+                               if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
+                                       pc->p->cfg.fp.regs[0] += 0x00010000;
+                       }
+
+                       for (i = 0; i < pc->attr_nr; i++)
+                               load_fp_attrib(pc, i, r_usage[1],
+                                              &mid, &aid, &oid);
+
+                       if (pc->iv_p)
+                               free_temp(pc, pc->iv_p);
+                       if (pc->iv_c)
+                               free_temp(pc, pc->iv_c);
 
-               if (iv)
-                       free_temp(pc, iv);
+                       pc->p->cfg.fp.high_map = (mid / 4);
+                       pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
+               } else {
+                       /* vertex program */
+                       for (i = 0; i < pc->attr_nr * 4; i++) {
+                               pc->p->cfg.vp.attr[aid / 32] |=
+                                       (1 << (aid % 32));
+                               pc->attr[i].type = P_ATTR;
+                               pc->attr[i].hw = aid++;
+                               pc->attr[i].index = i / 4;
+                       }
+               }
        }
 
        if (pc->result_nr) {
@@ -1455,12 +2030,20 @@ nv50_program_tx_prep(struct nv50_pc *pc)
                                if (pc->p->type == PIPE_SHADER_FRAGMENT) {
                                        pc->result[i*4+c].type = P_TEMP;
                                        pc->result[i*4+c].hw = -1;
+                                       pc->result[i*4+c].rhw = (i == depr) ?
+                                               -1 : rid++;
                                } else {
                                        pc->result[i*4+c].type = P_RESULT;
                                        pc->result[i*4+c].hw = rid++;
                                }
                                pc->result[i*4+c].index = i;
                        }
+
+                       if (pc->p->type == PIPE_SHADER_FRAGMENT &&
+                           depr != 0xffff) {
+                               pc->result[depr * 4 + 2].rhw =
+                                       (pc->result_nr - 1) * 4;
+                       }
                }
        }
 
@@ -1481,7 +2064,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)
        }
 
        if (pc->immd_nr) {
-               int rid = pc->param_nr * 4;
+               int rid = 0;
 
                pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
                if (!pc->immd)
@@ -1498,6 +2081,11 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 
        ret = TRUE;
 out_err:
+       if (r_usage[0])
+               FREE(r_usage[0]);
+       if (r_usage[1])
+               FREE(r_usage[1]);
+
        tgsi_parse_free(&p);
        return ret;
 }
@@ -1505,8 +2093,6 @@ out_err:
 static void
 free_nv50_pc(struct nv50_pc *pc)
 {
-       unsigned i;
-
        if (pc->immd)
                FREE(pc->immd);
        if (pc->param)
@@ -1518,12 +2104,6 @@ free_nv50_pc(struct nv50_pc *pc)
        if (pc->temp)
                FREE(pc->temp);
 
-       for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
-               /* deallocate fragment program attributes */
-               if (pc->r_temp[i] && pc->r_temp[i]->index == -1)
-                       FREE(pc->r_temp[i]);
-       }
-
        FREE(pc);
 }
 
@@ -1532,6 +2112,7 @@ nv50_program_tx(struct nv50_program *p)
 {
        struct tgsi_parse_context parse;
        struct nv50_pc *pc;
+       unsigned k;
        boolean ret;
 
        pc = CALLOC_STRUCT(nv50_pc);
@@ -1548,10 +2129,16 @@ nv50_program_tx(struct nv50_program *p)
        while (!tgsi_parse_end_of_tokens(&parse)) {
                const union tgsi_full_token *tok = &parse.FullToken;
 
+               /* don't allow half insn/immd on first and last instruction */
+               pc->allow32 = TRUE;
+               if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
+                       pc->allow32 = FALSE;
+
                tgsi_parse_token(&parse);
 
                switch (tok->Token.Type) {
                case TGSI_TOKEN_TYPE_INSTRUCTION:
+                       ++pc->insn_cur;
                        ret = nv50_program_tx_insn(pc, tok);
                        if (ret == FALSE)
                                goto out_err;
@@ -1565,8 +2152,40 @@ nv50_program_tx(struct nv50_program *p)
                struct nv50_reg out;
 
                out.type = P_TEMP;
-               for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
-                       emit_mov(pc, &out, &pc->result[out.hw]);
+               for (k = 0; k < pc->result_nr * 4; k++) {
+                       if (pc->result[k].rhw == -1)
+                               continue;
+                       if (pc->result[k].hw != pc->result[k].rhw) {
+                               out.hw = pc->result[k].rhw;
+                               emit_mov(pc, &out, &pc->result[k]);
+                       }
+                       if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
+                               pc->p->cfg.high_result = pc->result[k].rhw + 1;
+               }
+       }
+
+       /* look for single half instructions and make them long */
+       struct nv50_program_exec *e, *e_prev;
+
+       for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
+               if (!is_long(e))
+                       k++;
+
+               if (!e->next || is_long(e->next)) {
+                       if (k & 1)
+                               convert_to_long(pc, e);
+                       k = 0;
+               }
+
+               if (e->next)
+                       e_prev = e;
+       }
+
+       if (!is_long(pc->p->exec_tail)) {
+               /* this may occur if moving FP results */
+               assert(e_prev && !is_long(e_prev));
+               convert_to_long(pc, e_prev);
+               convert_to_long(pc, pc->p->exec_tail);
        }
 
        assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
@@ -1594,7 +2213,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
 
 static void
 nv50_program_upload_data(struct nv50_context *nv50, float *map,
-                        unsigned start, unsigned count)
+                       unsigned start, unsigned count, unsigned cbuf)
 {
        struct nouveau_channel *chan = nv50->screen->nvws->channel;
        struct nouveau_grobj *tesla = nv50->screen->tesla;
@@ -1603,7 +2222,7 @@ nv50_program_upload_data(struct nv50_context *nv50, float *map,
                unsigned nr = count > 2047 ? 2047 : count;
 
                BEGIN_RING(chan, tesla, 0x00000f00, 1);
-               OUT_RING  (chan, (NV50_CB_PMISC << 0) | (start << 8));
+               OUT_RING  (chan, (cbuf << 0) | (start << 8));
                BEGIN_RING(chan, tesla, 0x40000f04, nr);
                OUT_RINGp (chan, map, nr);
 
@@ -1618,35 +2237,50 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 {
        struct nouveau_winsys *nvws = nv50->screen->nvws;
        struct pipe_winsys *ws = nv50->pipe.winsys;
-       unsigned nr = p->param_nr + p->immd_nr;
 
-       if (!p->data && nr) {
-               struct nouveau_resource *heap = nv50->screen->vp_data_heap;
+       if (!p->data[0] && p->immd_nr) {
+               struct nouveau_resource *heap = nv50->screen->immd_heap[0];
 
-               if (nvws->res_alloc(heap, nr, p, &p->data)) {
-                       while (heap->next && heap->size < nr) {
+               if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) {
+                       while (heap->next && heap->size < p->immd_nr) {
                                struct nv50_program *evict = heap->next->priv;
-                               nvws->res_free(&evict->data);
+                               nvws->res_free(&evict->data[0]);
                        }
 
-                       if (nvws->res_alloc(heap, nr, p, &p->data))
+                       if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0]))
+                               assert(0);
+               }
+
+               /* immediates only need to be uploaded again when freed */
+               nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
+                                        p->immd_nr, NV50_CB_PMISC);
+       }
+
+       if (!p->data[1] && p->param_nr) {
+               struct nouveau_resource *heap =
+                       nv50->screen->parm_heap[p->type];
+
+               if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) {
+                       while (heap->next && heap->size < p->param_nr) {
+                               struct nv50_program *evict = heap->next->priv;
+                               nvws->res_free(&evict->data[1]);
+                       }
+
+                       if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1]))
                                assert(0);
                }
        }
 
        if (p->param_nr) {
+               unsigned cbuf = NV50_CB_PVP;
                float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
                                            PIPE_BUFFER_USAGE_CPU_READ);
-               nv50_program_upload_data(nv50, map, p->data->start,
-                                        p->param_nr);
+               if (p->type == PIPE_SHADER_FRAGMENT)
+                       cbuf = NV50_CB_PFP;
+               nv50_program_upload_data(nv50, map, p->data[1]->start,
+                                        p->param_nr, cbuf);
                ws->buffer_unmap(ws, nv50->constbuf[p->type]);
        }
-
-       if (p->immd_nr) {
-               nv50_program_upload_data(nv50, p->immd,
-                                        p->data->start + p->param_nr,
-                                        p->immd_nr);
-       }
 }
 
 static void
@@ -1666,20 +2300,27 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
                upload = TRUE;
        }
 
-       if (p->data && p->data->start != p->data_start) {
+       if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
+               (p->data[1] && p->data[1]->start != p->data_start[1])) {
                for (e = p->exec_head; e; e = e->next) {
-                       unsigned ei, ci;
+                       unsigned ei, ci, bs;
 
                        if (e->param.index < 0)
                                continue;
+                       bs = (e->inst[1] >> 22) & 0x07;
+                       assert(bs < 2);
                        ei = e->param.shift >> 5;
-                       ci = e->param.index + p->data->start;
+                       ci = e->param.index + p->data[bs]->start;
 
                        e->inst[ei] &= ~e->param.mask;
                        e->inst[ei] |= (ci << e->param.shift);
                }
 
-               p->data_start = p->data->start;
+               if (p->data[0])
+                       p->data_start[0] = p->data[0]->start;
+               if (p->data[1])
+                       p->data_start[1] = p->data[1]->start;
+
                upload = TRUE;
        }
 
@@ -1777,6 +2418,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
        struct nouveau_grobj *tesla = nv50->screen->tesla;
        struct nv50_program *p = nv50->fragprog;
        struct nouveau_stateobj *so;
+       unsigned i;
 
        if (!p->translated) {
                nv50_program_validate(nv50, p);
@@ -1794,17 +2436,22 @@ nv50_fragprog_validate(struct nv50_context *nv50)
        so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
                  NOUVEAU_BO_LOW, 0, 0);
        so_method(so, tesla, 0x1904, 4);
-       so_data  (so, 0x00040404); /* p: 0x01000404 */
+       so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
        so_data  (so, 0x00000004);
        so_data  (so, 0x00000000);
        so_data  (so, 0x00000000);
-       so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
-       so_data  (so, 0x03020100);
-       so_data  (so, 0x07060504);
-       so_data  (so, 0x0b0a0908);
+       so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
+       for (i = 0; i < p->cfg.fp.high_map; i++)
+               so_data(so, p->cfg.fp.map[i]);
        so_method(so, tesla, 0x1988, 2);
-       so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
+       so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
        so_data  (so, p->cfg.high_temp);
+       so_method(so, tesla, 0x1298, 1);
+       so_data  (so, p->cfg.high_result);
+       so_method(so, tesla, 0x19a8, 1);
+       so_data  (so, p->cfg.fp.regs[2]);
+       so_method(so, tesla, 0x196c, 1);
+       so_data  (so, p->cfg.fp.regs[3]);
        so_method(so, tesla, 0x1414, 1);
        so_data  (so, 0); /* program start offset */
        so_ref(so, &nv50->state.fragprog);
@@ -1828,7 +2475,8 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
        if (p->buffer)
                pipe_buffer_reference(&p->buffer, NULL);
 
-       nv50->screen->nvws->res_free(&p->data);
+       nv50->screen->nvws->res_free(&p->data[0]);
+       nv50->screen->nvws->res_free(&p->data[1]);
 
        p->translated = 0;
 }