nv50: negate sources directly where supported
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
index c420b8be1c336d4a6bf4f1b675c735ccd6b0d6af..aada285f2c006785178fc6ce7e01908845693b11 100644 (file)
@@ -120,6 +120,8 @@ struct nv50_pc {
        /* current instruction and total number of insns */
        unsigned insn_cur;
        unsigned insn_nr;
+
+       boolean allow32;
 };
 
 static void
@@ -212,6 +214,22 @@ assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
        FREE(src);
 }
 
+/* Give up the hardware temp slot currently held by r, if it holds one.
+ * The slot entry in pc->r_temp is cleared and r->acc is reset; a register
+ * whose index is -1 is additionally FREE()d, so the caller must not touch
+ * such a register afterwards.
+ */
+static void
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)
+{
+       assert(r->type == P_TEMP);
+
+       if (r->hw != -1) {
+               /* the slot has to be registered to this very register */
+               assert(pc->r_temp[r->hw] == r);
+               pc->r_temp[r->hw] = NULL;
+
+               r->acc = 0;
+               if (r->index == -1)
+                       FREE(r);
+       }
+}
+
 static void
 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
 {
@@ -391,7 +409,8 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
 static INLINE void
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 {
-       unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
+       float f = pc->immd_buf[imm->hw];
+       unsigned val = fui(imm->neg ? -f : f);
 
        set_long(pc, e);
        /*XXX: can't be predicated - bits overlap.. catch cases where both
@@ -443,22 +462,12 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
         struct nv50_program_exec *e)
 {
        set_long(pc, e);
-#if 1
-       e->inst[1] |= (1 << 22);
-#else
-       if (src->type == P_IMMD) {
-               e->inst[1] |= (NV50_CB_PMISC << 22);
-       } else {
-               if (pc->p->type == PIPE_SHADER_VERTEX)
-                       e->inst[1] |= (NV50_CB_PVP << 22);
-               else
-                       e->inst[1] |= (NV50_CB_PFP << 22);
-       }
-#endif
 
        e->param.index = src->hw;
        e->param.shift = s;
        e->param.mask = m << (s % 32);
+
+       e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
 }
 
 static void
@@ -470,12 +479,11 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 
        set_dst(pc, dst, e);
 
-       if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
+       if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
                set_immd(pc, src, e);
                /*XXX: 32-bit, but steals part of "half" reg space - need to
                 *     catch and handle this case if/when we do half-regs
                 */
-               e->inst[0] |= 0x00008000;
        } else
        if (src->type == P_IMMD || src->type == P_CONST) {
                set_long(pc, e);
@@ -491,14 +499,13 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
                e->inst[0] |= (src->hw << 9);
        }
 
-       /* We really should support "half" instructions here at some point,
-        * but I don't feel confident enough about them yet.
-        */
-       set_long(pc, e);
        if (is_long(e) && !is_immd(e)) {
                e->inst[1] |= 0x04000000; /* 32-bit */
-               e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
-       }
+               e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
+               if (!(e->inst[1] & 0x20000000))
+                       e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
+       } else
+               e->inst[0] |= 0x00008000;
 
        emit(pc, e);
 }
@@ -614,12 +621,26 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
        struct nv50_program_exec *e = exec(pc);
 
        e->inst[0] |= 0xc0000000;
-       set_long(pc, e);
+
+       if (!pc->allow32)
+               set_long(pc, e);
 
        check_swap_src_0_1(pc, &src0, &src1);
        set_dst(pc, dst, e);
        set_src_0(pc, src0, e);
-       set_src_1(pc, src1, e);
+       if (src1->type == P_IMMD && !is_long(e)) {
+               if (src0->neg)
+                       e->inst[0] |= 0x00008000;
+               set_immd(pc, src1, e);
+       } else {
+               set_src_1(pc, src1, e);
+               if (src0->neg ^ src1->neg) {
+                       if (is_long(e))
+                               e->inst[1] |= 0x08000000;
+                       else
+                               e->inst[0] |= 0x00008000;
+               }
+       }
 
        emit(pc, e);
 }
@@ -633,10 +654,19 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
        e->inst[0] |= 0xb0000000;
 
        check_swap_src_0_1(pc, &src0, &src1);
+
+       if (!pc->allow32 || src0->neg || src1->neg) {
+               set_long(pc, e);
+               e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
+       }
+
        set_dst(pc, dst, e);
        set_src_0(pc, src0, e);
-       if (is_long(e))
+       if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
                set_src_2(pc, src1, e);
+       else
+       if (src1->type == P_IMMD)
+               set_immd(pc, src1, e);
        else
                set_src_1(pc, src1, e);
 
@@ -661,25 +691,13 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
        emit(pc, e);
 }
 
+/* dst = src0 - src1.
+ *
+ * Normally implemented as an ADD with src1's negate flag flipped for the
+ * duration of the emit_add() call.  That trick cannot work when src0 and
+ * src1 are the same register object: flipping the flag would negate both
+ * operands and yield (-x) + (-x) instead of x - x.  The aliased case is
+ * therefore encoded directly as a long ADD that negates only the second
+ * operand; the register's own neg flag is irrelevant there because it
+ * would apply to both operands equally.
+ */
-static void
+static INLINE void
 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
          struct nv50_reg *src1)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] |= 0xb0000000;
-
-       set_long(pc, e);
-       if (check_swap_src_0_1(pc, &src0, &src1))
-               e->inst[1] |= 0x04000000;
-       else
-               e->inst[1] |= 0x08000000;
-
-       set_dst(pc, dst, e);
-       set_src_0(pc, src0, e);
-       set_src_2(pc, src1, e);
-
-       emit(pc, e);
+       if (src0 == src1) {
+               struct nv50_program_exec *e = exec(pc);
+
+               e->inst[0] |= 0xb0000000;
+               set_long(pc, e);
+               e->inst[1] |= 0x08000000; /* negate second operand only */
+
+               set_dst(pc, dst, e);
+               set_src_0(pc, src0, e);
+               set_src_2(pc, src1, e);
+
+               emit(pc, e);
+               return;
+       }
+
+       /* distinct registers: borrow src1's neg flag, restore it after */
+       src1->neg ^= 1;
+       emit_add(pc, dst, src0, src1);
+       src1->neg ^= 1;
 }
 
 static void
@@ -696,26 +714,21 @@ emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
        set_src_1(pc, src1, e);
        set_src_2(pc, src2, e);
 
+       if (src0->neg ^ src1->neg)
+               e->inst[1] |= 0x04000000;
+       if (src2->neg)
+               e->inst[1] |= 0x08000000;
+
        emit(pc, e);
 }
 
+/* dst = src0 * src1 - src2.
+ *
+ * Normally implemented as a MAD with src2's negate flag flipped for the
+ * duration of the emit_mad() call.  If src2 is the same register object as
+ * src0 or src1, flipping its flag would also change the product sign that
+ * emit_mad() derives from src0->neg ^ src1->neg, so that case is encoded
+ * directly here without modifying any register.
+ * NOTE(review): the direct path mirrors the previous emit_msb encoding plus
+ * the negate bits emit_mad() sets - keep the two in sync.
+ */
-static void
+static INLINE void
 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
          struct nv50_reg *src1, struct nv50_reg *src2)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] |= 0xe0000000;
-       set_long(pc, e);
-       e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
-
-       check_swap_src_0_1(pc, &src0, &src1);
-       set_dst(pc, dst, e);
-       set_src_0(pc, src0, e);
-       set_src_1(pc, src1, e);
-       set_src_2(pc, src2, e);
-
-       emit(pc, e);
+       if (src2 == src0 || src2 == src1) {
+               struct nv50_program_exec *e = exec(pc);
+
+               e->inst[0] |= 0xe0000000;
+               set_long(pc, e);
+
+               check_swap_src_0_1(pc, &src0, &src1);
+               set_dst(pc, dst, e);
+               set_src_0(pc, src0, e);
+               set_src_1(pc, src1, e);
+               set_src_2(pc, src2, e);
+
+               /* product sign as in emit_mad() */
+               if (src0->neg ^ src1->neg)
+                       e->inst[1] |= 0x04000000;
+               /* subtract src2: negate it unless it is already negated */
+               if (!src2->neg)
+                       e->inst[1] |= 0x08000000;
+
+               emit(pc, e);
+               return;
+       }
+
+       /* distinct registers: borrow src2's neg flag, restore it after */
+       src2->neg ^= 1;
+       emit_mad(pc, dst, src0, src1, src2);
+       src2->neg ^= 1;
 }
 
 static void
@@ -766,6 +779,48 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
        emit(pc, e);
 }
 
+/* values for emit_cvt()'s cop argument (shifted to bit 16 of inst[1]) */
+#define CVTOP_RN       0x01
+#define CVTOP_FLOOR    0x03
+#define CVTOP_CEIL     0x05
+#define CVTOP_TRUNC    0x07
+#define CVTOP_SAT      0x08
+#define CVTOP_ABS      0x10
+
+/* values for emit_cvt()'s fmt argument (shifted to bit 24 of inst[1]) */
+#define CVT_F32_F32 0xc4
+#define CVT_F32_S32 0x44
+#define CVT_F32_U32 0x64
+#define CVT_S32_F32 0x8c
+#define CVT_S32_S32 0x0c
+#define CVT_F32_F32_ROP 0xcc
+
+/* Emit a long CVT instruction reading src.
+ *
+ * dst: destination register, or NULL to emit without one
+ * wp:  predicate register to write, or a negative value for none
+ * cop: one of (or a combination of) the CVTOP_* values
+ * fmt: one of the CVT_* format values
+ */
+static void
+emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+        int wp, unsigned cop, unsigned fmt)
+{
+       struct nv50_program_exec *e = exec(pc);
+
+       set_long(pc, e);
+
+       e->inst[0] |= 0xa0000000;
+       e->inst[1] |= 0x00004000 | (cop << 16) | (fmt << 24);
+       set_src_0(pc, src, e);
+
+       if (wp >= 0)
+               set_pred_wr(pc, 1, wp, e);
+
+       if (!dst) {
+               /* no destination given: set these bits instead of
+                * encoding a register via set_dst()
+                */
+               e->inst[0] |= 0x000001fc;
+               e->inst[1] |= 0x00000008;
+       } else
+               set_dst(pc, dst, e);
+
+       emit(pc, e);
+}
+
 static void
 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
         struct nv50_reg *src0, struct nv50_reg *src1)
@@ -809,22 +864,10 @@ emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
                free_temp(pc, dst);
 }
 
+/* Floor: emits a CVT from src to dst with the FLOOR op and the
+ * F32_F32_ROP format, without a predicate write (wp = -1).
+ */
-static void
+static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] = 0xa0000000; /* cvt */
-       set_long(pc, e);
-       e->inst[1] |= (6 << 29); /* cvt */
-       e->inst[1] |= 0x08000000; /* integer mode */
-       e->inst[1] |= 0x04000000; /* 32 bit */
-       e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
-       e->inst[1] |= (1 << 14); /* src .f32 */
-       set_dst(pc, dst, e);
-       set_src_0(pc, src, e);
-
-       emit(pc, e);
+       emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
 }
 
 static void
@@ -841,21 +884,10 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
        free_temp(pc, temp);
 }
 
+/* Absolute value: emits a CVT from src to dst with the ABS op and the
+ * F32_F32 format, without a predicate write (wp = -1).
+ */
-static void
+static INLINE void
 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] = 0xa0000000; /* cvt */
-       set_long(pc, e);
-       e->inst[1] |= (6 << 29); /* cvt */
-       e->inst[1] |= 0x04000000; /* 32 bit */
-       e->inst[1] |= (1 << 14); /* src .f32 */
-       e->inst[1] |= ((1 << 6) << 14); /* .abs */
-       set_dst(pc, dst, e);
-       set_src_0(pc, src, e);
-
-       emit(pc, e);
+       emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
 }
 
 static void
@@ -867,6 +899,9 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
        struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
        struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
        struct nv50_reg *tmp[4];
+       boolean allow32 = pc->allow32;
+
+       pc->allow32 = FALSE;
 
        if (mask & (3 << 1)) {
                tmp[0] = alloc_temp(pc, NULL);
@@ -894,6 +929,8 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
        if (mask & (1 << 2))
                free_temp(pc, tmp[0]);
 
+       pc->allow32 = allow32;
+
        /* do this last, in case src[i,j] == dst[0,3] */
        if (mask & (1 << 0))
                emit_mov(pc, dst[0], one);
@@ -935,6 +972,8 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
        e->inst[1] = 0xc4014788;
        set_src_0(pc, src, e);
        set_pred_wr(pc, 1, r_pred, e);
+       if (src->neg)
+               e->inst[1] |= 0x20000000;
        emit(pc, e);
 
        /* This is probably KILP */
@@ -945,6 +984,180 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
        emit(pc, e);
 }
 
+/* Emit a texture fetch (TEX, or TXP when proj is set).
+ *
+ * dst:  four destination registers; dst[c] is written if bit c of mask is set
+ * mask: write mask, also encoded into the instruction
+ * src:  four source registers holding the coordinate components
+ * unit: texture unit, encoded at bit 9 of inst[0]
+ * type: TGSI_TEXTURE_* target; selects how many coordinates are set up
+ * proj: if TRUE, the coordinates are scaled by the result of
+ *       emit_flop(0) on component 3 (presumably 1/w - confirm) first
+ *
+ * The coordinates are first placed in four freshly allocated temps, the
+ * fetch writes its results to those temps, and the enabled components are
+ * then moved to dst.
+ */
+static void
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+        struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
+{
+       struct nv50_reg *temp, *t[4];
+       struct nv50_program_exec *e;
+
+       unsigned c, mode, dim;
+
+       switch (type) {
+       case TGSI_TEXTURE_1D:
+               dim = 1;
+               break;
+       case TGSI_TEXTURE_UNKNOWN:
+       case TGSI_TEXTURE_2D:
+       case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
+       case TGSI_TEXTURE_RECT:
+               dim = 2;
+               break;
+       case TGSI_TEXTURE_3D:
+       case TGSI_TEXTURE_CUBE:
+       case TGSI_TEXTURE_SHADOW2D:
+       case TGSI_TEXTURE_SHADOWRECT: /* XXX */
+               dim = 3;
+               break;
+       default:
+               assert(0);
+               /* keep dim defined when asserts are compiled out; treat
+                * the target like TGSI_TEXTURE_UNKNOWN
+                */
+               dim = 2;
+               break;
+       }
+
+       alloc_temp4(pc, t, 0);
+
+       if (proj) {
+               if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
+                       /* coordinates have an rhw assigned: interpolate
+                        * them again into the temps instead of multiplying
+                        */
+                       mode = pc->interp_mode[src[0]->index];
+
+                       t[3]->rhw = src[3]->rhw;
+                       emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
+                       emit_flop(pc, 0, t[3], t[3]);
+
+                       for (c = 0; c < dim; c++) {
+                               t[c]->rhw = src[c]->rhw;
+                               emit_interp(pc, t[c], t[3],
+                                           (mode | INTERP_PERSPECTIVE));
+                       }
+               } else {
+                       emit_flop(pc, 0, t[3], src[3]);
+                       for (c = 0; c < dim; c++)
+                               emit_mul(pc, t[c], src[c], t[3]);
+
+                       /* XXX: for some reason the blob sometimes uses MAD:
+                        * emit_mad(pc, t[c], src[0][c], t[3], t[3])
+                        * pc->p->exec_tail->inst[1] |= 0x080fc000;
+                        */
+               }
+       } else {
+               if (type == TGSI_TEXTURE_CUBE) {
+                       /* scale all three coordinates by emit_flop(0) of
+                        * the emit_minmax(4) reduction over them
+                        */
+                       temp = temp_temp(pc);
+                       emit_minmax(pc, 4, temp, src[0], src[1]);
+                       emit_minmax(pc, 4, temp, temp, src[2]);
+                       emit_flop(pc, 0, temp, temp);
+                       for (c = 0; c < 3; c++)
+                               emit_mul(pc, t[c], src[c], temp);
+               } else {
+                       for (c = 0; c < dim; c++)
+                               emit_mov(pc, t[c], src[c]);
+               }
+       }
+
+       e = exec(pc);
+       set_long(pc, e);
+       e->inst[0] |= 0xf0000000;
+       e->inst[1] |= 0x00000004;
+       set_dst(pc, t[0], e);
+       e->inst[0] |= (unit << 9);
+
+       if (dim == 2)
+               e->inst[0] |= 0x00400000;
+       else
+       if (dim == 3)
+               e->inst[0] |= 0x00800000;
+
+       /* write mask: low two bits go to inst[0], high two to inst[1] */
+       e->inst[0] |= (mask & 0x3) << 25;
+       e->inst[1] |= (mask & 0xc) << 12;
+
+       emit(pc, e);
+
+#if 1
+       if (mask & 1) emit_mov(pc, dst[0], t[0]);
+       if (mask & 2) emit_mov(pc, dst[1], t[1]);
+       if (mask & 4) emit_mov(pc, dst[2], t[2]);
+       if (mask & 8) emit_mov(pc, dst[3], t[3]);
+
+       free_temp4(pc, t);
+#else
+       /* XXX: if e.g. MUL is used directly after TEX, it would still use
+        * the texture coordinates, not the fetched values: latency ? */
+
+       for (c = 0; c < 4; c++) {
+               if (mask & (1 << c))
+                       assimilate_temp(pc, dst[c], t[c]);
+               else
+                       free_temp(pc, t[c]);
+       }
+#endif
+}
+
+/* Turn the short (single-word) instruction e into its long form in place.
+ *
+ * The opcode is taken from the top four bits of inst[0].  For each handled
+ * opcode, m is the mask that is ANDed into inst[0] (clearing bits that only
+ * exist in the short encoding) and q holds the bits that are ORed into
+ * inst[1] after set_long().  e must not already be long, and opcodes not
+ * listed here hit the assert.
+ */
+static void
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+       unsigned q = 0, m = ~0;
+
+       assert(!is_long(e));
+
+       switch (e->inst[0] >> 28) {
+       case 0x1:
+               /* MOV */
+               /* same inst[1] bits emit_mov uses for a long mov (32-bit
+                * flag plus subsubop 0xf); bit 15 of inst[0] is dropped
+                */
+               q = 0x0403c000;
+               m = 0xffff7fff;
+               break;
+       case 0x8:
+               /* INTERP */
+               /* bit 25 of inst[0] becomes bit 17 of inst[1] */
+               m = ~0x02000000;
+               if (e->inst[0] & 0x02000000)
+                       q = 0x00020000;
+               break;
+       case 0x9:
+               /* RCP */
+               break;
+       case 0xB:
+               /* ADD */
+               /* bits 16..22 of inst[0] move to bits 14..20 of inst[1];
+                * presumably the second operand moving from the src1 to
+                * the src2 field, as emit_add uses set_src_2 when long
+                */
+               m = ~(127 << 16);
+               q = ((e->inst[0] & (~m)) >> 2);
+               break;
+       case 0xC:
+               /* MUL */
+               /* short-form negate bit 15 of inst[0] becomes bit 27 of
+                * inst[1], matching the two encodings used in emit_mul
+                */
+               m = ~0x00008000;
+               q = ((e->inst[0] & (~m)) << 12);
+               break;
+       case 0xE:
+               /* MAD (if src2 == dst) */
+               /* copies bits 2..8 of inst[0] to bits 14..20 of inst[1];
+                * inst[0] itself is left untouched
+                */
+               q = ((e->inst[0] & 0x1fc) << 12);
+               break;
+       default:
+               assert(0);
+               break;
+       }
+
+       set_long(pc, e);
+       /* NOTE(review): presumably accounts for the extra instruction word;
+        * confirm set_long() does not already bump exec_size
+        */
+       pc->p->exec_size++;
+
+       e->inst[0] &= m;
+       e->inst[1] |= q;
+}
+
+/* Report whether source operand i of insn can have a sign toggle recorded
+ * as a flag on the register (the neg argument of tgsi_src) rather than
+ * being negated by a separate instruction.  For POW only operand 1
+ * qualifies; for the other listed opcodes every operand does.
+ */
+static boolean
+negate_supported(const struct tgsi_full_instruction *insn, int i)
+{
+       const unsigned opcode = insn->Instruction.Opcode;
+
+       if (opcode == TGSI_OPCODE_POW)
+               return (i == 1) ? TRUE : FALSE;
+
+       switch (opcode) {
+       case TGSI_OPCODE_ADD:
+       case TGSI_OPCODE_SUB:
+       case TGSI_OPCODE_MUL:
+       case TGSI_OPCODE_MAD:
+       case TGSI_OPCODE_DP3:
+       case TGSI_OPCODE_DP4:
+       case TGSI_OPCODE_KIL:
+               return TRUE;
+       default:
+               return FALSE;
+       }
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -963,7 +1176,8 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 }
 
 static struct nv50_reg *
-tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
+tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
+        boolean neg)
 {
        struct nv50_reg *r = NULL;
        struct nv50_reg *temp;
@@ -1018,14 +1232,21 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
                r = temp;
                break;
        case TGSI_UTIL_SIGN_TOGGLE:
-               temp = temp_temp(pc);
-               emit_neg(pc, temp, r);
-               r = temp;
+               if (neg)
+                       r->neg = 1;
+               else {
+                       temp = temp_temp(pc);
+                       emit_neg(pc, temp, r);
+                       r = temp;
+               }
                break;
        case TGSI_UTIL_SIGN_SET:
                temp = temp_temp(pc);
                emit_abs(pc, temp, r);
-               emit_neg(pc, temp, temp);
+               if (neg)
+                       temp->neg = 1;
+               else
+                       emit_neg(pc, temp, temp);
                r = temp;
                break;
        default:
@@ -1093,7 +1314,8 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                        unit = fs->SrcRegister.Index;
 
                for (c = 0; c < 4; c++)
-                       src[i][c] = tgsi_src(pc, c, fs);
+                       src[i][c] = tgsi_src(pc, c, fs,
+                                            negate_supported(inst, i));
        }
 
        if (sat) {
@@ -1352,30 +1574,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                }
                break;
        case TGSI_OPCODE_TEX:
+               emit_tex(pc, dst, mask, src[0], unit,
+                        inst->InstructionExtTexture.Texture, FALSE);
+               break;
        case TGSI_OPCODE_TXP:
-       {
-               struct nv50_reg *t[4];
-               struct nv50_program_exec *e;
-
-               alloc_temp4(pc, t, 0);
-               emit_mov(pc, t[0], src[0][0]);
-               emit_mov(pc, t[1], src[0][1]);
-
-               e = exec(pc);
-               e->inst[0] = 0xf6400000;
-               e->inst[0] |= (unit << 9);
-               set_long(pc, e);
-               e->inst[1] |= 0x0000c004;
-               set_dst(pc, t[0], e);
-               emit(pc, e);
-
-               if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]);
-               if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]);
-               if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]);
-               if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]);
-
-               free_temp4(pc, t);
-       }
+               emit_tex(pc, dst, mask, src[0], unit,
+                        inst->InstructionExtTexture.Texture, TRUE);
                break;
        case TGSI_OPCODE_XPD:
                temp = temp_temp(pc);
@@ -1403,21 +1607,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 
        if (sat) {
                for (c = 0; c < 4; c++) {
-                       struct nv50_program_exec *e;
-
                        if (!(mask & (1 << c)))
                                continue;
-                       e = exec(pc);
-
-                       e->inst[0] = 0xa0000000; /* cvt */
-                       set_long(pc, e);
-                       e->inst[1] |= (6 << 29); /* cvt */
-                       e->inst[1] |= 0x04000000; /* 32 bit */
-                       e->inst[1] |= (1 << 14); /* src .f32 */
-                       e->inst[1] |= ((1 << 5) << 14); /* .sat */
-                       set_dst(pc, rdst[c], e);
-                       set_src_0(pc, dst[c], e);
-                       emit(pc, e);
+                       emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
+                                CVT_F32_F32);
                }
        } else if (assimilate) {
                for (c = 0; c < 4; c++)
@@ -1431,6 +1624,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
                                continue;
                        if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
                                FREE(src[i][c]);
+                       else
+                       if (src[i][c]->acc == pc->insn_cur)
+                               release_hw(pc, src[i][c]);
                }
        }
 
@@ -1868,7 +2064,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)
        }
 
        if (pc->immd_nr) {
-               int rid = pc->param_nr * 4;
+               int rid = 0;
 
                pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
                if (!pc->immd)
@@ -1897,8 +2093,6 @@ out_err:
 static void
 free_nv50_pc(struct nv50_pc *pc)
 {
-       unsigned i;
-
        if (pc->immd)
                FREE(pc->immd);
        if (pc->param)
@@ -1910,12 +2104,6 @@ free_nv50_pc(struct nv50_pc *pc)
        if (pc->temp)
                FREE(pc->temp);
 
-       for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
-               /* deallocate fragment program attributes */
-               if (pc->r_temp[i] && pc->r_temp[i]->index == -1)
-                       FREE(pc->r_temp[i]);
-       }
-
        FREE(pc);
 }
 
@@ -1941,6 +2129,11 @@ nv50_program_tx(struct nv50_program *p)
        while (!tgsi_parse_end_of_tokens(&parse)) {
                const union tgsi_full_token *tok = &parse.FullToken;
 
+               /* don't allow half insn/immd on first and last instruction */
+               pc->allow32 = TRUE;
+               if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
+                       pc->allow32 = FALSE;
+
                tgsi_parse_token(&parse);
 
                switch (tok->Token.Type) {
@@ -1971,6 +2164,30 @@ nv50_program_tx(struct nv50_program *p)
                }
        }
 
+       /* look for single half instructions and make them long */
+       struct nv50_program_exec *e, *e_prev;
+
+       for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
+               if (!is_long(e))
+                       k++;
+
+               if (!e->next || is_long(e->next)) {
+                       if (k & 1)
+                               convert_to_long(pc, e);
+                       k = 0;
+               }
+
+               if (e->next)
+                       e_prev = e;
+       }
+
+       if (!is_long(pc->p->exec_tail)) {
+               /* this may occur if moving FP results */
+               assert(e_prev && !is_long(e_prev));
+               convert_to_long(pc, e_prev);
+               convert_to_long(pc, pc->p->exec_tail);
+       }
+
        assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
        pc->p->exec_tail->inst[1] |= 0x00000001;
 
@@ -1996,7 +2213,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
 
 static void
 nv50_program_upload_data(struct nv50_context *nv50, float *map,
-                        unsigned start, unsigned count)
+                       unsigned start, unsigned count, unsigned cbuf)
 {
        struct nouveau_channel *chan = nv50->screen->nvws->channel;
        struct nouveau_grobj *tesla = nv50->screen->tesla;
@@ -2005,7 +2222,7 @@ nv50_program_upload_data(struct nv50_context *nv50, float *map,
                unsigned nr = count > 2047 ? 2047 : count;
 
                BEGIN_RING(chan, tesla, 0x00000f00, 1);
-               OUT_RING  (chan, (NV50_CB_PMISC << 0) | (start << 8));
+               OUT_RING  (chan, (cbuf << 0) | (start << 8));
                BEGIN_RING(chan, tesla, 0x40000f04, nr);
                OUT_RINGp (chan, map, nr);
 
@@ -2020,35 +2237,50 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 {
        struct nouveau_winsys *nvws = nv50->screen->nvws;
        struct pipe_winsys *ws = nv50->pipe.winsys;
-       unsigned nr = p->param_nr + p->immd_nr;
 
-       if (!p->data && nr) {
-               struct nouveau_resource *heap = nv50->screen->vp_data_heap;
+       if (!p->data[0] && p->immd_nr) {
+               struct nouveau_resource *heap = nv50->screen->immd_heap[0];
+
+               if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) {
+                       while (heap->next && heap->size < p->immd_nr) {
+                               struct nv50_program *evict = heap->next->priv;
+                               nvws->res_free(&evict->data[0]);
+                       }
+
+                       if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0]))
+                               assert(0);
+               }
+
+               /* immediates only need to be uploaded again when freed */
+               nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
+                                        p->immd_nr, NV50_CB_PMISC);
+       }
+
+       if (!p->data[1] && p->param_nr) {
+               struct nouveau_resource *heap =
+                       nv50->screen->parm_heap[p->type];
 
-               if (nvws->res_alloc(heap, nr, p, &p->data)) {
-                       while (heap->next && heap->size < nr) {
+               if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) {
+                       while (heap->next && heap->size < p->param_nr) {
                                struct nv50_program *evict = heap->next->priv;
-                               nvws->res_free(&evict->data);
+                               nvws->res_free(&evict->data[1]);
                        }
 
-                       if (nvws->res_alloc(heap, nr, p, &p->data))
+                       if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1]))
                                assert(0);
                }
        }
 
        if (p->param_nr) {
+               unsigned cbuf = NV50_CB_PVP;
                float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
                                            PIPE_BUFFER_USAGE_CPU_READ);
-               nv50_program_upload_data(nv50, map, p->data->start,
-                                        p->param_nr);
+               if (p->type == PIPE_SHADER_FRAGMENT)
+                       cbuf = NV50_CB_PFP;
+               nv50_program_upload_data(nv50, map, p->data[1]->start,
+                                        p->param_nr, cbuf);
                ws->buffer_unmap(ws, nv50->constbuf[p->type]);
        }
-
-       if (p->immd_nr) {
-               nv50_program_upload_data(nv50, p->immd,
-                                        p->data->start + p->param_nr,
-                                        p->immd_nr);
-       }
 }
 
 static void
@@ -2068,20 +2300,27 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
                upload = TRUE;
        }
 
-       if (p->data && p->data->start != p->data_start) {
+       if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
+               (p->data[1] && p->data[1]->start != p->data_start[1])) {
                for (e = p->exec_head; e; e = e->next) {
-                       unsigned ei, ci;
+                       unsigned ei, ci, bs;
 
                        if (e->param.index < 0)
                                continue;
+                       bs = (e->inst[1] >> 22) & 0x07;
+                       assert(bs < 2);
                        ei = e->param.shift >> 5;
-                       ci = e->param.index + p->data->start;
+                       ci = e->param.index + p->data[bs]->start;
 
                        e->inst[ei] &= ~e->param.mask;
                        e->inst[ei] |= (ci << e->param.shift);
                }
 
-               p->data_start = p->data->start;
+               if (p->data[0])
+                       p->data_start[0] = p->data[0]->start;
+               if (p->data[1])
+                       p->data_start[1] = p->data[1]->start;
+
                upload = TRUE;
        }
 
@@ -2236,7 +2475,8 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
        if (p->buffer)
                pipe_buffer_reference(&p->buffer, NULL);
 
-       nv50->screen->nvws->res_free(&p->data);
+       nv50->screen->nvws->res_free(&p->data[0]);
+       nv50->screen->nvws->res_free(&p->data[1]);
 
        p->translated = 0;
 }