src/gallium/drivers/nvfx/nvfx_fragprog.c

   1 #include <float.h>
   2 #include "pipe/p_context.h"
   3 #include "pipe/p_defines.h"
   4 #include "pipe/p_state.h"
   5 #include "util/u_inlines.h"
   6 #include "util/u_debug.h"
   7
   8 #include "pipe/p_shader_tokens.h"
   9 #include "tgsi/tgsi_parse.h"
  10 #include "tgsi/tgsi_util.h"
  11 #include "tgsi/tgsi_dump.h"
  12 #include "tgsi/tgsi_ureg.h"
  13
  14 #include "nvfx_context.h"
  15 #include "nvfx_shader.h"
  16 #include "nvfx_resource.h"
  17
/* Number of entries in the consts[] table of struct nvfx_fpc. */
#define MAX_CONSTS 128
/* Number of entries in the imm[] table of struct nvfx_fpc. */
#define MAX_IMM 32
  20
/*
 * Per-compile state used while translating one TGSI fragment shader into
 * hardware instruction words stored in fp->insn.
 */
struct nvfx_fpc {
        /* Source shader; its info.input_semantic_* tables are consulted by tgsi_src(). */
        struct nvfx_pipe_fragment_program* pfp;
        /* Program being built: instruction words, constant records, control bits. */
        struct nvfx_fragment_program *fp;

        /* temp() refuses to hand out a temp whose index is >= max_temps. */
        unsigned max_temps;
        /* Bit i set means temp i is currently in use. */
        unsigned long long r_temps;
        /* Temps handed out by temp() since the last release_temps() call. */
        unsigned long long r_temps_discard;
        /* Register for each TGSI output, indexed by the TGSI register index. */
        struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
        /* Register for each TGSI temporary, indexed by the TGSI register index. */
        struct nvfx_reg *r_temp;
        /* Temp index written as a placeholder for NVFXSR_RELOCATED sources. */
        unsigned sprite_coord_temp;

        /* One more than the highest temp index used as a destination so far. */
        int num_regs;

        /* Index into fp->insn (in 32-bit words) of the instruction being emitted. */
        unsigned inst_offset;
        /* Non-zero once the current instruction has grown by 4 words for an inline constant. */
        unsigned have_const;

        struct {
                /* Value >= 0: stored as the constant's index by emit_src(); -1: literal in vals[]. */
                int pipe;
                /* Literal value; only filled in by constant() when pipe == -1. */
                float vals[4];
        } consts[MAX_CONSTS];
        /* Number of consts[] entries in use. */
        int nr_consts;

        /* Register for each TGSI immediate, indexed by the TGSI register index. */
        struct nvfx_reg imm[MAX_IMM];
        /* Number of imm[] entries that are valid. */
        unsigned nr_imm;

        /* Indexed by generic input semantic index; gives the slot used for NVFXSR_RELOCATED sources. */
        unsigned char generic_to_slot[256]; /* semantic idx for each input semantic */

        /* Instruction offsets (unsigned) of IF instructions pushed by nv40_fp_if(). */
        struct util_dynarray if_stack;
        //struct util_dynarray loop_stack;
        /* struct nvfx_relocation entries recorded by the CAL/REP/BRA emitters. */
        struct util_dynarray label_relocs;
};
  52
  53 static INLINE struct nvfx_reg
  54 temp(struct nvfx_fpc *fpc)
  55 {
  56         int idx = __builtin_ctzll(~fpc->r_temps);
  57
  58         if (idx >= fpc->max_temps) {
  59                 NOUVEAU_ERR("out of temps!!\n");
  60                 assert(0);
  61                 return nvfx_reg(NVFXSR_TEMP, 0);
  62         }
  63
  64         fpc->r_temps |= (1ULL << idx);
  65         fpc->r_temps_discard |= (1ULL << idx);
  66         return nvfx_reg(NVFXSR_TEMP, idx);
  67 }
  68
  69 static INLINE void
  70 release_temps(struct nvfx_fpc *fpc)
  71 {
  72         fpc->r_temps &= ~fpc->r_temps_discard;
  73         fpc->r_temps_discard = 0ULL;
  74 }
  75
  76 static INLINE struct nvfx_reg
  77 constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
  78 {
  79         int idx;
  80
  81         if (fpc->nr_consts == MAX_CONSTS)
  82                 assert(0);
  83         idx = fpc->nr_consts++;
  84
  85         fpc->consts[idx].pipe = pipe;
  86         if (pipe == -1)
  87                 memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
  88         return nvfx_reg(NVFXSR_CONST, idx);
  89 }
  90
  91 static void
  92 grow_insns(struct nvfx_fpc *fpc, int size)
  93 {
  94         struct nvfx_fragment_program *fp = fpc->fp;
  95
  96         fp->insn_len += size;
  97         fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
  98 }
  99
 100 static void
 101 emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
 102 {
 103         struct nvfx_fragment_program *fp = fpc->fp;
 104         uint32_t *hw = &fp->insn[fpc->inst_offset];
 105         uint32_t sr = 0;
 106
 107         switch (src.reg.type) {
 108         case NVFXSR_INPUT:
 109                 sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
 110                 hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
 111                 break;
 112         case NVFXSR_OUTPUT:
 113                 sr |= NVFX_FP_REG_SRC_HALF;
 114                 /* fall-through */
 115         case NVFXSR_TEMP:
 116                 sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
 117                 sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
 118                 break;
 119         case NVFXSR_RELOCATED:
 120                 sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
 121                 sr |= (fpc->sprite_coord_temp << NVFX_FP_REG_SRC_SHIFT);
 122                 //printf("adding relocation at %x for %x\n", fpc->inst_offset, src.index);
 123                 util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset + pos + 1);
 124                 break;
 125         case NVFXSR_CONST:
 126                 if (!fpc->have_const) {
 127                         grow_insns(fpc, 4);
 128                         fpc->have_const = 1;
 129                 }
 130
 131                 hw = &fp->insn[fpc->inst_offset];
 132                 if (fpc->consts[src.reg.index].pipe >= 0) {
 133                         struct nvfx_fragment_program_data *fpd;
 134
 135                         fp->consts = realloc(fp->consts, ++fp->nr_consts *
 136                                              sizeof(*fpd));
 137                         fpd = &fp->consts[fp->nr_consts - 1];
 138                         fpd->offset = fpc->inst_offset + 4;
 139                         fpd->index = fpc->consts[src.reg.index].pipe;
 140                         memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
 141                 } else {
 142                         memcpy(&fp->insn[fpc->inst_offset + 4],
 143                                 fpc->consts[src.reg.index].vals,
 144                                 sizeof(uint32_t) * 4);
 145                 }
 146
 147                 sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
 148                 break;
 149         case NVFXSR_NONE:
 150                 sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
 151                 break;
 152         default:
 153                 assert(0);
 154         }
 155
 156         if (src.negate)
 157                 sr |= NVFX_FP_REG_NEGATE;
 158
 159         if (src.abs)
 160                 hw[1] |= (1 << (29 + pos));
 161
 162         sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
 163                (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
 164                (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
 165                (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));
 166
 167         hw[pos + 1] |= sr;
 168 }
 169
 170 static void
 171 emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst)
 172 {
 173         struct nvfx_fragment_program *fp = fpc->fp;
 174         uint32_t *hw = &fp->insn[fpc->inst_offset];
 175
 176         switch (dst.type) {
 177         case NVFXSR_TEMP:
 178                 if (fpc->num_regs < (dst.index + 1))
 179                         fpc->num_regs = dst.index + 1;
 180                 break;
 181         case NVFXSR_OUTPUT:
 182                 if (dst.index == 1) {
 183                         fp->fp_control |= 0xe;
 184                 } else {
 185                         hw[0] |= NVFX_FP_OP_OUT_REG_HALF;
 186                 }
 187                 break;
 188         case NVFXSR_NONE:
 189                 hw[0] |= (1 << 30);
 190                 break;
 191         default:
 192                 assert(0);
 193         }
 194
 195         hw[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
 196 }
 197
 198 static void
 199 nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
 200 {
 201         struct nvfx_fragment_program *fp = fpc->fp;
 202         uint32_t *hw;
 203
 204         fpc->inst_offset = fp->insn_len;
 205         fpc->have_const = 0;
 206         grow_insns(fpc, 4);
 207         hw = &fp->insn[fpc->inst_offset];
 208         memset(hw, 0, sizeof(uint32_t) * 4);
 209
 210         if (insn.op == NVFX_FP_OP_OPCODE_KIL)
 211                 fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
 212         hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT);
 213         hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT);
 214         hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT);
 215
 216         if (insn.sat)
 217                 hw[0] |= NVFX_FP_OP_OUT_SAT;
 218
 219         if (insn.cc_update)
 220                 hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
 221         hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT);
 222         hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
 223                   (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
 224                   (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
 225                   (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
 226
 227         if(insn.unit >= 0)
 228         {
 229                 hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
 230                 fp->samplers |= (1 << insn.unit);
 231         }
 232
 233         emit_dst(fpc, insn.dst);
 234         emit_src(fpc, 0, insn.src[0]);
 235         emit_src(fpc, 1, insn.src[1]);
 236         emit_src(fpc, 2, insn.src[2]);
 237 }
 238
/*
 * arith(s, o, d, m, s0, s1, s2): build an nvfx_insn for opcode
 * NVFX_FP_OP_OPCODE_<o> with -1 in the slot where tex() passes its texture
 * unit. Callers in this file pass the saturate flag as s, the destination
 * register as d, the write mask as m and the three sources as s0..s2.
 */
#define arith(s,o,d,m,s0,s1,s2) \
       nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
                       (d), (m), (s0), (s1), (s2))

/*
 * tex(s, o, u, d, m, s0, s1, s2): like arith() but passes texture unit u.
 * NOTE: s1 and s2 are accepted but ignored; the second and third sources
 * are always `none`, which must be a variable in scope where the macro is
 * expanded.
 */
#define tex(s,o,u,d,m,s0,s1,s2) \
        nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
                   (d), (m), (s0), none, none)
 246
 247 /* IF src.x != 0, as TGSI specifies */
 248 static void
 249 nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
 250 {
 251         const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
 252         struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
 253         uint32_t *hw;
 254         insn.cc_update = 1;
 255         nvfx_fp_emit(fpc, insn);
 256
 257         fpc->inst_offset = fpc->fp->insn_len;
 258         grow_insns(fpc, 4);
 259         hw = &fpc->fp->insn[fpc->inst_offset];
 260         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
 261         hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
 262                 NV40_FP_OP_OUT_NONE |
 263                 (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
 264         /* Use .xxxx swizzle so that we check only src[0].x*/
 265         hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
 266                         (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
 267                         (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
 268                         (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
 269                         (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
 270         hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
 271         hw[3] = 0; /* | endif_offset */
 272         util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
 273 }
 274
 275 /* IF src.x != 0, as TGSI specifies */
 276 static void
 277 nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
 278 {
 279         struct nvfx_relocation reloc;
 280         uint32_t *hw;
 281         fpc->inst_offset = fpc->fp->insn_len;
 282         grow_insns(fpc, 4);
 283         hw = &fpc->fp->insn[fpc->inst_offset];
 284         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
 285         hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
 286         /* Use .xxxx swizzle so that we check only src[0].x*/
 287         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
 288                         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
 289         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
 290         hw[3] = 0;
 291         reloc.target = target;
 292         reloc.location = fpc->inst_offset + 2;
 293         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
 294 }
 295
 296 static void
 297 nv40_fp_ret(struct nvfx_fpc *fpc)
 298 {
 299         uint32_t *hw;
 300         fpc->inst_offset = fpc->fp->insn_len;
 301         grow_insns(fpc, 4);
 302         hw = &fpc->fp->insn[fpc->inst_offset];
 303         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
 304         hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
 305         /* Use .xxxx swizzle so that we check only src[0].x*/
 306         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
 307                         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
 308         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
 309         hw[3] = 0;
 310 }
 311
 312 static void
 313 nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
 314 {
 315         struct nvfx_relocation reloc;
 316         uint32_t *hw;
 317         fpc->inst_offset = fpc->fp->insn_len;
 318         grow_insns(fpc, 4);
 319         hw = &fpc->fp->insn[fpc->inst_offset];
 320         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
 321         hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
 322                         NV40_FP_OP_OUT_NONE |
 323                         (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
 324         /* Use .xxxx swizzle so that we check only src[0].x*/
 325         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
 326                         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
 327         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
 328                         (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
 329                         (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
 330                         (count << NV40_FP_OP_REP_COUNT3_SHIFT);
 331         hw[3] = 0; /* | end_offset */
 332         reloc.target = target;
 333         reloc.location = fpc->inst_offset + 3;
 334         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
 335         //util_dynarray_append(&fpc->loop_stack, unsigned, target);
 336 }
 337
 338 /* warning: this only works forward, and probably only if not inside any IF */
 339 static void
 340 nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
 341 {
 342         struct nvfx_relocation reloc;
 343         uint32_t *hw;
 344         fpc->inst_offset = fpc->fp->insn_len;
 345         grow_insns(fpc, 4);
 346         hw = &fpc->fp->insn[fpc->inst_offset];
 347         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
 348         hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
 349                 NV40_FP_OP_OUT_NONE |
 350                 (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
 351         /* Use .xxxx swizzle so that we check only src[0].x*/
 352         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
 353                         (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
 354         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
 355         hw[3] = 0; /* | endif_offset */
 356         reloc.target = target;
 357         reloc.location = fpc->inst_offset + 2;
 358         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
 359         reloc.target = target;
 360         reloc.location = fpc->inst_offset + 3;
 361         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
 362 }
 363
 364 static void
 365 nv40_fp_brk(struct nvfx_fpc *fpc)
 366 {
 367         uint32_t *hw;
 368         fpc->inst_offset = fpc->fp->insn_len;
 369         grow_insns(fpc, 4);
 370         hw = &fpc->fp->insn[fpc->inst_offset];
 371         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
 372         hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
 373                 NV40_FP_OP_OUT_NONE;
 374         /* Use .xxxx swizzle so that we check only src[0].x*/
 375         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
 376                         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
 377         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
 378         hw[3] = 0;
 379 }
 380
 381 static INLINE struct nvfx_src
 382 tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 383 {
 384         struct nvfx_src src;
 385
 386         switch (fsrc->Register.File) {
 387         case TGSI_FILE_INPUT:
 388                 if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_POSITION) {
 389                         assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
 390                         src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_POSITION);
 391                 } else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_COLOR) {
 392                         if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0)
 393                                 src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL0);
 394                         else if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 1)
 395                                 src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL1);
 396                         else
 397                                 assert(0);
 398                 } else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG) {
 399                         assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
 400                         src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_FOGC);
 401                 } else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FACE) {
 402                         /* TODO: check this has the correct values */
 403                         /* XXX: what do we do for nv30 here (assuming it lacks facing)?!  */
 404                         assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
 405                         src.reg = nvfx_reg(NVFXSR_INPUT, NV40_FP_OP_INPUT_SRC_FACING);
 406                 } else {
 407                         assert(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_GENERIC);
 408                         src.reg = nvfx_reg(NVFXSR_RELOCATED, fpc->generic_to_slot[fpc->pfp->info.input_semantic_index[fsrc->Register.Index]]);
 409                 }
 410                 break;
 411         case TGSI_FILE_CONSTANT:
 412                 src.reg = constant(fpc, fsrc->Register.Index, NULL);
 413                 break;
 414         case TGSI_FILE_IMMEDIATE:
 415                 assert(fsrc->Register.Index < fpc->nr_imm);
 416                 src.reg = fpc->imm[fsrc->Register.Index];
 417                 break;
 418         case TGSI_FILE_TEMPORARY:
 419                 src.reg = fpc->r_temp[fsrc->Register.Index];
 420                 break;
 421         /* NV40 fragprog result regs are just temps, so this is simple */
 422         case TGSI_FILE_OUTPUT:
 423                 src.reg = fpc->r_result[fsrc->Register.Index];
 424                 break;
 425         default:
 426                 NOUVEAU_ERR("bad src file\n");
 427                 src.reg.index = 0;
 428                 src.reg.type = 0;
 429                 break;
 430         }
 431
 432         src.abs = fsrc->Register.Absolute;
 433         src.negate = fsrc->Register.Negate;
 434         src.swz[0] = fsrc->Register.SwizzleX;
 435         src.swz[1] = fsrc->Register.SwizzleY;
 436         src.swz[2] = fsrc->Register.SwizzleZ;
 437         src.swz[3] = fsrc->Register.SwizzleW;
 438         return src;
 439 }
 440
 441 static INLINE struct nvfx_reg
 442 tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
 443         switch (fdst->Register.File) {
 444         case TGSI_FILE_OUTPUT:
 445                 return fpc->r_result[fdst->Register.Index];
 446         case TGSI_FILE_TEMPORARY:
 447                 return fpc->r_temp[fdst->Register.Index];
 448         case TGSI_FILE_NULL:
 449                 return nvfx_reg(NVFXSR_NONE, 0);
 450         default:
 451                 NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
 452                 return nvfx_reg(NVFXSR_NONE, 0);
 453         }
 454 }
 455
 456 static INLINE int
 457 tgsi_mask(uint tgsi)
 458 {
 459         int mask = 0;
 460
 461         if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_FP_MASK_X;
 462         if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_FP_MASK_Y;
 463         if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_FP_MASK_Z;
 464         if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_FP_MASK_W;
 465         return mask;
 466 }
 467
 468 static boolean
 469 nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 470                                 const struct tgsi_full_instruction *finst)
 471 {
 472         const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
 473         struct nvfx_insn insn;
 474         struct nvfx_src src[3], tmp, tmp2;
 475         struct nvfx_reg dst;
 476         int mask, sat, unit = 0;
 477         int ai = -1, ci = -1, ii = -1;
 478         int i;
 479
 480         if (finst->Instruction.Opcode == TGSI_OPCODE_END)
 481                 return TRUE;
 482
 483         for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
 484                 const struct tgsi_full_src_register *fsrc;
 485
 486                 fsrc = &finst->Src[i];
 487                 if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
 488                         src[i] = tgsi_src(fpc, fsrc);
 489                 }
 490         }
 491
 492         for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
 493                 const struct tgsi_full_src_register *fsrc;
 494
 495                 fsrc = &finst->Src[i];
 496
 497                 switch (fsrc->Register.File) {
 498                 case TGSI_FILE_INPUT:
 499                         if (ai == -1 || ai == fsrc->Register.Index) {
 500                                 ai = fsrc->Register.Index;
 501                                 src[i] = tgsi_src(fpc, fsrc);
 502                         } else {
 503                                 src[i] = nvfx_src(temp(fpc));
 504                                 nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 505                         }
 506                         break;
 507                 case TGSI_FILE_CONSTANT:
 508                         if ((ci == -1 && ii == -1) ||
 509                             ci == fsrc->Register.Index) {
 510                                 ci = fsrc->Register.Index;
 511                                 src[i] = tgsi_src(fpc, fsrc);
 512                         } else {
 513                                 src[i] = nvfx_src(temp(fpc));
 514                                 nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 515                         }
 516                         break;
 517                 case TGSI_FILE_IMMEDIATE:
 518                         if ((ci == -1 && ii == -1) ||
 519                             ii == fsrc->Register.Index) {
 520                                 ii = fsrc->Register.Index;
 521                                 src[i] = tgsi_src(fpc, fsrc);
 522                         } else {
 523                                 src[i] = nvfx_src(temp(fpc));
 524                                 nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 525                         }
 526                         break;
 527                 case TGSI_FILE_TEMPORARY:
 528                         /* handled above */
 529                         break;
 530                 case TGSI_FILE_SAMPLER:
 531                         unit = fsrc->Register.Index;
 532                         break;
 533                 case TGSI_FILE_OUTPUT:
 534                         break;
 535                 default:
 536                         NOUVEAU_ERR("bad src file\n");
 537                         return FALSE;
 538                 }
 539         }
 540
 541         dst  = tgsi_dst(fpc, &finst->Dst[0]);
 542         mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
 543         sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
 544
 545         switch (finst->Instruction.Opcode) {
 546         case TGSI_OPCODE_ABS:
 547                 nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, abs(src[0]), none, none));
 548                 break;
 549         case TGSI_OPCODE_ADD:
 550                 nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
 551                 break;
 552         case TGSI_OPCODE_CMP:
 553                 insn = arith(0, MOV, none.reg, mask, src[0], none, none);
 554                 insn.cc_update = 1;
 555                 nvfx_fp_emit(fpc, insn);
 556
 557                 insn = arith(sat, MOV, dst, mask, src[2], none, none);
 558                 insn.cc_test = NVFX_COND_GE;
 559                 nvfx_fp_emit(fpc, insn);
 560
 561                 insn = arith(sat, MOV, dst, mask, src[1], none, none);
 562                 insn.cc_test = NVFX_COND_LT;
 563                 nvfx_fp_emit(fpc, insn);
 564                 break;
 565         case TGSI_OPCODE_COS:
 566                 nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none));
 567                 break;
 568         case TGSI_OPCODE_DDX:
 569                 if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
 570                         tmp = nvfx_src(temp(fpc));
 571                         nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
 572                         nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
 573                         nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
 574                         nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
 575                 } else {
 576                         nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none));
 577                 }
 578                 break;
 579         case TGSI_OPCODE_DDY:
 580                 if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
 581                         tmp = nvfx_src(temp(fpc));
 582                         nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
 583                         nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
 584                         nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
 585                         nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
 586                 } else {
 587                         nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none));
 588                 }
 589                 break;
 590         case TGSI_OPCODE_DP2:
 591                 tmp = nvfx_src(temp(fpc));
 592                 nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
 593                 nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
 594                 break;
 595         case TGSI_OPCODE_DP3:
 596                 nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
 597                 break;
 598         case TGSI_OPCODE_DP4:
 599                 nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
 600                 break;
 601         case TGSI_OPCODE_DPH:
 602                 tmp = nvfx_src(temp(fpc));
 603                 nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none));
 604                 nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none));
 605                 break;
 606         case TGSI_OPCODE_DST:
 607                 nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
 608                 break;
 609         case TGSI_OPCODE_EX2:
 610                 nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
 611                 break;
 612         case TGSI_OPCODE_FLR:
 613                 nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
 614                 break;
 615         case TGSI_OPCODE_FRC:
 616                 nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none));
 617                 break;
 618         case TGSI_OPCODE_KILP:
 619                 nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none));
 620                 break;
 621         case TGSI_OPCODE_KIL:
 622                 insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none);
 623                 insn.cc_update = 1;
 624                 nvfx_fp_emit(fpc, insn);
 625
 626                 insn = arith(0, KIL, none.reg, 0, none, none, none);
 627                 insn.cc_test = NVFX_COND_LT;
 628                 nvfx_fp_emit(fpc, insn);
 629                 break;
 630         case TGSI_OPCODE_LG2:
 631                 nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none));
 632                 break;
 633         case TGSI_OPCODE_LIT:
 634                 if(!nvfx->is_nv4x)
 635                         nvfx_fp_emit(fpc, arith(sat, LIT_NV30, dst, mask, src[0], src[1], src[2]));
 636                 else {
 637                         /* we use FLT_MIN, so that log2 never gives -infinity, and thus multiplication by
 638                          * specular 0 always gives 0, so that ex2 gives 1, to satisfy the 0^0 = 1 requirement
 639                          *
 640                          * NOTE: if we start using half precision, we might need an fp16 FLT_MIN here instead
 641                          */
 642                         float maxv[4] = {0, FLT_MIN, 0, 0};
 643                         struct nvfx_src maxs = nvfx_src(constant(fpc, -1, maxv));
 644                         tmp = nvfx_src(temp(fpc));
 645                         if (ci>= 0 || ii >= 0) {
 646                                 nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, maxs, none, none));
 647                                 maxs = tmp;
 648                         }
 649                         nvfx_fp_emit(fpc, arith(0, MAX, tmp.reg, NVFX_FP_MASK_Y | NVFX_FP_MASK_W, swz(src[0], X, X, X, Y), swz(maxs, X, X, Y, Y), none));
 650                         nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), none, none));
 651                         nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), swz(src[0], W, W, W, W), none));
 652                         nvfx_fp_emit(fpc, arith(sat, LITEX2_NV40, dst, mask, swz(tmp, Y, Y, W, W), none, none));
 653                 }
 654                 break;
 655         case TGSI_OPCODE_LRP:
 656                 if(!nvfx->is_nv4x)
 657                         nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2]));
 658                 else {
 659                         tmp = nvfx_src(temp(fpc));
 660                         nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
 661                         nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp));
 662                 }
 663                 break;
 664         case TGSI_OPCODE_MAD:
 665                 nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2]));
 666                 break;
 667         case TGSI_OPCODE_MAX:
 668                 nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none));
 669                 break;
 670         case TGSI_OPCODE_MIN:
 671                 nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none));
 672                 break;
 673         case TGSI_OPCODE_MOV:
 674                 nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none));
 675                 break;
 676         case TGSI_OPCODE_MUL:
 677                 nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none));
 678                 break;
 679         case TGSI_OPCODE_NOP:
 680                 break;
 681         case TGSI_OPCODE_POW:
 682                 if(!nvfx->is_nv4x)
 683                         nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none));
 684                 else {
 685                         tmp = nvfx_src(temp(fpc));
 686                         nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
 687                         nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
 688                         nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none));
 689                 }
 690                 break;
 691         case TGSI_OPCODE_RCP:
 692                 nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
 693                 break;
 694         case TGSI_OPCODE_RFL:
 695                 if(!nvfx->is_nv4x)
 696                         nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none));
 697                 else {
 698                         tmp = nvfx_src(temp(fpc));
 699                         nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[0], none));
 700                         nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_Y, src[0], src[1], none));
 701                         insn = arith(0, DIV, tmp.reg, NVFX_FP_MASK_Z, swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
 702                         insn.scale = NVFX_FP_OP_DST_SCALE_2X;
 703                         nvfx_fp_emit(fpc, insn);
 704                         nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])));
 705                 }
 706                 break;
 707         case TGSI_OPCODE_RSQ:
 708                 if(!nvfx->is_nv4x)
 709                         nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none));
 710                 else {
 711                         tmp = nvfx_src(temp(fpc));
 712                         insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none);
 713                         insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X;
 714                         nvfx_fp_emit(fpc, insn);
 715                         nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none));
 716                 }
 717                 break;
 718         case TGSI_OPCODE_SCS:
 719                 /* avoid overwriting the source */
 720                 if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
 721                 {
 722                         if (mask & NVFX_FP_MASK_X)
 723                                 nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
 724                         if (mask & NVFX_FP_MASK_Y)
 725                                 nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
 726                 }
 727                 else
 728                 {
 729                         if (mask & NVFX_FP_MASK_Y)
 730                                 nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
 731                         if (mask & NVFX_FP_MASK_X)
 732                                 nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
 733                 }
 734                 break;
 735         case TGSI_OPCODE_SEQ:
 736                 nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none));
 737                 break;
 738         case TGSI_OPCODE_SFL:
 739                 nvfx_fp_emit(fpc, arith(sat, SFL, dst, mask, src[0], src[1], none));
 740                 break;
 741         case TGSI_OPCODE_SGE:
 742                 nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none));
 743                 break;
 744         case TGSI_OPCODE_SGT:
 745                 nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none));
 746                 break;
 747         case TGSI_OPCODE_SIN:
 748                 nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none));
 749                 break;
 750         case TGSI_OPCODE_SLE:
 751                 nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none));
 752                 break;
 753         case TGSI_OPCODE_SLT:
 754                 nvfx_fp_emit(fpc, arith(sat, SLT, dst, mask, src[0], src[1], none));
 755                 break;
 756         case TGSI_OPCODE_SNE:
 757                 nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none));
 758                 break;
 759         case TGSI_OPCODE_SSG:
 760         {
 761                 float minonesv[4] = {-1.0, -1.0, -1.0, -1.0};
 762                 struct nvfx_src minones = swz(nvfx_src(constant(fpc, -1, minonesv)), X, X, X, X);
 763
 764                 insn = arith(sat, MOV, dst, mask, src[0], none, none);
 765                 insn.cc_update = 1;
 766                 nvfx_fp_emit(fpc, insn);
 767
 768                 insn = arith(0, STR, dst, mask, none, none, none);
 769                 insn.cc_test = NVFX_COND_GT;
 770                 nvfx_fp_emit(fpc, insn);
 771
 772                 if(!sat) {
 773                         insn = arith(0, MOV, dst, mask, minones, none, none);
 774                         insn.cc_test = NVFX_COND_LT;
 775                         nvfx_fp_emit(fpc, insn);
 776                 }
 777                 break;
 778         }
 779         case TGSI_OPCODE_STR:
 780                 nvfx_fp_emit(fpc, arith(sat, STR, dst, mask, src[0], src[1], none));
 781                 break;
 782         case TGSI_OPCODE_SUB:
 783                 nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], neg(src[1]), none));
 784                 break;
 785         case TGSI_OPCODE_TEX:
 786                 nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
 787                 break;
 788         case TGSI_OPCODE_TRUNC:
 789                 tmp = nvfx_src(temp(fpc));
 790                 insn = arith(0, MOV, none.reg, mask, src[0], none, none);
 791                 insn.cc_update = 1;
 792                 nvfx_fp_emit(fpc, insn);
 793
 794                 nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
 795                 nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none));
 796
 797                 insn = arith(sat, MOV, dst, mask, neg(tmp), none, none);
 798                 insn.cc_test = NVFX_COND_LT;
 799                 nvfx_fp_emit(fpc, insn);
 800                 break;
 801         case TGSI_OPCODE_TXB:
 802                 nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none));
 803                 break;
 804         case TGSI_OPCODE_TXL:
 805                 if(nvfx->is_nv4x)
 806                         nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none));
 807                 else /* unsupported on nv30, use TEX and hope they like it */
 808                         nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
 809                 break;
 810         case TGSI_OPCODE_TXP:
 811                 nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
 812                 break;
 813         case TGSI_OPCODE_XPD:
 814                 tmp = nvfx_src(temp(fpc));
 815                 nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
 816                 nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
 817                 break;
 818
 819         case TGSI_OPCODE_IF:
 820                 // MOVRC0 R31 (TR0.xyzw), R<src>:
 821                 // IF (NE.xxxx) ELSE <else> END <end>
 822                 if(!nvfx->is_nv4x)
 823                         goto nv3x_cflow;
 824                 nv40_fp_if(fpc, src[0]);
 825                 break;
 826
 827         case TGSI_OPCODE_ELSE:
 828         {
 829                 uint32_t *hw;
 830                 if(!nvfx->is_nv4x)
 831                         goto nv3x_cflow;
 832                 assert(util_dynarray_contains(&fpc->if_stack, unsigned));
 833                 hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
 834                 hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
 835                 break;
 836         }
 837
 838         case TGSI_OPCODE_ENDIF:
 839         {
 840                 uint32_t *hw;
 841                 if(!nvfx->is_nv4x)
 842                         goto nv3x_cflow;
 843                 assert(util_dynarray_contains(&fpc->if_stack, unsigned));
 844                 hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
 845                 if(!hw[2])
 846                         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
 847                 hw[3] = fpc->fp->insn_len;
 848                 break;
 849         }
 850
 851         case TGSI_OPCODE_BRA:
 852                 /* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */
 853                 /* no state tracker uses this, so don't implement this for now */
 854                 assert(0);
 855                 nv40_fp_bra(fpc, finst->Label.Label);
 856                 break;
 857
 858         case TGSI_OPCODE_BGNSUB:
 859         case TGSI_OPCODE_ENDSUB:
 860                 /* nothing to do here */
 861                 break;
 862
 863         case TGSI_OPCODE_CAL:
 864                 if(!nvfx->is_nv4x)
 865                         goto nv3x_cflow;
 866                 nv40_fp_cal(fpc, finst->Label.Label);
 867                 break;
 868
 869         case TGSI_OPCODE_RET:
 870                 if(!nvfx->is_nv4x)
 871                         goto nv3x_cflow;
 872                 nv40_fp_ret(fpc);
 873                 break;
 874
 875         case TGSI_OPCODE_BGNLOOP:
 876                 if(!nvfx->is_nv4x)
 877                         goto nv3x_cflow;
 878                 /* TODO: we should support using two nested REPs to allow a > 255 iteration count */
 879                 nv40_fp_rep(fpc, 255, finst->Label.Label);
 880                 break;
 881
 882         case TGSI_OPCODE_ENDLOOP:
 883                 break;
 884
 885         case TGSI_OPCODE_BRK:
 886                 if(!nvfx->is_nv4x)
 887                         goto nv3x_cflow;
 888                 nv40_fp_brk(fpc);
 889                 break;
 890
 891         case TGSI_OPCODE_CONT:
 892         {
 893                 static int warned = 0;
 894                 if(!warned) {
 895                         NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
 896                         warned = 1;
 897                 }
 898                 break;
 899         }
 900
 901         default:
 902                 NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
 903                 return FALSE;
 904         }
 905
 906 out:
 907         release_temps(fpc);
 908         return TRUE;
 909 nv3x_cflow:
 910         {
 911                 static int warned = 0;
 912                 if(!warned) {
 913                         NOUVEAU_ERR(
 914                                         "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
 915                                         "If rendering is incorrect, try to disable GLSL support in the application.\n");
 916                         warned = 1;
 917                 }
 918         }
 919         goto out;
 920 }
 921
 922 static boolean
 923 nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 924                                 const struct tgsi_full_declaration *fdec)
 925 {
 926         unsigned idx = fdec->Range.First;
 927         unsigned hw;
 928
 929         switch (fdec->Semantic.Name) {
 930         case TGSI_SEMANTIC_POSITION:
 931                 hw = 1;
 932                 break;
 933         case TGSI_SEMANTIC_COLOR:
 934                 hw = ~0;
 935                 switch (fdec->Semantic.Index) {
 936                 case 0: hw = 0; break;
 937                 case 1: hw = 2; break;
 938                 case 2: hw = 3; break;
 939                 case 3: hw = 4; break;
 940                 }
 941                 if(hw > ((nvfx->is_nv4x) ? 4 : 2)) {
 942                         NOUVEAU_ERR("bad rcol index\n");
 943                         return FALSE;
 944                 }
 945                 break;
 946         default:
 947                 NOUVEAU_ERR("bad output semantic\n");
 948                 return FALSE;
 949         }
 950
 951         fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
 952         fpc->r_temps |= (1ULL << hw);
 953         return TRUE;
 954 }
 955
 956 static boolean
 957 nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 958 {
 959         struct tgsi_parse_context p;
 960         int high_temp = -1, i;
 961         struct util_semantic_set set;
 962         float const0v[4] = {0, 0, 0, 0};
 963         struct nvfx_reg const0;
 964         unsigned num_texcoords = nvfx->is_nv4x ? 10 : 8;
 965
 966         fpc->fp->num_slots = util_semantic_set_from_program_file(&set, fpc->pfp->pipe.tokens, TGSI_FILE_INPUT);
 967         if(fpc->fp->num_slots > num_texcoords)
 968                 return FALSE;
 969         util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, num_texcoords);
 970         util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, num_texcoords);
 971
 972         memset(fpc->fp->slot_to_fp_input, 0xff, sizeof(fpc->fp->slot_to_fp_input));
 973
 974         const0 = constant(fpc, -1, const0v);
 975         assert(const0.index == 0);
 976
 977         tgsi_parse_init(&p, fpc->pfp->pipe.tokens);
 978         while (!tgsi_parse_end_of_tokens(&p)) {
 979                 const union tgsi_full_token *tok = &p.FullToken;
 980
 981                 tgsi_parse_token(&p);
 982                 switch(tok->Token.Type) {
 983                 case TGSI_TOKEN_TYPE_DECLARATION:
 984                 {
 985                         const struct tgsi_full_declaration *fdec;
 986                         fdec = &p.FullToken.FullDeclaration;
 987                         switch (fdec->Declaration.File) {
 988                         case TGSI_FILE_OUTPUT:
 989                                 if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec))
 990                                         goto out_err;
 991                                 break;
 992                         case TGSI_FILE_TEMPORARY:
 993                                 if (fdec->Range.Last > high_temp) {
 994                                         high_temp =
 995                                                 fdec->Range.Last;
 996                                 }
 997                                 break;
 998                         default:
 999                                 break;
1000                         }
1001                 }
1002                         break;
1003                 case TGSI_TOKEN_TYPE_IMMEDIATE:
1004                 {
1005                         struct tgsi_full_immediate *imm;
1006                         float vals[4];
1007
1008                         imm = &p.FullToken.FullImmediate;
1009                         assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
1010                         assert(fpc->nr_imm < MAX_IMM);
1011
1012                         vals[0] = imm->u[0].Float;
1013                         vals[1] = imm->u[1].Float;
1014                         vals[2] = imm->u[2].Float;
1015                         vals[3] = imm->u[3].Float;
1016                         fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
1017                 }
1018                         break;
1019                 default:
1020                         break;
1021                 }
1022         }
1023         tgsi_parse_free(&p);
1024
1025         if (++high_temp) {
1026                 fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
1027                 for (i = 0; i < high_temp; i++)
1028                         fpc->r_temp[i] = temp(fpc);
1029                 fpc->r_temps_discard = 0ULL;
1030         }
1031
1032         return TRUE;
1033
1034 out_err:
1035         if (fpc->r_temp) {
1036                 FREE(fpc->r_temp);
1037                 fpc->r_temp = NULL;
1038         }
1039         tgsi_parse_free(&p);
1040         return FALSE;
1041 }
1042
1043 DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
1044
1045 static struct nvfx_fragment_program*
1046 nvfx_fragprog_translate(struct nvfx_context *nvfx,
1047                         struct nvfx_pipe_fragment_program *pfp,
1048                         boolean emulate_sprite_flipping)
1049 {
1050         struct tgsi_parse_context parse;
1051         struct nvfx_fpc *fpc = NULL;
1052         struct util_dynarray insns;
1053         struct nvfx_fragment_program* fp = NULL;
1054         const int min_size = 4096;
1055
1056         fp = CALLOC_STRUCT(nvfx_fragment_program);
1057         if(!fp)
1058                 goto out_err;
1059
1060         fpc = CALLOC_STRUCT(nvfx_fpc);
1061         if (!fpc)
1062                 goto out_err;
1063
1064         fpc->max_temps = nvfx->is_nv4x ? 48 : 32;
1065         fpc->pfp = pfp;
1066         fpc->fp = fp;
1067         fpc->num_regs = 2;
1068
1069         for (unsigned i = 0; i < pfp->info.num_properties; ++i) {
1070                 if (pfp->info.properties[i].name == TGSI_PROPERTY_FS_COORD_ORIGIN) {
1071                         if(pfp->info.properties[i].data[0])
1072                                 fp->coord_conventions |= NV34TCL_COORD_CONVENTIONS_ORIGIN_INVERTED;
1073                 } else if (pfp->info.properties[i].name == TGSI_PROPERTY_FS_COORD_PIXEL_CENTER) {
1074                         if(pfp->info.properties[i].data[0])
1075                                 fp->coord_conventions |= NV34TCL_COORD_CONVENTIONS_CENTER_INTEGER;
1076                 }
1077         }
1078
1079         if (!nvfx_fragprog_prepare(nvfx, fpc))
1080                 goto out_err;
1081
1082         tgsi_parse_init(&parse, pfp->pipe.tokens);
1083         util_dynarray_init(&insns);
1084
1085         if(emulate_sprite_flipping)
1086         {
1087                 struct nvfx_reg reg = temp(fpc);
1088                 struct nvfx_src sprite_input = nvfx_src(nvfx_reg(NVFXSR_RELOCATED, fp->num_slots));
1089                 float v[4] = {1, -1, 0, 0};
1090                 struct nvfx_src imm = nvfx_src(constant(fpc, -1, v));
1091
1092                 fpc->sprite_coord_temp = reg.index;
1093                 fpc->r_temps_discard = 0ULL;
1094                 nvfx_fp_emit(fpc, arith(0, MAD, reg, NVFX_FP_MASK_ALL, sprite_input, swz(imm, X, Y, X, X), swz(imm, Z, X, Z, Z)));
1095         }
1096
1097         while (!tgsi_parse_end_of_tokens(&parse)) {
1098                 tgsi_parse_token(&parse);
1099
1100                 switch (parse.FullToken.Token.Type) {
1101                 case TGSI_TOKEN_TYPE_INSTRUCTION:
1102                 {
1103                         const struct tgsi_full_instruction *finst;
1104
1105                         util_dynarray_append(&insns, unsigned, fp->insn_len);
1106                         finst = &parse.FullToken.FullInstruction;
1107                         if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst))
1108                                 goto out_err;
1109                 }
1110                         break;
1111                 default:
1112                         break;
1113                 }
1114         }
1115         util_dynarray_append(&insns, unsigned, fp->insn_len);
1116
1117         for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
1118         {
1119                 struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i);
1120                 fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
1121         }
1122         util_dynarray_fini(&insns);
1123
1124         if(!nvfx->is_nv4x)
1125                 fp->fp_control |= (fpc->num_regs-1)/2;
1126         else
1127                 fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
1128
1129         /* Terminate final instruction */
1130         if(fp->insn)
1131                 fp->insn[fpc->inst_offset] |= 0x00000001;
1132
1133         /* Append NOP + END instruction for branches to the end of the program */
1134         fpc->inst_offset = fp->insn_len;
1135         grow_insns(fpc, 4);
1136         fp->insn[fpc->inst_offset + 0] = 0x00000001;
1137         fp->insn[fpc->inst_offset + 1] = 0x00000000;
1138         fp->insn[fpc->inst_offset + 2] = 0x00000000;
1139         fp->insn[fpc->inst_offset + 3] = 0x00000000;
1140
1141         if(debug_get_option_nvfx_dump_fp())
1142         {
1143                 debug_printf("\n");
1144                 tgsi_dump(pfp->pipe.tokens, 0);
1145
1146                 debug_printf("\n%s fragment program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
1147                 for (unsigned i = 0; i < fp->insn_len; i += 4)
1148                         debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]);
1149                 debug_printf("\n");
1150         }
1151
1152         fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
1153
1154         if(fp->prog_size >= min_size)
1155                 fp->progs_per_bo = 1;
1156         else
1157                 fp->progs_per_bo = min_size / fp->prog_size;
1158         fp->bo_prog_idx = fp->progs_per_bo - 1;
1159
1160 out:
1161         tgsi_parse_free(&parse);
1162         if(fpc)
1163         {
1164                 if (fpc->r_temp)
1165                         FREE(fpc->r_temp);
1166                 util_dynarray_fini(&fpc->if_stack);
1167                 util_dynarray_fini(&fpc->label_relocs);
1168                 //util_dynarray_fini(&fpc->loop_stack);
1169                 FREE(fpc);
1170         }
1171         return fp;
1172
1173 out_err:
1174         _debug_printf("Error: failed to compile this fragment program:\n");
1175         tgsi_dump(pfp->pipe.tokens, 0);
1176
1177         if(fp)
1178         {
1179                 FREE(fp);
1180                 fp = NULL;
1181         }
1182         goto out;
1183 }
1184
/**
 * Copy len bytes of fragment program data from src to dst.
 *
 * Without WORDS_BIGENDIAN this is a plain memcpy(). With WORDS_BIGENDIAN
 * the data is copied one 32-bit word at a time with the two 16-bit halves
 * of each word exchanged.
 *
 * The word-wise path advances in steps of 4 bytes and accesses src/dst as
 * uint32_t, so len is expected to be a multiple of 4 and both buffers
 * suitably aligned -- NOTE(review): confirm all callers satisfy this.
 */
static inline void
nvfx_fp_memcpy(void* dst, const void* src, size_t len)
{
#ifndef WORDS_BIGENDIAN
        memcpy(dst, src, len);
#else
        size_t i;
        for(i = 0; i < len; i += 4) {
                /* load the word itself, not the address of the word */
                uint32_t v = *(const uint32_t*)((const char*)src + i);
                *(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16);
        }
#endif
}
1198
1199 /* The hardware only supports immediate constants inside the fragment program,
1200  * and at least on nv30 doesn't support an indirect linkage table.
1201  *
1202  * Hence, we need to patch the fragment program itself both to update constants
1203  * and update linkage.
1204  *
1205  * Using a single fragment program would entail unacceptable stalls if the GPU is
1206  * already rendering with that fragment program.
1207  * Thus, we instead use a "rotating queue" of buffer objects, each of which is
1208  * packed with multiple versions of the same program.
1209  *
1210  * Whenever we need to patch something, we move to the next program and
1211  * patch it. If all buffer objects are in use by the GPU, we allocate another one,
1212  * expanding the queue.
1213  *
1214  * As an additional optimization, we record when all the programs have the
1215  * current input slot configuration, and at that point we stop patching inputs.
1216  * This happens, for instance, if a given fragment program is always used with
1217  * the same vertex program (i.e. always with GLSL), or if the layouts match
1218  * enough (non-GLSL).
1219  *
1220  * Note that instead of using multiple programs, we could push commands
1221  * on the FIFO to patch a single program: it's not fully clear which option is
1222  * faster, but my guess is that the current way is faster.
1223  *
1224  * We also track the previous slot assignments for each version and don't
1225  * patch if they are the same (this could perhaps be removed).
1226  */
1227
1228 void
1229 nvfx_fragprog_validate(struct nvfx_context *nvfx)
1230 {
1231         struct nouveau_channel* chan = nvfx->screen->base.channel;
1232         struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
1233         struct nvfx_vertex_program* vp;
1234         /* Gallium always puts the point coord in GENERIC[0]
1235          * TODO: this is wrong, Gallium needs to be fixed
1236          */
1237         unsigned sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * (nvfx->rasterizer->pipe.sprite_coord_enable | 1);
1238
1239         boolean emulate_sprite_flipping = sprite_coord_enable && nvfx->rasterizer->pipe.sprite_coord_mode;
1240         unsigned key = emulate_sprite_flipping;
1241         struct nvfx_fragment_program* fp;
1242
1243         fp = pfp->fps[key];
1244         if (!fp)
1245         {
1246                 fp = nvfx_fragprog_translate(nvfx, pfp, emulate_sprite_flipping);
1247
1248                 if(!fp)
1249                 {
1250                         if(!nvfx->dummy_fs)
1251                         {
1252                                 struct ureg_program *ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
1253                                 if (ureg)
1254                                 {
1255                                         ureg_END( ureg );
1256                                         nvfx->dummy_fs = ureg_create_shader_and_destroy( ureg, &nvfx->pipe );
1257                                 }
1258
1259                                 if(!nvfx->dummy_fs)
1260                                 {
1261                                         _debug_printf("Error: unable to create a dummy fragment shader: aborting.");
1262                                         abort();
1263                                 }
1264                         }
1265
1266                         fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs, FALSE);
1267                         emulate_sprite_flipping = FALSE;
1268
1269                         if(!fp)
1270                         {
1271                                 _debug_printf("Error: unable to compile even a dummy fragment shader: aborting.");
1272                                 abort();
1273                         }
1274                 }
1275
1276                 pfp->fps[key] = fp;
1277         }
1278
1279         vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
1280
1281         if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
1282                 int sprite_real_input = -1;
1283                 int sprite_reloc_input;
1284                 unsigned i;
1285                 fp->last_vp_id = vp->id;
1286                 fp->last_sprite_coord_enable = sprite_coord_enable;
1287
1288                 if(sprite_coord_enable)
1289                 {
1290                         sprite_real_input = vp->sprite_fp_input;
1291                         if(sprite_real_input < 0)
1292                         {
1293                                 unsigned used_texcoords = 0;
1294                                 for(unsigned i = 0; i < fp->num_slots; ++i) {
1295                                         unsigned generic = fp->slot_to_generic[i];
1296                                         if(!((1 << generic) & sprite_coord_enable))
1297                                         {
1298                                                 unsigned char slot_mask = vp->generic_to_fp_input[generic];
1299                                                 if(slot_mask >= 0xf0)
1300                                                         used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0);
1301                                         }
1302                                 }
1303
1304                                 sprite_real_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
1305                         }
1306
1307                         fp->point_sprite_control |= (1 << (sprite_real_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
1308                 }
1309                 else
1310                         fp->point_sprite_control = 0;
1311
1312                 if(emulate_sprite_flipping)
1313                    sprite_reloc_input = 0;
1314                 else
1315                    sprite_reloc_input = sprite_real_input;
1316
1317                 for(i = 0; i < fp->num_slots; ++i) {
1318                         unsigned generic = fp->slot_to_generic[i];
1319                         if((1 << generic) & sprite_coord_enable)
1320                         {
1321                                 if(fp->slot_to_fp_input[i] != sprite_reloc_input)
1322                                         goto update_slots;
1323                         }
1324                         else
1325                         {
1326                                 unsigned char slot_mask = vp->generic_to_fp_input[generic];
1327                                 if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i]))
1328                                         goto update_slots;
1329                         }
1330                 }
1331
1332                 if(emulate_sprite_flipping)
1333                 {
1334                         if(fp->slot_to_fp_input[fp->num_slots] != sprite_real_input)
1335                                 goto update_slots;
1336                 }
1337
1338                 if(0)
1339                 {
1340 update_slots:
1341                         /* optimization: we start updating from the slot we found the first difference in */
1342                         for(; i < fp->num_slots; ++i)
1343                         {
1344                                 unsigned generic = fp->slot_to_generic[i];
1345                                 if((1 << generic) & sprite_coord_enable)
1346                                         fp->slot_to_fp_input[i] = sprite_reloc_input;
1347                                 else
1348                                         fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf;
1349                         }
1350
1351                         fp->slot_to_fp_input[fp->num_slots] = sprite_real_input;
1352
1353                         if(nvfx->is_nv4x)
1354                         {
1355                                 fp->or = 0;
1356                                 for(i = 0; i <= fp->num_slots; ++i) {
1357                                         unsigned fp_input = fp->slot_to_fp_input[i];
1358                                         if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8))
1359                                                 fp->or |= (1 << 12);
1360                                         else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9))
1361                                                 fp->or |= (1 << 13);
1362                                         else if(fp_input >= NVFX_FP_OP_INPUT_SRC_TC(0) && fp_input <= NVFX_FP_OP_INPUT_SRC_TC(7))
1363                                                 fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14));
1364                                 }
1365                         }
1366
1367                         fp->progs_left_with_obsolete_slot_assignments = fp->progs;
1368                         goto update;
1369                 }
1370         }
1371
1372         /* We must update constants even on "just" fragprog changes, because
1373           * we don't check whether the current constant buffer matches the latest
1374           * one bound to this fragment program.
1375           * Doing such a check would likely be a pessimization.
1376           */
1377         if ((nvfx->hw_fragprog != fp) || (nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))) {
1378                 int offset;
1379                 uint32_t* fpmap;
1380
1381 update:
1382                 ++fp->bo_prog_idx;
1383                 if(fp->bo_prog_idx >= fp->progs_per_bo)
1384                 {
1385                         if(fp->fpbo && !nouveau_bo_busy(fp->fpbo->next->bo, NOUVEAU_BO_WR))
1386                         {
1387                                 fp->fpbo = fp->fpbo->next;
1388                         }
1389                         else
1390                         {
1391                                 struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + (fp->prog_size + 8) * fp->progs_per_bo, 16);
1392                                 uint8_t* map;
1393                                 uint8_t* buf;
1394
1395                                 fpbo->slots = (unsigned char*)&fpbo->insn[(fp->prog_size) * fp->progs_per_bo];
1396                                 memset(fpbo->slots, 0, 8 * fp->progs_per_bo);
1397                                 if(fp->fpbo)
1398                                 {
1399                                         fpbo->next = fp->fpbo->next;
1400                                         fp->fpbo->next = fpbo;
1401                                 }
1402                                 else
1403                                         fpbo->next = fpbo;
1404                                 fp->fpbo = fpbo;
1405                                 fpbo->bo = 0;
1406                                 fp->progs += fp->progs_per_bo;
1407                                 fp->progs_left_with_obsolete_slot_assignments += fp->progs_per_bo;
1408                                 nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo);
1409                                 nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC);
1410
1411                                 map = fpbo->bo->map;
1412                                 buf = (uint8_t*)fpbo->insn;
1413                                 for(unsigned i = 0; i < fp->progs_per_bo; ++i)
1414                                 {
1415                                         memcpy(buf, fp->insn, fp->insn_len * 4);
1416                                         nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4);
1417                                         map += fp->prog_size;
1418                                         buf += fp->prog_size;
1419                                 }
1420                         }
1421                         fp->bo_prog_idx = 0;
1422                 }
1423
1424                 offset = fp->bo_prog_idx * fp->prog_size;
1425                 fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
1426
1427                 if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) {
1428                         struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT];
1429                         uint32_t* map = (uint32_t*)nvfx_buffer(constbuf)->data;
1430                         uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
1431                         uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset);
1432                         int i;
1433                         for (i = 0; i < fp->nr_consts; ++i) {
1434                                 unsigned off = fp->consts[i].offset;
1435                                 unsigned idx = fp->consts[i].index * 4;
1436
1437                                 /* TODO: is checking a good idea? */
1438                                 if(memcmp(&buf[off], &map[idx], 4 * sizeof(uint32_t))) {
1439                                         memcpy(&buf[off], &map[idx], 4 * sizeof(uint32_t));
1440                                         nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t));
1441                                 }
1442                         }
1443                 }
1444
1445                 /* we only do this if we aren't sure that all program versions have the
1446                  * current slot assignments, otherwise we just update constants for speed
1447                  */
1448                 if(fp->progs_left_with_obsolete_slot_assignments) {
1449                         unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8];
1450                         /* also relocate sprite coord slot, if any */
1451                         for(unsigned i = 0; i <= fp->num_slots; ++i) {
1452                                 unsigned value = fp->slot_to_fp_input[i];;
1453                                 if(value != fpbo_slots[i]) {
1454                                         unsigned* p;
1455                                         unsigned* begin = (unsigned*)fp->slot_relocations[i].data;
1456                                         unsigned* end = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size);
1457                                         //printf("fp %p reloc slot %u/%u: %u -> %u\n", fp, i, fp->num_slots, fpbo_slots[i], value);
1458                                         if(value == 0)
1459                                         {
1460                                                 /* was relocated to an input, switch type to temporary */
1461                                                 for(p = begin; p != end; ++p) {
1462                                                         unsigned off = *p;
1463                                                         unsigned dw = fp->insn[off];
1464                                                         dw &=~ NVFX_FP_REG_TYPE_MASK;
1465                                                         //printf("reloc_tmp at %x\n", off);
1466                                                         nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
1467                                                 }
1468                                         } else {
1469                                                 if(!fpbo_slots[i])
1470                                                 {
1471                                                         /* was relocated to a temporary, switch type to input */
1472                                                         for(p= begin; p != end; ++p) {
1473                                                                 unsigned off = *p;
1474                                                                 unsigned dw = fp->insn[off];
1475                                                                 //printf("reloc_in at %x\n", off);
1476                                                                 dw |= NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT;
1477                                                                 nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
1478                                                         }
1479                                                 }
1480
1481                                                 /* set the correct input index */
1482                                                 for(p = begin; p != end; ++p) {
1483                                                         unsigned off = *p & ~3;
1484                                                         unsigned dw = fp->insn[off];
1485                                                         //printf("reloc&~3 at %x\n", off);
1486                                                         dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT);
1487                                                         nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
1488                                                 }
1489                                         }
1490                                         fpbo_slots[i] = value;
1491                                 }
1492                         }
1493                         --fp->progs_left_with_obsolete_slot_assignments;
1494                 }
1495
1496                 nvfx->hw_fragprog = fp;
1497
1498                 MARK_RING(chan, 8, 1);
1499                 OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
1500                 OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
1501                               NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
1502                               NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
1503                               NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
1504                 OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1));
1505                 OUT_RING(chan, fp->fp_control);
1506                 if(!nvfx->is_nv4x) {
1507                         OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1));
1508                         OUT_RING(chan, (1<<16)|0x4);
1509                         OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1));
1510                         OUT_RING(chan, fp->samplers);
1511                 }
1512         }
1513
1514         {
1515                 unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization;
1516                 if(pointsprite_control != nvfx->hw_pointsprite_control)
1517                 {
1518                         WAIT_RING(chan, 2);
1519                         OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
1520                         OUT_RING(chan, pointsprite_control);
1521                         nvfx->hw_pointsprite_control = pointsprite_control;
1522                 }
1523         }
1524
1525         nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG;
1526 }
1527
1528 void
1529 nvfx_fragprog_relocate(struct nvfx_context *nvfx)
1530 {
1531         struct nouveau_channel* chan = nvfx->screen->base.channel;
1532         struct nvfx_fragment_program *fp = nvfx->hw_fragprog;
1533         struct nouveau_bo* bo = fp->fpbo->bo;
1534         int offset = fp->bo_prog_idx * fp->prog_size;
1535         unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART?
1536         fp_flags |= NOUVEAU_BO_DUMMY;
1537         MARK_RING(chan, 2, 2);
1538         OUT_RELOC(chan, bo, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1), fp_flags, 0, 0);
1539         OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW |
1540                       NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
1541                       NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
1542         nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG;
1543 }
1544
1545 void
1546 nvfx_fragprog_destroy(struct nvfx_context *nvfx,
1547                       struct nvfx_fragment_program *fp)
1548 {
1549         unsigned i;
1550         struct nvfx_fragment_program_bo* fpbo = fp->fpbo;
1551         if(fpbo)
1552         {
1553                 do
1554                 {
1555                         struct nvfx_fragment_program_bo* next = fpbo->next;
1556                         nouveau_bo_unmap(fpbo->bo);
1557                         nouveau_bo_ref(0, &fpbo->bo);
1558                         free(fpbo);
1559                         fpbo = next;
1560                 }
1561                 while(fpbo != fp->fpbo);
1562         }
1563
1564         for(i = 0; i < Elements(fp->slot_relocations); ++i)
1565                 util_dynarray_fini(&fp->slot_relocations[i]);
1566
1567         if (fp->insn_len)
1568                 FREE(fp->insn);
1569 }
1570
1571 static void *
1572 nvfx_fp_state_create(struct pipe_context *pipe,
1573                      const struct pipe_shader_state *cso)
1574 {
1575         struct nvfx_pipe_fragment_program *pfp;
1576
1577         pfp = CALLOC(1, sizeof(struct nvfx_pipe_fragment_program));
1578         pfp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
1579
1580         tgsi_scan_shader(pfp->pipe.tokens, &pfp->info);
1581
1582         return (void *)pfp;
1583 }
1584
1585 static void
1586 nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
1587 {
1588         struct nvfx_context *nvfx = nvfx_context(pipe);
1589
1590         nvfx->fragprog = hwcso;
1591         nvfx->dirty |= NVFX_NEW_FRAGPROG;
1592 }
1593
1594 static void
1595 nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
1596 {
1597         struct nvfx_context *nvfx = nvfx_context(pipe);
1598         struct nvfx_pipe_fragment_program *pfp = hwcso;
1599         unsigned i;
1600
1601         for(i = 0; i < Elements(pfp->fps); ++i)
1602         {
1603                 if(pfp->fps[i])
1604                 {
1605                         nvfx_fragprog_destroy(nvfx, pfp->fps[i]);
1606                         FREE(pfp->fps[i]);
1607                 }
1608         }
1609
1610         FREE((void*)pfp->pipe.tokens);
1611         FREE(pfp);
1612 }
1613
1614 void
1615 nvfx_init_fragprog_functions(struct nvfx_context *nvfx)
1616 {
1617         nvfx->pipe.create_fs_state = nvfx_fp_state_create;
1618         nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
1619         nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
1620 }