nv40: support all 10 texcoords
[mesa.git] src/gallium/drivers/nvfx/nvfx_vertprog.c
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_linkage.h"
#include "util/u_debug.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_util.h"

#include "draw/draw_context.h"

#include "nvfx_context.h"
#include "nvfx_state.h"
#include "nvfx_resource.h"

/* TODO (at least...):
 * 1. Indexed consts + ARL
 * 2. NV_vp11, NV_vp2, NV_vp3 features
 *    - extra arith opcodes
 *    - branching
 *    - texture sampling
 *    - indexed attribs
 *    - indexed results
 * 3. bugs
 */

#include "nv30_vertprog.h"
#include "nv40_vertprog.h"

struct nvfx_loop_entry
{
	unsigned brk_target;
	unsigned cont_target;
};

struct nvfx_vpc {
	struct nvfx_context* nvfx;
	struct nvfx_vertex_program *vp;

	struct nvfx_vertex_program_exec *vpi;

	unsigned r_temps;
	unsigned r_temps_discard;
	struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
	struct nvfx_reg *r_address;
	struct nvfx_reg *r_temp;
	struct nvfx_reg *r_const;

	struct nvfx_reg *imm;
	unsigned nr_imm;

	unsigned hpos_idx;

	struct util_dynarray label_relocs;
	struct util_dynarray loop_stack;
};

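/* Allocate a hardware temporary from the r_temps bitmask: ffs() picks the
 * lowest free bit. Temporaries grabbed while translating a single TGSI
 * instruction are also recorded in r_temps_discard, so release_temps() can
 * free them all in one go once the instruction has been emitted.
 */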
static struct nvfx_reg
temp(struct nvfx_vpc *vpc)
{
	int idx = ffs(~vpc->r_temps) - 1;

	if (idx < 0) {
		NOUVEAU_ERR("out of temps!!\n");
		assert(0);
		return nvfx_reg(NVFXSR_TEMP, 0);
	}

	vpc->r_temps |= (1 << idx);
	vpc->r_temps_discard |= (1 << idx);
	return nvfx_reg(NVFXSR_TEMP, idx);
}

static inline void
release_temps(struct nvfx_vpc *vpc)
{
	vpc->r_temps &= ~vpc->r_temps_discard;
	vpc->r_temps_discard = 0;
}

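/* Look up or create a slot in the program's constant table. A non-negative
 * 'pipe' index refers to an entry of the bound constant buffer and is
 * deduplicated against existing slots; pipe == -1 always allocates a fresh
 * slot, used for immediates whose value is baked into the program.
 */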
static struct nvfx_reg
constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
{
	struct nvfx_vertex_program *vp = vpc->vp;
	struct nvfx_vertex_program_data *vpd;
	int idx;

	if (pipe >= 0) {
		for (idx = 0; idx < vp->nr_consts; idx++) {
			if (vp->consts[idx].index == pipe)
				return nvfx_reg(NVFXSR_CONST, idx);
		}
	}

	idx = vp->nr_consts++;
	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
	vpd = &vp->consts[idx];

	vpd->index = pipe;
	vpd->value[0] = x;
	vpd->value[1] = y;
	vpd->value[2] = z;
	vpd->value[3] = w;
	return nvfx_reg(NVFXSR_CONST, idx);
}

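/* Build an nvfx_insn for either execution slot: the slot id (VEC or SCA) is
 * packed above the 7-bit opcode here and unpacked again in nvfx_vp_emit().
 */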
#define arith(s,o,d,m,s0,s1,s2) \
	nvfx_insn(0, (NVFX_VP_INST_SLOT_##s << 7) | NVFX_VP_INST_##s##_OP_##o, -1, (d), (m), (s0), (s1), (s2))

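/* Encode one source operand. The operand is assembled in 'sr' first and then
 * scattered into the 128-bit instruction: src0 straddles hw[1]/hw[2], src1
 * fits entirely in hw[2], and src2 straddles hw[2]/hw[3].
 */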
static void
emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_src src)
{
	struct nvfx_vertex_program *vp = vpc->vp;
	uint32_t sr = 0;
	struct nvfx_relocation reloc;

	switch (src.reg.type) {
	case NVFXSR_TEMP:
		sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
		sr |= (src.reg.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
		break;
	case NVFXSR_INPUT:
		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
		       NVFX_VP(SRC_REG_TYPE_SHIFT));
		vp->ir |= (1 << src.reg.index);
		hw[1] |= (src.reg.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
		break;
	case NVFXSR_CONST:
		sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
		       NVFX_VP(SRC_REG_TYPE_SHIFT));
		reloc.location = vp->nr_insns - 1;
		reloc.target = src.reg.index;
		util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc);
		break;
	case NVFXSR_NONE:
		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
		       NVFX_VP(SRC_REG_TYPE_SHIFT));
		break;
	default:
		assert(0);
	}

	if (src.negate)
		sr |= NVFX_VP(SRC_NEGATE);

	if (src.abs)
		hw[0] |= (1 << (21 + pos));

	sr |= ((src.swz[0] << NVFX_VP(SRC_SWZ_X_SHIFT)) |
	       (src.swz[1] << NVFX_VP(SRC_SWZ_Y_SHIFT)) |
	       (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT)) |
	       (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT)));

	if(src.indirect) {
		if(src.reg.type == NVFXSR_CONST)
			hw[3] |= NVFX_VP(INST_INDEX_CONST);
		else if(src.reg.type == NVFXSR_INPUT)
			hw[0] |= NVFX_VP(INST_INDEX_INPUT);
		else
			assert(0);
		if(src.indirect_reg)
			hw[0] |= NVFX_VP(INST_ADDR_REG_SELECT_1);
		hw[0] |= src.indirect_swz << NVFX_VP(INST_ADDR_SWZ_SHIFT);
	}

	switch (pos) {
	case 0:
		hw[1] |= ((sr & NVFX_VP(SRC0_HIGH_MASK)) >>
			  NVFX_VP(SRC0_HIGH_SHIFT)) << NVFX_VP(INST_SRC0H_SHIFT);
		hw[2] |= (sr & NVFX_VP(SRC0_LOW_MASK)) <<
			  NVFX_VP(INST_SRC0L_SHIFT);
		break;
	case 1:
		hw[2] |= sr << NVFX_VP(INST_SRC1_SHIFT);
		break;
	case 2:
		hw[2] |= ((sr & NVFX_VP(SRC2_HIGH_MASK)) >>
			  NVFX_VP(SRC2_HIGH_SHIFT)) << NVFX_VP(INST_SRC2H_SHIFT);
		hw[3] |= (sr & NVFX_VP(SRC2_LOW_MASK)) <<
			  NVFX_VP(INST_SRC2L_SHIFT);
		break;
	default:
		assert(0);
	}
}

static void
emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_reg dst)
{
	struct nvfx_vertex_program *vp = vpc->vp;

	switch (dst.type) {
	case NVFXSR_NONE:
		if(!nvfx->is_nv4x)
			hw[0] |= NV30_VP_INST_DEST_TEMP_ID_MASK;
		else {
			hw[3] |= NV40_VP_INST_DEST_MASK;
			if (slot == 0)
				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
			else
				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
		}
		break;
	case NVFXSR_TEMP:
		if(!nvfx->is_nv4x)
			hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
		else {
			hw[3] |= NV40_VP_INST_DEST_MASK;
			if (slot == 0)
				hw[0] |= (dst.index << NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
			else
				hw[3] |= (dst.index << NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
		}
		break;
	case NVFXSR_OUTPUT:
		/* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
		if(nvfx->is_nv4x) {
			switch (dst.index) {
			case NV30_VP_INST_DEST_CLP(0):
			case NV30_VP_INST_DEST_CLP(1):
			case NV30_VP_INST_DEST_CLP(2):
				dst.index = NVFX_VP(INST_DEST_FOGC);
				break;
			case NV30_VP_INST_DEST_CLP(3):
			case NV30_VP_INST_DEST_CLP(4):
			case NV30_VP_INST_DEST_CLP(5):
				dst.index = NVFX_VP(INST_DEST_PSZ);
				break;
			case NV40_VP_INST_DEST_COL0: vp->or |= (1 << 0); break;
			case NV40_VP_INST_DEST_COL1: vp->or |= (1 << 1); break;
			case NV40_VP_INST_DEST_BFC0: vp->or |= (1 << 2); break;
			case NV40_VP_INST_DEST_BFC1: vp->or |= (1 << 3); break;
			case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
			case NV40_VP_INST_DEST_PSZ: vp->or |= (1 << 5); break;
			}
		}

		if(!nvfx->is_nv4x) {
			hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK;

			/*XXX: no way this is entirely correct, someone needs to
			 *     figure out what exactly it is.
			 */
			hw[3] |= 0x800;
		} else {
			hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
			if (slot == 0) {
				hw[0] |= NV40_VP_INST_VEC_RESULT;
				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
			} else {
				hw[3] |= NV40_VP_INST_SCA_RESULT;
				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
			}
		}
		break;
	default:
		assert(0);
	}
}

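/* Append one 128-bit instruction (four dwords) to the program. Condition
 * code test/update fields are common to both generations; opcode and
 * writemask placement differ between nv30 and nv40 and are handled per
 * slot below, before the destination and sources are encoded.
 */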
static void
nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
{
	struct nvfx_context* nvfx = vpc->nvfx;
	struct nvfx_vertex_program *vp = vpc->vp;
	unsigned slot = insn.op >> 7;
	unsigned op = insn.op & 0x7f;
	uint32_t *hw;

	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
	vpc->vpi = &vp->insns[vp->nr_insns - 1];
	memset(vpc->vpi, 0, sizeof(*vpc->vpi));

	hw = vpc->vpi->data;

	hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT));
	hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
		  (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
		  (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
		  (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
	if(insn.cc_update)
		hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE);

	if(!nvfx->is_nv4x) {
		if(slot == 0)
			hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
		else
		{
			hw[0] |= ((op >> 4) << NV30_VP_INST_SCA_OPCODEH_SHIFT);
			hw[1] |= ((op & 0xf) << NV30_VP_INST_SCA_OPCODEL_SHIFT);
		}
//		hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
//		hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));

		if (insn.dst.type == NVFXSR_OUTPUT) {
			if (slot)
				hw[3] |= (insn.mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
			else
				hw[3] |= (insn.mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
		} else {
			if (slot)
				hw[3] |= (insn.mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
			else
				hw[3] |= (insn.mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
		}
	} else {
		if (slot == 0) {
			hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
			hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
		} else {
			hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
			hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
		}
	}

	emit_dst(nvfx, vpc, hw, slot, insn.dst);
	emit_src(nvfx, vpc, hw, 0, insn.src[0]);
	emit_src(nvfx, vpc, hw, 1, insn.src[1]);
	emit_src(nvfx, vpc, hw, 2, insn.src[2]);

//	if(insn.src[0].indirect || op == NVFX_VP_INST_VEC_OP_ARL)
//		hw[3] |= NV40_VP_INST_SCA_RESULT;
}

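/* Translate a TGSI source register to our representation. Unsupported
 * register files or indirection modes are flagged by setting reg.type to
 * -1, which the instruction parser checks before emitting anything.
 */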
static inline struct nvfx_src
tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
	struct nvfx_src src;

	switch (fsrc->Register.File) {
	case TGSI_FILE_INPUT:
		src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index);
		break;
	case TGSI_FILE_CONSTANT:
		src.reg = vpc->r_const[fsrc->Register.Index];
		break;
	case TGSI_FILE_IMMEDIATE:
		src.reg = vpc->imm[fsrc->Register.Index];
		break;
	case TGSI_FILE_TEMPORARY:
		src.reg = vpc->r_temp[fsrc->Register.Index];
		break;
	default:
		NOUVEAU_ERR("bad src file\n");
		src.reg.index = 0;
		src.reg.type = -1;
		break;
	}

	src.abs = fsrc->Register.Absolute;
	src.negate = fsrc->Register.Negate;
	src.swz[0] = fsrc->Register.SwizzleX;
	src.swz[1] = fsrc->Register.SwizzleY;
	src.swz[2] = fsrc->Register.SwizzleZ;
	src.swz[3] = fsrc->Register.SwizzleW;
	src.indirect = 0;

	if(fsrc->Register.Indirect) {
		if(fsrc->Indirect.File == TGSI_FILE_ADDRESS &&
		   (fsrc->Register.File == TGSI_FILE_CONSTANT || fsrc->Register.File == TGSI_FILE_INPUT))
		{
			src.indirect = 1;
			src.indirect_reg = fsrc->Indirect.Index;
			src.indirect_swz = fsrc->Indirect.SwizzleX;
		}
		else
		{
			src.reg.index = 0;
			src.reg.type = -1;
		}
	}
	return src;
}

static INLINE struct nvfx_reg
tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
	struct nvfx_reg dst;

	switch (fdst->Register.File) {
	case TGSI_FILE_NULL:
		dst = nvfx_reg(NVFXSR_NONE, 0);
		break;
	case TGSI_FILE_OUTPUT:
		dst = vpc->r_result[fdst->Register.Index];
		break;
	case TGSI_FILE_TEMPORARY:
		dst = vpc->r_temp[fdst->Register.Index];
		break;
	case TGSI_FILE_ADDRESS:
		dst = vpc->r_address[fdst->Register.Index];
		break;
	default:
		NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File);
		dst.index = 0;
		dst.type = 0;
		break;
	}

	return dst;
}

static inline int
tgsi_mask(uint tgsi)
{
	int mask = 0;

	if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_VP_MASK_X;
	if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_VP_MASK_Y;
	if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_VP_MASK_Z;
	if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_VP_MASK_W;
	return mask;
}

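/* Translate a single TGSI instruction. An instruction may read at most one
 * distinct input register and a single distinct constant or immediate (the
 * ai/ci/ii bookkeeping below), so any further ones are first copied into
 * temporaries with MOVs.
 */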
static boolean
nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
				unsigned idx, const struct tgsi_full_instruction *finst)
{
	struct nvfx_src src[3], tmp;
	struct nvfx_reg dst;
	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
	struct nvfx_insn insn;
	struct nvfx_relocation reloc;
	struct nvfx_loop_entry loop;
	int mask;
	int ai = -1, ci = -1, ii = -1;
	int i;

	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
		return TRUE;

	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fsrc;

		fsrc = &finst->Src[i];
		if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
			src[i] = tgsi_src(vpc, fsrc);
		}
	}

	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fsrc;

		fsrc = &finst->Src[i];

		switch (fsrc->Register.File) {
		case TGSI_FILE_INPUT:
			if (ai == -1 || ai == fsrc->Register.Index) {
				ai = fsrc->Register.Index;
				src[i] = tgsi_src(vpc, fsrc);
			} else {
				src[i] = nvfx_src(temp(vpc));
				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
			}
			break;
		case TGSI_FILE_CONSTANT:
			if ((ci == -1 && ii == -1) ||
			    ci == fsrc->Register.Index) {
				ci = fsrc->Register.Index;
				src[i] = tgsi_src(vpc, fsrc);
			} else {
				src[i] = nvfx_src(temp(vpc));
				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
			}
			break;
		case TGSI_FILE_IMMEDIATE:
			if ((ci == -1 && ii == -1) ||
			    ii == fsrc->Register.Index) {
				ii = fsrc->Register.Index;
				src[i] = tgsi_src(vpc, fsrc);
			} else {
				src[i] = nvfx_src(temp(vpc));
				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
			}
			break;
		case TGSI_FILE_TEMPORARY:
			/* handled above */
			break;
		default:
			NOUVEAU_ERR("bad src file\n");
			return FALSE;
		}
	}

	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
		if(src[i].reg.type < 0)
			return FALSE;
	}

	if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS &&
	   finst->Instruction.Opcode != TGSI_OPCODE_ARL)
		return FALSE;

	dst = tgsi_dst(vpc, &finst->Dst[0]);
	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);

	switch (finst->Instruction.Opcode) {
	case TGSI_OPCODE_ABS:
		nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, abs(src[0]), none, none));
		break;
	case TGSI_OPCODE_ADD:
		nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, src[1]));
		break;
	case TGSI_OPCODE_ARL:
		nvfx_vp_emit(vpc, arith(VEC, ARL, dst, mask, src[0], none, none));
		break;
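	/* CMP: dst = (src0 < 0) ? src1 : src2, built from condition codes:
	 * update CC from src0, then perform two predicated MOVs.
	 */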
	case TGSI_OPCODE_CMP:
		insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
		insn.cc_update = 1;
		nvfx_vp_emit(vpc, insn);

		insn = arith(VEC, MOV, dst, mask, src[2], none, none);
		insn.cc_test = NVFX_COND_GE;
		nvfx_vp_emit(vpc, insn);

		insn = arith(VEC, MOV, dst, mask, src[1], none, none);
		insn.cc_test = NVFX_COND_LT;
		nvfx_vp_emit(vpc, insn);
		break;
	case TGSI_OPCODE_COS:
		nvfx_vp_emit(vpc, arith(SCA, COS, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_DP2:
		tmp = nvfx_src(temp(vpc));
		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none));
		nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
		break;
	case TGSI_OPCODE_DP3:
		nvfx_vp_emit(vpc, arith(VEC, DP3, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_DP4:
		nvfx_vp_emit(vpc, arith(VEC, DP4, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_DPH:
		nvfx_vp_emit(vpc, arith(VEC, DPH, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_DST:
		nvfx_vp_emit(vpc, arith(VEC, DST, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_EX2:
		nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_EXP:
		nvfx_vp_emit(vpc, arith(SCA, EXP, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_FLR:
		nvfx_vp_emit(vpc, arith(VEC, FLR, dst, mask, src[0], none, none));
		break;
	case TGSI_OPCODE_FRC:
		nvfx_vp_emit(vpc, arith(VEC, FRC, dst, mask, src[0], none, none));
		break;
	case TGSI_OPCODE_LG2:
		nvfx_vp_emit(vpc, arith(SCA, LG2, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_LIT:
		nvfx_vp_emit(vpc, arith(SCA, LIT, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_LOG:
		nvfx_vp_emit(vpc, arith(SCA, LOG, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_LRP:
		tmp = nvfx_src(temp(vpc));
		nvfx_vp_emit(vpc, arith(VEC, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], tmp));
		break;
	case TGSI_OPCODE_MAD:
		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], src[2]));
		break;
	case TGSI_OPCODE_MAX:
		nvfx_vp_emit(vpc, arith(VEC, MAX, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_MIN:
		nvfx_vp_emit(vpc, arith(VEC, MIN, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_MOV:
		nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, src[0], none, none));
		break;
	case TGSI_OPCODE_MUL:
		nvfx_vp_emit(vpc, arith(VEC, MUL, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_NOP:
		break;
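	/* POW is expanded as EX2(LG2(src0.x) * src1.x) on the scalar unit. */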
	case TGSI_OPCODE_POW:
		tmp = nvfx_src(temp(vpc));
		nvfx_vp_emit(vpc, arith(SCA, LG2, tmp.reg, NVFX_VP_MASK_X, none, none, swz(src[0], X, X, X, X)));
		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
		nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, swz(tmp, X, X, X, X)));
		break;
	case TGSI_OPCODE_RCP:
		nvfx_vp_emit(vpc, arith(SCA, RCP, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_RSQ:
		nvfx_vp_emit(vpc, arith(SCA, RSQ, dst, mask, none, none, abs(src[0])));
		break;
	case TGSI_OPCODE_SEQ:
		nvfx_vp_emit(vpc, arith(VEC, SEQ, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SFL:
		nvfx_vp_emit(vpc, arith(VEC, SFL, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SGE:
		nvfx_vp_emit(vpc, arith(VEC, SGE, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SGT:
		nvfx_vp_emit(vpc, arith(VEC, SGT, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SIN:
		nvfx_vp_emit(vpc, arith(SCA, SIN, dst, mask, none, none, src[0]));
		break;
	case TGSI_OPCODE_SLE:
		nvfx_vp_emit(vpc, arith(VEC, SLE, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SLT:
		nvfx_vp_emit(vpc, arith(VEC, SLT, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SNE:
		nvfx_vp_emit(vpc, arith(VEC, SNE, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SSG:
		/* SSG takes a single operand */
		nvfx_vp_emit(vpc, arith(VEC, SSG, dst, mask, src[0], none, none));
		break;
	case TGSI_OPCODE_STR:
		nvfx_vp_emit(vpc, arith(VEC, STR, dst, mask, src[0], src[1], none));
		break;
	case TGSI_OPCODE_SUB:
		nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, neg(src[1])));
		break;
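	/* TRUNC: take FLR of the absolute value, then re-negate the result
	 * where the condition codes recorded src0 as negative, restoring the
	 * sign.
	 */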
	case TGSI_OPCODE_TRUNC:
		tmp = nvfx_src(temp(vpc));
		insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
		insn.cc_update = 1;
		nvfx_vp_emit(vpc, insn);

		nvfx_vp_emit(vpc, arith(VEC, FLR, tmp.reg, mask, abs(src[0]), none, none));
		nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, tmp, none, none));

		insn = arith(VEC, MOV, dst, mask, neg(tmp), none, none);
		insn.cc_test = NVFX_COND_LT;
		nvfx_vp_emit(vpc, insn);
		break;
	case TGSI_OPCODE_XPD:
		tmp = nvfx_src(temp(vpc));
		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
		break;

	case TGSI_OPCODE_IF:
		insn = arith(VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
		insn.cc_update = 1;
		nvfx_vp_emit(vpc, insn);

		reloc.location = vpc->vp->nr_insns;
		reloc.target = finst->Label.Label + 1;
		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

		insn = arith(SCA, BRA, none.reg, 0, none, none, none);
		insn.cc_test = NVFX_COND_EQ;
		insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0;
		nvfx_vp_emit(vpc, insn);
		break;

	case TGSI_OPCODE_ELSE:
	case TGSI_OPCODE_BRA:
	case TGSI_OPCODE_CAL:
		reloc.location = vpc->vp->nr_insns;
		reloc.target = finst->Label.Label;
		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

		if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
			insn = arith(SCA, CAL, none.reg, 0, none, none, none);
		else
			insn = arith(SCA, BRA, none.reg, 0, none, none, none);
		nvfx_vp_emit(vpc, insn);
		break;

	case TGSI_OPCODE_RET:
		tmp = none;
		tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0;
		nvfx_vp_emit(vpc, arith(SCA, RET, none.reg, 0, none, none, tmp));
		break;

	case TGSI_OPCODE_BGNSUB:
	case TGSI_OPCODE_ENDSUB:
	case TGSI_OPCODE_ENDIF:
		/* nothing to do here */
		break;

	case TGSI_OPCODE_BGNLOOP:
		loop.cont_target = idx;
		loop.brk_target = finst->Label.Label + 1;
		util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop);
		break;

	case TGSI_OPCODE_ENDLOOP:
		loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry);

		reloc.location = vpc->vp->nr_insns;
		reloc.target = loop.cont_target;
		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
		break;

	case TGSI_OPCODE_CONT:
		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);

		reloc.location = vpc->vp->nr_insns;
		reloc.target = loop.cont_target;
		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
		break;

	case TGSI_OPCODE_BRK:
		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);

		reloc.location = vpc->vp->nr_insns;
		reloc.target = loop.brk_target;
		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
		break;

	default:
		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
		return FALSE;
	}

	release_temps(vpc);
	return TRUE;
}

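/* Assign a hardware result register to each TGSI output declaration.
 * GENERIC outputs take whatever texcoord the linkage pass chose for the
 * fragment program side, recorded in generic_to_fp_input.
 */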
static boolean
nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
				const struct tgsi_full_declaration *fdec)
{
	unsigned idx = fdec->Range.First;
	int hw;

	switch (fdec->Semantic.Name) {
	case TGSI_SEMANTIC_POSITION:
		hw = NVFX_VP(INST_DEST_POS);
		vpc->hpos_idx = idx;
		break;
	case TGSI_SEMANTIC_COLOR:
		if (fdec->Semantic.Index == 0) {
			hw = NVFX_VP(INST_DEST_COL0);
		} else if (fdec->Semantic.Index == 1) {
			hw = NVFX_VP(INST_DEST_COL1);
		} else {
			NOUVEAU_ERR("bad colour semantic index\n");
			return FALSE;
		}
		break;
	case TGSI_SEMANTIC_BCOLOR:
		if (fdec->Semantic.Index == 0) {
			hw = NVFX_VP(INST_DEST_BFC0);
		} else if (fdec->Semantic.Index == 1) {
			hw = NVFX_VP(INST_DEST_BFC1);
		} else {
			NOUVEAU_ERR("bad bcolour semantic index\n");
			return FALSE;
		}
		break;
	case TGSI_SEMANTIC_FOG:
		hw = NVFX_VP(INST_DEST_FOGC);
		break;
	case TGSI_SEMANTIC_PSIZE:
		hw = NVFX_VP(INST_DEST_PSZ);
		break;
	case TGSI_SEMANTIC_GENERIC:
		hw = (vpc->vp->generic_to_fp_input[fdec->Semantic.Index] & 0xf) - NVFX_FP_OP_INPUT_SRC_TC(0);
		if(hw <= 8)
			hw = NVFX_VP(INST_DEST_TC(hw));
		else if(hw == 9) /* TODO: this is correct, but how does this overlapping work exactly? */
			hw = NV40_VP_INST_DEST_PSZ;
		else
			assert(0);
		break;
	case TGSI_SEMANTIC_EDGEFLAG:
		/* not really an error, just a fallback */
		NOUVEAU_ERR("cannot handle edgeflag output\n");
		return FALSE;
	default:
		NOUVEAU_ERR("bad output semantic\n");
		return FALSE;
	}

	vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
	return TRUE;
}

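/* Pre-pass over the token stream: lay out generic outputs against the
 * available texcoords (10 on nv4x, 8 otherwise), count immediates, and find
 * the highest indices of the temporary, address and constant files so the
 * r_* register arrays can be allocated up front.
 */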
static boolean
nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
{
	struct tgsi_parse_context p;
	int high_const = -1, high_temp = -1, high_addr = -1, nr_imm = 0, i;
	struct util_semantic_set set;
	unsigned char sem_layout[10];
	unsigned num_outputs;
	unsigned num_texcoords = nvfx->is_nv4x ? 10 : 8;

	num_outputs = util_semantic_set_from_program_file(&set, vpc->vp->pipe.tokens, TGSI_FILE_OUTPUT);

	if(num_outputs > num_texcoords) {
		NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs);
		return FALSE;
	}
	util_semantic_layout_from_set(sem_layout, &set, num_texcoords, num_texcoords);

	/* hope input 0xf is initialized to (0, 0, 0, 1); otherwise we are _probably_ not required to do this */
	memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
	for(int i = 0; i < 10; ++i) {
		if(sem_layout[i] == 0xff)
			continue;
		//printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
		vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
	}

	vpc->vp->sprite_fp_input = -1;
	for(int i = 0; i < 10; ++i)
	{
		if(sem_layout[i] == 0xff)
		{
			vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
			break;
		}
	}

	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&p)) {
		const union tgsi_full_token *tok = &p.FullToken;

		tgsi_parse_token(&p);
		switch(tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
			nr_imm++;
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *fdec;

			fdec = &p.FullToken.FullDeclaration;
			switch (fdec->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				if (fdec->Range.Last > high_temp) {
					high_temp = fdec->Range.Last;
				}
				break;
			case TGSI_FILE_ADDRESS:
				if (fdec->Range.Last > high_addr) {
					high_addr = fdec->Range.Last;
				}
				break;
			case TGSI_FILE_CONSTANT:
				if (fdec->Range.Last > high_const) {
					high_const = fdec->Range.Last;
				}
				break;
			case TGSI_FILE_OUTPUT:
				if (!nvfx_vertprog_parse_decl_output(nvfx, vpc, fdec))
					return FALSE;
				break;
			default:
				break;
			}
		}
		break;
		default:
			break;
		}
	}
	tgsi_parse_free(&p);

	if (nr_imm) {
		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_reg));
		assert(vpc->imm);
	}

	if (++high_temp) {
		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
		for (i = 0; i < high_temp; i++)
			vpc->r_temp[i] = temp(vpc);
	}

	if (++high_addr) {
		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg));
		for (i = 0; i < high_addr; i++)
			vpc->r_address[i] = nvfx_reg(NVFXSR_TEMP, i);
	}

	if(++high_const) {
		vpc->r_const = CALLOC(high_const, sizeof(struct nvfx_reg));
		for (i = 0; i < high_const; i++)
			vpc->r_const[i] = constant(vpc, i, 0, 0, 0, 0);
	}

	vpc->r_temps_discard = 0;
	return TRUE;
}

DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)

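/* Main translation entry point: reserve constant slots for the user clip
 * planes if needed, translate every TGSI instruction, resolve label
 * relocations from TGSI instruction indices to hardware ones, and finally
 * append either the clip-plane DP4s or a terminating NOP carrying the LAST
 * bit. With clipping enabled the LAST bit is presumably patched later,
 * depending on how many planes are enabled (vp->clip_nr tracks this).
 */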
static void
nvfx_vertprog_translate(struct nvfx_context *nvfx,
			struct nvfx_vertex_program *vp)
{
	struct tgsi_parse_context parse;
	struct nvfx_vpc *vpc = NULL;
	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
	struct util_dynarray insns;
	int i;

	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
	if (!vpc)
		return;
	vpc->nvfx = nvfx;
	vpc->vp = vp;

	/* reserve space for ucps */
	if(nvfx->use_vp_clipping)
	{
		for(i = 0; i < 6; ++i)
			constant(vpc, -1, 0, 0, 0, 0);
	}

	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
		FREE(vpc);
		return;
	}

	/* Redirect post-transform vertex position to a temp if user clip
	 * planes are enabled.  We need to append code to the vtxprog
	 * to handle clip planes later.
	 */
	/* TODO: maybe support patching this depending on whether there are ucps: not sure if it really matters much */
	if (nvfx->use_vp_clipping) {
		vpc->r_result[vpc->hpos_idx] = temp(vpc);
		vpc->r_temps_discard = 0;
	}

	tgsi_parse_init(&parse, vp->pipe.tokens);

	util_dynarray_init(&insns);
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		switch (parse.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm;

			imm = &parse.FullToken.FullImmediate;
			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
			assert(imm->Immediate.NrTokens == 4 + 1);
			vpc->imm[vpc->nr_imm++] =
				constant(vpc, -1,
					 imm->u[0].Float,
					 imm->u[1].Float,
					 imm->u[2].Float,
					 imm->u[3].Float);
		}
		break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
		{
			const struct tgsi_full_instruction *finst;
			unsigned idx = insns.size >> 2;
			util_dynarray_append(&insns, unsigned, vp->nr_insns);
			finst = &parse.FullToken.FullInstruction;
			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, idx, finst))
				goto out_err;
		}
		break;
		default:
			break;
		}
	}

	util_dynarray_append(&insns, unsigned, vp->nr_insns);

	for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
	{
		struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + i);
		struct nvfx_relocation hw_reloc;

		hw_reloc.location = label_reloc->location;
		hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];

		//debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target);

		util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc);
	}
	util_dynarray_fini(&insns);
	util_dynarray_trim(&vp->branch_relocs);

	/* XXX: what if we add a RET before?! make sure we jump here... */

	/* Write out HPOS if it was redirected to a temp earlier */
	if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
		struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT,
						NVFX_VP(INST_DEST_POS));
		struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);

		nvfx_vp_emit(vpc, arith(VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none));
	}

	/* Insert code to handle user clip planes */
	if(nvfx->use_vp_clipping)
	{
		for (i = 0; i < 6; i++) {
			struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(i));
			struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, i));
			struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
			unsigned mask;

			if(nvfx->is_nv4x)
			{
				switch (i) {
				case 0: case 3: mask = NVFX_VP_MASK_Y; break;
				case 1: case 4: mask = NVFX_VP_MASK_Z; break;
				case 2: case 5: mask = NVFX_VP_MASK_W; break;
				default:
					NOUVEAU_ERR("invalid clip dist #%d\n", i);
					goto out_err;
				}
			}
			else
				mask = NVFX_VP_MASK_X;

			nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
		}
	}
	else
	{
		if(vp->nr_insns)
			vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;

		nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
		vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
	}

	if(debug_get_option_nvfx_dump_vp())
	{
		debug_printf("\n");
		tgsi_dump(vp->pipe.tokens, 0);

		debug_printf("\n%s vertex program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
		for (i = 0; i < vp->nr_insns; i++)
			debug_printf("%3u: %08x %08x %08x %08x\n", i, vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
		debug_printf("\n");
	}

	vp->clip_nr = -1;
	vp->exec_start = -1;
	vp->translated = TRUE;
out_err:
	tgsi_parse_free(&parse);
	util_dynarray_fini(&vpc->label_relocs);
	util_dynarray_fini(&vpc->loop_stack);
	if (vpc->r_temp)
		FREE(vpc->r_temp);
	if (vpc->r_address)
		FREE(vpc->r_address);
	if (vpc->r_const)
		FREE(vpc->r_const);
	if (vpc->imm)
		FREE(vpc->imm);
	FREE(vpc);
}

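/* Validate the vertex program for drawing: translate on first use, place the
 * code and its constants into the per-screen exec/data heaps (evicting other
 * programs if the heaps are full), patch branch targets and constant source
 * slots if either segment moved, and upload whatever changed.
 */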
boolean
nvfx_vertprog_validate(struct nvfx_context *nvfx)
{
	struct nvfx_screen *screen = nvfx->screen;
	struct nouveau_channel *chan = screen->base.channel;
	struct nouveau_grobj *eng3d = screen->eng3d;
	struct nvfx_vertex_program *vp;
	struct pipe_resource *constbuf;
	boolean upload_code = FALSE, upload_data = FALSE;
	int i;

	if (nvfx->render_mode == HW) {
		vp = nvfx->vertprog;
		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
	} else {
		vp = nvfx->swtnl.vertprog;
		constbuf = NULL;
	}

	/* Translate TGSI shader into hw bytecode */
	if (!vp->translated)
	{
		nvfx->fallback_swtnl &= ~NVFX_NEW_VERTPROG;
		nvfx_vertprog_translate(nvfx, vp);
		if (!vp->translated) {
			nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
			return FALSE;
		}
	}

	/* Allocate hw vtxprog exec slots */
	if (!vp->exec) {
		struct nouveau_resource *heap = nvfx->screen->vp_exec_heap;
		uint vplen = vp->nr_insns;

		if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
			while (heap->next && heap->size < vplen) {
				struct nvfx_vertex_program *evict;

				evict = heap->next->priv;
				nouveau_resource_free(&evict->exec);
			}

			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
			{
				debug_printf("Vertex shader too long: %u instructions\n", vplen);
				nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
				return FALSE;
			}
		}

		upload_code = TRUE;
	}

	/* Allocate hw vtxprog const slots */
	if (vp->nr_consts && !vp->data) {
		struct nouveau_resource *heap = nvfx->screen->vp_data_heap;

		if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
			while (heap->next && heap->size < vp->nr_consts) {
				struct nvfx_vertex_program *evict;

				evict = heap->next->priv;
				nouveau_resource_free(&evict->data);
			}

			if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
			{
				debug_printf("Vertex shader uses too many constants: %u constants\n", vp->nr_consts);
				nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
				return FALSE;
			}
		}

		//printf("start at %u nc %u\n", vp->data->start, vp->nr_consts);

		/*XXX: handle this some day */
		assert(vp->data->start >= vp->data_start_min);

		upload_data = TRUE;
		if (vp->data_start != vp->data->start)
			upload_code = TRUE;
	}

	/* If exec or data segments moved we need to patch the program to
	 * fixup offsets and register IDs.
	 */
	if (vp->exec_start != vp->exec->start) {
		//printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start);
		for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_relocation))
		{
			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->branch_relocs.data + i);
			uint32_t* hw = vp->insns[reloc->location].data;
			unsigned target = vp->exec->start + reloc->target;

			//debug_printf("vp_reloc hw %u -> hw %u\n", reloc->location, target);

			if(!nvfx->is_nv4x)
			{
				hw[2] &= ~NV30_VP_INST_IADDR_MASK;
				hw[2] |= (target & 0x1ff) << NV30_VP_INST_IADDR_SHIFT;
			}
			else
			{
				hw[3] &= ~NV40_VP_INST_IADDRL_MASK;
				hw[3] |= (target & 7) << NV40_VP_INST_IADDRL_SHIFT;

				hw[2] &= ~NV40_VP_INST_IADDRH_MASK;
				hw[2] |= ((target >> 3) & 0x3f) << NV40_VP_INST_IADDRH_SHIFT;
			}
		}

		vp->exec_start = vp->exec->start;
	}

	if (vp->data_start != vp->data->start) {
		for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation))
		{
			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
			struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location];

			//printf("reloc %i to %i + %i\n", reloc->location, vp->data->start, reloc->target);

			vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
			vpi->data[1] |=
				(reloc->target + vp->data->start) <<
				NVFX_VP(INST_CONST_SRC_SHIFT);
		}

		vp->data_start = vp->data->start;
		upload_code = TRUE;
	}

	/* Update + Upload constant values */
	if (vp->nr_consts) {
		float *map = NULL;

		if (constbuf)
			map = (float*)nvfx_buffer(constbuf)->data;

		/*
		for (i = 0; i < 512; i++) {
			float v[4] = {0.1, 0.2, 0.3, 0.4};
			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
			OUT_RING  (chan, i);
			OUT_RINGp (chan, (uint32_t *)v, 4);
			printf("frob %i\n", i);
		}
		*/

		for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
			struct nvfx_vertex_program_data *vpd = &vp->consts[i];

			if (vpd->index >= 0) {
				if (!upload_data &&
				    !memcmp(vpd->value, &map[vpd->index * 4],
					    4 * sizeof(float)))
					continue;
				memcpy(vpd->value, &map[vpd->index * 4],
				       4 * sizeof(float));
			}

			//printf("upload into %i + %i: %f %f %f %f\n", vp->data->start, i, vpd->value[0], vpd->value[1], vpd->value[2], vpd->value[3]);

			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
			OUT_RING  (chan, i + vp->data->start);
			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
		}
	}

	/* Upload vtxprog */
	if (upload_code) {
		BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
		OUT_RING  (chan, vp->exec->start);
		for (i = 0; i < vp->nr_insns; i++) {
			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
			//printf("%08x %08x %08x %08x\n", vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
			OUT_RINGp (chan, vp->insns[i].data, 4);
		}
		vp->clip_nr = -1;
	}

	if(nvfx->dirty & (NVFX_NEW_VERTPROG))
	{
		WAIT_RING(chan, 6);
		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
		OUT_RING(chan, vp->exec->start);
		if(nvfx->is_nv4x) {
			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
			OUT_RING(chan, vp->ir);
		}
	}

	return TRUE;
}

void
nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
{
	if (vp->nr_insns)
		FREE(vp->insns);

	if (vp->nr_consts)
		FREE(vp->consts);

	nouveau_resource_free(&vp->exec);
	nouveau_resource_free(&vp->data);

	util_dynarray_fini(&vp->branch_relocs);
	util_dynarray_fini(&vp->const_relocs);
}

static void *
nvfx_vp_state_create(struct pipe_context *pipe,
		     const struct pipe_shader_state *cso)
{
	struct nvfx_context *nvfx = nvfx_context(pipe);
	struct nvfx_vertex_program *vp;

	// TODO: use a 64-bit atomic here!
	static unsigned long long id = 0;

	vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
	vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
	vp->id = ++id;

	return (void *)vp;
}

static void
nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
{
	struct nvfx_context *nvfx = nvfx_context(pipe);

	nvfx->vertprog = hwcso;
	nvfx->dirty |= NVFX_NEW_VERTPROG;
	nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
}

static void
nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
{
	struct nvfx_context *nvfx = nvfx_context(pipe);
	struct nvfx_vertex_program *vp = hwcso;

	draw_delete_vertex_shader(nvfx->draw, vp->draw);
	nvfx_vertprog_destroy(nvfx, vp);
	FREE((void*)vp->pipe.tokens);
	FREE(vp);
}

void
nvfx_init_vertprog_functions(struct nvfx_context *nvfx)
{
	nvfx->pipe.create_vs_state = nvfx_vp_state_create;
	nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
	nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
}