nvfx: add rewritten swtnl support
[mesa.git] / src / gallium / drivers / nvfx / nvfx_vertprog.c
1 #include "pipe/p_context.h"
2 #include "pipe/p_defines.h"
3 #include "pipe/p_state.h"
4 #include "util/u_linkage.h"
5 #include "util/u_debug.h"
6
7 #include "pipe/p_shader_tokens.h"
8 #include "tgsi/tgsi_parse.h"
9 #include "tgsi/tgsi_dump.h"
10 #include "tgsi/tgsi_util.h"
11 #include "tgsi/tgsi_ureg.h"
12
13 #include "draw/draw_context.h"
14
15 #include "nvfx_context.h"
16 #include "nvfx_state.h"
17 #include "nvfx_resource.h"
18
/* TODO (at least...):
 * 1. Indexed consts + ARL
 * 2. NV_vp11, NV_vp2, NV_vp3 features
 *    - extra arith opcodes
 *    - branching
 *    - texture sampling
 *    - indexed attribs
 *    - indexed results
 * 3. bugs
 */

#include "nv30_vertprog.h"
#include "nv40_vertprog.h"

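/* Per-loop record of where BRK and CONT inside the loop must branch to.
 * Targets are TGSI instruction indices here; they are translated into
 * hardware instruction addresses during label relocation.
 */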
struct nvfx_loop_entry
{
    unsigned brk_target;
    unsigned cont_target;
};

struct nvfx_vpc {
    struct nvfx_context* nvfx;
    struct pipe_shader_state pipe;
    struct nvfx_vertex_program *vp;

    struct nvfx_vertex_program_exec *vpi;

    unsigned r_temps;
    unsigned r_temps_discard;
    struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
    struct nvfx_reg *r_address;
    struct nvfx_reg *r_temp;
    struct nvfx_reg *r_const;

    struct nvfx_reg *imm;
    unsigned nr_imm;

    unsigned hpos_idx;

    struct util_dynarray label_relocs;
    struct util_dynarray loop_stack;
};

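/* Allocate a free hardware temporary from the r_temps bitmask. Temps
 * allocated here are also marked in r_temps_discard, so they are given
 * back by release_temps() once the current instruction is done.
 */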
static struct nvfx_reg
temp(struct nvfx_vpc *vpc)
{
    int idx = ffs(~vpc->r_temps) - 1;

    if (idx < 0) {
        NOUVEAU_ERR("out of temps!!\n");
        assert(0);
        return nvfx_reg(NVFXSR_TEMP, 0);
    }

    vpc->r_temps |= (1 << idx);
    vpc->r_temps_discard |= (1 << idx);
    return nvfx_reg(NVFXSR_TEMP, idx);
}

static inline void
release_temps(struct nvfx_vpc *vpc)
{
    vpc->r_temps &= ~vpc->r_temps_discard;
    vpc->r_temps_discard = 0;
}

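/* Allocate (or find) a vertex program constant slot. For pipe >= 0 the
 * slot shadows element 'pipe' of the constant buffer and an existing
 * slot is reused if present; pipe < 0 creates an anonymous immediate
 * with the given x/y/z/w value.
 */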
static struct nvfx_reg
constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
{
    struct nvfx_vertex_program *vp = vpc->vp;
    struct nvfx_vertex_program_data *vpd;
    int idx;

    if (pipe >= 0) {
        for (idx = 0; idx < vp->nr_consts; idx++) {
            if (vp->consts[idx].index == pipe)
                return nvfx_reg(NVFXSR_CONST, idx);
        }
    }

    idx = vp->nr_consts++;
    vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
    vpd = &vp->consts[idx];

    vpd->index = pipe;
    vpd->value[0] = x;
    vpd->value[1] = y;
    vpd->value[2] = z;
    vpd->value[3] = w;
    return nvfx_reg(NVFXSR_CONST, idx);
}

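/* Build an nvfx_insn for execution slot s (VEC or SCA): the slot index
 * goes in bit 7 of the opcode word and the unit-specific opcode in the
 * low seven bits; nvfx_vp_emit() unpacks them again below.
 */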
#define arith(s,o,d,m,s0,s1,s2) \
    nvfx_insn(0, (NVFX_VP_INST_SLOT_##s << 7) | NVFX_VP_INST_##s##_OP_##o, -1, (d), (m), (s0), (s1), (s2))

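/* Encode source operand 'src' into operand position pos (0..2) of the
 * four-word instruction at hw. Also records input usage in vp->ir and
 * queues a relocation for constant slots, since their final hardware
 * location is only known at upload time.
 */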
static void
emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_src src)
{
    struct nvfx_vertex_program *vp = vpc->vp;
    uint32_t sr = 0;
    struct nvfx_relocation reloc;

    switch (src.reg.type) {
    case NVFXSR_TEMP:
        sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
        sr |= (src.reg.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
        break;
    case NVFXSR_INPUT:
        sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
               NVFX_VP(SRC_REG_TYPE_SHIFT));
        vp->ir |= (1 << src.reg.index);
        hw[1] |= (src.reg.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
        break;
    case NVFXSR_CONST:
        sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
               NVFX_VP(SRC_REG_TYPE_SHIFT));
        reloc.location = vp->nr_insns - 1;
        reloc.target = src.reg.index;
        util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc);
        break;
    case NVFXSR_NONE:
        sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
               NVFX_VP(SRC_REG_TYPE_SHIFT));
        break;
    default:
        assert(0);
    }

    if (src.negate)
        sr |= NVFX_VP(SRC_NEGATE);

    if (src.abs)
        hw[0] |= (1 << (21 + pos));

    sr |= ((src.swz[0] << NVFX_VP(SRC_SWZ_X_SHIFT)) |
           (src.swz[1] << NVFX_VP(SRC_SWZ_Y_SHIFT)) |
           (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT)) |
           (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT)));

    if(src.indirect) {
        if(src.reg.type == NVFXSR_CONST)
            hw[3] |= NVFX_VP(INST_INDEX_CONST);
        else if(src.reg.type == NVFXSR_INPUT)
            hw[0] |= NVFX_VP(INST_INDEX_INPUT);
        else
            assert(0);
        if(src.indirect_reg)
            hw[0] |= NVFX_VP(INST_ADDR_REG_SELECT_1);
        hw[0] |= src.indirect_swz << NVFX_VP(INST_ADDR_SWZ_SHIFT);
    }

    switch (pos) {
    case 0:
        hw[1] |= ((sr & NVFX_VP(SRC0_HIGH_MASK)) >>
                  NVFX_VP(SRC0_HIGH_SHIFT)) << NVFX_VP(INST_SRC0H_SHIFT);
        hw[2] |= (sr & NVFX_VP(SRC0_LOW_MASK)) <<
                 NVFX_VP(INST_SRC0L_SHIFT);
        break;
    case 1:
        hw[2] |= sr << NVFX_VP(INST_SRC1_SHIFT);
        break;
    case 2:
        hw[2] |= ((sr & NVFX_VP(SRC2_HIGH_MASK)) >>
                  NVFX_VP(SRC2_HIGH_SHIFT)) << NVFX_VP(INST_SRC2H_SHIFT);
        hw[3] |= (sr & NVFX_VP(SRC2_LOW_MASK)) <<
                 NVFX_VP(INST_SRC2L_SHIFT);
        break;
    default:
        assert(0);
    }
}

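/* Encode the destination register into the instruction. slot 0 is the
 * vector unit, slot 1 the scalar unit; nv30 and nv40 use different
 * field layouts, and nv40 additionally tracks which outputs are
 * written in vp->or.
 */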
static void
emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_reg dst)
{
    struct nvfx_vertex_program *vp = vpc->vp;

    switch (dst.type) {
    case NVFXSR_NONE:
        if(!nvfx->is_nv4x)
            hw[0] |= NV30_VP_INST_DEST_TEMP_ID_MASK;
        else {
            hw[3] |= NV40_VP_INST_DEST_MASK;
            if (slot == 0)
                hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
            else
                hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
        }
        break;
    case NVFXSR_TEMP:
        if(!nvfx->is_nv4x)
            hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
        else {
            hw[3] |= NV40_VP_INST_DEST_MASK;
            if (slot == 0)
                hw[0] |= (dst.index << NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
            else
                hw[3] |= (dst.index << NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
        }
        break;
    case NVFXSR_OUTPUT:
        /* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
        if(nvfx->is_nv4x) {
            switch (dst.index) {
            case NV30_VP_INST_DEST_CLP(0):
            case NV30_VP_INST_DEST_CLP(1):
            case NV30_VP_INST_DEST_CLP(2):
                dst.index = NVFX_VP(INST_DEST_FOGC);
                break;
            case NV30_VP_INST_DEST_CLP(3):
            case NV30_VP_INST_DEST_CLP(4):
            case NV30_VP_INST_DEST_CLP(5):
                dst.index = NVFX_VP(INST_DEST_PSZ);
                break;
            case NV40_VP_INST_DEST_COL0: vp->or |= (1 << 0); break;
            case NV40_VP_INST_DEST_COL1: vp->or |= (1 << 1); break;
            case NV40_VP_INST_DEST_BFC0: vp->or |= (1 << 2); break;
            case NV40_VP_INST_DEST_BFC1: vp->or |= (1 << 3); break;
            case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
            case NV40_VP_INST_DEST_PSZ:  vp->or |= (1 << 5); break;
            }
        }

        if(!nvfx->is_nv4x) {
            hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
            hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK;

            /*XXX: no way this is entirely correct, someone needs to
             * figure out what exactly it is.
             */
            hw[3] |= 0x800;
        } else {
            hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
            if (slot == 0) {
                hw[0] |= NV40_VP_INST_VEC_RESULT;
                hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
            } else {
                hw[3] |= NV40_VP_INST_SCA_RESULT;
                hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
            }
        }
        break;
    default:
        assert(0);
    }
}

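/* Append one instruction to the program. insn.op was packed by the
 * arith() macro: bits 7+ select the execution slot (vector/scalar),
 * the low seven bits are the opcode within that slot.
 */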
static void
nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
{
    struct nvfx_context* nvfx = vpc->nvfx;
    struct nvfx_vertex_program *vp = vpc->vp;
    unsigned slot = insn.op >> 7;
    unsigned op = insn.op & 0x7f;
    uint32_t *hw;

    vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
    vpc->vpi = &vp->insns[vp->nr_insns - 1];
    memset(vpc->vpi, 0, sizeof(*vpc->vpi));

    hw = vpc->vpi->data;

    hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT));
    hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
              (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
              (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
              (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
    if(insn.cc_update)
        hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE);

    if(!nvfx->is_nv4x) {
        if(slot == 0)
            hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
        else {
            hw[0] |= ((op >> 4) << NV30_VP_INST_SCA_OPCODEH_SHIFT);
            hw[1] |= ((op & 0xf) << NV30_VP_INST_SCA_OPCODEL_SHIFT);
        }
        // hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
        // hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));

        if (insn.dst.type == NVFXSR_OUTPUT) {
            if (slot)
                hw[3] |= (insn.mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
            else
                hw[3] |= (insn.mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
        } else {
            if (slot)
                hw[3] |= (insn.mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
            else
                hw[3] |= (insn.mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
        }
    } else {
        if (slot == 0) {
            hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
            hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
            hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
        } else {
            hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
            hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
            hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
        }
    }

    emit_dst(nvfx, vpc, hw, slot, insn.dst);
    emit_src(nvfx, vpc, hw, 0, insn.src[0]);
    emit_src(nvfx, vpc, hw, 1, insn.src[1]);
    emit_src(nvfx, vpc, hw, 2, insn.src[2]);

    // if(insn.src[0].indirect || op == NVFX_VP_INST_VEC_OP_ARL)
    //     hw[3] |= NV40_VP_INST_SCA_RESULT;
}

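/* Translate a TGSI source register reference into an nvfx_src,
 * resolving the register file, swizzle, modifiers and (for constants
 * and inputs) optional relative addressing.
 */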
static inline struct nvfx_src
tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
    struct nvfx_src src;

    switch (fsrc->Register.File) {
    case TGSI_FILE_INPUT:
        src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index);
        break;
    case TGSI_FILE_CONSTANT:
        src.reg = vpc->r_const[fsrc->Register.Index];
        break;
    case TGSI_FILE_IMMEDIATE:
        src.reg = vpc->imm[fsrc->Register.Index];
        break;
    case TGSI_FILE_TEMPORARY:
        src.reg = vpc->r_temp[fsrc->Register.Index];
        break;
    default:
        NOUVEAU_ERR("bad src file\n");
        src.reg.index = 0;
        src.reg.type = -1;
        break;
    }

    src.abs = fsrc->Register.Absolute;
    src.negate = fsrc->Register.Negate;
    src.swz[0] = fsrc->Register.SwizzleX;
    src.swz[1] = fsrc->Register.SwizzleY;
    src.swz[2] = fsrc->Register.SwizzleZ;
    src.swz[3] = fsrc->Register.SwizzleW;
    src.indirect = 0;

    if(fsrc->Register.Indirect) {
        if(fsrc->Indirect.File == TGSI_FILE_ADDRESS &&
           (fsrc->Register.File == TGSI_FILE_CONSTANT ||
            fsrc->Register.File == TGSI_FILE_INPUT)) {
            src.indirect = 1;
            src.indirect_reg = fsrc->Indirect.Index;
            src.indirect_swz = fsrc->Indirect.SwizzleX;
        } else {
            src.reg.index = 0;
            src.reg.type = -1;
        }
    }
    return src;
}

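/* Translate a TGSI destination register reference into an nvfx_reg
 * using the per-file register maps built by nvfx_vertprog_prepare().
 */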
static inline struct nvfx_reg
tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
    struct nvfx_reg dst;

    switch (fdst->Register.File) {
    case TGSI_FILE_NULL:
        dst = nvfx_reg(NVFXSR_NONE, 0);
        break;
    case TGSI_FILE_OUTPUT:
        dst = vpc->r_result[fdst->Register.Index];
        break;
    case TGSI_FILE_TEMPORARY:
        dst = vpc->r_temp[fdst->Register.Index];
        break;
    case TGSI_FILE_ADDRESS:
        dst = vpc->r_address[fdst->Register.Index];
        break;
    default:
        NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File);
        dst.index = 0;
        dst.type = 0;
        break;
    }

    return dst;
}

static inline int
tgsi_mask(uint tgsi)
{
    int mask = 0;

    if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_VP_MASK_X;
    if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_VP_MASK_Y;
    if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_VP_MASK_Z;
    if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_VP_MASK_W;
    return mask;
}

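/* Translate a single TGSI instruction. The hardware can read at most
 * one distinct attribute and one distinct constant/immediate per
 * instruction, so extra operands are first copied into temporaries.
 */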
static boolean
nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
                                unsigned idx, const struct tgsi_full_instruction *finst)
{
    struct nvfx_src src[3], tmp;
    struct nvfx_reg dst;
    struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
    struct nvfx_insn insn;
    struct nvfx_relocation reloc;
    struct nvfx_loop_entry loop;
    int mask;
    int ai = -1, ci = -1, ii = -1;
    int i;

    if (finst->Instruction.Opcode == TGSI_OPCODE_END)
        return TRUE;

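    /* Gather sources in two passes: temporaries can always be read
     * directly, while inputs, constants and immediates are limited to
     * one distinct register each (tracked in ai/ci/ii); any further
     * ones are copied through a temp with a MOV first.
     */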
    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
        const struct tgsi_full_src_register *fsrc;

        fsrc = &finst->Src[i];
        if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
            src[i] = tgsi_src(vpc, fsrc);
        }
    }

    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
        const struct tgsi_full_src_register *fsrc;

        fsrc = &finst->Src[i];

        switch (fsrc->Register.File) {
        case TGSI_FILE_INPUT:
            if (ai == -1 || ai == fsrc->Register.Index) {
                ai = fsrc->Register.Index;
                src[i] = tgsi_src(vpc, fsrc);
            } else {
                src[i] = nvfx_src(temp(vpc));
                nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
            }
            break;
        case TGSI_FILE_CONSTANT:
            if ((ci == -1 && ii == -1) ||
                ci == fsrc->Register.Index) {
                ci = fsrc->Register.Index;
                src[i] = tgsi_src(vpc, fsrc);
            } else {
                src[i] = nvfx_src(temp(vpc));
                nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
            }
            break;
        case TGSI_FILE_IMMEDIATE:
            if ((ci == -1 && ii == -1) ||
                ii == fsrc->Register.Index) {
                ii = fsrc->Register.Index;
                src[i] = tgsi_src(vpc, fsrc);
            } else {
                src[i] = nvfx_src(temp(vpc));
                nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
            }
            break;
        case TGSI_FILE_TEMPORARY:
            /* handled above */
            break;
        default:
            NOUVEAU_ERR("bad src file\n");
            return FALSE;
        }
    }

    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
        if(src[i].reg.type < 0)
            return FALSE;
    }

    if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS &&
       finst->Instruction.Opcode != TGSI_OPCODE_ARL)
        return FALSE;

    dst = tgsi_dst(vpc, &finst->Dst[0]);
    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);

    switch (finst->Instruction.Opcode) {
    case TGSI_OPCODE_ABS:
        nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, abs(src[0]), none, none));
        break;
    case TGSI_OPCODE_ADD:
        nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, src[1]));
        break;
    case TGSI_OPCODE_ARL:
        nvfx_vp_emit(vpc, arith(VEC, ARL, dst, mask, src[0], none, none));
        break;
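    /* CMP has no direct equivalent: write src[0] to the condition
     * register, then do two predicated MOVs, taking src[2] where the
     * component is >= 0 and src[1] where it is negative.
     */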
    case TGSI_OPCODE_CMP:
        insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
        insn.cc_update = 1;
        nvfx_vp_emit(vpc, insn);

        insn = arith(VEC, MOV, dst, mask, src[2], none, none);
        insn.cc_test = NVFX_COND_GE;
        nvfx_vp_emit(vpc, insn);

        insn = arith(VEC, MOV, dst, mask, src[1], none, none);
        insn.cc_test = NVFX_COND_LT;
        nvfx_vp_emit(vpc, insn);
        break;
    case TGSI_OPCODE_COS:
        nvfx_vp_emit(vpc, arith(SCA, COS, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_DP2:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none));
        nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, swz(tmp, X, X, X, X), none, swz(tmp, Y, Y, Y, Y)));
        break;
    case TGSI_OPCODE_DP3:
        nvfx_vp_emit(vpc, arith(VEC, DP3, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_DP4:
        nvfx_vp_emit(vpc, arith(VEC, DP4, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_DPH:
        nvfx_vp_emit(vpc, arith(VEC, DPH, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_DST:
        nvfx_vp_emit(vpc, arith(VEC, DST, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_EX2:
        nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_EXP:
        nvfx_vp_emit(vpc, arith(SCA, EXP, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_FLR:
        nvfx_vp_emit(vpc, arith(VEC, FLR, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_FRC:
        nvfx_vp_emit(vpc, arith(VEC, FRC, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_LG2:
        nvfx_vp_emit(vpc, arith(SCA, LG2, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_LIT:
        nvfx_vp_emit(vpc, arith(SCA, LIT, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_LOG:
        nvfx_vp_emit(vpc, arith(SCA, LOG, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_LRP:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(VEC, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
        nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], tmp));
        break;
    case TGSI_OPCODE_MAD:
        nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], src[2]));
        break;
    case TGSI_OPCODE_MAX:
        nvfx_vp_emit(vpc, arith(VEC, MAX, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_MIN:
        nvfx_vp_emit(vpc, arith(VEC, MIN, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_MOV:
        nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_MUL:
        nvfx_vp_emit(vpc, arith(VEC, MUL, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_NOP:
        break;
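    /* pow(x, y) is computed as exp2(y * log2(x)), all on the x
     * component via the scalar unit.
     */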
    case TGSI_OPCODE_POW:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(SCA, LG2, tmp.reg, NVFX_VP_MASK_X, none, none, swz(src[0], X, X, X, X)));
        nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
        nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, swz(tmp, X, X, X, X)));
        break;
    case TGSI_OPCODE_RCP:
        nvfx_vp_emit(vpc, arith(SCA, RCP, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_RSQ:
        nvfx_vp_emit(vpc, arith(SCA, RSQ, dst, mask, none, none, abs(src[0])));
        break;
    case TGSI_OPCODE_SEQ:
        nvfx_vp_emit(vpc, arith(VEC, SEQ, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SFL:
        nvfx_vp_emit(vpc, arith(VEC, SFL, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SGE:
        nvfx_vp_emit(vpc, arith(VEC, SGE, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SGT:
        nvfx_vp_emit(vpc, arith(VEC, SGT, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SIN:
        nvfx_vp_emit(vpc, arith(SCA, SIN, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_SLE:
        nvfx_vp_emit(vpc, arith(VEC, SLE, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SLT:
        nvfx_vp_emit(vpc, arith(VEC, SLT, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SNE:
        nvfx_vp_emit(vpc, arith(VEC, SNE, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SSG:
        /* SSG is unary in TGSI; passing src[1] here would read an
         * uninitialized operand. */
        nvfx_vp_emit(vpc, arith(VEC, SSG, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_STR:
        nvfx_vp_emit(vpc, arith(VEC, STR, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SUB:
        nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, neg(src[1])));
        break;
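    /* trunc(x) = floor(|x|) with the sign of x restored: the sign of
     * src[0] is latched into the condition register, and the negated
     * result is written back over components that were negative.
     */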
    case TGSI_OPCODE_TRUNC:
        tmp = nvfx_src(temp(vpc));
        insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
        insn.cc_update = 1;
        nvfx_vp_emit(vpc, insn);

        nvfx_vp_emit(vpc, arith(VEC, FLR, tmp.reg, mask, abs(src[0]), none, none));
        nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, tmp, none, none));

        insn = arith(VEC, MOV, dst, mask, neg(tmp), none, none);
        insn.cc_test = NVFX_COND_LT;
        nvfx_vp_emit(vpc, insn);
        break;
    case TGSI_OPCODE_XPD:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
        nvfx_vp_emit(vpc, arith(VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
        break;

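    /* IF: latch the condition into the condition register, then emit a
     * branch to the matching ELSE/ENDIF that is taken when the x
     * component compared equal to zero (i.e. the condition is false).
     * The branch target is a TGSI instruction index, fixed up later
     * through label_relocs.
     */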
    case TGSI_OPCODE_IF:
        insn = arith(VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
        insn.cc_update = 1;
        nvfx_vp_emit(vpc, insn);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = finst->Label.Label + 1;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        insn = arith(SCA, BRA, none.reg, 0, none, none, none);
        insn.cc_test = NVFX_COND_EQ;
        insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0;
        nvfx_vp_emit(vpc, insn);
        break;

    case TGSI_OPCODE_ELSE:
    case TGSI_OPCODE_BRA:
    case TGSI_OPCODE_CAL:
        reloc.location = vpc->vp->nr_insns;
        reloc.target = finst->Label.Label;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
            insn = arith(SCA, CAL, none.reg, 0, none, none, none);
        else
            insn = arith(SCA, BRA, none.reg, 0, none, none, none);
        nvfx_vp_emit(vpc, insn);
        break;

    case TGSI_OPCODE_RET:
        tmp = none;
        tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0;
        nvfx_vp_emit(vpc, arith(SCA, RET, none.reg, 0, none, none, tmp));
        break;

    case TGSI_OPCODE_BGNSUB:
    case TGSI_OPCODE_ENDSUB:
    case TGSI_OPCODE_ENDIF:
        /* nothing to do here */
        break;

    case TGSI_OPCODE_BGNLOOP:
        loop.cont_target = idx;
        loop.brk_target = finst->Label.Label + 1;
        util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop);
        break;

    case TGSI_OPCODE_ENDLOOP:
        loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = loop.cont_target;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
        break;

    case TGSI_OPCODE_CONT:
        loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = loop.cont_target;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
        break;

    case TGSI_OPCODE_BRK:
        loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = loop.brk_target;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
        break;

    default:
        NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
        return FALSE;
    }

    release_temps(vpc);
    return TRUE;
}

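/* Map one TGSI output declaration to a hardware result register and
 * record it in r_result. Generic varyings are placed according to the
 * texcoord layout in generic_to_fp_input, which is shared with the
 * fragment program side.
 */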
static boolean
nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
                                const struct tgsi_full_declaration *fdec)
{
    unsigned idx = fdec->Range.First;
    int hw;

    switch (fdec->Semantic.Name) {
    case TGSI_SEMANTIC_POSITION:
        hw = NVFX_VP(INST_DEST_POS);
        vpc->hpos_idx = idx;
        break;
    case TGSI_SEMANTIC_COLOR:
        if (fdec->Semantic.Index == 0) {
            hw = NVFX_VP(INST_DEST_COL0);
        } else if (fdec->Semantic.Index == 1) {
            hw = NVFX_VP(INST_DEST_COL1);
        } else {
            NOUVEAU_ERR("bad colour semantic index\n");
            return FALSE;
        }
        break;
    case TGSI_SEMANTIC_BCOLOR:
        if (fdec->Semantic.Index == 0) {
            hw = NVFX_VP(INST_DEST_BFC0);
        } else if (fdec->Semantic.Index == 1) {
            hw = NVFX_VP(INST_DEST_BFC1);
        } else {
            NOUVEAU_ERR("bad bcolour semantic index\n");
            return FALSE;
        }
        break;
    case TGSI_SEMANTIC_FOG:
        hw = NVFX_VP(INST_DEST_FOGC);
        break;
    case TGSI_SEMANTIC_PSIZE:
        hw = NVFX_VP(INST_DEST_PSZ);
        break;
    case TGSI_SEMANTIC_GENERIC:
        hw = (vpc->vp->generic_to_fp_input[fdec->Semantic.Index] & 0xf) - NVFX_FP_OP_INPUT_SRC_TC(0);
        if(hw <= 8)
            hw = NVFX_VP(INST_DEST_TC(hw));
        else if(hw == 9) /* TODO: this is correct, but how does this overlapping work exactly? */
            hw = NV40_VP_INST_DEST_PSZ;
        else
            assert(0);
        break;
    case TGSI_SEMANTIC_EDGEFLAG:
        /* not really an error, just a fallback */
        NOUVEAU_ERR("cannot handle edgeflag output\n");
        return FALSE;
    default:
        NOUVEAU_ERR("bad output semantic\n");
        return FALSE;
    }

    vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
    return TRUE;
}

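/* Scan the TGSI program: count immediates, find the highest-numbered
 * temporary/address/constant register, translate output declarations,
 * and build the generic-varying layout consumed by the fragment
 * program. The per-file register maps are then allocated accordingly.
 */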
static boolean
nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
{
    struct tgsi_parse_context p;
    int high_const = -1, high_temp = -1, high_addr = -1, nr_imm = 0, i;
    struct util_semantic_set set;
    unsigned char sem_layout[10];
    unsigned num_outputs;
    unsigned num_texcoords = nvfx->is_nv4x ? 10 : 8;

    num_outputs = util_semantic_set_from_program_file(&set, vpc->pipe.tokens, TGSI_FILE_OUTPUT);

    if(num_outputs > num_texcoords) {
        NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs);
        return FALSE;
    }
    util_semantic_layout_from_set(sem_layout, &set, num_texcoords, num_texcoords);

    /* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */
    memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
    for(int i = 0; i < 10; ++i) {
        if(sem_layout[i] == 0xff)
            continue;
        //printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
        vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
    }

    vpc->vp->sprite_fp_input = -1;
    for(int i = 0; i < 10; ++i) {
        if(sem_layout[i] == 0xff) {
            vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
            break;
        }
    }

    tgsi_parse_init(&p, vpc->pipe.tokens);
    while (!tgsi_parse_end_of_tokens(&p)) {
        const union tgsi_full_token *tok = &p.FullToken;

        tgsi_parse_token(&p);
        switch(tok->Token.Type) {
        case TGSI_TOKEN_TYPE_IMMEDIATE:
            nr_imm++;
            break;
        case TGSI_TOKEN_TYPE_DECLARATION:
        {
            const struct tgsi_full_declaration *fdec;

            fdec = &p.FullToken.FullDeclaration;
            switch (fdec->Declaration.File) {
            case TGSI_FILE_TEMPORARY:
                if (fdec->Range.Last > high_temp)
                    high_temp = fdec->Range.Last;
                break;
            case TGSI_FILE_ADDRESS:
                if (fdec->Range.Last > high_addr)
                    high_addr = fdec->Range.Last;
                break;
            case TGSI_FILE_CONSTANT:
                if (fdec->Range.Last > high_const)
                    high_const = fdec->Range.Last;
                break;
            case TGSI_FILE_OUTPUT:
                if (!nvfx_vertprog_parse_decl_output(nvfx, vpc, fdec))
                    return FALSE;
                break;
            default:
                break;
            }
        }
            break;
        default:
            break;
        }
    }
    tgsi_parse_free(&p);

    if (nr_imm) {
        vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_reg));
        assert(vpc->imm);
    }

    if (++high_temp) {
        vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
        for (i = 0; i < high_temp; i++)
            vpc->r_temp[i] = temp(vpc);
    }

    if (++high_addr) {
        vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg));
        for (i = 0; i < high_addr; i++)
            vpc->r_address[i] = nvfx_reg(NVFXSR_TEMP, i);
    }

    if(++high_const) {
        vpc->r_const = CALLOC(high_const, sizeof(struct nvfx_reg));
        for (i = 0; i < high_const; i++)
            vpc->r_const[i] = constant(vpc, i, 0, 0, 0, 0);
    }

    vpc->r_temps_discard = 0;
    return TRUE;
}

DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)

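/* Translate a complete TGSI vertex shader into hardware instructions.
 * When user clip planes are in use, six constant slots are reserved up
 * front and HPOS is redirected through a temporary so that clip
 * distances can be computed from it at the end of the program.
 */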
static struct nvfx_vertex_program*
nvfx_vertprog_translate(struct nvfx_context *nvfx, const struct pipe_shader_state* vps)
{
    struct tgsi_parse_context parse;
    struct nvfx_vertex_program* vp = NULL;
    struct nvfx_vpc *vpc = NULL;
    struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
    struct util_dynarray insns;
    int i;

    tgsi_parse_init(&parse, vps->tokens);

    vp = CALLOC_STRUCT(nvfx_vertex_program);
    if(!vp)
        goto out_err;

    vpc = CALLOC_STRUCT(nvfx_vpc);
    if (!vpc)
        goto out_err;

    vpc->nvfx = nvfx;
    vpc->vp = vp;
    vpc->pipe = *vps;

    {
        // TODO: use a 64-bit atomic here!
        static unsigned long long id = 0;
        vp->id = ++id;
    }

    /* reserve space for ucps */
    if(nvfx->use_vp_clipping) {
        for(i = 0; i < 6; ++i)
            constant(vpc, -1, 0, 0, 0, 0);
    }

    if (!nvfx_vertprog_prepare(nvfx, vpc))
        goto out_err;

    /* Redirect post-transform vertex position to a temp if user clip
     * planes are enabled. We need to append code to the vtxprog
     * to handle clip planes later.
     */
    /* TODO: maybe support patching this depending on whether there are ucps: not sure if it really matters much */
    if (nvfx->use_vp_clipping) {
        vpc->r_result[vpc->hpos_idx] = temp(vpc);
        vpc->r_temps_discard = 0;
    }

    util_dynarray_init(&insns);
    while (!tgsi_parse_end_of_tokens(&parse)) {
        tgsi_parse_token(&parse);

        switch (parse.FullToken.Token.Type) {
        case TGSI_TOKEN_TYPE_IMMEDIATE:
        {
            const struct tgsi_full_immediate *imm;

            imm = &parse.FullToken.FullImmediate;
            assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
            assert(imm->Immediate.NrTokens == 4 + 1);
            vpc->imm[vpc->nr_imm++] =
                constant(vpc, -1,
                         imm->u[0].Float,
                         imm->u[1].Float,
                         imm->u[2].Float,
                         imm->u[3].Float);
        }
            break;
        case TGSI_TOKEN_TYPE_INSTRUCTION:
        {
            const struct tgsi_full_instruction *finst;
            unsigned idx = insns.size >> 2;
            util_dynarray_append(&insns, unsigned, vp->nr_insns);
            finst = &parse.FullToken.FullInstruction;
            if (!nvfx_vertprog_parse_instruction(nvfx, vpc, idx, finst))
                goto out_err;
        }
            break;
        default:
            break;
        }
    }

    util_dynarray_append(&insns, unsigned, vp->nr_insns);

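    /* The 'insns' table maps each TGSI instruction index to the index
     * of the first hardware instruction emitted for it. Use it to turn
     * the TGSI-level label relocations collected above into hardware
     * branch relocations.
     */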
    for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_relocation)) {
        struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + i);
        struct nvfx_relocation hw_reloc;

        hw_reloc.location = label_reloc->location;
        hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];

        //debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target);

        util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc);
    }
    util_dynarray_fini(&insns);
    util_dynarray_trim(&vp->branch_relocs);

    /* XXX: what if we add a RET before?! make sure we jump here...*/

    /* Write out HPOS if it was redirected to a temp earlier */
    if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
        struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT,
                                        NVFX_VP(INST_DEST_POS));
        struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);

        nvfx_vp_emit(vpc, arith(VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none));
    }

    /* Insert code to handle user clip planes */
    if(nvfx->use_vp_clipping) {
        for (i = 0; i < 6; i++) {
            struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(i));
            struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, i));
            struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
            unsigned mask;

            if(nvfx->is_nv4x) {
                switch (i) {
                case 0: case 3: mask = NVFX_VP_MASK_Y; break;
                case 1: case 4: mask = NVFX_VP_MASK_Z; break;
                case 2: case 5: mask = NVFX_VP_MASK_W; break;
                default:
                    NOUVEAU_ERR("invalid clip dist #%d\n", i);
                    goto out_err;
                }
            }
            else
                mask = NVFX_VP_MASK_X;

            nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
        }
    }
    else {
        if(vp->nr_insns)
            vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;

        nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
        vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
    }

    if(debug_get_option_nvfx_dump_vp()) {
        debug_printf("\n");
        tgsi_dump(vpc->pipe.tokens, 0);

        debug_printf("\n%s vertex program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
        for (i = 0; i < vp->nr_insns; i++)
            debug_printf("%3u: %08x %08x %08x %08x\n", i, vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
        debug_printf("\n");
    }

    vp->clip_nr = -1;
    vp->exec_start = -1;

out:
    tgsi_parse_free(&parse);
    if(vpc) {
        util_dynarray_fini(&vpc->label_relocs);
        util_dynarray_fini(&vpc->loop_stack);
        FREE(vpc->r_temp);
        FREE(vpc->r_address);
        FREE(vpc->r_const);
        FREE(vpc->imm);
        FREE(vpc);
    }
    return vp;

out_err:
    FREE(vp);
    vp = NULL;
    goto out;
}

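/* Build a passthrough vertex shader with ureg for the swtnl path: the
 * draw module has already transformed the vertices, so the program
 * only copies each input straight to the corresponding output.
 */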
static struct nvfx_vertex_program*
nvfx_vertprog_translate_draw_vp(struct nvfx_context *nvfx, struct nvfx_pipe_vertex_program* pvp)
{
    struct nvfx_vertex_program* vp = NULL;
    struct pipe_shader_state vps;
    struct ureg_program *ureg = NULL;
    unsigned num_outputs = MIN2(pvp->info.num_outputs, 16);

    ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
    if(ureg == NULL)
        return NULL;

    for (unsigned i = 0; i < num_outputs; i++)
        ureg_MOV(ureg, ureg_DECL_output(ureg, pvp->info.output_semantic_name[i], pvp->info.output_semantic_index[i]), ureg_DECL_vs_input(ureg, i));

    ureg_END(ureg);

    vps.tokens = ureg_get_tokens(ureg, 0);
    vp = nvfx_vertprog_translate(nvfx, &vps);
    ureg_free_tokens(vps.tokens);
    ureg_destroy(ureg);

    return vp;
}

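/* Validate the current vertex program for drawing: translate it on
 * first use, allocate (and if needed evict) space in the execution and
 * constant heaps, patch branch targets and constant indices if those
 * segments moved, and upload dirty constants and code.
 */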
boolean
nvfx_vertprog_validate(struct nvfx_context *nvfx)
{
    struct nvfx_screen *screen = nvfx->screen;
    struct nouveau_channel *chan = screen->base.channel;
    struct nouveau_grobj *eng3d = screen->eng3d;
    struct nvfx_pipe_vertex_program *pvp = nvfx->vertprog;
    struct nvfx_vertex_program* vp;
    struct pipe_resource *constbuf;
    boolean upload_code = FALSE, upload_data = FALSE;
    int i;

    if (nvfx->render_mode == HW) {
        nvfx->fallback_swtnl &= ~NVFX_NEW_VERTPROG;
        vp = pvp->vp;

        if(!vp) {
            vp = nvfx_vertprog_translate(nvfx, &pvp->pipe);
            if(!vp)
                vp = NVFX_VP_FAILED;
            pvp->vp = vp;
        }

        if(vp == NVFX_VP_FAILED) {
            nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
            return FALSE;
        }

        constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
    } else {
        vp = pvp->draw_vp;
        if(!vp) {
            pvp->draw_vp = vp = nvfx_vertprog_translate_draw_vp(nvfx, pvp);
            if(!vp) {
                _debug_printf("Error: unable to create a swtnl passthrough vertex shader: aborting.\n");
                abort();
            }
        }
        constbuf = NULL;
    }

    nvfx->hw_vertprog = vp;

    /* Allocate hw vtxprog exec slots */
    if (!vp->exec) {
        struct nouveau_resource *heap = nvfx->screen->vp_exec_heap;
        uint vplen = vp->nr_insns;

        if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
            while (heap->next && heap->size < vplen) {
                struct nvfx_vertex_program *evict;

                evict = heap->next->priv;
                nouveau_resource_free(&evict->exec);
            }

            if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
                debug_printf("Vertex shader too long: %u instructions\n", vplen);
                nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
                return FALSE;
            }
        }

        upload_code = TRUE;
    }

    /* Allocate hw vtxprog const slots */
    if (vp->nr_consts && !vp->data) {
        struct nouveau_resource *heap = nvfx->screen->vp_data_heap;

        if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
            while (heap->next && heap->size < vp->nr_consts) {
                struct nvfx_vertex_program *evict;

                evict = heap->next->priv;
                nouveau_resource_free(&evict->data);
            }

            if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
                debug_printf("Vertex shader uses too many constants: %u constants\n", vp->nr_consts);
                nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
                return FALSE;
            }
        }

        //printf("start at %u nc %u\n", vp->data->start, vp->nr_consts);

        /*XXX: handle this some day */
        assert(vp->data->start >= vp->data_start_min);

        upload_data = TRUE;
        if (vp->data_start != vp->data->start)
            upload_code = TRUE;
    }

    /* If exec or data segments moved we need to patch the program to
     * fixup offsets and register IDs.
     */
    if (vp->exec_start != vp->exec->start) {
        //printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start);
        for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_relocation)) {
            struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->branch_relocs.data + i);
            uint32_t* hw = vp->insns[reloc->location].data;
            unsigned target = vp->exec->start + reloc->target;

            //debug_printf("vp_reloc hw %u -> hw %u\n", reloc->location, target);

            if(!nvfx->is_nv4x) {
                hw[2] &= ~NV30_VP_INST_IADDR_MASK;
                hw[2] |= (target & 0x1ff) << NV30_VP_INST_IADDR_SHIFT;
            }
            else {
                hw[3] &= ~NV40_VP_INST_IADDRL_MASK;
                hw[3] |= (target & 7) << NV40_VP_INST_IADDRL_SHIFT;

                hw[2] &= ~NV40_VP_INST_IADDRH_MASK;
                hw[2] |= ((target >> 3) & 0x3f) << NV40_VP_INST_IADDRH_SHIFT;
            }
        }

        vp->exec_start = vp->exec->start;
    }

    if (vp->nr_consts && vp->data_start != vp->data->start) {
        for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation)) {
            struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
            struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location];

            //printf("reloc %i to %i + %i\n", reloc->location, vp->data->start, reloc->target);

            vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
            vpi->data[1] |=
                (reloc->target + vp->data->start) <<
                NVFX_VP(INST_CONST_SRC_SHIFT);
        }

        vp->data_start = vp->data->start;
        upload_code = TRUE;
    }

    /* Update + Upload constant values */
    if (vp->nr_consts) {
        float *map = NULL;

        if (constbuf)
            map = (float*)nvfx_buffer(constbuf)->data;

        /*
        for (i = 0; i < 512; i++) {
            float v[4] = {0.1, 0.2, 0.3, 0.4};
            BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
            OUT_RING  (chan, i);
            OUT_RINGp (chan, (uint32_t *)v, 4);
            printf("frob %i\n", i);
        }
        */

        for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
            struct nvfx_vertex_program_data *vpd = &vp->consts[i];

            if (vpd->index >= 0) {
                if (!upload_data &&
                    !memcmp(vpd->value, &map[vpd->index * 4],
                            4 * sizeof(float)))
                    continue;
                memcpy(vpd->value, &map[vpd->index * 4],
                       4 * sizeof(float));
            }

            //printf("upload into %i + %i: %f %f %f %f\n", vp->data->start, i, vpd->value[0], vpd->value[1], vpd->value[2], vpd->value[3]);

            BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
            OUT_RING  (chan, i + vp->data->start);
            OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
        }
    }

    /* Upload vtxprog */
    if (upload_code) {
        BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
        OUT_RING  (chan, vp->exec->start);
        for (i = 0; i < vp->nr_insns; i++) {
            BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
            //printf("%08x %08x %08x %08x\n", vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
            OUT_RINGp (chan, vp->insns[i].data, 4);
        }
        vp->clip_nr = -1;
    }

    if(nvfx->dirty & (NVFX_NEW_VERTPROG)) {
        WAIT_RING(chan, 6);
        OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
        OUT_RING(chan, vp->exec->start);
        if(nvfx->is_nv4x) {
            OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
            OUT_RING(chan, vp->ir);
        }
    }

    return TRUE;
}

void
nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
{
    if (vp->nr_insns)
        FREE(vp->insns);

    if (vp->nr_consts)
        FREE(vp->consts);

    nouveau_resource_free(&vp->exec);
    nouveau_resource_free(&vp->data);

    util_dynarray_fini(&vp->branch_relocs);
    util_dynarray_fini(&vp->const_relocs);
    FREE(vp);
}

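/* pipe_context CSO hooks: creation only duplicates and scans the TGSI;
 * the actual hardware translation is deferred to
 * nvfx_vertprog_validate() so it can depend on the rest of the state
 * (e.g. the render mode and clip-plane usage).
 */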
static void *
nvfx_vp_state_create(struct pipe_context *pipe, const struct pipe_shader_state *cso)
{
    struct nvfx_context *nvfx = nvfx_context(pipe);
    struct nvfx_pipe_vertex_program *pvp;

    pvp = CALLOC(1, sizeof(struct nvfx_pipe_vertex_program));
    pvp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
    tgsi_scan_shader(pvp->pipe.tokens, &pvp->info);
    pvp->draw_elements = MAX2(1, MIN2(pvp->info.num_outputs, 16));
    pvp->draw_no_elements = pvp->info.num_outputs == 0;

    return (void *)pvp;
}

static void
nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
{
    struct nvfx_context *nvfx = nvfx_context(pipe);

    nvfx->vertprog = hwcso;
    nvfx->dirty |= NVFX_NEW_VERTPROG;
    nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
}

static void
nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
{
    struct nvfx_context *nvfx = nvfx_context(pipe);
    struct nvfx_pipe_vertex_program *pvp = hwcso;

    if(pvp->draw_vs)
        draw_delete_vertex_shader(nvfx->draw, pvp->draw_vs);
    if(pvp->vp && pvp->vp != NVFX_VP_FAILED)
        nvfx_vertprog_destroy(nvfx, pvp->vp);
    if(pvp->draw_vp)
        nvfx_vertprog_destroy(nvfx, pvp->draw_vp);
    FREE((void*)pvp->pipe.tokens);
    FREE(pvp);
}

void
nvfx_init_vertprog_functions(struct nvfx_context *nvfx)
{
    nvfx->pipe.create_vs_state = nvfx_vp_state_create;
    nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
    nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
}