#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_linkage.h"
#include "util/u_debug.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_util.h"

#include "draw/draw_context.h"

#include "nvfx_context.h"
#include "nvfx_state.h"
#include "nvfx_resource.h"
/* TODO (at least...):
 * 1. Indexed consts + ARL
 * 2. NV_vp11, NV_vp2, NV_vp3 features
 *    - extra arith opcodes
 *    - branching
 *    - texture sampling
 *    - indexed attribs
 *    - indexed results
 * 3. bugs
 */

#include "nv30_vertprog.h"
#include "nv40_vertprog.h"

struct nvfx_loop_entry
{
    unsigned brk_target;
    unsigned cont_target;
};

struct nvfx_vpc {
    struct nvfx_context* nvfx;
    struct nvfx_vertex_program *vp;

    struct nvfx_vertex_program_exec *vpi;

    unsigned r_temps;
    unsigned r_temps_discard;
    struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
    struct nvfx_reg *r_address;
    struct nvfx_reg *r_temp;

    struct nvfx_reg *imm;
    unsigned nr_imm;

    unsigned hpos_idx;

    struct util_dynarray label_relocs;
    struct util_dynarray loop_stack;
};

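/* Allocate a temporary hw register from the free bitmask in vpc->r_temps.
 * Temps taken here are also marked in r_temps_discard, so they are given
 * back at the end of each TGSI instruction by release_temps().
 */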
static struct nvfx_reg
temp(struct nvfx_vpc *vpc)
{
    int idx = ffs(~vpc->r_temps) - 1;

    if (idx < 0) {
        NOUVEAU_ERR("out of temps!!\n");
        assert(0);
        return nvfx_reg(NVFXSR_TEMP, 0);
    }

    vpc->r_temps |= (1 << idx);
    vpc->r_temps_discard |= (1 << idx);
    return nvfx_reg(NVFXSR_TEMP, idx);
}

static inline void
release_temps(struct nvfx_vpc *vpc)
{
    vpc->r_temps &= ~vpc->r_temps_discard;
    vpc->r_temps_discard = 0;
}

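/* Find or create a slot in the program's constant buffer. A non-negative
 * "pipe" index refers to an entry in the user constant buffer and is
 * reused if already allocated; pipe == -1 allocates an anonymous slot
 * holding the immediate value (x, y, z, w).
 */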
static struct nvfx_reg
constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
{
    struct nvfx_vertex_program *vp = vpc->vp;
    struct nvfx_vertex_program_data *vpd;
    int idx;

    if (pipe >= 0) {
        for (idx = 0; idx < vp->nr_consts; idx++) {
            if (vp->consts[idx].index == pipe)
                return nvfx_reg(NVFXSR_CONST, idx);
        }
    }

    idx = vp->nr_consts++;
    vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
    vpd = &vp->consts[idx];

    vpd->index = pipe;
    vpd->value[0] = x;
    vpd->value[1] = y;
    vpd->value[2] = z;
    vpd->value[3] = w;
    return nvfx_reg(NVFXSR_CONST, idx);
}

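/* Build an nvfx_insn for an arithmetic op: the target slot (VEC or SCA)
 * is packed into bit 7 of the opcode field and unpacked again by
 * nvfx_vp_emit().
 */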
#define arith(s,o,d,m,s0,s1,s2) \
    nvfx_insn(0, (NVFX_VP_INST_SLOT_##s << 7) | NVFX_VP_INST_##s##_OP_##o, -1, (d), (m), (s0), (s1), (s2))

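/* Encode one source operand into the 32-bit "sr" descriptor and merge it
 * into the instruction words. Constant sources are not resolved here;
 * they are recorded in vp->const_relocs and patched with the final
 * constant buffer offset when the program is uploaded.
 */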
static void
emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_src src)
{
    struct nvfx_vertex_program *vp = vpc->vp;
    uint32_t sr = 0;
    struct nvfx_relocation reloc;

    switch (src.reg.type) {
    case NVFXSR_TEMP:
        sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
        sr |= (src.reg.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
        break;
    case NVFXSR_INPUT:
        sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) << NVFX_VP(SRC_REG_TYPE_SHIFT));
        vp->ir |= (1 << src.reg.index);
        hw[1] |= (src.reg.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
        break;
    case NVFXSR_CONST:
        sr |= (NVFX_VP(SRC_REG_TYPE_CONST) << NVFX_VP(SRC_REG_TYPE_SHIFT));
        reloc.location = vp->nr_insns - 1;
        reloc.target = src.reg.index;
        util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc);
        break;
    case NVFXSR_NONE:
        sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) << NVFX_VP(SRC_REG_TYPE_SHIFT));
        break;
    default:
        assert(0);
    }

    if (src.negate)
        sr |= NVFX_VP(SRC_NEGATE);

    if (src.abs)
        hw[0] |= (1 << (21 + pos));

    sr |= ((src.swz[0] << NVFX_VP(SRC_SWZ_X_SHIFT)) |
           (src.swz[1] << NVFX_VP(SRC_SWZ_Y_SHIFT)) |
           (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT)) |
           (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT)));

    switch (pos) {
    case 0:
        hw[1] |= ((sr & NVFX_VP(SRC0_HIGH_MASK)) >>
                  NVFX_VP(SRC0_HIGH_SHIFT)) << NVFX_VP(INST_SRC0H_SHIFT);
        hw[2] |= (sr & NVFX_VP(SRC0_LOW_MASK)) <<
                 NVFX_VP(INST_SRC0L_SHIFT);
        break;
    case 1:
        hw[2] |= sr << NVFX_VP(INST_SRC1_SHIFT);
        break;
    case 2:
        hw[2] |= ((sr & NVFX_VP(SRC2_HIGH_MASK)) >>
                  NVFX_VP(SRC2_HIGH_SHIFT)) << NVFX_VP(INST_SRC2H_SHIFT);
        hw[3] |= (sr & NVFX_VP(SRC2_LOW_MASK)) <<
                 NVFX_VP(INST_SRC2L_SHIFT);
        break;
    default:
        assert(0);
    }
}

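/* Encode the destination register. Outputs differ between nv30 and nv4x:
 * on nv4x the NV30-style CLP() indices used internally for user clip
 * planes are remapped onto the FOGC/PSZ result registers, whose spare
 * components hold the clip distances.
 */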
static void
emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_reg dst)
{
    struct nvfx_vertex_program *vp = vpc->vp;

    switch (dst.type) {
    case NVFXSR_NONE:
        if(!nvfx->is_nv4x)
            hw[0] |= NV30_VP_INST_DEST_TEMP_ID_MASK;
        else {
            hw[3] |= NV40_VP_INST_DEST_MASK;
            if (slot == 0)
                hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
            else
                hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
        }
        break;
    case NVFXSR_TEMP:
        if(!nvfx->is_nv4x)
            hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
        else {
            hw[3] |= NV40_VP_INST_DEST_MASK;
            if (slot == 0)
                hw[0] |= (dst.index << NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
            else
                hw[3] |= (dst.index << NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
        }
        break;
    case NVFXSR_OUTPUT:
        /* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
        if(nvfx->is_nv4x) {
            switch (dst.index) {
            case NV30_VP_INST_DEST_CLP(0):
            case NV30_VP_INST_DEST_CLP(1):
            case NV30_VP_INST_DEST_CLP(2):
                dst.index = NVFX_VP(INST_DEST_FOGC);
                break;
            case NV30_VP_INST_DEST_CLP(3):
            case NV30_VP_INST_DEST_CLP(4):
            case NV30_VP_INST_DEST_CLP(5):
                dst.index = NVFX_VP(INST_DEST_PSZ);
                break;
            case NV40_VP_INST_DEST_COL0: vp->or |= (1 << 0); break;
            case NV40_VP_INST_DEST_COL1: vp->or |= (1 << 1); break;
            case NV40_VP_INST_DEST_BFC0: vp->or |= (1 << 2); break;
            case NV40_VP_INST_DEST_BFC1: vp->or |= (1 << 3); break;
            case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
            case NV40_VP_INST_DEST_PSZ:  vp->or |= (1 << 5); break;
            }
        }

        if(!nvfx->is_nv4x) {
            hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
            hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK;

            /*XXX: no way this is entirely correct, someone needs to
             * figure out what exactly it is.
             */
            hw[3] |= 0x800;
        } else {
            hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
            if (slot == 0) {
                hw[0] |= NV40_VP_INST_VEC_RESULT;
                hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
            } else {
                hw[3] |= NV40_VP_INST_SCA_RESULT;
                hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
            }
        }
        break;
    default:
        assert(0);
    }
}

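/* Append one instruction to the program. The slot (VEC/SCA) and opcode
 * packed by arith() are unpacked here, the condition-code test/update
 * fields are encoded, and the operands are written via emit_dst()/emit_src().
 */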
static void
nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
{
    struct nvfx_context* nvfx = vpc->nvfx;
    struct nvfx_vertex_program *vp = vpc->vp;
    unsigned slot = insn.op >> 7;
    unsigned op = insn.op & 0x7f;
    uint32_t *hw;

    vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
    vpc->vpi = &vp->insns[vp->nr_insns - 1];
    memset(vpc->vpi, 0, sizeof(*vpc->vpi));

    hw = vpc->vpi->data;

    hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT));
    hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
              (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
              (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
              (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
    if(insn.cc_update)
        hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE);

    if(!nvfx->is_nv4x) {
        if(slot == 0)
            hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
        else {
            hw[0] |= ((op >> 4) << NV30_VP_INST_SCA_OPCODEH_SHIFT);
            hw[1] |= ((op & 0xf) << NV30_VP_INST_SCA_OPCODEL_SHIFT);
        }
//      hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
//      hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));

        if (insn.dst.type == NVFXSR_OUTPUT) {
            if (slot)
                hw[3] |= (insn.mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
            else
                hw[3] |= (insn.mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
        } else {
            if (slot)
                hw[3] |= (insn.mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
            else
                hw[3] |= (insn.mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
        }
    } else {
        if (slot == 0) {
            hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
            hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
            hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
        } else {
            hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
            hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
            hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
        }
    }

    emit_dst(nvfx, vpc, hw, slot, insn.dst);
    emit_src(nvfx, vpc, hw, 0, insn.src[0]);
    emit_src(nvfx, vpc, hw, 1, insn.src[1]);
    emit_src(nvfx, vpc, hw, 2, insn.src[2]);
}

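/* Translate a TGSI source operand into an nvfx_src, resolving the file
 * (input/constant/immediate/temporary) to a hw register and carrying
 * over the swizzle, negate and absolute-value modifiers.
 */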
static inline struct nvfx_src
tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
    struct nvfx_src src;

    switch (fsrc->Register.File) {
    case TGSI_FILE_INPUT:
        src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index);
        break;
    case TGSI_FILE_CONSTANT:
        src.reg = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
        break;
    case TGSI_FILE_IMMEDIATE:
        src.reg = vpc->imm[fsrc->Register.Index];
        break;
    case TGSI_FILE_TEMPORARY:
        src.reg = vpc->r_temp[fsrc->Register.Index];
        break;
    default:
        NOUVEAU_ERR("bad src file\n");
        src.reg.index = 0;
        src.reg.type = 0;
        break;
    }

    src.abs = fsrc->Register.Absolute;
    src.negate = fsrc->Register.Negate;
    src.swz[0] = fsrc->Register.SwizzleX;
    src.swz[1] = fsrc->Register.SwizzleY;
    src.swz[2] = fsrc->Register.SwizzleZ;
    src.swz[3] = fsrc->Register.SwizzleW;
    return src;
}

static inline struct nvfx_reg
tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
    struct nvfx_reg dst;

    switch (fdst->Register.File) {
    case TGSI_FILE_NULL:
        dst = nvfx_reg(NVFXSR_NONE, 0);
        break;
    case TGSI_FILE_OUTPUT:
        dst = vpc->r_result[fdst->Register.Index];
        break;
    case TGSI_FILE_TEMPORARY:
        dst = vpc->r_temp[fdst->Register.Index];
        break;
    case TGSI_FILE_ADDRESS:
        dst = vpc->r_address[fdst->Register.Index];
        break;
    default:
        NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File);
        dst.index = 0;
        dst.type = 0;
        break;
    }

    return dst;
}

static inline int
tgsi_mask(uint tgsi)
{
    int mask = 0;

    if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_VP_MASK_X;
    if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_VP_MASK_Y;
    if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_VP_MASK_Z;
    if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_VP_MASK_W;
    return mask;
}

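/* Translate one TGSI instruction. The hardware can read at most one
 * distinct attribute and one distinct constant/immediate per instruction,
 * so additional input or constant operands are first copied into temps.
 */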
static boolean
nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
                                unsigned idx, const struct tgsi_full_instruction *finst)
{
    struct nvfx_src src[3], tmp;
    struct nvfx_reg dst;
    struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
    struct nvfx_insn insn;
    struct nvfx_relocation reloc;
    struct nvfx_loop_entry loop;
    int mask;
    int ai = -1, ci = -1, ii = -1;
    int i;

    if (finst->Instruction.Opcode == TGSI_OPCODE_END)
        return TRUE;

    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
        const struct tgsi_full_src_register *fsrc;

        fsrc = &finst->Src[i];
        if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
            src[i] = tgsi_src(vpc, fsrc);
        }
    }

    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
        const struct tgsi_full_src_register *fsrc;

        fsrc = &finst->Src[i];

        switch (fsrc->Register.File) {
        case TGSI_FILE_INPUT:
            if (ai == -1 || ai == fsrc->Register.Index) {
                ai = fsrc->Register.Index;
                src[i] = tgsi_src(vpc, fsrc);
            } else {
                src[i] = nvfx_src(temp(vpc));
                nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
            }
            break;
        case TGSI_FILE_CONSTANT:
            if ((ci == -1 && ii == -1) || ci == fsrc->Register.Index) {
                ci = fsrc->Register.Index;
                src[i] = tgsi_src(vpc, fsrc);
            } else {
                src[i] = nvfx_src(temp(vpc));
                nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
            }
            break;
        case TGSI_FILE_IMMEDIATE:
            if ((ci == -1 && ii == -1) || ii == fsrc->Register.Index) {
                ii = fsrc->Register.Index;
                src[i] = tgsi_src(vpc, fsrc);
            } else {
                src[i] = nvfx_src(temp(vpc));
                nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
            }
            break;
        case TGSI_FILE_TEMPORARY:
            /* handled above */
            break;
        default:
            NOUVEAU_ERR("bad src file\n");
            return FALSE;
        }
    }

    dst = tgsi_dst(vpc, &finst->Dst[0]);
    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);

    switch (finst->Instruction.Opcode) {
    case TGSI_OPCODE_ABS:
        nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, abs(src[0]), none, none));
        break;
    case TGSI_OPCODE_ADD:
        nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, src[1]));
        break;
    case TGSI_OPCODE_ARL:
        nvfx_vp_emit(vpc, arith(VEC, ARL, dst, mask, src[0], none, none));
        break;
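    /* CMP has no direct hw equivalent: write src0 to the condition
     * register, then do two predicated MOVs selected on its sign. */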
    case TGSI_OPCODE_CMP:
        insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
        insn.cc_update = 1;
        nvfx_vp_emit(vpc, insn);

        insn = arith(VEC, MOV, dst, mask, src[2], none, none);
        insn.cc_test = NVFX_COND_GE;
        nvfx_vp_emit(vpc, insn);

        insn = arith(VEC, MOV, dst, mask, src[1], none, none);
        insn.cc_test = NVFX_COND_LT;
        nvfx_vp_emit(vpc, insn);
        break;
    case TGSI_OPCODE_COS:
        nvfx_vp_emit(vpc, arith(SCA, COS, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_DP2:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none));
        nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
        break;
    case TGSI_OPCODE_DP3:
        nvfx_vp_emit(vpc, arith(VEC, DP3, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_DP4:
        nvfx_vp_emit(vpc, arith(VEC, DP4, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_DPH:
        nvfx_vp_emit(vpc, arith(VEC, DPH, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_DST:
        nvfx_vp_emit(vpc, arith(VEC, DST, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_EX2:
        nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_EXP:
        nvfx_vp_emit(vpc, arith(SCA, EXP, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_FLR:
        nvfx_vp_emit(vpc, arith(VEC, FLR, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_FRC:
        nvfx_vp_emit(vpc, arith(VEC, FRC, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_LG2:
        nvfx_vp_emit(vpc, arith(SCA, LG2, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_LIT:
        nvfx_vp_emit(vpc, arith(SCA, LIT, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_LOG:
        nvfx_vp_emit(vpc, arith(SCA, LOG, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_LRP:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(VEC, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
        nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], tmp));
        break;
    case TGSI_OPCODE_MAD:
        nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], src[2]));
        break;
    case TGSI_OPCODE_MAX:
        nvfx_vp_emit(vpc, arith(VEC, MAX, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_MIN:
        nvfx_vp_emit(vpc, arith(VEC, MIN, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_MOV:
        nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_MUL:
        nvfx_vp_emit(vpc, arith(VEC, MUL, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_NOP:
        break;
    case TGSI_OPCODE_POW:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(SCA, LG2, tmp.reg, NVFX_VP_MASK_X, none, none, swz(src[0], X, X, X, X)));
        nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
        nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, swz(tmp, X, X, X, X)));
        break;
    case TGSI_OPCODE_RCP:
        nvfx_vp_emit(vpc, arith(SCA, RCP, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_RSQ:
        nvfx_vp_emit(vpc, arith(SCA, RSQ, dst, mask, none, none, abs(src[0])));
        break;
    case TGSI_OPCODE_SEQ:
        nvfx_vp_emit(vpc, arith(VEC, SEQ, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SFL:
        nvfx_vp_emit(vpc, arith(VEC, SFL, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SGE:
        nvfx_vp_emit(vpc, arith(VEC, SGE, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SGT:
        nvfx_vp_emit(vpc, arith(VEC, SGT, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SIN:
        nvfx_vp_emit(vpc, arith(SCA, SIN, dst, mask, none, none, src[0]));
        break;
    case TGSI_OPCODE_SLE:
        nvfx_vp_emit(vpc, arith(VEC, SLE, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SLT:
        nvfx_vp_emit(vpc, arith(VEC, SLT, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SNE:
        nvfx_vp_emit(vpc, arith(VEC, SNE, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SSG:
        /* SSG takes a single source operand; src[1] is not set for it */
        nvfx_vp_emit(vpc, arith(VEC, SSG, dst, mask, src[0], none, none));
        break;
    case TGSI_OPCODE_STR:
        nvfx_vp_emit(vpc, arith(VEC, STR, dst, mask, src[0], src[1], none));
        break;
    case TGSI_OPCODE_SUB:
        nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, neg(src[1])));
        break;
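    /* Round toward zero: take floor(|src|), then flip the sign back
     * where the condition register recorded a negative source. */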
    case TGSI_OPCODE_TRUNC:
        tmp = nvfx_src(temp(vpc));
        insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
        insn.cc_update = 1;
        nvfx_vp_emit(vpc, insn);

        nvfx_vp_emit(vpc, arith(VEC, FLR, tmp.reg, mask, abs(src[0]), none, none));
        nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, tmp, none, none));

        insn = arith(VEC, MOV, dst, mask, neg(tmp), none, none);
        insn.cc_test = NVFX_COND_LT;
        nvfx_vp_emit(vpc, insn);
        break;
    case TGSI_OPCODE_XPD:
        tmp = nvfx_src(temp(vpc));
        nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
        nvfx_vp_emit(vpc, arith(VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
        break;

    case TGSI_OPCODE_IF:
        insn = arith(VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
        insn.cc_update = 1;
        nvfx_vp_emit(vpc, insn);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = finst->Label.Label + 1;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        insn = arith(SCA, BRA, none.reg, 0, none, none, none);
        insn.cc_test = NVFX_COND_EQ;
        insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0;
        nvfx_vp_emit(vpc, insn);
        break;

    case TGSI_OPCODE_ELSE:
    case TGSI_OPCODE_BRA:
    case TGSI_OPCODE_CAL:
        reloc.location = vpc->vp->nr_insns;
        reloc.target = finst->Label.Label;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
            insn = arith(SCA, CAL, none.reg, 0, none, none, none);
        else
            insn = arith(SCA, BRA, none.reg, 0, none, none, none);
        nvfx_vp_emit(vpc, insn);
        break;

    case TGSI_OPCODE_RET:
        tmp = none;
        tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0;
        nvfx_vp_emit(vpc, arith(SCA, RET, none.reg, 0, none, none, tmp));
        break;

    case TGSI_OPCODE_BGNSUB:
    case TGSI_OPCODE_ENDSUB:
    case TGSI_OPCODE_ENDIF:
        /* nothing to do here */
        break;

    case TGSI_OPCODE_BGNLOOP:
        loop.cont_target = idx;
        loop.brk_target = finst->Label.Label + 1;
        util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop);
        break;

    case TGSI_OPCODE_ENDLOOP:
        loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = loop.cont_target;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
        break;

    case TGSI_OPCODE_CONT:
        loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = loop.cont_target;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
        break;

    case TGSI_OPCODE_BRK:
        loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);

        reloc.location = vpc->vp->nr_insns;
        reloc.target = loop.brk_target;
        util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);

        nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
        break;

    default:
        NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
        return FALSE;
    }

    release_temps(vpc);
    return TRUE;
}

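/* Map a TGSI output declaration to a hardware result register and record
 * it in vpc->r_result, so later writes to that output index know their
 * destination.
 */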
static boolean
nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
                                const struct tgsi_full_declaration *fdec)
{
    unsigned idx = fdec->Range.First;
    int hw;

    switch (fdec->Semantic.Name) {
    case TGSI_SEMANTIC_POSITION:
        hw = NVFX_VP(INST_DEST_POS);
        vpc->hpos_idx = idx;
        break;
    case TGSI_SEMANTIC_COLOR:
        if (fdec->Semantic.Index == 0) {
            hw = NVFX_VP(INST_DEST_COL0);
        } else if (fdec->Semantic.Index == 1) {
            hw = NVFX_VP(INST_DEST_COL1);
        } else {
            NOUVEAU_ERR("bad colour semantic index\n");
            return FALSE;
        }
        break;
    case TGSI_SEMANTIC_BCOLOR:
        if (fdec->Semantic.Index == 0) {
            hw = NVFX_VP(INST_DEST_BFC0);
        } else if (fdec->Semantic.Index == 1) {
            hw = NVFX_VP(INST_DEST_BFC1);
        } else {
            NOUVEAU_ERR("bad bcolour semantic index\n");
            return FALSE;
        }
        break;
    case TGSI_SEMANTIC_FOG:
        hw = NVFX_VP(INST_DEST_FOGC);
        break;
    case TGSI_SEMANTIC_PSIZE:
        hw = NVFX_VP(INST_DEST_PSZ);
        break;
    case TGSI_SEMANTIC_GENERIC:
        hw = (vpc->vp->generic_to_fp_input[fdec->Semantic.Index] & 0xf)
             + NVFX_VP(INST_DEST_TC(0)) - NVFX_FP_OP_INPUT_SRC_TC(0);
        break;
    case TGSI_SEMANTIC_EDGEFLAG:
        /* not really an error, just a fallback */
        NOUVEAU_ERR("cannot handle edgeflag output\n");
        return FALSE;
    default:
        NOUVEAU_ERR("bad output semantic\n");
        return FALSE;
    }

    vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
    return TRUE;
}

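/* First pass over the token stream: count immediates, find the highest
 * temporary and address register used, and assign hardware slots for the
 * outputs (including the GENERIC-to-texcoord mapping shared with the
 * fragment program via util_semantic_set).
 */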
static boolean
nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
{
    struct tgsi_parse_context p;
    int high_temp = -1, high_addr = -1, nr_imm = 0, i;
    struct util_semantic_set set;
    unsigned char sem_layout[8];
    unsigned num_outputs;

    num_outputs = util_semantic_set_from_program_file(&set, vpc->vp->pipe.tokens, TGSI_FILE_OUTPUT);

    if(num_outputs > 8) {
        NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs);
        return FALSE;
    }
    util_semantic_layout_from_set(sem_layout, &set, 8, 8);

    /* Default to 0x0f; we hope a slot left at 0xf reads as (0, 0, 0, 1).
     * Otherwise, we are _probably_ not required to initialize this at all. */
    memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
    for(int i = 0; i < 8; ++i) {
        if(sem_layout[i] == 0xff)
            continue;
        //printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
        vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
    }

    vpc->vp->sprite_fp_input = -1;
    for(int i = 0; i < 8; ++i)
    {
        if(sem_layout[i] == 0xff)
        {
            vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
            break;
        }
    }

    tgsi_parse_init(&p, vpc->vp->pipe.tokens);
    while (!tgsi_parse_end_of_tokens(&p)) {
        const union tgsi_full_token *tok = &p.FullToken;

        tgsi_parse_token(&p);
        switch(tok->Token.Type) {
        case TGSI_TOKEN_TYPE_IMMEDIATE:
            nr_imm++;
            break;
        case TGSI_TOKEN_TYPE_DECLARATION:
        {
            const struct tgsi_full_declaration *fdec;

            fdec = &p.FullToken.FullDeclaration;
            switch (fdec->Declaration.File) {
            case TGSI_FILE_TEMPORARY:
                if (fdec->Range.Last > high_temp) {
                    high_temp = fdec->Range.Last;
                }
                break;
#if 0 /* this would be nice.. except gallium doesn't track it */
            case TGSI_FILE_ADDRESS:
                if (fdec->Range.Last > high_addr) {
                    high_addr = fdec->Range.Last;
                }
                break;
#endif
            case TGSI_FILE_OUTPUT:
                if (!nvfx_vertprog_parse_decl_output(nvfx, vpc, fdec))
                    return FALSE;
                break;
            default:
                break;
            }
        }
        break;
#if 1 /* yay, parse instructions looking for address regs instead */
        case TGSI_TOKEN_TYPE_INSTRUCTION:
        {
            const struct tgsi_full_instruction *finst;
            const struct tgsi_full_dst_register *fdst;

            finst = &p.FullToken.FullInstruction;
            fdst = &finst->Dst[0];

            if (fdst->Register.File == TGSI_FILE_ADDRESS) {
                if (fdst->Register.Index > high_addr)
                    high_addr = fdst->Register.Index;
            }
        }
        break;
#endif
        default:
            break;
        }
    }
    tgsi_parse_free(&p);

    if (nr_imm) {
        vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_reg));
        assert(vpc->imm);
    }

    if (++high_temp) {
        vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
        for (i = 0; i < high_temp; i++)
            vpc->r_temp[i] = temp(vpc);
    }

    if (++high_addr) {
        vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg));
        for (i = 0; i < high_addr; i++)
            vpc->r_address[i] = temp(vpc);
    }

    vpc->r_temps_discard = 0;
    return TRUE;
}

DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)

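/* Translate the whole TGSI program into hardware bytecode: reserve
 * constant slots for user clip planes if needed, translate each
 * instruction, resolve branch targets, and terminate the program.
 */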
static void
nvfx_vertprog_translate(struct nvfx_context *nvfx,
                        struct nvfx_vertex_program *vp)
{
    struct tgsi_parse_context parse;
    struct nvfx_vpc *vpc = NULL;
    struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
    struct util_dynarray insns;
    int i;

    vpc = CALLOC(1, sizeof(struct nvfx_vpc));
    if (!vpc)
        return;
    vpc->nvfx = nvfx;
    vpc->vp = vp;

    /* reserve space for ucps */
    if(nvfx->use_vp_clipping)
    {
        for(i = 0; i < 6; ++i)
            constant(vpc, -1, 0, 0, 0, 0);
    }

    if (!nvfx_vertprog_prepare(nvfx, vpc)) {
        FREE(vpc);
        return;
    }

    /* Redirect post-transform vertex position to a temp if user clip
     * planes are enabled. We need to append code to the vtxprog
     * to handle clip planes later.
     */
    /* TODO: maybe support patching this depending on whether there are ucps; not sure if it really matters much */
    if (nvfx->use_vp_clipping) {
        vpc->r_result[vpc->hpos_idx] = temp(vpc);
        vpc->r_temps_discard = 0;
    }

    tgsi_parse_init(&parse, vp->pipe.tokens);

    util_dynarray_init(&insns);
    while (!tgsi_parse_end_of_tokens(&parse)) {
        tgsi_parse_token(&parse);

        switch (parse.FullToken.Token.Type) {
        case TGSI_TOKEN_TYPE_IMMEDIATE:
        {
            const struct tgsi_full_immediate *imm;

            imm = &parse.FullToken.FullImmediate;
            assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
            assert(imm->Immediate.NrTokens == 4 + 1);
            vpc->imm[vpc->nr_imm++] =
                constant(vpc, -1,
                         imm->u[0].Float,
                         imm->u[1].Float,
                         imm->u[2].Float,
                         imm->u[3].Float);
        }
        break;
        case TGSI_TOKEN_TYPE_INSTRUCTION:
        {
            const struct tgsi_full_instruction *finst;
            unsigned idx = insns.size >> 2;
            util_dynarray_append(&insns, unsigned, vp->nr_insns);
            finst = &parse.FullToken.FullInstruction;
            if (!nvfx_vertprog_parse_instruction(nvfx, vpc, idx, finst))
                goto out_err;
        }
        break;
        default:
            break;
        }
    }

    util_dynarray_append(&insns, unsigned, vp->nr_insns);

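    /* "insns" maps each TGSI instruction index to the hw instruction it
     * starts at; use it to turn label relocations (in TGSI indices) into
     * branch relocations against hw instruction slots. */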
    for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
    {
        struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + i);
        struct nvfx_relocation hw_reloc;

        hw_reloc.location = label_reloc->location;
        hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];

        //debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target);

        util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc);
    }
    util_dynarray_fini(&insns);
    util_dynarray_trim(&vp->branch_relocs);

    /* XXX: what if we add a RET before?! make sure we jump here... */

    /* Write out HPOS if it was redirected to a temp earlier */
    if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
        struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT,
                                        NVFX_VP(INST_DEST_POS));
        struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);

        nvfx_vp_emit(vpc, arith(VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none));
    }

    /* Insert code to handle user clip planes */
    if(nvfx->use_vp_clipping)
    {
        for (i = 0; i < 6; i++) {
            struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(i));
            struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, i));
            struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
            unsigned mask;

            if(nvfx->is_nv4x)
            {
                switch (i) {
                case 0: case 3: mask = NVFX_VP_MASK_Y; break;
                case 1: case 4: mask = NVFX_VP_MASK_Z; break;
                case 2: case 5: mask = NVFX_VP_MASK_W; break;
                default:
                    NOUVEAU_ERR("invalid clip dist #%d\n", i);
                    goto out_err;
                }
            }
            else
                mask = NVFX_VP_MASK_X;

            nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
        }
    }
    else
    {
        if(vp->nr_insns)
            vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;

        nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
        vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
    }

    if(debug_get_option_nvfx_dump_vp())
    {
        debug_printf("\n");
        tgsi_dump(vp->pipe.tokens, 0);

        debug_printf("\n%s vertex program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
        for (i = 0; i < vp->nr_insns; i++)
            debug_printf("%3u: %08x %08x %08x %08x\n", i, vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
        debug_printf("\n");
    }

    vp->clip_nr = -1;
    vp->exec_start = -1;
    vp->translated = TRUE;
out_err:
    tgsi_parse_free(&parse);
    util_dynarray_fini(&vpc->label_relocs);
    util_dynarray_fini(&vpc->loop_stack);
    if (vpc->r_temp)
        FREE(vpc->r_temp);
    if (vpc->r_address)
        FREE(vpc->r_address);
    if (vpc->imm)
        FREE(vpc->imm);
    FREE(vpc);
}

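/* Validate the current vertex program for drawing: translate it if
 * necessary, (re)allocate exec and const heap space, patch branch and
 * constant relocations if either segment moved, and upload changed
 * constants and code to the hardware.
 */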
boolean
nvfx_vertprog_validate(struct nvfx_context *nvfx)
{
    struct nvfx_screen *screen = nvfx->screen;
    struct nouveau_channel *chan = screen->base.channel;
    struct nouveau_grobj *eng3d = screen->eng3d;
    struct nvfx_vertex_program *vp;
    struct pipe_resource *constbuf;
    boolean upload_code = FALSE, upload_data = FALSE;
    int i;

    if (nvfx->render_mode == HW) {
        vp = nvfx->vertprog;
        constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
    } else {
        vp = nvfx->swtnl.vertprog;
        constbuf = NULL;
    }

    /* Translate TGSI shader into hw bytecode */
    if (!vp->translated)
    {
        nvfx->fallback_swtnl &= ~NVFX_NEW_VERTPROG;
        nvfx_vertprog_translate(nvfx, vp);
        if (!vp->translated) {
            nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
            return FALSE;
        }
    }

    /* Allocate hw vtxprog exec slots */
    if (!vp->exec) {
        struct nouveau_resource *heap = nvfx->screen->vp_exec_heap;
        uint vplen = vp->nr_insns;

        if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
            while (heap->next && heap->size < vplen) {
                struct nvfx_vertex_program *evict;

                evict = heap->next->priv;
                nouveau_resource_free(&evict->exec);
            }

            if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
            {
                debug_printf("Vertex shader too long: %u instructions\n", vplen);
                nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
                return FALSE;
            }
        }

        upload_code = TRUE;
    }

    /* Allocate hw vtxprog const slots */
    if (vp->nr_consts && !vp->data) {
        struct nouveau_resource *heap = nvfx->screen->vp_data_heap;

        if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
            while (heap->next && heap->size < vp->nr_consts) {
                struct nvfx_vertex_program *evict;

                evict = heap->next->priv;
                nouveau_resource_free(&evict->data);
            }

            if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
            {
                debug_printf("Vertex shader uses too many constants: %u constants\n", vp->nr_consts);
                nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
                return FALSE;
            }
        }

        /*XXX: handle this some day */
        assert(vp->data->start >= vp->data_start_min);

        upload_data = TRUE;
        if (vp->data_start != vp->data->start)
            upload_code = TRUE;
    }

    /* If exec or data segments moved we need to patch the program to
     * fixup offsets and register IDs.
     */
    if (vp->exec_start != vp->exec->start) {
        //printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start);
        for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_relocation))
        {
            struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->branch_relocs.data + i);
            uint32_t* hw = vp->insns[reloc->location].data;
            unsigned target = vp->exec->start + reloc->target;

            //debug_printf("vp_reloc hw %u -> hw %u\n", reloc->location, target);

            if(!nvfx->is_nv4x)
            {
                hw[2] &= ~NV30_VP_INST_IADDR_MASK;
                hw[2] |= (target & 0x1ff) << NV30_VP_INST_IADDR_SHIFT;
            }
            else
            {
                hw[3] &= ~NV40_VP_INST_IADDRL_MASK;
                hw[3] |= (target & 7) << NV40_VP_INST_IADDRL_SHIFT;

                hw[2] &= ~NV40_VP_INST_IADDRH_MASK;
                hw[2] |= ((target >> 3) & 0x3f) << NV40_VP_INST_IADDRH_SHIFT;
            }
        }

        vp->exec_start = vp->exec->start;
    }

    if (vp->nr_consts && vp->data_start != vp->data->start) {
        for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation))
        {
            struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
            struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location];

            vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
            vpi->data[1] |=
                (reloc->target + vp->data->start) <<
                NVFX_VP(INST_CONST_SRC_SHIFT);
        }

        vp->data_start = vp->data->start;
        upload_code = TRUE;
    }

    /* Update + Upload constant values */
    if (vp->nr_consts) {
        float *map = NULL;

        if (constbuf)
            map = (float*)nvfx_buffer(constbuf)->data;

        for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
            struct nvfx_vertex_program_data *vpd = &vp->consts[i];

            if (vpd->index >= 0) {
                if (!upload_data &&
                    !memcmp(vpd->value, &map[vpd->index * 4],
                            4 * sizeof(float)))
                    continue;
                memcpy(vpd->value, &map[vpd->index * 4],
                       4 * sizeof(float));
            }

            BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
            OUT_RING (chan, i + vp->data->start);
            OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
        }
    }

    /* Upload vtxprog */
    if (upload_code) {
        BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
        OUT_RING (chan, vp->exec->start);
        for (i = 0; i < vp->nr_insns; i++) {
            BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
            OUT_RINGp (chan, vp->insns[i].data, 4);
        }
        vp->clip_nr = -1;
    }

    if(nvfx->dirty & (NVFX_NEW_VERTPROG))
    {
        WAIT_RING(chan, 6);
        OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
        OUT_RING(chan, vp->exec->start);
        if(nvfx->is_nv4x) {
            OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
            OUT_RING(chan, vp->ir);
        }
    }

    return TRUE;
}

void
nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
{
    if (vp->nr_insns)
        FREE(vp->insns);

    if (vp->nr_consts)
        FREE(vp->consts);

    nouveau_resource_free(&vp->exec);
    nouveau_resource_free(&vp->data);

    util_dynarray_fini(&vp->branch_relocs);
    util_dynarray_fini(&vp->const_relocs);
}

static void *
nvfx_vp_state_create(struct pipe_context *pipe,
                     const struct pipe_shader_state *cso)
{
    struct nvfx_context *nvfx = nvfx_context(pipe);
    struct nvfx_vertex_program *vp;

    // TODO: use a 64-bit atomic here!
    static unsigned long long id = 0;

    vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
    vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
    vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
    vp->id = ++id;

    return (void *)vp;
}

static void
nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
{
    struct nvfx_context *nvfx = nvfx_context(pipe);

    nvfx->vertprog = hwcso;
    nvfx->dirty |= NVFX_NEW_VERTPROG;
    nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
}

static void
nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
{
    struct nvfx_context *nvfx = nvfx_context(pipe);
    struct nvfx_vertex_program *vp = hwcso;

    draw_delete_vertex_shader(nvfx->draw, vp->draw);
    nvfx_vertprog_destroy(nvfx, vp);
    FREE((void*)vp->pipe.tokens);
    FREE(vp);
}

void
nvfx_init_vertprog_functions(struct nvfx_context *nvfx)
{
    nvfx->pipe.create_vs_state = nvfx_vp_state_create;
    nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
    nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
}