/* mesa.git: src/gallium/drivers/nv40/nv40_vertprog.c */

#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"

#include "nv40_context.h"
#include "nv40_state.h"

/* TODO (at least...):
 *   1. Indexed consts + ARL
 *   3. NV_vp11, NV_vp2, NV_vp3 features
 *      - extra arith opcodes
 *      - branching
 *      - texture sampling
 *      - indexed attribs
 *      - indexed results
 *   4. bugs
 */

#define SWZ_X 0
#define SWZ_Y 1
#define SWZ_Z 2
#define SWZ_W 3
#define MASK_X 8
#define MASK_Y 4
#define MASK_Z 2
#define MASK_W 1
#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
#define DEF_SCALE 0
#define DEF_CTEST 0
#include "nv40_shader.h"

#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
#define neg(s) nv40_sr_neg((s))
#define abs(s) nv40_sr_abs((s))

#define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))

struct nv40_vpc {
	struct nv40_vertex_program *vp;

	struct nv40_vertex_program_exec *vpi;

	unsigned r_temps;
	unsigned r_temps_discard;
	struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
	struct nv40_sreg *r_address;
	struct nv40_sreg *r_temp;

	struct nv40_sreg *imm;
	unsigned nr_imm;

	unsigned hpos_idx;
};

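/* Grab a free hardware temporary from the r_temps bitmask.  Temps taken
 * here are also marked in r_temps_discard, so release_temps() can hand
 * them back after each TGSI instruction.
 */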
static struct nv40_sreg
temp(struct nv40_vpc *vpc)
{
	int idx = ffs(~vpc->r_temps) - 1;

	if (idx < 0) {
		NOUVEAU_ERR("out of temps!!\n");
		assert(0);
		return nv40_sr(NV40SR_TEMP, 0);
	}

	vpc->r_temps |= (1 << idx);
	vpc->r_temps_discard |= (1 << idx);
	return nv40_sr(NV40SR_TEMP, idx);
}

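/* Return the per-instruction scratch temps to the pool.  Registers backing
 * declared TGSI temporaries/address regs survive, since r_temps_discard is
 * cleared after they are allocated in nv40_vertprog_prepare().
 */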
static INLINE void
release_temps(struct nv40_vpc *vpc)
{
	vpc->r_temps &= ~vpc->r_temps_discard;
	vpc->r_temps_discard = 0;
}

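/* Find or create a slot in the program's constant array.  pipe >= 0 refers
 * to an entry in the bound constant buffer (reused if already present);
 * pipe == -1 creates an anonymous constant holding the given x/y/z/w.
 */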
static struct nv40_sreg
constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w)
{
	struct nv40_vertex_program *vp = vpc->vp;
	struct nv40_vertex_program_data *vpd;
	int idx;

	if (pipe >= 0) {
		for (idx = 0; idx < vp->nr_consts; idx++) {
			if (vp->consts[idx].index == pipe)
				return nv40_sr(NV40SR_CONST, idx);
		}
	}

	idx = vp->nr_consts++;
	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
	vpd = &vp->consts[idx];

	vpd->index = pipe;
	vpd->value[0] = x;
	vpd->value[1] = y;
	vpd->value[2] = z;
	vpd->value[3] = w;
	return nv40_sr(NV40SR_CONST, idx);
}

#define arith(cc,s,o,d,m,s0,s1,s2) \
	nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2))

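/* Encode source operand "pos" (0..2) into the 128-bit instruction in
 * hw[0..3].  Also tracks which input attributes the program reads (vp->ir)
 * and which constant slot the instruction references.
 */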
static void
emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src)
{
	struct nv40_vertex_program *vp = vpc->vp;
	uint32_t sr = 0;

	switch (src.type) {
	case NV40SR_TEMP:
		sr |= (NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT);
		sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT);
		break;
	case NV40SR_INPUT:
		sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
		       NV40_VP_SRC_REG_TYPE_SHIFT);
		vp->ir |= (1 << src.index);
		hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT);
		break;
	case NV40SR_CONST:
		sr |= (NV40_VP_SRC_REG_TYPE_CONST <<
		       NV40_VP_SRC_REG_TYPE_SHIFT);
		assert(vpc->vpi->const_index == -1 ||
		       vpc->vpi->const_index == src.index);
		vpc->vpi->const_index = src.index;
		break;
	case NV40SR_NONE:
		sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
		       NV40_VP_SRC_REG_TYPE_SHIFT);
		break;
	default:
		assert(0);
	}

	if (src.negate)
		sr |= NV40_VP_SRC_NEGATE;

	if (src.abs)
		hw[0] |= (1 << (21 + pos));

	sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) |
	       (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) |
	       (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) |
	       (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT));

	switch (pos) {
	case 0:
		hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >>
			  NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT;
		hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) <<
			  NV40_VP_INST_SRC0L_SHIFT;
		break;
	case 1:
		hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT;
		break;
	case 2:
		hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >>
			  NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT;
		hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) <<
			  NV40_VP_INST_SRC2L_SHIFT;
		break;
	default:
		assert(0);
	}
}

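/* Encode the instruction's destination.  Temps land in the vector or
 * scalar dest field depending on "slot"; hardware outputs also set the
 * corresponding result bit in vp->or, and clip distance outputs are
 * redirected into FOGC/PSZ components with the matching hardware clip
 * plane enabled in vp->clip_ctrl.
 */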
static void
emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst)
{
	struct nv40_vertex_program *vp = vpc->vp;

	switch (dst.type) {
	case NV40SR_TEMP:
		hw[3] |= NV40_VP_INST_DEST_MASK;
		if (slot == 0) {
			hw[0] |= (dst.index <<
				  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
		} else {
			hw[3] |= (dst.index <<
				  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
		}
		break;
	case NV40SR_OUTPUT:
		switch (dst.index) {
		case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
		case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
		case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
		case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
		case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
		case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
		case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
		case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
		case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
		case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
		case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
		case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
		case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
		case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
		case NV40_VP_INST_DEST_CLIP(0):
			vp->or |= (1 << 6);
			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0;
			dst.index = NV40_VP_INST_DEST_FOGC;
			break;
		case NV40_VP_INST_DEST_CLIP(1):
			vp->or |= (1 << 7);
			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1;
			dst.index = NV40_VP_INST_DEST_FOGC;
			break;
		case NV40_VP_INST_DEST_CLIP(2):
			vp->or |= (1 << 8);
			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2;
			dst.index = NV40_VP_INST_DEST_FOGC;
			break;
		case NV40_VP_INST_DEST_CLIP(3):
			vp->or |= (1 << 9);
			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3;
			dst.index = NV40_VP_INST_DEST_PSZ;
			break;
		case NV40_VP_INST_DEST_CLIP(4):
			vp->or |= (1 << 10);
			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4;
			dst.index = NV40_VP_INST_DEST_PSZ;
			break;
		case NV40_VP_INST_DEST_CLIP(5):
			vp->or |= (1 << 11);
			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE5;
			dst.index = NV40_VP_INST_DEST_PSZ;
			break;
		default:
			break;
		}

		hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
		if (slot == 0) {
			hw[0] |= NV40_VP_INST_VEC_RESULT;
			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
		} else {
			hw[3] |= NV40_VP_INST_SCA_RESULT;
			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
		}
		break;
	default:
		assert(0);
	}
}

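/* Append one instruction to the program.  "slot" selects the vector (0) or
 * scalar (1) unit and "op" is that unit's opcode; the unused unit gets its
 * dest-temp field filled with ones and its writemask left clear.  The
 * condition test is hardwired to TR (always true).
 */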
static void
nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op,
	      struct nv40_sreg dst, int mask,
	      struct nv40_sreg s0, struct nv40_sreg s1,
	      struct nv40_sreg s2)
{
	struct nv40_vertex_program *vp = vpc->vp;
	uint32_t *hw;

	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
	vpc->vpi = &vp->insns[vp->nr_insns - 1];
	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
	vpc->vpi->const_index = -1;

	hw = vpc->vpi->data;

	hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT);
	hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) |
		  (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) |
		  (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) |
		  (3 << NV40_VP_INST_COND_SWZ_W_SHIFT));

	if (slot == 0) {
		hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
		hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
		hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
	} else {
		hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
		hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
		hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
	}

	emit_dst(vpc, hw, slot, dst);
	emit_src(vpc, hw, 0, s0);
	emit_src(vpc, hw, 1, s1);
	emit_src(vpc, hw, 2, s2);
}

static INLINE struct nv40_sreg
tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
	struct nv40_sreg src;

	switch (fsrc->SrcRegister.File) {
	case TGSI_FILE_INPUT:
		src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index);
		break;
	case TGSI_FILE_CONSTANT:
		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
		break;
	case TGSI_FILE_IMMEDIATE:
		src = vpc->imm[fsrc->SrcRegister.Index];
		break;
	case TGSI_FILE_TEMPORARY:
		src = vpc->r_temp[fsrc->SrcRegister.Index];
		break;
	default:
		NOUVEAU_ERR("bad src file\n");
		break;
	}

	src.abs = fsrc->SrcRegisterExtMod.Absolute;
	src.negate = fsrc->SrcRegister.Negate;
	src.swz[0] = fsrc->SrcRegister.SwizzleX;
	src.swz[1] = fsrc->SrcRegister.SwizzleY;
	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
	src.swz[3] = fsrc->SrcRegister.SwizzleW;
	return src;
}

static INLINE struct nv40_sreg
tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
	struct nv40_sreg dst;

	switch (fdst->DstRegister.File) {
	case TGSI_FILE_OUTPUT:
		dst = vpc->r_result[fdst->DstRegister.Index];
		break;
	case TGSI_FILE_TEMPORARY:
		dst = vpc->r_temp[fdst->DstRegister.Index];
		break;
	case TGSI_FILE_ADDRESS:
		dst = vpc->r_address[fdst->DstRegister.Index];
		break;
	default:
		NOUVEAU_ERR("bad dst file\n");
		break;
	}

	return dst;
}

static INLINE int
tgsi_mask(uint tgsi)
{
	int mask = 0;

	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
	return mask;
}

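/* Check whether an extended TGSI swizzle (which may select 0/1 constants
 * or negate individual components) can be done natively by the hardware.
 * Returns TRUE if so; otherwise builds the value in a fresh temp using
 * MOV/SFL/STR/MUL, points *src at it, and returns FALSE.
 */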
static boolean
src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc,
	       struct nv40_sreg *src)
{
	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
	struct nv40_sreg tgsi = tgsi_src(vpc, fsrc);
	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
			fsrc->SrcRegisterExtSwz.NegateY,
			fsrc->SrcRegisterExtSwz.NegateZ,
			fsrc->SrcRegisterExtSwz.NegateW };
	uint c;

	for (c = 0; c < 4; c++) {
		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
		case TGSI_EXTSWIZZLE_X:
		case TGSI_EXTSWIZZLE_Y:
		case TGSI_EXTSWIZZLE_Z:
		case TGSI_EXTSWIZZLE_W:
			mask |= tgsi_mask(1 << c);
			break;
		case TGSI_EXTSWIZZLE_ZERO:
			zero_mask |= tgsi_mask(1 << c);
			tgsi.swz[c] = SWZ_X;
			break;
		case TGSI_EXTSWIZZLE_ONE:
			one_mask |= tgsi_mask(1 << c);
			tgsi.swz[c] = SWZ_X;
			break;
		default:
			assert(0);
		}

		if (!tgsi.negate && neg[c])
			neg_mask |= tgsi_mask(1 << c);
	}

	if (mask == MASK_ALL && !neg_mask)
		return TRUE;

	*src = temp(vpc);

	if (mask)
		arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none);

	if (zero_mask)
		arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none);

	if (one_mask)
		arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none);

	if (neg_mask) {
		struct nv40_sreg one = temp(vpc);
		arith(vpc, 0, OP_STR, one, neg_mask, one, none, none);
		arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none);
	}

	return FALSE;
}

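/* Translate one TGSI instruction.  A single hardware instruction can only
 * reference one input attribute and one constant/immediate slot, so
 * conflicting sources are copied into temps first; most opcodes then map
 * directly onto a vector or scalar op, with POW and XPD expanded into
 * short sequences.
 */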
static boolean
nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
				const struct tgsi_full_instruction *finst)
{
	struct nv40_sreg src[3], dst, tmp;
	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
	int mask;
	int ai = -1, ci = -1, ii = -1;
	int i;

	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
		return TRUE;

	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fsrc;

		fsrc = &finst->FullSrcRegisters[i];
		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
			src[i] = tgsi_src(vpc, fsrc);
		}
	}

	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fsrc;

		fsrc = &finst->FullSrcRegisters[i];

		switch (fsrc->SrcRegister.File) {
		case TGSI_FILE_INPUT:
		case TGSI_FILE_CONSTANT:
		case TGSI_FILE_TEMPORARY:
			if (!src_native_swz(vpc, fsrc, &src[i]))
				continue;
			break;
		default:
			break;
		}

		switch (fsrc->SrcRegister.File) {
		case TGSI_FILE_INPUT:
			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
				ai = fsrc->SrcRegister.Index;
				src[i] = tgsi_src(vpc, fsrc);
			} else {
				src[i] = temp(vpc);
				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
				      tgsi_src(vpc, fsrc), none, none);
			}
			break;
		case TGSI_FILE_CONSTANT:
			if ((ci == -1 && ii == -1) ||
			    ci == fsrc->SrcRegister.Index) {
				ci = fsrc->SrcRegister.Index;
				src[i] = tgsi_src(vpc, fsrc);
			} else {
				src[i] = temp(vpc);
				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
				      tgsi_src(vpc, fsrc), none, none);
			}
			break;
		case TGSI_FILE_IMMEDIATE:
			if ((ci == -1 && ii == -1) ||
			    ii == fsrc->SrcRegister.Index) {
				ii = fsrc->SrcRegister.Index;
				src[i] = tgsi_src(vpc, fsrc);
			} else {
				src[i] = temp(vpc);
				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
				      tgsi_src(vpc, fsrc), none, none);
			}
			break;
		case TGSI_FILE_TEMPORARY:
			/* handled above */
			break;
		default:
			NOUVEAU_ERR("bad src file\n");
			return FALSE;
		}
	}

	dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);

	switch (finst->Instruction.Opcode) {
	case TGSI_OPCODE_ABS:
		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
		break;
	case TGSI_OPCODE_ADD:
		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
		break;
	case TGSI_OPCODE_ARL:
		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
		break;
	case TGSI_OPCODE_DP3:
		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_DP4:
		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_DPH:
		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_DST:
		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_EX2:
		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
		break;
	case TGSI_OPCODE_EXP:
		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
		break;
	case TGSI_OPCODE_FLR:
		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
		break;
	case TGSI_OPCODE_FRC:
		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
		break;
	case TGSI_OPCODE_LG2:
		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
		break;
	case TGSI_OPCODE_LIT:
		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
		break;
	case TGSI_OPCODE_LOG:
		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
		break;
	case TGSI_OPCODE_MAD:
		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
		break;
	case TGSI_OPCODE_MAX:
		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_MIN:
		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_MOV:
		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
		break;
	case TGSI_OPCODE_MUL:
		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_POW:
		tmp = temp(vpc);
		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
		      swz(src[0], X, X, X, X));
		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
		      swz(src[1], X, X, X, X), none);
		arith(vpc, 1, OP_EX2, dst, mask, none, none,
		      swz(tmp, X, X, X, X));
		break;
	case TGSI_OPCODE_RCP:
		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
		break;
	case TGSI_OPCODE_RET:
		break;
	case TGSI_OPCODE_RSQ:
		arith(vpc, 1, OP_RSQ, dst, mask, none, none, abs(src[0]));
		break;
	case TGSI_OPCODE_SGE:
		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_SLT:
		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
		break;
	case TGSI_OPCODE_SUB:
		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
		break;
	case TGSI_OPCODE_XPD:
		tmp = temp(vpc);
		arith(vpc, 0, OP_MUL, tmp, mask,
		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
		      neg(tmp));
		break;
	default:
		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
		return FALSE;
	}

	release_temps(vpc);
	return TRUE;
}

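/* Map a TGSI output declaration onto a hardware result register, and
 * remember which output is HPOS so the clip-plane code can find it later.
 */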
static boolean
nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
				const struct tgsi_full_declaration *fdec)
{
	unsigned idx = fdec->DeclarationRange.First;
	int hw;

	switch (fdec->Semantic.SemanticName) {
	case TGSI_SEMANTIC_POSITION:
		hw = NV40_VP_INST_DEST_POS;
		vpc->hpos_idx = idx;
		break;
	case TGSI_SEMANTIC_COLOR:
		if (fdec->Semantic.SemanticIndex == 0) {
			hw = NV40_VP_INST_DEST_COL0;
		} else
		if (fdec->Semantic.SemanticIndex == 1) {
			hw = NV40_VP_INST_DEST_COL1;
		} else {
			NOUVEAU_ERR("bad colour semantic index\n");
			return FALSE;
		}
		break;
	case TGSI_SEMANTIC_BCOLOR:
		if (fdec->Semantic.SemanticIndex == 0) {
			hw = NV40_VP_INST_DEST_BFC0;
		} else
		if (fdec->Semantic.SemanticIndex == 1) {
			hw = NV40_VP_INST_DEST_BFC1;
		} else {
			NOUVEAU_ERR("bad bcolour semantic index\n");
			return FALSE;
		}
		break;
	case TGSI_SEMANTIC_FOG:
		hw = NV40_VP_INST_DEST_FOGC;
		break;
	case TGSI_SEMANTIC_PSIZE:
		hw = NV40_VP_INST_DEST_PSZ;
		break;
	case TGSI_SEMANTIC_GENERIC:
		if (fdec->Semantic.SemanticIndex <= 7) {
			hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
		} else {
			NOUVEAU_ERR("bad generic semantic index\n");
			return FALSE;
		}
		break;
	default:
		NOUVEAU_ERR("bad output semantic\n");
		return FALSE;
	}

	vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw);
	return TRUE;
}

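/* First pass over the token stream: count immediates, find the highest
 * declared temporary, spot address registers by looking at instruction
 * destinations (gallium doesn't declare them), record output mappings and
 * pre-allocate hardware temps for TGSI temporaries and address regs.
 */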
static boolean
nv40_vertprog_prepare(struct nv40_vpc *vpc)
{
	struct tgsi_parse_context p;
	int high_temp = -1, high_addr = -1, nr_imm = 0, i;

	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&p)) {
		const union tgsi_full_token *tok = &p.FullToken;

		tgsi_parse_token(&p);
		switch(tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
			nr_imm++;
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *fdec;

			fdec = &p.FullToken.FullDeclaration;
			switch (fdec->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				if (fdec->DeclarationRange.Last > high_temp) {
					high_temp =
						fdec->DeclarationRange.Last;
				}
				break;
#if 0 /* this would be nice.. except gallium doesn't track it */
			case TGSI_FILE_ADDRESS:
				if (fdec->DeclarationRange.Last > high_addr) {
					high_addr =
						fdec->DeclarationRange.Last;
				}
				break;
#endif
			case TGSI_FILE_OUTPUT:
				if (!nv40_vertprog_parse_decl_output(vpc, fdec))
					return FALSE;
				break;
			default:
				break;
			}
		}
			break;
#if 1 /* yay, parse instructions looking for address regs instead */
		case TGSI_TOKEN_TYPE_INSTRUCTION:
		{
			const struct tgsi_full_instruction *finst;
			const struct tgsi_full_dst_register *fdst;

			finst = &p.FullToken.FullInstruction;
			fdst = &finst->FullDstRegisters[0];

			if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) {
				if (fdst->DstRegister.Index > high_addr)
					high_addr = fdst->DstRegister.Index;
			}

		}
			break;
#endif
		default:
			break;
		}
	}
	tgsi_parse_free(&p);

	if (nr_imm) {
		vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg));
		assert(vpc->imm);
	}

	if (++high_temp) {
		vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
		for (i = 0; i < high_temp; i++)
			vpc->r_temp[i] = temp(vpc);
	}

	if (++high_addr) {
		vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg));
		for (i = 0; i < high_addr; i++)
			vpc->r_address[i] = temp(vpc);
	}

	vpc->r_temps_discard = 0;
	return TRUE;
}

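/* Translate the whole TGSI program into NV40 vertex program instructions,
 * appending the DP4s that compute user clip plane distances at the end.
 */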
static void
nv40_vertprog_translate(struct nv40_context *nv40,
			struct nv40_vertex_program *vp)
{
	struct tgsi_parse_context parse;
	struct nv40_vpc *vpc = NULL;
	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
	int i;

	vpc = CALLOC(1, sizeof(struct nv40_vpc));
	if (!vpc)
		return;
	vpc->vp = vp;

	if (!nv40_vertprog_prepare(vpc)) {
		FREE(vpc);
		return;
	}

	/* Redirect post-transform vertex position to a temp if user clip
	 * planes are enabled.  We need to append code to the vtxprog
	 * to handle clip planes later.
	 */
	if (vp->ucp.nr) {
		vpc->r_result[vpc->hpos_idx] = temp(vpc);
		vpc->r_temps_discard = 0;
	}

	tgsi_parse_init(&parse, vp->pipe.tokens);

	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		switch (parse.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm;

			imm = &parse.FullToken.FullImmediate;
			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
			assert(imm->Immediate.NrTokens == 4 + 1);
			vpc->imm[vpc->nr_imm++] =
				constant(vpc, -1,
					 imm->u.ImmediateFloat32[0].Float,
					 imm->u.ImmediateFloat32[1].Float,
					 imm->u.ImmediateFloat32[2].Float,
					 imm->u.ImmediateFloat32[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
		{
			const struct tgsi_full_instruction *finst;
			finst = &parse.FullToken.FullInstruction;
			if (!nv40_vertprog_parse_instruction(vpc, finst))
				goto out_err;
		}
			break;
		default:
			break;
		}
	}

	/* Write out HPOS if it was redirected to a temp earlier */
	if (vpc->r_result[vpc->hpos_idx].type != NV40SR_OUTPUT) {
		struct nv40_sreg hpos = nv40_sr(NV40SR_OUTPUT,
						NV40_VP_INST_DEST_POS);
		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx];

		arith(vpc, 0, OP_MOV, hpos, MASK_ALL, htmp, none, none);
	}

	/* Insert code to handle user clip planes */
	for (i = 0; i < vp->ucp.nr; i++) {
		struct nv40_sreg cdst = nv40_sr(NV40SR_OUTPUT,
						NV40_VP_INST_DEST_CLIP(i));
		struct nv40_sreg ceqn = constant(vpc, -1,
						 nv40->clip.ucp[i][0],
						 nv40->clip.ucp[i][1],
						 nv40->clip.ucp[i][2],
						 nv40->clip.ucp[i][3]);
		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx];
		unsigned mask;

		switch (i) {
		case 0: case 3: mask = MASK_Y; break;
		case 1: case 4: mask = MASK_Z; break;
		case 2: case 5: mask = MASK_W; break;
		default:
			NOUVEAU_ERR("invalid clip dist #%d\n", i);
			goto out_err;
		}

		arith(vpc, 0, OP_DP4, cdst, mask, htmp, ceqn, none);
	}

	vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST;
	vp->translated = TRUE;
out_err:
	tgsi_parse_free(&parse);
	if (vpc->r_temp)
		FREE(vpc->r_temp);
	if (vpc->r_address)
		FREE(vpc->r_address);
	if (vpc->imm)
		FREE(vpc->imm);
	FREE(vpc);
}

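/* Bring the current vertex program up to date: (re)translate if needed,
 * allocate exec/const slots on the hardware heaps (evicting other programs
 * when full), patch constant register indices if the data segment moved,
 * and upload constants and code as required.  Returns TRUE if the hardware
 * state object changed.
 */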
static boolean
nv40_vertprog_validate(struct nv40_context *nv40)
{
	struct nouveau_winsys *nvws = nv40->nvws;
	struct pipe_winsys *ws = nv40->pipe.winsys;
	struct nouveau_grobj *curie = nv40->screen->curie;
	struct nv40_vertex_program *vp;
	struct pipe_buffer *constbuf;
	boolean upload_code = FALSE, upload_data = FALSE;
	int i;

	if (nv40->render_mode == HW) {
		vp = nv40->vertprog;
		constbuf = nv40->constbuf[PIPE_SHADER_VERTEX];

		if ((nv40->dirty & NV40_NEW_UCP) ||
		    memcmp(&nv40->clip, &vp->ucp, sizeof(vp->ucp))) {
			nv40_vertprog_destroy(nv40, vp);
			memcpy(&vp->ucp, &nv40->clip, sizeof(vp->ucp));
		}
	} else {
		vp = nv40->swtnl.vertprog;
		constbuf = NULL;
	}

	/* Translate TGSI shader into hw bytecode */
	if (vp->translated)
		goto check_gpu_resources;

	nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG;
	nv40_vertprog_translate(nv40, vp);
	if (!vp->translated) {
		nv40->fallback_swtnl |= NV40_NEW_VERTPROG;
		return FALSE;
	}

check_gpu_resources:
	/* Allocate hw vtxprog exec slots */
	if (!vp->exec) {
		struct nouveau_resource *heap = nv40->screen->vp_exec_heap;
		struct nouveau_stateobj *so;
		uint vplen = vp->nr_insns;

		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
			while (heap->next && heap->size < vplen) {
				struct nv40_vertex_program *evict;

				evict = heap->next->priv;
				nvws->res_free(&evict->exec);
			}

			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
				assert(0);
		}

		so = so_new(7, 0);
		so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
		so_data (so, vp->exec->start);
		so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
		so_data (so, vp->ir);
		so_data (so, vp->or);
		so_method(so, curie, NV40TCL_CLIP_PLANE_ENABLE, 1);
		so_data (so, vp->clip_ctrl);
		so_ref(so, &vp->so);

		upload_code = TRUE;
	}

	/* Allocate hw vtxprog const slots */
	if (vp->nr_consts && !vp->data) {
		struct nouveau_resource *heap = nv40->screen->vp_data_heap;

		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
			while (heap->next && heap->size < vp->nr_consts) {
				struct nv40_vertex_program *evict;

				evict = heap->next->priv;
				nvws->res_free(&evict->data);
			}

			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
				assert(0);
		}

		/*XXX: handle this some day */
		assert(vp->data->start >= vp->data_start_min);

		upload_data = TRUE;
		if (vp->data_start != vp->data->start)
			upload_code = TRUE;
	}

	/* If exec or data segments moved we need to patch the program to
	 * fixup offsets and register IDs.
	 */
	if (vp->exec_start != vp->exec->start) {
		for (i = 0; i < vp->nr_insns; i++) {
			struct nv40_vertex_program_exec *vpi = &vp->insns[i];

			if (vpi->has_branch_offset) {
				assert(0);
			}
		}

		vp->exec_start = vp->exec->start;
	}

	if (vp->nr_consts && vp->data_start != vp->data->start) {
		for (i = 0; i < vp->nr_insns; i++) {
			struct nv40_vertex_program_exec *vpi = &vp->insns[i];

			if (vpi->const_index >= 0) {
				vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK;
				vpi->data[1] |=
					(vpi->const_index + vp->data->start) <<
					NV40_VP_INST_CONST_SRC_SHIFT;

			}
		}

		vp->data_start = vp->data->start;
	}

	/* Update + Upload constant values */
	if (vp->nr_consts) {
		float *map = NULL;

		if (constbuf) {
			map = ws->buffer_map(ws, constbuf,
					     PIPE_BUFFER_USAGE_CPU_READ);
		}

		for (i = 0; i < vp->nr_consts; i++) {
			struct nv40_vertex_program_data *vpd = &vp->consts[i];

			if (vpd->index >= 0) {
				if (!upload_data &&
				    !memcmp(vpd->value, &map[vpd->index * 4],
					    4 * sizeof(float)))
					continue;
				memcpy(vpd->value, &map[vpd->index * 4],
				       4 * sizeof(float));
			}

			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
			OUT_RING (i + vp->data->start);
			OUT_RINGp ((uint32_t *)vpd->value, 4);
		}

		if (constbuf)
			ws->buffer_unmap(ws, constbuf);
	}

	/* Upload vtxprog */
	if (upload_code) {
#if 0
		for (i = 0; i < vp->nr_insns; i++) {
			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
		}
#endif
		BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
		OUT_RING (vp->exec->start);
		for (i = 0; i < vp->nr_insns; i++) {
			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
			OUT_RINGp (vp->insns[i].data, 4);
		}
	}

	if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) {
		so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]);
		return TRUE;
	}

	return FALSE;
}

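/* Drop the translated program: free instructions and constants, release
 * the hardware exec/data allocations and the cached state object, so the
 * next validate retranslates from the TGSI tokens.
 */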
void
nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp)
{
	struct nouveau_winsys *nvws = nv40->screen->nvws;

	vp->translated = FALSE;

	if (vp->nr_insns) {
		FREE(vp->insns);
		vp->insns = NULL;
		vp->nr_insns = 0;
	}

	if (vp->nr_consts) {
		FREE(vp->consts);
		vp->consts = NULL;
		vp->nr_consts = 0;
	}

	nvws->res_free(&vp->exec);
	vp->exec_start = 0;
	nvws->res_free(&vp->data);
	vp->data_start = 0;
	vp->data_start_min = 0;

	vp->ir = vp->or = vp->clip_ctrl = 0;
	so_ref(NULL, &vp->so);
}

struct nv40_state_entry nv40_state_vertprog = {
	.validate = nv40_vertprog_validate,
	.dirty = {
		.pipe = NV40_NEW_VERTPROG | NV40_NEW_UCP,
		.hw = NV40_STATE_VERTPROG,
	}
};