nv40: fix slight thinko
mesa.git: src/gallium/drivers/nv40/nv40_vertprog.c
1 #include "pipe/p_context.h"
2 #include "pipe/p_defines.h"
3 #include "pipe/p_state.h"
4
5 #include "pipe/p_shader_tokens.h"
6 #include "tgsi/util/tgsi_parse.h"
7 #include "tgsi/util/tgsi_util.h"
8
9 #include "nv40_context.h"
10 #include "nv40_state.h"
11
12 /* TODO (at least...):
13 * 1. Indexed consts + ARL
14 * 3. NV_vp11, NV_vp2, NV_vp3 features
15 * - extra arith opcodes
16 * - branching
17 * - texture sampling
18 * - indexed attribs
19 * - indexed results
20 * 4. bugs
21 */
22
23 #define SWZ_X 0
24 #define SWZ_Y 1
25 #define SWZ_Z 2
26 #define SWZ_W 3
27 #define MASK_X 8
28 #define MASK_Y 4
29 #define MASK_Z 2
30 #define MASK_W 1
31 #define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
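/* Note: MASK_X..MASK_W run from bit 3 down to bit 0, i.e. they are already
 * in the order the hardware writemask fields expect (the mask is shifted
 * straight into the VEC/SCA writemask positions below), which is the
 * reverse of the TGSI_WRITEMASK_* bit order; see tgsi_mask().
 */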
32 #define DEF_SCALE 0
33 #define DEF_CTEST 0
34 #include "nv40_shader.h"
35
36 #define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
37 #define neg(s) nv40_sr_neg((s))
38 #define abs(s) nv40_sr_abs((s))
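/* Shorthands over the nv40_sreg helpers, e.g.
 *   swz(src[0], Z, X, Y, Y)
 * expands to
 *   nv40_sr_swz(src[0], SWZ_Z, SWZ_X, SWZ_Y, SWZ_Y)
 * Note that the function-like abs() macro replaces any libc abs() call
 * for the rest of this file.
 */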
39
40 #define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
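/* Clip distances have no hardware output register of their own: CLIP(0)..
 * CLIP(5) are fake destination IDs placed just below ~0, out of the way of
 * the real NV40_VP_INST_DEST_* values.  emit_dst() later remaps them onto
 * the spare components of FOGC (clip 0-2) and PSZ (clip 3-5) and enables
 * the matching hardware clip plane.
 */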
41
42 struct nv40_vpc {
43 struct nv40_vertex_program *vp;
44
45 struct nv40_vertex_program_exec *vpi;
46
47 unsigned r_temps;
48 unsigned r_temps_discard;
49 struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
50 struct nv40_sreg *r_address;
51 struct nv40_sreg *r_temp;
52
53 struct nv40_sreg *imm;
54 unsigned nr_imm;
55 };
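/* r_temps is a bitmask of hardware temporaries currently in use, and
 * r_temps_discard marks the subset grabbed while translating the current
 * instruction, which release_temps() gives back in one go.  r_result,
 * r_address and r_temp map TGSI outputs, address registers and temporaries
 * to the nv40 registers backing them.
 */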
56
57 static struct nv40_sreg
58 temp(struct nv40_vpc *vpc)
59 {
60 int idx = ffs(~vpc->r_temps) - 1;
61
62 if (idx < 0) {
63 NOUVEAU_ERR("out of temps!!\n");
64 assert(0);
65 return nv40_sr(NV40SR_TEMP, 0);
66 }
67
68 vpc->r_temps |= (1 << idx);
69 vpc->r_temps_discard |= (1 << idx);
70 return nv40_sr(NV40SR_TEMP, idx);
71 }
72
73 static INLINE void
74 release_temps(struct nv40_vpc *vpc)
75 {
76 vpc->r_temps &= ~vpc->r_temps_discard;
77 vpc->r_temps_discard = 0;
78 }
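/* Typical usage: scratch registers allocated with temp() while translating
 * a single TGSI instruction are dropped again by the release_temps() call
 * at the end of nv40_vertprog_parse_instruction().  Registers that must
 * live for the whole program (the r_temp/r_address arrays set up in
 * nv40_vertprog_prepare()) survive because prepare clears r_temps_discard
 * once they have been allocated.
 */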
79
80 static struct nv40_sreg
81 constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w)
82 {
83 struct nv40_vertex_program *vp = vpc->vp;
84 struct nv40_vertex_program_data *vpd;
85 int idx;
86
87 if (pipe >= 0) {
88 for (idx = 0; idx < vp->nr_consts; idx++) {
89 if (vp->consts[idx].index == pipe)
90 return nv40_sr(NV40SR_CONST, idx);
91 }
92 }
93
94 idx = vp->nr_consts++;
95 vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
96 vpd = &vp->consts[idx];
97
98 vpd->index = pipe;
99 vpd->value[0] = x;
100 vpd->value[1] = y;
101 vpd->value[2] = z;
102 vpd->value[3] = w;
103 return nv40_sr(NV40SR_CONST, idx);
104 }
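/* A slot created with pipe >= 0 mirrors element 'pipe' of the bound
 * constant buffer and is reused if the same element is requested again;
 * pipe == -1 allocates an anonymous slot for an immediate baked into the
 * program.  Slots with a valid index are refreshed from the constant
 * buffer every time the program is validated.
 */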
105
106 #define arith(cc,s,o,d,m,s0,s1,s2) \
107 nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2))
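/* arith() token-pastes the symbolic opcode onto the hardware define, e.g.
 *   arith(vpc, 0, OP_MOV, dst, mask, src, none, none)
 * becomes
 *   nv40_vp_arith(vpc, 0, NV40_VP_INST_OP_MOV, dst, mask, src, none, none);
 * with slot 0 selecting the vector unit and slot 1 the scalar unit
 * (see nv40_vp_arith() below).
 */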
108
109 static void
110 emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src)
111 {
112 struct nv40_vertex_program *vp = vpc->vp;
113 uint32_t sr = 0;
114
115 switch (src.type) {
116 case NV40SR_TEMP:
117 sr |= (NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT);
118 sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT);
119 break;
120 case NV40SR_INPUT:
121 sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
122 NV40_VP_SRC_REG_TYPE_SHIFT);
123 vp->ir |= (1 << src.index);
124 hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT);
125 break;
126 case NV40SR_CONST:
127 sr |= (NV40_VP_SRC_REG_TYPE_CONST <<
128 NV40_VP_SRC_REG_TYPE_SHIFT);
129 assert(vpc->vpi->const_index == -1 ||
130 vpc->vpi->const_index == src.index);
131 vpc->vpi->const_index = src.index;
132 break;
133 case NV40SR_NONE:
134 sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
135 NV40_VP_SRC_REG_TYPE_SHIFT);
136 break;
137 default:
138 assert(0);
139 }
140
141 if (src.negate)
142 sr |= NV40_VP_SRC_NEGATE;
143
144 if (src.abs)
145 hw[0] |= (1 << (21 + pos));
146
147 sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) |
148 (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) |
149 (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) |
150 (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT));
151
152 switch (pos) {
153 case 0:
154 hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >>
155 NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT;
156 hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) <<
157 NV40_VP_INST_SRC0L_SHIFT;
158 break;
159 case 1:
160 hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT;
161 break;
162 case 2:
163 hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >>
164 NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT;
165 hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) <<
166 NV40_VP_INST_SRC2L_SHIFT;
167 break;
168 default:
169 assert(0);
170 }
171 }
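/* The per-source descriptor 'sr' built above is scattered into the
 * instruction words: source 0 is split between hw[1] (high bits) and
 * hw[2], source 1 fits entirely in hw[2], and source 2 is split between
 * hw[2] and hw[3].  The absolute-value flags sit in hw[0] at bits 21..23.
 * Input and constant source indices are single per-instruction fields
 * shared by all three sources, hence the const_index assertion above.
 */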
172
173 static void
174 emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst)
175 {
176 struct nv40_vertex_program *vp = vpc->vp;
177
178 switch (dst.type) {
179 case NV40SR_TEMP:
180 hw[3] |= NV40_VP_INST_DEST_MASK;
181 if (slot == 0) {
182 hw[0] |= (dst.index <<
183 NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
184 } else {
185 hw[3] |= (dst.index <<
186 NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
187 }
188 break;
189 case NV40SR_OUTPUT:
190 switch (dst.index) {
191 case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
192 case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
193 case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
194 case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
195 case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
196 case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break;
197 case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
198 case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
199 case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
200 case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
201 case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
202 case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
203 case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
204 case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
205 case NV40_VP_INST_DEST_CLIP(0):
206 vp->or |= (1 << 6);
207 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0;
208 dst.index = NV40_VP_INST_DEST_FOGC;
209 break;
210 case NV40_VP_INST_DEST_CLIP(1):
211 vp->or |= (1 << 7);
212 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1;
213 dst.index = NV40_VP_INST_DEST_FOGC;
214 break;
215 case NV40_VP_INST_DEST_CLIP(2):
216 vp->or |= (1 << 8);
217 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2;
218 dst.index = NV40_VP_INST_DEST_FOGC;
219 break;
220 case NV40_VP_INST_DEST_CLIP(3):
221 vp->or |= (1 << 9);
222 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3;
223 dst.index = NV40_VP_INST_DEST_PSZ;
224 break;
225 case NV40_VP_INST_DEST_CLIP(4):
226 vp->or |= (1 << 10);
227 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4;
228 dst.index = NV40_VP_INST_DEST_PSZ;
229 break;
230 case NV40_VP_INST_DEST_CLIP(5):
231 vp->or |= (1 << 11);
232 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE5;
233 dst.index = NV40_VP_INST_DEST_PSZ;
234 break;
235 default:
236 break;
237 }
238
239 hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
240 if (slot == 0) {
241 hw[0] |= NV40_VP_INST_VEC_RESULT;
242 hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
243 } else {
244 hw[3] |= NV40_VP_INST_SCA_RESULT;
245 hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
246 }
247 break;
248 default:
249 assert(0);
250 }
251 }
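/* Writes to hardware outputs also record a bit in vp->or, which validation
 * sends to the hardware right after vp->ir in the two-word
 * NV40TCL_VP_ATTRIB_EN method (presumably the result-enable mask).  Clip
 * distance destinations are folded onto the spare FOGC/PSZ components here
 * and enable the corresponding fixed-function clip plane via
 * vp->clip_ctrl.
 */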
252
253 static void
254 nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op,
255 struct nv40_sreg dst, int mask,
256 struct nv40_sreg s0, struct nv40_sreg s1,
257 struct nv40_sreg s2)
258 {
259 struct nv40_vertex_program *vp = vpc->vp;
260 uint32_t *hw;
261
262 vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
263 vpc->vpi = &vp->insns[vp->nr_insns - 1];
264 memset(vpc->vpi, 0, sizeof(*vpc->vpi));
265 vpc->vpi->const_index = -1;
266
267 hw = vpc->vpi->data;
268
269 hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT);
270 hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) |
271 (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) |
272 (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) |
273 (3 << NV40_VP_INST_COND_SWZ_W_SHIFT));
274
275 if (slot == 0) {
276 hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
277 hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
278 hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
279 } else {
280 hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
281 hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
282 hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
283 }
284
285 emit_dst(vpc, hw, slot, dst);
286 emit_src(vpc, hw, 0, s0);
287 emit_src(vpc, hw, 1, s1);
288 emit_src(vpc, hw, 2, s2);
289 }
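/* Every emitted instruction is four 32-bit words.  The condition test
 * defaults to "always true" (COND_TR) with an identity condition swizzle,
 * and the slot argument picks which unit executes the opcode: 0 for the
 * vector ALU, 1 for the scalar ALU.  The unused unit's temp-destination
 * field is filled with its all-ones mask value, presumably meaning
 * "no destination".
 */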
290
291 static INLINE struct nv40_sreg
292 tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
293 struct nv40_sreg src;
294
295 switch (fsrc->SrcRegister.File) {
296 case TGSI_FILE_INPUT:
297 src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index);
298 break;
299 case TGSI_FILE_CONSTANT:
300 src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
301 break;
302 case TGSI_FILE_IMMEDIATE:
303 src = vpc->imm[fsrc->SrcRegister.Index];
304 break;
305 case TGSI_FILE_TEMPORARY:
306 src = vpc->r_temp[fsrc->SrcRegister.Index];
307 break;
308 default:
309 NOUVEAU_ERR("bad src file\n");
310 break;
311 }
312
313 src.abs = fsrc->SrcRegisterExtMod.Absolute;
314 src.negate = fsrc->SrcRegister.Negate;
315 src.swz[0] = fsrc->SrcRegister.SwizzleX;
316 src.swz[1] = fsrc->SrcRegister.SwizzleY;
317 src.swz[2] = fsrc->SrcRegister.SwizzleZ;
318 src.swz[3] = fsrc->SrcRegister.SwizzleW;
319 return src;
320 }
321
322 static INLINE struct nv40_sreg
323 tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
324 struct nv40_sreg dst;
325
326 switch (fdst->DstRegister.File) {
327 case TGSI_FILE_OUTPUT:
328 dst = vpc->r_result[fdst->DstRegister.Index];
329 break;
330 case TGSI_FILE_TEMPORARY:
331 dst = vpc->r_temp[fdst->DstRegister.Index];
332 break;
333 case TGSI_FILE_ADDRESS:
334 dst = vpc->r_address[fdst->DstRegister.Index];
335 break;
336 default:
337 NOUVEAU_ERR("bad dst file\n");
338 break;
339 }
340
341 return dst;
342 }
343
344 static INLINE int
345 tgsi_mask(uint tgsi)
346 {
347 int mask = 0;
348
349 if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
350 if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
351 if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
352 if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
353 return mask;
354 }
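/* Converts a TGSI writemask (X in bit 0 .. W in bit 3) into the reversed
 * bit order used by MASK_* and the hardware writemask fields.
 */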
355
356 static boolean
357 src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc,
358 struct nv40_sreg *src)
359 {
360 const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
361 struct nv40_sreg tgsi = tgsi_src(vpc, fsrc);
362 uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
363 uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
364 fsrc->SrcRegisterExtSwz.NegateY,
365 fsrc->SrcRegisterExtSwz.NegateZ,
366 fsrc->SrcRegisterExtSwz.NegateW };
367 uint c;
368
369 for (c = 0; c < 4; c++) {
370 switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
371 case TGSI_EXTSWIZZLE_X:
372 case TGSI_EXTSWIZZLE_Y:
373 case TGSI_EXTSWIZZLE_Z:
374 case TGSI_EXTSWIZZLE_W:
375 mask |= tgsi_mask(1 << c);
376 break;
377 case TGSI_EXTSWIZZLE_ZERO:
378 zero_mask |= tgsi_mask(1 << c);
379 tgsi.swz[c] = SWZ_X;
380 break;
381 case TGSI_EXTSWIZZLE_ONE:
382 one_mask |= tgsi_mask(1 << c);
383 tgsi.swz[c] = SWZ_X;
384 break;
385 default:
386 assert(0);
387 }
388
389 if (!tgsi.negate && neg[c])
390 neg_mask |= tgsi_mask(1 << c);
391 }
392
393 if (mask == MASK_ALL && !neg_mask)
394 return TRUE;
395
396 *src = temp(vpc);
397
398 if (mask)
399 arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none);
400
401 if (zero_mask)
402 arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none);
403
404 if (one_mask)
405 arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none);
406
407 if (neg_mask) {
408 struct nv40_sreg one = temp(vpc);
409 arith(vpc, 0, OP_STR, one, neg_mask, one, none, none);
410 arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none);
411 }
412
413 return FALSE;
414 }
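/* Returns TRUE when the extended swizzle is a plain XYZW swizzle with no
 * per-component negate, so the source can be used directly.  Otherwise the
 * value is rebuilt in a temporary and FALSE is returned: MOV for the
 * ordinary components, SFL/STR to force components to 0.0/1.0, and a
 * multiply by -1 (STR into a temp, then MUL with neg()) for per-component
 * negation.
 */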
415
416 static boolean
417 nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
418 const struct tgsi_full_instruction *finst)
419 {
420 struct nv40_sreg src[3], dst, tmp;
421 struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
422 int mask;
423 int ai = -1, ci = -1, ii = -1;
424 int i;
425
426 struct {
427 struct nv40_sreg dst;
428 unsigned m;
429 } clip;
430
431 if (finst->Instruction.Opcode == TGSI_OPCODE_END)
432 return TRUE;
433
434 for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
435 const struct tgsi_full_src_register *fsrc;
436
437 fsrc = &finst->FullSrcRegisters[i];
438 if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
439 src[i] = tgsi_src(vpc, fsrc);
440 }
441 }
442
443 for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
444 const struct tgsi_full_src_register *fsrc;
445
446 fsrc = &finst->FullSrcRegisters[i];
447
448 switch (fsrc->SrcRegister.File) {
449 case TGSI_FILE_INPUT:
450 case TGSI_FILE_CONSTANT:
451 case TGSI_FILE_TEMPORARY:
452 if (!src_native_swz(vpc, fsrc, &src[i]))
453 continue;
454 break;
455 default:
456 break;
457 }
458
459 switch (fsrc->SrcRegister.File) {
460 case TGSI_FILE_INPUT:
461 if (ai == -1 || ai == fsrc->SrcRegister.Index) {
462 ai = fsrc->SrcRegister.Index;
463 src[i] = tgsi_src(vpc, fsrc);
464 } else {
465 src[i] = temp(vpc);
466 arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
467 tgsi_src(vpc, fsrc), none, none);
468 }
469 break;
470 case TGSI_FILE_CONSTANT:
471 if ((ci == -1 && ii == -1) ||
472 ci == fsrc->SrcRegister.Index) {
473 ci = fsrc->SrcRegister.Index;
474 src[i] = tgsi_src(vpc, fsrc);
475 } else {
476 src[i] = temp(vpc);
477 arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
478 tgsi_src(vpc, fsrc), none, none);
479 }
480 break;
481 case TGSI_FILE_IMMEDIATE:
482 if ((ci == -1 && ii == -1) ||
483 ii == fsrc->SrcRegister.Index) {
484 ii = fsrc->SrcRegister.Index;
485 src[i] = tgsi_src(vpc, fsrc);
486 } else {
487 src[i] = temp(vpc);
488 arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
489 tgsi_src(vpc, fsrc), none, none);
490 }
491 break;
492 case TGSI_FILE_TEMPORARY:
493 /* handled above */
494 break;
495 default:
496 NOUVEAU_ERR("bad src file\n");
497 return FALSE;
498 }
499 }
500
501 dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
502 mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
503
504 /* If writing to clip distance regs, need to modify instruction to
505 * change which component is written to. On NV40 the clip regs
506 * are the unused components (yzw) of FOGC/PSZ.
507 */
508 clip.dst = none;
509 if (dst.type == NV40SR_OUTPUT &&
510 dst.index >= NV40_VP_INST_DEST_CLIP(0) &&
511 dst.index <= NV40_VP_INST_DEST_CLIP(5)) {
512 unsigned n = dst.index - NV40_VP_INST_DEST_CLIP(0);
513 unsigned m[] =
514 { MASK_Y, MASK_Z, MASK_W, MASK_Y, MASK_Z, MASK_W };
515
516 /* For some instructions we can get away with just swizzling and/or
517 * changing the writemask; for others we go through a temp reg.
518 */
519 switch (finst->Instruction.Opcode) {
520 case TGSI_OPCODE_DST:
521 case TGSI_OPCODE_EXP:
522 case TGSI_OPCODE_LIT:
523 case TGSI_OPCODE_LOG:
524 case TGSI_OPCODE_XPD:
525 clip.dst = dst;
526 clip.m = m[n];
527 dst = temp(vpc);
528 break;
529 case TGSI_OPCODE_DP3:
530 case TGSI_OPCODE_DP4:
531 case TGSI_OPCODE_DPH:
532 case TGSI_OPCODE_POW:
533 case TGSI_OPCODE_RCP:
534 case TGSI_OPCODE_RSQ:
535 mask = m[n];
536 break;
537 default:
538 for (i = 0; i < finst->Instruction.NumSrcRegs; i++)
539 src[i] = swz(src[i], X, X, X, X);
540 mask = m[n];
541 break;
542 }
543 }
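/* Worked example: a DP4 whose TGSI destination is clip distance 4 gets its
 * writemask forced to MASK_Z here, and emit_dst() then rewrites the
 * destination to NV40_VP_INST_DEST_PSZ and turns on clip plane 4, so the
 * result effectively lands in PSZ.z.
 */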
544
545 switch (finst->Instruction.Opcode) {
546 case TGSI_OPCODE_ABS:
547 arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
548 break;
549 case TGSI_OPCODE_ADD:
550 arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
551 break;
552 case TGSI_OPCODE_ARL:
553 arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
554 break;
555 case TGSI_OPCODE_DP3:
556 arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
557 break;
558 case TGSI_OPCODE_DP4:
559 arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
560 break;
561 case TGSI_OPCODE_DPH:
562 arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
563 break;
564 case TGSI_OPCODE_DST:
565 arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
566 break;
567 case TGSI_OPCODE_EX2:
568 arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
569 break;
570 case TGSI_OPCODE_EXP:
571 arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
572 break;
573 case TGSI_OPCODE_FLR:
574 arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
575 break;
576 case TGSI_OPCODE_FRC:
577 arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
578 break;
579 case TGSI_OPCODE_LG2:
580 arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
581 break;
582 case TGSI_OPCODE_LIT:
583 arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
584 break;
585 case TGSI_OPCODE_LOG:
586 arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
587 break;
588 case TGSI_OPCODE_MAD:
589 arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
590 break;
591 case TGSI_OPCODE_MAX:
592 arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
593 break;
594 case TGSI_OPCODE_MIN:
595 arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
596 break;
597 case TGSI_OPCODE_MOV:
598 arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
599 break;
600 case TGSI_OPCODE_MUL:
601 arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
602 break;
603 case TGSI_OPCODE_POW:
604 tmp = temp(vpc);
605 arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
606 swz(src[0], X, X, X, X));
607 arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
608 swz(src[1], X, X, X, X), none);
609 arith(vpc, 1, OP_EX2, dst, mask, none, none,
610 swz(tmp, X, X, X, X));
611 break;
612 case TGSI_OPCODE_RCP:
613 arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
614 break;
615 case TGSI_OPCODE_RET:
616 break;
617 case TGSI_OPCODE_RSQ:
618 arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
619 break;
620 case TGSI_OPCODE_SGE:
621 arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
622 break;
623 case TGSI_OPCODE_SLT:
624 arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
625 break;
626 case TGSI_OPCODE_SUB:
627 arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
628 break;
629 case TGSI_OPCODE_XPD:
630 tmp = temp(vpc);
631 arith(vpc, 0, OP_MUL, tmp, mask,
632 swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
633 arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
634 swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
635 neg(tmp));
636 break;
637 default:
638 NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
639 return FALSE;
640 }
641
642 if (clip.dst.type != NV40SR_NONE) {
643 arith(vpc, 0, OP_MOV, clip.dst, clip.m,
644 swz(dst, X, X, X, X), none, none);
645 }
646
647 release_temps(vpc);
648 return TRUE;
649 }
650
651 static boolean
652 nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
653 const struct tgsi_full_declaration *fdec)
654 {
655 unsigned idx = fdec->u.DeclarationRange.First;
656 int hw;
657
658 switch (fdec->Semantic.SemanticName) {
659 case TGSI_SEMANTIC_POSITION:
660 hw = NV40_VP_INST_DEST_POS;
661 break;
662 case TGSI_SEMANTIC_COLOR:
663 if (fdec->Semantic.SemanticIndex == 0) {
664 hw = NV40_VP_INST_DEST_COL0;
665 } else
666 if (fdec->Semantic.SemanticIndex == 1) {
667 hw = NV40_VP_INST_DEST_COL1;
668 } else {
669 NOUVEAU_ERR("bad colour semantic index\n");
670 return FALSE;
671 }
672 break;
673 case TGSI_SEMANTIC_BCOLOR:
674 if (fdec->Semantic.SemanticIndex == 0) {
675 hw = NV40_VP_INST_DEST_BFC0;
676 } else
677 if (fdec->Semantic.SemanticIndex == 1) {
678 hw = NV40_VP_INST_DEST_BFC1;
679 } else {
680 NOUVEAU_ERR("bad bcolour semantic index\n");
681 return FALSE;
682 }
683 break;
684 case TGSI_SEMANTIC_FOG:
685 hw = NV40_VP_INST_DEST_FOGC;
686 break;
687 case TGSI_SEMANTIC_PSIZE:
688 hw = NV40_VP_INST_DEST_PSZ;
689 break;
690 case TGSI_SEMANTIC_GENERIC:
691 if (fdec->Semantic.SemanticIndex <= 7) {
692 hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
693 } else {
694 NOUVEAU_ERR("bad generic semantic index\n");
695 return FALSE;
696 }
697 break;
698 #if 0
699 case TGSI_SEMANTIC_CLIP:
700 if (fdec->Semantic.SemanticIndex >= 6) {
701 NOUVEAU_ERR("bad clip distance index\n");
702 return FALSE;
703 }
704 hw = NV40_VP_INST_DEST_CLIP(fdec->Semantic.SemanticIndex);
705 break;
706 #endif
707 default:
708 NOUVEAU_ERR("bad output semantic\n");
709 return FALSE;
710 }
711
712 vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw);
713 return TRUE;
714 }
715
716 static boolean
717 nv40_vertprog_prepare(struct nv40_vpc *vpc)
718 {
719 struct tgsi_parse_context p;
720 int high_temp = -1, high_addr = -1, nr_imm = 0, i;
721
722 tgsi_parse_init(&p, vpc->vp->pipe.tokens);
723 while (!tgsi_parse_end_of_tokens(&p)) {
724 const union tgsi_full_token *tok = &p.FullToken;
725
726 tgsi_parse_token(&p);
727 switch(tok->Token.Type) {
728 case TGSI_TOKEN_TYPE_IMMEDIATE:
729 nr_imm++;
730 break;
731 case TGSI_TOKEN_TYPE_DECLARATION:
732 {
733 const struct tgsi_full_declaration *fdec;
734
735 fdec = &p.FullToken.FullDeclaration;
736 switch (fdec->Declaration.File) {
737 case TGSI_FILE_TEMPORARY:
738 if (fdec->u.DeclarationRange.Last > high_temp) {
739 high_temp =
740 fdec->u.DeclarationRange.Last;
741 }
742 break;
743 #if 0 /* this would be nice.. except gallium doesn't track it */
744 case TGSI_FILE_ADDRESS:
745 if (fdec->u.DeclarationRange.Last > high_addr) {
746 high_addr =
747 fdec->u.DeclarationRange.Last;
748 }
749 break;
750 #endif
751 default:
752 break;
753 }
754 }
755 break;
756 #if 1 /* yay, parse instructions looking for address regs instead */
757 case TGSI_TOKEN_TYPE_INSTRUCTION:
758 {
759 const struct tgsi_full_instruction *finst;
760 const struct tgsi_full_dst_register *fdst;
761
762 finst = &p.FullToken.FullInstruction;
763 fdst = &finst->FullDstRegisters[0];
764
765 if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) {
766 if (fdst->DstRegister.Index > high_addr)
767 high_addr = fdst->DstRegister.Index;
768 }
769
770 }
771 break;
772 #endif
773 default:
774 break;
775 }
776 }
777 tgsi_parse_free(&p);
778
779 if (nr_imm) {
780 vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg));
781 assert(vpc->imm);
782 }
783
784 if (++high_temp) {
785 vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
786 for (i = 0; i < high_temp; i++)
787 vpc->r_temp[i] = temp(vpc);
788 }
789
790 if (++high_addr) {
791 vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg));
792 for (i = 0; i < high_addr; i++)
793 vpc->r_address[i] = temp(vpc);
794 }
795
796 vpc->r_temps_discard = 0;
797 return TRUE;
798 }
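/* Gallium doesn't declare address registers, so they are found by scanning
 * instruction destinations instead (the #if 1 block above).  Both TGSI
 * temporaries and address registers are backed by hardware temps allocated
 * up front; clearing r_temps_discard at the end keeps them from being
 * released by the per-instruction release_temps() calls.
 */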
799
800 static void
801 nv40_vertprog_translate(struct nv40_context *nv40,
802 struct nv40_vertex_program *vp)
803 {
804 struct tgsi_parse_context parse;
805 struct nv40_vpc *vpc = NULL;
806
807 vpc = CALLOC(1, sizeof(struct nv40_vpc));
808 if (!vpc)
809 return;
810 vpc->vp = vp;
811
812 if (!nv40_vertprog_prepare(vpc)) {
813 FREE(vpc);
814 return;
815 }
816
817 tgsi_parse_init(&parse, vp->pipe.tokens);
818
819 while (!tgsi_parse_end_of_tokens(&parse)) {
820 tgsi_parse_token(&parse);
821
822 switch (parse.FullToken.Token.Type) {
823 case TGSI_TOKEN_TYPE_DECLARATION:
824 {
825 const struct tgsi_full_declaration *fdec;
826 fdec = &parse.FullToken.FullDeclaration;
827 switch (fdec->Declaration.File) {
828 case TGSI_FILE_OUTPUT:
829 if (!nv40_vertprog_parse_decl_output(vpc, fdec))
830 goto out_err;
831 break;
832 default:
833 break;
834 }
835 }
836 break;
837 case TGSI_TOKEN_TYPE_IMMEDIATE:
838 {
839 const struct tgsi_full_immediate *imm;
840
841 imm = &parse.FullToken.FullImmediate;
842 assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
843 // assert(imm->Immediate.Size == 4);
844 vpc->imm[vpc->nr_imm++] =
845 constant(vpc, -1,
846 imm->u.ImmediateFloat32[0].Float,
847 imm->u.ImmediateFloat32[1].Float,
848 imm->u.ImmediateFloat32[2].Float,
849 imm->u.ImmediateFloat32[3].Float);
850 }
851 break;
852 case TGSI_TOKEN_TYPE_INSTRUCTION:
853 {
854 const struct tgsi_full_instruction *finst;
855 finst = &parse.FullToken.FullInstruction;
856 if (!nv40_vertprog_parse_instruction(vpc, finst))
857 goto out_err;
858 }
859 break;
860 default:
861 break;
862 }
863 }
864
865 vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST;
866 vp->translated = TRUE;
867 out_err:
868 tgsi_parse_free(&parse);
869 FREE(vpc);
870 }
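/* The final instruction is tagged with NV40_VP_INST_LAST.  If any token
 * fails to translate we bail out before setting vp->translated, which
 * makes nv40_vertprog_validate() below flag the program for the software
 * TNL fallback.
 */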
871
872 static boolean
873 nv40_vertprog_validate(struct nv40_context *nv40)
874 {
875 struct nouveau_winsys *nvws = nv40->nvws;
876 struct pipe_winsys *ws = nv40->pipe.winsys;
877 struct nouveau_grobj *curie = nv40->screen->curie;
878 struct nv40_vertex_program *vp;
879 struct pipe_buffer *constbuf;
880 boolean upload_code = FALSE, upload_data = FALSE;
881 int i;
882
883 if (nv40->render_mode == HW) {
884 vp = nv40->vertprog;
885 constbuf = nv40->constbuf[PIPE_SHADER_VERTEX];
886 } else {
887 vp = nv40->swtnl.vertprog;
888 constbuf = NULL;
889 }
890
891 /* Translate TGSI shader into hw bytecode */
892 if (vp->translated)
893 goto check_gpu_resources;
894
895 nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG;
896 nv40_vertprog_translate(nv40, vp);
897 if (!vp->translated) {
898 nv40->fallback_swtnl |= NV40_NEW_VERTPROG;
899 return FALSE;
900 }
901
902 check_gpu_resources:
903 /* Allocate hw vtxprog exec slots */
904 if (!vp->exec) {
905 struct nouveau_resource *heap = nv40->screen->vp_exec_heap;
906 struct nouveau_stateobj *so;
907 uint vplen = vp->nr_insns;
908
909 if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
910 while (heap->next && heap->size < vplen) {
911 struct nv40_vertex_program *evict;
912
913 evict = heap->next->priv;
914 nvws->res_free(&evict->exec);
915 }
916
917 if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
918 assert(0);
919 }
920
921 so = so_new(7, 0);
922 so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
923 so_data (so, vp->exec->start);
924 so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
925 so_data (so, vp->ir);
926 so_data (so, vp->or);
927 so_method(so, curie, NV40TCL_CLIP_PLANE_ENABLE, 1);
928 so_data (so, vp->clip_ctrl);
929 so_ref(so, &vp->so);
930
931 upload_code = TRUE;
932 }
933
934 /* Allocate hw vtxprog const slots */
935 if (vp->nr_consts && !vp->data) {
936 struct nouveau_resource *heap = nv40->screen->vp_data_heap;
937
938 if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
939 while (heap->next && heap->size < vp->nr_consts) {
940 struct nv40_vertex_program *evict;
941
942 evict = heap->next->priv;
943 nvws->res_free(&evict->data);
944 }
945
946 if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
947 assert(0);
948 }
949
950 /*XXX: handle this some day */
951 assert(vp->data->start >= vp->data_start_min);
952
953 upload_data = TRUE;
954 if (vp->data_start != vp->data->start)
955 upload_code = TRUE;
956 }
957
958 /* If exec or data segments moved we need to patch the program to
959 * fixup offsets and register IDs.
960 */
961 if (vp->exec_start != vp->exec->start) {
962 for (i = 0; i < vp->nr_insns; i++) {
963 struct nv40_vertex_program_exec *vpi = &vp->insns[i];
964
965 if (vpi->has_branch_offset) {
966 assert(0);
967 }
968 }
969
970 vp->exec_start = vp->exec->start;
971 }
972
973 if (vp->nr_consts && vp->data_start != vp->data->start) {
974 for (i = 0; i < vp->nr_insns; i++) {
975 struct nv40_vertex_program_exec *vpi = &vp->insns[i];
976
977 if (vpi->const_index >= 0) {
978 vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK;
979 vpi->data[1] |=
980 (vpi->const_index + vp->data->start) <<
981 NV40_VP_INST_CONST_SRC_SHIFT;
982
983 }
984 }
985
986 vp->data_start = vp->data->start;
987 }
988
989 /* Update + Upload constant values */
990 if (vp->nr_consts) {
991 float *map = NULL;
992
993 if (constbuf) {
994 map = ws->buffer_map(ws, constbuf,
995 PIPE_BUFFER_USAGE_CPU_READ);
996 }
997
998 for (i = 0; i < vp->nr_consts; i++) {
999 struct nv40_vertex_program_data *vpd = &vp->consts[i];
1000
1001 if (vpd->index >= 0) {
1002 if (!upload_data &&
1003 !memcmp(vpd->value, &map[vpd->index * 4],
1004 4 * sizeof(float)))
1005 continue;
1006 memcpy(vpd->value, &map[vpd->index * 4],
1007 4 * sizeof(float));
1008 }
1009
1010 BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
1011 OUT_RING (i + vp->data->start);
1012 OUT_RINGp ((uint32_t *)vpd->value, 4);
1013 }
1014
1015 if (constbuf)
1016 ws->buffer_unmap(ws, constbuf);
1017 }
1018
1019 /* Upload vtxprog */
1020 if (upload_code) {
1021 #if 0
1022 for (i = 0; i < vp->nr_insns; i++) {
1023 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
1024 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
1025 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
1026 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
1027 }
1028 #endif
1029 BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
1030 OUT_RING (vp->exec->start);
1031 for (i = 0; i < vp->nr_insns; i++) {
1032 BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
1033 OUT_RINGp (vp->insns[i].data, 4);
1034 }
1035 }
1036
1037 if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) {
1038 so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]);
1039 return TRUE;
1040 }
1041
1042 return FALSE;
1043 }
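/* Validation in short: translate on first use, allocate exec and constant
 * slots from the per-screen heaps (evicting other programs' allocations
 * until the request fits), patch constant source indices if the data
 * segment moved, re-upload any constants whose values changed in the bound
 * constant buffer, and finally re-upload the whole program whenever its
 * code had to be (re)placed.
 */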
1044
1045 void
1046 nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp)
1047 {
1048 if (vp->nr_consts)
1049 FREE(vp->consts);
1050 if (vp->nr_insns)
1051 FREE(vp->insns);
1052 }
1053
1054 struct nv40_state_entry nv40_state_vertprog = {
1055 .validate = nv40_vertprog_validate,
1056 .dirty = {
1057 .pipe = NV40_NEW_VERTPROG,
1058 .hw = NV40_STATE_VERTPROG,
1059 }
1060 };
1061