nv40: support vp clip distance regs, unused currently.
[mesa.git] / src / gallium / drivers / nv40 / nv40_vertprog.c
1 #include "pipe/p_context.h"
2 #include "pipe/p_defines.h"
3 #include "pipe/p_state.h"
4
5 #include "pipe/p_shader_tokens.h"
6 #include "tgsi/util/tgsi_parse.h"
7 #include "tgsi/util/tgsi_util.h"
8
9 #include "nv40_context.h"
10 #include "nv40_state.h"
11
12 /* TODO (at least...):
13 * 1. Indexed consts + ARL
14  *   2. NV_vp11, NV_vp2, NV_vp3 features
15 * - extra arith opcodes
16 * - branching
17 * - texture sampling
18 * - indexed attribs
19 * - indexed results
20  *   3. bugs
21 */
22
23 #define SWZ_X 0
24 #define SWZ_Y 1
25 #define SWZ_Z 2
26 #define SWZ_W 3
27 #define MASK_X 8
28 #define MASK_Y 4
29 #define MASK_Z 2
30 #define MASK_W 1
31 #define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
32 #define DEF_SCALE 0
33 #define DEF_CTEST 0
34 #include "nv40_shader.h"
35
36 #define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
37 #define neg(s) nv40_sr_neg((s))
38 #define abs(s) nv40_sr_abs((s))
39
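/* Pseudo output indices for the six user clip distances.  They sit at the
 * very top of the unsigned range so they can't collide with the real
 * hardware output indices; emit_dst() remaps them onto the spare FOGC/PSZ
 * components and enables the matching hardware clip plane.
 */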
40 #define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
41
42 struct nv40_vpc {
43 struct nv40_vertex_program *vp;
44
45 struct nv40_vertex_program_exec *vpi;
46
47 unsigned r_temps;
48 unsigned r_temps_discard;
49 struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
50 struct nv40_sreg *r_address;
51 struct nv40_sreg *r_temp;
52
53 struct nv40_sreg *imm;
54 unsigned nr_imm;
55 };
56
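/* Temporary register allocator: r_temps is a bitmask of allocated hw temps,
 * and r_temps_discard marks the scratch temps taken while emitting the
 * current TGSI instruction, so release_temps() can free them afterwards.
 */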
57 static struct nv40_sreg
58 temp(struct nv40_vpc *vpc)
59 {
60 int idx = ffs(~vpc->r_temps) - 1;
61
62 if (idx < 0) {
63 NOUVEAU_ERR("out of temps!!\n");
64 assert(0);
65 return nv40_sr(NV40SR_TEMP, 0);
66 }
67
68 vpc->r_temps |= (1 << idx);
69 vpc->r_temps_discard |= (1 << idx);
70 return nv40_sr(NV40SR_TEMP, idx);
71 }
72
73 static INLINE void
74 release_temps(struct nv40_vpc *vpc)
75 {
76 vpc->r_temps &= ~vpc->r_temps_discard;
77 vpc->r_temps_discard = 0;
78 }
79
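/* Allocate a slot in the program's constant block.  'pipe' >= 0 refers to an
 * element of the bound gallium constant buffer (reused if already allocated,
 * and refreshed at validate time); 'pipe' == -1 allocates an anonymous slot
 * for an immediate whose value is fixed here.
 */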
80 static struct nv40_sreg
81 constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w)
82 {
83 struct nv40_vertex_program *vp = vpc->vp;
84 struct nv40_vertex_program_data *vpd;
85 int idx;
86
87 if (pipe >= 0) {
88 for (idx = 0; idx < vp->nr_consts; idx++) {
89 if (vp->consts[idx].index == pipe)
90 return nv40_sr(NV40SR_CONST, idx);
91 }
92 }
93
94 idx = vp->nr_consts++;
95 vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
96 vpd = &vp->consts[idx];
97
98 vpd->index = pipe;
99 vpd->value[0] = x;
100 vpd->value[1] = y;
101 vpd->value[2] = z;
102 vpd->value[3] = w;
103 return nv40_sr(NV40SR_CONST, idx);
104 }
105
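/* Emit one instruction: 'cc' is the compilation context, 's' selects the
 * slot (0 = vector unit opcode, 1 = scalar unit opcode), 'o' is the opcode
 * name without the NV40_VP_INST_ prefix, 'd'/'m' are the destination and
 * writemask, and s0-s2 are the three source operands.
 */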
106 #define arith(cc,s,o,d,m,s0,s1,s2) \
107 nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2))
108
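/* Pack one source operand into the current 4-dword instruction.  The
 * type/index/negate/swizzle bits are built up in 'sr' and then split across
 * hw[1]..hw[3] according to which of the three source slots (pos) is being
 * filled; the abs flag is a separate bit in hw[0], and input/constant
 * register indices are recorded in shared per-instruction fields (the
 * constant index is patched into the instruction at upload time).
 */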
109 static void
110 emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src)
111 {
112 struct nv40_vertex_program *vp = vpc->vp;
113 uint32_t sr = 0;
114
115 switch (src.type) {
116 case NV40SR_TEMP:
117 sr |= (NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT);
118 sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT);
119 break;
120 case NV40SR_INPUT:
121 sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
122 NV40_VP_SRC_REG_TYPE_SHIFT);
123 vp->ir |= (1 << src.index);
124 hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT);
125 break;
126 case NV40SR_CONST:
127 sr |= (NV40_VP_SRC_REG_TYPE_CONST <<
128 NV40_VP_SRC_REG_TYPE_SHIFT);
129 assert(vpc->vpi->const_index == -1 ||
130 vpc->vpi->const_index == src.index);
131 vpc->vpi->const_index = src.index;
132 break;
133 case NV40SR_NONE:
134 sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
135 NV40_VP_SRC_REG_TYPE_SHIFT);
136 break;
137 default:
138 assert(0);
139 }
140
141 if (src.negate)
142 sr |= NV40_VP_SRC_NEGATE;
143
144 if (src.abs)
145 hw[0] |= (1 << (21 + pos));
146
147 sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) |
148 (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) |
149 (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) |
150 (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT));
151
152 switch (pos) {
153 case 0:
154 hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >>
155 NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT;
156 hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) <<
157 NV40_VP_INST_SRC0L_SHIFT;
158 break;
159 case 1:
160 hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT;
161 break;
162 case 2:
163 hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >>
164 NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT;
165 hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) <<
166 NV40_VP_INST_SRC2L_SHIFT;
167 break;
168 default:
169 assert(0);
170 }
171 }
172
173 static void
174 emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst)
175 {
176 struct nv40_vertex_program *vp = vpc->vp;
177
178 switch (dst.type) {
179 case NV40SR_TEMP:
180 hw[3] |= NV40_VP_INST_DEST_MASK;
181 if (slot == 0) {
182 hw[0] |= (dst.index <<
183 NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
184 } else {
185 hw[3] |= (dst.index <<
186 NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
187 }
188 break;
189 case NV40SR_OUTPUT:
190 switch (dst.index) {
191 case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
192 case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
193 case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
194 case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
195 case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
196 case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break;
197 case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
198 case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
199 case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
200 case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
201 case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
202 case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
203 case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
204 case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
205 case NV40_VP_INST_DEST_CLIP(0):
206 vp->or |= (1 << 6);
207 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0;
208 dst.index = NV40_VP_INST_DEST_FOGC;
209 break;
210 case NV40_VP_INST_DEST_CLIP(1):
211 vp->or |= (1 << 7);
212 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1;
213 dst.index = NV40_VP_INST_DEST_FOGC;
214 break;
215 case NV40_VP_INST_DEST_CLIP(2):
216 vp->or |= (1 << 8);
217 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2;
218 dst.index = NV40_VP_INST_DEST_FOGC;
219 break;
220 case NV40_VP_INST_DEST_CLIP(3):
221 vp->or |= (1 << 9);
222 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3;
223 dst.index = NV40_VP_INST_DEST_PSZ;
224 break;
225 case NV40_VP_INST_DEST_CLIP(4):
226 vp->or |= (1 << 10);
227 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4;
228 dst.index = NV40_VP_INST_DEST_PSZ;
229 break;
230 case NV40_VP_INST_DEST_CLIP(5):
231 vp->or |= (1 << 11);
232 vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE5;
233 dst.index = NV40_VP_INST_DEST_PSZ;
234 break;
235 default:
236 break;
237 }
238
239 hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
240 if (slot == 0) {
241 hw[0] |= NV40_VP_INST_VEC_RESULT;
242 hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
243 } else {
244 hw[3] |= NV40_VP_INST_SCA_RESULT;
245 hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
246 }
247 break;
248 default:
249 assert(0);
250 }
251 }
252
253 static void
254 nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op,
255 struct nv40_sreg dst, int mask,
256 struct nv40_sreg s0, struct nv40_sreg s1,
257 struct nv40_sreg s2)
258 {
259 struct nv40_vertex_program *vp = vpc->vp;
260 uint32_t *hw;
261
262 vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
263 vpc->vpi = &vp->insns[vp->nr_insns - 1];
264 memset(vpc->vpi, 0, sizeof(*vpc->vpi));
265 vpc->vpi->const_index = -1;
266
267 hw = vpc->vpi->data;
268
269 hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT);
270 hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) |
271 (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) |
272 (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) |
273 (3 << NV40_VP_INST_COND_SWZ_W_SHIFT));
274
275 if (slot == 0) {
276 hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
277 hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
278 hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
279 } else {
280 hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
281 hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
282 hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
283 }
284
285 emit_dst(vpc, hw, slot, dst);
286 emit_src(vpc, hw, 0, s0);
287 emit_src(vpc, hw, 1, s1);
288 emit_src(vpc, hw, 2, s2);
289 }
290
291 static INLINE struct nv40_sreg
292 tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
293 struct nv40_sreg src;
294
295 switch (fsrc->SrcRegister.File) {
296 case TGSI_FILE_INPUT:
297 src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index);
298 break;
299 case TGSI_FILE_CONSTANT:
300 src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
301 break;
302 case TGSI_FILE_IMMEDIATE:
303 src = vpc->imm[fsrc->SrcRegister.Index];
304 break;
305 case TGSI_FILE_TEMPORARY:
306 src = vpc->r_temp[fsrc->SrcRegister.Index];
307 break;
308 default:
309 NOUVEAU_ERR("bad src file\n");
310 break;
311 }
312
313 src.abs = fsrc->SrcRegisterExtMod.Absolute;
314 src.negate = fsrc->SrcRegister.Negate;
315 src.swz[0] = fsrc->SrcRegister.SwizzleX;
316 src.swz[1] = fsrc->SrcRegister.SwizzleY;
317 src.swz[2] = fsrc->SrcRegister.SwizzleZ;
318 src.swz[3] = fsrc->SrcRegister.SwizzleW;
319 return src;
320 }
321
322 static INLINE struct nv40_sreg
323 tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
324 struct nv40_sreg dst;
325
326 switch (fdst->DstRegister.File) {
327 case TGSI_FILE_OUTPUT:
328 dst = vpc->r_result[fdst->DstRegister.Index];
329 break;
330 case TGSI_FILE_TEMPORARY:
331 dst = vpc->r_temp[fdst->DstRegister.Index];
332 break;
333 case TGSI_FILE_ADDRESS:
334 dst = vpc->r_address[fdst->DstRegister.Index];
335 break;
336 default:
337 NOUVEAU_ERR("bad dst file\n");
338 break;
339 }
340
341 return dst;
342 }
343
344 static INLINE int
345 tgsi_mask(uint tgsi)
346 {
347 int mask = 0;
348
349 if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
350 if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
351 if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
352 if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
353 return mask;
354 }
355
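/* Check whether a TGSI extended swizzle can be used as-is.  Plain component
 * selects with no per-component negate return TRUE; otherwise the value is
 * built in a temp (MOV for selected components, SFL/STR for ZERO/ONE
 * components, a multiply by -1 for negated ones) and FALSE is returned.
 */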
356 static boolean
357 src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc,
358 struct nv40_sreg *src)
359 {
360 const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
361 struct nv40_sreg tgsi = tgsi_src(vpc, fsrc);
362 uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
363 uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
364 fsrc->SrcRegisterExtSwz.NegateY,
365 fsrc->SrcRegisterExtSwz.NegateZ,
366 fsrc->SrcRegisterExtSwz.NegateW };
367 uint c;
368
369 for (c = 0; c < 4; c++) {
370 switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
371 case TGSI_EXTSWIZZLE_X:
372 case TGSI_EXTSWIZZLE_Y:
373 case TGSI_EXTSWIZZLE_Z:
374 case TGSI_EXTSWIZZLE_W:
375 mask |= tgsi_mask(1 << c);
376 break;
377 case TGSI_EXTSWIZZLE_ZERO:
378 zero_mask |= tgsi_mask(1 << c);
379 tgsi.swz[c] = SWZ_X;
380 break;
381 case TGSI_EXTSWIZZLE_ONE:
382 one_mask |= tgsi_mask(1 << c);
383 tgsi.swz[c] = SWZ_X;
384 break;
385 default:
386 assert(0);
387 }
388
389 if (!tgsi.negate && neg[c])
390 neg_mask |= tgsi_mask(1 << c);
391 }
392
393 if (mask == MASK_ALL && !neg_mask)
394 return TRUE;
395
396 *src = temp(vpc);
397
398 if (mask)
399 arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none);
400
401 if (zero_mask)
402 arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none);
403
404 if (one_mask)
405 arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none);
406
407 if (neg_mask) {
408 struct nv40_sreg one = temp(vpc);
409 arith(vpc, 0, OP_STR, one, neg_mask, one, none, none);
410 arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none);
411 }
412
413 return FALSE;
414 }
415
416 static boolean
417 nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
418 const struct tgsi_full_instruction *finst)
419 {
420 struct nv40_sreg src[3], dst, tmp;
421 struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
422 int mask;
423 int ai = -1, ci = -1, ii = -1;
424 int i;
425
426 struct {
427 struct nv40_sreg dst;
428 unsigned c, m;
429 } clip;
430
431 if (finst->Instruction.Opcode == TGSI_OPCODE_END)
432 return TRUE;
433
434 for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
435 const struct tgsi_full_src_register *fsrc;
436
437 fsrc = &finst->FullSrcRegisters[i];
438 if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
439 src[i] = tgsi_src(vpc, fsrc);
440 }
441 }
442
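/* An instruction can only reference one input attrib and one constant (or
 * immediate) slot directly -- note the single INPUT_SRC field and the
 * const_index assert in emit_src() -- so additional distinct inputs/consts
 * are copied into temps here, and non-native extended swizzles are lowered.
 */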
443 for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
444 const struct tgsi_full_src_register *fsrc;
445
446 fsrc = &finst->FullSrcRegisters[i];
447
448 switch (fsrc->SrcRegister.File) {
449 case TGSI_FILE_INPUT:
450 case TGSI_FILE_CONSTANT:
451 case TGSI_FILE_TEMPORARY:
452 if (!src_native_swz(vpc, fsrc, &src[i]))
453 continue;
454 break;
455 default:
456 break;
457 }
458
459 switch (fsrc->SrcRegister.File) {
460 case TGSI_FILE_INPUT:
461 if (ai == -1 || ai == fsrc->SrcRegister.Index) {
462 ai = fsrc->SrcRegister.Index;
463 src[i] = tgsi_src(vpc, fsrc);
464 } else {
465 src[i] = temp(vpc);
466 arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
467 tgsi_src(vpc, fsrc), none, none);
468 }
469 break;
470 case TGSI_FILE_CONSTANT:
471 if ((ci == -1 && ii == -1) ||
472 ci == fsrc->SrcRegister.Index) {
473 ci = fsrc->SrcRegister.Index;
474 src[i] = tgsi_src(vpc, fsrc);
475 } else {
476 src[i] = temp(vpc);
477 arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
478 tgsi_src(vpc, fsrc), none, none);
479 }
480 break;
481 case TGSI_FILE_IMMEDIATE:
482 if ((ci == -1 && ii == -1) ||
483 ii == fsrc->SrcRegister.Index) {
484 ii = fsrc->SrcRegister.Index;
485 src[i] = tgsi_src(vpc, fsrc);
486 } else {
487 src[i] = temp(vpc);
488 arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
489 tgsi_src(vpc, fsrc), none, none);
490 }
491 break;
492 case TGSI_FILE_TEMPORARY:
493 /* handled above */
494 break;
495 default:
496 NOUVEAU_ERR("bad src file\n");
497 return FALSE;
498 }
499 }
500
501 dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
502 mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
503
504 	/* If writing to a clip distance reg, the instruction needs modifying to
505 	 * write the correct component.  On NV40 the clip distances live in the
506 	 * otherwise unused (yzw) components of FOGC (planes 0-2) and PSZ (3-5).
507 	 */
508 clip.dst = none;
509 if (dst.type == NV40SR_OUTPUT &&
510 dst.index >= NV40_VP_INST_DEST_CLIP(0) &&
511 dst.index <= NV40_VP_INST_DEST_CLIP(5)) {
512 unsigned n = dst.index - NV40_VP_INST_DEST_CLIP(0);
513 unsigned c[] = { SWZ_Y, SWZ_Z, SWZ_W, SWZ_Y, SWZ_Z, SWZ_W };
514 unsigned m[] =
515 { MASK_Y, MASK_Z, MASK_W, MASK_Y, MASK_Z, MASK_W };
516
517 		/* For some instructions we can get away with swizzling the sources
518 		 * and/or changing the writemask; for the rest we go through a temp reg.
519 		 */
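/* For example, writing clip distance 4 targets PSZ.z: a DP4 produces the
 * same value in every component, so only the writemask shrinks to z; a MOV
 * instead has its sources swizzled to .zzzz; something like LIT, whose
 * components differ, goes to a temp first and the chosen component is MOVed
 * into PSZ afterwards (see the clip.dst handling below).
 */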
520 switch (finst->Instruction.Opcode) {
521 case TGSI_OPCODE_DST:
522 case TGSI_OPCODE_EXP:
523 case TGSI_OPCODE_LIT:
524 case TGSI_OPCODE_LOG:
525 case TGSI_OPCODE_XPD:
526 clip.dst = dst;
527 clip.c = c[n];
528 clip.m = m[n];
529 dst = temp(vpc);
530 break;
531 case TGSI_OPCODE_DP3:
532 case TGSI_OPCODE_DP4:
533 case TGSI_OPCODE_DPH:
534 case TGSI_OPCODE_POW:
535 case TGSI_OPCODE_RCP:
536 case TGSI_OPCODE_RSQ:
537 mask = m[n];
538 break;
539 default:
540 for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
541 src[i] = nv40_sr_swz(src[i],
542 c[n], c[n], c[n], c[n]);
543 }
544 mask = m[n];
545 break;
546 }
547 }
548
549 switch (finst->Instruction.Opcode) {
550 case TGSI_OPCODE_ABS:
551 arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
552 break;
553 case TGSI_OPCODE_ADD:
554 arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
555 break;
556 case TGSI_OPCODE_ARL:
557 arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
558 break;
559 case TGSI_OPCODE_DP3:
560 arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
561 break;
562 case TGSI_OPCODE_DP4:
563 arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
564 break;
565 case TGSI_OPCODE_DPH:
566 arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
567 break;
568 case TGSI_OPCODE_DST:
569 arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
570 break;
571 case TGSI_OPCODE_EX2:
572 arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
573 break;
574 case TGSI_OPCODE_EXP:
575 arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
576 break;
577 case TGSI_OPCODE_FLR:
578 arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
579 break;
580 case TGSI_OPCODE_FRC:
581 arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
582 break;
583 case TGSI_OPCODE_LG2:
584 arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
585 break;
586 case TGSI_OPCODE_LIT:
587 arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
588 break;
589 case TGSI_OPCODE_LOG:
590 arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
591 break;
592 case TGSI_OPCODE_MAD:
593 arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
594 break;
595 case TGSI_OPCODE_MAX:
596 arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
597 break;
598 case TGSI_OPCODE_MIN:
599 arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
600 break;
601 case TGSI_OPCODE_MOV:
602 arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
603 break;
604 case TGSI_OPCODE_MUL:
605 arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
606 break;
607 case TGSI_OPCODE_POW:
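/* pow(x, y) is expanded to 2^(y * log2(x)) (LG2, MUL, EX2), computed on
 * the x components of the sources. */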
608 tmp = temp(vpc);
609 arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
610 swz(src[0], X, X, X, X));
611 arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
612 swz(src[1], X, X, X, X), none);
613 arith(vpc, 1, OP_EX2, dst, mask, none, none,
614 swz(tmp, X, X, X, X));
615 break;
616 case TGSI_OPCODE_RCP:
617 arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
618 break;
619 case TGSI_OPCODE_RET:
620 break;
621 case TGSI_OPCODE_RSQ:
622 arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
623 break;
624 case TGSI_OPCODE_SGE:
625 arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
626 break;
627 case TGSI_OPCODE_SLT:
628 arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
629 break;
630 case TGSI_OPCODE_SUB:
631 arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
632 break;
633 case TGSI_OPCODE_XPD:
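/* Cross product via MUL + MAD:
 * dst.xyz = src0.yzx * src1.zxy - src0.zxy * src1.yzx (w masked off). */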
634 tmp = temp(vpc);
635 arith(vpc, 0, OP_MUL, tmp, mask,
636 swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
637 arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
638 swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
639 neg(tmp));
640 break;
641 default:
642 NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
643 return FALSE;
644 }
645
646 if (clip.dst.type != NV40SR_NONE) {
647 arith(vpc, 0, OP_MOV, clip.dst, clip.m,
648 nv40_sr_swz(dst, clip.c, clip.c, clip.c, clip.c),
649 none, none);
650 }
651
652 release_temps(vpc);
653 return TRUE;
654 }
655
656 static boolean
657 nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
658 const struct tgsi_full_declaration *fdec)
659 {
660 unsigned idx = fdec->u.DeclarationRange.First;
661 int hw;
662
663 switch (fdec->Semantic.SemanticName) {
664 case TGSI_SEMANTIC_POSITION:
665 hw = NV40_VP_INST_DEST_POS;
666 break;
667 case TGSI_SEMANTIC_COLOR:
668 if (fdec->Semantic.SemanticIndex == 0) {
669 hw = NV40_VP_INST_DEST_COL0;
670 } else
671 if (fdec->Semantic.SemanticIndex == 1) {
672 hw = NV40_VP_INST_DEST_COL1;
673 } else {
674 NOUVEAU_ERR("bad colour semantic index\n");
675 return FALSE;
676 }
677 break;
678 case TGSI_SEMANTIC_BCOLOR:
679 if (fdec->Semantic.SemanticIndex == 0) {
680 hw = NV40_VP_INST_DEST_BFC0;
681 } else
682 if (fdec->Semantic.SemanticIndex == 1) {
683 hw = NV40_VP_INST_DEST_BFC1;
684 } else {
685 NOUVEAU_ERR("bad bcolour semantic index\n");
686 return FALSE;
687 }
688 break;
689 case TGSI_SEMANTIC_FOG:
690 hw = NV40_VP_INST_DEST_FOGC;
691 break;
692 case TGSI_SEMANTIC_PSIZE:
693 hw = NV40_VP_INST_DEST_PSZ;
694 break;
695 case TGSI_SEMANTIC_GENERIC:
696 if (fdec->Semantic.SemanticIndex <= 7) {
697 hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
698 } else {
699 NOUVEAU_ERR("bad generic semantic index\n");
700 return FALSE;
701 }
702 break;
703 #if 0
704 case TGSI_SEMANTIC_CLIP:
705 if (fdec->Semantic.SemanticIndex >= 6) {
706 NOUVEAU_ERR("bad clip distance index\n");
707 return FALSE;
708 }
709 hw = NV40_VP_INST_DEST_CLIP(fdec->Semantic.SemanticIndex);
710 break;
711 #endif
712 default:
713 NOUVEAU_ERR("bad output semantic\n");
714 return FALSE;
715 }
716
717 vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw);
718 return TRUE;
719 }
720
721 static boolean
722 nv40_vertprog_prepare(struct nv40_vpc *vpc)
723 {
724 struct tgsi_parse_context p;
725 int high_temp = -1, high_addr = -1, nr_imm = 0, i;
726
727 tgsi_parse_init(&p, vpc->vp->pipe.tokens);
728 while (!tgsi_parse_end_of_tokens(&p)) {
729 const union tgsi_full_token *tok = &p.FullToken;
730
731 tgsi_parse_token(&p);
732 switch(tok->Token.Type) {
733 case TGSI_TOKEN_TYPE_IMMEDIATE:
734 nr_imm++;
735 break;
736 case TGSI_TOKEN_TYPE_DECLARATION:
737 {
738 const struct tgsi_full_declaration *fdec;
739
740 fdec = &p.FullToken.FullDeclaration;
741 switch (fdec->Declaration.File) {
742 case TGSI_FILE_TEMPORARY:
743 if (fdec->u.DeclarationRange.Last > high_temp) {
744 high_temp =
745 fdec->u.DeclarationRange.Last;
746 }
747 break;
748 #if 0 /* this would be nice... except gallium doesn't declare address regs */
749 case TGSI_FILE_ADDRESS:
750 if (fdec->u.DeclarationRange.Last > high_addr) {
751 high_addr =
752 fdec->u.DeclarationRange.Last;
753 }
754 break;
755 #endif
756 default:
757 break;
758 }
759 }
760 break;
761 #if 1 /* yay, parse instructions looking for address regs instead */
762 case TGSI_TOKEN_TYPE_INSTRUCTION:
763 {
764 const struct tgsi_full_instruction *finst;
765 const struct tgsi_full_dst_register *fdst;
766
767 finst = &p.FullToken.FullInstruction;
768 fdst = &finst->FullDstRegisters[0];
769
770 if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) {
771 if (fdst->DstRegister.Index > high_addr)
772 high_addr = fdst->DstRegister.Index;
773 }
774
775 }
776 break;
777 #endif
778 default:
779 break;
780 }
781 }
782 tgsi_parse_free(&p);
783
784 if (nr_imm) {
785 vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg));
786 assert(vpc->imm);
787 }
788
789 if (++high_temp) {
790 vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
791 for (i = 0; i < high_temp; i++)
792 vpc->r_temp[i] = temp(vpc);
793 }
794
795 if (++high_addr) {
796 vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg));
797 for (i = 0; i < high_addr; i++)
798 vpc->r_address[i] = temp(vpc);
799 }
800
801 vpc->r_temps_discard = 0;
802 return TRUE;
803 }
804
805 static void
806 nv40_vertprog_translate(struct nv40_context *nv40,
807 struct nv40_vertex_program *vp)
808 {
809 struct tgsi_parse_context parse;
810 struct nv40_vpc *vpc = NULL;
811
812 vpc = CALLOC(1, sizeof(struct nv40_vpc));
813 if (!vpc)
814 return;
815 vpc->vp = vp;
816
817 if (!nv40_vertprog_prepare(vpc)) {
818 FREE(vpc);
819 return;
820 }
821
822 tgsi_parse_init(&parse, vp->pipe.tokens);
823
824 while (!tgsi_parse_end_of_tokens(&parse)) {
825 tgsi_parse_token(&parse);
826
827 switch (parse.FullToken.Token.Type) {
828 case TGSI_TOKEN_TYPE_DECLARATION:
829 {
830 const struct tgsi_full_declaration *fdec;
831 fdec = &parse.FullToken.FullDeclaration;
832 switch (fdec->Declaration.File) {
833 case TGSI_FILE_OUTPUT:
834 if (!nv40_vertprog_parse_decl_output(vpc, fdec))
835 goto out_err;
836 break;
837 default:
838 break;
839 }
840 }
841 break;
842 case TGSI_TOKEN_TYPE_IMMEDIATE:
843 {
844 const struct tgsi_full_immediate *imm;
845
846 imm = &parse.FullToken.FullImmediate;
847 assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
848 // assert(imm->Immediate.Size == 4);
849 vpc->imm[vpc->nr_imm++] =
850 constant(vpc, -1,
851 imm->u.ImmediateFloat32[0].Float,
852 imm->u.ImmediateFloat32[1].Float,
853 imm->u.ImmediateFloat32[2].Float,
854 imm->u.ImmediateFloat32[3].Float);
855 }
856 break;
857 case TGSI_TOKEN_TYPE_INSTRUCTION:
858 {
859 const struct tgsi_full_instruction *finst;
860 finst = &parse.FullToken.FullInstruction;
861 if (!nv40_vertprog_parse_instruction(vpc, finst))
862 goto out_err;
863 }
864 break;
865 default:
866 break;
867 }
868 }
869
870 vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST;
871 vp->translated = TRUE;
872 out_err:
873 tgsi_parse_free(&parse);
874 FREE(vpc);
875 }
876
877 static boolean
878 nv40_vertprog_validate(struct nv40_context *nv40)
879 {
880 struct nouveau_winsys *nvws = nv40->nvws;
881 struct pipe_winsys *ws = nv40->pipe.winsys;
882 struct nouveau_grobj *curie = nv40->screen->curie;
883 struct nv40_vertex_program *vp;
884 struct pipe_buffer *constbuf;
885 boolean upload_code = FALSE, upload_data = FALSE;
886 int i;
887
888 if (nv40->render_mode == HW) {
889 vp = nv40->vertprog;
890 constbuf = nv40->constbuf[PIPE_SHADER_VERTEX];
891 } else {
892 vp = nv40->swtnl.vertprog;
893 constbuf = NULL;
894 }
895
896 /* Translate TGSI shader into hw bytecode */
897 if (vp->translated)
898 goto check_gpu_resources;
899
900 nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG;
901 nv40_vertprog_translate(nv40, vp);
902 if (!vp->translated) {
903 nv40->fallback_swtnl |= NV40_NEW_VERTPROG;
904 return FALSE;
905 }
906
907 check_gpu_resources:
908 /* Allocate hw vtxprog exec slots */
909 if (!vp->exec) {
910 struct nouveau_resource *heap = nv40->screen->vp_exec_heap;
911 struct nouveau_stateobj *so;
912 uint vplen = vp->nr_insns;
913
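/* If the exec heap is full, evict other programs' microcode until the
 * allocation fits; an evicted program simply loses its ->exec slot and is
 * re-uploaded the next time it gets validated.
 */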
914 if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
915 while (heap->next && heap->size < vplen) {
916 struct nv40_vertex_program *evict;
917
918 evict = heap->next->priv;
919 nvws->res_free(&evict->exec);
920 }
921
922 if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
923 assert(0);
924 }
925
926 so = so_new(7, 0);
927 so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
928 so_data (so, vp->exec->start);
929 so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
930 so_data (so, vp->ir);
931 so_data (so, vp->or);
932 so_method(so, curie, NV40TCL_CLIP_PLANE_ENABLE, 1);
933 so_data (so, vp->clip_ctrl);
934 so_ref(so, &vp->so);
935
936 upload_code = TRUE;
937 }
938
939 /* Allocate hw vtxprog const slots */
940 if (vp->nr_consts && !vp->data) {
941 struct nouveau_resource *heap = nv40->screen->vp_data_heap;
942
943 if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
944 while (heap->next && heap->size < vp->nr_consts) {
945 struct nv40_vertex_program *evict;
946
947 evict = heap->next->priv;
948 nvws->res_free(&evict->data);
949 }
950
951 if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
952 assert(0);
953 }
954
955 /*XXX: handle this some day */
956 assert(vp->data->start >= vp->data_start_min);
957
958 upload_data = TRUE;
959 if (vp->data_start != vp->data->start)
960 upload_code = TRUE;
961 }
962
963 /* If exec or data segments moved we need to patch the program to
964 * fixup offsets and register IDs.
965 */
966 if (vp->exec_start != vp->exec->start) {
967 for (i = 0; i < vp->nr_insns; i++) {
968 struct nv40_vertex_program_exec *vpi = &vp->insns[i];
969
970 if (vpi->has_branch_offset) {
971 assert(0);
972 }
973 }
974
975 vp->exec_start = vp->exec->start;
976 }
977
978 if (vp->nr_consts && vp->data_start != vp->data->start) {
979 for (i = 0; i < vp->nr_insns; i++) {
980 struct nv40_vertex_program_exec *vpi = &vp->insns[i];
981
982 if (vpi->const_index >= 0) {
983 vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK;
984 vpi->data[1] |=
985 (vpi->const_index + vp->data->start) <<
986 NV40_VP_INST_CONST_SRC_SHIFT;
987
988 }
989 }
990
991 vp->data_start = vp->data->start;
992 }
993
994 /* Update + Upload constant values */
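/* Entries with index >= 0 track the bound constant buffer and are skipped
 * if their value hasn't changed (unless the data block was just moved);
 * entries with index == -1 are immediates fixed at compile time.
 */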
995 if (vp->nr_consts) {
996 float *map = NULL;
997
998 if (constbuf) {
999 map = ws->buffer_map(ws, constbuf,
1000 PIPE_BUFFER_USAGE_CPU_READ);
1001 }
1002
1003 for (i = 0; i < vp->nr_consts; i++) {
1004 struct nv40_vertex_program_data *vpd = &vp->consts[i];
1005
1006 if (vpd->index >= 0) {
1007 if (!upload_data &&
1008 !memcmp(vpd->value, &map[vpd->index * 4],
1009 4 * sizeof(float)))
1010 continue;
1011 memcpy(vpd->value, &map[vpd->index * 4],
1012 4 * sizeof(float));
1013 }
1014
1015 BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
1016 OUT_RING (i + vp->data->start);
1017 OUT_RINGp ((uint32_t *)vpd->value, 4);
1018 }
1019
1020 if (constbuf)
1021 ws->buffer_unmap(ws, constbuf);
1022 }
1023
1024 /* Upload vtxprog */
1025 if (upload_code) {
1026 #if 0
1027 for (i = 0; i < vp->nr_insns; i++) {
1028 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
1029 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
1030 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
1031 NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
1032 }
1033 #endif
1034 BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
1035 OUT_RING (vp->exec->start);
1036 for (i = 0; i < vp->nr_insns; i++) {
1037 BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
1038 OUT_RINGp (vp->insns[i].data, 4);
1039 }
1040 }
1041
1042 if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) {
1043 so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]);
1044 return TRUE;
1045 }
1046
1047 return FALSE;
1048 }
1049
1050 void
1051 nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp)
1052 {
1053 if (vp->nr_consts)
1054 FREE(vp->consts);
1055 if (vp->nr_insns)
1056 FREE(vp->insns);
1057 }
1058
1059 struct nv40_state_entry nv40_state_vertprog = {
1060 .validate = nv40_vertprog_validate,
1061 .dirty = {
1062 .pipe = NV40_NEW_VERTPROG,
1063 .hw = NV40_STATE_VERTPROG,
1064 }
1065 };
1066