Merge branch 'i915-unification' of git+ssh://people.freedesktop.org/~anholt/mesa...
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "program.h"
34 #include "macros.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41
42 /* Do things as simply as possible. Allocate and populate all regs
43 * ahead of time.
44 */
45 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
46 {
47 GLuint i, reg = 0, mrf;
48 GLuint nr_params;
49
50 /* r0 -- reserved as usual
51 */
52 c->r0 = brw_vec8_grf(reg, 0); reg++;
53
54 /* User clip planes from curbe:
55 */
56 if (c->key.nr_userclip) {
57 for (i = 0; i < c->key.nr_userclip; i++) {
58 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
59 }
60
61 /* Deal with curbe alignment:
62 */
63 reg += ((6+c->key.nr_userclip+3)/4)*2;
64 }
65
66 /* Vertex program parameters from curbe:
67 */
68 nr_params = c->vp->program.Base.Parameters->NumParameters;
69 for (i = 0; i < nr_params; i++) {
70 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
71 }
72 reg += (nr_params+1)/2;
73
74 c->prog_data.curb_read_length = reg - 1;
75
76
77
78 /* Allocate input regs:
79 */
80 c->nr_inputs = 0;
81 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
82 if (c->prog_data.inputs_read & (1<<i)) {
83 c->nr_inputs++;
84 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
85 reg++;
86 }
87 }
88
89
90 /* Allocate outputs: TODO: could organize the non-position outputs
91 * to go straight into message regs.
92 */
93 c->nr_outputs = 0;
94 c->first_output = reg;
95 mrf = 4;
96 for (i = 0; i < VERT_RESULT_MAX; i++) {
97 if (c->prog_data.outputs_written & (1<<i)) {
98 c->nr_outputs++;
99 if (i == VERT_RESULT_HPOS) {
100 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
101 reg++;
102 }
103 else if (i == VERT_RESULT_PSIZ) {
104 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
105 reg++;
106 mrf++; /* just a placeholder? XXX fix later stages & remove this */
107 }
108 else {
109 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
110 mrf++;
111 }
112 }
113 }
114
115 /* Allocate program temporaries:
116 */
117 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
118 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
119 reg++;
120 }
121
122 /* Address reg(s). Don't try to use the internal address reg until
123 * deref time.
124 */
125 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
126 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
127 reg,
128 0,
129 BRW_REGISTER_TYPE_D,
130 BRW_VERTICAL_STRIDE_8,
131 BRW_WIDTH_8,
132 BRW_HORIZONTAL_STRIDE_1,
133 BRW_SWIZZLE_XXXX,
134 WRITEMASK_X);
135 reg++;
136 }
137
138
139 /* Some opcodes need an internal temporary:
140 */
141 c->first_tmp = reg;
142 c->last_tmp = reg; /* for allocation purposes */
143
144 /* Each input reg holds data from two vertices. The
145 * urb_read_length is the number of registers read from *each*
146 * vertex urb, so is half the amount:
147 */
148 c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
149
150 c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
151 c->prog_data.total_grf = reg;
152 }
153
154
155 static struct brw_reg get_tmp( struct brw_vs_compile *c )
156 {
157 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
158
159 if (++c->last_tmp > c->prog_data.total_grf)
160 c->prog_data.total_grf = c->last_tmp;
161
162 return tmp;
163 }
164
165 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
166 {
167 if (tmp.nr == c->last_tmp-1)
168 c->last_tmp--;
169 }
170
171 static void release_tmps( struct brw_vs_compile *c )
172 {
173 c->last_tmp = c->first_tmp;
174 }
175
176
177 static void unalias1( struct brw_vs_compile *c,
178 struct brw_reg dst,
179 struct brw_reg arg0,
180 void (*func)( struct brw_vs_compile *,
181 struct brw_reg,
182 struct brw_reg ))
183 {
184 if (dst.file == arg0.file && dst.nr == arg0.nr) {
185 struct brw_compile *p = &c->func;
186 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
187 func(c, tmp, arg0);
188 brw_MOV(p, dst, tmp);
189 }
190 else {
191 func(c, dst, arg0);
192 }
193 }
194
195 static void unalias2( struct brw_vs_compile *c,
196 struct brw_reg dst,
197 struct brw_reg arg0,
198 struct brw_reg arg1,
199 void (*func)( struct brw_vs_compile *,
200 struct brw_reg,
201 struct brw_reg,
202 struct brw_reg ))
203 {
204 if ((dst.file == arg0.file && dst.nr == arg0.nr) &&
205 (dst.file == arg1.file && dst.nr == arg1.nr)) {
206 struct brw_compile *p = &c->func;
207 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
208 func(c, tmp, arg0, arg1);
209 brw_MOV(p, dst, tmp);
210 }
211 else {
212 func(c, dst, arg0, arg1);
213 }
214 }
215
216
217
218
219 static void emit_slt( struct brw_compile *p,
220 struct brw_reg dst,
221 struct brw_reg arg0,
222 struct brw_reg arg1 )
223 {
224 /* Could be done with an if/else/endif, but this method uses half
225 * the instructions. Note that we are careful to reference the
226 * arguments before writing the dest. That means we emit the
227 * instructions in an odd order and have to play with the flag
228 * values.
229 */
230 brw_push_insn_state(p);
231 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
232
233 /* Write all values to 1:
234 */
235 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
236 brw_MOV(p, dst, brw_imm_f(1.0));
237
238 /* Where the test succeeded, overwite with zero:
239 */
240 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
241 brw_MOV(p, dst, brw_imm_f(0.0));
242 brw_pop_insn_state(p);
243 }
244
245
246 static void emit_sge( struct brw_compile *p,
247 struct brw_reg dst,
248 struct brw_reg arg0,
249 struct brw_reg arg1 )
250 {
251 brw_push_insn_state(p);
252 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
253
254 /* Write all values to zero:
255 */
256 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
257 brw_MOV(p, dst, brw_imm_f(0));
258
259 /* Where the test succeeded, overwite with 1:
260 */
261 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
262 brw_MOV(p, dst, brw_imm_f(1.0));
263 brw_pop_insn_state(p);
264 }
265
266
267 static void emit_max( struct brw_compile *p,
268 struct brw_reg dst,
269 struct brw_reg arg0,
270 struct brw_reg arg1 )
271 {
272 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
273 brw_SEL(p, dst, arg1, arg0);
274 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
275 }
276
277 static void emit_min( struct brw_compile *p,
278 struct brw_reg dst,
279 struct brw_reg arg0,
280 struct brw_reg arg1 )
281 {
282 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
283 brw_SEL(p, dst, arg0, arg1);
284 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
285 }
286
287
288 static void emit_math1( struct brw_vs_compile *c,
289 GLuint function,
290 struct brw_reg dst,
291 struct brw_reg arg0,
292 GLuint precision)
293 {
294 /* There are various odd behaviours with SEND on the simulator. In
295 * addition there are documented issues with the fact that the GEN4
296 * processor doesn't do dependency control properly on SEND
297 * results. So, on balance, this kludge to get around failures
298 * with writemasked math results looks like it might be necessary
299 * whether that turns out to be a simulator bug or not:
300 */
301 struct brw_compile *p = &c->func;
302 struct brw_reg tmp = dst;
303 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
304 dst.file != BRW_GENERAL_REGISTER_FILE);
305
306 if (need_tmp)
307 tmp = get_tmp(c);
308
309 brw_math(p,
310 tmp,
311 function,
312 BRW_MATH_SATURATE_NONE,
313 2,
314 arg0,
315 BRW_MATH_DATA_SCALAR,
316 precision);
317
318 if (need_tmp) {
319 brw_MOV(p, dst, tmp);
320 release_tmp(c, tmp);
321 }
322 }
323
324 static void emit_math2( struct brw_vs_compile *c,
325 GLuint function,
326 struct brw_reg dst,
327 struct brw_reg arg0,
328 struct brw_reg arg1,
329 GLuint precision)
330 {
331 struct brw_compile *p = &c->func;
332 struct brw_reg tmp = dst;
333 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
334 dst.file != BRW_GENERAL_REGISTER_FILE);
335
336 if (need_tmp)
337 tmp = get_tmp(c);
338
339 brw_MOV(p, brw_message_reg(3), arg1);
340
341 brw_math(p,
342 tmp,
343 function,
344 BRW_MATH_SATURATE_NONE,
345 2,
346 arg0,
347 BRW_MATH_DATA_SCALAR,
348 precision);
349
350 if (need_tmp) {
351 brw_MOV(p, dst, tmp);
352 release_tmp(c, tmp);
353 }
354 }
355
356
357
358 static void emit_exp_noalias( struct brw_vs_compile *c,
359 struct brw_reg dst,
360 struct brw_reg arg0 )
361 {
362 struct brw_compile *p = &c->func;
363
364
365 if (dst.dw1.bits.writemask & WRITEMASK_X) {
366 struct brw_reg tmp = get_tmp(c);
367 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
368
369 /* tmp_d = floor(arg0.x) */
370 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
371
372 /* result[0] = 2.0 ^ tmp */
373
374 /* Adjust exponent for floating point:
375 * exp += 127
376 */
377 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
378
379 /* Install exponent and sign.
380 * Excess drops off the edge:
381 */
382 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
383 tmp_d, brw_imm_d(23));
384
385 release_tmp(c, tmp);
386 }
387
388 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
389 /* result[1] = arg0.x - floor(arg0.x) */
390 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
391 }
392
393 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
394 /* As with the LOG instruction, we might be better off just
395 * doing a taylor expansion here, seeing as we have to do all
396 * the prep work.
397 *
398 * If mathbox partial precision is too low, consider also:
399 * result[3] = result[0] * EXP(result[1])
400 */
401 emit_math1(c,
402 BRW_MATH_FUNCTION_EXP,
403 brw_writemask(dst, WRITEMASK_Z),
404 brw_swizzle1(arg0, 0),
405 BRW_MATH_PRECISION_PARTIAL);
406 }
407
408 if (dst.dw1.bits.writemask & WRITEMASK_W) {
409 /* result[3] = 1.0; */
410 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
411 }
412 }
413
414
415 static void emit_log_noalias( struct brw_vs_compile *c,
416 struct brw_reg dst,
417 struct brw_reg arg0 )
418 {
419 struct brw_compile *p = &c->func;
420 struct brw_reg tmp = dst;
421 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
422 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
423 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
424 dst.file != BRW_GENERAL_REGISTER_FILE);
425
426 if (need_tmp) {
427 tmp = get_tmp(c);
428 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
429 }
430
431 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
432 * according to spec:
433 *
434 * These almost look likey they could be joined up, but not really
435 * practical:
436 *
437 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
438 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
439 */
440 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
441 brw_AND(p,
442 brw_writemask(tmp_ud, WRITEMASK_X),
443 brw_swizzle1(arg0_ud, 0),
444 brw_imm_ud((1U<<31)-1));
445
446 brw_SHR(p,
447 brw_writemask(tmp_ud, WRITEMASK_X),
448 tmp_ud,
449 brw_imm_ud(23));
450
451 brw_ADD(p,
452 brw_writemask(tmp, WRITEMASK_X),
453 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
454 brw_imm_d(-127));
455 }
456
457 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
458 brw_AND(p,
459 brw_writemask(tmp_ud, WRITEMASK_Y),
460 brw_swizzle1(arg0_ud, 0),
461 brw_imm_ud((1<<23)-1));
462
463 brw_OR(p,
464 brw_writemask(tmp_ud, WRITEMASK_Y),
465 tmp_ud,
466 brw_imm_ud(127<<23));
467 }
468
469 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
470 /* result[2] = result[0] + LOG2(result[1]); */
471
472 /* Why bother? The above is just a hint how to do this with a
473 * taylor series. Maybe we *should* use a taylor series as by
474 * the time all the above has been done it's almost certainly
475 * quicker than calling the mathbox, even with low precision.
476 *
477 * Options are:
478 * - result[0] + mathbox.LOG2(result[1])
479 * - mathbox.LOG2(arg0.x)
480 * - result[0] + inline_taylor_approx(result[1])
481 */
482 emit_math1(c,
483 BRW_MATH_FUNCTION_LOG,
484 brw_writemask(tmp, WRITEMASK_Z),
485 brw_swizzle1(tmp, 1),
486 BRW_MATH_PRECISION_FULL);
487
488 brw_ADD(p,
489 brw_writemask(tmp, WRITEMASK_Z),
490 brw_swizzle1(tmp, 2),
491 brw_swizzle1(tmp, 0));
492 }
493
494 if (dst.dw1.bits.writemask & WRITEMASK_W) {
495 /* result[3] = 1.0; */
496 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
497 }
498
499 if (need_tmp) {
500 brw_MOV(p, dst, tmp);
501 release_tmp(c, tmp);
502 }
503 }
504
505
506
507
508 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
509 */
510 static void emit_dst_noalias( struct brw_vs_compile *c,
511 struct brw_reg dst,
512 struct brw_reg arg0,
513 struct brw_reg arg1)
514 {
515 struct brw_compile *p = &c->func;
516
517 /* There must be a better way to do this:
518 */
519 if (dst.dw1.bits.writemask & WRITEMASK_X)
520 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
521 if (dst.dw1.bits.writemask & WRITEMASK_Y)
522 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
523 if (dst.dw1.bits.writemask & WRITEMASK_Z)
524 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
525 if (dst.dw1.bits.writemask & WRITEMASK_W)
526 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
527 }
528
529 static void emit_xpd( struct brw_compile *p,
530 struct brw_reg dst,
531 struct brw_reg t,
532 struct brw_reg u)
533 {
534 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
535 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
536 }
537
538
539
540 static void emit_lit_noalias( struct brw_vs_compile *c,
541 struct brw_reg dst,
542 struct brw_reg arg0 )
543 {
544 struct brw_compile *p = &c->func;
545 struct brw_instruction *if_insn;
546 struct brw_reg tmp = dst;
547 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
548
549 if (need_tmp)
550 tmp = get_tmp(c);
551
552 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
553 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
554
555 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
556 * to get all channels active inside the IF. In the clipping code
557 * we run with NoMask, so it's not an option and we can use
558 * BRW_EXECUTE_1 for all comparisions.
559 */
560 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
561 if_insn = brw_IF(p, BRW_EXECUTE_8);
562 {
563 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
564
565 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
566 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
567 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
568
569 emit_math2(c,
570 BRW_MATH_FUNCTION_POW,
571 brw_writemask(dst, WRITEMASK_Z),
572 brw_swizzle1(tmp, 2),
573 brw_swizzle1(arg0, 3),
574 BRW_MATH_PRECISION_PARTIAL);
575 }
576
577 brw_ENDIF(p, if_insn);
578 }
579
580
581
582
583
584 /* TODO: relative addressing!
585 */
586 static struct brw_reg get_reg( struct brw_vs_compile *c,
587 GLuint file,
588 GLuint index )
589 {
590
591 switch (file) {
592 case PROGRAM_TEMPORARY:
593 case PROGRAM_INPUT:
594 case PROGRAM_OUTPUT:
595 case PROGRAM_STATE_VAR:
596 assert(c->regs[file][index].nr != 0);
597 return c->regs[file][index];
598 case PROGRAM_ADDRESS:
599 assert(index == 0);
600 return c->regs[file][index];
601
602 case PROGRAM_UNDEFINED: /* undef values */
603 return brw_null_reg();
604
605 case PROGRAM_LOCAL_PARAM:
606 case PROGRAM_ENV_PARAM:
607 case PROGRAM_WRITE_ONLY:
608 default:
609 assert(0);
610 return brw_null_reg();
611 }
612 }
613
614
615
616 static struct brw_reg deref( struct brw_vs_compile *c,
617 struct brw_reg arg,
618 GLint offset)
619 {
620 struct brw_compile *p = &c->func;
621 struct brw_reg tmp = vec4(get_tmp(c));
622 struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
623 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
624 struct brw_reg indirect = brw_vec4_indirect(0,0);
625
626 {
627 brw_push_insn_state(p);
628 brw_set_access_mode(p, BRW_ALIGN_1);
629
630 /* This is pretty clunky - load the address register twice and
631 * fetch each 4-dword value in turn. There must be a way to do
632 * this in a single pass, but I couldn't get it to work.
633 */
634 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
635 brw_MOV(p, tmp, indirect);
636
637 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
638 brw_MOV(p, suboffset(tmp, 4), indirect);
639
640 brw_pop_insn_state(p);
641 }
642
643 return vec8(tmp);
644 }
645
646
647 static void emit_arl( struct brw_vs_compile *c,
648 struct brw_reg dst,
649 struct brw_reg arg0 )
650 {
651 struct brw_compile *p = &c->func;
652 struct brw_reg tmp = dst;
653 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
654
655 if (need_tmp)
656 tmp = get_tmp(c);
657
658 brw_RNDD(p, tmp, arg0);
659 brw_MUL(p, dst, tmp, brw_imm_d(16));
660
661 if (need_tmp)
662 release_tmp(c, tmp);
663 }
664
665
666 /* Will return mangled results for SWZ op. The emit_swz() function
667 * ignores this result and recalculates taking extended swizzles into
668 * account.
669 */
670 static struct brw_reg get_arg( struct brw_vs_compile *c,
671 struct prog_src_register src )
672 {
673 struct brw_reg reg;
674
675 if (src.File == PROGRAM_UNDEFINED)
676 return brw_null_reg();
677
678 if (src.RelAddr)
679 reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
680 else
681 reg = get_reg(c, src.File, src.Index);
682
683 /* Convert 3-bit swizzle to 2-bit.
684 */
685 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src.Swizzle, 0),
686 GET_SWZ(src.Swizzle, 1),
687 GET_SWZ(src.Swizzle, 2),
688 GET_SWZ(src.Swizzle, 3));
689
690 /* Note this is ok for non-swizzle instructions:
691 */
692 reg.negate = src.NegateBase ? 1 : 0;
693
694 return reg;
695 }
696
697
698 static struct brw_reg get_dst( struct brw_vs_compile *c,
699 struct prog_dst_register dst )
700 {
701 struct brw_reg reg = get_reg(c, dst.File, dst.Index);
702
703 reg.dw1.bits.writemask = dst.WriteMask;
704
705 return reg;
706 }
707
708
709
710
711 static void emit_swz( struct brw_vs_compile *c,
712 struct brw_reg dst,
713 struct prog_src_register src )
714 {
715 struct brw_compile *p = &c->func;
716 GLuint zeros_mask = 0;
717 GLuint ones_mask = 0;
718 GLuint src_mask = 0;
719 GLubyte src_swz[4];
720 GLboolean need_tmp = (src.NegateBase &&
721 dst.file != BRW_GENERAL_REGISTER_FILE);
722 struct brw_reg tmp = dst;
723 GLuint i;
724
725 if (need_tmp)
726 tmp = get_tmp(c);
727
728 for (i = 0; i < 4; i++) {
729 if (dst.dw1.bits.writemask & (1<<i)) {
730 GLubyte s = GET_SWZ(src.Swizzle, i);
731 switch (s) {
732 case SWIZZLE_X:
733 case SWIZZLE_Y:
734 case SWIZZLE_Z:
735 case SWIZZLE_W:
736 src_mask |= 1<<i;
737 src_swz[i] = s;
738 break;
739 case SWIZZLE_ZERO:
740 zeros_mask |= 1<<i;
741 break;
742 case SWIZZLE_ONE:
743 ones_mask |= 1<<i;
744 break;
745 }
746 }
747 }
748
749 /* Do src first, in case dst aliases src:
750 */
751 if (src_mask) {
752 struct brw_reg arg0;
753
754 if (src.RelAddr)
755 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
756 else
757 arg0 = get_reg(c, src.File, src.Index);
758
759 arg0 = brw_swizzle(arg0,
760 src_swz[0], src_swz[1],
761 src_swz[2], src_swz[3]);
762
763 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
764 }
765
766 if (zeros_mask)
767 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
768
769 if (ones_mask)
770 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
771
772 if (src.NegateBase)
773 brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
774
775 if (need_tmp) {
776 brw_MOV(p, dst, tmp);
777 release_tmp(c, tmp);
778 }
779 }
780
781
782
783 /* Post-vertex-program processing. Send the results to the URB.
784 */
785 static void emit_vertex_write( struct brw_vs_compile *c)
786 {
787 struct brw_compile *p = &c->func;
788 struct brw_reg m0 = brw_message_reg(0);
789 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
790 struct brw_reg ndc;
791
792 if (c->key.copy_edgeflag) {
793 brw_MOV(p,
794 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
795 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
796 }
797
798
799 /* Build ndc coords? TODO: Shortcircuit when w is known to be one.
800 */
801 if (!c->key.know_w_is_one) {
802 ndc = get_tmp(c);
803 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
804 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
805 }
806 else {
807 ndc = pos;
808 }
809
810 /* This includes the workaround for -ve rhw, so is no longer an
811 * optional step:
812 */
813 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
814 c->key.nr_userclip ||
815 !c->key.know_w_is_one)
816 {
817 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
818 GLuint i;
819
820 brw_MOV(p, header1, brw_imm_ud(0));
821
822 brw_set_access_mode(p, BRW_ALIGN_16);
823
824 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
825 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
826 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
827 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
828 }
829
830
831 for (i = 0; i < c->key.nr_userclip; i++) {
832 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
833 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
834 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
835 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
836 }
837
838
839 /* i965 clipping workaround:
840 * 1) Test for -ve rhw
841 * 2) If set,
842 * set ndc = (0,0,0,0)
843 * set ucp[6] = 1
844 *
845 * Later, clipping will detect ucp[6] and ensure the primitive is
846 * clipped against all fixed planes.
847 */
848 if (!c->key.know_w_is_one) {
849 brw_CMP(p,
850 vec8(brw_null_reg()),
851 BRW_CONDITIONAL_L,
852 brw_swizzle1(ndc, 3),
853 brw_imm_f(0));
854
855 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
856 brw_MOV(p, ndc, brw_imm_f(0));
857 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
858 }
859
860 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
861 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
862 brw_set_access_mode(p, BRW_ALIGN_16);
863
864 release_tmp(c, header1);
865 }
866 else {
867 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
868 }
869
870
871 /* Emit the (interleaved) headers for the two vertices - an 8-reg
872 * of zeros followed by two sets of NDC coordinates:
873 */
874 brw_set_access_mode(p, BRW_ALIGN_1);
875 brw_MOV(p, offset(m0, 2), ndc);
876 brw_MOV(p, offset(m0, 3), pos);
877
878
879 brw_urb_WRITE(p,
880 brw_null_reg(), /* dest */
881 0, /* starting mrf reg nr */
882 c->r0, /* src */
883 0, /* allocate */
884 1, /* used */
885 c->nr_outputs + 3, /* msg len */
886 0, /* response len */
887 1, /* eot */
888 1, /* writes complete */
889 0, /* urb destination offset */
890 BRW_URB_SWIZZLE_INTERLEAVE);
891
892 }
893
894
895
896
897 /* Emit the fragment program instructions here.
898 */
899 void brw_vs_emit( struct brw_vs_compile *c )
900 {
901 struct brw_compile *p = &c->func;
902 GLuint nr_insns = c->vp->program.Base.NumInstructions;
903 GLuint insn;
904
905
906 if (INTEL_DEBUG & DEBUG_VS) {
907 _mesa_printf("\n\n\nvs-emit:\n");
908 _mesa_print_program(&c->vp->program.Base);
909 _mesa_printf("\n");
910 }
911
912 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
913 brw_set_access_mode(p, BRW_ALIGN_16);
914
915 /* Static register allocation
916 */
917 brw_vs_alloc_regs(c);
918
919 for (insn = 0; insn < nr_insns; insn++) {
920
921 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
922 struct brw_reg args[3], dst;
923 GLuint i;
924
925 /* Get argument regs. SWZ is special and does this itself.
926 */
927 if (inst->Opcode != OPCODE_SWZ)
928 for (i = 0; i < 3; i++)
929 args[i] = get_arg(c, inst->SrcReg[i]);
930
931 /* Get dest regs. Note that it is possible for a reg to be both
932 * dst and arg, given the static allocation of registers. So
933 * care needs to be taken emitting multi-operation instructions.
934 */
935 dst = get_dst(c, inst->DstReg);
936
937
938 switch (inst->Opcode) {
939 case OPCODE_ABS:
940 brw_MOV(p, dst, brw_abs(args[0]));
941 break;
942 case OPCODE_ADD:
943 brw_ADD(p, dst, args[0], args[1]);
944 break;
945 case OPCODE_DP3:
946 brw_DP3(p, dst, args[0], args[1]);
947 break;
948 case OPCODE_DP4:
949 brw_DP4(p, dst, args[0], args[1]);
950 break;
951 case OPCODE_DPH:
952 brw_DPH(p, dst, args[0], args[1]);
953 break;
954 case OPCODE_DST:
955 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
956 break;
957 case OPCODE_EXP:
958 unalias1(c, dst, args[0], emit_exp_noalias);
959 break;
960 case OPCODE_EX2:
961 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
962 break;
963 case OPCODE_ARL:
964 emit_arl(c, dst, args[0]);
965 break;
966 case OPCODE_FLR:
967 brw_RNDD(p, dst, args[0]);
968 break;
969 case OPCODE_FRC:
970 brw_FRC(p, dst, args[0]);
971 break;
972 case OPCODE_LOG:
973 unalias1(c, dst, args[0], emit_log_noalias);
974 break;
975 case OPCODE_LG2:
976 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
977 break;
978 case OPCODE_LIT:
979 unalias1(c, dst, args[0], emit_lit_noalias);
980 break;
981 case OPCODE_MAD:
982 brw_MOV(p, brw_acc_reg(), args[2]);
983 brw_MAC(p, dst, args[0], args[1]);
984 break;
985 case OPCODE_MAX:
986 emit_max(p, dst, args[0], args[1]);
987 break;
988 case OPCODE_MIN:
989 emit_min(p, dst, args[0], args[1]);
990 break;
991 case OPCODE_MOV:
992 brw_MOV(p, dst, args[0]);
993 break;
994 case OPCODE_MUL:
995 brw_MUL(p, dst, args[0], args[1]);
996 break;
997 case OPCODE_POW:
998 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
999 break;
1000 case OPCODE_RCP:
1001 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1002 break;
1003 case OPCODE_RSQ:
1004 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1005 break;
1006 case OPCODE_SGE:
1007 emit_sge(p, dst, args[0], args[1]);
1008 break;
1009 case OPCODE_SLT:
1010 emit_slt(p, dst, args[0], args[1]);
1011 break;
1012 case OPCODE_SUB:
1013 brw_ADD(p, dst, args[0], negate(args[1]));
1014 break;
1015 case OPCODE_SWZ:
1016 /* The args[0] value can't be used here as it won't have
1017 * correctly encoded the full swizzle:
1018 */
1019 emit_swz(c, dst, inst->SrcReg[0] );
1020 break;
1021 case OPCODE_XPD:
1022 emit_xpd(p, dst, args[0], args[1]);
1023 break;
1024 case OPCODE_END:
1025 case OPCODE_PRINT:
1026 break;
1027 default:
1028 break;
1029 }
1030
1031 release_tmps(c);
1032 }
1033
1034 emit_vertex_write(c);
1035
1036 }
1037
1038
1039
1040
1041