Merge remote branch 'origin/master' into nv50-compiler
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return the SrcReg index of the channels that can be immediate float operands
65 * instead of usage of PROGRAM_CONSTANT values through push/pull.
66 */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_XPD] = 2,
87 };
88
89 /* These opcodes get broken down in a way that allow two
90 * args to be immediates.
91 */
92 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
93 if (arg == 1 || arg == 2)
94 return GL_TRUE;
95 }
96
97 if (opcode > ARRAY_SIZE(opcode_array))
98 return GL_FALSE;
99
100 return arg == opcode_array[opcode] - 1;
101 }
102
103 /**
104 * Computes the screen-space x,y position of the pixels.
105 *
106 * This will be used by emit_delta_xy() or emit_wpos_xy() for
107 * interpolation of attributes..
108 *
109 * Payload R0:
110 *
111 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
112 * corresponding to each of the 16 execution channels.
113 * R0.1..8 -- ?
114 * R1.0 -- triangle vertex 0.X
115 * R1.1 -- triangle vertex 0.Y
116 * R1.2 -- tile 0 x,y coords (2 packed uwords)
117 * R1.3 -- tile 1 x,y coords (2 packed uwords)
118 * R1.4 -- tile 2 x,y coords (2 packed uwords)
119 * R1.5 -- tile 3 x,y coords (2 packed uwords)
120 * R1.6 -- ?
121 * R1.7 -- ?
122 * R1.8 -- ?
123 */
124 void emit_pixel_xy(struct brw_wm_compile *c,
125 const struct brw_reg *dst,
126 GLuint mask)
127 {
128 struct brw_compile *p = &c->func;
129 struct brw_reg r1 = brw_vec1_grf(1, 0);
130 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
131 struct brw_reg dst0_uw, dst1_uw;
132
133 brw_push_insn_state(p);
134 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
135
136 if (c->dispatch_width == 16) {
137 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
138 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
139 } else {
140 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
141 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
142 }
143
144 /* Calculate pixel centers by adding 1 or 0 to each of the
145 * micro-tile coordinates passed in r1.
146 */
147 if (mask & WRITEMASK_X) {
148 brw_ADD(p,
149 dst0_uw,
150 stride(suboffset(r1_uw, 4), 2, 4, 0),
151 brw_imm_v(0x10101010));
152 }
153
154 if (mask & WRITEMASK_Y) {
155 brw_ADD(p,
156 dst1_uw,
157 stride(suboffset(r1_uw,5), 2, 4, 0),
158 brw_imm_v(0x11001100));
159 }
160 brw_pop_insn_state(p);
161 }
162
163 /**
164 * Computes the screen-space x,y distance of the pixels from the start
165 * vertex.
166 *
167 * This will be used in linterp or pinterp with the start vertex value
168 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
169 * to produce interpolated attribute values.
170 */
171 void emit_delta_xy(struct brw_compile *p,
172 const struct brw_reg *dst,
173 GLuint mask,
174 const struct brw_reg *arg0)
175 {
176 struct brw_reg r1 = brw_vec1_grf(1, 0);
177
178 if (mask == 0)
179 return;
180
181 assert(mask == WRITEMASK_XY);
182
183 /* Calc delta X,Y by subtracting origin in r1 from the pixel
184 * centers produced by emit_pixel_xy().
185 */
186 brw_ADD(p,
187 dst[0],
188 retype(arg0[0], BRW_REGISTER_TYPE_UW),
189 negate(r1));
190 brw_ADD(p,
191 dst[1],
192 retype(arg0[1], BRW_REGISTER_TYPE_UW),
193 negate(suboffset(r1,1)));
194 }
195
196 /**
197 * Computes the pixel offset from the window origin for gl_FragCoord().
198 */
199 void emit_wpos_xy(struct brw_wm_compile *c,
200 const struct brw_reg *dst,
201 GLuint mask,
202 const struct brw_reg *arg0)
203 {
204 struct brw_compile *p = &c->func;
205
206 if (mask & WRITEMASK_X) {
207 if (c->fp->program.PixelCenterInteger) {
208 /* X' = X */
209 brw_MOV(p,
210 dst[0],
211 retype(arg0[0], BRW_REGISTER_TYPE_W));
212 } else {
213 /* X' = X + 0.5 */
214 brw_ADD(p,
215 dst[0],
216 retype(arg0[0], BRW_REGISTER_TYPE_W),
217 brw_imm_f(0.5));
218 }
219 }
220
221 if (mask & WRITEMASK_Y) {
222 if (c->fp->program.OriginUpperLeft) {
223 if (c->fp->program.PixelCenterInteger) {
224 /* Y' = Y */
225 brw_MOV(p,
226 dst[1],
227 retype(arg0[1], BRW_REGISTER_TYPE_W));
228 } else {
229 /* Y' = Y + 0.5 */
230 brw_ADD(p,
231 dst[1],
232 retype(arg0[1], BRW_REGISTER_TYPE_W),
233 brw_imm_f(0.5));
234 }
235 } else {
236 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
237
238 /* Y' = (height - 1) - Y + center */
239 brw_ADD(p,
240 dst[1],
241 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
242 brw_imm_f(c->key.drawable_height - 1 + center_offset));
243 }
244 }
245 }
246
247
248 void emit_pixel_w(struct brw_wm_compile *c,
249 const struct brw_reg *dst,
250 GLuint mask,
251 const struct brw_reg *arg0,
252 const struct brw_reg *deltas)
253 {
254 struct brw_compile *p = &c->func;
255 struct intel_context *intel = &p->brw->intel;
256
257 /* Don't need this if all you are doing is interpolating color, for
258 * instance.
259 */
260 if (mask & WRITEMASK_W) {
261 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
262
263 /* Calc 1/w - just linterp wpos[3] optimized by putting the
264 * result straight into a message reg.
265 */
266 if (can_do_pln(intel, deltas)) {
267 brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
268 } else {
269 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
270 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
271 }
272
273 /* Calc w */
274 if (c->dispatch_width == 16) {
275 brw_math_16(p, dst[3],
276 BRW_MATH_FUNCTION_INV,
277 BRW_MATH_SATURATE_NONE,
278 2, brw_null_reg(),
279 BRW_MATH_PRECISION_FULL);
280 } else {
281 brw_math(p, dst[3],
282 BRW_MATH_FUNCTION_INV,
283 BRW_MATH_SATURATE_NONE,
284 2, brw_null_reg(),
285 BRW_MATH_DATA_VECTOR,
286 BRW_MATH_PRECISION_FULL);
287 }
288 }
289 }
290
291
292 void emit_linterp(struct brw_compile *p,
293 const struct brw_reg *dst,
294 GLuint mask,
295 const struct brw_reg *arg0,
296 const struct brw_reg *deltas)
297 {
298 struct intel_context *intel = &p->brw->intel;
299 struct brw_reg interp[4];
300 GLuint nr = arg0[0].nr;
301 GLuint i;
302
303 interp[0] = brw_vec1_grf(nr, 0);
304 interp[1] = brw_vec1_grf(nr, 4);
305 interp[2] = brw_vec1_grf(nr+1, 0);
306 interp[3] = brw_vec1_grf(nr+1, 4);
307
308 for (i = 0; i < 4; i++) {
309 if (mask & (1<<i)) {
310 if (can_do_pln(intel, deltas)) {
311 brw_PLN(p, dst[i], interp[i], deltas[0]);
312 } else {
313 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
314 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
315 }
316 }
317 }
318 }
319
320
321 void emit_pinterp(struct brw_compile *p,
322 const struct brw_reg *dst,
323 GLuint mask,
324 const struct brw_reg *arg0,
325 const struct brw_reg *deltas,
326 const struct brw_reg *w)
327 {
328 struct intel_context *intel = &p->brw->intel;
329 struct brw_reg interp[4];
330 GLuint nr = arg0[0].nr;
331 GLuint i;
332
333 interp[0] = brw_vec1_grf(nr, 0);
334 interp[1] = brw_vec1_grf(nr, 4);
335 interp[2] = brw_vec1_grf(nr+1, 0);
336 interp[3] = brw_vec1_grf(nr+1, 4);
337
338 for (i = 0; i < 4; i++) {
339 if (mask & (1<<i)) {
340 if (can_do_pln(intel, deltas)) {
341 brw_PLN(p, dst[i], interp[i], deltas[0]);
342 } else {
343 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
344 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
345 }
346 }
347 }
348 for (i = 0; i < 4; i++) {
349 if (mask & (1<<i)) {
350 brw_MUL(p, dst[i], dst[i], w[3]);
351 }
352 }
353 }
354
355
356 void emit_cinterp(struct brw_compile *p,
357 const struct brw_reg *dst,
358 GLuint mask,
359 const struct brw_reg *arg0)
360 {
361 struct brw_reg interp[4];
362 GLuint nr = arg0[0].nr;
363 GLuint i;
364
365 interp[0] = brw_vec1_grf(nr, 0);
366 interp[1] = brw_vec1_grf(nr, 4);
367 interp[2] = brw_vec1_grf(nr+1, 0);
368 interp[3] = brw_vec1_grf(nr+1, 4);
369
370 for (i = 0; i < 4; i++) {
371 if (mask & (1<<i)) {
372 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
373 }
374 }
375 }
376
377 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
378 void emit_frontfacing(struct brw_compile *p,
379 const struct brw_reg *dst,
380 GLuint mask)
381 {
382 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
383 GLuint i;
384
385 if (!(mask & WRITEMASK_XYZW))
386 return;
387
388 for (i = 0; i < 4; i++) {
389 if (mask & (1<<i)) {
390 brw_MOV(p, dst[i], brw_imm_f(0.0));
391 }
392 }
393
394 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
395 * us front face
396 */
397 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
398 for (i = 0; i < 4; i++) {
399 if (mask & (1<<i)) {
400 brw_MOV(p, dst[i], brw_imm_f(1.0));
401 }
402 }
403 brw_set_predicate_control_flag_value(p, 0xff);
404 }
405
406 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
407 * looking like:
408 *
409 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
410 *
411 * and we're trying to produce:
412 *
413 * DDX DDY
414 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
415 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
416 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
417 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
418 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
419 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
420 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
421 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
422 *
423 * and add another set of two more subspans if in 16-pixel dispatch mode.
424 *
425 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
426 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
427 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
428 * between each other. We could probably do it like ddx and swizzle the right
429 * order later, but bail for now and just produce
430 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
431 */
432 void emit_ddxy(struct brw_compile *p,
433 const struct brw_reg *dst,
434 GLuint mask,
435 GLboolean is_ddx,
436 const struct brw_reg *arg0)
437 {
438 int i;
439 struct brw_reg src0, src1;
440
441 if (mask & SATURATE)
442 brw_set_saturate(p, 1);
443 for (i = 0; i < 4; i++ ) {
444 if (mask & (1<<i)) {
445 if (is_ddx) {
446 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
447 BRW_REGISTER_TYPE_F,
448 BRW_VERTICAL_STRIDE_2,
449 BRW_WIDTH_2,
450 BRW_HORIZONTAL_STRIDE_0,
451 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
452 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
453 BRW_REGISTER_TYPE_F,
454 BRW_VERTICAL_STRIDE_2,
455 BRW_WIDTH_2,
456 BRW_HORIZONTAL_STRIDE_0,
457 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
458 } else {
459 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
460 BRW_REGISTER_TYPE_F,
461 BRW_VERTICAL_STRIDE_4,
462 BRW_WIDTH_4,
463 BRW_HORIZONTAL_STRIDE_0,
464 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
465 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
466 BRW_REGISTER_TYPE_F,
467 BRW_VERTICAL_STRIDE_4,
468 BRW_WIDTH_4,
469 BRW_HORIZONTAL_STRIDE_0,
470 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
471 }
472 brw_ADD(p, dst[i], src0, negate(src1));
473 }
474 }
475 if (mask & SATURATE)
476 brw_set_saturate(p, 0);
477 }
478
479 void emit_alu1(struct brw_compile *p,
480 struct brw_instruction *(*func)(struct brw_compile *,
481 struct brw_reg,
482 struct brw_reg),
483 const struct brw_reg *dst,
484 GLuint mask,
485 const struct brw_reg *arg0)
486 {
487 GLuint i;
488
489 if (mask & SATURATE)
490 brw_set_saturate(p, 1);
491
492 for (i = 0; i < 4; i++) {
493 if (mask & (1<<i)) {
494 func(p, dst[i], arg0[i]);
495 }
496 }
497
498 if (mask & SATURATE)
499 brw_set_saturate(p, 0);
500 }
501
502
503 void emit_alu2(struct brw_compile *p,
504 struct brw_instruction *(*func)(struct brw_compile *,
505 struct brw_reg,
506 struct brw_reg,
507 struct brw_reg),
508 const struct brw_reg *dst,
509 GLuint mask,
510 const struct brw_reg *arg0,
511 const struct brw_reg *arg1)
512 {
513 GLuint i;
514
515 if (mask & SATURATE)
516 brw_set_saturate(p, 1);
517
518 for (i = 0; i < 4; i++) {
519 if (mask & (1<<i)) {
520 func(p, dst[i], arg0[i], arg1[i]);
521 }
522 }
523
524 if (mask & SATURATE)
525 brw_set_saturate(p, 0);
526 }
527
528
529 void emit_mad(struct brw_compile *p,
530 const struct brw_reg *dst,
531 GLuint mask,
532 const struct brw_reg *arg0,
533 const struct brw_reg *arg1,
534 const struct brw_reg *arg2)
535 {
536 GLuint i;
537
538 for (i = 0; i < 4; i++) {
539 if (mask & (1<<i)) {
540 brw_MUL(p, dst[i], arg0[i], arg1[i]);
541
542 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
543 brw_ADD(p, dst[i], dst[i], arg2[i]);
544 brw_set_saturate(p, 0);
545 }
546 }
547 }
548
549 void emit_lrp(struct brw_compile *p,
550 const struct brw_reg *dst,
551 GLuint mask,
552 const struct brw_reg *arg0,
553 const struct brw_reg *arg1,
554 const struct brw_reg *arg2)
555 {
556 GLuint i;
557
558 /* Uses dst as a temporary:
559 */
560 for (i = 0; i < 4; i++) {
561 if (mask & (1<<i)) {
562 /* Can I use the LINE instruction for this?
563 */
564 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
565 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
566
567 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
568 brw_MAC(p, dst[i], arg0[i], arg1[i]);
569 brw_set_saturate(p, 0);
570 }
571 }
572 }
573
574 void emit_sop(struct brw_compile *p,
575 const struct brw_reg *dst,
576 GLuint mask,
577 GLuint cond,
578 const struct brw_reg *arg0,
579 const struct brw_reg *arg1)
580 {
581 GLuint i;
582
583 for (i = 0; i < 4; i++) {
584 if (mask & (1<<i)) {
585 brw_push_insn_state(p);
586 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
587 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
588 brw_MOV(p, dst[i], brw_imm_f(0));
589 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
590 brw_MOV(p, dst[i], brw_imm_f(1.0));
591 brw_pop_insn_state(p);
592 }
593 }
594 }
595
596 static void emit_slt( struct brw_compile *p,
597 const struct brw_reg *dst,
598 GLuint mask,
599 const struct brw_reg *arg0,
600 const struct brw_reg *arg1 )
601 {
602 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
603 }
604
605 static void emit_sle( struct brw_compile *p,
606 const struct brw_reg *dst,
607 GLuint mask,
608 const struct brw_reg *arg0,
609 const struct brw_reg *arg1 )
610 {
611 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
612 }
613
614 static void emit_sgt( struct brw_compile *p,
615 const struct brw_reg *dst,
616 GLuint mask,
617 const struct brw_reg *arg0,
618 const struct brw_reg *arg1 )
619 {
620 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
621 }
622
623 static void emit_sge( struct brw_compile *p,
624 const struct brw_reg *dst,
625 GLuint mask,
626 const struct brw_reg *arg0,
627 const struct brw_reg *arg1 )
628 {
629 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
630 }
631
632 static void emit_seq( struct brw_compile *p,
633 const struct brw_reg *dst,
634 GLuint mask,
635 const struct brw_reg *arg0,
636 const struct brw_reg *arg1 )
637 {
638 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
639 }
640
641 static void emit_sne( struct brw_compile *p,
642 const struct brw_reg *dst,
643 GLuint mask,
644 const struct brw_reg *arg0,
645 const struct brw_reg *arg1 )
646 {
647 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
648 }
649
650 void emit_cmp(struct brw_compile *p,
651 const struct brw_reg *dst,
652 GLuint mask,
653 const struct brw_reg *arg0,
654 const struct brw_reg *arg1,
655 const struct brw_reg *arg2)
656 {
657 GLuint i;
658
659 for (i = 0; i < 4; i++) {
660 if (mask & (1<<i)) {
661 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
662
663 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
664 brw_SEL(p, dst[i], arg1[i], arg2[i]);
665 brw_set_saturate(p, 0);
666 brw_set_predicate_control_flag_value(p, 0xff);
667 }
668 }
669 }
670
671 void emit_sign(struct brw_compile *p,
672 const struct brw_reg *dst,
673 GLuint mask,
674 const struct brw_reg *arg0)
675 {
676 GLuint i;
677
678 for (i = 0; i < 4; i++) {
679 if (mask & (1<<i)) {
680 brw_MOV(p, dst[i], brw_imm_f(0.0));
681
682 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
683 brw_MOV(p, dst[i], brw_imm_f(-1.0));
684 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
685
686 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
687 brw_MOV(p, dst[i], brw_imm_f(1.0));
688 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
689 }
690 }
691 }
692
693 void emit_max(struct brw_compile *p,
694 const struct brw_reg *dst,
695 GLuint mask,
696 const struct brw_reg *arg0,
697 const struct brw_reg *arg1)
698 {
699 GLuint i;
700
701 for (i = 0; i < 4; i++) {
702 if (mask & (1<<i)) {
703 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
704
705 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
706 brw_SEL(p, dst[i], arg0[i], arg1[i]);
707 brw_set_saturate(p, 0);
708 brw_set_predicate_control_flag_value(p, 0xff);
709 }
710 }
711 }
712
713 void emit_min(struct brw_compile *p,
714 const struct brw_reg *dst,
715 GLuint mask,
716 const struct brw_reg *arg0,
717 const struct brw_reg *arg1)
718 {
719 GLuint i;
720
721 for (i = 0; i < 4; i++) {
722 if (mask & (1<<i)) {
723 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
724
725 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
726 brw_SEL(p, dst[i], arg0[i], arg1[i]);
727 brw_set_saturate(p, 0);
728 brw_set_predicate_control_flag_value(p, 0xff);
729 }
730 }
731 }
732
733
734 void emit_dp2(struct brw_compile *p,
735 const struct brw_reg *dst,
736 GLuint mask,
737 const struct brw_reg *arg0,
738 const struct brw_reg *arg1)
739 {
740 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
741
742 if (!(mask & WRITEMASK_XYZW))
743 return; /* Do not emit dead code */
744
745 assert(is_power_of_two(mask & WRITEMASK_XYZW));
746
747 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
748
749 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
750 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
751 brw_set_saturate(p, 0);
752 }
753
754
755 void emit_dp3(struct brw_compile *p,
756 const struct brw_reg *dst,
757 GLuint mask,
758 const struct brw_reg *arg0,
759 const struct brw_reg *arg1)
760 {
761 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
762
763 if (!(mask & WRITEMASK_XYZW))
764 return; /* Do not emit dead code */
765
766 assert(is_power_of_two(mask & WRITEMASK_XYZW));
767
768 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
769 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
770
771 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
772 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
773 brw_set_saturate(p, 0);
774 }
775
776
777 void emit_dp4(struct brw_compile *p,
778 const struct brw_reg *dst,
779 GLuint mask,
780 const struct brw_reg *arg0,
781 const struct brw_reg *arg1)
782 {
783 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
784
785 if (!(mask & WRITEMASK_XYZW))
786 return; /* Do not emit dead code */
787
788 assert(is_power_of_two(mask & WRITEMASK_XYZW));
789
790 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
791 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
792 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
793
794 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
795 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
796 brw_set_saturate(p, 0);
797 }
798
799
800 void emit_dph(struct brw_compile *p,
801 const struct brw_reg *dst,
802 GLuint mask,
803 const struct brw_reg *arg0,
804 const struct brw_reg *arg1)
805 {
806 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
807
808 if (!(mask & WRITEMASK_XYZW))
809 return; /* Do not emit dead code */
810
811 assert(is_power_of_two(mask & WRITEMASK_XYZW));
812
813 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
814 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
815 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
816
817 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
818 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
819 brw_set_saturate(p, 0);
820 }
821
822
823 void emit_xpd(struct brw_compile *p,
824 const struct brw_reg *dst,
825 GLuint mask,
826 const struct brw_reg *arg0,
827 const struct brw_reg *arg1)
828 {
829 GLuint i;
830
831 assert((mask & WRITEMASK_W) != WRITEMASK_W);
832
833 for (i = 0 ; i < 3; i++) {
834 if (mask & (1<<i)) {
835 GLuint i2 = (i+2)%3;
836 GLuint i1 = (i+1)%3;
837
838 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
839
840 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
841 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
842 brw_set_saturate(p, 0);
843 }
844 }
845 }
846
847
848 void emit_math1(struct brw_wm_compile *c,
849 GLuint function,
850 const struct brw_reg *dst,
851 GLuint mask,
852 const struct brw_reg *arg0)
853 {
854 struct brw_compile *p = &c->func;
855 struct intel_context *intel = &p->brw->intel;
856 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
857 GLuint saturate = ((mask & SATURATE) ?
858 BRW_MATH_SATURATE_SATURATE :
859 BRW_MATH_SATURATE_NONE);
860 struct brw_reg src;
861
862 if (intel->gen >= 6 && arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
863 /* Gen6 math requires that source and dst horizontal stride be 1.
864 *
865 */
866 src = *dst;
867 brw_MOV(p, src, arg0[0]);
868 } else {
869 src = arg0[0];
870 }
871
872 if (!(mask & WRITEMASK_XYZW))
873 return; /* Do not emit dead code */
874
875 assert(is_power_of_two(mask & WRITEMASK_XYZW));
876
877 /* Send two messages to perform all 16 operations:
878 */
879 brw_push_insn_state(p);
880 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
881 brw_math(p,
882 dst[dst_chan],
883 function,
884 saturate,
885 2,
886 src,
887 BRW_MATH_DATA_VECTOR,
888 BRW_MATH_PRECISION_FULL);
889
890 if (c->dispatch_width == 16) {
891 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
892 brw_math(p,
893 offset(dst[dst_chan],1),
894 function,
895 saturate,
896 3,
897 sechalf(src),
898 BRW_MATH_DATA_VECTOR,
899 BRW_MATH_PRECISION_FULL);
900 }
901 brw_pop_insn_state(p);
902 }
903
904
905 void emit_math2(struct brw_wm_compile *c,
906 GLuint function,
907 const struct brw_reg *dst,
908 GLuint mask,
909 const struct brw_reg *arg0,
910 const struct brw_reg *arg1)
911 {
912 struct brw_compile *p = &c->func;
913 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
914 GLuint saturate = ((mask & SATURATE) ?
915 BRW_MATH_SATURATE_SATURATE :
916 BRW_MATH_SATURATE_NONE);
917
918 if (!(mask & WRITEMASK_XYZW))
919 return; /* Do not emit dead code */
920
921 assert(is_power_of_two(mask & WRITEMASK_XYZW));
922
923 brw_push_insn_state(p);
924
925 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
926 brw_MOV(p, brw_message_reg(3), arg1[0]);
927 if (c->dispatch_width == 16) {
928 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
929 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
930 }
931
932 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
933 brw_math(p,
934 dst[dst_chan],
935 function,
936 saturate,
937 2,
938 arg0[0],
939 BRW_MATH_DATA_VECTOR,
940 BRW_MATH_PRECISION_FULL);
941
942 /* Send two messages to perform all 16 operations:
943 */
944 if (c->dispatch_width == 16) {
945 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
946 brw_math(p,
947 offset(dst[dst_chan],1),
948 function,
949 saturate,
950 4,
951 sechalf(arg0[0]),
952 BRW_MATH_DATA_VECTOR,
953 BRW_MATH_PRECISION_FULL);
954 }
955 brw_pop_insn_state(p);
956 }
957
958
959 void emit_tex(struct brw_wm_compile *c,
960 struct brw_reg *dst,
961 GLuint dst_flags,
962 struct brw_reg *arg,
963 struct brw_reg depth_payload,
964 GLuint tex_idx,
965 GLuint sampler,
966 GLboolean shadow)
967 {
968 struct brw_compile *p = &c->func;
969 struct intel_context *intel = &p->brw->intel;
970 struct brw_reg dst_retyped;
971 GLuint cur_mrf = 2, response_length;
972 GLuint i, nr_texcoords;
973 GLuint emit;
974 GLuint msg_type;
975 GLuint mrf_per_channel;
976 GLuint simd_mode;
977
978 if (c->dispatch_width == 16) {
979 mrf_per_channel = 2;
980 response_length = 8;
981 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
982 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
983 } else {
984 mrf_per_channel = 1;
985 response_length = 4;
986 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
987 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
988 }
989
990 /* How many input regs are there?
991 */
992 switch (tex_idx) {
993 case TEXTURE_1D_INDEX:
994 emit = WRITEMASK_X;
995 nr_texcoords = 1;
996 break;
997 case TEXTURE_2D_INDEX:
998 case TEXTURE_RECT_INDEX:
999 emit = WRITEMASK_XY;
1000 nr_texcoords = 2;
1001 break;
1002 case TEXTURE_3D_INDEX:
1003 case TEXTURE_CUBE_INDEX:
1004 emit = WRITEMASK_XYZ;
1005 nr_texcoords = 3;
1006 break;
1007 default:
1008 /* unexpected target */
1009 abort();
1010 }
1011
1012 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1013 if (intel->gen < 5 && c->dispatch_width == 8)
1014 nr_texcoords = 3;
1015
1016 /* For shadow comparisons, we have to supply u,v,r. */
1017 if (shadow)
1018 nr_texcoords = 3;
1019
1020 /* Emit the texcoords. */
1021 for (i = 0; i < nr_texcoords; i++) {
1022 if (emit & (1<<i))
1023 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1024 else
1025 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1026 cur_mrf += mrf_per_channel;
1027 }
1028
1029 /* Fill in the shadow comparison reference value. */
1030 if (shadow) {
1031 if (intel->gen == 5) {
1032 /* Fill in the cube map array index value. */
1033 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1034 cur_mrf += mrf_per_channel;
1035 } else if (c->dispatch_width == 8) {
1036 /* Fill in the LOD bias value. */
1037 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1038 cur_mrf += mrf_per_channel;
1039 }
1040 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1041 cur_mrf += mrf_per_channel;
1042 }
1043
1044 if (intel->gen == 5) {
1045 if (shadow)
1046 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1047 else
1048 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1049 } else {
1050 /* Note that G45 and older determines shadow compare and dispatch width
1051 * from message length for most messages.
1052 */
1053 if (c->dispatch_width == 16 && shadow)
1054 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1055 else
1056 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1057 }
1058
1059 brw_SAMPLE(p,
1060 dst_retyped,
1061 1,
1062 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1063 SURF_INDEX_TEXTURE(sampler),
1064 sampler,
1065 dst_flags & WRITEMASK_XYZW,
1066 msg_type,
1067 response_length,
1068 cur_mrf - 1,
1069 0,
1070 1,
1071 simd_mode);
1072 }
1073
1074
1075 void emit_txb(struct brw_wm_compile *c,
1076 struct brw_reg *dst,
1077 GLuint dst_flags,
1078 struct brw_reg *arg,
1079 struct brw_reg depth_payload,
1080 GLuint tex_idx,
1081 GLuint sampler)
1082 {
1083 struct brw_compile *p = &c->func;
1084 struct intel_context *intel = &p->brw->intel;
1085 GLuint msgLength;
1086 GLuint msg_type;
1087 GLuint mrf_per_channel;
1088 GLuint response_length;
1089 struct brw_reg dst_retyped;
1090
1091 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1092 * samples, so we'll use the 16-wide instruction, leave the second halves
1093 * undefined, and trust the execution mask to keep the undefined pixels
1094 * from mattering.
1095 */
1096 if (c->dispatch_width == 16 || intel->gen < 5) {
1097 if (intel->gen == 5)
1098 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1099 else
1100 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1101 mrf_per_channel = 2;
1102 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1103 response_length = 8;
1104 } else {
1105 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1106 mrf_per_channel = 1;
1107 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1108 response_length = 4;
1109 }
1110
1111 /* Shadow ignored for txb. */
1112 switch (tex_idx) {
1113 case TEXTURE_1D_INDEX:
1114 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1115 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1116 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1117 break;
1118 case TEXTURE_2D_INDEX:
1119 case TEXTURE_RECT_INDEX:
1120 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1121 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1122 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1123 break;
1124 case TEXTURE_3D_INDEX:
1125 case TEXTURE_CUBE_INDEX:
1126 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1127 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1128 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1129 break;
1130 default:
1131 /* unexpected target */
1132 abort();
1133 }
1134
1135 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1136 msgLength = 2 + 4 * mrf_per_channel - 1;
1137
1138 brw_SAMPLE(p,
1139 dst_retyped,
1140 1,
1141 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1142 SURF_INDEX_TEXTURE(sampler),
1143 sampler,
1144 dst_flags & WRITEMASK_XYZW,
1145 msg_type,
1146 response_length,
1147 msgLength,
1148 0,
1149 1,
1150 BRW_SAMPLER_SIMD_MODE_SIMD16);
1151 }
1152
1153
1154 static void emit_lit(struct brw_wm_compile *c,
1155 const struct brw_reg *dst,
1156 GLuint mask,
1157 const struct brw_reg *arg0)
1158 {
1159 struct brw_compile *p = &c->func;
1160
1161 assert((mask & WRITEMASK_XW) == 0);
1162
1163 if (mask & WRITEMASK_Y) {
1164 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1165 brw_MOV(p, dst[1], arg0[0]);
1166 brw_set_saturate(p, 0);
1167 }
1168
1169 if (mask & WRITEMASK_Z) {
1170 emit_math2(c, BRW_MATH_FUNCTION_POW,
1171 &dst[2],
1172 WRITEMASK_X | (mask & SATURATE),
1173 &arg0[1],
1174 &arg0[3]);
1175 }
1176
1177 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1178 * some of the POW calculations above, but 16-wide iff statements
1179 * seem to lock c1 hardware, so this is a nasty workaround:
1180 */
1181 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1182 {
1183 if (mask & WRITEMASK_Y)
1184 brw_MOV(p, dst[1], brw_imm_f(0));
1185
1186 if (mask & WRITEMASK_Z)
1187 brw_MOV(p, dst[2], brw_imm_f(0));
1188 }
1189 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1190 }
1191
1192
1193 /* Kill pixel - set execution mask to zero for those pixels which
1194 * fail.
1195 */
1196 static void emit_kil( struct brw_wm_compile *c,
1197 struct brw_reg *arg0)
1198 {
1199 struct brw_compile *p = &c->func;
1200 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1201 GLuint i, j;
1202
1203 for (i = 0; i < 4; i++) {
1204 /* Check if we've already done the comparison for this reg
1205 * -- common when someone does KIL TEMP.wwww.
1206 */
1207 for (j = 0; j < i; j++) {
1208 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1209 break;
1210 }
1211 if (j != i)
1212 continue;
1213
1214 brw_push_insn_state(p);
1215 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1216 brw_set_predicate_control_flag_value(p, 0xff);
1217 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1218 brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1219 brw_pop_insn_state(p);
1220 }
1221 }
1222
1223 /* KIL_NV kills the pixels that are currently executing, not based on a test
1224 * of the arguments.
1225 */
1226 static void emit_kil_nv( struct brw_wm_compile *c )
1227 {
1228 struct brw_compile *p = &c->func;
1229 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1230
1231 brw_push_insn_state(p);
1232 brw_set_mask_control(p, BRW_MASK_DISABLE);
1233 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1234 brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1235 brw_pop_insn_state(p);
1236 }
1237
1238 static void fire_fb_write( struct brw_wm_compile *c,
1239 GLuint base_reg,
1240 GLuint nr,
1241 GLuint target,
1242 GLuint eot )
1243 {
1244 struct brw_compile *p = &c->func;
1245 struct intel_context *intel = &p->brw->intel;
1246 struct brw_reg dst;
1247
1248 if (c->dispatch_width == 16)
1249 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1250 else
1251 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1252
1253 /* Pass through control information:
1254 */
1255 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1256 if (intel->gen < 6) /* gen6, use headerless for fb write */
1257 {
1258 brw_push_insn_state(p);
1259 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1260 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1261 brw_MOV(p,
1262 brw_message_reg(base_reg + 1),
1263 brw_vec8_grf(1, 0));
1264 brw_pop_insn_state(p);
1265 }
1266
1267 /* Send framebuffer write message: */
1268 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1269 brw_fb_WRITE(p,
1270 c->dispatch_width,
1271 dst,
1272 base_reg,
1273 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1274 target,
1275 nr,
1276 0,
1277 eot);
1278 }
1279
1280
1281 static void emit_aa( struct brw_wm_compile *c,
1282 struct brw_reg *arg1,
1283 GLuint reg )
1284 {
1285 struct brw_compile *p = &c->func;
1286 GLuint comp = c->key.aa_dest_stencil_reg / 2;
1287 GLuint off = c->key.aa_dest_stencil_reg % 2;
1288 struct brw_reg aa = offset(arg1[comp], off);
1289
1290 brw_push_insn_state(p);
1291 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1292 brw_MOV(p, brw_message_reg(reg), aa);
1293 brw_pop_insn_state(p);
1294 }
1295
1296
1297 /* Post-fragment-program processing. Send the results to the
1298 * framebuffer.
1299 * \param arg0 the fragment color
1300 * \param arg1 the pass-through depth value
1301 * \param arg2 the shader-computed depth value
1302 */
1303 void emit_fb_write(struct brw_wm_compile *c,
1304 struct brw_reg *arg0,
1305 struct brw_reg *arg1,
1306 struct brw_reg *arg2,
1307 GLuint target,
1308 GLuint eot)
1309 {
1310 struct brw_compile *p = &c->func;
1311 struct brw_context *brw = p->brw;
1312 struct intel_context *intel = &brw->intel;
1313 GLuint nr = 2;
1314 GLuint channel;
1315 int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
1316
1317 /* Reserve a space for AA - may not be needed:
1318 */
1319 if (c->key.aa_dest_stencil_reg)
1320 nr += 1;
1321
1322 /* I don't really understand how this achieves the color interleave
1323 * (ie RGBARGBA) in the result: [Do the saturation here]
1324 */
1325 brw_push_insn_state(p);
1326
1327 if (intel->gen >= 6)
1328 base_reg = nr;
1329 else
1330 base_reg = 0;
1331
1332 for (channel = 0; channel < 4; channel++) {
1333 if (intel->gen >= 6) {
1334 /* gen6 SIMD16 single source DP write looks like:
1335 * m + 0: r0
1336 * m + 1: r1
1337 * m + 2: g0
1338 * m + 3: g1
1339 * m + 4: b0
1340 * m + 5: b1
1341 * m + 6: a0
1342 * m + 7: a1
1343 */
1344 if (c->dispatch_width == 16) {
1345 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1346 } else {
1347 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1348 }
1349 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1350 /* pre-gen6 SIMD16 single source DP write looks like:
1351 * m + 0: r0
1352 * m + 1: g0
1353 * m + 2: b0
1354 * m + 3: a0
1355 * m + 4: r1
1356 * m + 5: g1
1357 * m + 6: b1
1358 * m + 7: a1
1359 *
1360 * By setting the high bit of the MRF register number, we indicate
1361 * that we want COMPR4 mode - instead of doing the usual destination
1362 * + 1 for the second half we get destination + 4.
1363 */
1364 brw_MOV(p,
1365 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1366 arg0[channel]);
1367 } else {
1368 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1369 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1370 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1371 brw_MOV(p,
1372 brw_message_reg(nr + channel),
1373 arg0[channel]);
1374
1375 if (c->dispatch_width == 16) {
1376 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1377 brw_MOV(p,
1378 brw_message_reg(nr + channel + 4),
1379 sechalf(arg0[channel]));
1380 }
1381 }
1382 }
1383 /* skip over the regs populated above:
1384 */
1385 if (c->dispatch_width == 16)
1386 nr += 8;
1387 else
1388 nr += 4;
1389
1390 brw_pop_insn_state(p);
1391
1392 if (c->key.source_depth_to_render_target)
1393 {
1394 if (c->key.computes_depth)
1395 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1396 else
1397 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1398
1399 nr += 2;
1400 }
1401
1402 if (c->key.dest_depth_reg)
1403 {
1404 GLuint comp = c->key.dest_depth_reg / 2;
1405 GLuint off = c->key.dest_depth_reg % 2;
1406
1407 if (off != 0) {
1408 brw_push_insn_state(p);
1409 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1410
1411 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1412 /* 2nd half? */
1413 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1414 brw_pop_insn_state(p);
1415 }
1416 else {
1417 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1418 }
1419 nr += 2;
1420 }
1421
1422 if (intel->gen >= 6) {
1423 /* Subtract off the message header, since we send headerless. */
1424 nr -= 2;
1425 }
1426
1427 if (!c->key.runtime_check_aads_emit) {
1428 if (c->key.aa_dest_stencil_reg)
1429 emit_aa(c, arg1, 2);
1430
1431 fire_fb_write(c, base_reg, nr, target, eot);
1432 }
1433 else {
1434 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1435 struct brw_reg ip = brw_ip_reg();
1436 struct brw_instruction *jmp;
1437
1438 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1439 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1440 brw_AND(p,
1441 v1_null_ud,
1442 get_element_ud(brw_vec8_grf(1,0), 6),
1443 brw_imm_ud(1<<26));
1444
1445 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1446 {
1447 emit_aa(c, arg1, 2);
1448 fire_fb_write(c, 0, nr, target, eot);
1449 /* note - thread killed in subroutine */
1450 }
1451 brw_land_fwd_jump(p, jmp);
1452
1453 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1454 */
1455 fire_fb_write(c, 1, nr-1, target, eot);
1456 }
1457 }
1458
1459 /**
1460 * Move a GPR to scratch memory.
1461 */
1462 static void emit_spill( struct brw_wm_compile *c,
1463 struct brw_reg reg,
1464 GLuint slot )
1465 {
1466 struct brw_compile *p = &c->func;
1467
1468 /*
1469 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1470 */
1471 brw_MOV(p, brw_message_reg(2), reg);
1472
1473 /*
1474 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1475 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1476 */
1477 brw_dp_WRITE_16(p,
1478 retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1479 slot);
1480 }
1481
1482
1483 /**
1484 * Load a GPR from scratch memory.
1485 */
1486 static void emit_unspill( struct brw_wm_compile *c,
1487 struct brw_reg reg,
1488 GLuint slot )
1489 {
1490 struct brw_compile *p = &c->func;
1491
1492 /* Slot 0 is the undef value.
1493 */
1494 if (slot == 0) {
1495 brw_MOV(p, reg, brw_imm_f(0));
1496 return;
1497 }
1498
1499 /*
1500 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1501 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1502 */
1503
1504 brw_dp_READ_16(p,
1505 retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1506 slot);
1507 }
1508
1509
1510 /**
1511 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1512 * Args with unspill_reg != 0 will be loaded from scratch memory.
1513 */
1514 static void get_argument_regs( struct brw_wm_compile *c,
1515 struct brw_wm_ref *arg[],
1516 struct brw_reg *regs )
1517 {
1518 GLuint i;
1519
1520 for (i = 0; i < 4; i++) {
1521 if (arg[i]) {
1522 if (arg[i]->unspill_reg)
1523 emit_unspill(c,
1524 brw_vec8_grf(arg[i]->unspill_reg, 0),
1525 arg[i]->value->spill_slot);
1526
1527 regs[i] = arg[i]->hw_reg;
1528 }
1529 else {
1530 regs[i] = brw_null_reg();
1531 }
1532 }
1533 }
1534
1535
1536 /**
1537 * For values that have a spill_slot!=0, write those regs to scratch memory.
1538 */
1539 static void spill_values( struct brw_wm_compile *c,
1540 struct brw_wm_value *values,
1541 GLuint nr )
1542 {
1543 GLuint i;
1544
1545 for (i = 0; i < nr; i++)
1546 if (values[i].spill_slot)
1547 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1548 }
1549
1550
1551 /* Emit the fragment program instructions here.
1552 */
1553 void brw_wm_emit( struct brw_wm_compile *c )
1554 {
1555 struct brw_compile *p = &c->func;
1556 GLuint insn;
1557
1558 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1559
1560 /* Check if any of the payload regs need to be spilled:
1561 */
1562 spill_values(c, c->payload.depth, 4);
1563 spill_values(c, c->creg, c->nr_creg);
1564 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1565
1566
1567 for (insn = 0; insn < c->nr_insns; insn++) {
1568
1569 struct brw_wm_instruction *inst = &c->instruction[insn];
1570 struct brw_reg args[3][4], dst[4];
1571 GLuint i, dst_flags;
1572
1573 /* Get argument regs:
1574 */
1575 for (i = 0; i < 3; i++)
1576 get_argument_regs(c, inst->src[i], args[i]);
1577
1578 /* Get dest regs:
1579 */
1580 for (i = 0; i < 4; i++)
1581 if (inst->dst[i])
1582 dst[i] = inst->dst[i]->hw_reg;
1583 else
1584 dst[i] = brw_null_reg();
1585
1586 /* Flags
1587 */
1588 dst_flags = inst->writemask;
1589 if (inst->saturate)
1590 dst_flags |= SATURATE;
1591
1592 switch (inst->opcode) {
1593 /* Generated instructions for calculating triangle interpolants:
1594 */
1595 case WM_PIXELXY:
1596 emit_pixel_xy(c, dst, dst_flags);
1597 break;
1598
1599 case WM_DELTAXY:
1600 emit_delta_xy(p, dst, dst_flags, args[0]);
1601 break;
1602
1603 case WM_WPOSXY:
1604 emit_wpos_xy(c, dst, dst_flags, args[0]);
1605 break;
1606
1607 case WM_PIXELW:
1608 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1609 break;
1610
1611 case WM_LINTERP:
1612 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1613 break;
1614
1615 case WM_PINTERP:
1616 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1617 break;
1618
1619 case WM_CINTERP:
1620 emit_cinterp(p, dst, dst_flags, args[0]);
1621 break;
1622
1623 case WM_FB_WRITE:
1624 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1625 break;
1626
1627 case WM_FRONTFACING:
1628 emit_frontfacing(p, dst, dst_flags);
1629 break;
1630
1631 /* Straightforward arithmetic:
1632 */
1633 case OPCODE_ADD:
1634 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1635 break;
1636
1637 case OPCODE_FRC:
1638 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1639 break;
1640
1641 case OPCODE_FLR:
1642 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1643 break;
1644
1645 case OPCODE_DDX:
1646 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1647 break;
1648
1649 case OPCODE_DDY:
1650 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1651 break;
1652
1653 case OPCODE_DP2:
1654 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1655 break;
1656
1657 case OPCODE_DP3:
1658 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1659 break;
1660
1661 case OPCODE_DP4:
1662 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1663 break;
1664
1665 case OPCODE_DPH:
1666 emit_dph(p, dst, dst_flags, args[0], args[1]);
1667 break;
1668
1669 case OPCODE_TRUNC:
1670 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1671 break;
1672
1673 case OPCODE_LRP:
1674 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1675 break;
1676
1677 case OPCODE_MAD:
1678 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1679 break;
1680
1681 case OPCODE_MOV:
1682 case OPCODE_SWZ:
1683 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1684 break;
1685
1686 case OPCODE_MUL:
1687 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1688 break;
1689
1690 case OPCODE_XPD:
1691 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1692 break;
1693
1694 /* Higher math functions:
1695 */
1696 case OPCODE_RCP:
1697 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1698 break;
1699
1700 case OPCODE_RSQ:
1701 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1702 break;
1703
1704 case OPCODE_SIN:
1705 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1706 break;
1707
1708 case OPCODE_COS:
1709 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1710 break;
1711
1712 case OPCODE_EX2:
1713 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1714 break;
1715
1716 case OPCODE_LG2:
1717 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1718 break;
1719
1720 case OPCODE_SCS:
1721 /* There is an scs math function, but it would need some
1722 * fixup for 16-element execution.
1723 */
1724 if (dst_flags & WRITEMASK_X)
1725 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1726 if (dst_flags & WRITEMASK_Y)
1727 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1728 break;
1729
1730 case OPCODE_POW:
1731 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1732 break;
1733
1734 /* Comparisons:
1735 */
1736 case OPCODE_CMP:
1737 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1738 break;
1739
1740 case OPCODE_MAX:
1741 emit_max(p, dst, dst_flags, args[0], args[1]);
1742 break;
1743
1744 case OPCODE_MIN:
1745 emit_min(p, dst, dst_flags, args[0], args[1]);
1746 break;
1747
1748 case OPCODE_SLT:
1749 emit_slt(p, dst, dst_flags, args[0], args[1]);
1750 break;
1751
1752 case OPCODE_SLE:
1753 emit_sle(p, dst, dst_flags, args[0], args[1]);
1754 break;
1755 case OPCODE_SGT:
1756 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1757 break;
1758 case OPCODE_SGE:
1759 emit_sge(p, dst, dst_flags, args[0], args[1]);
1760 break;
1761 case OPCODE_SEQ:
1762 emit_seq(p, dst, dst_flags, args[0], args[1]);
1763 break;
1764 case OPCODE_SNE:
1765 emit_sne(p, dst, dst_flags, args[0], args[1]);
1766 break;
1767
1768 case OPCODE_SSG:
1769 emit_sign(p, dst, dst_flags, args[0]);
1770 break;
1771
1772 case OPCODE_LIT:
1773 emit_lit(c, dst, dst_flags, args[0]);
1774 break;
1775
1776 /* Texturing operations:
1777 */
1778 case OPCODE_TEX:
1779 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1780 inst->tex_idx, inst->tex_unit,
1781 inst->tex_shadow);
1782 break;
1783
1784 case OPCODE_TXB:
1785 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1786 inst->tex_idx, inst->tex_unit);
1787 break;
1788
1789 case OPCODE_KIL:
1790 emit_kil(c, args[0]);
1791 break;
1792
1793 case OPCODE_KIL_NV:
1794 emit_kil_nv(c);
1795 break;
1796
1797 default:
1798 printf("Unsupported opcode %i (%s) in fragment shader\n",
1799 inst->opcode, inst->opcode < MAX_OPCODE ?
1800 _mesa_opcode_string(inst->opcode) :
1801 "unknown");
1802 }
1803
1804 for (i = 0; i < 4; i++)
1805 if (inst->dst[i] && inst->dst[i]->spill_slot)
1806 emit_spill(c,
1807 inst->dst[i]->hw_reg,
1808 inst->dst[i]->spill_slot);
1809 }
1810
1811 /* Only properly tested on ILK */
1812 if (p->brw->intel.gen == 5) {
1813 brw_remove_duplicate_mrf_moves(p);
1814 if (c->dispatch_width == 16)
1815 brw_remove_grf_to_mrf_moves(p);
1816 }
1817
1818 if (INTEL_DEBUG & DEBUG_WM) {
1819 int i;
1820
1821 printf("wm-native:\n");
1822 for (i = 0; i < p->nr_insn; i++)
1823 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1824 printf("\n");
1825 }
1826 }
1827