1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
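/* Whether the PLN instruction can be used for this interpolation.  PLN reads
 * its pixel deltas as an aligned register pair, so the two delta registers
 * must be adjacent and, before gen6, start on an even register number.
 */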
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return whether the given source arg of the opcode can be an immediate float
65  * operand instead of a PROGRAM_CONSTANT value supplied through push/pull constants.
66  */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
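   /* Each entry is the source-operand count for opcodes where only the last
    * source may be an immediate: e.g. OPCODE_ADD has 2 sources, so only arg 1
    * can be folded into an immediate; for OPCODE_CMP it is arg 2.
    */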
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_SWZ] = 1,
87 [OPCODE_XPD] = 2,
88 };
89
90 /* These opcodes get broken down in a way that allows two
91  * args to be immediates.
92  */
93 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
94 if (arg == 1 || arg == 2)
95 return GL_TRUE;
96 }
97
98 if (opcode >= ARRAY_SIZE(opcode_array))
99 return GL_FALSE;
100
101 return arg == opcode_array[opcode] - 1;
102 }
103
104 /**
105 * Computes the screen-space x,y position of the pixels.
106 *
107 * This will be used by emit_delta_xy() or emit_wpos_xy() for
108 * interpolation of attributes.
109 *
110 * Payload R0:
111 *
112 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
113 * corresponding to each of the 16 execution channels.
114 * R0.1..8 -- ?
115 * R1.0 -- triangle vertex 0.X
116 * R1.1 -- triangle vertex 0.Y
117 * R1.2 -- tile 0 x,y coords (2 packed uwords)
118 * R1.3 -- tile 1 x,y coords (2 packed uwords)
119 * R1.4 -- tile 2 x,y coords (2 packed uwords)
120 * R1.5 -- tile 3 x,y coords (2 packed uwords)
121 * R1.6 -- ?
122 * R1.7 -- ?
123 * R1.8 -- ?
124 */
125 void emit_pixel_xy(struct brw_wm_compile *c,
126 const struct brw_reg *dst,
127 GLuint mask)
128 {
129 struct brw_compile *p = &c->func;
130 struct brw_reg r1 = brw_vec1_grf(1, 0);
131 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
132 struct brw_reg dst0_uw, dst1_uw;
133
134 brw_push_insn_state(p);
135 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
136
137 if (c->dispatch_width == 16) {
138 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
139 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
140 } else {
141 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
142 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
143 }
144
145 /* Calculate pixel centers by adding 1 or 0 to each of the
146 * micro-tile coordinates passed in r1.
147 */
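   /* brw_imm_v() packs eight signed 4-bit values, lowest nibble first, so
    * 0x10101010 is the per-pixel X offset pattern 0,1,0,1,... and 0x11001100
    * the Y offset pattern 0,0,1,1,... across each 2x2 subspan.
    */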
148 if (mask & WRITEMASK_X) {
149 brw_ADD(p,
150 dst0_uw,
151 stride(suboffset(r1_uw, 4), 2, 4, 0),
152 brw_imm_v(0x10101010));
153 }
154
155 if (mask & WRITEMASK_Y) {
156 brw_ADD(p,
157 dst1_uw,
158 stride(suboffset(r1_uw,5), 2, 4, 0),
159 brw_imm_v(0x11001100));
160 }
161 brw_pop_insn_state(p);
162 }
163
164 /**
165 * Computes the screen-space x,y distance of the pixels from the start
166 * vertex.
167 *
168 * This will be used in linterp or pinterp with the start vertex value
169 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
170 * to produce interpolated attribute values.
171 */
172 void emit_delta_xy(struct brw_compile *p,
173 const struct brw_reg *dst,
174 GLuint mask,
175 const struct brw_reg *arg0)
176 {
177 struct intel_context *intel = &p->brw->intel;
178 struct brw_reg r1 = brw_vec1_grf(1, 0);
179
180 if (mask == 0)
181 return;
182
183 assert(mask == WRITEMASK_XY);
184
185 if (intel->gen >= 6) {
186 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
187	 Just add 0.0 so the pixel X/Y values land in the dst regs unchanged. */
188 r1 = brw_imm_v(0x00000000);
189 brw_ADD(p,
190 dst[0],
191 retype(arg0[0], BRW_REGISTER_TYPE_UW),
192 r1);
193 brw_ADD(p,
194 dst[1],
195 retype(arg0[1], BRW_REGISTER_TYPE_UW),
196 r1);
197 return;
198 }
199
200 /* Calc delta X,Y by subtracting origin in r1 from the pixel
201 * centers produced by emit_pixel_xy().
202 */
203 brw_ADD(p,
204 dst[0],
205 retype(arg0[0], BRW_REGISTER_TYPE_UW),
206 negate(r1));
207 brw_ADD(p,
208 dst[1],
209 retype(arg0[1], BRW_REGISTER_TYPE_UW),
210 negate(suboffset(r1,1)));
211 }
212
213 /**
214 * Computes the pixel offset from the window origin for gl_FragCoord.
215 */
216 void emit_wpos_xy(struct brw_wm_compile *c,
217 const struct brw_reg *dst,
218 GLuint mask,
219 const struct brw_reg *arg0)
220 {
221 struct brw_compile *p = &c->func;
222
223 if (mask & WRITEMASK_X) {
224 if (c->fp->program.PixelCenterInteger) {
225 /* X' = X */
226 brw_MOV(p,
227 dst[0],
228 retype(arg0[0], BRW_REGISTER_TYPE_W));
229 } else {
230 /* X' = X + 0.5 */
231 brw_ADD(p,
232 dst[0],
233 retype(arg0[0], BRW_REGISTER_TYPE_W),
234 brw_imm_f(0.5));
235 }
236 }
237
238 if (mask & WRITEMASK_Y) {
239 if (c->fp->program.OriginUpperLeft) {
240 if (c->fp->program.PixelCenterInteger) {
241 /* Y' = Y */
242 brw_MOV(p,
243 dst[1],
244 retype(arg0[1], BRW_REGISTER_TYPE_W));
245 } else {
246 /* Y' = Y + 0.5 */
247 brw_ADD(p,
248 dst[1],
249 retype(arg0[1], BRW_REGISTER_TYPE_W),
250 brw_imm_f(0.5));
251 }
252 } else {
253 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
254
255 /* Y' = (height - 1) - Y + center */
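	 /* e.g. for a 480-pixel-high drawable with half-integer centers, the
	  * top row (Y = 0) becomes Y' = 479 - 0 + 0.5 = 479.5.
	  */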
256 brw_ADD(p,
257 dst[1],
258 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
259 brw_imm_f(c->key.drawable_height - 1 + center_offset));
260 }
261 }
262 }
263
264
265 void emit_pixel_w(struct brw_wm_compile *c,
266 const struct brw_reg *dst,
267 GLuint mask,
268 const struct brw_reg *arg0,
269 const struct brw_reg *deltas)
270 {
271 struct brw_compile *p = &c->func;
272 struct intel_context *intel = &p->brw->intel;
273 struct brw_reg src;
274 struct brw_reg temp_dst;
275
276 if (intel->gen >= 6)
277 temp_dst = dst[3];
278 else
279 temp_dst = brw_message_reg(2);
280
281 assert(intel->gen < 6);
282
283 /* Don't need this if all you are doing is interpolating color, for
284 * instance.
285 */
286 if (mask & WRITEMASK_W) {
287 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
288
289 /* Calc 1/w - just linterp wpos[3] optimized by putting the
290 * result straight into a message reg.
291 */
292 if (can_do_pln(intel, deltas)) {
293 brw_PLN(p, temp_dst, interp3, deltas[0]);
294 } else {
295 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
296 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
297 }
298
299 /* Calc w */
300 if (intel->gen >= 6)
301 src = temp_dst;
302 else
303 src = brw_null_reg();
304
305 if (c->dispatch_width == 16) {
306 brw_math_16(p, dst[3],
307 BRW_MATH_FUNCTION_INV,
308 BRW_MATH_SATURATE_NONE,
309 2, src,
310 BRW_MATH_PRECISION_FULL);
311 } else {
312 brw_math(p, dst[3],
313 BRW_MATH_FUNCTION_INV,
314 BRW_MATH_SATURATE_NONE,
315 2, src,
316 BRW_MATH_DATA_VECTOR,
317 BRW_MATH_PRECISION_FULL);
318 }
319 }
320 }
321
322 void emit_linterp(struct brw_compile *p,
323 const struct brw_reg *dst,
324 GLuint mask,
325 const struct brw_reg *arg0,
326 const struct brw_reg *deltas)
327 {
328 struct intel_context *intel = &p->brw->intel;
329 struct brw_reg interp[4];
330 GLuint nr = arg0[0].nr;
331 GLuint i;
332
333 interp[0] = brw_vec1_grf(nr, 0);
334 interp[1] = brw_vec1_grf(nr, 4);
335 interp[2] = brw_vec1_grf(nr+1, 0);
336 interp[3] = brw_vec1_grf(nr+1, 4);
337
338 for (i = 0; i < 4; i++) {
339 if (mask & (1<<i)) {
340 if (intel->gen >= 6) {
341 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
342 } else if (can_do_pln(intel, deltas)) {
343 brw_PLN(p, dst[i], interp[i], deltas[0]);
344 } else {
345 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
346 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
347 }
348 }
349 }
350 }
351
352
353 void emit_pinterp(struct brw_compile *p,
354 const struct brw_reg *dst,
355 GLuint mask,
356 const struct brw_reg *arg0,
357 const struct brw_reg *deltas,
358 const struct brw_reg *w)
359 {
360 struct intel_context *intel = &p->brw->intel;
361 struct brw_reg interp[4];
362 GLuint nr = arg0[0].nr;
363 GLuint i;
364
365 if (intel->gen >= 6) {
366 emit_linterp(p, dst, mask, arg0, deltas); /* deltas are unused by the gen6 PLN path */
367 return;
368 }
369
370 interp[0] = brw_vec1_grf(nr, 0);
371 interp[1] = brw_vec1_grf(nr, 4);
372 interp[2] = brw_vec1_grf(nr+1, 0);
373 interp[3] = brw_vec1_grf(nr+1, 4);
374
375 for (i = 0; i < 4; i++) {
376 if (mask & (1<<i)) {
377 if (can_do_pln(intel, deltas)) {
378 brw_PLN(p, dst[i], interp[i], deltas[0]);
379 } else {
380 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
381 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
382 }
383 }
384 }
385 for (i = 0; i < 4; i++) {
386 if (mask & (1<<i)) {
387 brw_MUL(p, dst[i], dst[i], w[3]);
388 }
389 }
390 }
391
392
393 void emit_cinterp(struct brw_compile *p,
394 const struct brw_reg *dst,
395 GLuint mask,
396 const struct brw_reg *arg0)
397 {
398 struct brw_reg interp[4];
399 GLuint nr = arg0[0].nr;
400 GLuint i;
401
402 interp[0] = brw_vec1_grf(nr, 0);
403 interp[1] = brw_vec1_grf(nr, 4);
404 interp[2] = brw_vec1_grf(nr+1, 0);
405 interp[3] = brw_vec1_grf(nr+1, 4);
406
407 for (i = 0; i < 4; i++) {
408 if (mask & (1<<i)) {
409 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
410 }
411 }
412 }
413
414 /* Sets the destination channels to 1.0 or 0.0 according to gl_FrontFacing. */
415 void emit_frontfacing(struct brw_compile *p,
416 const struct brw_reg *dst,
417 GLuint mask)
418 {
419 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
420 GLuint i;
421
422 if (!(mask & WRITEMASK_XYZW))
423 return;
424
425 for (i = 0; i < 4; i++) {
426 if (mask & (1<<i)) {
427 brw_MOV(p, dst[i], brw_imm_f(0.0));
428 }
429 }
430
431 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
432 * us front face
433 */
434 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
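   /* brw_CMP() with a null destination leaves subsequent instructions
    * predicated on this result (see brw_CMP in brw_eu_emit.c), so the MOVs of
    * 1.0 below only write the front-facing channels;
    * brw_set_predicate_control_flag_value() then clears that state.
    */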
435 for (i = 0; i < 4; i++) {
436 if (mask & (1<<i)) {
437 brw_MOV(p, dst[i], brw_imm_f(1.0));
438 }
439 }
440 brw_set_predicate_control_flag_value(p, 0xff);
441 }
442
443 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
444 * looking like:
445 *
446 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
447 *
448 * and we're trying to produce:
449 *
450 * DDX DDY
451 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
452 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
453 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
454 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
455 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
456 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
457 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
458 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
459 *
460 * and add another set of two more subspans if in 16-pixel dispatch mode.
461 *
462 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
463 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
464 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
465 * between each other. We could probably do it like ddx and swizzle the right
466 * order later, but bail for now and just produce
467 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
468 */
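/* Element by element (subspan layout tl,tr,bl,br at offsets 0..3):
 * DDX: src0 = <2;2,0> at suboffset 1 -> tr,tr,br,br and src1 = <2;2,0> at
 *      suboffset 0 -> tl,tl,bl,bl, giving (tr-tl, tr-tl, br-bl, br-bl).
 * DDY: src0 = <4;4,0> at suboffset 0 -> tl x4 and src1 = <4;4,0> at
 *      suboffset 2 -> bl x4, giving (tl-bl) x4 per subspan.
 */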
469 void emit_ddxy(struct brw_compile *p,
470 const struct brw_reg *dst,
471 GLuint mask,
472 GLboolean is_ddx,
473 const struct brw_reg *arg0)
474 {
475 int i;
476 struct brw_reg src0, src1;
477
478 if (mask & SATURATE)
479 brw_set_saturate(p, 1);
480 for (i = 0; i < 4; i++ ) {
481 if (mask & (1<<i)) {
482 if (is_ddx) {
483 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
484 BRW_REGISTER_TYPE_F,
485 BRW_VERTICAL_STRIDE_2,
486 BRW_WIDTH_2,
487 BRW_HORIZONTAL_STRIDE_0,
488 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
489 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
490 BRW_REGISTER_TYPE_F,
491 BRW_VERTICAL_STRIDE_2,
492 BRW_WIDTH_2,
493 BRW_HORIZONTAL_STRIDE_0,
494 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
495 } else {
496 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
497 BRW_REGISTER_TYPE_F,
498 BRW_VERTICAL_STRIDE_4,
499 BRW_WIDTH_4,
500 BRW_HORIZONTAL_STRIDE_0,
501 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
502 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
503 BRW_REGISTER_TYPE_F,
504 BRW_VERTICAL_STRIDE_4,
505 BRW_WIDTH_4,
506 BRW_HORIZONTAL_STRIDE_0,
507 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
508 }
509 brw_ADD(p, dst[i], src0, negate(src1));
510 }
511 }
512 if (mask & SATURATE)
513 brw_set_saturate(p, 0);
514 }
515
516 void emit_alu1(struct brw_compile *p,
517 struct brw_instruction *(*func)(struct brw_compile *,
518 struct brw_reg,
519 struct brw_reg),
520 const struct brw_reg *dst,
521 GLuint mask,
522 const struct brw_reg *arg0)
523 {
524 GLuint i;
525
526 if (mask & SATURATE)
527 brw_set_saturate(p, 1);
528
529 for (i = 0; i < 4; i++) {
530 if (mask & (1<<i)) {
531 func(p, dst[i], arg0[i]);
532 }
533 }
534
535 if (mask & SATURATE)
536 brw_set_saturate(p, 0);
537 }
538
539
540 void emit_alu2(struct brw_compile *p,
541 struct brw_instruction *(*func)(struct brw_compile *,
542 struct brw_reg,
543 struct brw_reg,
544 struct brw_reg),
545 const struct brw_reg *dst,
546 GLuint mask,
547 const struct brw_reg *arg0,
548 const struct brw_reg *arg1)
549 {
550 GLuint i;
551
552 if (mask & SATURATE)
553 brw_set_saturate(p, 1);
554
555 for (i = 0; i < 4; i++) {
556 if (mask & (1<<i)) {
557 func(p, dst[i], arg0[i], arg1[i]);
558 }
559 }
560
561 if (mask & SATURATE)
562 brw_set_saturate(p, 0);
563 }
564
565
566 void emit_mad(struct brw_compile *p,
567 const struct brw_reg *dst,
568 GLuint mask,
569 const struct brw_reg *arg0,
570 const struct brw_reg *arg1,
571 const struct brw_reg *arg2)
572 {
573 GLuint i;
574
575 for (i = 0; i < 4; i++) {
576 if (mask & (1<<i)) {
577 brw_MUL(p, dst[i], arg0[i], arg1[i]);
578
579 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
580 brw_ADD(p, dst[i], dst[i], arg2[i]);
581 brw_set_saturate(p, 0);
582 }
583 }
584 }
585
586 void emit_lrp(struct brw_compile *p,
587 const struct brw_reg *dst,
588 GLuint mask,
589 const struct brw_reg *arg0,
590 const struct brw_reg *arg1,
591 const struct brw_reg *arg2)
592 {
593 GLuint i;
594
595 /* Uses dst as a temporary:
596 */
597 for (i = 0; i < 4; i++) {
598 if (mask & (1<<i)) {
599 /* Can I use the LINE instruction for this?
600 */
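	 /* LRP: dst = arg0*arg1 + (1 - arg0)*arg2, built up as:
	  *    dst = 1 - arg0
	  *    acc = dst * arg2          (MUL to the null reg updates the accumulator)
	  *    dst = arg0 * arg1 + acc   (MAC)
	  */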
601 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
602 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
603
604 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
605 brw_MAC(p, dst[i], arg0[i], arg1[i]);
606 brw_set_saturate(p, 0);
607 }
608 }
609 }
610
611 void emit_sop(struct brw_compile *p,
612 const struct brw_reg *dst,
613 GLuint mask,
614 GLuint cond,
615 const struct brw_reg *arg0,
616 const struct brw_reg *arg1)
617 {
618 GLuint i;
619
620 for (i = 0; i < 4; i++) {
621 if (mask & (1<<i)) {
622 brw_push_insn_state(p);
623 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
624 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
625 brw_MOV(p, dst[i], brw_imm_f(0));
626 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
627 brw_MOV(p, dst[i], brw_imm_f(1.0));
628 brw_pop_insn_state(p);
629 }
630 }
631 }
632
633 static void emit_slt( struct brw_compile *p,
634 const struct brw_reg *dst,
635 GLuint mask,
636 const struct brw_reg *arg0,
637 const struct brw_reg *arg1 )
638 {
639 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
640 }
641
642 static void emit_sle( struct brw_compile *p,
643 const struct brw_reg *dst,
644 GLuint mask,
645 const struct brw_reg *arg0,
646 const struct brw_reg *arg1 )
647 {
648 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
649 }
650
651 static void emit_sgt( struct brw_compile *p,
652 const struct brw_reg *dst,
653 GLuint mask,
654 const struct brw_reg *arg0,
655 const struct brw_reg *arg1 )
656 {
657 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
658 }
659
660 static void emit_sge( struct brw_compile *p,
661 const struct brw_reg *dst,
662 GLuint mask,
663 const struct brw_reg *arg0,
664 const struct brw_reg *arg1 )
665 {
666 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
667 }
668
669 static void emit_seq( struct brw_compile *p,
670 const struct brw_reg *dst,
671 GLuint mask,
672 const struct brw_reg *arg0,
673 const struct brw_reg *arg1 )
674 {
675 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
676 }
677
678 static void emit_sne( struct brw_compile *p,
679 const struct brw_reg *dst,
680 GLuint mask,
681 const struct brw_reg *arg0,
682 const struct brw_reg *arg1 )
683 {
684 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
685 }
686
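/* ARB CMP: dst = (arg0 < 0) ? arg1 : arg2, implemented as a flag-setting
 * compare against 0.0 followed by a predicated SEL.
 */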
687 void emit_cmp(struct brw_compile *p,
688 const struct brw_reg *dst,
689 GLuint mask,
690 const struct brw_reg *arg0,
691 const struct brw_reg *arg1,
692 const struct brw_reg *arg2)
693 {
694 GLuint i;
695
696 for (i = 0; i < 4; i++) {
697 if (mask & (1<<i)) {
698 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
699
700 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
701 brw_SEL(p, dst[i], arg1[i], arg2[i]);
702 brw_set_saturate(p, 0);
703 brw_set_predicate_control_flag_value(p, 0xff);
704 }
705 }
706 }
707
708 void emit_sign(struct brw_compile *p,
709 const struct brw_reg *dst,
710 GLuint mask,
711 const struct brw_reg *arg0)
712 {
713 GLuint i;
714
715 for (i = 0; i < 4; i++) {
716 if (mask & (1<<i)) {
717 brw_MOV(p, dst[i], brw_imm_f(0.0));
718
719 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
720 brw_MOV(p, dst[i], brw_imm_f(-1.0));
721 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
722
723 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
724 brw_MOV(p, dst[i], brw_imm_f(1.0));
725 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
726 }
727 }
728 }
729
730 void emit_max(struct brw_compile *p,
731 const struct brw_reg *dst,
732 GLuint mask,
733 const struct brw_reg *arg0,
734 const struct brw_reg *arg1)
735 {
736 GLuint i;
737
738 for (i = 0; i < 4; i++) {
739 if (mask & (1<<i)) {
740 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
741
742 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
743 brw_SEL(p, dst[i], arg0[i], arg1[i]);
744 brw_set_saturate(p, 0);
745 brw_set_predicate_control_flag_value(p, 0xff);
746 }
747 }
748 }
749
750 void emit_min(struct brw_compile *p,
751 const struct brw_reg *dst,
752 GLuint mask,
753 const struct brw_reg *arg0,
754 const struct brw_reg *arg1)
755 {
756 GLuint i;
757
758 for (i = 0; i < 4; i++) {
759 if (mask & (1<<i)) {
760 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
761
762 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
763 brw_SEL(p, dst[i], arg0[i], arg1[i]);
764 brw_set_saturate(p, 0);
765 brw_set_predicate_control_flag_value(p, 0xff);
766 }
767 }
768 }
769
770
771 void emit_dp2(struct brw_compile *p,
772 const struct brw_reg *dst,
773 GLuint mask,
774 const struct brw_reg *arg0,
775 const struct brw_reg *arg1)
776 {
777 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
778
779 if (!(mask & WRITEMASK_XYZW))
780 return; /* Do not emit dead code */
781
782 assert(is_power_of_two(mask & WRITEMASK_XYZW));
783
784 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
785
786 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
787 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
788 brw_set_saturate(p, 0);
789 }
790
791
792 void emit_dp3(struct brw_compile *p,
793 const struct brw_reg *dst,
794 GLuint mask,
795 const struct brw_reg *arg0,
796 const struct brw_reg *arg1)
797 {
798 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
799
800 if (!(mask & WRITEMASK_XYZW))
801 return; /* Do not emit dead code */
802
803 assert(is_power_of_two(mask & WRITEMASK_XYZW));
804
805 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
806 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
807
808 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
809 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
810 brw_set_saturate(p, 0);
811 }
812
813
814 void emit_dp4(struct brw_compile *p,
815 const struct brw_reg *dst,
816 GLuint mask,
817 const struct brw_reg *arg0,
818 const struct brw_reg *arg1)
819 {
820 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
821
822 if (!(mask & WRITEMASK_XYZW))
823 return; /* Do not emit dead code */
824
825 assert(is_power_of_two(mask & WRITEMASK_XYZW));
826
827 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
828 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
829 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
830
831 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
832 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
833 brw_set_saturate(p, 0);
834 }
835
836
837 void emit_dph(struct brw_compile *p,
838 const struct brw_reg *dst,
839 GLuint mask,
840 const struct brw_reg *arg0,
841 const struct brw_reg *arg1)
842 {
843 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
844
845 if (!(mask & WRITEMASK_XYZW))
846 return; /* Do not emit dead code */
847
848 assert(is_power_of_two(mask & WRITEMASK_XYZW));
849
850 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
851 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
852 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
853
854 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
855 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
856 brw_set_saturate(p, 0);
857 }
858
859
860 void emit_xpd(struct brw_compile *p,
861 const struct brw_reg *dst,
862 GLuint mask,
863 const struct brw_reg *arg0,
864 const struct brw_reg *arg1)
865 {
866 GLuint i;
867
868 assert((mask & WRITEMASK_W) != WRITEMASK_W);
869
870 for (i = 0 ; i < 3; i++) {
871 if (mask & (1<<i)) {
872 GLuint i2 = (i+2)%3;
873 GLuint i1 = (i+1)%3;
874
875 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
876
877 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
878 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
879 brw_set_saturate(p, 0);
880 }
881 }
882 }
883
884
885 void emit_math1(struct brw_wm_compile *c,
886 GLuint function,
887 const struct brw_reg *dst,
888 GLuint mask,
889 const struct brw_reg *arg0)
890 {
891 struct brw_compile *p = &c->func;
892 struct intel_context *intel = &p->brw->intel;
893 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
894 GLuint saturate = ((mask & SATURATE) ?
895 BRW_MATH_SATURATE_SATURATE :
896 BRW_MATH_SATURATE_NONE);
897 struct brw_reg src;
898
899 if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
900 arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
901 /* Gen6 math requires that source and dst horizontal stride be 1,
902 * and that the argument be in the GRF.
903 */
904 src = dst[dst_chan];
905 brw_MOV(p, src, arg0[0]);
906 } else {
907 src = arg0[0];
908 }
909
910 if (!(mask & WRITEMASK_XYZW))
911 return; /* Do not emit dead code */
912
913 assert(is_power_of_two(mask & WRITEMASK_XYZW));
914
915 /* Send two messages to perform all 16 operations:
916 */
917 brw_push_insn_state(p);
918 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
919 brw_math(p,
920 dst[dst_chan],
921 function,
922 saturate,
923 2,
924 src,
925 BRW_MATH_DATA_VECTOR,
926 BRW_MATH_PRECISION_FULL);
927
928 if (c->dispatch_width == 16) {
929 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
930 brw_math(p,
931 offset(dst[dst_chan],1),
932 function,
933 saturate,
934 3,
935 sechalf(src),
936 BRW_MATH_DATA_VECTOR,
937 BRW_MATH_PRECISION_FULL);
938 }
939 brw_pop_insn_state(p);
940 }
941
942
943 void emit_math2(struct brw_wm_compile *c,
944 GLuint function,
945 const struct brw_reg *dst,
946 GLuint mask,
947 const struct brw_reg *arg0,
948 const struct brw_reg *arg1)
949 {
950 struct brw_compile *p = &c->func;
951 struct intel_context *intel = &p->brw->intel;
952 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
953
954 if (!(mask & WRITEMASK_XYZW))
955 return; /* Do not emit dead code */
956
957 assert(is_power_of_two(mask & WRITEMASK_XYZW));
958
959 brw_push_insn_state(p);
960
961 /* math can only operate on up to a vec8 at a time, so in
962 * dispatch_width==16 we have to do the second half manually.
963 */
964 if (intel->gen >= 6) {
965 struct brw_reg src0 = arg0[0];
966 struct brw_reg src1 = arg1[0];
967 struct brw_reg temp_dst = dst[dst_chan];
968
969 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
970 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
971 /* Both scalar arguments. Do scalar calc. */
972 src0.hstride = BRW_HORIZONTAL_STRIDE_1;
973 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
974 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
975 temp_dst.width = BRW_WIDTH_1;
976
977 if (arg0[0].subnr != 0) {
978 brw_MOV(p, temp_dst, src0);
979 src0 = temp_dst;
980
981 /* Ouch. We've used the temp as a dst, and we still
982 * need a temp to store arg1 in, because src and dst
983 * offsets have to be equal. Leaving this up to
984 * glsl2-965 to handle correctly.
985 */
986 assert(arg1[0].subnr == 0);
987 } else if (arg1[0].subnr != 0) {
988 brw_MOV(p, temp_dst, src1);
989 src1 = temp_dst;
990 }
991 } else {
992 brw_MOV(p, temp_dst, src0);
993 src0 = temp_dst;
994 }
995 } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
996 brw_MOV(p, temp_dst, src1);
997 src1 = temp_dst;
998 }
999
1000 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1001 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1002 brw_math2(p,
1003 temp_dst,
1004 function,
1005 src0,
1006 src1);
1007 if (c->dispatch_width == 16) {
1008 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1009 brw_math2(p,
1010 sechalf(temp_dst),
1011 function,
1012 sechalf(src0),
1013 sechalf(src1));
1014 }
1015
1016 /* Splat a scalar result into all the channels. */
1017 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
1018 arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1019 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
1020 temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
1021 brw_MOV(p, dst[dst_chan], temp_dst);
1022 }
1023 } else {
1024 GLuint saturate = ((mask & SATURATE) ?
1025 BRW_MATH_SATURATE_SATURATE :
1026 BRW_MATH_SATURATE_NONE);
1027
1028 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1029 brw_MOV(p, brw_message_reg(3), arg1[0]);
1030 if (c->dispatch_width == 16) {
1031 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1032 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1033 }
1034
1035 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1036 brw_math(p,
1037 dst[dst_chan],
1038 function,
1039 saturate,
1040 2,
1041 arg0[0],
1042 BRW_MATH_DATA_VECTOR,
1043 BRW_MATH_PRECISION_FULL);
1044
1045 /* Send two messages to perform all 16 operations:
1046 */
1047 if (c->dispatch_width == 16) {
1048 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1049 brw_math(p,
1050 offset(dst[dst_chan],1),
1051 function,
1052 saturate,
1053 4,
1054 sechalf(arg0[0]),
1055 BRW_MATH_DATA_VECTOR,
1056 BRW_MATH_PRECISION_FULL);
1057 }
1058 }
1059 brw_pop_insn_state(p);
1060 }
1061
1062
1063 void emit_tex(struct brw_wm_compile *c,
1064 struct brw_reg *dst,
1065 GLuint dst_flags,
1066 struct brw_reg *arg,
1067 struct brw_reg depth_payload,
1068 GLuint tex_idx,
1069 GLuint sampler,
1070 GLboolean shadow)
1071 {
1072 struct brw_compile *p = &c->func;
1073 struct intel_context *intel = &p->brw->intel;
1074 struct brw_reg dst_retyped;
1075 GLuint cur_mrf = 2, response_length;
1076 GLuint i, nr_texcoords;
1077 GLuint emit;
1078 GLuint msg_type;
1079 GLuint mrf_per_channel;
1080 GLuint simd_mode;
1081
1082 if (c->dispatch_width == 16) {
1083 mrf_per_channel = 2;
1084 response_length = 8;
1085 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1086 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1087 } else {
1088 mrf_per_channel = 1;
1089 response_length = 4;
1090 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1091 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1092 }
1093
1094 /* How many input regs are there?
1095 */
1096 switch (tex_idx) {
1097 case TEXTURE_1D_INDEX:
1098 emit = WRITEMASK_X;
1099 nr_texcoords = 1;
1100 break;
1101 case TEXTURE_2D_INDEX:
1102 case TEXTURE_RECT_INDEX:
1103 emit = WRITEMASK_XY;
1104 nr_texcoords = 2;
1105 break;
1106 case TEXTURE_3D_INDEX:
1107 case TEXTURE_CUBE_INDEX:
1108 emit = WRITEMASK_XYZ;
1109 nr_texcoords = 3;
1110 break;
1111 default:
1112 /* unexpected target */
1113 abort();
1114 }
1115
1116 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1117 if (intel->gen < 5 && c->dispatch_width == 8)
1118 nr_texcoords = 3;
1119
1120 /* For shadow comparisons, we have to supply u,v,r. */
1121 if (shadow)
1122 nr_texcoords = 3;
1123
1124 /* Emit the texcoords. */
1125 for (i = 0; i < nr_texcoords; i++) {
1126 if (emit & (1<<i))
1127 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1128 else
1129 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1130 cur_mrf += mrf_per_channel;
1131 }
1132
1133 /* Fill in the shadow comparison reference value. */
1134 if (shadow) {
1135 if (intel->gen >= 5) {
1136 /* Fill in the cube map array index value. */
1137 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1138 cur_mrf += mrf_per_channel;
1139 } else if (c->dispatch_width == 8) {
1140 /* Fill in the LOD bias value. */
1141 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1142 cur_mrf += mrf_per_channel;
1143 }
1144 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1145 cur_mrf += mrf_per_channel;
1146 }
1147
1148 if (intel->gen >= 5) {
1149 if (shadow)
1150 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1151 else
1152 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1153 } else {
1154 /* Note that G45 and older chipsets determine shadow compare and dispatch width
1155 * from message length for most messages.
1156 */
1157 if (c->dispatch_width == 16 && shadow)
1158 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1159 else
1160 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1161 }
1162
1163 brw_SAMPLE(p,
1164 dst_retyped,
1165 1,
1166 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1167 SURF_INDEX_TEXTURE(sampler),
1168 sampler,
1169 dst_flags & WRITEMASK_XYZW,
1170 msg_type,
1171 response_length,
1172 cur_mrf - 1,
1173 0,
1174 1,
1175 simd_mode);
1176 }
1177
1178
1179 void emit_txb(struct brw_wm_compile *c,
1180 struct brw_reg *dst,
1181 GLuint dst_flags,
1182 struct brw_reg *arg,
1183 struct brw_reg depth_payload,
1184 GLuint tex_idx,
1185 GLuint sampler)
1186 {
1187 struct brw_compile *p = &c->func;
1188 struct intel_context *intel = &p->brw->intel;
1189 GLuint msgLength;
1190 GLuint msg_type;
1191 GLuint mrf_per_channel;
1192 GLuint response_length;
1193 struct brw_reg dst_retyped;
1194
1195 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1196 * samples, so we'll use the 16-wide instruction, leave the second halves
1197 * undefined, and trust the execution mask to keep the undefined pixels
1198 * from mattering.
1199 */
1200 if (c->dispatch_width == 16 || intel->gen < 5) {
1201 if (intel->gen >= 5)
1202 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1203 else
1204 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1205 mrf_per_channel = 2;
1206 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1207 response_length = 8;
1208 } else {
1209 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1210 mrf_per_channel = 1;
1211 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1212 response_length = 4;
1213 }
1214
1215 /* Shadow ignored for txb. */
1216 switch (tex_idx) {
1217 case TEXTURE_1D_INDEX:
1218 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1219 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1220 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1221 break;
1222 case TEXTURE_2D_INDEX:
1223 case TEXTURE_RECT_INDEX:
1224 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1225 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1226 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1227 break;
1228 case TEXTURE_3D_INDEX:
1229 case TEXTURE_CUBE_INDEX:
1230 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1231 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1232 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1233 break;
1234 default:
1235 /* unexpected target */
1236 abort();
1237 }
1238
1239 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1240 msgLength = 2 + 4 * mrf_per_channel - 1;
1241
1242 brw_SAMPLE(p,
1243 dst_retyped,
1244 1,
1245 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1246 SURF_INDEX_TEXTURE(sampler),
1247 sampler,
1248 dst_flags & WRITEMASK_XYZW,
1249 msg_type,
1250 response_length,
1251 msgLength,
1252 0,
1253 1,
1254 BRW_SAMPLER_SIMD_MODE_SIMD16);
1255 }
1256
1257
1258 static void emit_lit(struct brw_wm_compile *c,
1259 const struct brw_reg *dst,
1260 GLuint mask,
1261 const struct brw_reg *arg0)
1262 {
1263 struct brw_compile *p = &c->func;
1264
1265 assert((mask & WRITEMASK_XW) == 0);
1266
1267 if (mask & WRITEMASK_Y) {
1268 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1269 brw_MOV(p, dst[1], arg0[0]);
1270 brw_set_saturate(p, 0);
1271 }
1272
1273 if (mask & WRITEMASK_Z) {
1274 emit_math2(c, BRW_MATH_FUNCTION_POW,
1275 &dst[2],
1276 WRITEMASK_X | (mask & SATURATE),
1277 &arg0[1],
1278 &arg0[3]);
1279 }
1280
1281 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1282 * some of the POW calculations above, but 16-wide iff statements
1283 * seem to lock c1 hardware, so this is a nasty workaround:
1284 */
1285 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1286 {
1287 if (mask & WRITEMASK_Y)
1288 brw_MOV(p, dst[1], brw_imm_f(0));
1289
1290 if (mask & WRITEMASK_Z)
1291 brw_MOV(p, dst[2], brw_imm_f(0));
1292 }
1293 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1294 }
1295
1296
1297 /* Kill pixel - set execution mask to zero for those pixels which
1298 * fail.
1299 */
1300 static void emit_kil( struct brw_wm_compile *c,
1301 struct brw_reg *arg0)
1302 {
1303 struct brw_compile *p = &c->func;
1304 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1305 GLuint i, j;
1306
1307 for (i = 0; i < 4; i++) {
1308 /* Check if we've already done the comparison for this reg
1309 * -- common when someone does KIL TEMP.wwww.
1310 */
1311 for (j = 0; j < i; j++) {
1312 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1313 break;
1314 }
1315 if (j != i)
1316 continue;
1317
1318 brw_push_insn_state(p);
1319 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1320 brw_set_predicate_control_flag_value(p, 0xff);
1321 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
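      /* AND the comparison result in the flag register into the pixel-enable
       * bits in r0.0, so that killed pixels are dropped from later messages
       * such as the framebuffer write.
       */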
1322 brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1323 brw_pop_insn_state(p);
1324 }
1325 }
1326
1327 /* KIL_NV kills the pixels that are currently executing, not based on a test
1328 * of the arguments.
1329 */
1330 void emit_kil_nv( struct brw_wm_compile *c )
1331 {
1332 struct brw_compile *p = &c->func;
1333 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1334
1335 brw_push_insn_state(p);
1336 brw_set_mask_control(p, BRW_MASK_DISABLE);
1337 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1338 brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1339 brw_pop_insn_state(p);
1340 }
1341
1342 static void fire_fb_write( struct brw_wm_compile *c,
1343 GLuint base_reg,
1344 GLuint nr,
1345 GLuint target,
1346 GLuint eot )
1347 {
1348 struct brw_compile *p = &c->func;
1349 struct intel_context *intel = &p->brw->intel;
1350 struct brw_reg dst;
1351
1352 if (c->dispatch_width == 16)
1353 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1354 else
1355 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1356
1357 /* Pass through control information:
1358 */
1359 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1360 if (intel->gen < 6) /* gen6 uses headerless fb writes, so no header copy is needed there */
1361 {
1362 brw_push_insn_state(p);
1363 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1364 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1365 brw_MOV(p,
1366 brw_message_reg(base_reg + 1),
1367 brw_vec8_grf(1, 0));
1368 brw_pop_insn_state(p);
1369 }
1370
1371 /* Send framebuffer write message: */
1372 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1373 brw_fb_WRITE(p,
1374 c->dispatch_width,
1375 dst,
1376 base_reg,
1377 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1378 target,
1379 nr,
1380 0,
1381 eot);
1382 }
1383
1384
1385 static void emit_aa( struct brw_wm_compile *c,
1386 struct brw_reg *arg1,
1387 GLuint reg )
1388 {
1389 struct brw_compile *p = &c->func;
1390 GLuint comp = c->key.aa_dest_stencil_reg / 2;
1391 GLuint off = c->key.aa_dest_stencil_reg % 2;
1392 struct brw_reg aa = offset(arg1[comp], off);
1393
1394 brw_push_insn_state(p);
1395 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1396 brw_MOV(p, brw_message_reg(reg), aa);
1397 brw_pop_insn_state(p);
1398 }
1399
1400
1401 /* Post-fragment-program processing. Send the results to the
1402 * framebuffer.
1403 * \param arg0 the fragment color
1404 * \param arg1 the pass-through depth value
1405 * \param arg2 the shader-computed depth value
1406 */
1407 void emit_fb_write(struct brw_wm_compile *c,
1408 struct brw_reg *arg0,
1409 struct brw_reg *arg1,
1410 struct brw_reg *arg2,
1411 GLuint target,
1412 GLuint eot)
1413 {
1414 struct brw_compile *p = &c->func;
1415 struct brw_context *brw = p->brw;
1416 struct intel_context *intel = &brw->intel;
1417 GLuint nr = 2;
1418 GLuint channel;
1419 int base_reg; /* For gen6 headerless fb writes, the message starts directly at the color payload. */
1420
1421 /* Reserve a space for AA - may not be needed:
1422 */
1423 if (c->key.aa_dest_stencil_reg)
1424 nr += 1;
1425
1426 /* I don't really understand how this achieves the color interleave
1427 * (ie RGBARGBA) in the result: [Do the saturation here]
1428 */
1429 brw_push_insn_state(p);
1430
1431 if (intel->gen >= 6)
1432 base_reg = nr;
1433 else
1434 base_reg = 0;
1435
1436 for (channel = 0; channel < 4; channel++) {
1437 if (intel->gen >= 6) {
1438 /* gen6 SIMD16 single source DP write looks like:
1439 * m + 0: r0
1440 * m + 1: r1
1441 * m + 2: g0
1442 * m + 3: g1
1443 * m + 4: b0
1444 * m + 5: b1
1445 * m + 6: a0
1446 * m + 7: a1
1447 */
1448 if (c->dispatch_width == 16) {
1449 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1450 } else {
1451 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1452 }
1453 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1454 /* pre-gen6 SIMD16 single source DP write looks like:
1455 * m + 0: r0
1456 * m + 1: g0
1457 * m + 2: b0
1458 * m + 3: a0
1459 * m + 4: r1
1460 * m + 5: g1
1461 * m + 6: b1
1462 * m + 7: a1
1463 *
1464 * By setting the high bit of the MRF register number, we indicate
1465 * that we want COMPR4 mode - instead of doing the usual destination
1466 * + 1 for the second half we get destination + 4.
1467 */
1468 brw_MOV(p,
1469 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1470 arg0[channel]);
1471 } else {
1472 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1473 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1474 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1475 brw_MOV(p,
1476 brw_message_reg(nr + channel),
1477 arg0[channel]);
1478
1479 if (c->dispatch_width == 16) {
1480 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1481 brw_MOV(p,
1482 brw_message_reg(nr + channel + 4),
1483 sechalf(arg0[channel]));
1484 }
1485 }
1486 }
1487 /* skip over the regs populated above:
1488 */
1489 if (c->dispatch_width == 16)
1490 nr += 8;
1491 else
1492 nr += 4;
1493
1494 brw_pop_insn_state(p);
1495
1496 if (c->key.source_depth_to_render_target)
1497 {
1498 if (c->key.computes_depth)
1499 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1500 else
1501 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1502
1503 nr += 2;
1504 }
1505
1506 if (c->key.dest_depth_reg)
1507 {
1508 GLuint comp = c->key.dest_depth_reg / 2;
1509 GLuint off = c->key.dest_depth_reg % 2;
1510
1511 if (off != 0) {
1512 brw_push_insn_state(p);
1513 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1514
1515 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1516 /* 2nd half? */
1517 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1518 brw_pop_insn_state(p);
1519 }
1520 else {
1521 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1522 }
1523 nr += 2;
1524 }
1525
1526 if (intel->gen >= 6) {
1527 /* Subtract off the message header, since we send headerless. */
1528 nr -= 2;
1529 }
1530
1531 if (!c->key.runtime_check_aads_emit) {
1532 if (c->key.aa_dest_stencil_reg)
1533 emit_aa(c, arg1, 2);
1534
1535 fire_fb_write(c, base_reg, nr, target, eot);
1536 }
1537 else {
1538 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1539 struct brw_reg ip = brw_ip_reg();
1540 struct brw_instruction *jmp;
1541
1542 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1543 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1544 brw_AND(p,
1545 v1_null_ud,
1546 get_element_ud(brw_vec8_grf(1,0), 6),
1547 brw_imm_ud(1<<26));
1548
1549 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1550 {
1551 emit_aa(c, arg1, 2);
1552 fire_fb_write(c, 0, nr, target, eot);
1553 /* note - thread killed in subroutine */
1554 }
1555 brw_land_fwd_jump(p, jmp);
1556
1557 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1558 */
1559 fire_fb_write(c, 1, nr-1, target, eot);
1560 }
1561 }
1562
1563 /**
1564 * Move a GPR to scratch memory.
1565 */
1566 static void emit_spill( struct brw_wm_compile *c,
1567 struct brw_reg reg,
1568 GLuint slot )
1569 {
1570 struct brw_compile *p = &c->func;
1571
1572 /*
1573 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1574 */
1575 brw_MOV(p, brw_message_reg(2), reg);
1576
1577 /*
1578 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1579 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1580 */
1581 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1582 }
1583
1584
1585 /**
1586 * Load a GPR from scratch memory.
1587 */
1588 static void emit_unspill( struct brw_wm_compile *c,
1589 struct brw_reg reg,
1590 GLuint slot )
1591 {
1592 struct brw_compile *p = &c->func;
1593
1594 /* Slot 0 is the undef value.
1595 */
1596 if (slot == 0) {
1597 brw_MOV(p, reg, brw_imm_f(0));
1598 return;
1599 }
1600
1601 /*
1602 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1603 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1604 */
1605
1606 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1607 }
1608
1609
1610 /**
1611 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1612 * Args with unspill_reg != 0 will be loaded from scratch memory.
1613 */
1614 static void get_argument_regs( struct brw_wm_compile *c,
1615 struct brw_wm_ref *arg[],
1616 struct brw_reg *regs )
1617 {
1618 GLuint i;
1619
1620 for (i = 0; i < 4; i++) {
1621 if (arg[i]) {
1622 if (arg[i]->unspill_reg)
1623 emit_unspill(c,
1624 brw_vec8_grf(arg[i]->unspill_reg, 0),
1625 arg[i]->value->spill_slot);
1626
1627 regs[i] = arg[i]->hw_reg;
1628 }
1629 else {
1630 regs[i] = brw_null_reg();
1631 }
1632 }
1633 }
1634
1635
1636 /**
1637 * For values that have a spill_slot!=0, write those regs to scratch memory.
1638 */
1639 static void spill_values( struct brw_wm_compile *c,
1640 struct brw_wm_value *values,
1641 GLuint nr )
1642 {
1643 GLuint i;
1644
1645 for (i = 0; i < nr; i++)
1646 if (values[i].spill_slot)
1647 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1648 }
1649
1650
1651 /* Emit the fragment program instructions here.
1652 */
1653 void brw_wm_emit( struct brw_wm_compile *c )
1654 {
1655 struct brw_compile *p = &c->func;
1656 struct intel_context *intel = &p->brw->intel;
1657 GLuint insn;
1658
1659 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1660 if (intel->gen >= 6)
1661 brw_set_acc_write_control(p, 1);
1662
1663 /* Check if any of the payload regs need to be spilled:
1664 */
1665 spill_values(c, c->payload.depth, 4);
1666 spill_values(c, c->creg, c->nr_creg);
1667 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1668
1669
1670 for (insn = 0; insn < c->nr_insns; insn++) {
1671
1672 struct brw_wm_instruction *inst = &c->instruction[insn];
1673 struct brw_reg args[3][4], dst[4];
1674 GLuint i, dst_flags;
1675
1676 /* Get argument regs:
1677 */
1678 for (i = 0; i < 3; i++)
1679 get_argument_regs(c, inst->src[i], args[i]);
1680
1681 /* Get dest regs:
1682 */
1683 for (i = 0; i < 4; i++)
1684 if (inst->dst[i])
1685 dst[i] = inst->dst[i]->hw_reg;
1686 else
1687 dst[i] = brw_null_reg();
1688
1689 /* Flags
1690 */
1691 dst_flags = inst->writemask;
1692 if (inst->saturate)
1693 dst_flags |= SATURATE;
1694
1695 switch (inst->opcode) {
1696 /* Generated instructions for calculating triangle interpolants:
1697 */
1698 case WM_PIXELXY:
1699 emit_pixel_xy(c, dst, dst_flags);
1700 break;
1701
1702 case WM_DELTAXY:
1703 emit_delta_xy(p, dst, dst_flags, args[0]);
1704 break;
1705
1706 case WM_WPOSXY:
1707 emit_wpos_xy(c, dst, dst_flags, args[0]);
1708 break;
1709
1710 case WM_PIXELW:
1711 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1712 break;
1713
1714 case WM_LINTERP:
1715 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1716 break;
1717
1718 case WM_PINTERP:
1719 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1720 break;
1721
1722 case WM_CINTERP:
1723 emit_cinterp(p, dst, dst_flags, args[0]);
1724 break;
1725
1726 case WM_FB_WRITE:
1727 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1728 break;
1729
1730 case WM_FRONTFACING:
1731 emit_frontfacing(p, dst, dst_flags);
1732 break;
1733
1734 /* Straightforward arithmetic:
1735 */
1736 case OPCODE_ADD:
1737 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1738 break;
1739
1740 case OPCODE_FRC:
1741 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1742 break;
1743
1744 case OPCODE_FLR:
1745 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1746 break;
1747
1748 case OPCODE_DDX:
1749 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1750 break;
1751
1752 case OPCODE_DDY:
1753 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1754 break;
1755
1756 case OPCODE_DP2:
1757 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1758 break;
1759
1760 case OPCODE_DP3:
1761 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1762 break;
1763
1764 case OPCODE_DP4:
1765 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1766 break;
1767
1768 case OPCODE_DPH:
1769 emit_dph(p, dst, dst_flags, args[0], args[1]);
1770 break;
1771
1772 case OPCODE_TRUNC:
1773 for (i = 0; i < 4; i++) {
1774 if (dst_flags & (1<<i)) {
1775 brw_RNDZ(p, dst[i], args[0][i]);
1776 }
1777 }
1778 break;
1779
1780 case OPCODE_LRP:
1781 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1782 break;
1783
1784 case OPCODE_MAD:
1785 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1786 break;
1787
1788 case OPCODE_MOV:
1789 case OPCODE_SWZ:
1790 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1791 break;
1792
1793 case OPCODE_MUL:
1794 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1795 break;
1796
1797 case OPCODE_XPD:
1798 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1799 break;
1800
1801 /* Higher math functions:
1802 */
1803 case OPCODE_RCP:
1804 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1805 break;
1806
1807 case OPCODE_RSQ:
1808 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1809 break;
1810
1811 case OPCODE_SIN:
1812 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1813 break;
1814
1815 case OPCODE_COS:
1816 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1817 break;
1818
1819 case OPCODE_EX2:
1820 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1821 break;
1822
1823 case OPCODE_LG2:
1824 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1825 break;
1826
1827 case OPCODE_SCS:
1828 /* There is an scs math function, but it would need some
1829 * fixup for 16-element execution.
1830 */
1831 if (dst_flags & WRITEMASK_X)
1832 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1833 if (dst_flags & WRITEMASK_Y)
1834 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1835 break;
1836
1837 case OPCODE_POW:
1838 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1839 break;
1840
1841 /* Comparisons:
1842 */
1843 case OPCODE_CMP:
1844 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1845 break;
1846
1847 case OPCODE_MAX:
1848 emit_max(p, dst, dst_flags, args[0], args[1]);
1849 break;
1850
1851 case OPCODE_MIN:
1852 emit_min(p, dst, dst_flags, args[0], args[1]);
1853 break;
1854
1855 case OPCODE_SLT:
1856 emit_slt(p, dst, dst_flags, args[0], args[1]);
1857 break;
1858
1859 case OPCODE_SLE:
1860 emit_sle(p, dst, dst_flags, args[0], args[1]);
1861 break;
1862 case OPCODE_SGT:
1863 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1864 break;
1865 case OPCODE_SGE:
1866 emit_sge(p, dst, dst_flags, args[0], args[1]);
1867 break;
1868 case OPCODE_SEQ:
1869 emit_seq(p, dst, dst_flags, args[0], args[1]);
1870 break;
1871 case OPCODE_SNE:
1872 emit_sne(p, dst, dst_flags, args[0], args[1]);
1873 break;
1874
1875 case OPCODE_SSG:
1876 emit_sign(p, dst, dst_flags, args[0]);
1877 break;
1878
1879 case OPCODE_LIT:
1880 emit_lit(c, dst, dst_flags, args[0]);
1881 break;
1882
1883 /* Texturing operations:
1884 */
1885 case OPCODE_TEX:
1886 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1887 inst->tex_idx, inst->tex_unit,
1888 inst->tex_shadow);
1889 break;
1890
1891 case OPCODE_TXB:
1892 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1893 inst->tex_idx, inst->tex_unit);
1894 break;
1895
1896 case OPCODE_KIL:
1897 emit_kil(c, args[0]);
1898 break;
1899
1900 case OPCODE_KIL_NV:
1901 emit_kil_nv(c);
1902 break;
1903
1904 default:
1905 printf("Unsupported opcode %i (%s) in fragment shader\n",
1906 inst->opcode, inst->opcode < MAX_OPCODE ?
1907 _mesa_opcode_string(inst->opcode) :
1908 "unknown");
1909 }
1910
1911 for (i = 0; i < 4; i++)
1912 if (inst->dst[i] && inst->dst[i]->spill_slot)
1913 emit_spill(c,
1914 inst->dst[i]->hw_reg,
1915 inst->dst[i]->spill_slot);
1916 }
1917
1918 /* Only properly tested on ILK */
1919 if (p->brw->intel.gen == 5) {
1920 brw_remove_duplicate_mrf_moves(p);
1921 if (c->dispatch_width == 16)
1922 brw_remove_grf_to_mrf_moves(p);
1923 }
1924
1925 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1926 int i;
1927
1928 printf("wm-native:\n");
1929 for (i = 0; i < p->nr_insn; i++)
1930 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1931 printf("\n");
1932 }
1933 }
1934