i965: Replace brw_wm_* with dumping code into the fs_visitor.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static bool
38 can_do_pln(struct intel_context *intel, const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return false;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return false;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return false;
50
51 return true;
52 }
53
54 /* Return the SrcReg index of the channels that can be immediate float operands
55 * instead of usage of PROGRAM_CONSTANT values through push/pull.
56 */
57 bool
58 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
59 {
60 int opcode_array[] = {
61 [OPCODE_ADD] = 2,
62 [OPCODE_CMP] = 3,
63 [OPCODE_DP3] = 2,
64 [OPCODE_DP4] = 2,
65 [OPCODE_DPH] = 2,
66 [OPCODE_MAX] = 2,
67 [OPCODE_MIN] = 2,
68 [OPCODE_MOV] = 1,
69 [OPCODE_MUL] = 2,
70 [OPCODE_SEQ] = 2,
71 [OPCODE_SGE] = 2,
72 [OPCODE_SGT] = 2,
73 [OPCODE_SLE] = 2,
74 [OPCODE_SLT] = 2,
75 [OPCODE_SNE] = 2,
76 [OPCODE_SWZ] = 1,
77 [OPCODE_XPD] = 2,
78 };
79
80 /* These opcodes get broken down in a way that allow two
81 * args to be immediates.
82 */
83 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
84 if (arg == 1 || arg == 2)
85 return true;
86 }
87
88 if (opcode > ARRAY_SIZE(opcode_array))
89 return false;
90
91 return arg == opcode_array[opcode] - 1;
92 }
93
94 /**
95 * Computes the screen-space x,y position of the pixels.
96 *
97 * This will be used by emit_delta_xy() or emit_wpos_xy() for
98 * interpolation of attributes..
99 *
100 * Payload R0:
101 *
102 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
103 * corresponding to each of the 16 execution channels.
104 * R0.1..8 -- ?
105 * R1.0 -- triangle vertex 0.X
106 * R1.1 -- triangle vertex 0.Y
107 * R1.2 -- tile 0 x,y coords (2 packed uwords)
108 * R1.3 -- tile 1 x,y coords (2 packed uwords)
109 * R1.4 -- tile 2 x,y coords (2 packed uwords)
110 * R1.5 -- tile 3 x,y coords (2 packed uwords)
111 * R1.6 -- ?
112 * R1.7 -- ?
113 * R1.8 -- ?
114 */
115 void emit_pixel_xy(struct brw_wm_compile *c,
116 const struct brw_reg *dst,
117 GLuint mask)
118 {
119 struct brw_compile *p = &c->func;
120 struct brw_reg r1 = brw_vec1_grf(1, 0);
121 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
122 struct brw_reg dst0_uw, dst1_uw;
123
124 brw_push_insn_state(p);
125 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
126
127 if (c->dispatch_width == 16) {
128 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
129 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
130 } else {
131 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
132 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
133 }
134
135 /* Calculate pixel centers by adding 1 or 0 to each of the
136 * micro-tile coordinates passed in r1.
137 */
138 if (mask & WRITEMASK_X) {
139 brw_ADD(p,
140 dst0_uw,
141 stride(suboffset(r1_uw, 4), 2, 4, 0),
142 brw_imm_v(0x10101010));
143 }
144
145 if (mask & WRITEMASK_Y) {
146 brw_ADD(p,
147 dst1_uw,
148 stride(suboffset(r1_uw,5), 2, 4, 0),
149 brw_imm_v(0x11001100));
150 }
151 brw_pop_insn_state(p);
152 }
153
154 /**
155 * Computes the screen-space x,y distance of the pixels from the start
156 * vertex.
157 *
158 * This will be used in linterp or pinterp with the start vertex value
159 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
160 * to produce interpolated attribute values.
161 */
162 void emit_delta_xy(struct brw_compile *p,
163 const struct brw_reg *dst,
164 GLuint mask,
165 const struct brw_reg *arg0)
166 {
167 struct intel_context *intel = &p->brw->intel;
168 struct brw_reg r1 = brw_vec1_grf(1, 0);
169
170 if (mask == 0)
171 return;
172
173 assert(mask == WRITEMASK_XY);
174
175 if (intel->gen >= 6) {
176 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
177 Just add them with 0.0 for dst reg.. */
178 r1 = brw_imm_v(0x00000000);
179 brw_ADD(p,
180 dst[0],
181 retype(arg0[0], BRW_REGISTER_TYPE_UW),
182 r1);
183 brw_ADD(p,
184 dst[1],
185 retype(arg0[1], BRW_REGISTER_TYPE_UW),
186 r1);
187 return;
188 }
189
190 /* Calc delta X,Y by subtracting origin in r1 from the pixel
191 * centers produced by emit_pixel_xy().
192 */
193 brw_ADD(p,
194 dst[0],
195 retype(arg0[0], BRW_REGISTER_TYPE_UW),
196 negate(r1));
197 brw_ADD(p,
198 dst[1],
199 retype(arg0[1], BRW_REGISTER_TYPE_UW),
200 negate(suboffset(r1,1)));
201 }
202
203 /**
204 * Computes the pixel offset from the window origin for gl_FragCoord().
205 */
206 void emit_wpos_xy(struct brw_wm_compile *c,
207 const struct brw_reg *dst,
208 GLuint mask,
209 const struct brw_reg *arg0)
210 {
211 struct brw_compile *p = &c->func;
212 struct intel_context *intel = &p->brw->intel;
213 struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
214 struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
215
216 if (mask & WRITEMASK_X) {
217 if (intel->gen >= 6) {
218 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
219 brw_MOV(p, delta_x_f, delta_x);
220 delta_x = delta_x_f;
221 }
222
223 if (c->fp->program.PixelCenterInteger) {
224 /* X' = X */
225 brw_MOV(p, dst[0], delta_x);
226 } else {
227 /* X' = X + 0.5 */
228 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
229 }
230 }
231
232 if (mask & WRITEMASK_Y) {
233 if (intel->gen >= 6) {
234 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
235 brw_MOV(p, delta_y_f, delta_y);
236 delta_y = delta_y_f;
237 }
238
239 if (c->fp->program.OriginUpperLeft) {
240 if (c->fp->program.PixelCenterInteger) {
241 /* Y' = Y */
242 brw_MOV(p, dst[1], delta_y);
243 } else {
244 brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
245 }
246 } else {
247 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
248
249 /* Y' = (height - 1) - Y + center */
250 brw_ADD(p, dst[1], negate(delta_y),
251 brw_imm_f(c->key.drawable_height - 1 + center_offset));
252 }
253 }
254 }
255
256
257 void emit_pixel_w(struct brw_wm_compile *c,
258 const struct brw_reg *dst,
259 GLuint mask,
260 const struct brw_reg *arg0,
261 const struct brw_reg *deltas)
262 {
263 struct brw_compile *p = &c->func;
264 struct intel_context *intel = &p->brw->intel;
265 struct brw_reg src;
266 struct brw_reg temp_dst;
267
268 if (intel->gen >= 6)
269 temp_dst = dst[3];
270 else
271 temp_dst = brw_message_reg(2);
272
273 assert(intel->gen < 6);
274
275 /* Don't need this if all you are doing is interpolating color, for
276 * instance.
277 */
278 if (mask & WRITEMASK_W) {
279 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
280
281 /* Calc 1/w - just linterp wpos[3] optimized by putting the
282 * result straight into a message reg.
283 */
284 if (can_do_pln(intel, deltas)) {
285 brw_PLN(p, temp_dst, interp3, deltas[0]);
286 } else {
287 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
288 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
289 }
290
291 /* Calc w */
292 if (intel->gen >= 6)
293 src = temp_dst;
294 else
295 src = brw_null_reg();
296
297 if (c->dispatch_width == 16) {
298 brw_math_16(p, dst[3],
299 BRW_MATH_FUNCTION_INV,
300 2, src,
301 BRW_MATH_PRECISION_FULL);
302 } else {
303 brw_math(p, dst[3],
304 BRW_MATH_FUNCTION_INV,
305 2, src,
306 BRW_MATH_DATA_VECTOR,
307 BRW_MATH_PRECISION_FULL);
308 }
309 }
310 }
311
312 void emit_linterp(struct brw_compile *p,
313 const struct brw_reg *dst,
314 GLuint mask,
315 const struct brw_reg *arg0,
316 const struct brw_reg *deltas)
317 {
318 struct intel_context *intel = &p->brw->intel;
319 struct brw_reg interp[4];
320 GLuint nr = arg0[0].nr;
321 GLuint i;
322
323 interp[0] = brw_vec1_grf(nr, 0);
324 interp[1] = brw_vec1_grf(nr, 4);
325 interp[2] = brw_vec1_grf(nr+1, 0);
326 interp[3] = brw_vec1_grf(nr+1, 4);
327
328 for (i = 0; i < 4; i++) {
329 if (mask & (1<<i)) {
330 if (intel->gen >= 6) {
331 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
332 } else if (can_do_pln(intel, deltas)) {
333 brw_PLN(p, dst[i], interp[i], deltas[0]);
334 } else {
335 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
336 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
337 }
338 }
339 }
340 }
341
342
343 void emit_pinterp(struct brw_compile *p,
344 const struct brw_reg *dst,
345 GLuint mask,
346 const struct brw_reg *arg0,
347 const struct brw_reg *deltas,
348 const struct brw_reg *w)
349 {
350 struct intel_context *intel = &p->brw->intel;
351 struct brw_reg interp[4];
352 GLuint nr = arg0[0].nr;
353 GLuint i;
354
355 if (intel->gen >= 6) {
356 emit_linterp(p, dst, mask, arg0, interp);
357 return;
358 }
359
360 interp[0] = brw_vec1_grf(nr, 0);
361 interp[1] = brw_vec1_grf(nr, 4);
362 interp[2] = brw_vec1_grf(nr+1, 0);
363 interp[3] = brw_vec1_grf(nr+1, 4);
364
365 for (i = 0; i < 4; i++) {
366 if (mask & (1<<i)) {
367 if (can_do_pln(intel, deltas)) {
368 brw_PLN(p, dst[i], interp[i], deltas[0]);
369 } else {
370 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
371 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
372 }
373 }
374 }
375 for (i = 0; i < 4; i++) {
376 if (mask & (1<<i)) {
377 brw_MUL(p, dst[i], dst[i], w[3]);
378 }
379 }
380 }
381
382
383 void emit_cinterp(struct brw_compile *p,
384 const struct brw_reg *dst,
385 GLuint mask,
386 const struct brw_reg *arg0)
387 {
388 struct brw_reg interp[4];
389 GLuint nr = arg0[0].nr;
390 GLuint i;
391
392 interp[0] = brw_vec1_grf(nr, 0);
393 interp[1] = brw_vec1_grf(nr, 4);
394 interp[2] = brw_vec1_grf(nr+1, 0);
395 interp[3] = brw_vec1_grf(nr+1, 4);
396
397 for (i = 0; i < 4; i++) {
398 if (mask & (1<<i)) {
399 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
400 }
401 }
402 }
403
404 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
405 void emit_frontfacing(struct brw_compile *p,
406 const struct brw_reg *dst,
407 GLuint mask)
408 {
409 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
410 GLuint i;
411
412 if (!(mask & WRITEMASK_XYZW))
413 return;
414
415 for (i = 0; i < 4; i++) {
416 if (mask & (1<<i)) {
417 brw_MOV(p, dst[i], brw_imm_f(0.0));
418 }
419 }
420
421 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
422 * us front face
423 */
424 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
425 for (i = 0; i < 4; i++) {
426 if (mask & (1<<i)) {
427 brw_MOV(p, dst[i], brw_imm_f(1.0));
428 }
429 }
430 brw_set_predicate_control_flag_value(p, 0xff);
431 }
432
433 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
434 * looking like:
435 *
436 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
437 *
438 * and we're trying to produce:
439 *
440 * DDX DDY
441 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
442 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
443 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
444 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
445 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
446 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
447 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
448 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
449 *
450 * and add another set of two more subspans if in 16-pixel dispatch mode.
451 *
452 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
453 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
454 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
455 * between each other. We could probably do it like ddx and swizzle the right
456 * order later, but bail for now and just produce
457 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
458 *
459 * The negate_value boolean is used to negate the d/dy computation for FBOs,
460 * since they place the origin at the upper left instead of the lower left.
461 */
462 void emit_ddxy(struct brw_compile *p,
463 const struct brw_reg *dst,
464 GLuint mask,
465 bool is_ddx,
466 const struct brw_reg *arg0,
467 bool negate_value)
468 {
469 int i;
470 struct brw_reg src0, src1;
471
472 if (mask & SATURATE)
473 brw_set_saturate(p, 1);
474 for (i = 0; i < 4; i++ ) {
475 if (mask & (1<<i)) {
476 if (is_ddx) {
477 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
478 BRW_REGISTER_TYPE_F,
479 BRW_VERTICAL_STRIDE_2,
480 BRW_WIDTH_2,
481 BRW_HORIZONTAL_STRIDE_0,
482 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
483 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
484 BRW_REGISTER_TYPE_F,
485 BRW_VERTICAL_STRIDE_2,
486 BRW_WIDTH_2,
487 BRW_HORIZONTAL_STRIDE_0,
488 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
489 } else {
490 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
491 BRW_REGISTER_TYPE_F,
492 BRW_VERTICAL_STRIDE_4,
493 BRW_WIDTH_4,
494 BRW_HORIZONTAL_STRIDE_0,
495 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
496 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
497 BRW_REGISTER_TYPE_F,
498 BRW_VERTICAL_STRIDE_4,
499 BRW_WIDTH_4,
500 BRW_HORIZONTAL_STRIDE_0,
501 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
502 }
503 if (negate_value)
504 brw_ADD(p, dst[i], src1, negate(src0));
505 else
506 brw_ADD(p, dst[i], src0, negate(src1));
507 }
508 }
509 if (mask & SATURATE)
510 brw_set_saturate(p, 0);
511 }
512
513 void emit_alu1(struct brw_compile *p,
514 struct brw_instruction *(*func)(struct brw_compile *,
515 struct brw_reg,
516 struct brw_reg),
517 const struct brw_reg *dst,
518 GLuint mask,
519 const struct brw_reg *arg0)
520 {
521 GLuint i;
522
523 if (mask & SATURATE)
524 brw_set_saturate(p, 1);
525
526 for (i = 0; i < 4; i++) {
527 if (mask & (1<<i)) {
528 func(p, dst[i], arg0[i]);
529 }
530 }
531
532 if (mask & SATURATE)
533 brw_set_saturate(p, 0);
534 }
535
536
537 void emit_alu2(struct brw_compile *p,
538 struct brw_instruction *(*func)(struct brw_compile *,
539 struct brw_reg,
540 struct brw_reg,
541 struct brw_reg),
542 const struct brw_reg *dst,
543 GLuint mask,
544 const struct brw_reg *arg0,
545 const struct brw_reg *arg1)
546 {
547 GLuint i;
548
549 if (mask & SATURATE)
550 brw_set_saturate(p, 1);
551
552 for (i = 0; i < 4; i++) {
553 if (mask & (1<<i)) {
554 func(p, dst[i], arg0[i], arg1[i]);
555 }
556 }
557
558 if (mask & SATURATE)
559 brw_set_saturate(p, 0);
560 }
561
562
563 void emit_mad(struct brw_compile *p,
564 const struct brw_reg *dst,
565 GLuint mask,
566 const struct brw_reg *arg0,
567 const struct brw_reg *arg1,
568 const struct brw_reg *arg2)
569 {
570 GLuint i;
571
572 for (i = 0; i < 4; i++) {
573 if (mask & (1<<i)) {
574 brw_MUL(p, dst[i], arg0[i], arg1[i]);
575
576 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
577 brw_ADD(p, dst[i], dst[i], arg2[i]);
578 brw_set_saturate(p, 0);
579 }
580 }
581 }
582
583 void emit_lrp(struct brw_compile *p,
584 const struct brw_reg *dst,
585 GLuint mask,
586 const struct brw_reg *arg0,
587 const struct brw_reg *arg1,
588 const struct brw_reg *arg2)
589 {
590 GLuint i;
591
592 /* Uses dst as a temporary:
593 */
594 for (i = 0; i < 4; i++) {
595 if (mask & (1<<i)) {
596 /* Can I use the LINE instruction for this?
597 */
598 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
599 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
600
601 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
602 brw_MAC(p, dst[i], arg0[i], arg1[i]);
603 brw_set_saturate(p, 0);
604 }
605 }
606 }
607
608 void emit_sop(struct brw_compile *p,
609 const struct brw_reg *dst,
610 GLuint mask,
611 GLuint cond,
612 const struct brw_reg *arg0,
613 const struct brw_reg *arg1)
614 {
615 GLuint i;
616
617 for (i = 0; i < 4; i++) {
618 if (mask & (1<<i)) {
619 brw_push_insn_state(p);
620 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
621 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
622 brw_MOV(p, dst[i], brw_imm_f(0));
623 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
624 brw_MOV(p, dst[i], brw_imm_f(1.0));
625 brw_pop_insn_state(p);
626 }
627 }
628 }
629
630 static void emit_slt( struct brw_compile *p,
631 const struct brw_reg *dst,
632 GLuint mask,
633 const struct brw_reg *arg0,
634 const struct brw_reg *arg1 )
635 {
636 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
637 }
638
639 static void emit_sle( struct brw_compile *p,
640 const struct brw_reg *dst,
641 GLuint mask,
642 const struct brw_reg *arg0,
643 const struct brw_reg *arg1 )
644 {
645 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
646 }
647
648 static void emit_sgt( struct brw_compile *p,
649 const struct brw_reg *dst,
650 GLuint mask,
651 const struct brw_reg *arg0,
652 const struct brw_reg *arg1 )
653 {
654 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
655 }
656
657 static void emit_sge( struct brw_compile *p,
658 const struct brw_reg *dst,
659 GLuint mask,
660 const struct brw_reg *arg0,
661 const struct brw_reg *arg1 )
662 {
663 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
664 }
665
666 static void emit_seq( struct brw_compile *p,
667 const struct brw_reg *dst,
668 GLuint mask,
669 const struct brw_reg *arg0,
670 const struct brw_reg *arg1 )
671 {
672 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
673 }
674
675 static void emit_sne( struct brw_compile *p,
676 const struct brw_reg *dst,
677 GLuint mask,
678 const struct brw_reg *arg0,
679 const struct brw_reg *arg1 )
680 {
681 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
682 }
683
684 void emit_cmp(struct brw_compile *p,
685 const struct brw_reg *dst,
686 GLuint mask,
687 const struct brw_reg *arg0,
688 const struct brw_reg *arg1,
689 const struct brw_reg *arg2)
690 {
691 GLuint i;
692
693 for (i = 0; i < 4; i++) {
694 if (mask & (1<<i)) {
695 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
696
697 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
698 brw_SEL(p, dst[i], arg1[i], arg2[i]);
699 brw_set_saturate(p, 0);
700 brw_set_predicate_control_flag_value(p, 0xff);
701 }
702 }
703 }
704
705 void emit_sign(struct brw_compile *p,
706 const struct brw_reg *dst,
707 GLuint mask,
708 const struct brw_reg *arg0)
709 {
710 GLuint i;
711
712 for (i = 0; i < 4; i++) {
713 if (mask & (1<<i)) {
714 brw_MOV(p, dst[i], brw_imm_f(0.0));
715
716 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
717 brw_MOV(p, dst[i], brw_imm_f(-1.0));
718 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
719
720 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
721 brw_MOV(p, dst[i], brw_imm_f(1.0));
722 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
723 }
724 }
725 }
726
727 void emit_max(struct brw_compile *p,
728 const struct brw_reg *dst,
729 GLuint mask,
730 const struct brw_reg *arg0,
731 const struct brw_reg *arg1)
732 {
733 GLuint i;
734
735 for (i = 0; i < 4; i++) {
736 if (mask & (1<<i)) {
737 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
738
739 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
740 brw_SEL(p, dst[i], arg0[i], arg1[i]);
741 brw_set_saturate(p, 0);
742 brw_set_predicate_control_flag_value(p, 0xff);
743 }
744 }
745 }
746
747 void emit_min(struct brw_compile *p,
748 const struct brw_reg *dst,
749 GLuint mask,
750 const struct brw_reg *arg0,
751 const struct brw_reg *arg1)
752 {
753 GLuint i;
754
755 for (i = 0; i < 4; i++) {
756 if (mask & (1<<i)) {
757 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
758
759 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
760 brw_SEL(p, dst[i], arg0[i], arg1[i]);
761 brw_set_saturate(p, 0);
762 brw_set_predicate_control_flag_value(p, 0xff);
763 }
764 }
765 }
766
767
768 void emit_dp2(struct brw_compile *p,
769 const struct brw_reg *dst,
770 GLuint mask,
771 const struct brw_reg *arg0,
772 const struct brw_reg *arg1)
773 {
774 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
775
776 if (!(mask & WRITEMASK_XYZW))
777 return; /* Do not emit dead code */
778
779 assert(is_power_of_two(mask & WRITEMASK_XYZW));
780
781 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
782
783 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
784 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
785 brw_set_saturate(p, 0);
786 }
787
788
789 void emit_dp3(struct brw_compile *p,
790 const struct brw_reg *dst,
791 GLuint mask,
792 const struct brw_reg *arg0,
793 const struct brw_reg *arg1)
794 {
795 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
796
797 if (!(mask & WRITEMASK_XYZW))
798 return; /* Do not emit dead code */
799
800 assert(is_power_of_two(mask & WRITEMASK_XYZW));
801
802 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
803 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
804
805 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
806 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
807 brw_set_saturate(p, 0);
808 }
809
810
811 void emit_dp4(struct brw_compile *p,
812 const struct brw_reg *dst,
813 GLuint mask,
814 const struct brw_reg *arg0,
815 const struct brw_reg *arg1)
816 {
817 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
818
819 if (!(mask & WRITEMASK_XYZW))
820 return; /* Do not emit dead code */
821
822 assert(is_power_of_two(mask & WRITEMASK_XYZW));
823
824 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
825 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
826 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
827
828 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
829 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
830 brw_set_saturate(p, 0);
831 }
832
833
834 void emit_dph(struct brw_compile *p,
835 const struct brw_reg *dst,
836 GLuint mask,
837 const struct brw_reg *arg0,
838 const struct brw_reg *arg1)
839 {
840 const int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
841
842 if (!(mask & WRITEMASK_XYZW))
843 return; /* Do not emit dead code */
844
845 assert(is_power_of_two(mask & WRITEMASK_XYZW));
846
847 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
848 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
849 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
850
851 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
852 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
853 brw_set_saturate(p, 0);
854 }
855
856
857 void emit_xpd(struct brw_compile *p,
858 const struct brw_reg *dst,
859 GLuint mask,
860 const struct brw_reg *arg0,
861 const struct brw_reg *arg1)
862 {
863 GLuint i;
864
865 assert((mask & WRITEMASK_W) != WRITEMASK_W);
866
867 for (i = 0 ; i < 3; i++) {
868 if (mask & (1<<i)) {
869 GLuint i2 = (i+2)%3;
870 GLuint i1 = (i+1)%3;
871
872 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
873
874 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
875 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
876 brw_set_saturate(p, 0);
877 }
878 }
879 }
880
881
882 void emit_math1(struct brw_wm_compile *c,
883 GLuint function,
884 const struct brw_reg *dst,
885 GLuint mask,
886 const struct brw_reg *arg0)
887 {
888 struct brw_compile *p = &c->func;
889 struct intel_context *intel = &p->brw->intel;
890 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
891 struct brw_reg src;
892
893 if (!(mask & WRITEMASK_XYZW))
894 return; /* Do not emit dead code */
895
896 assert(is_power_of_two(mask & WRITEMASK_XYZW));
897
898 if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
899 arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
900 arg0[0].negate || arg0[0].abs)) {
901 /* Gen6 math requires that source and dst horizontal stride be 1,
902 * and that the argument be in the GRF.
903 *
904 * The hardware ignores source modifiers (negate and abs) on math
905 * instructions, so we also move to a temp to set those up.
906 */
907 src = dst[dst_chan];
908 brw_MOV(p, src, arg0[0]);
909 } else {
910 src = arg0[0];
911 }
912
913 /* Send two messages to perform all 16 operations:
914 */
915 brw_push_insn_state(p);
916 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
917 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
918 brw_math(p,
919 dst[dst_chan],
920 function,
921 2,
922 src,
923 BRW_MATH_DATA_VECTOR,
924 BRW_MATH_PRECISION_FULL);
925
926 if (c->dispatch_width == 16) {
927 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
928 brw_math(p,
929 offset(dst[dst_chan],1),
930 function,
931 3,
932 sechalf(src),
933 BRW_MATH_DATA_VECTOR,
934 BRW_MATH_PRECISION_FULL);
935 }
936 brw_pop_insn_state(p);
937 }
938
939
940 void emit_math2(struct brw_wm_compile *c,
941 GLuint function,
942 const struct brw_reg *dst,
943 GLuint mask,
944 const struct brw_reg *arg0,
945 const struct brw_reg *arg1)
946 {
947 struct brw_compile *p = &c->func;
948 struct intel_context *intel = &p->brw->intel;
949 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
950
951 if (!(mask & WRITEMASK_XYZW))
952 return; /* Do not emit dead code */
953
954 assert(is_power_of_two(mask & WRITEMASK_XYZW));
955
956 brw_push_insn_state(p);
957
958 /* math can only operate on up to a vec8 at a time, so in
959 * dispatch_width==16 we have to do the second half manually.
960 */
961 if (intel->gen >= 6) {
962 struct brw_reg src0 = arg0[0];
963 struct brw_reg src1 = arg1[0];
964 struct brw_reg temp_dst = dst[dst_chan];
965
966 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
967 brw_MOV(p, temp_dst, src0);
968 src0 = temp_dst;
969 }
970
971 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
972 /* This is a heinous hack to get a temporary register for use
973 * in case both arg0 and arg1 are constants. Why you're
974 * doing exponentiation on constant values in the shader, we
975 * don't know.
976 *
977 * max_wm_grf is almost surely less than the maximum GRF, and
978 * gen6 doesn't care about the number of GRFs used in a
979 * shader like pre-gen6 did.
980 */
981 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
982 brw_MOV(p, temp, src1);
983 src1 = temp;
984 }
985
986 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
987 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
988 brw_math2(p,
989 temp_dst,
990 function,
991 src0,
992 src1);
993 if (c->dispatch_width == 16) {
994 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
995 brw_math2(p,
996 sechalf(temp_dst),
997 function,
998 sechalf(src0),
999 sechalf(src1));
1000 }
1001 } else {
1002 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1003 brw_MOV(p, brw_message_reg(3), arg1[0]);
1004 if (c->dispatch_width == 16) {
1005 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1006 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1007 }
1008
1009 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1010 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1011 brw_math(p,
1012 dst[dst_chan],
1013 function,
1014 2,
1015 arg0[0],
1016 BRW_MATH_DATA_VECTOR,
1017 BRW_MATH_PRECISION_FULL);
1018
1019 /* Send two messages to perform all 16 operations:
1020 */
1021 if (c->dispatch_width == 16) {
1022 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1023 brw_math(p,
1024 offset(dst[dst_chan],1),
1025 function,
1026 4,
1027 sechalf(arg0[0]),
1028 BRW_MATH_DATA_VECTOR,
1029 BRW_MATH_PRECISION_FULL);
1030 }
1031 }
1032 brw_pop_insn_state(p);
1033 }
1034
1035
1036 void emit_tex(struct brw_wm_compile *c,
1037 struct brw_reg *dst,
1038 GLuint dst_flags,
1039 struct brw_reg *arg,
1040 struct brw_reg depth_payload,
1041 GLuint tex_idx,
1042 GLuint sampler,
1043 bool shadow)
1044 {
1045 struct brw_compile *p = &c->func;
1046 struct intel_context *intel = &p->brw->intel;
1047 struct brw_reg dst_retyped;
1048 GLuint cur_mrf = 2, response_length;
1049 GLuint i, nr_texcoords;
1050 GLuint emit;
1051 GLuint msg_type;
1052 GLuint mrf_per_channel;
1053 GLuint simd_mode;
1054
1055 if (c->dispatch_width == 16) {
1056 mrf_per_channel = 2;
1057 response_length = 8;
1058 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1059 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1060 } else {
1061 mrf_per_channel = 1;
1062 response_length = 4;
1063 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1064 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1065 }
1066
1067 /* How many input regs are there?
1068 */
1069 switch (tex_idx) {
1070 case TEXTURE_1D_INDEX:
1071 emit = WRITEMASK_X;
1072 nr_texcoords = 1;
1073 break;
1074 case TEXTURE_2D_INDEX:
1075 case TEXTURE_1D_ARRAY_INDEX:
1076 case TEXTURE_RECT_INDEX:
1077 case TEXTURE_EXTERNAL_INDEX:
1078 emit = WRITEMASK_XY;
1079 nr_texcoords = 2;
1080 break;
1081 case TEXTURE_3D_INDEX:
1082 case TEXTURE_2D_ARRAY_INDEX:
1083 case TEXTURE_CUBE_INDEX:
1084 emit = WRITEMASK_XYZ;
1085 nr_texcoords = 3;
1086 break;
1087 default:
1088 /* unexpected target */
1089 abort();
1090 }
1091
1092 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1093 if (intel->gen < 5 && c->dispatch_width == 8)
1094 nr_texcoords = 3;
1095
1096 if (shadow) {
1097 if (intel->gen < 7) {
1098 /* For shadow comparisons, we have to supply u,v,r. */
1099 nr_texcoords = 3;
1100 } else {
1101 /* On Ivybridge, the shadow comparitor comes first. Just load it. */
1102 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1103 cur_mrf += mrf_per_channel;
1104 }
1105 }
1106
1107 /* Emit the texcoords. */
1108 for (i = 0; i < nr_texcoords; i++) {
1109 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler))
1110 brw_set_saturate(p, true);
1111
1112 if (emit & (1<<i))
1113 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1114 else
1115 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1116 cur_mrf += mrf_per_channel;
1117
1118 brw_set_saturate(p, false);
1119 }
1120
1121 /* Fill in the shadow comparison reference value. */
1122 if (shadow && intel->gen < 7) {
1123 if (intel->gen >= 5) {
1124 /* Fill in the cube map array index value. */
1125 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1126 cur_mrf += mrf_per_channel;
1127 } else if (c->dispatch_width == 8) {
1128 /* Fill in the LOD bias value. */
1129 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1130 cur_mrf += mrf_per_channel;
1131 }
1132 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1133 cur_mrf += mrf_per_channel;
1134 }
1135
1136 if (intel->gen >= 5) {
1137 if (shadow)
1138 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1139 else
1140 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
1141 } else {
1142 /* Note that G45 and older determines shadow compare and dispatch width
1143 * from message length for most messages.
1144 */
1145 if (c->dispatch_width == 16 && shadow)
1146 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1147 else
1148 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1149 }
1150
1151 brw_SAMPLE(p,
1152 dst_retyped,
1153 1,
1154 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1155 SURF_INDEX_TEXTURE(sampler),
1156 sampler,
1157 dst_flags & WRITEMASK_XYZW,
1158 msg_type,
1159 response_length,
1160 cur_mrf - 1,
1161 1,
1162 simd_mode,
1163 BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
1164 }
1165
1166
1167 void emit_txb(struct brw_wm_compile *c,
1168 struct brw_reg *dst,
1169 GLuint dst_flags,
1170 struct brw_reg *arg,
1171 struct brw_reg depth_payload,
1172 GLuint tex_idx,
1173 GLuint sampler)
1174 {
1175 struct brw_compile *p = &c->func;
1176 struct intel_context *intel = &p->brw->intel;
1177 GLuint msgLength;
1178 GLuint msg_type;
1179 GLuint mrf_per_channel;
1180 GLuint response_length;
1181 struct brw_reg dst_retyped;
1182
1183 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1184 * samples, so we'll use the 16-wide instruction, leave the second halves
1185 * undefined, and trust the execution mask to keep the undefined pixels
1186 * from mattering.
1187 */
1188 if (c->dispatch_width == 16 || intel->gen < 5) {
1189 if (intel->gen >= 5)
1190 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1191 else
1192 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1193 mrf_per_channel = 2;
1194 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1195 response_length = 8;
1196 } else {
1197 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1198 mrf_per_channel = 1;
1199 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1200 response_length = 4;
1201 }
1202
1203 /* Shadow ignored for txb. */
1204 switch (tex_idx) {
1205 case TEXTURE_1D_INDEX:
1206 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1207 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1208 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1209 break;
1210 case TEXTURE_2D_INDEX:
1211 case TEXTURE_RECT_INDEX:
1212 case TEXTURE_EXTERNAL_INDEX:
1213 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1214 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1215 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1216 break;
1217 case TEXTURE_3D_INDEX:
1218 case TEXTURE_CUBE_INDEX:
1219 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1220 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1221 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1222 break;
1223 default:
1224 /* unexpected target */
1225 abort();
1226 }
1227
1228 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1229 msgLength = 2 + 4 * mrf_per_channel - 1;
1230
1231 brw_SAMPLE(p,
1232 dst_retyped,
1233 1,
1234 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1235 SURF_INDEX_TEXTURE(sampler),
1236 sampler,
1237 dst_flags & WRITEMASK_XYZW,
1238 msg_type,
1239 response_length,
1240 msgLength,
1241 1,
1242 BRW_SAMPLER_SIMD_MODE_SIMD16,
1243 BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
1244 }
1245
1246
1247 static void emit_lit(struct brw_wm_compile *c,
1248 const struct brw_reg *dst,
1249 GLuint mask,
1250 const struct brw_reg *arg0)
1251 {
1252 struct brw_compile *p = &c->func;
1253
1254 assert((mask & WRITEMASK_XW) == 0);
1255
1256 if (mask & WRITEMASK_Y) {
1257 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1258 brw_MOV(p, dst[1], arg0[0]);
1259 brw_set_saturate(p, 0);
1260 }
1261
1262 if (mask & WRITEMASK_Z) {
1263 emit_math2(c, BRW_MATH_FUNCTION_POW,
1264 &dst[2],
1265 WRITEMASK_X | (mask & SATURATE),
1266 &arg0[1],
1267 &arg0[3]);
1268 }
1269
1270 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1271 * some of the POW calculations above, but 16-wide iff statements
1272 * seem to lock c1 hardware, so this is a nasty workaround:
1273 */
1274 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1275 {
1276 if (mask & WRITEMASK_Y)
1277 brw_MOV(p, dst[1], brw_imm_f(0));
1278
1279 if (mask & WRITEMASK_Z)
1280 brw_MOV(p, dst[2], brw_imm_f(0));
1281 }
1282 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1283 }
1284
1285
1286 /* Kill pixel - set execution mask to zero for those pixels which
1287 * fail.
1288 */
1289 static void emit_kil( struct brw_wm_compile *c,
1290 struct brw_reg *arg0)
1291 {
1292 struct brw_compile *p = &c->func;
1293 struct intel_context *intel = &p->brw->intel;
1294 struct brw_reg pixelmask;
1295 GLuint i, j;
1296
1297 if (intel->gen >= 6)
1298 pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1299 else
1300 pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1301
1302 for (i = 0; i < 4; i++) {
1303 /* Check if we've already done the comparison for this reg
1304 * -- common when someone does KIL TEMP.wwww.
1305 */
1306 for (j = 0; j < i; j++) {
1307 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1308 break;
1309 }
1310 if (j != i)
1311 continue;
1312
1313 brw_push_insn_state(p);
1314 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1315 brw_set_predicate_control_flag_value(p, 0xff);
1316 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1317 brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1318 brw_pop_insn_state(p);
1319 }
1320 }
1321
1322 static void fire_fb_write( struct brw_wm_compile *c,
1323 GLuint base_reg,
1324 GLuint nr,
1325 GLuint target,
1326 GLuint eot )
1327 {
1328 struct brw_compile *p = &c->func;
1329 struct intel_context *intel = &p->brw->intel;
1330 uint32_t msg_control;
1331
1332 /* Pass through control information:
1333 *
1334 * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1335 */
1336 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1337 if (intel->gen < 6)
1338 {
1339 brw_push_insn_state(p);
1340 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1341 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1342 brw_MOV(p,
1343 brw_message_reg(base_reg + 1),
1344 brw_vec8_grf(1, 0));
1345 brw_pop_insn_state(p);
1346 }
1347
1348 if (c->dispatch_width == 16)
1349 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1350 else
1351 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1352
1353 /* Send framebuffer write message: */
1354 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1355 brw_fb_WRITE(p,
1356 c->dispatch_width,
1357 base_reg,
1358 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1359 msg_control,
1360 target,
1361 nr,
1362 0,
1363 eot,
1364 true);
1365 }
1366
1367
1368 static void emit_aa( struct brw_wm_compile *c,
1369 struct brw_reg *arg1,
1370 GLuint reg )
1371 {
1372 struct brw_compile *p = &c->func;
1373 GLuint comp = c->aa_dest_stencil_reg / 2;
1374 GLuint off = c->aa_dest_stencil_reg % 2;
1375 struct brw_reg aa = offset(arg1[comp], off);
1376
1377 brw_push_insn_state(p);
1378 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1379 brw_MOV(p, brw_message_reg(reg), aa);
1380 brw_pop_insn_state(p);
1381 }
1382
1383
1384 /* Post-fragment-program processing. Send the results to the
1385 * framebuffer.
1386 * \param arg0 the fragment color
1387 * \param arg1 the pass-through depth value
1388 * \param arg2 the shader-computed depth value
1389 */
1390 void emit_fb_write(struct brw_wm_compile *c,
1391 struct brw_reg *arg0,
1392 struct brw_reg *arg1,
1393 struct brw_reg *arg2,
1394 GLuint target,
1395 GLuint eot)
1396 {
1397 struct brw_compile *p = &c->func;
1398 struct brw_context *brw = p->brw;
1399 struct intel_context *intel = &brw->intel;
1400 GLuint nr = 2;
1401 GLuint channel;
1402
1403 /* Reserve a space for AA - may not be needed:
1404 */
1405 if (c->aa_dest_stencil_reg)
1406 nr += 1;
1407
1408 /* I don't really understand how this achieves the color interleave
1409 * (ie RGBARGBA) in the result: [Do the saturation here]
1410 */
1411 brw_push_insn_state(p);
1412
1413 if (c->key.clamp_fragment_color)
1414 brw_set_saturate(p, 1);
1415
1416 for (channel = 0; channel < 4; channel++) {
1417 if (intel->gen >= 6) {
1418 /* gen6 SIMD16 single source DP write looks like:
1419 * m + 0: r0
1420 * m + 1: r1
1421 * m + 2: g0
1422 * m + 3: g1
1423 * m + 4: b0
1424 * m + 5: b1
1425 * m + 6: a0
1426 * m + 7: a1
1427 */
1428 if (c->dispatch_width == 16) {
1429 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1430 } else {
1431 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1432 }
1433 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1434 /* pre-gen6 SIMD16 single source DP write looks like:
1435 * m + 0: r0
1436 * m + 1: g0
1437 * m + 2: b0
1438 * m + 3: a0
1439 * m + 4: r1
1440 * m + 5: g1
1441 * m + 6: b1
1442 * m + 7: a1
1443 *
1444 * By setting the high bit of the MRF register number, we indicate
1445 * that we want COMPR4 mode - instead of doing the usual destination
1446 * + 1 for the second half we get destination + 4.
1447 */
1448 brw_MOV(p,
1449 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1450 arg0[channel]);
1451 } else {
1452 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1453 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1454 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1455 brw_MOV(p,
1456 brw_message_reg(nr + channel),
1457 arg0[channel]);
1458
1459 if (c->dispatch_width == 16) {
1460 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1461 brw_MOV(p,
1462 brw_message_reg(nr + channel + 4),
1463 sechalf(arg0[channel]));
1464 }
1465 }
1466 }
1467
1468 brw_set_saturate(p, 0);
1469
1470 /* skip over the regs populated above:
1471 */
1472 if (c->dispatch_width == 16)
1473 nr += 8;
1474 else
1475 nr += 4;
1476
1477 brw_pop_insn_state(p);
1478
1479 if (c->source_depth_to_render_target)
1480 {
1481 if (c->computes_depth)
1482 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1483 else
1484 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1485
1486 nr += 2;
1487 }
1488
1489 if (c->dest_depth_reg)
1490 {
1491 GLuint comp = c->dest_depth_reg / 2;
1492 GLuint off = c->dest_depth_reg % 2;
1493
1494 if (off != 0) {
1495 brw_push_insn_state(p);
1496 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1497
1498 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1499 /* 2nd half? */
1500 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1501 brw_pop_insn_state(p);
1502 }
1503 else {
1504 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1505 }
1506 nr += 2;
1507 }
1508
1509 if (intel->gen >= 6) {
1510 /* Load the message header. There's no implied move from src0
1511 * to the base mrf on gen6.
1512 */
1513 brw_push_insn_state(p);
1514 brw_set_mask_control(p, BRW_MASK_DISABLE);
1515 brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1516 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1517 brw_pop_insn_state(p);
1518
1519 if (target != 0) {
1520 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1521 0,
1522 2), BRW_REGISTER_TYPE_UD),
1523 brw_imm_ud(target));
1524 }
1525 }
1526
1527 if (!c->runtime_check_aads_emit) {
1528 if (c->aa_dest_stencil_reg)
1529 emit_aa(c, arg1, 2);
1530
1531 fire_fb_write(c, 0, nr, target, eot);
1532 }
1533 else {
1534 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1535 struct brw_reg ip = brw_ip_reg();
1536 int jmp;
1537
1538 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1539 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1540 brw_AND(p,
1541 v1_null_ud,
1542 get_element_ud(brw_vec8_grf(1,0), 6),
1543 brw_imm_ud(1<<26));
1544
1545 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0)) - p->store;
1546 {
1547 emit_aa(c, arg1, 2);
1548 fire_fb_write(c, 0, nr, target, eot);
1549 /* note - thread killed in subroutine */
1550 }
1551 brw_land_fwd_jump(p, jmp);
1552
1553 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1554 */
1555 fire_fb_write(c, 1, nr-1, target, eot);
1556 }
1557 }
1558
1559 /**
1560 * Move a GPR to scratch memory.
1561 */
1562 static void emit_spill( struct brw_wm_compile *c,
1563 struct brw_reg reg,
1564 GLuint slot )
1565 {
1566 struct brw_compile *p = &c->func;
1567
1568 /*
1569 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1570 */
1571 brw_MOV(p, brw_message_reg(2), reg);
1572
1573 /*
1574 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1575 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1576 */
1577 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1578 }
1579
1580
1581 /**
1582 * Load a GPR from scratch memory.
1583 */
1584 static void emit_unspill( struct brw_wm_compile *c,
1585 struct brw_reg reg,
1586 GLuint slot )
1587 {
1588 struct brw_compile *p = &c->func;
1589
1590 /* Slot 0 is the undef value.
1591 */
1592 if (slot == 0) {
1593 brw_MOV(p, reg, brw_imm_f(0));
1594 return;
1595 }
1596
1597 /*
1598 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1599 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1600 */
1601
1602 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1603 }
1604
1605
1606 /**
1607 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1608 * Args with unspill_reg != 0 will be loaded from scratch memory.
1609 */
1610 static void get_argument_regs( struct brw_wm_compile *c,
1611 struct brw_wm_ref *arg[],
1612 struct brw_reg *regs )
1613 {
1614 GLuint i;
1615
1616 for (i = 0; i < 4; i++) {
1617 if (arg[i]) {
1618 if (arg[i]->unspill_reg)
1619 emit_unspill(c,
1620 brw_vec8_grf(arg[i]->unspill_reg, 0),
1621 arg[i]->value->spill_slot);
1622
1623 regs[i] = arg[i]->hw_reg;
1624 }
1625 else {
1626 regs[i] = brw_null_reg();
1627 }
1628 }
1629 }
1630
1631
1632 /**
1633 * For values that have a spill_slot!=0, write those regs to scratch memory.
1634 */
1635 static void spill_values( struct brw_wm_compile *c,
1636 struct brw_wm_value *values,
1637 GLuint nr )
1638 {
1639 GLuint i;
1640
1641 for (i = 0; i < nr; i++)
1642 if (values[i].spill_slot)
1643 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1644 }
1645
1646
1647 /* Emit the fragment program instructions here.
1648 */
1649 void brw_wm_emit( struct brw_wm_compile *c )
1650 {
1651 struct brw_compile *p = &c->func;
1652 struct intel_context *intel = &p->brw->intel;
1653 GLuint insn;
1654
1655 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1656 if (intel->gen >= 6)
1657 brw_set_acc_write_control(p, 1);
1658
1659 /* Check if any of the payload regs need to be spilled:
1660 */
1661 spill_values(c, c->payload.depth, 4);
1662 spill_values(c, c->creg, c->nr_creg);
1663 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1664
1665
1666 for (insn = 0; insn < c->nr_insns; insn++) {
1667
1668 struct brw_wm_instruction *inst = &c->instruction[insn];
1669 struct brw_reg args[3][4], dst[4];
1670 GLuint i, dst_flags;
1671
1672 /* Get argument regs:
1673 */
1674 for (i = 0; i < 3; i++)
1675 get_argument_regs(c, inst->src[i], args[i]);
1676
1677 /* Get dest regs:
1678 */
1679 for (i = 0; i < 4; i++)
1680 if (inst->dst[i])
1681 dst[i] = inst->dst[i]->hw_reg;
1682 else
1683 dst[i] = brw_null_reg();
1684
1685 /* Flags
1686 */
1687 dst_flags = inst->writemask;
1688 if (inst->saturate)
1689 dst_flags |= SATURATE;
1690
1691 switch (inst->opcode) {
1692 /* Generated instructions for calculating triangle interpolants:
1693 */
1694 case WM_PIXELXY:
1695 emit_pixel_xy(c, dst, dst_flags);
1696 break;
1697
1698 case WM_DELTAXY:
1699 emit_delta_xy(p, dst, dst_flags, args[0]);
1700 break;
1701
1702 case WM_WPOSXY:
1703 emit_wpos_xy(c, dst, dst_flags, args[0]);
1704 break;
1705
1706 case WM_PIXELW:
1707 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1708 break;
1709
1710 case WM_LINTERP:
1711 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1712 break;
1713
1714 case WM_PINTERP:
1715 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1716 break;
1717
1718 case WM_CINTERP:
1719 emit_cinterp(p, dst, dst_flags, args[0]);
1720 break;
1721
1722 case WM_FB_WRITE:
1723 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1724 break;
1725
1726 case WM_FRONTFACING:
1727 emit_frontfacing(p, dst, dst_flags);
1728 break;
1729
1730 /* Straightforward arithmetic:
1731 */
1732 case OPCODE_ADD:
1733 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1734 break;
1735
1736 case OPCODE_FRC:
1737 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1738 break;
1739
1740 case OPCODE_FLR:
1741 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1742 break;
1743
1744 case OPCODE_DDX:
1745 emit_ddxy(p, dst, dst_flags, true, args[0], false);
1746 break;
1747
1748 case OPCODE_DDY:
1749 /* Make sure fp->program.UsesDFdy flag got set (otherwise there's no
1750 * guarantee that c->key.render_to_fbo is set).
1751 */
1752 assert(c->fp->program.UsesDFdy);
1753 emit_ddxy(p, dst, dst_flags, false, args[0], c->key.render_to_fbo);
1754 break;
1755
1756 case OPCODE_DP2:
1757 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1758 break;
1759
1760 case OPCODE_DP3:
1761 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1762 break;
1763
1764 case OPCODE_DP4:
1765 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1766 break;
1767
1768 case OPCODE_DPH:
1769 emit_dph(p, dst, dst_flags, args[0], args[1]);
1770 break;
1771
1772 case OPCODE_TRUNC:
1773 for (i = 0; i < 4; i++) {
1774 if (dst_flags & (1<<i)) {
1775 brw_RNDZ(p, dst[i], args[0][i]);
1776 }
1777 }
1778 break;
1779
1780 case OPCODE_LRP:
1781 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1782 break;
1783
1784 case OPCODE_MAD:
1785 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1786 break;
1787
1788 case OPCODE_MOV:
1789 case OPCODE_SWZ:
1790 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1791 break;
1792
1793 case OPCODE_MUL:
1794 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1795 break;
1796
1797 case OPCODE_XPD:
1798 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1799 break;
1800
1801 /* Higher math functions:
1802 */
1803 case OPCODE_RCP:
1804 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1805 break;
1806
1807 case OPCODE_RSQ:
1808 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1809 break;
1810
1811 case OPCODE_SIN:
1812 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1813 break;
1814
1815 case OPCODE_COS:
1816 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1817 break;
1818
1819 case OPCODE_EX2:
1820 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1821 break;
1822
1823 case OPCODE_LG2:
1824 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1825 break;
1826
1827 case OPCODE_SCS:
1828 /* There is an scs math function, but it would need some
1829 * fixup for 16-element execution.
1830 */
1831 if (dst_flags & WRITEMASK_X)
1832 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1833 if (dst_flags & WRITEMASK_Y)
1834 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1835 break;
1836
1837 case OPCODE_POW:
1838 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1839 break;
1840
1841 /* Comparisons:
1842 */
1843 case OPCODE_CMP:
1844 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1845 break;
1846
1847 case OPCODE_MAX:
1848 emit_max(p, dst, dst_flags, args[0], args[1]);
1849 break;
1850
1851 case OPCODE_MIN:
1852 emit_min(p, dst, dst_flags, args[0], args[1]);
1853 break;
1854
1855 case OPCODE_SLT:
1856 emit_slt(p, dst, dst_flags, args[0], args[1]);
1857 break;
1858
1859 case OPCODE_SLE:
1860 emit_sle(p, dst, dst_flags, args[0], args[1]);
1861 break;
1862 case OPCODE_SGT:
1863 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1864 break;
1865 case OPCODE_SGE:
1866 emit_sge(p, dst, dst_flags, args[0], args[1]);
1867 break;
1868 case OPCODE_SEQ:
1869 emit_seq(p, dst, dst_flags, args[0], args[1]);
1870 break;
1871 case OPCODE_SNE:
1872 emit_sne(p, dst, dst_flags, args[0], args[1]);
1873 break;
1874
1875 case OPCODE_SSG:
1876 emit_sign(p, dst, dst_flags, args[0]);
1877 break;
1878
1879 case OPCODE_LIT:
1880 emit_lit(c, dst, dst_flags, args[0]);
1881 break;
1882
1883 /* Texturing operations:
1884 */
1885 case OPCODE_TEX:
1886 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1887 inst->tex_idx, inst->tex_unit,
1888 inst->tex_shadow);
1889 break;
1890
1891 case OPCODE_TXB:
1892 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1893 inst->tex_idx, inst->tex_unit);
1894 break;
1895
1896 case OPCODE_KIL:
1897 emit_kil(c, args[0]);
1898 break;
1899
1900 default:
1901 printf("Unsupported opcode %i (%s) in fragment shader\n",
1902 inst->opcode, inst->opcode < MAX_OPCODE ?
1903 _mesa_opcode_string(inst->opcode) :
1904 "unknown");
1905 }
1906
1907 for (i = 0; i < 4; i++)
1908 if (inst->dst[i] && inst->dst[i]->spill_slot)
1909 emit_spill(c,
1910 inst->dst[i]->hw_reg,
1911 inst->dst[i]->spill_slot);
1912 }
1913
1914 /* Only properly tested on ILK */
1915 if (p->brw->intel.gen == 5) {
1916 brw_remove_duplicate_mrf_moves(p);
1917 if (c->dispatch_width == 16)
1918 brw_remove_grf_to_mrf_moves(p);
1919 }
1920
1921 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1922 printf("wm-native:\n");
1923 brw_dump_compile(p, stdout, 0, p->next_insn_offset);
1924 printf("\n");
1925 }
1926 }
1927