d10e1c70d28cef5581ec459de74bfe84e27d7b75
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return the SrcReg index of the channels that can be immediate float operands
65 * instead of usage of PROGRAM_CONSTANT values through push/pull.
66 */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_XPD] = 2,
87 };
88
89 /* These opcodes get broken down in a way that allow two
90 * args to be immediates.
91 */
92 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
93 if (arg == 1 || arg == 2)
94 return GL_TRUE;
95 }
96
97 if (opcode > ARRAY_SIZE(opcode_array))
98 return GL_FALSE;
99
100 return arg == opcode_array[opcode] - 1;
101 }
102
103 /**
104 * Computes the screen-space x,y position of the pixels.
105 *
106 * This will be used by emit_delta_xy() or emit_wpos_xy() for
107 * interpolation of attributes..
108 *
109 * Payload R0:
110 *
111 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
112 * corresponding to each of the 16 execution channels.
113 * R0.1..8 -- ?
114 * R1.0 -- triangle vertex 0.X
115 * R1.1 -- triangle vertex 0.Y
116 * R1.2 -- tile 0 x,y coords (2 packed uwords)
117 * R1.3 -- tile 1 x,y coords (2 packed uwords)
118 * R1.4 -- tile 2 x,y coords (2 packed uwords)
119 * R1.5 -- tile 3 x,y coords (2 packed uwords)
120 * R1.6 -- ?
121 * R1.7 -- ?
122 * R1.8 -- ?
123 */
124 void emit_pixel_xy(struct brw_wm_compile *c,
125 const struct brw_reg *dst,
126 GLuint mask)
127 {
128 struct brw_compile *p = &c->func;
129 struct brw_reg r1 = brw_vec1_grf(1, 0);
130 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
131 struct brw_reg dst0_uw, dst1_uw;
132
133 brw_push_insn_state(p);
134 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
135
136 if (c->dispatch_width == 16) {
137 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
138 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
139 } else {
140 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
141 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
142 }
143
144 /* Calculate pixel centers by adding 1 or 0 to each of the
145 * micro-tile coordinates passed in r1.
146 */
147 if (mask & WRITEMASK_X) {
148 brw_ADD(p,
149 dst0_uw,
150 stride(suboffset(r1_uw, 4), 2, 4, 0),
151 brw_imm_v(0x10101010));
152 }
153
154 if (mask & WRITEMASK_Y) {
155 brw_ADD(p,
156 dst1_uw,
157 stride(suboffset(r1_uw,5), 2, 4, 0),
158 brw_imm_v(0x11001100));
159 }
160 brw_pop_insn_state(p);
161 }
162
163 /**
164 * Computes the screen-space x,y distance of the pixels from the start
165 * vertex.
166 *
167 * This will be used in linterp or pinterp with the start vertex value
168 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
169 * to produce interpolated attribute values.
170 */
171 void emit_delta_xy(struct brw_compile *p,
172 const struct brw_reg *dst,
173 GLuint mask,
174 const struct brw_reg *arg0)
175 {
176 struct brw_reg r1 = brw_vec1_grf(1, 0);
177
178 if (mask == 0)
179 return;
180
181 assert(mask == WRITEMASK_XY);
182
183 /* Calc delta X,Y by subtracting origin in r1 from the pixel
184 * centers produced by emit_pixel_xy().
185 */
186 brw_ADD(p,
187 dst[0],
188 retype(arg0[0], BRW_REGISTER_TYPE_UW),
189 negate(r1));
190 brw_ADD(p,
191 dst[1],
192 retype(arg0[1], BRW_REGISTER_TYPE_UW),
193 negate(suboffset(r1,1)));
194 }
195
196 /**
197 * Computes the pixel offset from the window origin for gl_FragCoord().
198 */
199 void emit_wpos_xy(struct brw_wm_compile *c,
200 const struct brw_reg *dst,
201 GLuint mask,
202 const struct brw_reg *arg0)
203 {
204 struct brw_compile *p = &c->func;
205
206 if (mask & WRITEMASK_X) {
207 if (c->fp->program.PixelCenterInteger) {
208 /* X' = X */
209 brw_MOV(p,
210 dst[0],
211 retype(arg0[0], BRW_REGISTER_TYPE_W));
212 } else {
213 /* X' = X + 0.5 */
214 brw_ADD(p,
215 dst[0],
216 retype(arg0[0], BRW_REGISTER_TYPE_W),
217 brw_imm_f(0.5));
218 }
219 }
220
221 if (mask & WRITEMASK_Y) {
222 if (c->fp->program.OriginUpperLeft) {
223 if (c->fp->program.PixelCenterInteger) {
224 /* Y' = Y */
225 brw_MOV(p,
226 dst[1],
227 retype(arg0[1], BRW_REGISTER_TYPE_W));
228 } else {
229 /* Y' = Y + 0.5 */
230 brw_ADD(p,
231 dst[1],
232 retype(arg0[1], BRW_REGISTER_TYPE_W),
233 brw_imm_f(0.5));
234 }
235 } else {
236 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
237
238 /* Y' = (height - 1) - Y + center */
239 brw_ADD(p,
240 dst[1],
241 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
242 brw_imm_f(c->key.drawable_height - 1 + center_offset));
243 }
244 }
245 }
246
247
248 void emit_pixel_w(struct brw_wm_compile *c,
249 const struct brw_reg *dst,
250 GLuint mask,
251 const struct brw_reg *arg0,
252 const struct brw_reg *deltas)
253 {
254 struct brw_compile *p = &c->func;
255 struct intel_context *intel = &p->brw->intel;
256
257 /* Don't need this if all you are doing is interpolating color, for
258 * instance.
259 */
260 if (mask & WRITEMASK_W) {
261 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
262
263 /* Calc 1/w - just linterp wpos[3] optimized by putting the
264 * result straight into a message reg.
265 */
266 if (can_do_pln(intel, deltas)) {
267 brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
268 } else {
269 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
270 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
271 }
272
273 /* Calc w */
274 if (c->dispatch_width == 16) {
275 brw_math_16(p, dst[3],
276 BRW_MATH_FUNCTION_INV,
277 BRW_MATH_SATURATE_NONE,
278 2, brw_null_reg(),
279 BRW_MATH_PRECISION_FULL);
280 } else {
281 brw_math(p, dst[3],
282 BRW_MATH_FUNCTION_INV,
283 BRW_MATH_SATURATE_NONE,
284 2, brw_null_reg(),
285 BRW_MATH_DATA_VECTOR,
286 BRW_MATH_PRECISION_FULL);
287 }
288 }
289 }
290
291
292 void emit_linterp(struct brw_compile *p,
293 const struct brw_reg *dst,
294 GLuint mask,
295 const struct brw_reg *arg0,
296 const struct brw_reg *deltas)
297 {
298 struct intel_context *intel = &p->brw->intel;
299 struct brw_reg interp[4];
300 GLuint nr = arg0[0].nr;
301 GLuint i;
302
303 interp[0] = brw_vec1_grf(nr, 0);
304 interp[1] = brw_vec1_grf(nr, 4);
305 interp[2] = brw_vec1_grf(nr+1, 0);
306 interp[3] = brw_vec1_grf(nr+1, 4);
307
308 for (i = 0; i < 4; i++) {
309 if (mask & (1<<i)) {
310 if (can_do_pln(intel, deltas)) {
311 brw_PLN(p, dst[i], interp[i], deltas[0]);
312 } else {
313 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
314 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
315 }
316 }
317 }
318 }
319
320
321 void emit_pinterp(struct brw_compile *p,
322 const struct brw_reg *dst,
323 GLuint mask,
324 const struct brw_reg *arg0,
325 const struct brw_reg *deltas,
326 const struct brw_reg *w)
327 {
328 struct intel_context *intel = &p->brw->intel;
329 struct brw_reg interp[4];
330 GLuint nr = arg0[0].nr;
331 GLuint i;
332
333 interp[0] = brw_vec1_grf(nr, 0);
334 interp[1] = brw_vec1_grf(nr, 4);
335 interp[2] = brw_vec1_grf(nr+1, 0);
336 interp[3] = brw_vec1_grf(nr+1, 4);
337
338 for (i = 0; i < 4; i++) {
339 if (mask & (1<<i)) {
340 if (can_do_pln(intel, deltas)) {
341 brw_PLN(p, dst[i], interp[i], deltas[0]);
342 } else {
343 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
344 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
345 }
346 }
347 }
348 for (i = 0; i < 4; i++) {
349 if (mask & (1<<i)) {
350 brw_MUL(p, dst[i], dst[i], w[3]);
351 }
352 }
353 }
354
355
356 void emit_cinterp(struct brw_compile *p,
357 const struct brw_reg *dst,
358 GLuint mask,
359 const struct brw_reg *arg0)
360 {
361 struct brw_reg interp[4];
362 GLuint nr = arg0[0].nr;
363 GLuint i;
364
365 interp[0] = brw_vec1_grf(nr, 0);
366 interp[1] = brw_vec1_grf(nr, 4);
367 interp[2] = brw_vec1_grf(nr+1, 0);
368 interp[3] = brw_vec1_grf(nr+1, 4);
369
370 for (i = 0; i < 4; i++) {
371 if (mask & (1<<i)) {
372 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
373 }
374 }
375 }
376
377 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
378 void emit_frontfacing(struct brw_compile *p,
379 const struct brw_reg *dst,
380 GLuint mask)
381 {
382 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
383 GLuint i;
384
385 if (!(mask & WRITEMASK_XYZW))
386 return;
387
388 for (i = 0; i < 4; i++) {
389 if (mask & (1<<i)) {
390 brw_MOV(p, dst[i], brw_imm_f(0.0));
391 }
392 }
393
394 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
395 * us front face
396 */
397 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
398 for (i = 0; i < 4; i++) {
399 if (mask & (1<<i)) {
400 brw_MOV(p, dst[i], brw_imm_f(1.0));
401 }
402 }
403 brw_set_predicate_control_flag_value(p, 0xff);
404 }
405
406 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
407 * looking like:
408 *
409 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
410 *
411 * and we're trying to produce:
412 *
413 * DDX DDY
414 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
415 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
416 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
417 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
418 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
419 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
420 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
421 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
422 *
423 * and add another set of two more subspans if in 16-pixel dispatch mode.
424 *
425 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
426 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
427 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
428 * between each other. We could probably do it like ddx and swizzle the right
429 * order later, but bail for now and just produce
430 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
431 */
432 void emit_ddxy(struct brw_compile *p,
433 const struct brw_reg *dst,
434 GLuint mask,
435 GLboolean is_ddx,
436 const struct brw_reg *arg0)
437 {
438 int i;
439 struct brw_reg src0, src1;
440
441 if (mask & SATURATE)
442 brw_set_saturate(p, 1);
443 for (i = 0; i < 4; i++ ) {
444 if (mask & (1<<i)) {
445 if (is_ddx) {
446 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
447 BRW_REGISTER_TYPE_F,
448 BRW_VERTICAL_STRIDE_2,
449 BRW_WIDTH_2,
450 BRW_HORIZONTAL_STRIDE_0,
451 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
452 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
453 BRW_REGISTER_TYPE_F,
454 BRW_VERTICAL_STRIDE_2,
455 BRW_WIDTH_2,
456 BRW_HORIZONTAL_STRIDE_0,
457 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
458 } else {
459 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
460 BRW_REGISTER_TYPE_F,
461 BRW_VERTICAL_STRIDE_4,
462 BRW_WIDTH_4,
463 BRW_HORIZONTAL_STRIDE_0,
464 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
465 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
466 BRW_REGISTER_TYPE_F,
467 BRW_VERTICAL_STRIDE_4,
468 BRW_WIDTH_4,
469 BRW_HORIZONTAL_STRIDE_0,
470 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
471 }
472 brw_ADD(p, dst[i], src0, negate(src1));
473 }
474 }
475 if (mask & SATURATE)
476 brw_set_saturate(p, 0);
477 }
478
479 void emit_alu1(struct brw_compile *p,
480 struct brw_instruction *(*func)(struct brw_compile *,
481 struct brw_reg,
482 struct brw_reg),
483 const struct brw_reg *dst,
484 GLuint mask,
485 const struct brw_reg *arg0)
486 {
487 GLuint i;
488
489 if (mask & SATURATE)
490 brw_set_saturate(p, 1);
491
492 for (i = 0; i < 4; i++) {
493 if (mask & (1<<i)) {
494 func(p, dst[i], arg0[i]);
495 }
496 }
497
498 if (mask & SATURATE)
499 brw_set_saturate(p, 0);
500 }
501
502
503 void emit_alu2(struct brw_compile *p,
504 struct brw_instruction *(*func)(struct brw_compile *,
505 struct brw_reg,
506 struct brw_reg,
507 struct brw_reg),
508 const struct brw_reg *dst,
509 GLuint mask,
510 const struct brw_reg *arg0,
511 const struct brw_reg *arg1)
512 {
513 GLuint i;
514
515 if (mask & SATURATE)
516 brw_set_saturate(p, 1);
517
518 for (i = 0; i < 4; i++) {
519 if (mask & (1<<i)) {
520 func(p, dst[i], arg0[i], arg1[i]);
521 }
522 }
523
524 if (mask & SATURATE)
525 brw_set_saturate(p, 0);
526 }
527
528
529 void emit_mad(struct brw_compile *p,
530 const struct brw_reg *dst,
531 GLuint mask,
532 const struct brw_reg *arg0,
533 const struct brw_reg *arg1,
534 const struct brw_reg *arg2)
535 {
536 GLuint i;
537
538 for (i = 0; i < 4; i++) {
539 if (mask & (1<<i)) {
540 brw_MUL(p, dst[i], arg0[i], arg1[i]);
541
542 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
543 brw_ADD(p, dst[i], dst[i], arg2[i]);
544 brw_set_saturate(p, 0);
545 }
546 }
547 }
548
549 void emit_lrp(struct brw_compile *p,
550 const struct brw_reg *dst,
551 GLuint mask,
552 const struct brw_reg *arg0,
553 const struct brw_reg *arg1,
554 const struct brw_reg *arg2)
555 {
556 GLuint i;
557
558 /* Uses dst as a temporary:
559 */
560 for (i = 0; i < 4; i++) {
561 if (mask & (1<<i)) {
562 /* Can I use the LINE instruction for this?
563 */
564 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
565 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
566
567 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
568 brw_MAC(p, dst[i], arg0[i], arg1[i]);
569 brw_set_saturate(p, 0);
570 }
571 }
572 }
573
574 void emit_sop(struct brw_compile *p,
575 const struct brw_reg *dst,
576 GLuint mask,
577 GLuint cond,
578 const struct brw_reg *arg0,
579 const struct brw_reg *arg1)
580 {
581 GLuint i;
582
583 for (i = 0; i < 4; i++) {
584 if (mask & (1<<i)) {
585 brw_push_insn_state(p);
586 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
587 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
588 brw_MOV(p, dst[i], brw_imm_f(0));
589 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
590 brw_MOV(p, dst[i], brw_imm_f(1.0));
591 brw_pop_insn_state(p);
592 }
593 }
594 }
595
596 static void emit_slt( struct brw_compile *p,
597 const struct brw_reg *dst,
598 GLuint mask,
599 const struct brw_reg *arg0,
600 const struct brw_reg *arg1 )
601 {
602 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
603 }
604
605 static void emit_sle( struct brw_compile *p,
606 const struct brw_reg *dst,
607 GLuint mask,
608 const struct brw_reg *arg0,
609 const struct brw_reg *arg1 )
610 {
611 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
612 }
613
614 static void emit_sgt( struct brw_compile *p,
615 const struct brw_reg *dst,
616 GLuint mask,
617 const struct brw_reg *arg0,
618 const struct brw_reg *arg1 )
619 {
620 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
621 }
622
623 static void emit_sge( struct brw_compile *p,
624 const struct brw_reg *dst,
625 GLuint mask,
626 const struct brw_reg *arg0,
627 const struct brw_reg *arg1 )
628 {
629 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
630 }
631
632 static void emit_seq( struct brw_compile *p,
633 const struct brw_reg *dst,
634 GLuint mask,
635 const struct brw_reg *arg0,
636 const struct brw_reg *arg1 )
637 {
638 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
639 }
640
641 static void emit_sne( struct brw_compile *p,
642 const struct brw_reg *dst,
643 GLuint mask,
644 const struct brw_reg *arg0,
645 const struct brw_reg *arg1 )
646 {
647 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
648 }
649
650 void emit_cmp(struct brw_compile *p,
651 const struct brw_reg *dst,
652 GLuint mask,
653 const struct brw_reg *arg0,
654 const struct brw_reg *arg1,
655 const struct brw_reg *arg2)
656 {
657 GLuint i;
658
659 for (i = 0; i < 4; i++) {
660 if (mask & (1<<i)) {
661 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
662
663 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
664 brw_SEL(p, dst[i], arg1[i], arg2[i]);
665 brw_set_saturate(p, 0);
666 brw_set_predicate_control_flag_value(p, 0xff);
667 }
668 }
669 }
670
671 void emit_max(struct brw_compile *p,
672 const struct brw_reg *dst,
673 GLuint mask,
674 const struct brw_reg *arg0,
675 const struct brw_reg *arg1)
676 {
677 GLuint i;
678
679 for (i = 0; i < 4; i++) {
680 if (mask & (1<<i)) {
681 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
682
683 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
684 brw_SEL(p, dst[i], arg0[i], arg1[i]);
685 brw_set_saturate(p, 0);
686 brw_set_predicate_control_flag_value(p, 0xff);
687 }
688 }
689 }
690
691 void emit_min(struct brw_compile *p,
692 const struct brw_reg *dst,
693 GLuint mask,
694 const struct brw_reg *arg0,
695 const struct brw_reg *arg1)
696 {
697 GLuint i;
698
699 for (i = 0; i < 4; i++) {
700 if (mask & (1<<i)) {
701 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
702
703 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
704 brw_SEL(p, dst[i], arg0[i], arg1[i]);
705 brw_set_saturate(p, 0);
706 brw_set_predicate_control_flag_value(p, 0xff);
707 }
708 }
709 }
710
711
712 void emit_dp3(struct brw_compile *p,
713 const struct brw_reg *dst,
714 GLuint mask,
715 const struct brw_reg *arg0,
716 const struct brw_reg *arg1)
717 {
718 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
719
720 if (!(mask & WRITEMASK_XYZW))
721 return; /* Do not emit dead code */
722
723 assert(is_power_of_two(mask & WRITEMASK_XYZW));
724
725 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
726 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
727
728 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
729 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
730 brw_set_saturate(p, 0);
731 }
732
733
734 void emit_dp4(struct brw_compile *p,
735 const struct brw_reg *dst,
736 GLuint mask,
737 const struct brw_reg *arg0,
738 const struct brw_reg *arg1)
739 {
740 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
741
742 if (!(mask & WRITEMASK_XYZW))
743 return; /* Do not emit dead code */
744
745 assert(is_power_of_two(mask & WRITEMASK_XYZW));
746
747 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
748 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
749 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
750
751 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
752 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
753 brw_set_saturate(p, 0);
754 }
755
756
757 void emit_dph(struct brw_compile *p,
758 const struct brw_reg *dst,
759 GLuint mask,
760 const struct brw_reg *arg0,
761 const struct brw_reg *arg1)
762 {
763 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
764
765 if (!(mask & WRITEMASK_XYZW))
766 return; /* Do not emit dead code */
767
768 assert(is_power_of_two(mask & WRITEMASK_XYZW));
769
770 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
771 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
772 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
773
774 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
775 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
776 brw_set_saturate(p, 0);
777 }
778
779
780 void emit_xpd(struct brw_compile *p,
781 const struct brw_reg *dst,
782 GLuint mask,
783 const struct brw_reg *arg0,
784 const struct brw_reg *arg1)
785 {
786 GLuint i;
787
788 assert((mask & WRITEMASK_W) != WRITEMASK_W);
789
790 for (i = 0 ; i < 3; i++) {
791 if (mask & (1<<i)) {
792 GLuint i2 = (i+2)%3;
793 GLuint i1 = (i+1)%3;
794
795 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
796
797 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
798 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
799 brw_set_saturate(p, 0);
800 }
801 }
802 }
803
804
805 void emit_math1(struct brw_wm_compile *c,
806 GLuint function,
807 const struct brw_reg *dst,
808 GLuint mask,
809 const struct brw_reg *arg0)
810 {
811 struct brw_compile *p = &c->func;
812 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
813 GLuint saturate = ((mask & SATURATE) ?
814 BRW_MATH_SATURATE_SATURATE :
815 BRW_MATH_SATURATE_NONE);
816
817 if (!(mask & WRITEMASK_XYZW))
818 return; /* Do not emit dead code */
819
820 assert(is_power_of_two(mask & WRITEMASK_XYZW));
821
822 /* If compressed, this will write message reg 2,3 from arg0.x's 16
823 * channels.
824 */
825 brw_MOV(p, brw_message_reg(2), arg0[0]);
826
827 /* Send two messages to perform all 16 operations:
828 */
829 brw_push_insn_state(p);
830 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
831 brw_math(p,
832 dst[dst_chan],
833 function,
834 saturate,
835 2,
836 brw_null_reg(),
837 BRW_MATH_DATA_VECTOR,
838 BRW_MATH_PRECISION_FULL);
839
840 if (c->dispatch_width == 16) {
841 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
842 brw_math(p,
843 offset(dst[dst_chan],1),
844 function,
845 saturate,
846 3,
847 brw_null_reg(),
848 BRW_MATH_DATA_VECTOR,
849 BRW_MATH_PRECISION_FULL);
850 }
851 brw_pop_insn_state(p);
852 }
853
854
855 void emit_math2(struct brw_wm_compile *c,
856 GLuint function,
857 const struct brw_reg *dst,
858 GLuint mask,
859 const struct brw_reg *arg0,
860 const struct brw_reg *arg1)
861 {
862 struct brw_compile *p = &c->func;
863 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
864 GLuint saturate = ((mask & SATURATE) ?
865 BRW_MATH_SATURATE_SATURATE :
866 BRW_MATH_SATURATE_NONE);
867
868 if (!(mask & WRITEMASK_XYZW))
869 return; /* Do not emit dead code */
870
871 assert(is_power_of_two(mask & WRITEMASK_XYZW));
872
873 brw_push_insn_state(p);
874
875 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
876 brw_MOV(p, brw_message_reg(2), arg0[0]);
877 if (c->dispatch_width == 16) {
878 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
879 brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
880 }
881
882 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
883 brw_MOV(p, brw_message_reg(3), arg1[0]);
884 if (c->dispatch_width == 16) {
885 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
886 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
887 }
888
889 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
890 brw_math(p,
891 dst[dst_chan],
892 function,
893 saturate,
894 2,
895 brw_null_reg(),
896 BRW_MATH_DATA_VECTOR,
897 BRW_MATH_PRECISION_FULL);
898
899 /* Send two messages to perform all 16 operations:
900 */
901 if (c->dispatch_width == 16) {
902 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
903 brw_math(p,
904 offset(dst[dst_chan],1),
905 function,
906 saturate,
907 4,
908 brw_null_reg(),
909 BRW_MATH_DATA_VECTOR,
910 BRW_MATH_PRECISION_FULL);
911 }
912 brw_pop_insn_state(p);
913 }
914
915
916 void emit_tex(struct brw_wm_compile *c,
917 struct brw_reg *dst,
918 GLuint dst_flags,
919 struct brw_reg *arg,
920 struct brw_reg depth_payload,
921 GLuint tex_idx,
922 GLuint sampler,
923 GLboolean shadow)
924 {
925 struct brw_compile *p = &c->func;
926 struct intel_context *intel = &p->brw->intel;
927 struct brw_reg dst_retyped;
928 GLuint cur_mrf = 2, response_length;
929 GLuint i, nr_texcoords;
930 GLuint emit;
931 GLuint msg_type;
932 GLuint mrf_per_channel;
933 GLuint simd_mode;
934
935 if (c->dispatch_width == 16) {
936 mrf_per_channel = 2;
937 response_length = 8;
938 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
939 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
940 } else {
941 mrf_per_channel = 1;
942 response_length = 4;
943 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
944 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
945 }
946
947 /* How many input regs are there?
948 */
949 switch (tex_idx) {
950 case TEXTURE_1D_INDEX:
951 emit = WRITEMASK_X;
952 nr_texcoords = 1;
953 break;
954 case TEXTURE_2D_INDEX:
955 case TEXTURE_RECT_INDEX:
956 emit = WRITEMASK_XY;
957 nr_texcoords = 2;
958 break;
959 case TEXTURE_3D_INDEX:
960 case TEXTURE_CUBE_INDEX:
961 emit = WRITEMASK_XYZ;
962 nr_texcoords = 3;
963 break;
964 default:
965 /* unexpected target */
966 abort();
967 }
968
969 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
970 if (intel->gen < 5 && c->dispatch_width == 8)
971 nr_texcoords = 3;
972
973 /* For shadow comparisons, we have to supply u,v,r. */
974 if (shadow)
975 nr_texcoords = 3;
976
977 /* Emit the texcoords. */
978 for (i = 0; i < nr_texcoords; i++) {
979 if (emit & (1<<i))
980 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
981 else
982 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
983 cur_mrf += mrf_per_channel;
984 }
985
986 /* Fill in the shadow comparison reference value. */
987 if (shadow) {
988 if (intel->gen == 5) {
989 /* Fill in the cube map array index value. */
990 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
991 cur_mrf += mrf_per_channel;
992 } else if (c->dispatch_width == 8) {
993 /* Fill in the LOD bias value. */
994 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
995 cur_mrf += mrf_per_channel;
996 }
997 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
998 cur_mrf += mrf_per_channel;
999 }
1000
1001 if (intel->gen == 5) {
1002 if (shadow)
1003 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1004 else
1005 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1006 } else {
1007 /* Note that G45 and older determines shadow compare and dispatch width
1008 * from message length for most messages.
1009 */
1010 if (c->dispatch_width == 16 && shadow)
1011 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1012 else
1013 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1014 }
1015
1016 brw_SAMPLE(p,
1017 dst_retyped,
1018 1,
1019 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1020 SURF_INDEX_TEXTURE(sampler),
1021 sampler,
1022 dst_flags & WRITEMASK_XYZW,
1023 msg_type,
1024 response_length,
1025 cur_mrf - 1,
1026 0,
1027 1,
1028 simd_mode);
1029 }
1030
1031
1032 void emit_txb(struct brw_wm_compile *c,
1033 struct brw_reg *dst,
1034 GLuint dst_flags,
1035 struct brw_reg *arg,
1036 struct brw_reg depth_payload,
1037 GLuint tex_idx,
1038 GLuint sampler)
1039 {
1040 struct brw_compile *p = &c->func;
1041 struct intel_context *intel = &p->brw->intel;
1042 GLuint msgLength;
1043 GLuint msg_type;
1044 GLuint mrf_per_channel;
1045 GLuint response_length;
1046 struct brw_reg dst_retyped;
1047
1048 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1049 * samples, so we'll use the 16-wide instruction, leave the second halves
1050 * undefined, and trust the execution mask to keep the undefined pixels
1051 * from mattering.
1052 */
1053 if (c->dispatch_width == 16 || intel->gen < 5) {
1054 if (intel->gen == 5)
1055 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1056 else
1057 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1058 mrf_per_channel = 2;
1059 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1060 response_length = 8;
1061 } else {
1062 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1063 mrf_per_channel = 1;
1064 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1065 response_length = 4;
1066 }
1067
1068 /* Shadow ignored for txb. */
1069 switch (tex_idx) {
1070 case TEXTURE_1D_INDEX:
1071 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1072 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1073 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1074 break;
1075 case TEXTURE_2D_INDEX:
1076 case TEXTURE_RECT_INDEX:
1077 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1078 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1079 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1080 break;
1081 case TEXTURE_3D_INDEX:
1082 case TEXTURE_CUBE_INDEX:
1083 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1084 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1085 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1086 break;
1087 default:
1088 /* unexpected target */
1089 abort();
1090 }
1091
1092 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1093 msgLength = 2 + 4 * mrf_per_channel - 1;
1094
1095 brw_SAMPLE(p,
1096 dst_retyped,
1097 1,
1098 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1099 SURF_INDEX_TEXTURE(sampler),
1100 sampler,
1101 dst_flags & WRITEMASK_XYZW,
1102 msg_type,
1103 response_length,
1104 msgLength,
1105 0,
1106 1,
1107 BRW_SAMPLER_SIMD_MODE_SIMD16);
1108 }
1109
1110
1111 static void emit_lit(struct brw_wm_compile *c,
1112 const struct brw_reg *dst,
1113 GLuint mask,
1114 const struct brw_reg *arg0)
1115 {
1116 struct brw_compile *p = &c->func;
1117
1118 assert((mask & WRITEMASK_XW) == 0);
1119
1120 if (mask & WRITEMASK_Y) {
1121 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1122 brw_MOV(p, dst[1], arg0[0]);
1123 brw_set_saturate(p, 0);
1124 }
1125
1126 if (mask & WRITEMASK_Z) {
1127 emit_math2(c, BRW_MATH_FUNCTION_POW,
1128 &dst[2],
1129 WRITEMASK_X | (mask & SATURATE),
1130 &arg0[1],
1131 &arg0[3]);
1132 }
1133
1134 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1135 * some of the POW calculations above, but 16-wide iff statements
1136 * seem to lock c1 hardware, so this is a nasty workaround:
1137 */
1138 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1139 {
1140 if (mask & WRITEMASK_Y)
1141 brw_MOV(p, dst[1], brw_imm_f(0));
1142
1143 if (mask & WRITEMASK_Z)
1144 brw_MOV(p, dst[2], brw_imm_f(0));
1145 }
1146 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1147 }
1148
1149
1150 /* Kill pixel - set execution mask to zero for those pixels which
1151 * fail.
1152 */
1153 static void emit_kil( struct brw_wm_compile *c,
1154 struct brw_reg *arg0)
1155 {
1156 struct brw_compile *p = &c->func;
1157 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1158 GLuint i, j;
1159
1160 for (i = 0; i < 4; i++) {
1161 /* Check if we've already done the comparison for this reg
1162 * -- common when someone does KIL TEMP.wwww.
1163 */
1164 for (j = 0; j < i; j++) {
1165 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1166 break;
1167 }
1168 if (j != i)
1169 continue;
1170
1171 brw_push_insn_state(p);
1172 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1173 brw_set_predicate_control_flag_value(p, 0xff);
1174 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1175 brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1176 brw_pop_insn_state(p);
1177 }
1178 }
1179
1180 /* KIL_NV kills the pixels that are currently executing, not based on a test
1181 * of the arguments.
1182 */
1183 static void emit_kil_nv( struct brw_wm_compile *c )
1184 {
1185 struct brw_compile *p = &c->func;
1186 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1187
1188 brw_push_insn_state(p);
1189 brw_set_mask_control(p, BRW_MASK_DISABLE);
1190 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1191 brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1192 brw_pop_insn_state(p);
1193 }
1194
1195 static void fire_fb_write( struct brw_wm_compile *c,
1196 GLuint base_reg,
1197 GLuint nr,
1198 GLuint target,
1199 GLuint eot )
1200 {
1201 struct brw_compile *p = &c->func;
1202 struct brw_reg dst;
1203
1204 if (c->dispatch_width == 16)
1205 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1206 else
1207 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1208
1209 /* Pass through control information:
1210 */
1211 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1212 {
1213 brw_push_insn_state(p);
1214 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1215 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1216 brw_MOV(p,
1217 brw_message_reg(base_reg + 1),
1218 brw_vec8_grf(1, 0));
1219 brw_pop_insn_state(p);
1220 }
1221
1222 /* Send framebuffer write message: */
1223 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1224 brw_fb_WRITE(p,
1225 dst,
1226 base_reg,
1227 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1228 target,
1229 nr,
1230 0,
1231 eot);
1232 }
1233
1234
1235 static void emit_aa( struct brw_wm_compile *c,
1236 struct brw_reg *arg1,
1237 GLuint reg )
1238 {
1239 struct brw_compile *p = &c->func;
1240 GLuint comp = c->key.aa_dest_stencil_reg / 2;
1241 GLuint off = c->key.aa_dest_stencil_reg % 2;
1242 struct brw_reg aa = offset(arg1[comp], off);
1243
1244 brw_push_insn_state(p);
1245 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1246 brw_MOV(p, brw_message_reg(reg), aa);
1247 brw_pop_insn_state(p);
1248 }
1249
1250
1251 /* Post-fragment-program processing. Send the results to the
1252 * framebuffer.
1253 * \param arg0 the fragment color
1254 * \param arg1 the pass-through depth value
1255 * \param arg2 the shader-computed depth value
1256 */
1257 void emit_fb_write(struct brw_wm_compile *c,
1258 struct brw_reg *arg0,
1259 struct brw_reg *arg1,
1260 struct brw_reg *arg2,
1261 GLuint target,
1262 GLuint eot)
1263 {
1264 struct brw_compile *p = &c->func;
1265 struct brw_context *brw = p->brw;
1266 GLuint nr = 2;
1267 GLuint channel;
1268
1269 /* Reserve a space for AA - may not be needed:
1270 */
1271 if (c->key.aa_dest_stencil_reg)
1272 nr += 1;
1273
1274 /* I don't really understand how this achieves the color interleave
1275 * (ie RGBARGBA) in the result: [Do the saturation here]
1276 */
1277 brw_push_insn_state(p);
1278
1279 for (channel = 0; channel < 4; channel++) {
1280 if (c->dispatch_width == 16 && brw->has_compr4) {
1281 /* By setting the high bit of the MRF register number, we indicate
1282 * that we want COMPR4 mode - instead of doing the usual destination
1283 * + 1 for the second half we get destination + 4.
1284 */
1285 brw_MOV(p,
1286 brw_message_reg(nr + channel + (1 << 7)),
1287 arg0[channel]);
1288 } else {
1289 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1290 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1291 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1292 brw_MOV(p,
1293 brw_message_reg(nr + channel),
1294 arg0[channel]);
1295
1296 if (c->dispatch_width == 16) {
1297 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1298 brw_MOV(p,
1299 brw_message_reg(nr + channel + 4),
1300 sechalf(arg0[channel]));
1301 }
1302 }
1303 }
1304 /* skip over the regs populated above:
1305 */
1306 nr += 8;
1307 brw_pop_insn_state(p);
1308
1309 if (c->key.source_depth_to_render_target)
1310 {
1311 if (c->key.computes_depth)
1312 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1313 else
1314 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1315
1316 nr += 2;
1317 }
1318
1319 if (c->key.dest_depth_reg)
1320 {
1321 GLuint comp = c->key.dest_depth_reg / 2;
1322 GLuint off = c->key.dest_depth_reg % 2;
1323
1324 if (off != 0) {
1325 brw_push_insn_state(p);
1326 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1327
1328 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1329 /* 2nd half? */
1330 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1331 brw_pop_insn_state(p);
1332 }
1333 else {
1334 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1335 }
1336 nr += 2;
1337 }
1338
1339 if (!c->key.runtime_check_aads_emit) {
1340 if (c->key.aa_dest_stencil_reg)
1341 emit_aa(c, arg1, 2);
1342
1343 fire_fb_write(c, 0, nr, target, eot);
1344 }
1345 else {
1346 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1347 struct brw_reg ip = brw_ip_reg();
1348 struct brw_instruction *jmp;
1349
1350 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1351 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1352 brw_AND(p,
1353 v1_null_ud,
1354 get_element_ud(brw_vec8_grf(1,0), 6),
1355 brw_imm_ud(1<<26));
1356
1357 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1358 {
1359 emit_aa(c, arg1, 2);
1360 fire_fb_write(c, 0, nr, target, eot);
1361 /* note - thread killed in subroutine */
1362 }
1363 brw_land_fwd_jump(p, jmp);
1364
1365 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1366 */
1367 fire_fb_write(c, 1, nr-1, target, eot);
1368 }
1369 }
1370
1371 /**
1372 * Move a GPR to scratch memory.
1373 */
1374 static void emit_spill( struct brw_wm_compile *c,
1375 struct brw_reg reg,
1376 GLuint slot )
1377 {
1378 struct brw_compile *p = &c->func;
1379
1380 /*
1381 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1382 */
1383 brw_MOV(p, brw_message_reg(2), reg);
1384
1385 /*
1386 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1387 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1388 */
1389 brw_dp_WRITE_16(p,
1390 retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1391 slot);
1392 }
1393
1394
1395 /**
1396 * Load a GPR from scratch memory.
1397 */
1398 static void emit_unspill( struct brw_wm_compile *c,
1399 struct brw_reg reg,
1400 GLuint slot )
1401 {
1402 struct brw_compile *p = &c->func;
1403
1404 /* Slot 0 is the undef value.
1405 */
1406 if (slot == 0) {
1407 brw_MOV(p, reg, brw_imm_f(0));
1408 return;
1409 }
1410
1411 /*
1412 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1413 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1414 */
1415
1416 brw_dp_READ_16(p,
1417 retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1418 slot);
1419 }
1420
1421
1422 /**
1423 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1424 * Args with unspill_reg != 0 will be loaded from scratch memory.
1425 */
1426 static void get_argument_regs( struct brw_wm_compile *c,
1427 struct brw_wm_ref *arg[],
1428 struct brw_reg *regs )
1429 {
1430 GLuint i;
1431
1432 for (i = 0; i < 4; i++) {
1433 if (arg[i]) {
1434 if (arg[i]->unspill_reg)
1435 emit_unspill(c,
1436 brw_vec8_grf(arg[i]->unspill_reg, 0),
1437 arg[i]->value->spill_slot);
1438
1439 regs[i] = arg[i]->hw_reg;
1440 }
1441 else {
1442 regs[i] = brw_null_reg();
1443 }
1444 }
1445 }
1446
1447
1448 /**
1449 * For values that have a spill_slot!=0, write those regs to scratch memory.
1450 */
1451 static void spill_values( struct brw_wm_compile *c,
1452 struct brw_wm_value *values,
1453 GLuint nr )
1454 {
1455 GLuint i;
1456
1457 for (i = 0; i < nr; i++)
1458 if (values[i].spill_slot)
1459 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1460 }
1461
/* Presumably the number of message registers (MRF) - not referenced by
 * the analysis helpers that follow.
 */
#define BRW_MRF_NUM        16

/* Size of one register in bytes; converts a register number into the
 * byte offset used by the read/write interval checks.
 */
#define BRW_SIZE_OF_REG    32
1464
1465 static INLINE
1466 GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
1467 {
1468 switch (inst->header.opcode) {
1469 case BRW_OPCODE_MOV:
1470 case BRW_OPCODE_SEL:
1471 case BRW_OPCODE_NOT:
1472 case BRW_OPCODE_AND:
1473 case BRW_OPCODE_OR:
1474 case BRW_OPCODE_XOR:
1475 case BRW_OPCODE_SHR:
1476 case BRW_OPCODE_SHL:
1477 case BRW_OPCODE_RSR:
1478 case BRW_OPCODE_RSL:
1479 case BRW_OPCODE_ADD:
1480 case BRW_OPCODE_MUL:
1481 case BRW_OPCODE_AVG:
1482 case BRW_OPCODE_FRC:
1483 case BRW_OPCODE_RNDU:
1484 case BRW_OPCODE_RNDD:
1485 case BRW_OPCODE_RNDE:
1486 case BRW_OPCODE_RNDZ:
1487 case BRW_OPCODE_MAC:
1488 case BRW_OPCODE_MACH:
1489 case BRW_OPCODE_LINE:
1490 return GL_TRUE;
1491 default:
1492 return GL_FALSE;
1493 }
1494 }
1495
1496 static const struct {
1497 char *name;
1498 int nsrc;
1499 int ndst;
1500 } inst_opcode[128] = {
1501 [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
1502 [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
1503 [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
1504 [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
1505 [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
1506 [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
1507 [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
1508 [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
1509
1510 [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
1511 [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
1512 [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
1513 [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
1514 [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
1515 [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
1516 [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
1517 [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
1518 [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
1519 [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
1520 [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
1521 [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
1522
1523 [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
1524 [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
1525 [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
1526 [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
1527 [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
1528 [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
1529 [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
1530 [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
1531 [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
1532 [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
1533 [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
1534
1535 [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
1536 [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
1537 [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
1538 [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
1539 [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
1540 [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
1541 [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
1542 [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
1543 [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
1544 [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
1545 [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
1546 [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
1547 [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
1548 [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
1549 [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
1550 [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
1551 [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
1552 };
1553
1554 static const GLuint inst_stride[7] = {
1555 [0] = 0,
1556 [1] = 1,
1557 [2] = 2,
1558 [3] = 4,
1559 [4] = 8,
1560 [5] = 16,
1561 [6] = 32
1562 };
1563
1564 static const GLuint inst_type_size[8] = {
1565 [0] = 4,
1566 [1] = 4,
1567 [2] = 2,
1568 [3] = 2,
1569 [4] = 1,
1570 [5] = 1,
1571 [7] = 4
1572 };
1573
/* Larger / smaller of two byte offsets.  Both are plain expressions (no
 * trailing semicolon) so they can be nested inside larger expressions;
 * each argument may be evaluated twice, so pass side-effect-free values.
 */
#define BRW_MAX_OFFSET(x0,x1) ((x0) > (x1) ? (x0) : (x1))
#define BRW_MIN_OFFSET(x0,x1) ((x0) < (x1) ? (x0) : (x1))
1576
1577 static INLINE GLboolean
1578 brw_is_grf_written(const struct brw_instruction *inst,
1579 int reg_index, int size,
1580 int gen)
1581 {
1582 if (inst_opcode[inst->header.opcode].ndst == 0)
1583 return GL_FALSE;
1584
1585 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
1586 if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
1587 return GL_TRUE;
1588
1589 if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
1590 return GL_FALSE;
1591
1592 const int reg_start = reg_index * BRW_SIZE_OF_REG;
1593 const int reg_end = reg_start + size;
1594
1595 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
1596 const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
1597 + inst->bits1.da1.dest_subreg_nr;
1598 int length, write_end;
1599
1600 /* SEND is specific */
1601 if (inst->header.opcode == BRW_OPCODE_SEND) {
1602 if (gen >= 5)
1603 length = inst->bits3.generic_gen5.response_length*BRW_SIZE_OF_REG;
1604 else
1605 length = inst->bits3.generic.response_length*BRW_SIZE_OF_REG;
1606 }
1607 else {
1608 length = 1 << inst->header.execution_size;
1609 length *= type_size;
1610 length *= inst->bits1.da1.dest_horiz_stride;
1611 }
1612
1613 /* If the two intervals intersect, we overwrite the register */
1614 write_end = write_start + length;
1615 const int left = BRW_MAX_OFFSET(write_start, reg_start);
1616 const int right = BRW_MIN_OFFSET(write_end, reg_end);
1617
1618 return left < right;
1619 }
1620
1621 /* Specific path for message register since we need to handle the compr4 case */
1622 static INLINE GLboolean
1623 brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
1624 {
1625 if (inst_opcode[inst->header.opcode].ndst == 0)
1626 return GL_FALSE;
1627
1628 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
1629 if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
1630 return GL_TRUE;
1631
1632 if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
1633 return GL_FALSE;
1634
1635 const int reg_start = reg_index * BRW_SIZE_OF_REG;
1636 const int reg_end = reg_start + size;
1637
1638 const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
1639 const int is_compr4 = inst->bits1.da1.dest_reg_nr & 0xf0;
1640 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
1641
1642 /* We use compr4 with a size != 16 elements. Strange, we conservatively
1643 * consider that we are writing the register.
1644 */
1645 if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
1646 return GL_TRUE;
1647
1648 GLboolean is_written = GL_FALSE;
1649
1650 /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
1651 if (is_compr4) {
1652 const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
1653
1654 /* First 8-way register */
1655 const int write_start0 = mrf_index*BRW_SIZE_OF_REG
1656 + inst->bits1.da1.dest_subreg_nr;
1657 const int write_end0 = write_start0 + length;
1658
1659 /* Second 8-way register */
1660 const int write_start1 = (mrf_index+4)*BRW_SIZE_OF_REG
1661 + inst->bits1.da1.dest_subreg_nr;
1662 const int write_end1 = write_start1 + length;
1663
1664 /* If the two intervals intersect, we overwrite the register */
1665 const int left0 = BRW_MAX_OFFSET(write_start0, reg_start);
1666 const int right0 = BRW_MIN_OFFSET(write_end0, reg_end);
1667 const int left1 = BRW_MAX_OFFSET(write_start1, reg_start);
1668 const int right1 = BRW_MIN_OFFSET(write_end1, reg_end);
1669
1670 is_written = left0 < right0 || left1 < right1;
1671 }
1672 else {
1673 int length;
1674 length = 1 << inst->header.execution_size;
1675 length *= type_size;
1676 length *= inst->bits1.da1.dest_horiz_stride;
1677
1678 /* If the two intervals intersect, we write into the register */
1679 const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
1680 + inst->bits1.da1.dest_subreg_nr;
1681 const int write_end = write_start + length;
1682 const int left = BRW_MAX_OFFSET(write_start, reg_start);
1683 const int right = BRW_MIN_OFFSET(write_end, reg_end);;
1684
1685 is_written = left < right;
1686 }
1687
1688 /* SEND may perform an implicit mov to a mrf register */
1689 if (is_written == GL_FALSE &&
1690 inst->header.opcode == BRW_OPCODE_SEND &&
1691 inst->bits1.da1.src0_reg_file != 0) {
1692
1693 const int mrf_start = inst->header.destreg__conditionalmod;
1694 const int write_start = mrf_start * BRW_SIZE_OF_REG;
1695 const int write_end = write_start + BRW_SIZE_OF_REG;
1696 const int left = BRW_MAX_OFFSET(write_start, reg_start);
1697 const int right = BRW_MIN_OFFSET(write_end, reg_end);;
1698 is_written = left < right;
1699 }
1700
1701 return is_written;
1702 }
1703
1704 static INLINE GLboolean
1705 brw_is_mrf_read(const struct brw_instruction *inst,
1706 int reg_index, int size, int gen)
1707 {
1708 if (inst->header.opcode != BRW_OPCODE_SEND)
1709 return GL_FALSE;
1710 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
1711 return GL_TRUE;
1712
1713 const int reg_start = reg_index*BRW_SIZE_OF_REG;
1714 const int reg_end = reg_start + size;
1715
1716 int length, read_start, read_end;
1717 if (gen >= 5)
1718 length = inst->bits3.generic_gen5.msg_length*BRW_SIZE_OF_REG;
1719 else
1720 length = inst->bits3.generic.msg_length*BRW_SIZE_OF_REG;
1721
1722 /* Look if SEND uses an implicit mov. In that case, we read one less register
1723 * (but we write it)
1724 */
1725 if (inst->bits1.da1.src0_reg_file != 0)
1726 read_start = inst->header.destreg__conditionalmod;
1727 else {
1728 length--;
1729 read_start = inst->header.destreg__conditionalmod + 1;
1730 }
1731 read_start *= BRW_SIZE_OF_REG;
1732 read_end = read_start + length;
1733
1734 const int left = BRW_MAX_OFFSET(read_start, reg_start);
1735 const int right = BRW_MIN_OFFSET(read_end, reg_end);
1736
1737 return left < right;
1738 }
1739
1740 static INLINE GLboolean
1741 brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
1742 {
1743 int i, j;
1744 if (inst_opcode[inst->header.opcode].nsrc == 0)
1745 return GL_FALSE;
1746
1747 /* Look at first source. We must take into account register regions to
1748 * monitor carefully the read. Note that we are a bit too conservative here
1749 * since we do not take into account the fact that some complete registers
1750 * may be skipped
1751 */
1752 if (inst_opcode[inst->header.opcode].nsrc >= 1) {
1753
1754 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
1755 if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
1756 return GL_TRUE;
1757 if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
1758 return GL_FALSE;
1759
1760 const int reg_start = reg_index*BRW_SIZE_OF_REG;
1761 const int reg_end = reg_start + size;
1762
1763 /* See if at least one of this element intersects the interval */
1764 const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
1765 const int elem_num = 1 << inst->header.execution_size;
1766 const int width = 1 << inst->bits2.da1.src0_width;
1767 const int row_num = elem_num >> inst->bits2.da1.src0_width;
1768 const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
1769 const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
1770 int row_start = inst->bits2.da1.src0_reg_nr*BRW_SIZE_OF_REG
1771 + inst->bits2.da1.src0_subreg_nr;
1772 for (j = 0; j < row_num; ++j) {
1773 int write_start = row_start;
1774 for (i = 0; i < width; ++i) {
1775 const int write_end = write_start + type_size;
1776 const int left = write_start > reg_start ? write_start : reg_start;
1777 const int right = write_end < reg_end ? write_end : reg_end;
1778 if (left < right)
1779 return GL_TRUE;
1780 write_start += hs;
1781 }
1782 row_start += vs;
1783 }
1784 }
1785
1786 /* Second src register */
1787 if (inst_opcode[inst->header.opcode].nsrc >= 2) {
1788
1789 if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
1790 if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
1791 return GL_TRUE;
1792 if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
1793 return GL_FALSE;
1794
1795 const int reg_start = reg_index*BRW_SIZE_OF_REG;
1796 const int reg_end = reg_start + size;
1797
1798 /* See if at least one of this element intersects the interval */
1799 const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
1800 const int elem_num = 1 << inst->header.execution_size;
1801 const int width = 1 << inst->bits3.da1.src1_width;
1802 const int row_num = elem_num >> inst->bits3.da1.src1_width;
1803 const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
1804 const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
1805 int row_start = inst->bits3.da1.src1_reg_nr*BRW_SIZE_OF_REG
1806 + inst->bits3.da1.src1_subreg_nr;
1807 for (j = 0; j < row_num; ++j) {
1808 int write_start = row_start;
1809 for (i = 0; i < width; ++i) {
1810 const int write_end = write_start + type_size;
1811 const int left = write_start > reg_start ? write_start : reg_start;
1812 const int right = write_end < reg_end ? write_end : reg_end;
1813 if (left < right)
1814 return GL_TRUE;
1815 write_start += hs;
1816 }
1817 row_start += vs;
1818 }
1819 }
1820
1821 return GL_FALSE;
1822 }
1823
1824 static INLINE GLboolean
1825 brw_is_control_done(const struct brw_instruction *mov) {
1826 return
1827 mov->header.dependency_control != 0 ||
1828 mov->header.thread_control != 0 ||
1829 mov->header.mask_control != 0 ||
1830 mov->header.saturate != 0 ||
1831 mov->header.debug_control != 0;
1832 }
1833
1834 static INLINE GLboolean
1835 brw_is_predicated(const struct brw_instruction *mov) {
1836 return mov->header.predicate_control != 0;
1837 }
1838
1839 static INLINE GLboolean
1840 brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
1841 int *mrf_index,
1842 int *grf_index,
1843 GLboolean *is_compr4)
1844 {
1845 if (brw_is_predicated(mov) ||
1846 brw_is_control_done(mov) ||
1847 mov->header.debug_control != 0)
1848 return GL_FALSE;
1849
1850 if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
1851 mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
1852 mov->bits1.da1.dest_reg_type != 7 ||
1853 mov->bits1.da1.dest_horiz_stride != 1 ||
1854 mov->bits1.da1.dest_subreg_nr != 0)
1855 return GL_FALSE;
1856
1857 if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
1858 mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
1859 mov->bits1.da1.src0_reg_type != 7 ||
1860 mov->bits2.da1.src0_width != 3 ||
1861 mov->bits2.da1.src0_horiz_stride != 1 ||
1862 mov->bits2.da1.src0_vert_stride != 4 ||
1863 mov->bits2.da1.src0_subreg_nr != 0 ||
1864 mov->bits2.da1.src0_abs != 0 ||
1865 mov->bits2.da1.src0_negate != 0)
1866 return GL_FALSE;
1867
1868 *grf_index = mov->bits2.da1.src0_reg_nr;
1869 *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
1870 *is_compr4 = (mov->bits1.da1.dest_reg_nr & 0xf0) != 0;
1871 return GL_TRUE;
1872 }
1873
1874 static INLINE GLboolean
1875 brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
1876 {
1877 /* remark: no problem to predicate a SEL instruction */
1878 if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
1879 brw_is_control_done(inst) == GL_FALSE &&
1880 inst->header.execution_size == 4 &&
1881 inst->header.access_mode == BRW_ALIGN_1 &&
1882 inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
1883 inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
1884 inst->bits1.da1.dest_reg_type == 7 &&
1885 inst->bits1.da1.dest_horiz_stride == 1 &&
1886 inst->bits1.da1.dest_reg_nr == grf_index &&
1887 inst->bits1.da1.dest_subreg_nr == 0 &&
1888 brw_is_arithmetic_inst(inst))
1889 return GL_TRUE;
1890
1891 return GL_FALSE;
1892 }
1893
1894 static INLINE GLboolean
1895 brw_inst_are_equal(const struct brw_instruction *src0,
1896 const struct brw_instruction *src1)
1897 {
1898 const GLuint *field0 = (GLuint *) src0;
1899 const GLuint *field1 = (GLuint *) src1;
1900 return field0[0] == field1[0] &&
1901 field0[1] == field1[1] &&
1902 field0[2] == field1[2] &&
1903 field0[3] == field1[3];
1904 }
1905
1906 static INLINE void
1907 brw_inst_copy(struct brw_instruction *dst,
1908 const struct brw_instruction *src)
1909 {
1910 GLuint *field_dst = (GLuint *) dst;
1911 const GLuint *field_src = (GLuint *) src;
1912 field_dst[0] = field_src[0];
1913 field_dst[1] = field_src[1];
1914 field_dst[2] = field_src[2];
1915 field_dst[3] = field_src[3];
1916 }
1917
1918 static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
1919 {
1920 int i, nr_insn = 0, to = 0, from = 0;
1921
1922 for (from = 0; from < p->nr_insn; ++from) {
1923 if (removeInst[from])
1924 continue;
1925 if(to != from)
1926 brw_inst_copy(p->store + to, p->store + from);
1927 to++;
1928 }
1929
1930 for (i = 0; i < p->nr_insn; ++i)
1931 if (removeInst[i] == GL_FALSE)
1932 nr_insn++;
1933 p->nr_insn = nr_insn;
1934 }
1935
1936 /* The gen code emitter generates a lot of duplications in the mrf-to-grf moves.
1937 * Here, we monitor same mov mrf-to-grf instrutions and remove them as soon as
1938 * none of the two operands have been written
1939 */
1940 static void brw_remove_duplicate_mrf_moves(struct brw_wm_compile *c)
1941 {
1942 struct brw_compile *p = &c->func;
1943 const int gen = p->brw->intel.gen;
1944 int i, j;
1945
1946 GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
1947 for (i = 0; i < p->nr_insn; i++) {
1948 if (removeInst[i])
1949 continue;
1950
1951 const struct brw_instruction *mov = p->store + i;
1952 int mrf_index, grf_index;
1953 GLboolean is_compr4;
1954
1955 /* Only consider _straight_ grf-to-mrf moves */
1956 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
1957 continue;
1958
1959 const int mrf_index0 = mrf_index;
1960 const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
1961 const int simd16_size = 2 * BRW_SIZE_OF_REG;
1962
1963 for (j = i + 1; j < p->nr_insn; j++) {
1964 const struct brw_instruction *inst = p->store + j;
1965
1966 if (brw_inst_are_equal(mov, inst)) {
1967 removeInst[j] = GL_TRUE;
1968 continue;
1969 }
1970
1971 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
1972 brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG) ||
1973 brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG))
1974 break;
1975 }
1976 }
1977
1978 brw_remove_inst(p, removeInst);
1979 free(removeInst);
1980 }
1981
1982 static void brw_remove_mrf_to_grf_moves(struct brw_wm_compile *c)
1983 {
1984 int i, j, prev;
1985 struct brw_compile *p = &c->func;
1986 struct brw_context *brw = p->brw;
1987 const int gen = brw->intel.gen;
1988 const int simd16_size = 2*BRW_SIZE_OF_REG;
1989
1990 if (c->dispatch_width != 16 || brw->has_compr4 == GL_FALSE)
1991 return;
1992
1993 GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
1994 assert(removeInst);
1995
1996 for (i = 0; i < p->nr_insn; i++) {
1997 if (removeInst[i])
1998 continue;
1999
2000 struct brw_instruction *grf_inst = NULL;
2001 const struct brw_instruction *mov = p->store + i;
2002 int mrf_index, grf_index;
2003 GLboolean is_compr4;
2004
2005 /* Only consider _straight_ grf-to-mrf moves */
2006 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
2007 continue;
2008
2009 /* Using comp4 enables a stride of 4 for this instruction */
2010 const int mrf_index0 = mrf_index;
2011 const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
2012
2013 /* Look where the register has been set */
2014 prev = i;
2015 GLboolean potential_remove = GL_FALSE;
2016 while (prev--) {
2017
2018 /* If _one_ instruction writes the grf, we try to remove the mov */
2019 struct brw_instruction *inst = p->store + prev;
2020 if (brw_is_grf_straight_write(inst, grf_index)) {
2021 potential_remove = GL_TRUE;
2022 grf_inst = inst;
2023 break;
2024 }
2025
2026 }
2027
2028 if (potential_remove == GL_FALSE)
2029 continue;
2030 removeInst[i] = GL_TRUE;
2031
2032 /* Monitor first the section of code between the grf computation and the
2033 * mov. Here we cannot read or write both mrf and grf register
2034 */
2035 for (j = prev + 1; j < i; ++j) {
2036 struct brw_instruction *inst = p->store + j;
2037 if (removeInst[j])
2038 continue;
2039 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
2040 brw_is_grf_read(inst, grf_index, simd16_size) ||
2041 brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG) ||
2042 brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG) ||
2043 brw_is_mrf_read(inst, mrf_index0, BRW_SIZE_OF_REG, gen) ||
2044 brw_is_mrf_read(inst, mrf_index1, BRW_SIZE_OF_REG, gen)) {
2045 removeInst[i] = GL_FALSE;
2046 break;
2047 }
2048 }
2049
2050 /* After the mov, we can read or write the mrf. If the grf is overwritten,
2051 * we are done
2052 */
2053 for (j = i + 1; j < p->nr_insn; ++j) {
2054 struct brw_instruction *inst = p->store + j;
2055 if (removeInst[j])
2056 continue;
2057
2058 if (brw_is_grf_read(inst, grf_index, simd16_size)) {
2059 removeInst[i] = GL_FALSE;
2060 break;
2061 }
2062
2063 if (brw_is_grf_straight_write(inst, grf_index))
2064 break;
2065 }
2066
2067 /* Note that with the top down traversal, we can safely pacth the mov
2068 * instruction
2069 */
2070 if (removeInst[i]) {
2071 grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
2072 grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
2073 }
2074 }
2075
2076 brw_remove_inst(p, removeInst);
2077 free(removeInst);
2078 }
2079
2080 /* Emit the fragment program instructions here.
2081 */
2082 void brw_wm_emit( struct brw_wm_compile *c )
2083 {
2084 struct brw_compile *p = &c->func;
2085 GLuint insn;
2086
2087 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2088
2089 /* Check if any of the payload regs need to be spilled:
2090 */
2091 spill_values(c, c->payload.depth, 4);
2092 spill_values(c, c->creg, c->nr_creg);
2093 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
2094
2095
2096 for (insn = 0; insn < c->nr_insns; insn++) {
2097
2098 struct brw_wm_instruction *inst = &c->instruction[insn];
2099 struct brw_reg args[3][4], dst[4];
2100 GLuint i, dst_flags;
2101
2102 /* Get argument regs:
2103 */
2104 for (i = 0; i < 3; i++)
2105 get_argument_regs(c, inst->src[i], args[i]);
2106
2107 /* Get dest regs:
2108 */
2109 for (i = 0; i < 4; i++)
2110 if (inst->dst[i])
2111 dst[i] = inst->dst[i]->hw_reg;
2112 else
2113 dst[i] = brw_null_reg();
2114
2115 /* Flags
2116 */
2117 dst_flags = inst->writemask;
2118 if (inst->saturate)
2119 dst_flags |= SATURATE;
2120
2121 switch (inst->opcode) {
2122 /* Generated instructions for calculating triangle interpolants:
2123 */
2124 case WM_PIXELXY:
2125 emit_pixel_xy(c, dst, dst_flags);
2126 break;
2127
2128 case WM_DELTAXY:
2129 emit_delta_xy(p, dst, dst_flags, args[0]);
2130 break;
2131
2132 case WM_WPOSXY:
2133 emit_wpos_xy(c, dst, dst_flags, args[0]);
2134 break;
2135
2136 case WM_PIXELW:
2137 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
2138 break;
2139
2140 case WM_LINTERP:
2141 emit_linterp(p, dst, dst_flags, args[0], args[1]);
2142 break;
2143
2144 case WM_PINTERP:
2145 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
2146 break;
2147
2148 case WM_CINTERP:
2149 emit_cinterp(p, dst, dst_flags, args[0]);
2150 break;
2151
2152 case WM_FB_WRITE:
2153 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
2154 break;
2155
2156 case WM_FRONTFACING:
2157 emit_frontfacing(p, dst, dst_flags);
2158 break;
2159
2160 /* Straightforward arithmetic:
2161 */
2162 case OPCODE_ADD:
2163 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2164 break;
2165
2166 case OPCODE_FRC:
2167 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2168 break;
2169
2170 case OPCODE_FLR:
2171 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2172 break;
2173
2174 case OPCODE_DDX:
2175 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
2176 break;
2177
2178 case OPCODE_DDY:
2179 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
2180 break;
2181
2182 case OPCODE_DP3:
2183 emit_dp3(p, dst, dst_flags, args[0], args[1]);
2184 break;
2185
2186 case OPCODE_DP4:
2187 emit_dp4(p, dst, dst_flags, args[0], args[1]);
2188 break;
2189
2190 case OPCODE_DPH:
2191 emit_dph(p, dst, dst_flags, args[0], args[1]);
2192 break;
2193
2194 case OPCODE_TRUNC:
2195 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
2196 break;
2197
2198 case OPCODE_LRP:
2199 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
2200 break;
2201
2202 case OPCODE_MAD:
2203 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2204 break;
2205
2206 case OPCODE_MOV:
2207 case OPCODE_SWZ:
2208 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2209 break;
2210
2211 case OPCODE_MUL:
2212 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2213 break;
2214
2215 case OPCODE_XPD:
2216 emit_xpd(p, dst, dst_flags, args[0], args[1]);
2217 break;
2218
2219 /* Higher math functions:
2220 */
2221 case OPCODE_RCP:
2222 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
2223 break;
2224
2225 case OPCODE_RSQ:
2226 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
2227 break;
2228
2229 case OPCODE_SIN:
2230 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
2231 break;
2232
2233 case OPCODE_COS:
2234 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
2235 break;
2236
2237 case OPCODE_EX2:
2238 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
2239 break;
2240
2241 case OPCODE_LG2:
2242 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
2243 break;
2244
2245 case OPCODE_SCS:
2246 /* There is an scs math function, but it would need some
2247 * fixup for 16-element execution.
2248 */
2249 if (dst_flags & WRITEMASK_X)
2250 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
2251 if (dst_flags & WRITEMASK_Y)
2252 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
2253 break;
2254
2255 case OPCODE_POW:
2256 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
2257 break;
2258
2259 /* Comparisons:
2260 */
2261 case OPCODE_CMP:
2262 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
2263 break;
2264
2265 case OPCODE_MAX:
2266 emit_max(p, dst, dst_flags, args[0], args[1]);
2267 break;
2268
2269 case OPCODE_MIN:
2270 emit_min(p, dst, dst_flags, args[0], args[1]);
2271 break;
2272
2273 case OPCODE_SLT:
2274 emit_slt(p, dst, dst_flags, args[0], args[1]);
2275 break;
2276
2277 case OPCODE_SLE:
2278 emit_sle(p, dst, dst_flags, args[0], args[1]);
2279 break;
2280 case OPCODE_SGT:
2281 emit_sgt(p, dst, dst_flags, args[0], args[1]);
2282 break;
2283 case OPCODE_SGE:
2284 emit_sge(p, dst, dst_flags, args[0], args[1]);
2285 break;
2286 case OPCODE_SEQ:
2287 emit_seq(p, dst, dst_flags, args[0], args[1]);
2288 break;
2289 case OPCODE_SNE:
2290 emit_sne(p, dst, dst_flags, args[0], args[1]);
2291 break;
2292
2293 case OPCODE_LIT:
2294 emit_lit(c, dst, dst_flags, args[0]);
2295 break;
2296
2297 /* Texturing operations:
2298 */
2299 case OPCODE_TEX:
2300 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
2301 inst->tex_idx, inst->tex_unit,
2302 inst->tex_shadow);
2303 break;
2304
2305 case OPCODE_TXB:
2306 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
2307 inst->tex_idx, inst->tex_unit);
2308 break;
2309
2310 case OPCODE_KIL:
2311 emit_kil(c, args[0]);
2312 break;
2313
2314 case OPCODE_KIL_NV:
2315 emit_kil_nv(c);
2316 break;
2317
2318 default:
2319 printf("Unsupported opcode %i (%s) in fragment shader\n",
2320 inst->opcode, inst->opcode < MAX_OPCODE ?
2321 _mesa_opcode_string(inst->opcode) :
2322 "unknown");
2323 }
2324
2325 for (i = 0; i < 4; i++)
2326 if (inst->dst[i] && inst->dst[i]->spill_slot)
2327 emit_spill(c,
2328 inst->dst[i]->hw_reg,
2329 inst->dst[i]->spill_slot);
2330 }
2331
2332 /* Only properly tested on ILK */
2333 if (p->brw->intel.gen == 5) {
2334 brw_remove_duplicate_mrf_moves(c);
2335 brw_remove_mrf_to_grf_moves(c);
2336 }
2337
2338 if (INTEL_DEBUG & DEBUG_WM) {
2339 int i;
2340
2341 printf("wm-native:\n");
2342 for (i = 0; i < p->nr_insn; i++)
2343 brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
2344 printf("\n");
2345 }
2346 }
2347