i965: Add support for OPCODE_SSG.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return the SrcReg index of the channels that can be immediate float operands
65 * instead of usage of PROGRAM_CONSTANT values through push/pull.
66 */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_XPD] = 2,
87 };
88
89 /* These opcodes get broken down in a way that allow two
90 * args to be immediates.
91 */
92 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
93 if (arg == 1 || arg == 2)
94 return GL_TRUE;
95 }
96
97 if (opcode > ARRAY_SIZE(opcode_array))
98 return GL_FALSE;
99
100 return arg == opcode_array[opcode] - 1;
101 }
102
103 /**
104 * Computes the screen-space x,y position of the pixels.
105 *
106 * This will be used by emit_delta_xy() or emit_wpos_xy() for
107 * interpolation of attributes.
108 *
109 * Payload R0:
110 *
111 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
112 * corresponding to each of the 16 execution channels.
113 * R0.1..8 -- ?
114 * R1.0 -- triangle vertex 0.X
115 * R1.1 -- triangle vertex 0.Y
116 * R1.2 -- tile 0 x,y coords (2 packed uwords)
117 * R1.3 -- tile 1 x,y coords (2 packed uwords)
118 * R1.4 -- tile 2 x,y coords (2 packed uwords)
119 * R1.5 -- tile 3 x,y coords (2 packed uwords)
120 * R1.6 -- ?
121 * R1.7 -- ?
122 * R1.8 -- ?
123 */
124 void emit_pixel_xy(struct brw_wm_compile *c,
125 const struct brw_reg *dst,
126 GLuint mask)
127 {
128 struct brw_compile *p = &c->func;
129 struct brw_reg r1 = brw_vec1_grf(1, 0);
130 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
131 struct brw_reg dst0_uw, dst1_uw;
132
133 brw_push_insn_state(p);
134 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
135
136 if (c->dispatch_width == 16) {
137 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
138 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
139 } else {
140 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
141 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
142 }
143
144 /* Calculate pixel centers by adding 1 or 0 to each of the
145 * micro-tile coordinates passed in r1.
146 */
147 if (mask & WRITEMASK_X) {
148 brw_ADD(p,
149 dst0_uw,
150 stride(suboffset(r1_uw, 4), 2, 4, 0),
151 brw_imm_v(0x10101010));
152 }
153
154 if (mask & WRITEMASK_Y) {
155 brw_ADD(p,
156 dst1_uw,
157 stride(suboffset(r1_uw,5), 2, 4, 0),
158 brw_imm_v(0x11001100));
159 }
160 brw_pop_insn_state(p);
161 }
162
163 /**
164 * Computes the screen-space x,y distance of the pixels from the start
165 * vertex.
166 *
167 * This will be used in linterp or pinterp with the start vertex value
168 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
169 * to produce interpolated attribute values.
170 */
171 void emit_delta_xy(struct brw_compile *p,
172 const struct brw_reg *dst,
173 GLuint mask,
174 const struct brw_reg *arg0)
175 {
176 struct brw_reg r1 = brw_vec1_grf(1, 0);
177
178 if (mask == 0)
179 return;
180
181 assert(mask == WRITEMASK_XY);
182
183 /* Calc delta X,Y by subtracting origin in r1 from the pixel
184 * centers produced by emit_pixel_xy().
185 */
186 brw_ADD(p,
187 dst[0],
188 retype(arg0[0], BRW_REGISTER_TYPE_UW),
189 negate(r1));
190 brw_ADD(p,
191 dst[1],
192 retype(arg0[1], BRW_REGISTER_TYPE_UW),
193 negate(suboffset(r1,1)));
194 }
195
196 /**
197 * Computes the pixel offset from the window origin for gl_FragCoord().
198 */
199 void emit_wpos_xy(struct brw_wm_compile *c,
200 const struct brw_reg *dst,
201 GLuint mask,
202 const struct brw_reg *arg0)
203 {
204 struct brw_compile *p = &c->func;
205
206 if (mask & WRITEMASK_X) {
207 if (c->fp->program.PixelCenterInteger) {
208 /* X' = X */
209 brw_MOV(p,
210 dst[0],
211 retype(arg0[0], BRW_REGISTER_TYPE_W));
212 } else {
213 /* X' = X + 0.5 */
214 brw_ADD(p,
215 dst[0],
216 retype(arg0[0], BRW_REGISTER_TYPE_W),
217 brw_imm_f(0.5));
218 }
219 }
220
221 if (mask & WRITEMASK_Y) {
222 if (c->fp->program.OriginUpperLeft) {
223 if (c->fp->program.PixelCenterInteger) {
224 /* Y' = Y */
225 brw_MOV(p,
226 dst[1],
227 retype(arg0[1], BRW_REGISTER_TYPE_W));
228 } else {
229 /* Y' = Y + 0.5 */
230 brw_ADD(p,
231 dst[1],
232 retype(arg0[1], BRW_REGISTER_TYPE_W),
233 brw_imm_f(0.5));
234 }
235 } else {
236 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
237
238 /* Y' = (height - 1) - Y + center */
239 brw_ADD(p,
240 dst[1],
241 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
242 brw_imm_f(c->key.drawable_height - 1 + center_offset));
243 }
244 }
245 }
246
247
248 void emit_pixel_w(struct brw_wm_compile *c,
249 const struct brw_reg *dst,
250 GLuint mask,
251 const struct brw_reg *arg0,
252 const struct brw_reg *deltas)
253 {
254 struct brw_compile *p = &c->func;
255 struct intel_context *intel = &p->brw->intel;
256
257 /* Don't need this if all you are doing is interpolating color, for
258 * instance.
259 */
260 if (mask & WRITEMASK_W) {
261 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
262
263 /* Calc 1/w - just linterp wpos[3] optimized by putting the
264 * result straight into a message reg.
265 */
266 if (can_do_pln(intel, deltas)) {
267 brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
268 } else {
269 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
270 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
271 }
272
273 /* Calc w */
274 if (c->dispatch_width == 16) {
275 brw_math_16(p, dst[3],
276 BRW_MATH_FUNCTION_INV,
277 BRW_MATH_SATURATE_NONE,
278 2, brw_null_reg(),
279 BRW_MATH_PRECISION_FULL);
280 } else {
281 brw_math(p, dst[3],
282 BRW_MATH_FUNCTION_INV,
283 BRW_MATH_SATURATE_NONE,
284 2, brw_null_reg(),
285 BRW_MATH_DATA_VECTOR,
286 BRW_MATH_PRECISION_FULL);
287 }
288 }
289 }
290
291
292 void emit_linterp(struct brw_compile *p,
293 const struct brw_reg *dst,
294 GLuint mask,
295 const struct brw_reg *arg0,
296 const struct brw_reg *deltas)
297 {
298 struct intel_context *intel = &p->brw->intel;
299 struct brw_reg interp[4];
300 GLuint nr = arg0[0].nr;
301 GLuint i;
302
303 interp[0] = brw_vec1_grf(nr, 0);
304 interp[1] = brw_vec1_grf(nr, 4);
305 interp[2] = brw_vec1_grf(nr+1, 0);
306 interp[3] = brw_vec1_grf(nr+1, 4);
307
308 for (i = 0; i < 4; i++) {
309 if (mask & (1<<i)) {
310 if (can_do_pln(intel, deltas)) {
311 brw_PLN(p, dst[i], interp[i], deltas[0]);
312 } else {
313 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
314 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
315 }
316 }
317 }
318 }
319
320
321 void emit_pinterp(struct brw_compile *p,
322 const struct brw_reg *dst,
323 GLuint mask,
324 const struct brw_reg *arg0,
325 const struct brw_reg *deltas,
326 const struct brw_reg *w)
327 {
328 struct intel_context *intel = &p->brw->intel;
329 struct brw_reg interp[4];
330 GLuint nr = arg0[0].nr;
331 GLuint i;
332
333 interp[0] = brw_vec1_grf(nr, 0);
334 interp[1] = brw_vec1_grf(nr, 4);
335 interp[2] = brw_vec1_grf(nr+1, 0);
336 interp[3] = brw_vec1_grf(nr+1, 4);
337
338 for (i = 0; i < 4; i++) {
339 if (mask & (1<<i)) {
340 if (can_do_pln(intel, deltas)) {
341 brw_PLN(p, dst[i], interp[i], deltas[0]);
342 } else {
343 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
344 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
345 }
346 }
347 }
348 for (i = 0; i < 4; i++) {
349 if (mask & (1<<i)) {
350 brw_MUL(p, dst[i], dst[i], w[3]);
351 }
352 }
353 }
354
355
356 void emit_cinterp(struct brw_compile *p,
357 const struct brw_reg *dst,
358 GLuint mask,
359 const struct brw_reg *arg0)
360 {
361 struct brw_reg interp[4];
362 GLuint nr = arg0[0].nr;
363 GLuint i;
364
365 interp[0] = brw_vec1_grf(nr, 0);
366 interp[1] = brw_vec1_grf(nr, 4);
367 interp[2] = brw_vec1_grf(nr+1, 0);
368 interp[3] = brw_vec1_grf(nr+1, 4);
369
370 for (i = 0; i < 4; i++) {
371 if (mask & (1<<i)) {
372 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
373 }
374 }
375 }
376
377 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
378 void emit_frontfacing(struct brw_compile *p,
379 const struct brw_reg *dst,
380 GLuint mask)
381 {
382 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
383 GLuint i;
384
385 if (!(mask & WRITEMASK_XYZW))
386 return;
387
388 for (i = 0; i < 4; i++) {
389 if (mask & (1<<i)) {
390 brw_MOV(p, dst[i], brw_imm_f(0.0));
391 }
392 }
393
394 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
395 * us front face
396 */
397 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
398 for (i = 0; i < 4; i++) {
399 if (mask & (1<<i)) {
400 brw_MOV(p, dst[i], brw_imm_f(1.0));
401 }
402 }
403 brw_set_predicate_control_flag_value(p, 0xff);
404 }
405
406 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
407 * looking like:
408 *
409 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
410 *
411 * and we're trying to produce:
412 *
413 * DDX DDY
414 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
415 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
416 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
417 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
418 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
419 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
420 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
421 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
422 *
423 * and add another set of two more subspans if in 16-pixel dispatch mode.
424 *
425 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
426 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
427 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
428 * between each other. We could probably do it like ddx and swizzle the right
429 * order later, but bail for now and just produce
430 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
431 */
432 void emit_ddxy(struct brw_compile *p,
433 const struct brw_reg *dst,
434 GLuint mask,
435 GLboolean is_ddx,
436 const struct brw_reg *arg0)
437 {
438 int i;
439 struct brw_reg src0, src1;
440
441 if (mask & SATURATE)
442 brw_set_saturate(p, 1);
443 for (i = 0; i < 4; i++ ) {
444 if (mask & (1<<i)) {
445 if (is_ddx) {
446 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
447 BRW_REGISTER_TYPE_F,
448 BRW_VERTICAL_STRIDE_2,
449 BRW_WIDTH_2,
450 BRW_HORIZONTAL_STRIDE_0,
451 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
452 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
453 BRW_REGISTER_TYPE_F,
454 BRW_VERTICAL_STRIDE_2,
455 BRW_WIDTH_2,
456 BRW_HORIZONTAL_STRIDE_0,
457 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
458 } else {
459 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
460 BRW_REGISTER_TYPE_F,
461 BRW_VERTICAL_STRIDE_4,
462 BRW_WIDTH_4,
463 BRW_HORIZONTAL_STRIDE_0,
464 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
465 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
466 BRW_REGISTER_TYPE_F,
467 BRW_VERTICAL_STRIDE_4,
468 BRW_WIDTH_4,
469 BRW_HORIZONTAL_STRIDE_0,
470 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
471 }
472 brw_ADD(p, dst[i], src0, negate(src1));
473 }
474 }
475 if (mask & SATURATE)
476 brw_set_saturate(p, 0);
477 }
478
479 void emit_alu1(struct brw_compile *p,
480 struct brw_instruction *(*func)(struct brw_compile *,
481 struct brw_reg,
482 struct brw_reg),
483 const struct brw_reg *dst,
484 GLuint mask,
485 const struct brw_reg *arg0)
486 {
487 GLuint i;
488
489 if (mask & SATURATE)
490 brw_set_saturate(p, 1);
491
492 for (i = 0; i < 4; i++) {
493 if (mask & (1<<i)) {
494 func(p, dst[i], arg0[i]);
495 }
496 }
497
498 if (mask & SATURATE)
499 brw_set_saturate(p, 0);
500 }
501
502
503 void emit_alu2(struct brw_compile *p,
504 struct brw_instruction *(*func)(struct brw_compile *,
505 struct brw_reg,
506 struct brw_reg,
507 struct brw_reg),
508 const struct brw_reg *dst,
509 GLuint mask,
510 const struct brw_reg *arg0,
511 const struct brw_reg *arg1)
512 {
513 GLuint i;
514
515 if (mask & SATURATE)
516 brw_set_saturate(p, 1);
517
518 for (i = 0; i < 4; i++) {
519 if (mask & (1<<i)) {
520 func(p, dst[i], arg0[i], arg1[i]);
521 }
522 }
523
524 if (mask & SATURATE)
525 brw_set_saturate(p, 0);
526 }
527
528
529 void emit_mad(struct brw_compile *p,
530 const struct brw_reg *dst,
531 GLuint mask,
532 const struct brw_reg *arg0,
533 const struct brw_reg *arg1,
534 const struct brw_reg *arg2)
535 {
536 GLuint i;
537
538 for (i = 0; i < 4; i++) {
539 if (mask & (1<<i)) {
540 brw_MUL(p, dst[i], arg0[i], arg1[i]);
541
542 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
543 brw_ADD(p, dst[i], dst[i], arg2[i]);
544 brw_set_saturate(p, 0);
545 }
546 }
547 }
548
549 void emit_lrp(struct brw_compile *p,
550 const struct brw_reg *dst,
551 GLuint mask,
552 const struct brw_reg *arg0,
553 const struct brw_reg *arg1,
554 const struct brw_reg *arg2)
555 {
556 GLuint i;
557
558 /* Uses dst as a temporary:
559 */
560 for (i = 0; i < 4; i++) {
561 if (mask & (1<<i)) {
562 /* Can I use the LINE instruction for this?
563 */
564 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
565 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
566
567 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
568 brw_MAC(p, dst[i], arg0[i], arg1[i]);
569 brw_set_saturate(p, 0);
570 }
571 }
572 }
573
574 void emit_sop(struct brw_compile *p,
575 const struct brw_reg *dst,
576 GLuint mask,
577 GLuint cond,
578 const struct brw_reg *arg0,
579 const struct brw_reg *arg1)
580 {
581 GLuint i;
582
583 for (i = 0; i < 4; i++) {
584 if (mask & (1<<i)) {
585 brw_push_insn_state(p);
586 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
587 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
588 brw_MOV(p, dst[i], brw_imm_f(0));
589 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
590 brw_MOV(p, dst[i], brw_imm_f(1.0));
591 brw_pop_insn_state(p);
592 }
593 }
594 }
595
596 static void emit_slt( struct brw_compile *p,
597 const struct brw_reg *dst,
598 GLuint mask,
599 const struct brw_reg *arg0,
600 const struct brw_reg *arg1 )
601 {
602 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
603 }
604
605 static void emit_sle( struct brw_compile *p,
606 const struct brw_reg *dst,
607 GLuint mask,
608 const struct brw_reg *arg0,
609 const struct brw_reg *arg1 )
610 {
611 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
612 }
613
614 static void emit_sgt( struct brw_compile *p,
615 const struct brw_reg *dst,
616 GLuint mask,
617 const struct brw_reg *arg0,
618 const struct brw_reg *arg1 )
619 {
620 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
621 }
622
623 static void emit_sge( struct brw_compile *p,
624 const struct brw_reg *dst,
625 GLuint mask,
626 const struct brw_reg *arg0,
627 const struct brw_reg *arg1 )
628 {
629 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
630 }
631
632 static void emit_seq( struct brw_compile *p,
633 const struct brw_reg *dst,
634 GLuint mask,
635 const struct brw_reg *arg0,
636 const struct brw_reg *arg1 )
637 {
638 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
639 }
640
641 static void emit_sne( struct brw_compile *p,
642 const struct brw_reg *dst,
643 GLuint mask,
644 const struct brw_reg *arg0,
645 const struct brw_reg *arg1 )
646 {
647 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
648 }
649
650 void emit_cmp(struct brw_compile *p,
651 const struct brw_reg *dst,
652 GLuint mask,
653 const struct brw_reg *arg0,
654 const struct brw_reg *arg1,
655 const struct brw_reg *arg2)
656 {
657 GLuint i;
658
659 for (i = 0; i < 4; i++) {
660 if (mask & (1<<i)) {
661 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
662
663 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
664 brw_SEL(p, dst[i], arg1[i], arg2[i]);
665 brw_set_saturate(p, 0);
666 brw_set_predicate_control_flag_value(p, 0xff);
667 }
668 }
669 }
670
671 void emit_sign(struct brw_compile *p,
672 const struct brw_reg *dst,
673 GLuint mask,
674 const struct brw_reg *arg0)
675 {
676 GLuint i;
677
678 for (i = 0; i < 4; i++) {
679 if (mask & (1<<i)) {
680 brw_MOV(p, dst[i], brw_imm_f(0.0));
681
682 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
683 brw_MOV(p, dst[i], brw_imm_f(-1.0));
684 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
685
686 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
687 brw_MOV(p, dst[i], brw_imm_f(1.0));
688 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
689 }
690 }
691 }
692
693 void emit_max(struct brw_compile *p,
694 const struct brw_reg *dst,
695 GLuint mask,
696 const struct brw_reg *arg0,
697 const struct brw_reg *arg1)
698 {
699 GLuint i;
700
701 for (i = 0; i < 4; i++) {
702 if (mask & (1<<i)) {
703 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
704
705 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
706 brw_SEL(p, dst[i], arg0[i], arg1[i]);
707 brw_set_saturate(p, 0);
708 brw_set_predicate_control_flag_value(p, 0xff);
709 }
710 }
711 }
712
713 void emit_min(struct brw_compile *p,
714 const struct brw_reg *dst,
715 GLuint mask,
716 const struct brw_reg *arg0,
717 const struct brw_reg *arg1)
718 {
719 GLuint i;
720
721 for (i = 0; i < 4; i++) {
722 if (mask & (1<<i)) {
723 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
724
725 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
726 brw_SEL(p, dst[i], arg0[i], arg1[i]);
727 brw_set_saturate(p, 0);
728 brw_set_predicate_control_flag_value(p, 0xff);
729 }
730 }
731 }
732
733
734 void emit_dp3(struct brw_compile *p,
735 const struct brw_reg *dst,
736 GLuint mask,
737 const struct brw_reg *arg0,
738 const struct brw_reg *arg1)
739 {
740 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
741
742 if (!(mask & WRITEMASK_XYZW))
743 return; /* Do not emit dead code */
744
745 assert(is_power_of_two(mask & WRITEMASK_XYZW));
746
747 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
748 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
749
750 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
751 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
752 brw_set_saturate(p, 0);
753 }
754
755
756 void emit_dp4(struct brw_compile *p,
757 const struct brw_reg *dst,
758 GLuint mask,
759 const struct brw_reg *arg0,
760 const struct brw_reg *arg1)
761 {
762 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
763
764 if (!(mask & WRITEMASK_XYZW))
765 return; /* Do not emit dead code */
766
767 assert(is_power_of_two(mask & WRITEMASK_XYZW));
768
769 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
770 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
771 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
772
773 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
774 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
775 brw_set_saturate(p, 0);
776 }
777
778
779 void emit_dph(struct brw_compile *p,
780 const struct brw_reg *dst,
781 GLuint mask,
782 const struct brw_reg *arg0,
783 const struct brw_reg *arg1)
784 {
785 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
786
787 if (!(mask & WRITEMASK_XYZW))
788 return; /* Do not emit dead code */
789
790 assert(is_power_of_two(mask & WRITEMASK_XYZW));
791
792 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
793 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
794 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
795
796 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
797 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
798 brw_set_saturate(p, 0);
799 }
800
801
802 void emit_xpd(struct brw_compile *p,
803 const struct brw_reg *dst,
804 GLuint mask,
805 const struct brw_reg *arg0,
806 const struct brw_reg *arg1)
807 {
808 GLuint i;
809
810 assert((mask & WRITEMASK_W) != WRITEMASK_W);
811
812 for (i = 0 ; i < 3; i++) {
813 if (mask & (1<<i)) {
814 GLuint i2 = (i+2)%3;
815 GLuint i1 = (i+1)%3;
816
817 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
818
819 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
820 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
821 brw_set_saturate(p, 0);
822 }
823 }
824 }
825
826
827 void emit_math1(struct brw_wm_compile *c,
828 GLuint function,
829 const struct brw_reg *dst,
830 GLuint mask,
831 const struct brw_reg *arg0)
832 {
833 struct brw_compile *p = &c->func;
834 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
835 GLuint saturate = ((mask & SATURATE) ?
836 BRW_MATH_SATURATE_SATURATE :
837 BRW_MATH_SATURATE_NONE);
838
839 if (!(mask & WRITEMASK_XYZW))
840 return; /* Do not emit dead code */
841
842 assert(is_power_of_two(mask & WRITEMASK_XYZW));
843
844 /* If compressed, this will write message reg 2,3 from arg0.x's 16
845 * channels.
846 */
847 brw_MOV(p, brw_message_reg(2), arg0[0]);
848
849 /* Send two messages to perform all 16 operations:
850 */
851 brw_push_insn_state(p);
852 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
853 brw_math(p,
854 dst[dst_chan],
855 function,
856 saturate,
857 2,
858 brw_null_reg(),
859 BRW_MATH_DATA_VECTOR,
860 BRW_MATH_PRECISION_FULL);
861
862 if (c->dispatch_width == 16) {
863 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
864 brw_math(p,
865 offset(dst[dst_chan],1),
866 function,
867 saturate,
868 3,
869 brw_null_reg(),
870 BRW_MATH_DATA_VECTOR,
871 BRW_MATH_PRECISION_FULL);
872 }
873 brw_pop_insn_state(p);
874 }
875
876
877 void emit_math2(struct brw_wm_compile *c,
878 GLuint function,
879 const struct brw_reg *dst,
880 GLuint mask,
881 const struct brw_reg *arg0,
882 const struct brw_reg *arg1)
883 {
884 struct brw_compile *p = &c->func;
885 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
886 GLuint saturate = ((mask & SATURATE) ?
887 BRW_MATH_SATURATE_SATURATE :
888 BRW_MATH_SATURATE_NONE);
889
890 if (!(mask & WRITEMASK_XYZW))
891 return; /* Do not emit dead code */
892
893 assert(is_power_of_two(mask & WRITEMASK_XYZW));
894
895 brw_push_insn_state(p);
896
897 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
898 brw_MOV(p, brw_message_reg(2), arg0[0]);
899 if (c->dispatch_width == 16) {
900 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
901 brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
902 }
903
904 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
905 brw_MOV(p, brw_message_reg(3), arg1[0]);
906 if (c->dispatch_width == 16) {
907 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
908 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
909 }
910
911 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
912 brw_math(p,
913 dst[dst_chan],
914 function,
915 saturate,
916 2,
917 brw_null_reg(),
918 BRW_MATH_DATA_VECTOR,
919 BRW_MATH_PRECISION_FULL);
920
921 /* Send two messages to perform all 16 operations:
922 */
923 if (c->dispatch_width == 16) {
924 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
925 brw_math(p,
926 offset(dst[dst_chan],1),
927 function,
928 saturate,
929 4,
930 brw_null_reg(),
931 BRW_MATH_DATA_VECTOR,
932 BRW_MATH_PRECISION_FULL);
933 }
934 brw_pop_insn_state(p);
935 }
936
937
938 void emit_tex(struct brw_wm_compile *c,
939 struct brw_reg *dst,
940 GLuint dst_flags,
941 struct brw_reg *arg,
942 struct brw_reg depth_payload,
943 GLuint tex_idx,
944 GLuint sampler,
945 GLboolean shadow)
946 {
947 struct brw_compile *p = &c->func;
948 struct intel_context *intel = &p->brw->intel;
949 struct brw_reg dst_retyped;
950 GLuint cur_mrf = 2, response_length;
951 GLuint i, nr_texcoords;
952 GLuint emit;
953 GLuint msg_type;
954 GLuint mrf_per_channel;
955 GLuint simd_mode;
956
957 if (c->dispatch_width == 16) {
958 mrf_per_channel = 2;
959 response_length = 8;
960 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
961 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
962 } else {
963 mrf_per_channel = 1;
964 response_length = 4;
965 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
966 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
967 }
968
969 /* How many input regs are there?
970 */
971 switch (tex_idx) {
972 case TEXTURE_1D_INDEX:
973 emit = WRITEMASK_X;
974 nr_texcoords = 1;
975 break;
976 case TEXTURE_2D_INDEX:
977 case TEXTURE_RECT_INDEX:
978 emit = WRITEMASK_XY;
979 nr_texcoords = 2;
980 break;
981 case TEXTURE_3D_INDEX:
982 case TEXTURE_CUBE_INDEX:
983 emit = WRITEMASK_XYZ;
984 nr_texcoords = 3;
985 break;
986 default:
987 /* unexpected target */
988 abort();
989 }
990
991 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
992 if (intel->gen < 5 && c->dispatch_width == 8)
993 nr_texcoords = 3;
994
995 /* For shadow comparisons, we have to supply u,v,r. */
996 if (shadow)
997 nr_texcoords = 3;
998
999 /* Emit the texcoords. */
1000 for (i = 0; i < nr_texcoords; i++) {
1001 if (emit & (1<<i))
1002 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1003 else
1004 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1005 cur_mrf += mrf_per_channel;
1006 }
1007
1008 /* Fill in the shadow comparison reference value. */
1009 if (shadow) {
1010 if (intel->gen == 5) {
1011 /* Fill in the cube map array index value. */
1012 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1013 cur_mrf += mrf_per_channel;
1014 } else if (c->dispatch_width == 8) {
1015 /* Fill in the LOD bias value. */
1016 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1017 cur_mrf += mrf_per_channel;
1018 }
1019 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1020 cur_mrf += mrf_per_channel;
1021 }
1022
1023 if (intel->gen == 5) {
1024 if (shadow)
1025 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1026 else
1027 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1028 } else {
1029 /* Note that G45 and older determines shadow compare and dispatch width
1030 * from message length for most messages.
1031 */
1032 if (c->dispatch_width == 16 && shadow)
1033 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1034 else
1035 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1036 }
1037
1038 brw_SAMPLE(p,
1039 dst_retyped,
1040 1,
1041 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1042 SURF_INDEX_TEXTURE(sampler),
1043 sampler,
1044 dst_flags & WRITEMASK_XYZW,
1045 msg_type,
1046 response_length,
1047 cur_mrf - 1,
1048 0,
1049 1,
1050 simd_mode);
1051 }
1052
1053
1054 void emit_txb(struct brw_wm_compile *c,
1055 struct brw_reg *dst,
1056 GLuint dst_flags,
1057 struct brw_reg *arg,
1058 struct brw_reg depth_payload,
1059 GLuint tex_idx,
1060 GLuint sampler)
1061 {
1062 struct brw_compile *p = &c->func;
1063 struct intel_context *intel = &p->brw->intel;
1064 GLuint msgLength;
1065 GLuint msg_type;
1066 GLuint mrf_per_channel;
1067 GLuint response_length;
1068 struct brw_reg dst_retyped;
1069
1070 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1071 * samples, so we'll use the 16-wide instruction, leave the second halves
1072 * undefined, and trust the execution mask to keep the undefined pixels
1073 * from mattering.
1074 */
1075 if (c->dispatch_width == 16 || intel->gen < 5) {
1076 if (intel->gen == 5)
1077 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1078 else
1079 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1080 mrf_per_channel = 2;
1081 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1082 response_length = 8;
1083 } else {
1084 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1085 mrf_per_channel = 1;
1086 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1087 response_length = 4;
1088 }
1089
1090 /* Shadow ignored for txb. */
1091 switch (tex_idx) {
1092 case TEXTURE_1D_INDEX:
1093 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1094 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1095 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1096 break;
1097 case TEXTURE_2D_INDEX:
1098 case TEXTURE_RECT_INDEX:
1099 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1100 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1101 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1102 break;
1103 case TEXTURE_3D_INDEX:
1104 case TEXTURE_CUBE_INDEX:
1105 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1106 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1107 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1108 break;
1109 default:
1110 /* unexpected target */
1111 abort();
1112 }
1113
1114 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1115 msgLength = 2 + 4 * mrf_per_channel - 1;
1116
1117 brw_SAMPLE(p,
1118 dst_retyped,
1119 1,
1120 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1121 SURF_INDEX_TEXTURE(sampler),
1122 sampler,
1123 dst_flags & WRITEMASK_XYZW,
1124 msg_type,
1125 response_length,
1126 msgLength,
1127 0,
1128 1,
1129 BRW_SAMPLER_SIMD_MODE_SIMD16);
1130 }
1131
1132
1133 static void emit_lit(struct brw_wm_compile *c,
1134 const struct brw_reg *dst,
1135 GLuint mask,
1136 const struct brw_reg *arg0)
1137 {
1138 struct brw_compile *p = &c->func;
1139
1140 assert((mask & WRITEMASK_XW) == 0);
1141
1142 if (mask & WRITEMASK_Y) {
1143 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1144 brw_MOV(p, dst[1], arg0[0]);
1145 brw_set_saturate(p, 0);
1146 }
1147
1148 if (mask & WRITEMASK_Z) {
1149 emit_math2(c, BRW_MATH_FUNCTION_POW,
1150 &dst[2],
1151 WRITEMASK_X | (mask & SATURATE),
1152 &arg0[1],
1153 &arg0[3]);
1154 }
1155
1156 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1157 * some of the POW calculations above, but 16-wide iff statements
1158 * seem to lock c1 hardware, so this is a nasty workaround:
1159 */
1160 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1161 {
1162 if (mask & WRITEMASK_Y)
1163 brw_MOV(p, dst[1], brw_imm_f(0));
1164
1165 if (mask & WRITEMASK_Z)
1166 brw_MOV(p, dst[2], brw_imm_f(0));
1167 }
1168 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1169 }
1170
1171
1172 /* Kill pixel - set execution mask to zero for those pixels which
1173 * fail.
1174 */
1175 static void emit_kil( struct brw_wm_compile *c,
1176 struct brw_reg *arg0)
1177 {
1178 struct brw_compile *p = &c->func;
1179 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1180 GLuint i, j;
1181
1182 for (i = 0; i < 4; i++) {
1183 /* Check if we've already done the comparison for this reg
1184 * -- common when someone does KIL TEMP.wwww.
1185 */
1186 for (j = 0; j < i; j++) {
1187 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1188 break;
1189 }
1190 if (j != i)
1191 continue;
1192
1193 brw_push_insn_state(p);
1194 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1195 brw_set_predicate_control_flag_value(p, 0xff);
1196 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1197 brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1198 brw_pop_insn_state(p);
1199 }
1200 }
1201
1202 /* KIL_NV kills the pixels that are currently executing, not based on a test
1203 * of the arguments.
1204 */
1205 static void emit_kil_nv( struct brw_wm_compile *c )
1206 {
1207 struct brw_compile *p = &c->func;
1208 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1209
1210 brw_push_insn_state(p);
1211 brw_set_mask_control(p, BRW_MASK_DISABLE);
1212 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1213 brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1214 brw_pop_insn_state(p);
1215 }
1216
1217 static void fire_fb_write( struct brw_wm_compile *c,
1218 GLuint base_reg,
1219 GLuint nr,
1220 GLuint target,
1221 GLuint eot )
1222 {
1223 struct brw_compile *p = &c->func;
1224 struct brw_reg dst;
1225
1226 if (c->dispatch_width == 16)
1227 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1228 else
1229 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1230
1231 /* Pass through control information:
1232 */
1233 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1234 {
1235 brw_push_insn_state(p);
1236 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1237 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1238 brw_MOV(p,
1239 brw_message_reg(base_reg + 1),
1240 brw_vec8_grf(1, 0));
1241 brw_pop_insn_state(p);
1242 }
1243
1244 /* Send framebuffer write message: */
1245 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1246 brw_fb_WRITE(p,
1247 dst,
1248 base_reg,
1249 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1250 target,
1251 nr,
1252 0,
1253 eot);
1254 }
1255
1256
1257 static void emit_aa( struct brw_wm_compile *c,
1258 struct brw_reg *arg1,
1259 GLuint reg )
1260 {
1261 struct brw_compile *p = &c->func;
1262 GLuint comp = c->key.aa_dest_stencil_reg / 2;
1263 GLuint off = c->key.aa_dest_stencil_reg % 2;
1264 struct brw_reg aa = offset(arg1[comp], off);
1265
1266 brw_push_insn_state(p);
1267 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1268 brw_MOV(p, brw_message_reg(reg), aa);
1269 brw_pop_insn_state(p);
1270 }
1271
1272
1273 /* Post-fragment-program processing. Send the results to the
1274 * framebuffer.
1275 * \param arg0 the fragment color
1276 * \param arg1 the pass-through depth value
1277 * \param arg2 the shader-computed depth value
1278 */
1279 void emit_fb_write(struct brw_wm_compile *c,
1280 struct brw_reg *arg0,
1281 struct brw_reg *arg1,
1282 struct brw_reg *arg2,
1283 GLuint target,
1284 GLuint eot)
1285 {
1286 struct brw_compile *p = &c->func;
1287 struct brw_context *brw = p->brw;
1288 GLuint nr = 2;
1289 GLuint channel;
1290
1291 /* Reserve a space for AA - may not be needed:
1292 */
1293 if (c->key.aa_dest_stencil_reg)
1294 nr += 1;
1295
1296 /* I don't really understand how this achieves the color interleave
1297 * (ie RGBARGBA) in the result: [Do the saturation here]
1298 */
1299 brw_push_insn_state(p);
1300
1301 for (channel = 0; channel < 4; channel++) {
1302 if (c->dispatch_width == 16 && brw->has_compr4) {
1303 /* By setting the high bit of the MRF register number, we indicate
1304 * that we want COMPR4 mode - instead of doing the usual destination
1305 * + 1 for the second half we get destination + 4.
1306 */
1307 brw_MOV(p,
1308 brw_message_reg(nr + channel + (1 << 7)),
1309 arg0[channel]);
1310 } else {
1311 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1312 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1313 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1314 brw_MOV(p,
1315 brw_message_reg(nr + channel),
1316 arg0[channel]);
1317
1318 if (c->dispatch_width == 16) {
1319 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1320 brw_MOV(p,
1321 brw_message_reg(nr + channel + 4),
1322 sechalf(arg0[channel]));
1323 }
1324 }
1325 }
1326 /* skip over the regs populated above:
1327 */
1328 nr += 8;
1329 brw_pop_insn_state(p);
1330
1331 if (c->key.source_depth_to_render_target)
1332 {
1333 if (c->key.computes_depth)
1334 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1335 else
1336 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1337
1338 nr += 2;
1339 }
1340
1341 if (c->key.dest_depth_reg)
1342 {
1343 GLuint comp = c->key.dest_depth_reg / 2;
1344 GLuint off = c->key.dest_depth_reg % 2;
1345
1346 if (off != 0) {
1347 brw_push_insn_state(p);
1348 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1349
1350 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1351 /* 2nd half? */
1352 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1353 brw_pop_insn_state(p);
1354 }
1355 else {
1356 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1357 }
1358 nr += 2;
1359 }
1360
1361 if (!c->key.runtime_check_aads_emit) {
1362 if (c->key.aa_dest_stencil_reg)
1363 emit_aa(c, arg1, 2);
1364
1365 fire_fb_write(c, 0, nr, target, eot);
1366 }
1367 else {
1368 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1369 struct brw_reg ip = brw_ip_reg();
1370 struct brw_instruction *jmp;
1371
1372 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1373 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1374 brw_AND(p,
1375 v1_null_ud,
1376 get_element_ud(brw_vec8_grf(1,0), 6),
1377 brw_imm_ud(1<<26));
1378
1379 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1380 {
1381 emit_aa(c, arg1, 2);
1382 fire_fb_write(c, 0, nr, target, eot);
1383 /* note - thread killed in subroutine */
1384 }
1385 brw_land_fwd_jump(p, jmp);
1386
1387 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1388 */
1389 fire_fb_write(c, 1, nr-1, target, eot);
1390 }
1391 }
1392
1393 /**
1394 * Move a GPR to scratch memory.
1395 */
1396 static void emit_spill( struct brw_wm_compile *c,
1397 struct brw_reg reg,
1398 GLuint slot )
1399 {
1400 struct brw_compile *p = &c->func;
1401
1402 /*
1403 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1404 */
1405 brw_MOV(p, brw_message_reg(2), reg);
1406
1407 /*
1408 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1409 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1410 */
1411 brw_dp_WRITE_16(p,
1412 retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1413 slot);
1414 }
1415
1416
1417 /**
1418 * Load a GPR from scratch memory.
1419 */
1420 static void emit_unspill( struct brw_wm_compile *c,
1421 struct brw_reg reg,
1422 GLuint slot )
1423 {
1424 struct brw_compile *p = &c->func;
1425
1426 /* Slot 0 is the undef value.
1427 */
1428 if (slot == 0) {
1429 brw_MOV(p, reg, brw_imm_f(0));
1430 return;
1431 }
1432
1433 /*
1434 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1435 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1436 */
1437
1438 brw_dp_READ_16(p,
1439 retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1440 slot);
1441 }
1442
1443
1444 /**
1445 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1446 * Args with unspill_reg != 0 will be loaded from scratch memory.
1447 */
1448 static void get_argument_regs( struct brw_wm_compile *c,
1449 struct brw_wm_ref *arg[],
1450 struct brw_reg *regs )
1451 {
1452 GLuint i;
1453
1454 for (i = 0; i < 4; i++) {
1455 if (arg[i]) {
1456 if (arg[i]->unspill_reg)
1457 emit_unspill(c,
1458 brw_vec8_grf(arg[i]->unspill_reg, 0),
1459 arg[i]->value->spill_slot);
1460
1461 regs[i] = arg[i]->hw_reg;
1462 }
1463 else {
1464 regs[i] = brw_null_reg();
1465 }
1466 }
1467 }
1468
1469
1470 /**
1471 * For values that have a spill_slot!=0, write those regs to scratch memory.
1472 */
1473 static void spill_values( struct brw_wm_compile *c,
1474 struct brw_wm_value *values,
1475 GLuint nr )
1476 {
1477 GLuint i;
1478
1479 for (i = 0; i < nr; i++)
1480 if (values[i].spill_slot)
1481 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1482 }
1483
1484
1485 /* Emit the fragment program instructions here.
1486 */
1487 void brw_wm_emit( struct brw_wm_compile *c )
1488 {
1489 struct brw_compile *p = &c->func;
1490 GLuint insn;
1491
1492 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1493
1494 /* Check if any of the payload regs need to be spilled:
1495 */
1496 spill_values(c, c->payload.depth, 4);
1497 spill_values(c, c->creg, c->nr_creg);
1498 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1499
1500
1501 for (insn = 0; insn < c->nr_insns; insn++) {
1502
1503 struct brw_wm_instruction *inst = &c->instruction[insn];
1504 struct brw_reg args[3][4], dst[4];
1505 GLuint i, dst_flags;
1506
1507 /* Get argument regs:
1508 */
1509 for (i = 0; i < 3; i++)
1510 get_argument_regs(c, inst->src[i], args[i]);
1511
1512 /* Get dest regs:
1513 */
1514 for (i = 0; i < 4; i++)
1515 if (inst->dst[i])
1516 dst[i] = inst->dst[i]->hw_reg;
1517 else
1518 dst[i] = brw_null_reg();
1519
1520 /* Flags
1521 */
1522 dst_flags = inst->writemask;
1523 if (inst->saturate)
1524 dst_flags |= SATURATE;
1525
1526 switch (inst->opcode) {
1527 /* Generated instructions for calculating triangle interpolants:
1528 */
1529 case WM_PIXELXY:
1530 emit_pixel_xy(c, dst, dst_flags);
1531 break;
1532
1533 case WM_DELTAXY:
1534 emit_delta_xy(p, dst, dst_flags, args[0]);
1535 break;
1536
1537 case WM_WPOSXY:
1538 emit_wpos_xy(c, dst, dst_flags, args[0]);
1539 break;
1540
1541 case WM_PIXELW:
1542 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1543 break;
1544
1545 case WM_LINTERP:
1546 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1547 break;
1548
1549 case WM_PINTERP:
1550 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1551 break;
1552
1553 case WM_CINTERP:
1554 emit_cinterp(p, dst, dst_flags, args[0]);
1555 break;
1556
1557 case WM_FB_WRITE:
1558 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1559 break;
1560
1561 case WM_FRONTFACING:
1562 emit_frontfacing(p, dst, dst_flags);
1563 break;
1564
1565 /* Straightforward arithmetic:
1566 */
1567 case OPCODE_ADD:
1568 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1569 break;
1570
1571 case OPCODE_FRC:
1572 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1573 break;
1574
1575 case OPCODE_FLR:
1576 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1577 break;
1578
1579 case OPCODE_DDX:
1580 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1581 break;
1582
1583 case OPCODE_DDY:
1584 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1585 break;
1586
1587 case OPCODE_DP3:
1588 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1589 break;
1590
1591 case OPCODE_DP4:
1592 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1593 break;
1594
1595 case OPCODE_DPH:
1596 emit_dph(p, dst, dst_flags, args[0], args[1]);
1597 break;
1598
1599 case OPCODE_TRUNC:
1600 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1601 break;
1602
1603 case OPCODE_LRP:
1604 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1605 break;
1606
1607 case OPCODE_MAD:
1608 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1609 break;
1610
1611 case OPCODE_MOV:
1612 case OPCODE_SWZ:
1613 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1614 break;
1615
1616 case OPCODE_MUL:
1617 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1618 break;
1619
1620 case OPCODE_XPD:
1621 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1622 break;
1623
1624 /* Higher math functions:
1625 */
1626 case OPCODE_RCP:
1627 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1628 break;
1629
1630 case OPCODE_RSQ:
1631 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1632 break;
1633
1634 case OPCODE_SIN:
1635 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1636 break;
1637
1638 case OPCODE_COS:
1639 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1640 break;
1641
1642 case OPCODE_EX2:
1643 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1644 break;
1645
1646 case OPCODE_LG2:
1647 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1648 break;
1649
1650 case OPCODE_SCS:
1651 /* There is an scs math function, but it would need some
1652 * fixup for 16-element execution.
1653 */
1654 if (dst_flags & WRITEMASK_X)
1655 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1656 if (dst_flags & WRITEMASK_Y)
1657 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1658 break;
1659
1660 case OPCODE_POW:
1661 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1662 break;
1663
1664 /* Comparisons:
1665 */
1666 case OPCODE_CMP:
1667 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1668 break;
1669
1670 case OPCODE_MAX:
1671 emit_max(p, dst, dst_flags, args[0], args[1]);
1672 break;
1673
1674 case OPCODE_MIN:
1675 emit_min(p, dst, dst_flags, args[0], args[1]);
1676 break;
1677
1678 case OPCODE_SLT:
1679 emit_slt(p, dst, dst_flags, args[0], args[1]);
1680 break;
1681
1682 case OPCODE_SLE:
1683 emit_sle(p, dst, dst_flags, args[0], args[1]);
1684 break;
1685 case OPCODE_SGT:
1686 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1687 break;
1688 case OPCODE_SGE:
1689 emit_sge(p, dst, dst_flags, args[0], args[1]);
1690 break;
1691 case OPCODE_SEQ:
1692 emit_seq(p, dst, dst_flags, args[0], args[1]);
1693 break;
1694 case OPCODE_SNE:
1695 emit_sne(p, dst, dst_flags, args[0], args[1]);
1696 break;
1697
1698 case OPCODE_SSG:
1699 emit_sign(p, dst, dst_flags, args[0]);
1700 break;
1701
1702 case OPCODE_LIT:
1703 emit_lit(c, dst, dst_flags, args[0]);
1704 break;
1705
1706 /* Texturing operations:
1707 */
1708 case OPCODE_TEX:
1709 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1710 inst->tex_idx, inst->tex_unit,
1711 inst->tex_shadow);
1712 break;
1713
1714 case OPCODE_TXB:
1715 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1716 inst->tex_idx, inst->tex_unit);
1717 break;
1718
1719 case OPCODE_KIL:
1720 emit_kil(c, args[0]);
1721 break;
1722
1723 case OPCODE_KIL_NV:
1724 emit_kil_nv(c);
1725 break;
1726
1727 default:
1728 printf("Unsupported opcode %i (%s) in fragment shader\n",
1729 inst->opcode, inst->opcode < MAX_OPCODE ?
1730 _mesa_opcode_string(inst->opcode) :
1731 "unknown");
1732 }
1733
1734 for (i = 0; i < 4; i++)
1735 if (inst->dst[i] && inst->dst[i]->spill_slot)
1736 emit_spill(c,
1737 inst->dst[i]->hw_reg,
1738 inst->dst[i]->spill_slot);
1739 }
1740
1741 if (INTEL_DEBUG & DEBUG_WM) {
1742 int i;
1743
1744 printf("wm-native:\n");
1745 for (i = 0; i < p->nr_insn; i++)
1746 brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
1747 printf("\n");
1748 }
1749 }