i965/gen4: Fold WM surface state prepare()/emit() together.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_emit.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_emit.cpp
25 *
26 * This file supports emitting code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 extern "C" {
31 #include "main/macros.h"
32 #include "brw_context.h"
33 #include "brw_eu.h"
34 } /* extern "C" */
35
36 #include "brw_fs.h"
37 #include "glsl/ir_print_visitor.h"
38
/**
 * Generates the render target write send for an FS_OPCODE_FB_WRITE.
 *
 * When inst->header_present, the two-register message header is set up
 * first (under mask-disable, uncompressed), then the actual FB write
 * message is emitted from inst->base_mrf.
 */
void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   bool eot = inst->eot;
   struct brw_reg implied_header;

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (inst->header_present) {
      if (intel->gen >= 6) {
         /* On gen6+ the header is built directly in the MRF payload:
          * copy g0 into the first message register.
          */
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                           inst->base_mrf, 2),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         /* Header lives in the MRFs, so no implied header register. */
         implied_header = brw_null_reg();
      } else {
         /* Pre-gen6: g0 is supplied as the implied header source of the
          * send, and g1 is copied into the second message register here.
          */
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p,
                 brw_message_reg(inst->base_mrf + 1),
                 brw_vec8_grf(1, 0));
      }
   } else {
      implied_header = brw_null_reg();
   }

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
                c->dispatch_width,
                inst->base_mrf,
                implied_header,
                inst->target,
                inst->mlen,
                0,
                eot,
                inst->header_present);
}
92
93 /* Computes the integer pixel x,y values from the origin.
94 *
95 * This is the basis of gl_FragCoord computation, but is also used
96 * pre-gen6 for computing the deltas from v0 for computing
97 * interpolation.
98 */
99 void
100 fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
101 {
102 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
103 struct brw_reg src;
104 struct brw_reg deltas;
105
106 if (is_x) {
107 src = stride(suboffset(g1_uw, 4), 2, 4, 0);
108 deltas = brw_imm_v(0x10101010);
109 } else {
110 src = stride(suboffset(g1_uw, 5), 2, 4, 0);
111 deltas = brw_imm_v(0x11001100);
112 }
113
114 if (c->dispatch_width == 16) {
115 dst = vec16(dst);
116 }
117
118 /* We do this 8 or 16-wide, but since the destination is UW we
119 * don't do compression in the 16-wide case.
120 */
121 brw_push_insn_state(p);
122 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
123 brw_ADD(p, dst, src, deltas);
124 brw_pop_insn_state(p);
125 }
126
127 void
128 fs_visitor::generate_linterp(fs_inst *inst,
129 struct brw_reg dst, struct brw_reg *src)
130 {
131 struct brw_reg delta_x = src[0];
132 struct brw_reg delta_y = src[1];
133 struct brw_reg interp = src[2];
134
135 if (brw->has_pln &&
136 delta_y.nr == delta_x.nr + 1 &&
137 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
138 brw_PLN(p, dst, interp, delta_x);
139 } else {
140 brw_LINE(p, brw_null_reg(), interp, delta_x);
141 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
142 }
143 }
144
145 void
146 fs_visitor::generate_math1_gen6(fs_inst *inst,
147 struct brw_reg dst,
148 struct brw_reg src0)
149 {
150 int op = brw_math_function(inst->opcode);
151
152 assert(inst->mlen == 0);
153
154 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
155 brw_math(p, dst,
156 op,
157 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
158 BRW_MATH_SATURATE_NONE,
159 0, src0,
160 BRW_MATH_DATA_VECTOR,
161 BRW_MATH_PRECISION_FULL);
162
163 if (c->dispatch_width == 16) {
164 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
165 brw_math(p, sechalf(dst),
166 op,
167 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
168 BRW_MATH_SATURATE_NONE,
169 0, sechalf(src0),
170 BRW_MATH_DATA_VECTOR,
171 BRW_MATH_PRECISION_FULL);
172 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
173 }
174 }
175
176 void
177 fs_visitor::generate_math2_gen6(fs_inst *inst,
178 struct brw_reg dst,
179 struct brw_reg src0,
180 struct brw_reg src1)
181 {
182 int op = brw_math_function(inst->opcode);
183
184 assert(inst->mlen == 0);
185
186 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
187 brw_math2(p, dst, op, src0, src1);
188
189 if (c->dispatch_width == 16) {
190 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
191 brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
192 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
193 }
194 }
195
196 void
197 fs_visitor::generate_math_gen4(fs_inst *inst,
198 struct brw_reg dst,
199 struct brw_reg src)
200 {
201 int op = brw_math_function(inst->opcode);
202
203 assert(inst->mlen >= 1);
204
205 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
206 brw_math(p, dst,
207 op,
208 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
209 BRW_MATH_SATURATE_NONE,
210 inst->base_mrf, src,
211 BRW_MATH_DATA_VECTOR,
212 BRW_MATH_PRECISION_FULL);
213
214 if (c->dispatch_width == 16) {
215 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
216 brw_math(p, sechalf(dst),
217 op,
218 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
219 BRW_MATH_SATURATE_NONE,
220 inst->base_mrf + 1, sechalf(src),
221 BRW_MATH_DATA_VECTOR,
222 BRW_MATH_PRECISION_FULL);
223
224 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
225 }
226 }
227
/**
 * Emits the sampler send for a texturing opcode (TEX/TXB/TXL/TXS/TXD/TXF).
 *
 * Selects the per-generation sampler message type and SIMD mode, then
 * issues the SAMPLE message from inst->base_mrf.  Pre-gen5, message
 * length encodes shadow-compare/dispatch-width, hence the mlen asserts.
 */
void
fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   if (c->dispatch_width == 16)
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;

   if (intel->gen >= 5) {
      /* Gen5+ has explicit message types per operation/compare mode. */
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case FS_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case FS_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case FS_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         break;
      case FS_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      default:
         assert(!"not reached");
         break;
      }
   } else {
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         assert(c->dispatch_width == 8);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
         } else {
            assert(inst->mlen <= 4);
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case FS_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case FS_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case FS_OPCODE_TXF:
         assert(inst->mlen == 9);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case FS_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         assert(!"not reached");
         break;
      }
   }
   assert(msg_type != -1);

   /* SIMD16 messages return twice the data (8 regs for 4 components). */
   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              SURF_INDEX_TEXTURE(inst->sampler),
              inst->sampler,
              WRITEMASK_XYZW,
              msg_type,
              rlen,
              inst->mlen,
              inst->header_present,
              simd_mode);
}
349
350
351 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
352 * looking like:
353 *
354 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
355 *
356 * and we're trying to produce:
357 *
358 * DDX DDY
359 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
360 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
361 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
362 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
363 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
364 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
365 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
366 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
367 *
368 * and add another set of two more subspans if in 16-pixel dispatch mode.
369 *
370 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
371 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
372 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
373 * between each other. We could probably do it like ddx and swizzle the right
374 * order later, but bail for now and just produce
375 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
376 */
377 void
378 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
379 {
380 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
381 BRW_REGISTER_TYPE_F,
382 BRW_VERTICAL_STRIDE_2,
383 BRW_WIDTH_2,
384 BRW_HORIZONTAL_STRIDE_0,
385 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
386 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
387 BRW_REGISTER_TYPE_F,
388 BRW_VERTICAL_STRIDE_2,
389 BRW_WIDTH_2,
390 BRW_HORIZONTAL_STRIDE_0,
391 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
392 brw_ADD(p, dst, src0, negate(src1));
393 }
394
395 void
396 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
397 {
398 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
399 BRW_REGISTER_TYPE_F,
400 BRW_VERTICAL_STRIDE_4,
401 BRW_WIDTH_4,
402 BRW_HORIZONTAL_STRIDE_0,
403 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
404 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
405 BRW_REGISTER_TYPE_F,
406 BRW_VERTICAL_STRIDE_4,
407 BRW_WIDTH_4,
408 BRW_HORIZONTAL_STRIDE_0,
409 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
410 brw_ADD(p, dst, src0, negate(src1));
411 }
412
/**
 * Emits code for FS_OPCODE_DISCARD: knocks the discarded channels out
 * of the pixel mask that the eventual FB write will use.
 *
 * On gen6+ there is no mask register, so the live-channel mask is
 * reconstructed in the flag register and ANDed into g1.7; pre-gen6 the
 * inverted mask register is ANDed into g0 directly.
 */
void
fs_visitor::generate_discard(fs_inst *inst)
{
   struct brw_reg f0 = brw_flag_reg();

   if (intel->gen >= 6) {
      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
      struct brw_reg some_register;

      /* As of gen6, we no longer have the mask register to look at,
       * so life gets a bit more complicated.
       */

      /* Load the flag register with all ones. */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, f0, brw_imm_uw(0xffff));
      brw_pop_insn_state(p);

      /* Do a comparison that should always fail, to produce 0s in the flag
       * reg where we have active channels.
       */
      some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
              BRW_CONDITIONAL_NZ, some_register, some_register);

      /* Undo CMP's whacking of predication*/
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_AND(p, g1, f0, g1);
      brw_pop_insn_state(p);
   } else {
      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      /* Unlike the 965, we have the mask reg, so we just need
       * somewhere to invert that (containing channels to be disabled)
       * so it can be ANDed with the mask of pixels still to be
       * written. Use the flag reg for consistency with gen6+.
       */
      brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
      brw_AND(p, g0, f0, g0);

      brw_pop_insn_state(p);
   }
}
464
465 void
466 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
467 {
468 assert(inst->mlen != 0);
469
470 brw_MOV(p,
471 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
472 retype(src, BRW_REGISTER_TYPE_UD));
473 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
474 inst->offset);
475 }
476
477 void
478 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
479 {
480 assert(inst->mlen != 0);
481
482 /* Clear any post destination dependencies that would be ignored by
483 * the block read. See the B-Spec for pre-gen5 send instruction.
484 *
485 * This could use a better solution, since texture sampling and
486 * math reads could potentially run into it as well -- anywhere
487 * that we have a SEND with a destination that is a register that
488 * was written but not read within the last N instructions (what's
489 * N? unsure). This is rare because of dead code elimination, but
490 * not impossible.
491 */
492 if (intel->gen == 4 && !intel->is_g4x)
493 brw_MOV(p, brw_null_reg(), dst);
494
495 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
496 inst->offset);
497
498 if (intel->gen == 4 && !intel->is_g4x) {
499 /* gen4 errata: destination from a send can't be used as a
500 * destination until it's been read. Just read it so we don't
501 * have to worry.
502 */
503 brw_MOV(p, brw_null_reg(), dst);
504 }
505 }
506
507 void
508 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
509 {
510 assert(inst->mlen != 0);
511
512 /* Clear any post destination dependencies that would be ignored by
513 * the block read. See the B-Spec for pre-gen5 send instruction.
514 *
515 * This could use a better solution, since texture sampling and
516 * math reads could potentially run into it as well -- anywhere
517 * that we have a SEND with a destination that is a register that
518 * was written but not read within the last N instructions (what's
519 * N? unsure). This is rare because of dead code elimination, but
520 * not impossible.
521 */
522 if (intel->gen == 4 && !intel->is_g4x)
523 brw_MOV(p, brw_null_reg(), dst);
524
525 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
526 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
527
528 if (intel->gen == 4 && !intel->is_g4x) {
529 /* gen4 errata: destination from a send can't be used as a
530 * destination until it's been read. Just read it so we don't
531 * have to worry.
532 */
533 brw_MOV(p, brw_null_reg(), dst);
534 }
535 }
536
537 static struct brw_reg
538 brw_reg_from_fs_reg(fs_reg *reg)
539 {
540 struct brw_reg brw_reg;
541
542 switch (reg->file) {
543 case GRF:
544 case ARF:
545 case MRF:
546 if (reg->smear == -1) {
547 brw_reg = brw_vec8_reg(reg->file, reg->reg, 0);
548 } else {
549 brw_reg = brw_vec1_reg(reg->file, reg->reg, reg->smear);
550 }
551 brw_reg = retype(brw_reg, reg->type);
552 if (reg->sechalf)
553 brw_reg = sechalf(brw_reg);
554 break;
555 case IMM:
556 switch (reg->type) {
557 case BRW_REGISTER_TYPE_F:
558 brw_reg = brw_imm_f(reg->imm.f);
559 break;
560 case BRW_REGISTER_TYPE_D:
561 brw_reg = brw_imm_d(reg->imm.i);
562 break;
563 case BRW_REGISTER_TYPE_UD:
564 brw_reg = brw_imm_ud(reg->imm.u);
565 break;
566 default:
567 assert(!"not reached");
568 brw_reg = brw_null_reg();
569 break;
570 }
571 break;
572 case FIXED_HW_REG:
573 brw_reg = reg->fixed_hw_reg;
574 break;
575 case BAD_FILE:
576 /* Probably unused. */
577 brw_reg = brw_null_reg();
578 break;
579 case UNIFORM:
580 assert(!"not reached");
581 brw_reg = brw_null_reg();
582 break;
583 default:
584 assert(!"not reached");
585 brw_reg = brw_null_reg();
586 break;
587 }
588 if (reg->abs)
589 brw_reg = brw_abs(brw_reg);
590 if (reg->negate)
591 brw_reg = negate(brw_reg);
592
593 return brw_reg;
594 }
595
/**
 * Main code-generation loop: walks the FS LIR instruction list and
 * emits native instructions through the brw_eu assembler.
 *
 * Tracks a stack of DO instructions so that pre-gen6 BREAK/CONT jump
 * counts can be patched when the matching WHILE is emitted, and counts
 * IF nesting inside each loop for BREAK/CONT emission.
 */
void
fs_visitor::generate_code()
{
   int last_native_inst = p->nr_insn;
   const char *last_annotation_string = NULL;
   ir_instruction *last_annotation_ir = NULL;

   /* Growable stacks: the DO instruction for each open loop, and the
    * IF-nesting depth inside each of those loops.
    */
   int loop_stack_array_size = 16;
   int loop_stack_depth = 0;
   brw_instruction **loop_stack =
      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
   int *if_depth_in_loop =
      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);


   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
             prog->Name, c->dispatch_width);
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      struct brw_reg src[3], dst;

      /* Print the IR / annotation this instruction came from, once per
       * run of instructions sharing it.
       */
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf(" ");
               last_annotation_ir->print();
               printf("\n");
            }
         }
         if (last_annotation_string != inst->annotation) {
            last_annotation_string = inst->annotation;
            if (last_annotation_string)
               printf(" %s\n", last_annotation_string);
         }
      }

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);

         /* The accumulator result appears to get used for the
          * conditional modifier generation. When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      /* Set up the assembler's default instruction state from the LIR
       * instruction's flags before emitting.
       */
      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicated);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);

      if (inst->force_uncompressed || c->dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      } else if (inst->force_sechalf) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      } else {
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_set_acc_write_control(p, 1);
         brw_MACH(p, dst, src[0], src[1]);
         brw_set_acc_write_control(p, 0);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(intel->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
         }
         if_depth_in_loop[loop_stack_depth]++;
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         if_depth_in_loop[loop_stack_depth]--;
         break;

      case BRW_OPCODE_DO:
         /* Push the loop start and grow the stacks if needed. */
         loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
         if (loop_stack_array_size <= loop_stack_depth) {
            loop_stack_array_size *= 2;
            loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
                                  loop_stack_array_size);
            if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
                                        loop_stack_array_size);
         }
         if_depth_in_loop[loop_stack_depth] = 0;
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         /* FINISHME: We need to write the loop instruction support still. */
         if (intel->gen >= 6)
            gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
         else
            brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE: {
         struct brw_instruction *inst0, *inst1;
         GLuint br = 1;

         /* Jump counts are in units of 64 bits on gen5+, 128 pre-gen5. */
         if (intel->gen >= 5)
            br = 2;

         assert(loop_stack_depth > 0);
         loop_stack_depth--;
         inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
         if (intel->gen < 6) {
            /* patch all the BREAK/CONT instructions from last BGNLOOP */
            while (inst0 > loop_stack[loop_stack_depth]) {
               inst0--;
               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
                   inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               }
               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
                        inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
      }
      break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         if (intel->gen >= 6) {
            generate_math1_gen6(inst, dst, src[0]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         if (intel->gen >= 6) {
            generate_math2_gen6(inst, dst, src[0], src[1]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case FS_OPCODE_PIXEL_X:
         generate_pixel_xy(dst, true);
         break;
      case FS_OPCODE_PIXEL_Y:
         generate_pixel_xy(dst, false);
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case FS_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case FS_OPCODE_TXD:
      case FS_OPCODE_TXF:
      case FS_OPCODE_TXL:
      case FS_OPCODE_TXS:
         generate_tex(inst, dst, src[0]);
         break;
      case FS_OPCODE_DISCARD:
         generate_discard(inst);
         break;
      case FS_OPCODE_DDX:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY:
         generate_ddy(inst, dst, src[0]);
         break;

      case FS_OPCODE_SPILL:
         generate_spill(inst, src[0]);
         break;

      case FS_OPCODE_UNSPILL:
         generate_unspill(inst, dst);
         break;

      case FS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(inst, dst);
         break;

      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst);
         break;
      default:
         if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          brw_opcodes[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
         fail("unsupported opcode in FS\n");
      }

      /* Disassemble the native instructions this LIR instruction produced. */
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
            if (0) {
               printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                      ((uint32_t *)&p->store[i])[3],
                      ((uint32_t *)&p->store[i])[2],
                      ((uint32_t *)&p->store[i])[1],
                      ((uint32_t *)&p->store[i])[0]);
            }
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }

      last_native_inst = p->nr_insn;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   ralloc_free(loop_stack);
   ralloc_free(if_depth_in_loop);

   /* Resolve gen6+ IF/ELSE/ENDIF and WHILE jump targets. */
   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug. So this is here in
    * case you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = 0; i < p->nr_insn; i++) {
            printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                   ((uint32_t *)&p->store[i])[3],
                   ((uint32_t *)&p->store[i])[2],
                   ((uint32_t *)&p->store[i])[1],
                   ((uint32_t *)&p->store[i])[0]);
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }
   }
}