i965: Add a flag for instructions with normal writemasking disabled.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs_emit.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_emit.cpp
25 *
26 * This file supports emitting code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 extern "C" {
31 #include "main/macros.h"
32 #include "brw_context.h"
33 #include "brw_eu.h"
34 } /* extern "C" */
35
36 #include "brw_fs.h"
37 #include "brw_cfg.h"
38 #include "glsl/ir_print_visitor.h"
39
40 fs_generator::fs_generator(struct brw_context *brw,
41 struct brw_wm_compile *c,
42 struct gl_shader_program *prog,
43 struct gl_fragment_program *fp,
44 bool dual_source_output)
45
46 : brw(brw), c(c), prog(prog), fp(fp), dual_source_output(dual_source_output)
47 {
48 intel = &brw->intel;
49 ctx = &intel->ctx;
50
51 shader = prog ? prog->_LinkedShaders[MESA_SHADER_FRAGMENT] : NULL;
52
53 mem_ctx = c;
54
55 p = rzalloc(mem_ctx, struct brw_compile);
56 brw_init_compile(brw, p, mem_ctx);
57 }
58
59 fs_generator::~fs_generator()
60 {
61 }
62
63 void
64 fs_generator::generate_fb_write(fs_inst *inst)
65 {
66 bool eot = inst->eot;
67 struct brw_reg implied_header;
68 uint32_t msg_control;
69
70 /* The header is 2 regs; g0 and g1 are its contents.  g0 is handled by the
71 * implied move, so here we set up g1.
72 */
73 brw_push_insn_state(p);
74 brw_set_mask_control(p, BRW_MASK_DISABLE);
75 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
76
77 if (inst->header_present) {
78 if (intel->gen >= 6) {
79 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
80 brw_MOV(p,
81 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
82 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
83 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
84
85 if (inst->target > 0 &&
86 c->key.nr_color_regions > 1 &&
87 c->key.sample_alpha_to_coverage) {
88 /* Set "Source0 Alpha Present to RenderTarget" bit in message
89 * header.
90 */
91 brw_OR(p,
92 vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
93 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
94 brw_imm_ud(0x1 << 11));
95 }
96
97 if (inst->target > 0) {
98 /* Set the render target index for choosing BLEND_STATE. */
99 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
100 inst->base_mrf, 2),
101 BRW_REGISTER_TYPE_UD),
102 brw_imm_ud(inst->target));
103 }
104
105 implied_header = brw_null_reg();
106 } else {
107 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
108
109 brw_MOV(p,
110 brw_message_reg(inst->base_mrf + 1),
111 brw_vec8_grf(1, 0));
112 }
113 } else {
114 implied_header = brw_null_reg();
115 }
116
117 if (this->dual_source_output)
118 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
119 else if (dispatch_width == 16)
120 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
121 else
122 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
123
124 brw_pop_insn_state(p);
125
126 brw_fb_WRITE(p,
127 dispatch_width,
128 inst->base_mrf,
129 implied_header,
130 msg_control,
131 inst->target,
132 inst->mlen,
133 0,
134 eot,
135 inst->header_present);
136 }
137
138 /* Computes the integer pixel x,y values from the origin.
139 *
140 * This is the basis of gl_FragCoord computation, but is also used
141 * pre-gen6 for computing the deltas from v0 for computing
142 * interpolation.
143 */
144 void
145 fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
146 {
147 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
148 struct brw_reg src;
149 struct brw_reg deltas;
150
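   /* src replicates each subspan's X (or Y) coordinate from the g1 payload
    * across that subspan's four pixels.  deltas is a V-type immediate:
    * brw_imm_v() packs eight signed 4-bit values, so 0x10101010 adds the
    * per-pixel X offsets {0,1,0,1,...} and 0x11001100 adds the Y offsets
    * {0,0,1,1,...} within each subspan.
    */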
151 if (is_x) {
152 src = stride(suboffset(g1_uw, 4), 2, 4, 0);
153 deltas = brw_imm_v(0x10101010);
154 } else {
155 src = stride(suboffset(g1_uw, 5), 2, 4, 0);
156 deltas = brw_imm_v(0x11001100);
157 }
158
159 if (dispatch_width == 16) {
160 dst = vec16(dst);
161 }
162
163 /* We do this 8 or 16-wide, but since the destination is UW we
164 * don't do compression in the 16-wide case.
165 */
166 brw_push_insn_state(p);
167 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
168 brw_ADD(p, dst, src, deltas);
169 brw_pop_insn_state(p);
170 }
171
172 void
173 fs_generator::generate_linterp(fs_inst *inst,
174 struct brw_reg dst, struct brw_reg *src)
175 {
176 struct brw_reg delta_x = src[0];
177 struct brw_reg delta_y = src[1];
178 struct brw_reg interp = src[2];
179
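   /* PLN needs the two delta registers to be an adjacent pair (delta_y
    * immediately after delta_x), and before gen6 the pair must also start on
    * an even register number; otherwise fall back to LINE + MAC.
    */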
180 if (brw->has_pln &&
181 delta_y.nr == delta_x.nr + 1 &&
182 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
183 brw_PLN(p, dst, interp, delta_x);
184 } else {
185 brw_LINE(p, brw_null_reg(), interp, delta_x);
186 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
187 }
188 }
189
190 void
191 fs_generator::generate_math1_gen7(fs_inst *inst,
192 struct brw_reg dst,
193 struct brw_reg src0)
194 {
195 assert(inst->mlen == 0);
196 brw_math(p, dst,
197 brw_math_function(inst->opcode),
198 0, src0,
199 BRW_MATH_DATA_VECTOR,
200 BRW_MATH_PRECISION_FULL);
201 }
202
203 void
204 fs_generator::generate_math2_gen7(fs_inst *inst,
205 struct brw_reg dst,
206 struct brw_reg src0,
207 struct brw_reg src1)
208 {
209 assert(inst->mlen == 0);
210 brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
211 }
212
213 void
214 fs_generator::generate_math1_gen6(fs_inst *inst,
215 struct brw_reg dst,
216 struct brw_reg src0)
217 {
218 int op = brw_math_function(inst->opcode);
219
220 assert(inst->mlen == 0);
221
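   /* The math instruction isn't issued compressed on gen6; a 16-wide
    * operation is split into two 8-wide halves below.
    */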
222 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
223 brw_math(p, dst,
224 op,
225 0, src0,
226 BRW_MATH_DATA_VECTOR,
227 BRW_MATH_PRECISION_FULL);
228
229 if (dispatch_width == 16) {
230 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
231 brw_math(p, sechalf(dst),
232 op,
233 0, sechalf(src0),
234 BRW_MATH_DATA_VECTOR,
235 BRW_MATH_PRECISION_FULL);
236 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
237 }
238 }
239
240 void
241 fs_generator::generate_math2_gen6(fs_inst *inst,
242 struct brw_reg dst,
243 struct brw_reg src0,
244 struct brw_reg src1)
245 {
246 int op = brw_math_function(inst->opcode);
247
248 assert(inst->mlen == 0);
249
250 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
251 brw_math2(p, dst, op, src0, src1);
252
253 if (dispatch_width == 16) {
254 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
255 brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
256 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
257 }
258 }
259
260 void
261 fs_generator::generate_math_gen4(fs_inst *inst,
262 struct brw_reg dst,
263 struct brw_reg src)
264 {
265 int op = brw_math_function(inst->opcode);
266
267 assert(inst->mlen >= 1);
268
269 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
270 brw_math(p, dst,
271 op,
272 inst->base_mrf, src,
273 BRW_MATH_DATA_VECTOR,
274 BRW_MATH_PRECISION_FULL);
275
276 if (dispatch_width == 16) {
277 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
278 brw_math(p, sechalf(dst),
279 op,
280 inst->base_mrf + 1, sechalf(src),
281 BRW_MATH_DATA_VECTOR,
282 BRW_MATH_PRECISION_FULL);
283
284 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
285 }
286 }
287
288 void
289 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
290 {
291 int msg_type = -1;
292 int rlen = 4;
293 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
294 uint32_t return_format;
295
296 switch (dst.type) {
297 case BRW_REGISTER_TYPE_D:
298 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
299 break;
300 case BRW_REGISTER_TYPE_UD:
301 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
302 break;
303 default:
304 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
305 break;
306 }
307
308 if (dispatch_width == 16)
309 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
310
311 if (intel->gen >= 5) {
312 switch (inst->opcode) {
313 case SHADER_OPCODE_TEX:
314 if (inst->shadow_compare) {
315 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
316 } else {
317 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
318 }
319 break;
320 case FS_OPCODE_TXB:
321 if (inst->shadow_compare) {
322 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
323 } else {
324 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
325 }
326 break;
327 case SHADER_OPCODE_TXL:
328 if (inst->shadow_compare) {
329 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
330 } else {
331 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
332 }
333 break;
334 case SHADER_OPCODE_TXS:
335 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
336 break;
337 case SHADER_OPCODE_TXD:
338 /* There is no sample_d_c message; comparisons are done manually */
339 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
340 break;
341 case SHADER_OPCODE_TXF:
342 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
343 break;
344 default:
345 assert(!"not reached");
346 break;
347 }
348 } else {
349 switch (inst->opcode) {
350 case SHADER_OPCODE_TEX:
351 /* Note that G45 and older determine shadow compare and dispatch width
352 * from the message length for most messages.
353 */
354 assert(dispatch_width == 8);
355 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
356 if (inst->shadow_compare) {
357 assert(inst->mlen == 6);
358 } else {
359 assert(inst->mlen <= 4);
360 }
361 break;
362 case FS_OPCODE_TXB:
363 if (inst->shadow_compare) {
364 assert(inst->mlen == 6);
365 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
366 } else {
367 assert(inst->mlen == 9);
368 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
369 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
370 }
371 break;
372 case SHADER_OPCODE_TXL:
373 if (inst->shadow_compare) {
374 assert(inst->mlen == 6);
375 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
376 } else {
377 assert(inst->mlen == 9);
378 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
379 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
380 }
381 break;
382 case SHADER_OPCODE_TXD:
383 /* There is no sample_d_c message; comparisons are done manually */
384 assert(inst->mlen == 7 || inst->mlen == 10);
385 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
386 break;
387 case SHADER_OPCODE_TXF:
388 assert(inst->mlen == 9);
389 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
390 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
391 break;
392 case SHADER_OPCODE_TXS:
393 assert(inst->mlen == 3);
394 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
395 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
396 break;
397 default:
398 assert(!"not reached");
399 break;
400 }
401 }
402 assert(msg_type != -1);
403
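   /* A SIMD16 sampler message returns two registers per channel, so expect
    * eight response registers for the four RGBA channels.
    */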
404 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
405 rlen = 8;
406 dst = vec16(dst);
407 }
408
409 /* Load the message header if present. If there's a texture offset,
410 * we need to set it up explicitly and load the offset bitfield.
411 * Otherwise, we can use an implied move from g0 to the first message reg.
412 */
413 if (inst->texture_offset) {
414 brw_push_insn_state(p);
415 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
416 /* Explicitly set up the message header by copying g0 to the MRF. */
417 brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
418 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
419
420 /* Then set the offset bits in DWord 2. */
421 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
422 inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
423 brw_imm_ud(inst->texture_offset));
424 brw_pop_insn_state(p);
425 } else if (inst->header_present) {
426 /* Set up an implied move from g0 to the MRF. */
427 src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
428 }
429
430 brw_SAMPLE(p,
431 retype(dst, BRW_REGISTER_TYPE_UW),
432 inst->base_mrf,
433 src,
434 SURF_INDEX_TEXTURE(inst->sampler),
435 inst->sampler,
436 WRITEMASK_XYZW,
437 msg_type,
438 rlen,
439 inst->mlen,
440 inst->header_present,
441 simd_mode,
442 return_format);
443 }
444
445
446 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
447 * looking like:
448 *
449 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
450 *
451 * and we're trying to produce:
452 *
453 * DDX DDY
454 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
455 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
456 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
457 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
458 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
459 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
460 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
461 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
462 *
463 * and add another set of two more subspans if in 16-pixel dispatch mode.
464 *
465 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
466 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
467 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
468 * between each other. We could probably do it like ddx and swizzle the right
469 * order later, but bail for now and just produce
470 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
471 */
472 void
473 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
474 {
475 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
476 BRW_REGISTER_TYPE_F,
477 BRW_VERTICAL_STRIDE_2,
478 BRW_WIDTH_2,
479 BRW_HORIZONTAL_STRIDE_0,
480 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
481 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
482 BRW_REGISTER_TYPE_F,
483 BRW_VERTICAL_STRIDE_2,
484 BRW_WIDTH_2,
485 BRW_HORIZONTAL_STRIDE_0,
486 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
487 brw_ADD(p, dst, src0, negate(src1));
488 }
489
490 /* The negate_value boolean is used to negate the derivative computation for
491 * FBOs, since they place the origin at the upper left instead of the lower
492 * left.
493 */
494 void
495 fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
496 bool negate_value)
497 {
498 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
499 BRW_REGISTER_TYPE_F,
500 BRW_VERTICAL_STRIDE_4,
501 BRW_WIDTH_4,
502 BRW_HORIZONTAL_STRIDE_0,
503 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
504 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
505 BRW_REGISTER_TYPE_F,
506 BRW_VERTICAL_STRIDE_4,
507 BRW_WIDTH_4,
508 BRW_HORIZONTAL_STRIDE_0,
509 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
510 if (negate_value)
511 brw_ADD(p, dst, src1, negate(src0));
512 else
513 brw_ADD(p, dst, src0, negate(src1));
514 }
515
516 void
517 fs_generator::generate_discard(fs_inst *inst)
518 {
519 struct brw_reg f0 = brw_flag_reg();
520
521 if (intel->gen >= 6) {
522 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
523 struct brw_reg some_register;
524
525 /* As of gen6, we no longer have the mask register to look at,
526 * so life gets a bit more complicated.
527 */
528
529 /* Load the flag register with all ones. */
530 brw_push_insn_state(p);
531 brw_set_mask_control(p, BRW_MASK_DISABLE);
532 brw_MOV(p, f0, brw_imm_uw(0xffff));
533 brw_pop_insn_state(p);
534
535 /* Do a comparison that should always fail, to produce 0s in the flag
536 * reg where we have active channels.
537 */
538 some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
539 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
540 BRW_CONDITIONAL_NZ, some_register, some_register);
541
542 /* Undo CMP's whacking of predication. */
543 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
544
545 brw_push_insn_state(p);
546 brw_set_mask_control(p, BRW_MASK_DISABLE);
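      /* AND the flag result into the dispatched pixel mask in g1 (R1.7:UW),
       * clearing the bits for the channels this discard executed on so that
       * later FB writes skip those pixels.
       */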
547 brw_AND(p, g1, f0, g1);
548 brw_pop_insn_state(p);
549 } else {
550 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
551
552 brw_push_insn_state(p);
553 brw_set_mask_control(p, BRW_MASK_DISABLE);
554 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
555
556 /* Unlike the 965, we have the mask reg, so we just need
557 * somewhere to invert that (containing channels to be disabled)
558 * so it can be ANDed with the mask of pixels still to be
559 * written. Use the flag reg for consistency with gen6+.
560 */
561 brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
562 brw_AND(p, g0, f0, g0);
563
564 brw_pop_insn_state(p);
565 }
566 }
567
568 void
569 fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
570 {
571 assert(inst->mlen != 0);
572
573 brw_MOV(p,
574 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
575 retype(src, BRW_REGISTER_TYPE_UD));
576 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
577 inst->offset);
578 }
579
580 void
581 fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst)
582 {
583 assert(inst->mlen != 0);
584
585 /* Clear any post destination dependencies that would be ignored by
586 * the block read. See the B-Spec for pre-gen5 send instruction.
587 *
588 * This could use a better solution, since texture sampling and
589 * math reads could potentially run into it as well -- anywhere
590 * that we have a SEND with a destination that is a register that
591 * was written but not read within the last N instructions (what's
592 * N? unsure). This is rare because of dead code elimination, but
593 * not impossible.
594 */
595 if (intel->gen == 4 && !intel->is_g4x)
596 brw_MOV(p, brw_null_reg(), dst);
597
598 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
599 inst->offset);
600
601 if (intel->gen == 4 && !intel->is_g4x) {
602 /* gen4 errata: destination from a send can't be used as a
603 * destination until it's been read. Just read it so we don't
604 * have to worry.
605 */
606 brw_MOV(p, brw_null_reg(), dst);
607 }
608 }
609
610 void
611 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
612 struct brw_reg dst,
613 struct brw_reg index,
614 struct brw_reg offset)
615 {
616 assert(inst->mlen != 0);
617
618 /* Clear any post destination dependencies that would be ignored by
619 * the block read. See the B-Spec for pre-gen5 send instruction.
620 *
621 * This could use a better solution, since texture sampling and
622 * math reads could potentially run into it as well -- anywhere
623 * that we have a SEND with a destination that is a register that
624 * was written but not read within the last N instructions (what's
625 * N? unsure). This is rare because of dead code elimination, but
626 * not impossible.
627 */
628 if (intel->gen == 4 && !intel->is_g4x)
629 brw_MOV(p, brw_null_reg(), dst);
630
631 assert(index.file == BRW_IMMEDIATE_VALUE &&
632 index.type == BRW_REGISTER_TYPE_UD);
633 uint32_t surf_index = index.dw1.ud;
634
635 assert(offset.file == BRW_IMMEDIATE_VALUE &&
636 offset.type == BRW_REGISTER_TYPE_UD);
637 uint32_t read_offset = offset.dw1.ud;
638
639 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
640 read_offset, surf_index);
641
642 if (intel->gen == 4 && !intel->is_g4x) {
643 /* gen4 errata: destination from a send can't be used as a
644 * destination until it's been read. Just read it so we don't
645 * have to worry.
646 */
647 brw_MOV(p, brw_null_reg(), dst);
648 }
649 }
650
651 void
652 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
653 struct brw_reg dst,
654 struct brw_reg index)
655 {
656 assert(intel->gen < 7); /* Should use the gen7 variant. */
657 assert(inst->header_present);
658
659 assert(index.file == BRW_IMMEDIATE_VALUE &&
660 index.type == BRW_REGISTER_TYPE_UD);
661 uint32_t surf_index = index.dw1.ud;
662
663 uint32_t msg_type, msg_control, rlen;
664 if (intel->gen >= 6)
665 msg_type = GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ;
666 else if (intel->gen == 5 || intel->is_g4x)
667 msg_type = G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ;
668 else
669 msg_type = BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ;
670
671 if (dispatch_width == 16) {
672 msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS;
673 rlen = 2;
674 } else {
675 msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS;
676 rlen = 1;
677 }
678
679 struct brw_reg header = brw_vec8_grf(0, 0);
680 gen6_resolve_implied_move(p, &header, inst->base_mrf);
681
682 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
683 brw_set_dest(p, send, dst);
684 brw_set_src0(p, send, header);
685 if (intel->gen < 6)
686 send->header.destreg__conditionalmod = inst->base_mrf;
687 brw_set_dp_read_message(p, send,
688 surf_index,
689 msg_control,
690 msg_type,
691 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
692 inst->mlen,
693 inst->header_present,
694 rlen);
695 }
696
697 void
698 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
699 struct brw_reg dst,
700 struct brw_reg index,
701 struct brw_reg offset)
702 {
703 assert(intel->gen >= 7);
704 /* Varying-offset pull constant loads are treated as a normal expression on
705 * gen7, so the fact that it's a send message is hidden at the IR level.
706 */
707 assert(!inst->header_present);
708 assert(!inst->mlen);
709
710 assert(index.file == BRW_IMMEDIATE_VALUE &&
711 index.type == BRW_REGISTER_TYPE_UD);
712 uint32_t surf_index = index.dw1.ud;
713
714 uint32_t msg_control, rlen, mlen;
715 if (dispatch_width == 16) {
716 msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS;
717 mlen = rlen = 2;
718 } else {
719 msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS;
720 mlen = rlen = 1;
721 }
722
723 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
724 brw_set_dest(p, send, dst);
725 brw_set_src0(p, send, offset);
726 if (intel->gen < 6)
727 send->header.destreg__conditionalmod = inst->base_mrf;
728 brw_set_dp_read_message(p, send,
729 surf_index,
730 msg_control,
731 GEN7_DATAPORT_DC_DWORD_SCATTERED_READ,
732 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
733 mlen,
734 inst->header_present,
735 rlen);
736 }
737
738 /**
739 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
740 * into the flags register (f0.0).
741 *
742 * Used only on Gen6 and above.
743 */
744 void
745 fs_generator::generate_mov_dispatch_to_flags()
746 {
747 struct brw_reg f0 = brw_flag_reg();
748 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
749
750 assert (intel->gen >= 6);
751 brw_push_insn_state(p);
752 brw_set_mask_control(p, BRW_MASK_DISABLE);
753 brw_MOV(p, f0, g1);
754 brw_pop_insn_state(p);
755 }
756
757
758 static uint32_t brw_file_from_reg(fs_reg *reg)
759 {
760 switch (reg->file) {
761 case ARF:
762 return BRW_ARCHITECTURE_REGISTER_FILE;
763 case GRF:
764 return BRW_GENERAL_REGISTER_FILE;
765 case MRF:
766 return BRW_MESSAGE_REGISTER_FILE;
767 case IMM:
768 return BRW_IMMEDIATE_VALUE;
769 default:
770 assert(!"not reached");
771 return BRW_GENERAL_REGISTER_FILE;
772 }
773 }
774
775 static struct brw_reg
776 brw_reg_from_fs_reg(fs_reg *reg)
777 {
778 struct brw_reg brw_reg;
779
780 switch (reg->file) {
781 case GRF:
782 case ARF:
783 case MRF:
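      /* smear == -1 means the whole register; otherwise pick out the single
       * component 'smear' as a scalar (vec1) region.
       */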
784 if (reg->smear == -1) {
785 brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
786 } else {
787 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
788 }
789 brw_reg = retype(brw_reg, reg->type);
790 if (reg->sechalf)
791 brw_reg = sechalf(brw_reg);
792 break;
793 case IMM:
794 switch (reg->type) {
795 case BRW_REGISTER_TYPE_F:
796 brw_reg = brw_imm_f(reg->imm.f);
797 break;
798 case BRW_REGISTER_TYPE_D:
799 brw_reg = brw_imm_d(reg->imm.i);
800 break;
801 case BRW_REGISTER_TYPE_UD:
802 brw_reg = brw_imm_ud(reg->imm.u);
803 break;
804 default:
805 assert(!"not reached");
806 brw_reg = brw_null_reg();
807 break;
808 }
809 break;
810 case FIXED_HW_REG:
811 brw_reg = reg->fixed_hw_reg;
812 break;
813 case BAD_FILE:
814 /* Probably unused. */
815 brw_reg = brw_null_reg();
816 break;
817 case UNIFORM:
818 assert(!"not reached");
819 brw_reg = brw_null_reg();
820 break;
821 default:
822 assert(!"not reached");
823 brw_reg = brw_null_reg();
824 break;
825 }
826 if (reg->abs)
827 brw_reg = brw_abs(brw_reg);
828 if (reg->negate)
829 brw_reg = negate(brw_reg);
830
831 return brw_reg;
832 }
833
834 void
835 fs_generator::generate_code(exec_list *instructions)
836 {
837 int last_native_insn_offset = p->next_insn_offset;
838 const char *last_annotation_string = NULL;
839 const void *last_annotation_ir = NULL;
840
841 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
842 if (shader) {
843 printf("Native code for fragment shader %d (%d-wide dispatch):\n",
844 prog->Name, dispatch_width);
845 } else {
846 printf("Native code for fragment program %d (%d-wide dispatch):\n",
847 fp->Base.Id, dispatch_width);
848 }
849 }
850
851 cfg_t *cfg = NULL;
852 if (unlikely(INTEL_DEBUG & DEBUG_WM))
853 cfg = new(mem_ctx) cfg_t(mem_ctx, instructions);
854
855 foreach_list(node, instructions) {
856 fs_inst *inst = (fs_inst *)node;
857 struct brw_reg src[3], dst;
858
859 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
860 foreach_list(node, &cfg->block_list) {
861 bblock_link *link = (bblock_link *)node;
862 bblock_t *block = link->block;
863
864 if (block->start == inst) {
865 printf(" START B%d", block->block_num);
866 foreach_list(predecessor_node, &block->parents) {
867 bblock_link *predecessor_link =
868 (bblock_link *)predecessor_node;
869 bblock_t *predecessor_block = predecessor_link->block;
870 printf(" <-B%d", predecessor_block->block_num);
871 }
872 printf("\n");
873 }
874 }
875
876 if (last_annotation_ir != inst->ir) {
877 last_annotation_ir = inst->ir;
878 if (last_annotation_ir) {
879 printf(" ");
880 if (shader)
881 ((ir_instruction *)inst->ir)->print();
882 else {
883 const prog_instruction *fpi;
884 fpi = (const prog_instruction *)inst->ir;
885 printf("%d: ", (int)(fpi - fp->Base.Instructions));
886 _mesa_fprint_instruction_opt(stdout,
887 fpi,
888 0, PROG_PRINT_DEBUG, NULL);
889 }
890 printf("\n");
891 }
892 }
893 if (last_annotation_string != inst->annotation) {
894 last_annotation_string = inst->annotation;
895 if (last_annotation_string)
896 printf(" %s\n", last_annotation_string);
897 }
898 }
899
900 for (unsigned int i = 0; i < 3; i++) {
901 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
902
903 /* The accumulator result appears to get used for the
904 * conditional modifier generation. When negating a UD
905 * value, there is a 33rd bit generated for the sign in the
906 * accumulator value, so now you can't check, for example,
907 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
908 */
909 assert(!inst->conditional_mod ||
910 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
911 !inst->src[i].negate);
912 }
913 dst = brw_reg_from_fs_reg(&inst->dst);
914
915 brw_set_conditionalmod(p, inst->conditional_mod);
916 brw_set_predicate_control(p, inst->predicate);
917 brw_set_predicate_inverse(p, inst->predicate_inverse);
918 brw_set_saturate(p, inst->saturate);
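      /* force_writemask_all maps directly onto the mask control field:
       * false is BRW_MASK_ENABLE (0) and true is BRW_MASK_DISABLE (1),
       * which turns off normal execution masking/writemasking for this
       * instruction.
       */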
919 brw_set_mask_control(p, inst->force_writemask_all);
920
921 if (inst->force_uncompressed || dispatch_width == 8) {
922 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
923 } else if (inst->force_sechalf) {
924 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
925 } else {
926 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
927 }
928
929 switch (inst->opcode) {
930 case BRW_OPCODE_MOV:
931 brw_MOV(p, dst, src[0]);
932 break;
933 case BRW_OPCODE_ADD:
934 brw_ADD(p, dst, src[0], src[1]);
935 break;
936 case BRW_OPCODE_MUL:
937 brw_MUL(p, dst, src[0], src[1]);
938 break;
939 case BRW_OPCODE_MACH:
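         /* Enable AccWrCtrl just for the MACH so its result also updates the
          * accumulator, then restore the default afterwards.
          */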
940 brw_set_acc_write_control(p, 1);
941 brw_MACH(p, dst, src[0], src[1]);
942 brw_set_acc_write_control(p, 0);
943 break;
944
945 case BRW_OPCODE_MAD:
946 brw_set_access_mode(p, BRW_ALIGN_16);
947 if (dispatch_width == 16) {
948 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
949 brw_MAD(p, dst, src[0], src[1], src[2]);
950 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
951 brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
952 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
953 } else {
954 brw_MAD(p, dst, src[0], src[1], src[2]);
955 }
956 brw_set_access_mode(p, BRW_ALIGN_1);
957 break;
958
959 case BRW_OPCODE_FRC:
960 brw_FRC(p, dst, src[0]);
961 break;
962 case BRW_OPCODE_RNDD:
963 brw_RNDD(p, dst, src[0]);
964 break;
965 case BRW_OPCODE_RNDE:
966 brw_RNDE(p, dst, src[0]);
967 break;
968 case BRW_OPCODE_RNDZ:
969 brw_RNDZ(p, dst, src[0]);
970 break;
971
972 case BRW_OPCODE_AND:
973 brw_AND(p, dst, src[0], src[1]);
974 break;
975 case BRW_OPCODE_OR:
976 brw_OR(p, dst, src[0], src[1]);
977 break;
978 case BRW_OPCODE_XOR:
979 brw_XOR(p, dst, src[0], src[1]);
980 break;
981 case BRW_OPCODE_NOT:
982 brw_NOT(p, dst, src[0]);
983 break;
984 case BRW_OPCODE_ASR:
985 brw_ASR(p, dst, src[0], src[1]);
986 break;
987 case BRW_OPCODE_SHR:
988 brw_SHR(p, dst, src[0], src[1]);
989 break;
990 case BRW_OPCODE_SHL:
991 brw_SHL(p, dst, src[0], src[1]);
992 break;
993
994 case BRW_OPCODE_CMP:
995 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
996 break;
997 case BRW_OPCODE_SEL:
998 brw_SEL(p, dst, src[0], src[1]);
999 break;
1000
1001 case BRW_OPCODE_IF:
1002 if (inst->src[0].file != BAD_FILE) {
1003 /* The instruction has an embedded compare (only allowed on gen6) */
1004 assert(intel->gen == 6);
1005 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1006 } else {
1007 brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1008 }
1009 break;
1010
1011 case BRW_OPCODE_ELSE:
1012 brw_ELSE(p);
1013 break;
1014 case BRW_OPCODE_ENDIF:
1015 brw_ENDIF(p);
1016 break;
1017
1018 case BRW_OPCODE_DO:
1019 brw_DO(p, BRW_EXECUTE_8);
1020 break;
1021
1022 case BRW_OPCODE_BREAK:
1023 brw_BREAK(p);
1024 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1025 break;
1026 case BRW_OPCODE_CONTINUE:
1027 /* FINISHME: We need to write the loop instruction support still. */
1028 if (intel->gen >= 6)
1029 gen6_CONT(p);
1030 else
1031 brw_CONT(p);
1032 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1033 break;
1034
1035 case BRW_OPCODE_WHILE:
1036 brw_WHILE(p);
1037 break;
1038
1039 case SHADER_OPCODE_RCP:
1040 case SHADER_OPCODE_RSQ:
1041 case SHADER_OPCODE_SQRT:
1042 case SHADER_OPCODE_EXP2:
1043 case SHADER_OPCODE_LOG2:
1044 case SHADER_OPCODE_SIN:
1045 case SHADER_OPCODE_COS:
1046 if (intel->gen >= 7) {
1047 generate_math1_gen7(inst, dst, src[0]);
1048 } else if (intel->gen == 6) {
1049 generate_math1_gen6(inst, dst, src[0]);
1050 } else {
1051 generate_math_gen4(inst, dst, src[0]);
1052 }
1053 break;
1054 case SHADER_OPCODE_INT_QUOTIENT:
1055 case SHADER_OPCODE_INT_REMAINDER:
1056 case SHADER_OPCODE_POW:
1057 if (intel->gen >= 7) {
1058 generate_math2_gen7(inst, dst, src[0], src[1]);
1059 } else if (intel->gen == 6) {
1060 generate_math2_gen6(inst, dst, src[0], src[1]);
1061 } else {
1062 generate_math_gen4(inst, dst, src[0]);
1063 }
1064 break;
1065 case FS_OPCODE_PIXEL_X:
1066 generate_pixel_xy(dst, true);
1067 break;
1068 case FS_OPCODE_PIXEL_Y:
1069 generate_pixel_xy(dst, false);
1070 break;
1071 case FS_OPCODE_CINTERP:
1072 brw_MOV(p, dst, src[0]);
1073 break;
1074 case FS_OPCODE_LINTERP:
1075 generate_linterp(inst, dst, src);
1076 break;
1077 case SHADER_OPCODE_TEX:
1078 case FS_OPCODE_TXB:
1079 case SHADER_OPCODE_TXD:
1080 case SHADER_OPCODE_TXF:
1081 case SHADER_OPCODE_TXL:
1082 case SHADER_OPCODE_TXS:
1083 generate_tex(inst, dst, src[0]);
1084 break;
1085 case FS_OPCODE_DISCARD:
1086 generate_discard(inst);
1087 break;
1088 case FS_OPCODE_DDX:
1089 generate_ddx(inst, dst, src[0]);
1090 break;
1091 case FS_OPCODE_DDY:
1092 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1093 * guarantee that c->key.render_to_fbo is set).
1094 */
1095 assert(fp->UsesDFdy);
1096 generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
1097 break;
1098
1099 case FS_OPCODE_SPILL:
1100 generate_spill(inst, src[0]);
1101 break;
1102
1103 case FS_OPCODE_UNSPILL:
1104 generate_unspill(inst, dst);
1105 break;
1106
1107 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1108 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1109 break;
1110
1111 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1112 generate_varying_pull_constant_load(inst, dst, src[0]);
1113 break;
1114
1115 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1116 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1117 break;
1118
1119 case FS_OPCODE_FB_WRITE:
1120 generate_fb_write(inst);
1121 break;
1122
1123 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1124 generate_mov_dispatch_to_flags();
1125 break;
1126
1127 default:
1128 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
1129 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1130 opcode_descs[inst->opcode].name);
1131 } else {
1132 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
1133 }
1134 abort();
1135 }
1136
1137 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1138 brw_dump_compile(p, stdout,
1139 last_native_insn_offset, p->next_insn_offset);
1140
1141 foreach_list(node, &cfg->block_list) {
1142 bblock_link *link = (bblock_link *)node;
1143 bblock_t *block = link->block;
1144
1145 if (block->end == inst) {
1146 printf(" END B%d", block->block_num);
1147 foreach_list(successor_node, &block->children) {
1148 bblock_link *successor_link =
1149 (bblock_link *)successor_node;
1150 bblock_t *successor_block = successor_link->block;
1151 printf(" ->B%d", successor_block->block_num);
1152 }
1153 printf("\n");
1154 }
1155 }
1156 }
1157
1158 last_native_insn_offset = p->next_insn_offset;
1159 }
1160
1161 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1162 printf("\n");
1163 }
1164
1165 brw_set_uip_jip(p);
1166
1167 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
1168 * emit issues, it doesn't get the jump distances into the output,
1169 * which is often something we want to debug. So this is here in
1170 * case you're doing that.
1171 */
1172 if (0) {
1173 brw_dump_compile(p, stdout, 0, p->next_insn_offset);
1174 }
1175 }
1176
1177 const unsigned *
1178 fs_generator::generate_assembly(exec_list *simd8_instructions,
1179 exec_list *simd16_instructions,
1180 unsigned *assembly_size)
1181 {
1182 dispatch_width = 8;
1183 generate_code(simd8_instructions);
1184
1185 if (simd16_instructions) {
1186 /* We have to do a compaction pass now, or the one at the end of
1187 * execution will squash down where our prog_offset start needs
1188 * to be.
1189 */
1190 brw_compact_instructions(p);
1191
1192 /* align to 64 byte boundary. */
1193 while ((p->nr_insn * sizeof(struct brw_instruction)) % 64) {
1194 brw_NOP(p);
1195 }
1196
1197 /* Save off the start of this 16-wide program */
1198 c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction);
1199
1200 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1201
1202 dispatch_width = 16;
1203 generate_code(simd16_instructions);
1204 }
1205
1206 return brw_get_program(p, assembly_size);
1207 }