2 * Copyright © 2010 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_fs_emit.cpp
26 * This file supports emitting code from the FS LIR to the actual
27 * native instructions.
31 #include "main/macros.h"
32 #include "brw_context.h"
38 #include "glsl/ir_print_visitor.h"
40 fs_generator::fs_generator(struct brw_context
*brw
,
41 struct brw_wm_compile
*c
,
42 struct gl_shader_program
*prog
,
43 struct gl_fragment_program
*fp
,
44 bool dual_source_output
)
46 : brw(brw
), c(c
), prog(prog
), fp(fp
), dual_source_output(dual_source_output
)
51 shader
= prog
? prog
->_LinkedShaders
[MESA_SHADER_FRAGMENT
] : NULL
;
55 p
= rzalloc(mem_ctx
, struct brw_compile
);
56 brw_init_compile(brw
, p
, mem_ctx
);
59 fs_generator::~fs_generator()
64 fs_generator::generate_fb_write(fs_inst
*inst
)
67 struct brw_reg implied_header
;
70 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
73 brw_push_insn_state(p
);
74 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
75 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
77 if (inst
->header_present
) {
78 if (intel
->gen
>= 6) {
79 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
81 retype(brw_message_reg(inst
->base_mrf
), BRW_REGISTER_TYPE_UD
),
82 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
83 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
85 if (inst
->target
> 0 &&
86 c
->key
.nr_color_regions
> 1 &&
87 c
->key
.sample_alpha_to_coverage
) {
88 /* Set "Source0 Alpha Present to RenderTarget" bit in message
92 vec1(retype(brw_message_reg(inst
->base_mrf
), BRW_REGISTER_TYPE_UD
)),
93 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
94 brw_imm_ud(0x1 << 11));
97 if (inst
->target
> 0) {
98 /* Set the render target index for choosing BLEND_STATE. */
99 brw_MOV(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
101 BRW_REGISTER_TYPE_UD
),
102 brw_imm_ud(inst
->target
));
105 implied_header
= brw_null_reg();
107 implied_header
= retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
);
110 brw_message_reg(inst
->base_mrf
+ 1),
114 implied_header
= brw_null_reg();
117 if (this->dual_source_output
)
118 msg_control
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
119 else if (dispatch_width
== 16)
120 msg_control
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
122 msg_control
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
124 brw_pop_insn_state(p
);
135 inst
->header_present
);
138 /* Computes the integer pixel x,y values from the origin.
140 * This is the basis of gl_FragCoord computation, but is also used
141 * pre-gen6 for computing the deltas from v0 for computing
145 fs_generator::generate_pixel_xy(struct brw_reg dst
, bool is_x
)
147 struct brw_reg g1_uw
= retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW
);
149 struct brw_reg deltas
;
152 src
= stride(suboffset(g1_uw
, 4), 2, 4, 0);
153 deltas
= brw_imm_v(0x10101010);
155 src
= stride(suboffset(g1_uw
, 5), 2, 4, 0);
156 deltas
= brw_imm_v(0x11001100);
159 if (dispatch_width
== 16) {
163 /* We do this 8 or 16-wide, but since the destination is UW we
164 * don't do compression in the 16-wide case.
166 brw_push_insn_state(p
);
167 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
168 brw_ADD(p
, dst
, src
, deltas
);
169 brw_pop_insn_state(p
);
173 fs_generator::generate_linterp(fs_inst
*inst
,
174 struct brw_reg dst
, struct brw_reg
*src
)
176 struct brw_reg delta_x
= src
[0];
177 struct brw_reg delta_y
= src
[1];
178 struct brw_reg interp
= src
[2];
181 delta_y
.nr
== delta_x
.nr
+ 1 &&
182 (intel
->gen
>= 6 || (delta_x
.nr
& 1) == 0)) {
183 brw_PLN(p
, dst
, interp
, delta_x
);
185 brw_LINE(p
, brw_null_reg(), interp
, delta_x
);
186 brw_MAC(p
, dst
, suboffset(interp
, 1), delta_y
);
191 fs_generator::generate_math1_gen7(fs_inst
*inst
,
195 assert(inst
->mlen
== 0);
197 brw_math_function(inst
->opcode
),
199 BRW_MATH_DATA_VECTOR
,
200 BRW_MATH_PRECISION_FULL
);
204 fs_generator::generate_math2_gen7(fs_inst
*inst
,
209 assert(inst
->mlen
== 0);
210 brw_math2(p
, dst
, brw_math_function(inst
->opcode
), src0
, src1
);
214 fs_generator::generate_math1_gen6(fs_inst
*inst
,
218 int op
= brw_math_function(inst
->opcode
);
220 assert(inst
->mlen
== 0);
222 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
226 BRW_MATH_DATA_VECTOR
,
227 BRW_MATH_PRECISION_FULL
);
229 if (dispatch_width
== 16) {
230 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
231 brw_math(p
, sechalf(dst
),
234 BRW_MATH_DATA_VECTOR
,
235 BRW_MATH_PRECISION_FULL
);
236 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
241 fs_generator::generate_math2_gen6(fs_inst
*inst
,
246 int op
= brw_math_function(inst
->opcode
);
248 assert(inst
->mlen
== 0);
250 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
251 brw_math2(p
, dst
, op
, src0
, src1
);
253 if (dispatch_width
== 16) {
254 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
255 brw_math2(p
, sechalf(dst
), op
, sechalf(src0
), sechalf(src1
));
256 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
261 fs_generator::generate_math_gen4(fs_inst
*inst
,
265 int op
= brw_math_function(inst
->opcode
);
267 assert(inst
->mlen
>= 1);
269 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
273 BRW_MATH_DATA_VECTOR
,
274 BRW_MATH_PRECISION_FULL
);
276 if (dispatch_width
== 16) {
277 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
278 brw_math(p
, sechalf(dst
),
280 inst
->base_mrf
+ 1, sechalf(src
),
281 BRW_MATH_DATA_VECTOR
,
282 BRW_MATH_PRECISION_FULL
);
284 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
289 fs_generator::generate_tex(fs_inst
*inst
, struct brw_reg dst
, struct brw_reg src
)
293 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
294 uint32_t return_format
;
297 case BRW_REGISTER_TYPE_D
:
298 return_format
= BRW_SAMPLER_RETURN_FORMAT_SINT32
;
300 case BRW_REGISTER_TYPE_UD
:
301 return_format
= BRW_SAMPLER_RETURN_FORMAT_UINT32
;
304 return_format
= BRW_SAMPLER_RETURN_FORMAT_FLOAT32
;
308 if (dispatch_width
== 16)
309 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
311 if (intel
->gen
>= 5) {
312 switch (inst
->opcode
) {
313 case SHADER_OPCODE_TEX
:
314 if (inst
->shadow_compare
) {
315 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
317 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
321 if (inst
->shadow_compare
) {
322 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
324 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
327 case SHADER_OPCODE_TXL
:
328 if (inst
->shadow_compare
) {
329 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
331 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
334 case SHADER_OPCODE_TXS
:
335 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
337 case SHADER_OPCODE_TXD
:
338 /* There is no sample_d_c message; comparisons are done manually */
339 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
341 case SHADER_OPCODE_TXF
:
342 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
345 assert(!"not reached");
349 switch (inst
->opcode
) {
350 case SHADER_OPCODE_TEX
:
351 /* Note that G45 and older determines shadow compare and dispatch width
352 * from message length for most messages.
354 assert(dispatch_width
== 8);
355 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE
;
356 if (inst
->shadow_compare
) {
357 assert(inst
->mlen
== 6);
359 assert(inst
->mlen
<= 4);
363 if (inst
->shadow_compare
) {
364 assert(inst
->mlen
== 6);
365 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE
;
367 assert(inst
->mlen
== 9);
368 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS
;
369 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
372 case SHADER_OPCODE_TXL
:
373 if (inst
->shadow_compare
) {
374 assert(inst
->mlen
== 6);
375 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE
;
377 assert(inst
->mlen
== 9);
378 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD
;
379 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
382 case SHADER_OPCODE_TXD
:
383 /* There is no sample_d_c message; comparisons are done manually */
384 assert(inst
->mlen
== 7 || inst
->mlen
== 10);
385 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS
;
387 case SHADER_OPCODE_TXF
:
388 assert(inst
->mlen
== 9);
389 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_LD
;
390 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
392 case SHADER_OPCODE_TXS
:
393 assert(inst
->mlen
== 3);
394 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_RESINFO
;
395 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
398 assert(!"not reached");
402 assert(msg_type
!= -1);
404 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
409 /* Load the message header if present. If there's a texture offset,
410 * we need to set it up explicitly and load the offset bitfield.
411 * Otherwise, we can use an implied move from g0 to the first message reg.
413 if (inst
->texture_offset
) {
414 brw_push_insn_state(p
);
415 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
416 /* Explicitly set up the message header by copying g0 to the MRF. */
417 brw_MOV(p
, retype(brw_message_reg(inst
->base_mrf
), BRW_REGISTER_TYPE_UD
),
418 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
420 /* Then set the offset bits in DWord 2. */
421 brw_MOV(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
422 inst
->base_mrf
, 2), BRW_REGISTER_TYPE_UD
),
423 brw_imm_ud(inst
->texture_offset
));
424 brw_pop_insn_state(p
);
425 } else if (inst
->header_present
) {
426 /* Set up an implied move from g0 to the MRF. */
427 src
= retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
);
431 retype(dst
, BRW_REGISTER_TYPE_UW
),
434 SURF_INDEX_TEXTURE(inst
->sampler
),
440 inst
->header_present
,
446 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
449 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
451 * and we're trying to produce:
454 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
455 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
456 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
457 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
458 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
459 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
460 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
461 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
463 * and add another set of two more subspans if in 16-pixel dispatch mode.
465 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
466 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
467 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
468 * between each other. We could probably do it like ddx and swizzle the right
469 * order later, but bail for now and just produce
470 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
473 fs_generator::generate_ddx(fs_inst
*inst
, struct brw_reg dst
, struct brw_reg src
)
475 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
477 BRW_VERTICAL_STRIDE_2
,
479 BRW_HORIZONTAL_STRIDE_0
,
480 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
481 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
483 BRW_VERTICAL_STRIDE_2
,
485 BRW_HORIZONTAL_STRIDE_0
,
486 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
487 brw_ADD(p
, dst
, src0
, negate(src1
));
490 /* The negate_value boolean is used to negate the derivative computation for
491 * FBOs, since they place the origin at the upper left instead of the lower
495 fs_generator::generate_ddy(fs_inst
*inst
, struct brw_reg dst
, struct brw_reg src
,
498 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
500 BRW_VERTICAL_STRIDE_4
,
502 BRW_HORIZONTAL_STRIDE_0
,
503 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
504 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 2,
506 BRW_VERTICAL_STRIDE_4
,
508 BRW_HORIZONTAL_STRIDE_0
,
509 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
511 brw_ADD(p
, dst
, src1
, negate(src0
));
513 brw_ADD(p
, dst
, src0
, negate(src1
));
517 fs_generator::generate_discard(fs_inst
*inst
)
519 struct brw_reg f0
= brw_flag_reg();
521 if (intel
->gen
>= 6) {
522 struct brw_reg g1
= retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
523 struct brw_reg some_register
;
525 /* As of gen6, we no longer have the mask register to look at,
526 * so life gets a bit more complicated.
529 /* Load the flag register with all ones. */
530 brw_push_insn_state(p
);
531 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
532 brw_MOV(p
, f0
, brw_imm_uw(0xffff));
533 brw_pop_insn_state(p
);
535 /* Do a comparison that should always fail, to produce 0s in the flag
536 * reg where we have active channels.
538 some_register
= retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
);
539 brw_CMP(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD
),
540 BRW_CONDITIONAL_NZ
, some_register
, some_register
);
542 /* Undo CMP's whacking of predication*/
543 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
545 brw_push_insn_state(p
);
546 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
547 brw_AND(p
, g1
, f0
, g1
);
548 brw_pop_insn_state(p
);
550 struct brw_reg g0
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
552 brw_push_insn_state(p
);
553 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
554 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
556 /* Unlike the 965, we have the mask reg, so we just need
557 * somewhere to invert that (containing channels to be disabled)
558 * so it can be ANDed with the mask of pixels still to be
559 * written. Use the flag reg for consistency with gen6+.
561 brw_NOT(p
, f0
, brw_mask_reg(1)); /* IMASK */
562 brw_AND(p
, g0
, f0
, g0
);
564 brw_pop_insn_state(p
);
569 fs_generator::generate_spill(fs_inst
*inst
, struct brw_reg src
)
571 assert(inst
->mlen
!= 0);
574 retype(brw_message_reg(inst
->base_mrf
+ 1), BRW_REGISTER_TYPE_UD
),
575 retype(src
, BRW_REGISTER_TYPE_UD
));
576 brw_oword_block_write_scratch(p
, brw_message_reg(inst
->base_mrf
), 1,
581 fs_generator::generate_unspill(fs_inst
*inst
, struct brw_reg dst
)
583 assert(inst
->mlen
!= 0);
585 /* Clear any post destination dependencies that would be ignored by
586 * the block read. See the B-Spec for pre-gen5 send instruction.
588 * This could use a better solution, since texture sampling and
589 * math reads could potentially run into it as well -- anywhere
590 * that we have a SEND with a destination that is a register that
591 * was written but not read within the last N instructions (what's
592 * N? unsure). This is rare because of dead code elimination, but
595 if (intel
->gen
== 4 && !intel
->is_g4x
)
596 brw_MOV(p
, brw_null_reg(), dst
);
598 brw_oword_block_read_scratch(p
, dst
, brw_message_reg(inst
->base_mrf
), 1,
601 if (intel
->gen
== 4 && !intel
->is_g4x
) {
602 /* gen4 errata: destination from a send can't be used as a
603 * destination until it's been read. Just read it so we don't
606 brw_MOV(p
, brw_null_reg(), dst
);
611 fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
613 struct brw_reg index
,
614 struct brw_reg offset
)
616 assert(inst
->mlen
!= 0);
618 /* Clear any post destination dependencies that would be ignored by
619 * the block read. See the B-Spec for pre-gen5 send instruction.
621 * This could use a better solution, since texture sampling and
622 * math reads could potentially run into it as well -- anywhere
623 * that we have a SEND with a destination that is a register that
624 * was written but not read within the last N instructions (what's
625 * N? unsure). This is rare because of dead code elimination, but
628 if (intel
->gen
== 4 && !intel
->is_g4x
)
629 brw_MOV(p
, brw_null_reg(), dst
);
631 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
632 index
.type
== BRW_REGISTER_TYPE_UD
);
633 uint32_t surf_index
= index
.dw1
.ud
;
635 assert(offset
.file
== BRW_IMMEDIATE_VALUE
&&
636 offset
.type
== BRW_REGISTER_TYPE_UD
);
637 uint32_t read_offset
= offset
.dw1
.ud
;
639 brw_oword_block_read(p
, dst
, brw_message_reg(inst
->base_mrf
),
640 read_offset
, surf_index
);
642 if (intel
->gen
== 4 && !intel
->is_g4x
) {
643 /* gen4 errata: destination from a send can't be used as a
644 * destination until it's been read. Just read it so we don't
647 brw_MOV(p
, brw_null_reg(), dst
);
652 fs_generator::generate_varying_pull_constant_load(fs_inst
*inst
,
654 struct brw_reg index
)
656 assert(intel
->gen
< 7); /* Should use the gen7 variant. */
657 assert(inst
->header_present
);
659 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
660 index
.type
== BRW_REGISTER_TYPE_UD
);
661 uint32_t surf_index
= index
.dw1
.ud
;
663 uint32_t msg_type
, msg_control
, rlen
;
665 msg_type
= GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ
;
666 else if (intel
->gen
== 5 || intel
->is_g4x
)
667 msg_type
= G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ
;
669 msg_type
= BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ
;
671 if (dispatch_width
== 16) {
672 msg_control
= BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS
;
675 msg_control
= BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS
;
679 struct brw_reg header
= brw_vec8_grf(0, 0);
680 gen6_resolve_implied_move(p
, &header
, inst
->base_mrf
);
682 struct brw_instruction
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
683 brw_set_dest(p
, send
, dst
);
684 brw_set_src0(p
, send
, header
);
686 send
->header
.destreg__conditionalmod
= inst
->base_mrf
;
687 brw_set_dp_read_message(p
, send
,
691 BRW_DATAPORT_READ_TARGET_DATA_CACHE
,
693 inst
->header_present
,
698 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst
*inst
,
700 struct brw_reg index
,
701 struct brw_reg offset
)
703 assert(intel
->gen
>= 7);
704 /* Varying-offset pull constant loads are treated as a normal expression on
705 * gen7, so the fact that it's a send message is hidden at the IR level.
707 assert(!inst
->header_present
);
710 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
711 index
.type
== BRW_REGISTER_TYPE_UD
);
712 uint32_t surf_index
= index
.dw1
.ud
;
714 uint32_t msg_control
, rlen
, mlen
;
715 if (dispatch_width
== 16) {
716 msg_control
= BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS
;
719 msg_control
= BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS
;
723 struct brw_instruction
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
724 brw_set_dest(p
, send
, dst
);
725 brw_set_src0(p
, send
, offset
);
727 send
->header
.destreg__conditionalmod
= inst
->base_mrf
;
728 brw_set_dp_read_message(p
, send
,
731 GEN7_DATAPORT_DC_DWORD_SCATTERED_READ
,
732 BRW_DATAPORT_READ_TARGET_DATA_CACHE
,
734 inst
->header_present
,
739 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
740 * into the flags register (f0.0).
742 * Used only on Gen6 and above.
745 fs_generator::generate_mov_dispatch_to_flags()
747 struct brw_reg f0
= brw_flag_reg();
748 struct brw_reg g1
= retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
750 assert (intel
->gen
>= 6);
751 brw_push_insn_state(p
);
752 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
754 brw_pop_insn_state(p
);
758 static uint32_t brw_file_from_reg(fs_reg
*reg
)
762 return BRW_ARCHITECTURE_REGISTER_FILE
;
764 return BRW_GENERAL_REGISTER_FILE
;
766 return BRW_MESSAGE_REGISTER_FILE
;
768 return BRW_IMMEDIATE_VALUE
;
770 assert(!"not reached");
771 return BRW_GENERAL_REGISTER_FILE
;
775 static struct brw_reg
776 brw_reg_from_fs_reg(fs_reg
*reg
)
778 struct brw_reg brw_reg
;
784 if (reg
->smear
== -1) {
785 brw_reg
= brw_vec8_reg(brw_file_from_reg(reg
), reg
->reg
, 0);
787 brw_reg
= brw_vec1_reg(brw_file_from_reg(reg
), reg
->reg
, reg
->smear
);
789 brw_reg
= retype(brw_reg
, reg
->type
);
791 brw_reg
= sechalf(brw_reg
);
795 case BRW_REGISTER_TYPE_F
:
796 brw_reg
= brw_imm_f(reg
->imm
.f
);
798 case BRW_REGISTER_TYPE_D
:
799 brw_reg
= brw_imm_d(reg
->imm
.i
);
801 case BRW_REGISTER_TYPE_UD
:
802 brw_reg
= brw_imm_ud(reg
->imm
.u
);
805 assert(!"not reached");
806 brw_reg
= brw_null_reg();
811 brw_reg
= reg
->fixed_hw_reg
;
814 /* Probably unused. */
815 brw_reg
= brw_null_reg();
818 assert(!"not reached");
819 brw_reg
= brw_null_reg();
822 assert(!"not reached");
823 brw_reg
= brw_null_reg();
827 brw_reg
= brw_abs(brw_reg
);
829 brw_reg
= negate(brw_reg
);
835 fs_generator::generate_code(exec_list
*instructions
)
837 int last_native_insn_offset
= p
->next_insn_offset
;
838 const char *last_annotation_string
= NULL
;
839 const void *last_annotation_ir
= NULL
;
841 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
843 printf("Native code for fragment shader %d (%d-wide dispatch):\n",
844 prog
->Name
, dispatch_width
);
846 printf("Native code for fragment program %d (%d-wide dispatch):\n",
847 fp
->Base
.Id
, dispatch_width
);
852 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
853 cfg
= new(mem_ctx
) cfg_t(mem_ctx
, instructions
);
855 foreach_list(node
, instructions
) {
856 fs_inst
*inst
= (fs_inst
*)node
;
857 struct brw_reg src
[3], dst
;
859 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
860 foreach_list(node
, &cfg
->block_list
) {
861 bblock_link
*link
= (bblock_link
*)node
;
862 bblock_t
*block
= link
->block
;
864 if (block
->start
== inst
) {
865 printf(" START B%d", block
->block_num
);
866 foreach_list(predecessor_node
, &block
->parents
) {
867 bblock_link
*predecessor_link
=
868 (bblock_link
*)predecessor_node
;
869 bblock_t
*predecessor_block
= predecessor_link
->block
;
870 printf(" <-B%d", predecessor_block
->block_num
);
876 if (last_annotation_ir
!= inst
->ir
) {
877 last_annotation_ir
= inst
->ir
;
878 if (last_annotation_ir
) {
881 ((ir_instruction
*)inst
->ir
)->print();
883 const prog_instruction
*fpi
;
884 fpi
= (const prog_instruction
*)inst
->ir
;
885 printf("%d: ", (int)(fpi
- fp
->Base
.Instructions
));
886 _mesa_fprint_instruction_opt(stdout
,
888 0, PROG_PRINT_DEBUG
, NULL
);
893 if (last_annotation_string
!= inst
->annotation
) {
894 last_annotation_string
= inst
->annotation
;
895 if (last_annotation_string
)
896 printf(" %s\n", last_annotation_string
);
900 for (unsigned int i
= 0; i
< 3; i
++) {
901 src
[i
] = brw_reg_from_fs_reg(&inst
->src
[i
]);
903 /* The accumulator result appears to get used for the
904 * conditional modifier generation. When negating a UD
905 * value, there is a 33rd bit generated for the sign in the
906 * accumulator value, so now you can't check, for example,
907 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
909 assert(!inst
->conditional_mod
||
910 inst
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
911 !inst
->src
[i
].negate
);
913 dst
= brw_reg_from_fs_reg(&inst
->dst
);
915 brw_set_conditionalmod(p
, inst
->conditional_mod
);
916 brw_set_predicate_control(p
, inst
->predicate
);
917 brw_set_predicate_inverse(p
, inst
->predicate_inverse
);
918 brw_set_saturate(p
, inst
->saturate
);
919 brw_set_mask_control(p
, inst
->force_writemask_all
);
921 if (inst
->force_uncompressed
|| dispatch_width
== 8) {
922 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
923 } else if (inst
->force_sechalf
) {
924 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
926 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
929 switch (inst
->opcode
) {
931 brw_MOV(p
, dst
, src
[0]);
934 brw_ADD(p
, dst
, src
[0], src
[1]);
937 brw_MUL(p
, dst
, src
[0], src
[1]);
939 case BRW_OPCODE_MACH
:
940 brw_set_acc_write_control(p
, 1);
941 brw_MACH(p
, dst
, src
[0], src
[1]);
942 brw_set_acc_write_control(p
, 0);
946 brw_set_access_mode(p
, BRW_ALIGN_16
);
947 if (dispatch_width
== 16) {
948 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
949 brw_MAD(p
, dst
, src
[0], src
[1], src
[2]);
950 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
951 brw_MAD(p
, sechalf(dst
), sechalf(src
[0]), sechalf(src
[1]), sechalf(src
[2]));
952 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
954 brw_MAD(p
, dst
, src
[0], src
[1], src
[2]);
956 brw_set_access_mode(p
, BRW_ALIGN_1
);
960 brw_FRC(p
, dst
, src
[0]);
962 case BRW_OPCODE_RNDD
:
963 brw_RNDD(p
, dst
, src
[0]);
965 case BRW_OPCODE_RNDE
:
966 brw_RNDE(p
, dst
, src
[0]);
968 case BRW_OPCODE_RNDZ
:
969 brw_RNDZ(p
, dst
, src
[0]);
973 brw_AND(p
, dst
, src
[0], src
[1]);
976 brw_OR(p
, dst
, src
[0], src
[1]);
979 brw_XOR(p
, dst
, src
[0], src
[1]);
982 brw_NOT(p
, dst
, src
[0]);
985 brw_ASR(p
, dst
, src
[0], src
[1]);
988 brw_SHR(p
, dst
, src
[0], src
[1]);
991 brw_SHL(p
, dst
, src
[0], src
[1]);
995 brw_CMP(p
, dst
, inst
->conditional_mod
, src
[0], src
[1]);
998 brw_SEL(p
, dst
, src
[0], src
[1]);
1002 if (inst
->src
[0].file
!= BAD_FILE
) {
1003 /* The instruction has an embedded compare (only allowed on gen6) */
1004 assert(intel
->gen
== 6);
1005 gen6_IF(p
, inst
->conditional_mod
, src
[0], src
[1]);
1007 brw_IF(p
, dispatch_width
== 16 ? BRW_EXECUTE_16
: BRW_EXECUTE_8
);
1011 case BRW_OPCODE_ELSE
:
1014 case BRW_OPCODE_ENDIF
:
1019 brw_DO(p
, BRW_EXECUTE_8
);
1022 case BRW_OPCODE_BREAK
:
1024 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1026 case BRW_OPCODE_CONTINUE
:
1027 /* FINISHME: We need to write the loop instruction support still. */
1028 if (intel
->gen
>= 6)
1032 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1035 case BRW_OPCODE_WHILE
:
1039 case SHADER_OPCODE_RCP
:
1040 case SHADER_OPCODE_RSQ
:
1041 case SHADER_OPCODE_SQRT
:
1042 case SHADER_OPCODE_EXP2
:
1043 case SHADER_OPCODE_LOG2
:
1044 case SHADER_OPCODE_SIN
:
1045 case SHADER_OPCODE_COS
:
1046 if (intel
->gen
>= 7) {
1047 generate_math1_gen7(inst
, dst
, src
[0]);
1048 } else if (intel
->gen
== 6) {
1049 generate_math1_gen6(inst
, dst
, src
[0]);
1051 generate_math_gen4(inst
, dst
, src
[0]);
1054 case SHADER_OPCODE_INT_QUOTIENT
:
1055 case SHADER_OPCODE_INT_REMAINDER
:
1056 case SHADER_OPCODE_POW
:
1057 if (intel
->gen
>= 7) {
1058 generate_math2_gen7(inst
, dst
, src
[0], src
[1]);
1059 } else if (intel
->gen
== 6) {
1060 generate_math2_gen6(inst
, dst
, src
[0], src
[1]);
1062 generate_math_gen4(inst
, dst
, src
[0]);
1065 case FS_OPCODE_PIXEL_X
:
1066 generate_pixel_xy(dst
, true);
1068 case FS_OPCODE_PIXEL_Y
:
1069 generate_pixel_xy(dst
, false);
1071 case FS_OPCODE_CINTERP
:
1072 brw_MOV(p
, dst
, src
[0]);
1074 case FS_OPCODE_LINTERP
:
1075 generate_linterp(inst
, dst
, src
);
1077 case SHADER_OPCODE_TEX
:
1079 case SHADER_OPCODE_TXD
:
1080 case SHADER_OPCODE_TXF
:
1081 case SHADER_OPCODE_TXL
:
1082 case SHADER_OPCODE_TXS
:
1083 generate_tex(inst
, dst
, src
[0]);
1085 case FS_OPCODE_DISCARD
:
1086 generate_discard(inst
);
1089 generate_ddx(inst
, dst
, src
[0]);
1092 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1093 * guarantee that c->key.render_to_fbo is set).
1095 assert(fp
->UsesDFdy
);
1096 generate_ddy(inst
, dst
, src
[0], c
->key
.render_to_fbo
);
1099 case FS_OPCODE_SPILL
:
1100 generate_spill(inst
, src
[0]);
1103 case FS_OPCODE_UNSPILL
:
1104 generate_unspill(inst
, dst
);
1107 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
:
1108 generate_uniform_pull_constant_load(inst
, dst
, src
[0], src
[1]);
1111 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD
:
1112 generate_varying_pull_constant_load(inst
, dst
, src
[0]);
1115 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
1116 generate_varying_pull_constant_load_gen7(inst
, dst
, src
[0], src
[1]);
1119 case FS_OPCODE_FB_WRITE
:
1120 generate_fb_write(inst
);
1123 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
1124 generate_mov_dispatch_to_flags();
1128 if (inst
->opcode
< (int) ARRAY_SIZE(opcode_descs
)) {
1129 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
1130 opcode_descs
[inst
->opcode
].name
);
1132 _mesa_problem(ctx
, "Unsupported opcode %d in FS", inst
->opcode
);
1137 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1138 brw_dump_compile(p
, stdout
,
1139 last_native_insn_offset
, p
->next_insn_offset
);
1141 foreach_list(node
, &cfg
->block_list
) {
1142 bblock_link
*link
= (bblock_link
*)node
;
1143 bblock_t
*block
= link
->block
;
1145 if (block
->end
== inst
) {
1146 printf(" END B%d", block
->block_num
);
1147 foreach_list(successor_node
, &block
->children
) {
1148 bblock_link
*successor_link
=
1149 (bblock_link
*)successor_node
;
1150 bblock_t
*successor_block
= successor_link
->block
;
1151 printf(" ->B%d", successor_block
->block_num
);
1158 last_native_insn_offset
= p
->next_insn_offset
;
1161 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1167 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
1168 * emit issues, it doesn't get the jump distances into the output,
1169 * which is often something we want to debug. So this is here in
1170 * case you're doing that.
1173 brw_dump_compile(p
, stdout
, 0, p
->next_insn_offset
);
1178 fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1179 exec_list
*simd16_instructions
,
1180 unsigned *assembly_size
)
1183 generate_code(simd8_instructions
);
1185 if (simd16_instructions
) {
1186 /* We have to do a compaction pass now, or the one at the end of
1187 * execution will squash down where our prog_offset start needs
1190 brw_compact_instructions(p
);
1192 /* align to 64 byte boundary. */
1193 while ((p
->nr_insn
* sizeof(struct brw_instruction
)) % 64) {
1197 /* Save off the start of this 16-wide program */
1198 c
->prog_data
.prog_offset_16
= p
->nr_insn
* sizeof(struct brw_instruction
);
1200 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
1202 dispatch_width
= 16;
1203 generate_code(simd16_instructions
);
1206 return brw_get_program(p
, assembly_size
);