2 * Copyright © 2010, 2011, 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file gen8_fs_generate.cpp
26 * Code generation for Gen8+ hardware.
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
/* Constructor: forwards brw, the shader program, fp's base gl_program (or
 * NULL when fp is absent) and the WM compile struct to the gen8_generator
 * base class, then stashes c, fp and the dual-source-blend output flag for
 * use during code generation.
 * NOTE(review): this chunk is a line-shredded extraction — tokens are split
 * across physical lines and some original lines are missing.
 */
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
39 struct brw_wm_compile
*c
,
40 struct gl_shader_program
*shader_prog
,
41 struct gl_fragment_program
*fp
,
42 bool dual_source_output
)
43 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, c
), c(c
), fp(fp
),
44 dual_source_output(dual_source_output
)
/* Destructor — body not visible in this extraction; presumably empty or
 * trivial (TODO confirm against the full file). */
48 gen8_fs_generator::~gen8_fs_generator()
/* Emits the framebuffer-write (render target write) SENDC message for an
 * FS_OPCODE_FB_WRITE instruction: builds the optional message header
 * (dispatch mask restore for discard, replicate-alpha bit, render-target
 * index), selects the dataport message type from dual-source/SIMD width,
 * and marks the render-target surface as used.
 */
53 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
55 /* Disable the discard condition while setting up the header. */
56 default_state
.predicate
= BRW_PREDICATE_NONE
;
57 default_state
.predicate_inverse
= false;
58 default_state
.flag_subreg_nr
= 0;
60 if (ir
->header_present
) {
61 /* The GPU will use the predicate on SENDC, unless the header is present.
/* Restore the pixel mask into g1.7 when the shader uses discard, so the
 * SENDC predicate reflects live channels. */
63 if (fp
&& fp
->UsesKill
) {
64 gen8_instruction
*mov
=
65 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
67 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
/* Copy g0 into the message header. */
70 gen8_instruction
*mov
=
71 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
72 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
74 if (ir
->target
> 0 && c
->key
.replicate_alpha
) {
75 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
76 OR(vec1(retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
)),
77 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
82 /* Set the render target index for choosing BLEND_STATE. */
83 MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
84 BRW_REGISTER_TYPE_UD
),
85 brw_imm_ud(ir
->target
));
89 /* Set the predicate back to get the conditional write if necessary for
92 default_state
.predicate
= ir
->predicate
;
93 default_state
.predicate_inverse
= ir
->predicate_inverse
;
94 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
/* SENDC: render-target write with null destination (write-only message). */
96 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
97 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
98 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
100 /* Set up the "Message Specific Control" bits for the Data Port Message
101 * Descriptor. These are documented in the "Render Target Write" message's
102 * "Message Descriptor" documentation (vol5c.2).
105 /* Set the Message Type */
106 if (this->dual_source_output
)
107 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
108 else if (dispatch_width
== 16)
109 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
111 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
113 uint32_t msg_control
= msg_type
;
115 /* "Last Render Target Select" must be set on all writes to the last of
116 * the render targets (if using MRT), or always for a single RT scenario.
118 if ((ir
->target
== c
->key
.nr_color_regions
- 1) || !c
->key
.nr_color_regions
)
119 msg_control
|= (1 << 4); /* Last Render Target Select */
121 uint32_t surf_index
=
122 c
->prog_data
.binding_table
.render_target_start
+ ir
->target
;
124 gen8_set_dp_message(brw
, inst
,
125 GEN6_SFID_DATAPORT_RENDER_CACHE
,
127 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
/* Record the surface so the state upload code knows it is referenced. */
134 brw_mark_surface_used(&c
->prog_data
.base
, surf_index
);
/* Emits a PLN (plane) instruction for FS_OPCODE_LINTERP: linearly
 * interpolates an attribute using barycentric deltas in src[0]/src[1]
 * (which must be adjacent registers — PLN implicitly reads delta_x's
 * successor) and the interpolation setup coefficients in src[2].
 */
138 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
142 struct brw_reg delta_x
= src
[0];
143 struct brw_reg delta_y
= src
[1];
144 struct brw_reg interp
= src
[2];
/* PLN requires delta_y to live in the register right after delta_x. */
147 assert(delta_y
.nr
== delta_x
.nr
+ 1);
148 PLN(dst
, interp
, delta_x
);
/* Emits a SEND to the sampler for a texture operation: picks the sampler
 * message type from the IR opcode (and shadow_compare where relevant),
 * chooses SIMD8/SIMD16 mode, sets up the optional message header (texel
 * offsets, sampler state pointer offset for sampler indices >= 16), and
 * marks the texture surface as used.
 */
152 gen8_fs_generator::generate_tex(fs_inst
*ir
,
158 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
160 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
162 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
163 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
/* Map the shader opcode to the hardware sampler message type. */
165 switch (ir
->opcode
) {
166 case SHADER_OPCODE_TEX
:
167 if (ir
->shadow_compare
) {
168 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
170 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
174 if (ir
->shadow_compare
) {
175 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
177 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
180 case SHADER_OPCODE_TXL
:
181 if (ir
->shadow_compare
) {
182 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
184 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
187 case SHADER_OPCODE_TXS
:
188 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
190 case SHADER_OPCODE_TXD
:
191 if (ir
->shadow_compare
) {
192 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
194 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
197 case SHADER_OPCODE_TXF
:
198 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
200 case SHADER_OPCODE_TXF_CMS
:
201 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
203 case SHADER_OPCODE_TXF_UMS
:
204 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS
;
206 case SHADER_OPCODE_TXF_MCS
:
207 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
209 case SHADER_OPCODE_LOD
:
210 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
212 case SHADER_OPCODE_TG4
:
213 if (ir
->shadow_compare
) {
214 assert(brw
->gen
>= 7);
215 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
217 assert(brw
->gen
>= 6);
218 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
221 case SHADER_OPCODE_TG4_OFFSET
:
222 assert(brw
->gen
>= 7);
223 if (ir
->shadow_compare
) {
224 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
226 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
230 assert(!"not reached");
233 assert(msg_type
!= -1);
235 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
240 if (ir
->header_present
) {
241 /* The send-from-GRF for SIMD16 texturing with a header has an extra
242 * hardware register allocated to it, which we need to skip over (since
243 * our coordinates in the payload are in the even-numbered registers,
244 * and the header comes right before the first one.
246 if (dispatch_width
== 16)
/* Header setup must run SIMD8 regardless of dispatch width. */
249 unsigned save_exec_size
= default_state
.exec_size
;
250 default_state
.exec_size
= BRW_EXECUTE_8
;
252 MOV_RAW(src
, brw_vec8_grf(0, 0));
254 if (ir
->texture_offset
) {
255 /* Set the texel offset bits. */
256 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
257 brw_imm_ud(ir
->texture_offset
));
260 if (ir
->sampler
>= 16) {
261 /* The "Sampler Index" field can only store values between 0 and 15.
262 * However, we can add an offset to the "Sampler State Pointer"
263 * field, effectively selecting a different set of 16 samplers.
265 * The "Sampler State Pointer" needs to be aligned to a 32-byte
266 * offset, and each sampler state is only 16-bytes, so we can't
267 * exclusively use the offset - we have to use both.
269 gen8_instruction
*add
=
270 ADD(get_element_ud(src
, 3),
271 get_element_ud(brw_vec8_grf(0, 0), 3),
272 brw_imm_ud(16 * (ir
->sampler
/ 16) *
273 sizeof(gen7_sampler_state
)));
274 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
277 default_state
.exec_size
= save_exec_size
;
280 uint32_t surf_index
=
281 c
->prog_data
.base
.binding_table
.texture_start
+ ir
->sampler
;
283 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
284 gen8_set_dst(brw
, inst
, dst
);
285 gen8_set_src0(brw
, inst
, src
);
286 gen8_set_sampler_message(brw
, inst
,
/* Record the texture surface for the binding-table upload. */
295 brw_mark_surface_used(&c
->prog_data
.base
, surf_index
);
299 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
302 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
304 * and we're trying to produce:
307 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
308 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
309 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
310 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
311 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
312 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
313 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
314 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
316 * and add another set of two more subspans if in 16-pixel dispatch mode.
318 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
319 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
320 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
321 * between each other. We could probably do it like ddx and swizzle the right
322 * order later, but bail for now and just produce
323 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
/* Computes the X screen-space derivative via an ADD of two strided views of
 * the same source register (see region explanation above); stride choice
 * depends on whether accurate (per-pair) derivatives were requested.
 */
326 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
330 unsigned vstride
, width
;
332 if (c
->key
.high_quality_derivatives
) {
333 /* Produce accurate derivatives. */
334 vstride
= BRW_VERTICAL_STRIDE_2
;
337 /* Replicate the derivative at the top-left pixel to other pixels. */
338 vstride
= BRW_VERTICAL_STRIDE_4
;
/* src0/src1 view the same register with subnr 1 and 0 — i.e. right minus
 * left pixel of each pair. */
342 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
346 BRW_HORIZONTAL_STRIDE_0
,
347 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
348 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
352 BRW_HORIZONTAL_STRIDE_0
,
353 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
354 ADD(dst
, src0
, negate(src1
));
357 /* The negate_value boolean is used to negate the derivative computation for
358 * FBOs, since they place the origin at the upper left instead of the lower
/* Computes the Y screen-space derivative. High-quality mode uses Align16
 * swizzles (XYXY vs ZWZW) to subtract bottom rows from top rows per pair;
 * otherwise the top-left derivative is replicated. The operand order of the
 * final ADD is flipped by negate_value to account for FBO Y-flip.
 */
362 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
368 unsigned src0_swizzle
;
369 unsigned src1_swizzle
;
372 if (c
->key
.high_quality_derivatives
) {
373 /* Produce accurate derivatives. */
374 hstride
= BRW_HORIZONTAL_STRIDE_1
;
375 src0_swizzle
= BRW_SWIZZLE_XYXY
;
376 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
/* Swizzles require Align16 access mode. */
379 default_state
.access_mode
= BRW_ALIGN_16
;
381 /* Replicate the derivative at the top-left pixel to other pixels. */
382 hstride
= BRW_HORIZONTAL_STRIDE_0
;
383 src0_swizzle
= BRW_SWIZZLE_XYZW
;
384 src1_swizzle
= BRW_SWIZZLE_XYZW
;
388 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
390 BRW_VERTICAL_STRIDE_4
,
393 src0_swizzle
, WRITEMASK_XYZW
);
394 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
396 BRW_VERTICAL_STRIDE_4
,
399 src1_swizzle
, WRITEMASK_XYZW
);
402 ADD(dst
, src1
, negate(src0
));
404 ADD(dst
, src0
, negate(src1
));
/* Restore the default Align1 access mode. */
406 default_state
.access_mode
= BRW_ALIGN_1
;
/* Spills a register to scratch space via a dataport OWord block write:
 * builds a message header from g0 with the scratch offset in g0.2, copies
 * the payload into MRFs, and emits the SEND (binding table index 255 =
 * stateless access).
 */
410 gen8_fs_generator::generate_scratch_write(fs_inst
*ir
, struct brw_reg src
)
412 MOV(retype(brw_message_reg(ir
->base_mrf
+ 1), BRW_REGISTER_TYPE_UD
),
413 retype(src
, BRW_REGISTER_TYPE_UD
));
416 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
418 const int num_regs
= dispatch_width
/ 8;
420 uint32_t msg_control
;
/* 2 OWords per SIMD8 register, 4 for SIMD16. */
422 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
424 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
426 /* Set up the message header. This is g0, with g0.2 filled with
427 * the offset. We don't want to leave our offset around in g0 or
428 * it'll screw up texture samples, so set it up inside the message
431 unsigned save_exec_size
= default_state
.exec_size
;
432 default_state
.exec_size
= BRW_EXECUTE_8
;
434 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
435 /* set message header global offset field (reg 0, element 2) */
436 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
439 if (dispatch_width
== 16)
440 dst
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
442 dst
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
444 default_state
.exec_size
= BRW_EXECUTE_16
;
446 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
447 gen8_set_dst(brw
, send
, dst
);
448 gen8_set_src0(brw
, send
, mrf
);
449 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
450 255, /* binding table index: stateless access */
451 GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
,
453 1 + num_regs
, /* mlen */
455 true, /* header present */
/* Restore caller's execution size. */
458 default_state
.exec_size
= save_exec_size
;
/* Fills (unspills) a register from scratch space via a dataport OWord block
 * read; mirrors generate_scratch_write: header in the MRF with the scratch
 * offset in element 2, stateless access through binding table index 255.
 */
462 gen8_fs_generator::generate_scratch_read(fs_inst
*ir
, struct brw_reg dst
)
465 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
467 const int num_regs
= dispatch_width
/ 8;
469 uint32_t msg_control
;
/* 2 OWords per SIMD8 register, 4 for SIMD16. */
471 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
473 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
/* Header setup must run SIMD8 with the default mask state. */
475 unsigned save_exec_size
= default_state
.exec_size
;
476 default_state
.exec_size
= BRW_EXECUTE_8
;
478 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
479 /* set message header global offset field (reg 0, element 2) */
480 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
482 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
483 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
484 gen8_set_src0(brw
, send
, mrf
);
485 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
486 255, /* binding table index: stateless access */
487 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
491 true, /* header present */
494 default_state
.exec_size
= save_exec_size
;
/* Gen7+-style scratch read using the dedicated scratch message: no explicit
 * header construction needed (g0 is sent directly), with the offset encoded
 * in the message descriptor in HWord (register-sized) units.
 */
498 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*ir
, struct brw_reg dst
)
500 unsigned save_exec_size
= default_state
.exec_size
;
501 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
503 int num_regs
= dispatch_width
/ 8;
505 /* According to the docs, offset is "A 12-bit HWord offset into the memory
506 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
507 * is 32 bytes, which happens to be the size of a register.
509 int offset
= ir
->offset
/ REG_SIZE
;
511 /* The HW requires that the header is present; this is to get the g0.5
514 gen8_set_src0(brw
, send
, brw_vec8_grf(0, 0));
515 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
516 gen8_set_dp_scratch_message(brw
, send
,
517 false, /* scratch read */
519 false, /* invalidate after read */
522 1, /* mlen - just g0 */
524 true, /* header present */
527 default_state
.exec_size
= save_exec_size
;
/* Loads a uniform (constant-offset) pull constant using a SIMD4x2 sampler
 * LD message: four consecutive constants land in dst from the offset held
 * in the first component of the offset register. The surface index comes
 * from the immediate `index` argument.
 */
531 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
533 struct brw_reg index
,
534 struct brw_reg offset
)
536 assert(inst
->mlen
== 0);
538 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
539 index
.type
== BRW_REGISTER_TYPE_UD
);
540 uint32_t surf_index
= index
.dw1
.ud
;
542 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
543 /* Reference only the dword we need lest we anger validate_reg() with
544 * reg.width > reg.execszie.
546 offset
= brw_vec1_grf(offset
.nr
, 0);
548 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
549 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
551 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
552 * the destination loaded consecutively from the same offset (which appears
553 * in the first component, and the rest are ignored).
555 dst
.width
= BRW_WIDTH_4
;
556 gen8_set_dst(brw
, send
, dst
);
557 gen8_set_src0(brw
, send
, offset
);
558 gen8_set_sampler_message(brw
, send
,
560 0, /* The LD message ignores the sampler unit. */
561 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
564 false, /* no header */
565 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
/* Record the constant-buffer surface for the binding-table upload. */
567 brw_mark_surface_used(&c
->prog_data
.base
, surf_index
);
/* Loads pull constants at per-channel (varying) offsets via a sampler LD
 * message in SIMD8 or SIMD16 mode; treated as a normal expression at the
 * IR level, so no header is used.
 */
571 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
573 struct brw_reg index
,
574 struct brw_reg offset
)
576 /* Varying-offset pull constant loads are treated as a normal expression on
577 * gen7, so the fact that it's a send message is hidden at the IR level.
579 assert(!ir
->header_present
);
582 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
583 index
.type
== BRW_REGISTER_TYPE_UD
);
584 uint32_t surf_index
= index
.dw1
.ud
;
586 uint32_t simd_mode
, rlen
, mlen
;
587 if (dispatch_width
== 16) {
590 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
594 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
597 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
598 gen8_set_dst(brw
, send
, dst
);
599 gen8_set_src0(brw
, send
, offset
);
600 gen8_set_sampler_message(brw
, send
,
602 0, /* The LD message ignore the sampler unit. */
603 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
606 false, /* no header */
609 brw_mark_surface_used(&c
->prog_data
.base
, surf_index
);
613 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
614 * into the flags register (f0.0).
/* Copies the dispatch mask (g1.7, UW) into the flag register selected by
 * ir->flag_subreg, with the execution mask disabled so all 16 bits move.
 */
617 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
619 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
620 struct brw_reg dispatch_mask
=
621 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
623 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
624 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
/* Emits a HALT for a discard and records its instruction index so
 * patch_discard_jumps_to_fb_writes() can fix up the UIP later.
 */
628 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
630 /* This HALT will be patched up at FB write time to point UIP at the end of
631 * the program, and at brw_uip_jip() JIP will be set to the end of the
632 * current block (or the program).
634 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
/* Emits a final HALT (required by the hardware's UIP tracking, see comment
 * below) and patches every recorded discard HALT's UIP to jump here. No-op
 * when no discards were emitted.
 */
640 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
642 if (discard_halt_patches
.is_empty())
645 /* There is a somewhat strange undocumented requirement of using
646 * HALT, according to the simulator. If some channel has HALTed to
647 * a particular UIP, then by the end of the program, every channel
648 * must have HALTed to that UIP. Furthermore, the tracking is a
649 * stack, so you can't do the final halt of a UIP after starting
650 * halting to a new UIP.
652 * Symptoms of not emitting this instruction on actual hardware
653 * included GPU hangs and sparkly rendering on the piglit discard
656 gen8_instruction
*last_halt
= HALT();
657 gen8_set_uip(last_halt
, 16);
658 gen8_set_jip(last_halt
, 16);
662 foreach_list(node
, &discard_halt_patches
) {
663 ip_record
*patch_ip
= (ip_record
*) node
;
664 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
665 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
667 /* HALT takes an instruction distance from the pre-incremented IP. */
668 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
671 this->discard_halt_patches
.make_empty();
675 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
676 * sampler LD messages.
678 * We don't want to bake it into the send message's code generation because
679 * that means we don't get a chance to schedule the instruction.
/* Writes the immediate `value` into element 0 of dst (raw MOV, mask
 * disabled) for use as a SIMD4x2 LD message offset.
 */
682 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
684 struct brw_reg value
)
686 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
687 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
691 * Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
692 * (when mask is passed as a uniform) of register mask before moving it
/* Copies the oMask value into dst (UW), restriding the source depending on
 * whether it arrives as a full register region or a uniform scalar. Runs
 * SIMD8 with the mask disabled.
 */
696 gen8_fs_generator::generate_set_omask(fs_inst
*inst
,
700 assert(dst
.type
== BRW_REGISTER_TYPE_UW
);
702 if (dispatch_width
== 16)
/* Full-register region: restride to pick alternate UWs. */
705 if (mask
.vstride
== BRW_VERTICAL_STRIDE_8
&&
706 mask
.width
== BRW_WIDTH_8
&&
707 mask
.hstride
== BRW_HORIZONTAL_STRIDE_1
) {
708 mask
= stride(mask
, 16, 8, 2);
/* Otherwise it must already be a uniform scalar region. */
710 assert(mask
.vstride
== BRW_VERTICAL_STRIDE_0
&&
711 mask
.width
== BRW_WIDTH_1
&&
712 mask
.hstride
== BRW_HORIZONTAL_STRIDE_0
);
715 unsigned save_exec_size
= default_state
.exec_size
;
716 default_state
.exec_size
= BRW_EXECUTE_8
;
718 gen8_instruction
*mov
= MOV(dst
, retype(mask
, dst
.type
));
719 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
721 default_state
.exec_size
= save_exec_size
;
725 * Do a special ADD with vstride=1, width=4, hstride=0 for src1.
/* Computes per-channel sample IDs: adds the restrided src1 (UW, replicated
 * in groups of 4) to src0 with the mask disabled, emitting a second ADD for
 * the upper half in SIMD16 mode.
 */
728 gen8_fs_generator::generate_set_sample_id(fs_inst
*ir
,
733 assert(dst
.type
== BRW_REGISTER_TYPE_D
|| dst
.type
== BRW_REGISTER_TYPE_UD
);
734 assert(src0
.type
== BRW_REGISTER_TYPE_D
|| src0
.type
== BRW_REGISTER_TYPE_UD
);
736 struct brw_reg reg
= retype(stride(src1
, 1, 4, 0), BRW_REGISTER_TYPE_UW
);
738 unsigned save_exec_size
= default_state
.exec_size
;
739 default_state
.exec_size
= BRW_EXECUTE_8
;
741 gen8_instruction
*add
= ADD(dst
, src0
, reg
);
742 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
743 if (dispatch_width
== 16) {
/* Second half: advance dst/src0 one register, skip 2 UWs into src1. */
744 add
= ADD(offset(dst
, 1), offset(src0
, 1), suboffset(reg
, 2));
745 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
748 default_state
.exec_size
= save_exec_size
;
752 * Change the register's data type from UD to HF, doubling the strides in order
753 * to compensate for halving the data type width.
755 static struct brw_reg
756 ud_reg_to_hf(struct brw_reg r
)
758 assert(r
.type
== BRW_REGISTER_TYPE_UD
);
759 r
.type
= BRW_REGISTER_TYPE_HF
;
761 /* The BRW_*_STRIDE enums are defined so that incrementing the field
762 * doubles the real stride.
/* Implements packHalf2x16 from two float sources: converts each to HF into
 * the respective half of each UD channel of dst, shifting the first
 * component into the high word with SHL.
 */
773 gen8_fs_generator::generate_pack_half_2x16_split(fs_inst
*inst
,
778 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
779 assert(x
.type
== BRW_REGISTER_TYPE_F
);
780 assert(y
.type
== BRW_REGISTER_TYPE_F
);
782 struct brw_reg dst_hf
= ud_reg_to_hf(dst
);
784 /* Give each 32-bit channel of dst the form below , where "." means
/* Move the first half word into the upper 16 bits of each channel. */
793 SHL(dst
, dst
, brw_imm_ud(16u));
795 /* And, finally the form of packHalf2x16's output:
/* Implements unpackHalf2x16 (X or Y variant): reinterprets src as HF with
 * doubled strides and converts the selected 16-bit half (upper word for the
 * _Y opcode, via a 16-bit subregister offset) to float in dst.
 */
802 gen8_fs_generator::generate_unpack_half_2x16_split(fs_inst
*inst
,
806 assert(dst
.type
== BRW_REGISTER_TYPE_F
);
807 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
809 struct brw_reg src_hf
= ud_reg_to_hf(src
);
811 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
812 * For the Y case, we wish to access only the upper word; therefore
813 * a 16-bit subregister offset is needed.
815 assert(inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
||
816 inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
);
817 if (inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
)
/* Emits an untyped surface read (data cache port 1) of the R channel only,
 * with the SIMD mode encoded in the message control bits; marks the surface
 * (immediate index) as used.
 */
824 gen8_fs_generator::generate_untyped_surface_read(fs_inst
*ir
,
826 struct brw_reg surf_index
)
828 assert(surf_index
.file
== BRW_IMMEDIATE_VALUE
&&
829 surf_index
.type
== BRW_REGISTER_TYPE_UD
);
831 unsigned msg_control
= 0xe | /* Enable only the R channel */
832 ((dispatch_width
== 16 ? 1 : 2) << 4); /* SIMD Mode */
834 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
835 gen8_set_dst(brw
, inst
, retype(dst
, BRW_REGISTER_TYPE_UD
));
836 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
837 gen8_set_dp_message(brw
, inst
, HSW_SFID_DATAPORT_DATA_CACHE_1
,
839 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ
,
846 brw_mark_surface_used(&c
->prog_data
.base
, surf_index
.dw1
.ud
);
/* Main code-generation loop: walks the fs_inst list, sets up per-instruction
 * default state (predication, saturate, exec size, quarter control), and
 * dispatches each opcode to the matching emitter or inline ALU helper.
 * Under INTEL_DEBUG=wm it also prints IR annotations, basic-block structure
 * (via a freshly built CFG) and a running disassembly. Finishes by patching
 * jump targets.
 */
850 gen8_fs_generator::generate_code(exec_list
*instructions
)
852 int last_native_inst_offset
= next_inst_offset
;
853 const char *last_annotation_string
= NULL
;
854 const void *last_annotation_ir
= NULL
;
856 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
859 "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
860 shader_prog
->Label
? shader_prog
->Label
: "unnamed",
861 shader_prog
->Name
, dispatch_width
);
864 "Native code for fragment program %d (SIMD%d dispatch):\n",
865 prog
->Id
, dispatch_width
);
867 fprintf(stderr
, "Native code for blorp program (SIMD%d dispatch):\n",
/* CFG is only needed for the debug block annotations. */
873 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
874 cfg
= new(mem_ctx
) cfg_t(instructions
);
876 foreach_list(node
, instructions
) {
877 fs_inst
*ir
= (fs_inst
*) node
;
878 struct brw_reg src
[3], dst
;
880 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
881 foreach_list(node
, &cfg
->block_list
) {
882 bblock_link
*link
= (bblock_link
*)node
;
883 bblock_t
*block
= link
->block
;
885 if (block
->start
== ir
) {
886 fprintf(stderr
, " START B%d", block
->block_num
);
887 foreach_list(predecessor_node
, &block
->parents
) {
888 bblock_link
*predecessor_link
=
889 (bblock_link
*)predecessor_node
;
890 bblock_t
*predecessor_block
= predecessor_link
->block
;
891 fprintf(stderr
, " <-B%d", predecessor_block
->block_num
);
893 fprintf(stderr
, "\n");
897 if (last_annotation_ir
!= ir
->ir
) {
898 last_annotation_ir
= ir
->ir
;
899 if (last_annotation_ir
) {
900 fprintf(stderr
, " ");
902 ((ir_instruction
*) ir
->ir
)->fprint(stderr
);
904 const prog_instruction
*fpi
;
905 fpi
= (const prog_instruction
*) ir
->ir
;
906 fprintf(stderr
, "%d: ", (int)(fpi
- prog
->Instructions
));
907 _mesa_fprint_instruction_opt(stderr
,
909 0, PROG_PRINT_DEBUG
, NULL
);
911 fprintf(stderr
, "\n");
914 if (last_annotation_string
!= ir
->annotation
) {
915 last_annotation_string
= ir
->annotation
;
916 if (last_annotation_string
)
917 fprintf(stderr
, " %s\n", last_annotation_string
);
/* Translate IR registers to hardware regions. */
921 for (unsigned int i
= 0; i
< 3; i
++) {
922 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
924 /* The accumulator result appears to get used for the
925 * conditional modifier generation. When negating a UD
926 * value, there is a 33rd bit generated for the sign in the
927 * accumulator value, so now you can't check, for example,
928 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
930 assert(!ir
->conditional_mod
||
931 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
934 dst
= brw_reg_from_fs_reg(&ir
->dst
);
/* Per-instruction default emission state. */
936 default_state
.conditional_mod
= ir
->conditional_mod
;
937 default_state
.predicate
= ir
->predicate
;
938 default_state
.predicate_inverse
= ir
->predicate_inverse
;
939 default_state
.saturate
= ir
->saturate
;
940 default_state
.mask_control
= ir
->force_writemask_all
;
941 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
943 if (dispatch_width
== 16 && !ir
->force_uncompressed
)
944 default_state
.exec_size
= BRW_EXECUTE_16
;
946 default_state
.exec_size
= BRW_EXECUTE_8
;
/* Select which quarter(s) of the dispatch the instruction covers. */
948 if (ir
->force_uncompressed
|| dispatch_width
== 8)
949 default_state
.qtr_control
= GEN6_COMPRESSION_1Q
;
950 else if (ir
->force_sechalf
)
951 default_state
.qtr_control
= GEN6_COMPRESSION_2Q
;
953 default_state
.qtr_control
= GEN6_COMPRESSION_1H
;
955 switch (ir
->opcode
) {
960 ADD(dst
, src
[0], src
[1]);
963 MUL(dst
, src
[0], src
[1]);
965 case BRW_OPCODE_MACH
:
966 MACH(dst
, src
[0], src
[1]);
/* Three-source instructions require Align16 mode. */
970 default_state
.access_mode
= BRW_ALIGN_16
;
971 MAD(dst
, src
[0], src
[1], src
[2]);
972 default_state
.access_mode
= BRW_ALIGN_1
;
976 default_state
.access_mode
= BRW_ALIGN_16
;
977 LRP(dst
, src
[0], src
[1], src
[2]);
978 default_state
.access_mode
= BRW_ALIGN_1
;
985 case BRW_OPCODE_RNDD
:
988 case BRW_OPCODE_RNDE
:
991 case BRW_OPCODE_RNDZ
:
996 AND(dst
, src
[0], src
[1]);
999 OR(dst
, src
[0], src
[1]);
1001 case BRW_OPCODE_XOR
:
1002 XOR(dst
, src
[0], src
[1]);
1004 case BRW_OPCODE_NOT
:
1007 case BRW_OPCODE_ASR
:
1008 ASR(dst
, src
[0], src
[1]);
1010 case BRW_OPCODE_SHR
:
1011 SHR(dst
, src
[0], src
[1]);
1013 case BRW_OPCODE_SHL
:
1014 SHL(dst
, src
[0], src
[1]);
1017 case BRW_OPCODE_F32TO16
:
1018 MOV(retype(dst
, BRW_REGISTER_TYPE_HF
), src
[0]);
1020 case BRW_OPCODE_F16TO32
:
1021 MOV(dst
, retype(src
[0], BRW_REGISTER_TYPE_HF
));
1024 case BRW_OPCODE_CMP
:
1025 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
1027 case BRW_OPCODE_SEL
:
1028 SEL(dst
, src
[0], src
[1]);
1031 case BRW_OPCODE_BFREV
:
1032 /* BFREV only supports UD type for src and dst. */
1033 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
1034 retype(src
[0], BRW_REGISTER_TYPE_UD
));
1037 case BRW_OPCODE_FBH
:
1038 /* FBH only supports UD type for dst. */
1039 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1042 case BRW_OPCODE_FBL
:
1043 /* FBL only supports UD type for dst. */
1044 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1047 case BRW_OPCODE_CBIT
:
1048 /* CBIT only supports UD type for dst. */
1049 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1052 case BRW_OPCODE_ADDC
:
1053 ADDC(dst
, src
[0], src
[1]);
1056 case BRW_OPCODE_SUBB
:
1057 SUBB(dst
, src
[0], src
[1]);
1060 case BRW_OPCODE_BFE
:
1061 default_state
.access_mode
= BRW_ALIGN_16
;
1062 BFE(dst
, src
[0], src
[1], src
[2]);
1063 default_state
.access_mode
= BRW_ALIGN_1
;
1066 case BRW_OPCODE_BFI1
:
1067 BFI1(dst
, src
[0], src
[1]);
1070 case BRW_OPCODE_BFI2
:
1071 default_state
.access_mode
= BRW_ALIGN_16
;
1072 BFI2(dst
, src
[0], src
[1], src
[2]);
1073 default_state
.access_mode
= BRW_ALIGN_1
;
/* Structured control flow. */
1077 IF(BRW_PREDICATE_NORMAL
);
1080 case BRW_OPCODE_ELSE
:
1084 case BRW_OPCODE_ENDIF
:
1092 case BRW_OPCODE_BREAK
:
1096 case BRW_OPCODE_CONTINUE
:
1100 case BRW_OPCODE_WHILE
:
/* Extended-math unit operations. */
1104 case SHADER_OPCODE_RCP
:
1105 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
1108 case SHADER_OPCODE_RSQ
:
1109 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
1112 case SHADER_OPCODE_SQRT
:
1113 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
1116 case SHADER_OPCODE_EXP2
:
1117 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
1120 case SHADER_OPCODE_LOG2
:
1121 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
1124 case SHADER_OPCODE_SIN
:
1125 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
1128 case SHADER_OPCODE_COS
:
1129 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
1132 case SHADER_OPCODE_INT_QUOTIENT
:
1133 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
1136 case SHADER_OPCODE_INT_REMAINDER
:
1137 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
1140 case SHADER_OPCODE_POW
:
1141 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
1144 case FS_OPCODE_PIXEL_X
:
1145 case FS_OPCODE_PIXEL_Y
:
1146 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
1149 case FS_OPCODE_CINTERP
:
1152 case FS_OPCODE_LINTERP
:
1153 generate_linterp(ir
, dst
, src
);
/* All texturing goes through the common sampler emitter. */
1155 case SHADER_OPCODE_TEX
:
1157 case SHADER_OPCODE_TXD
:
1158 case SHADER_OPCODE_TXF
:
1159 case SHADER_OPCODE_TXF_CMS
:
1160 case SHADER_OPCODE_TXF_UMS
:
1161 case SHADER_OPCODE_TXF_MCS
:
1162 case SHADER_OPCODE_TXL
:
1163 case SHADER_OPCODE_TXS
:
1164 case SHADER_OPCODE_LOD
:
1165 case SHADER_OPCODE_TG4
:
1166 case SHADER_OPCODE_TG4_OFFSET
:
1167 generate_tex(ir
, dst
, src
[0]);
1171 generate_ddx(ir
, dst
, src
[0]);
1174 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1175 * guarantee that c->key.render_to_fbo is set).
1177 assert(fp
->UsesDFdy
);
1178 generate_ddy(ir
, dst
, src
[0], c
->key
.render_to_fbo
);
1181 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
1182 generate_scratch_write(ir
, src
[0]);
1185 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
1186 generate_scratch_read(ir
, dst
);
1189 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
1190 generate_scratch_read_gen7(ir
, dst
);
1193 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
1194 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1197 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
1198 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1201 case FS_OPCODE_FB_WRITE
:
1202 generate_fb_write(ir
);
1205 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
1206 generate_mov_dispatch_to_flags(ir
);
1209 case FS_OPCODE_DISCARD_JUMP
:
1210 generate_discard_jump(ir
);
1213 case SHADER_OPCODE_SHADER_TIME_ADD
:
1214 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
1217 case SHADER_OPCODE_UNTYPED_ATOMIC
:
1218 assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
1221 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
1222 generate_untyped_surface_read(ir
, dst
, src
[0]);
1225 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
1226 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
1229 case FS_OPCODE_SET_OMASK
:
1230 generate_set_omask(ir
, dst
, src
[0]);
1233 case FS_OPCODE_SET_SAMPLE_ID
:
1234 generate_set_sample_id(ir
, dst
, src
[0], src
[1]);
1237 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
1238 generate_pack_half_2x16_split(ir
, dst
, src
[0], src
[1]);
1241 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
1242 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
1243 generate_unpack_half_2x16_split(ir
, dst
, src
[0]);
1246 case FS_OPCODE_PLACEHOLDER_HALT
:
1247 /* This is the place where the final HALT needs to be inserted if
1248 * we've emitted any discards. If not, this will emit no code.
1250 patch_discard_jumps_to_fb_writes();
1254 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
1255 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
1256 opcode_descs
[ir
->opcode
].name
);
1258 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
/* Debug: disassemble what was just emitted and print block ends. */
1263 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1264 disassemble(stderr
, last_native_inst_offset
, next_inst_offset
);
1266 foreach_list(node
, &cfg
->block_list
) {
1267 bblock_link
*link
= (bblock_link
*)node
;
1268 bblock_t
*block
= link
->block
;
1270 if (block
->end
== ir
) {
1271 fprintf(stderr
, " END B%d", block
->block_num
);
1272 foreach_list(successor_node
, &block
->children
) {
1273 bblock_link
*successor_link
=
1274 (bblock_link
*)successor_node
;
1275 bblock_t
*successor_block
= successor_link
->block
;
1276 fprintf(stderr
, " ->B%d", successor_block
->block_num
);
1278 fprintf(stderr
, "\n");
1283 last_native_inst_offset
= next_inst_offset
;
1286 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1287 fprintf(stderr
, "\n");
1290 patch_jump_targets();
1292 /* OK, while the INTEL_DEBUG=fs above is very nice for debugging FS
1293 * emit issues, it doesn't get the jump distances into the output,
1294 * which is often something we want to debug. So this is here in
1295 * case you're doing that.
1297 if (0 && unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1298 disassemble(stderr
, 0, next_inst_offset
);
/* Top-level entry point: generates the SIMD8 program (if any), then aligns
 * to a 64-byte boundary, records the SIMD16 program's start offset in
 * prog_data, and generates the SIMD16 program (if any). Returns the
 * instruction store and its size in bytes via *assembly_size.
 */
1303 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1304 exec_list
*simd16_instructions
,
1305 unsigned *assembly_size
)
1307 assert(simd8_instructions
|| simd16_instructions
);
1309 if (simd8_instructions
) {
1311 generate_code(simd8_instructions
);
1314 if (simd16_instructions
) {
1315 /* Align to a 64-byte boundary. */
1316 while ((nr_inst
* sizeof(gen8_instruction
)) % 64)
1319 /* Save off the start of this SIMD16 program */
1320 c
->prog_data
.prog_offset_16
= nr_inst
* sizeof(gen8_instruction
);
1322 dispatch_width
= 16;
1323 generate_code(simd16_instructions
);
1326 *assembly_size
= next_inst_offset
;
1327 return (const unsigned *) store
;