2 * Copyright © 2010, 2011, 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file gen8_fs_generate.cpp
26 * Code generation for Gen8+ hardware.
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
40 const struct brw_wm_prog_key
*key
,
41 struct brw_wm_prog_data
*prog_data
,
42 struct gl_shader_program
*shader_prog
,
43 struct gl_fragment_program
*fp
,
44 bool dual_source_output
)
45 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, mem_ctx
),
46 key(key
), prog_data(prog_data
),
47 fp(fp
), dual_source_output(dual_source_output
)
51 gen8_fs_generator::~gen8_fs_generator()
56 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
58 /* Disable the discard condition while setting up the header. */
59 default_state
.predicate
= BRW_PREDICATE_NONE
;
60 default_state
.predicate_inverse
= false;
61 default_state
.flag_subreg_nr
= 0;
63 if (ir
->header_present
) {
64 /* The GPU will use the predicate on SENDC, unless the header is present.
66 if (fp
&& fp
->UsesKill
) {
67 gen8_instruction
*mov
=
68 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
70 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
73 gen8_instruction
*mov
=
74 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
75 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
77 if (ir
->target
> 0 && key
->replicate_alpha
) {
78 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
79 gen8_instruction
*inst
=
80 OR(get_element_ud(brw_message_reg(ir
->base_mrf
), 0),
81 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
83 gen8_set_mask_control(inst
, BRW_MASK_DISABLE
);
87 /* Set the render target index for choosing BLEND_STATE. */
88 MOV_RAW(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
89 brw_imm_ud(ir
->target
));
93 /* Set the predicate back to get the conditional write if necessary for
96 default_state
.predicate
= ir
->predicate
;
97 default_state
.predicate_inverse
= ir
->predicate_inverse
;
98 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
100 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
101 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
102 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
104 /* Set up the "Message Specific Control" bits for the Data Port Message
105 * Descriptor. These are documented in the "Render Target Write" message's
106 * "Message Descriptor" documentation (vol5c.2).
109 /* Set the Message Type */
110 if (this->dual_source_output
)
111 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
112 else if (dispatch_width
== 16)
113 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
115 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
117 uint32_t msg_control
= msg_type
;
119 /* "Last Render Target Select" must be set on all writes to the last of
120 * the render targets (if using MRT), or always for a single RT scenario.
122 if ((ir
->target
== key
->nr_color_regions
- 1) || !key
->nr_color_regions
)
123 msg_control
|= (1 << 4); /* Last Render Target Select */
125 uint32_t surf_index
=
126 prog_data
->binding_table
.render_target_start
+ ir
->target
;
128 gen8_set_dp_message(brw
, inst
,
129 GEN6_SFID_DATAPORT_RENDER_CACHE
,
131 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
138 brw_mark_surface_used(&prog_data
->base
, surf_index
);
142 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
146 struct brw_reg delta_x
= src
[0];
147 struct brw_reg delta_y
= src
[1];
148 struct brw_reg interp
= src
[2];
151 assert(delta_y
.nr
== delta_x
.nr
+ 1);
152 PLN(dst
, interp
, delta_x
);
156 gen8_fs_generator::generate_tex(fs_inst
*ir
,
162 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
164 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
166 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
167 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
169 switch (ir
->opcode
) {
170 case SHADER_OPCODE_TEX
:
171 if (ir
->shadow_compare
) {
172 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
174 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
178 if (ir
->shadow_compare
) {
179 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
181 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
184 case SHADER_OPCODE_TXL
:
185 if (ir
->shadow_compare
) {
186 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
188 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
191 case SHADER_OPCODE_TXS
:
192 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
194 case SHADER_OPCODE_TXD
:
195 if (ir
->shadow_compare
) {
196 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
198 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
201 case SHADER_OPCODE_TXF
:
202 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
204 case SHADER_OPCODE_TXF_CMS
:
205 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
207 case SHADER_OPCODE_TXF_UMS
:
208 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS
;
210 case SHADER_OPCODE_TXF_MCS
:
211 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
213 case SHADER_OPCODE_LOD
:
214 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
216 case SHADER_OPCODE_TG4
:
217 if (ir
->shadow_compare
) {
218 assert(brw
->gen
>= 7);
219 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
221 assert(brw
->gen
>= 6);
222 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
225 case SHADER_OPCODE_TG4_OFFSET
:
226 assert(brw
->gen
>= 7);
227 if (ir
->shadow_compare
) {
228 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
230 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
234 assert(!"not reached");
237 assert(msg_type
!= -1);
239 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
244 if (ir
->header_present
) {
245 /* The send-from-GRF for SIMD16 texturing with a header has an extra
246 * hardware register allocated to it, which we need to skip over (since
247 * our coordinates in the payload are in the even-numbered registers,
248 * and the header comes right before the first one).
250 if (dispatch_width
== 16)
253 unsigned save_exec_size
= default_state
.exec_size
;
254 default_state
.exec_size
= BRW_EXECUTE_8
;
256 MOV_RAW(src
, brw_vec8_grf(0, 0));
258 if (ir
->texture_offset
) {
259 /* Set the texel offset bits. */
260 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
261 brw_imm_ud(ir
->texture_offset
));
264 if (ir
->sampler
>= 16) {
265 /* The "Sampler Index" field can only store values between 0 and 15.
266 * However, we can add an offset to the "Sampler State Pointer"
267 * field, effectively selecting a different set of 16 samplers.
269 * The "Sampler State Pointer" needs to be aligned to a 32-byte
270 * offset, and each sampler state is only 16-bytes, so we can't
271 * exclusively use the offset - we have to use both.
273 gen8_instruction
*add
=
274 ADD(get_element_ud(src
, 3),
275 get_element_ud(brw_vec8_grf(0, 0), 3),
276 brw_imm_ud(16 * (ir
->sampler
/ 16) *
277 sizeof(gen7_sampler_state
)));
278 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
281 default_state
.exec_size
= save_exec_size
;
284 uint32_t surf_index
=
285 prog_data
->base
.binding_table
.texture_start
+ ir
->sampler
;
287 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
288 gen8_set_dst(brw
, inst
, dst
);
289 gen8_set_src0(brw
, inst
, src
);
290 gen8_set_sampler_message(brw
, inst
,
299 brw_mark_surface_used(&prog_data
->base
, surf_index
);
303 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
306 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
308 * and we're trying to produce:
311 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
312 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
313 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
314 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
315 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
316 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
317 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
318 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
320 * and add another set of two more subspans if in 16-pixel dispatch mode.
322 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
323 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
324 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
325 * between each other. We could probably do it like ddx and swizzle the right
326 * order later, but bail for now and just produce
327 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
330 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
334 unsigned vstride
, width
;
336 if (key
->high_quality_derivatives
) {
337 /* Produce accurate derivatives. */
338 vstride
= BRW_VERTICAL_STRIDE_2
;
341 /* Replicate the derivative at the top-left pixel to other pixels. */
342 vstride
= BRW_VERTICAL_STRIDE_4
;
346 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
350 BRW_HORIZONTAL_STRIDE_0
,
351 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
352 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
356 BRW_HORIZONTAL_STRIDE_0
,
357 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
358 ADD(dst
, src0
, negate(src1
));
361 /* The negate_value boolean is used to negate the derivative computation for
362 * FBOs, since they place the origin at the upper left instead of the lower
366 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
372 unsigned src0_swizzle
;
373 unsigned src1_swizzle
;
376 if (key
->high_quality_derivatives
) {
377 /* Produce accurate derivatives. */
378 hstride
= BRW_HORIZONTAL_STRIDE_1
;
379 src0_swizzle
= BRW_SWIZZLE_XYXY
;
380 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
383 default_state
.access_mode
= BRW_ALIGN_16
;
385 /* Replicate the derivative at the top-left pixel to other pixels. */
386 hstride
= BRW_HORIZONTAL_STRIDE_0
;
387 src0_swizzle
= BRW_SWIZZLE_XYZW
;
388 src1_swizzle
= BRW_SWIZZLE_XYZW
;
392 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
394 BRW_VERTICAL_STRIDE_4
,
397 src0_swizzle
, WRITEMASK_XYZW
);
398 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
400 BRW_VERTICAL_STRIDE_4
,
403 src1_swizzle
, WRITEMASK_XYZW
);
406 ADD(dst
, src1
, negate(src0
));
408 ADD(dst
, src0
, negate(src1
));
410 default_state
.access_mode
= BRW_ALIGN_1
;
414 gen8_fs_generator::generate_scratch_write(fs_inst
*ir
, struct brw_reg src
)
416 MOV(retype(brw_message_reg(ir
->base_mrf
+ 1), BRW_REGISTER_TYPE_UD
),
417 retype(src
, BRW_REGISTER_TYPE_UD
));
420 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
422 const int num_regs
= dispatch_width
/ 8;
424 uint32_t msg_control
;
426 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
428 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
430 /* Set up the message header. This is g0, with g0.2 filled with
431 * the offset. We don't want to leave our offset around in g0 or
432 * it'll screw up texture samples, so set it up inside the message
435 unsigned save_exec_size
= default_state
.exec_size
;
436 default_state
.exec_size
= BRW_EXECUTE_8
;
438 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
439 /* set message header global offset field (reg 0, element 2) */
440 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
443 if (dispatch_width
== 16)
444 dst
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
446 dst
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
448 default_state
.exec_size
= BRW_EXECUTE_16
;
450 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
451 gen8_set_dst(brw
, send
, dst
);
452 gen8_set_src0(brw
, send
, mrf
);
453 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
454 255, /* binding table index: stateless access */
455 GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
,
457 1 + num_regs
, /* mlen */
459 true, /* header present */
462 default_state
.exec_size
= save_exec_size
;
466 gen8_fs_generator::generate_scratch_read(fs_inst
*ir
, struct brw_reg dst
)
469 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
471 const int num_regs
= dispatch_width
/ 8;
473 uint32_t msg_control
;
475 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
477 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
479 unsigned save_exec_size
= default_state
.exec_size
;
480 default_state
.exec_size
= BRW_EXECUTE_8
;
482 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
483 /* set message header global offset field (reg 0, element 2) */
484 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
486 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
487 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
488 gen8_set_src0(brw
, send
, mrf
);
489 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
490 255, /* binding table index: stateless access */
491 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
495 true, /* header present */
498 default_state
.exec_size
= save_exec_size
;
502 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*ir
, struct brw_reg dst
)
504 unsigned save_exec_size
= default_state
.exec_size
;
505 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
507 int num_regs
= dispatch_width
/ 8;
509 /* According to the docs, offset is "A 12-bit HWord offset into the memory
510 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
511 * is 32 bytes, which happens to be the size of a register.
513 int offset
= ir
->offset
/ REG_SIZE
;
515 /* The HW requires that the header is present; this is to get the g0.5
518 gen8_set_src0(brw
, send
, brw_vec8_grf(0, 0));
519 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
520 gen8_set_dp_scratch_message(brw
, send
,
521 false, /* scratch read */
523 false, /* invalidate after read */
526 1, /* mlen - just g0 */
528 true, /* header present */
531 default_state
.exec_size
= save_exec_size
;
535 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
537 struct brw_reg index
,
538 struct brw_reg offset
)
540 assert(inst
->mlen
== 0);
542 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
543 index
.type
== BRW_REGISTER_TYPE_UD
);
544 uint32_t surf_index
= index
.dw1
.ud
;
546 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
547 /* Reference only the dword we need lest we anger validate_reg() with
548 * reg.width > reg.execsize.
550 offset
= brw_vec1_grf(offset
.nr
, 0);
552 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
553 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
555 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
556 * the destination loaded consecutively from the same offset (which appears
557 * in the first component, and the rest are ignored).
559 dst
.width
= BRW_WIDTH_4
;
560 gen8_set_dst(brw
, send
, dst
);
561 gen8_set_src0(brw
, send
, offset
);
562 gen8_set_sampler_message(brw
, send
,
564 0, /* The LD message ignores the sampler unit. */
565 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
568 false, /* no header */
569 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
571 brw_mark_surface_used(&prog_data
->base
, surf_index
);
575 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
577 struct brw_reg index
,
578 struct brw_reg offset
)
580 /* Varying-offset pull constant loads are treated as a normal expression on
581 * gen7, so the fact that it's a send message is hidden at the IR level.
583 assert(!ir
->header_present
);
586 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
587 index
.type
== BRW_REGISTER_TYPE_UD
);
588 uint32_t surf_index
= index
.dw1
.ud
;
590 uint32_t simd_mode
, rlen
, mlen
;
591 if (dispatch_width
== 16) {
594 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
598 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
601 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
602 gen8_set_dst(brw
, send
, dst
);
603 gen8_set_src0(brw
, send
, offset
);
604 gen8_set_sampler_message(brw
, send
,
606 0, /* The LD message ignores the sampler unit. */
607 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
610 false, /* no header */
613 brw_mark_surface_used(&prog_data
->base
, surf_index
);
617 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
618 * into the flags register (f0.0).
621 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
623 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
624 struct brw_reg dispatch_mask
=
625 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
627 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
628 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
632 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
634 /* This HALT will be patched up at FB write time to point UIP at the end of
635 * the program, and at brw_uip_jip() JIP will be set to the end of the
636 * current block (or the program).
638 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
644 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
646 if (discard_halt_patches
.is_empty())
649 /* There is a somewhat strange undocumented requirement of using
650 * HALT, according to the simulator. If some channel has HALTed to
651 * a particular UIP, then by the end of the program, every channel
652 * must have HALTed to that UIP. Furthermore, the tracking is a
653 * stack, so you can't do the final halt of a UIP after starting
654 * halting to a new UIP.
656 * Symptoms of not emitting this instruction on actual hardware
657 * included GPU hangs and sparkly rendering on the piglit discard
660 gen8_instruction
*last_halt
= HALT();
661 gen8_set_uip(last_halt
, 16);
662 gen8_set_jip(last_halt
, 16);
666 foreach_list(node
, &discard_halt_patches
) {
667 ip_record
*patch_ip
= (ip_record
*) node
;
668 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
669 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
671 /* HALT takes an instruction distance from the pre-incremented IP. */
672 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
675 this->discard_halt_patches
.make_empty();
680 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
681 * sampler LD messages.
683 * We don't want to bake it into the send message's code generation because
684 * that means we don't get a chance to schedule the instruction.
687 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
689 struct brw_reg value
)
691 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
692 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
696 * Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
697 * (when mask is passed as a uniform) of register mask before moving it
701 gen8_fs_generator::generate_set_omask(fs_inst
*inst
,
705 assert(dst
.type
== BRW_REGISTER_TYPE_UW
);
707 if (dispatch_width
== 16)
710 if (mask
.vstride
== BRW_VERTICAL_STRIDE_8
&&
711 mask
.width
== BRW_WIDTH_8
&&
712 mask
.hstride
== BRW_HORIZONTAL_STRIDE_1
) {
713 mask
= stride(mask
, 16, 8, 2);
715 assert(mask
.vstride
== BRW_VERTICAL_STRIDE_0
&&
716 mask
.width
== BRW_WIDTH_1
&&
717 mask
.hstride
== BRW_HORIZONTAL_STRIDE_0
);
720 unsigned save_exec_size
= default_state
.exec_size
;
721 default_state
.exec_size
= BRW_EXECUTE_8
;
723 gen8_instruction
*mov
= MOV(dst
, retype(mask
, dst
.type
));
724 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
726 default_state
.exec_size
= save_exec_size
;
730 * Do a special ADD with vstride=1, width=4, hstride=0 for src1.
733 gen8_fs_generator::generate_set_sample_id(fs_inst
*ir
,
738 assert(dst
.type
== BRW_REGISTER_TYPE_D
|| dst
.type
== BRW_REGISTER_TYPE_UD
);
739 assert(src0
.type
== BRW_REGISTER_TYPE_D
|| src0
.type
== BRW_REGISTER_TYPE_UD
);
741 struct brw_reg reg
= retype(stride(src1
, 1, 4, 0), BRW_REGISTER_TYPE_UW
);
743 unsigned save_exec_size
= default_state
.exec_size
;
744 default_state
.exec_size
= BRW_EXECUTE_8
;
746 gen8_instruction
*add
= ADD(dst
, src0
, reg
);
747 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
748 if (dispatch_width
== 16) {
749 add
= ADD(offset(dst
, 1), offset(src0
, 1), suboffset(reg
, 2));
750 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
753 default_state
.exec_size
= save_exec_size
;
757 * Change the register's data type from UD to HF, doubling the strides in order
758 * to compensate for halving the data type width.
760 static struct brw_reg
761 ud_reg_to_hf(struct brw_reg r
)
763 assert(r
.type
== BRW_REGISTER_TYPE_UD
);
764 r
.type
= BRW_REGISTER_TYPE_HF
;
766 /* The BRW_*_STRIDE enums are defined so that incrementing the field
767 * doubles the real stride.
778 gen8_fs_generator::generate_pack_half_2x16_split(fs_inst
*inst
,
783 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
784 assert(x
.type
== BRW_REGISTER_TYPE_F
);
785 assert(y
.type
== BRW_REGISTER_TYPE_F
);
787 struct brw_reg dst_hf
= ud_reg_to_hf(dst
);
789 /* Give each 32-bit channel of dst the form below, where "." means
798 SHL(dst
, dst
, brw_imm_ud(16u));
800 /* And, finally the form of packHalf2x16's output:
807 gen8_fs_generator::generate_unpack_half_2x16_split(fs_inst
*inst
,
811 assert(dst
.type
== BRW_REGISTER_TYPE_F
);
812 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
814 struct brw_reg src_hf
= ud_reg_to_hf(src
);
816 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
817 * For the Y case, we wish to access only the upper word; therefore
818 * a 16-bit subregister offset is needed.
820 assert(inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
||
821 inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
);
822 if (inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
)
829 gen8_fs_generator::generate_untyped_atomic(fs_inst
*ir
,
831 struct brw_reg atomic_op
,
832 struct brw_reg surf_index
)
834 assert(atomic_op
.file
== BRW_IMMEDIATE_VALUE
&&
835 atomic_op
.type
== BRW_REGISTER_TYPE_UD
&&
836 surf_index
.file
== BRW_IMMEDIATE_VALUE
&&
837 surf_index
.type
== BRW_REGISTER_TYPE_UD
);
838 assert((atomic_op
.dw1
.ud
& ~0xf) == 0);
840 unsigned msg_control
=
841 atomic_op
.dw1
.ud
| /* Atomic Operation Type: BRW_AOP_* */
842 ((dispatch_width
== 16 ? 0 : 1) << 4) | /* SIMD Mode */
843 (1 << 5); /* Return data expected */
845 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
846 gen8_set_dst(brw
, inst
, retype(dst
, BRW_REGISTER_TYPE_UD
));
847 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
848 gen8_set_dp_message(brw
, inst
, HSW_SFID_DATAPORT_DATA_CACHE_1
,
850 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
,
857 brw_mark_surface_used(&prog_data
->base
, surf_index
.dw1
.ud
);
861 gen8_fs_generator::generate_untyped_surface_read(fs_inst
*ir
,
863 struct brw_reg surf_index
)
865 assert(surf_index
.file
== BRW_IMMEDIATE_VALUE
&&
866 surf_index
.type
== BRW_REGISTER_TYPE_UD
);
868 unsigned msg_control
= 0xe | /* Enable only the R channel */
869 ((dispatch_width
== 16 ? 1 : 2) << 4); /* SIMD Mode */
871 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
872 gen8_set_dst(brw
, inst
, retype(dst
, BRW_REGISTER_TYPE_UD
));
873 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
874 gen8_set_dp_message(brw
, inst
, HSW_SFID_DATAPORT_DATA_CACHE_1
,
876 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ
,
883 brw_mark_surface_used(&prog_data
->base
, surf_index
.dw1
.ud
);
887 gen8_fs_generator::generate_code(exec_list
*instructions
,
888 struct annotation_info
*annotation
)
890 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
893 "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
894 shader_prog
->Label
? shader_prog
->Label
: "unnamed",
895 shader_prog
->Name
, dispatch_width
);
898 "Native code for fragment program %d (SIMD%d dispatch):\n",
899 prog
->Id
, dispatch_width
);
901 fprintf(stderr
, "Native code for blorp program (SIMD%d dispatch):\n",
907 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
908 cfg
= new(mem_ctx
) cfg_t(instructions
);
910 foreach_list(node
, instructions
) {
911 fs_inst
*ir
= (fs_inst
*) node
;
912 struct brw_reg src
[3], dst
;
914 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
915 annotate(brw
, annotation
, cfg
, ir
, next_inst_offset
);
917 for (unsigned int i
= 0; i
< 3; i
++) {
918 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
920 /* The accumulator result appears to get used for the
921 * conditional modifier generation. When negating a UD
922 * value, there is a 33rd bit generated for the sign in the
923 * accumulator value, so now you can't check, for example,
924 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
926 assert(!ir
->conditional_mod
||
927 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
930 dst
= brw_reg_from_fs_reg(&ir
->dst
);
932 default_state
.conditional_mod
= ir
->conditional_mod
;
933 default_state
.predicate
= ir
->predicate
;
934 default_state
.predicate_inverse
= ir
->predicate_inverse
;
935 default_state
.saturate
= ir
->saturate
;
936 default_state
.mask_control
= ir
->force_writemask_all
;
937 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
939 if (dispatch_width
== 16 && !ir
->force_uncompressed
)
940 default_state
.exec_size
= BRW_EXECUTE_16
;
942 default_state
.exec_size
= BRW_EXECUTE_8
;
944 if (ir
->force_uncompressed
|| dispatch_width
== 8)
945 default_state
.qtr_control
= GEN6_COMPRESSION_1Q
;
946 else if (ir
->force_sechalf
)
947 default_state
.qtr_control
= GEN6_COMPRESSION_2Q
;
949 default_state
.qtr_control
= GEN6_COMPRESSION_1H
;
951 switch (ir
->opcode
) {
956 ADD(dst
, src
[0], src
[1]);
959 MUL(dst
, src
[0], src
[1]);
961 case BRW_OPCODE_MACH
:
962 MACH(dst
, src
[0], src
[1]);
966 default_state
.access_mode
= BRW_ALIGN_16
;
967 MAD(dst
, src
[0], src
[1], src
[2]);
968 default_state
.access_mode
= BRW_ALIGN_1
;
972 default_state
.access_mode
= BRW_ALIGN_16
;
973 LRP(dst
, src
[0], src
[1], src
[2]);
974 default_state
.access_mode
= BRW_ALIGN_1
;
981 case BRW_OPCODE_RNDD
:
984 case BRW_OPCODE_RNDE
:
987 case BRW_OPCODE_RNDZ
:
992 AND(dst
, src
[0], src
[1]);
995 OR(dst
, src
[0], src
[1]);
998 XOR(dst
, src
[0], src
[1]);
1000 case BRW_OPCODE_NOT
:
1003 case BRW_OPCODE_ASR
:
1004 ASR(dst
, src
[0], src
[1]);
1006 case BRW_OPCODE_SHR
:
1007 SHR(dst
, src
[0], src
[1]);
1009 case BRW_OPCODE_SHL
:
1010 SHL(dst
, src
[0], src
[1]);
1013 case BRW_OPCODE_F32TO16
:
1014 MOV(retype(dst
, BRW_REGISTER_TYPE_HF
), src
[0]);
1016 case BRW_OPCODE_F16TO32
:
1017 MOV(dst
, retype(src
[0], BRW_REGISTER_TYPE_HF
));
1020 case BRW_OPCODE_CMP
:
1021 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
1023 case BRW_OPCODE_SEL
:
1024 SEL(dst
, src
[0], src
[1]);
1027 case BRW_OPCODE_BFREV
:
1028 /* BFREV only supports UD type for src and dst. */
1029 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
1030 retype(src
[0], BRW_REGISTER_TYPE_UD
));
1033 case BRW_OPCODE_FBH
:
1034 /* FBH only supports UD type for dst. */
1035 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1038 case BRW_OPCODE_FBL
:
1039 /* FBL only supports UD type for dst. */
1040 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1043 case BRW_OPCODE_CBIT
:
1044 /* CBIT only supports UD type for dst. */
1045 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1048 case BRW_OPCODE_ADDC
:
1049 ADDC(dst
, src
[0], src
[1]);
1052 case BRW_OPCODE_SUBB
:
1053 SUBB(dst
, src
[0], src
[1]);
1056 case BRW_OPCODE_BFE
:
1057 default_state
.access_mode
= BRW_ALIGN_16
;
1058 BFE(dst
, src
[0], src
[1], src
[2]);
1059 default_state
.access_mode
= BRW_ALIGN_1
;
1062 case BRW_OPCODE_BFI1
:
1063 BFI1(dst
, src
[0], src
[1]);
1066 case BRW_OPCODE_BFI2
:
1067 default_state
.access_mode
= BRW_ALIGN_16
;
1068 BFI2(dst
, src
[0], src
[1], src
[2]);
1069 default_state
.access_mode
= BRW_ALIGN_1
;
1073 IF(BRW_PREDICATE_NORMAL
);
1076 case BRW_OPCODE_ELSE
:
1080 case BRW_OPCODE_ENDIF
:
1088 case BRW_OPCODE_BREAK
:
1092 case BRW_OPCODE_CONTINUE
:
1096 case BRW_OPCODE_WHILE
:
1100 case SHADER_OPCODE_RCP
:
1101 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
1104 case SHADER_OPCODE_RSQ
:
1105 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
1108 case SHADER_OPCODE_SQRT
:
1109 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
1112 case SHADER_OPCODE_EXP2
:
1113 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
1116 case SHADER_OPCODE_LOG2
:
1117 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
1120 case SHADER_OPCODE_SIN
:
1121 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
1124 case SHADER_OPCODE_COS
:
1125 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
1128 case SHADER_OPCODE_INT_QUOTIENT
:
1129 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
1132 case SHADER_OPCODE_INT_REMAINDER
:
1133 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
1136 case SHADER_OPCODE_POW
:
1137 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
1140 case FS_OPCODE_PIXEL_X
:
1141 case FS_OPCODE_PIXEL_Y
:
1142 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
1145 case FS_OPCODE_CINTERP
:
1148 case FS_OPCODE_LINTERP
:
1149 generate_linterp(ir
, dst
, src
);
1151 case SHADER_OPCODE_TEX
:
1153 case SHADER_OPCODE_TXD
:
1154 case SHADER_OPCODE_TXF
:
1155 case SHADER_OPCODE_TXF_CMS
:
1156 case SHADER_OPCODE_TXF_UMS
:
1157 case SHADER_OPCODE_TXF_MCS
:
1158 case SHADER_OPCODE_TXL
:
1159 case SHADER_OPCODE_TXS
:
1160 case SHADER_OPCODE_LOD
:
1161 case SHADER_OPCODE_TG4
:
1162 case SHADER_OPCODE_TG4_OFFSET
:
1163 generate_tex(ir
, dst
, src
[0]);
1167 generate_ddx(ir
, dst
, src
[0]);
1170 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1171 * guarantee that key->render_to_fbo is set).
1173 assert(fp
->UsesDFdy
);
1174 generate_ddy(ir
, dst
, src
[0], key
->render_to_fbo
);
1177 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
1178 generate_scratch_write(ir
, src
[0]);
1181 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
1182 generate_scratch_read(ir
, dst
);
1185 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
1186 generate_scratch_read_gen7(ir
, dst
);
1189 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
1190 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1193 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
1194 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1197 case FS_OPCODE_FB_WRITE
:
1198 generate_fb_write(ir
);
1201 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
1202 generate_mov_dispatch_to_flags(ir
);
1205 case FS_OPCODE_DISCARD_JUMP
:
1206 generate_discard_jump(ir
);
1209 case SHADER_OPCODE_SHADER_TIME_ADD
:
1210 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
1213 case SHADER_OPCODE_UNTYPED_ATOMIC
:
1214 generate_untyped_atomic(ir
, dst
, src
[0], src
[1]);
1217 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
1218 generate_untyped_surface_read(ir
, dst
, src
[0]);
1221 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
1222 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
1225 case FS_OPCODE_SET_OMASK
:
1226 generate_set_omask(ir
, dst
, src
[0]);
1229 case FS_OPCODE_SET_SAMPLE_ID
:
1230 generate_set_sample_id(ir
, dst
, src
[0], src
[1]);
1233 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
1234 generate_pack_half_2x16_split(ir
, dst
, src
[0], src
[1]);
1237 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
1238 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
1239 generate_unpack_half_2x16_split(ir
, dst
, src
[0]);
1242 case FS_OPCODE_PLACEHOLDER_HALT
:
1243 /* This is the place where the final HALT needs to be inserted if
1244 * we've emitted any discards. If not, this will emit no code.
1246 if (!patch_discard_jumps_to_fb_writes()) {
1247 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1248 annotation
->ann_count
--;
1254 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
1255 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
1256 opcode_descs
[ir
->opcode
].name
);
1258 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
1264 patch_jump_targets();
1265 annotation_finalize(annotation
, next_inst_offset
);
1269 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1270 exec_list
*simd16_instructions
,
1271 unsigned *assembly_size
)
1273 assert(simd8_instructions
|| simd16_instructions
);
1275 if (simd8_instructions
) {
1276 struct annotation_info annotation
;
1277 memset(&annotation
, 0, sizeof(annotation
));
1280 generate_code(simd8_instructions
, &annotation
);
1282 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1283 dump_assembly(store
, annotation
.ann_count
, annotation
.ann
, brw
, prog
,
1285 ralloc_free(annotation
.ann
);
1289 if (simd16_instructions
) {
1290 /* Align to a 64-byte boundary. */
1291 while (next_inst_offset
% 64)
1294 /* Save off the start of this SIMD16 program */
1295 prog_data
->prog_offset_16
= next_inst_offset
;
1297 struct annotation_info annotation
;
1298 memset(&annotation
, 0, sizeof(annotation
));
1300 dispatch_width
= 16;
1301 generate_code(simd16_instructions
, &annotation
);
1303 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1304 dump_assembly(store
, annotation
.ann_count
, annotation
.ann
,
1305 brw
, prog
, gen8_disassemble
);
1306 ralloc_free(annotation
.ann
);
1310 *assembly_size
= next_inst_offset
;
1311 return (const unsigned *) store
;