2 * Copyright © 2010, 2011, 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file gen8_fs_generate.cpp
26 * Code generation for Gen8+ hardware.
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
40 const struct brw_wm_prog_key
*key
,
41 struct brw_wm_prog_data
*prog_data
,
42 struct gl_shader_program
*shader_prog
,
43 struct gl_fragment_program
*fp
,
44 bool dual_source_output
)
45 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, mem_ctx
),
46 key(key
), prog_data(prog_data
),
47 fp(fp
), dual_source_output(dual_source_output
)
51 gen8_fs_generator::~gen8_fs_generator()
56 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
58 /* Disable the discard condition while setting up the header. */
59 default_state
.predicate
= BRW_PREDICATE_NONE
;
60 default_state
.predicate_inverse
= false;
61 default_state
.flag_subreg_nr
= 0;
63 if (ir
->header_present
) {
64 /* The GPU will use the predicate on SENDC, unless the header is present.
66 if (fp
&& fp
->UsesKill
) {
67 gen8_instruction
*mov
=
68 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
70 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
73 gen8_instruction
*mov
=
74 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
75 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
77 if (ir
->target
> 0 && key
->replicate_alpha
) {
78 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
79 gen8_instruction
*inst
=
80 OR(get_element_ud(brw_message_reg(ir
->base_mrf
), 0),
81 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
83 gen8_set_mask_control(inst
, BRW_MASK_DISABLE
);
87 /* Set the render target index for choosing BLEND_STATE. */
88 MOV_RAW(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
89 brw_imm_ud(ir
->target
));
93 /* Set the predicate back to get the conditional write if necessary for
96 default_state
.predicate
= ir
->predicate
;
97 default_state
.predicate_inverse
= ir
->predicate_inverse
;
98 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
100 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
101 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
102 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
104 /* Set up the "Message Specific Control" bits for the Data Port Message
105 * Descriptor. These are documented in the "Render Target Write" message's
106 * "Message Descriptor" documentation (vol5c.2).
109 /* Set the Message Type */
110 if (this->dual_source_output
)
111 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
112 else if (dispatch_width
== 16)
113 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
115 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
117 uint32_t msg_control
= msg_type
;
119 /* Set "Last Render Target Select" on the final FB write. */
121 msg_control
|= (1 << 4); /* Last Render Target Select */
123 uint32_t surf_index
=
124 prog_data
->binding_table
.render_target_start
+ ir
->target
;
126 gen8_set_dp_message(brw
, inst
,
127 GEN6_SFID_DATAPORT_RENDER_CACHE
,
129 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
136 brw_mark_surface_used(&prog_data
->base
, surf_index
);
140 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
144 struct brw_reg delta_x
= src
[0];
145 struct brw_reg delta_y
= src
[1];
146 struct brw_reg interp
= src
[2];
149 assert(delta_y
.nr
== delta_x
.nr
+ 1);
150 PLN(dst
, interp
, delta_x
);
154 gen8_fs_generator::generate_tex(fs_inst
*ir
,
157 struct brw_reg sampler_index
)
161 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
163 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
165 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
166 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
168 switch (ir
->opcode
) {
169 case SHADER_OPCODE_TEX
:
170 if (ir
->shadow_compare
) {
171 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
173 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
177 if (ir
->shadow_compare
) {
178 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
180 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
183 case SHADER_OPCODE_TXL
:
184 if (ir
->shadow_compare
) {
185 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
187 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
190 case SHADER_OPCODE_TXS
:
191 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
193 case SHADER_OPCODE_TXD
:
194 if (ir
->shadow_compare
) {
195 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
197 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
200 case SHADER_OPCODE_TXF
:
201 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
203 case SHADER_OPCODE_TXF_CMS
:
204 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
206 case SHADER_OPCODE_TXF_UMS
:
207 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS
;
209 case SHADER_OPCODE_TXF_MCS
:
210 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
212 case SHADER_OPCODE_LOD
:
213 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
215 case SHADER_OPCODE_TG4
:
216 if (ir
->shadow_compare
) {
217 assert(brw
->gen
>= 7);
218 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
220 assert(brw
->gen
>= 6);
221 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
224 case SHADER_OPCODE_TG4_OFFSET
:
225 assert(brw
->gen
>= 7);
226 if (ir
->shadow_compare
) {
227 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
229 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
233 unreachable("not reached");
235 assert(msg_type
!= -1);
237 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
242 assert(sampler_index
.file
== BRW_IMMEDIATE_VALUE
);
243 assert(sampler_index
.type
== BRW_REGISTER_TYPE_UD
);
245 uint32_t sampler
= sampler_index
.dw1
.ud
;
247 if (ir
->header_present
) {
248 /* The send-from-GRF for SIMD16 texturing with a header has an extra
249 * hardware register allocated to it, which we need to skip over (since
250 * our coordinates in the payload are in the even-numbered registers,
251 * and the header comes right before the first one.
253 if (dispatch_width
== 16)
256 unsigned save_exec_size
= default_state
.exec_size
;
257 default_state
.exec_size
= BRW_EXECUTE_8
;
259 MOV_RAW(src
, brw_vec8_grf(0, 0));
261 if (ir
->texture_offset
) {
262 /* Set the texel offset bits. */
263 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
264 brw_imm_ud(ir
->texture_offset
));
268 /* The "Sampler Index" field can only store values between 0 and 15.
269 * However, we can add an offset to the "Sampler State Pointer"
270 * field, effectively selecting a different set of 16 samplers.
272 * The "Sampler State Pointer" needs to be aligned to a 32-byte
273 * offset, and each sampler state is only 16-bytes, so we can't
274 * exclusively use the offset - we have to use both.
276 const int sampler_state_size
= 16; /* 16 bytes */
277 gen8_instruction
*add
=
278 ADD(get_element_ud(src
, 3),
279 get_element_ud(brw_vec8_grf(0, 0), 3),
280 brw_imm_ud(16 * (sampler
/ 16) * sampler_state_size
));
281 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
284 default_state
.exec_size
= save_exec_size
;
287 uint32_t surf_index
=
288 prog_data
->base
.binding_table
.texture_start
+ sampler
;
290 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
291 gen8_set_dst(brw
, inst
, dst
);
292 gen8_set_src0(brw
, inst
, src
);
293 gen8_set_sampler_message(brw
, inst
,
302 brw_mark_surface_used(&prog_data
->base
, surf_index
);
306 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
309 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
311 * and we're trying to produce:
314 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
315 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
316 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
317 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
318 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
319 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
320 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
321 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
323 * and add another set of two more subspans if in 16-pixel dispatch mode.
325 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
326 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
327 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
328 * between each other. We could probably do it like ddx and swizzle the right
329 * order later, but bail for now and just produce
330 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
333 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
337 unsigned vstride
, width
;
339 if (key
->high_quality_derivatives
) {
340 /* Produce accurate derivatives. */
341 vstride
= BRW_VERTICAL_STRIDE_2
;
344 /* Replicate the derivative at the top-left pixel to other pixels. */
345 vstride
= BRW_VERTICAL_STRIDE_4
;
349 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
353 BRW_HORIZONTAL_STRIDE_0
,
354 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
355 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
359 BRW_HORIZONTAL_STRIDE_0
,
360 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
361 ADD(dst
, src0
, negate(src1
));
364 /* The negate_value boolean is used to negate the derivative computation for
365 * FBOs, since they place the origin at the upper left instead of the lower
369 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
375 unsigned src0_swizzle
;
376 unsigned src1_swizzle
;
379 if (key
->high_quality_derivatives
) {
380 /* Produce accurate derivatives. */
381 hstride
= BRW_HORIZONTAL_STRIDE_1
;
382 src0_swizzle
= BRW_SWIZZLE_XYXY
;
383 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
386 default_state
.access_mode
= BRW_ALIGN_16
;
388 /* Replicate the derivative at the top-left pixel to other pixels. */
389 hstride
= BRW_HORIZONTAL_STRIDE_0
;
390 src0_swizzle
= BRW_SWIZZLE_XYZW
;
391 src1_swizzle
= BRW_SWIZZLE_XYZW
;
395 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
397 BRW_VERTICAL_STRIDE_4
,
400 src0_swizzle
, WRITEMASK_XYZW
);
401 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
403 BRW_VERTICAL_STRIDE_4
,
406 src1_swizzle
, WRITEMASK_XYZW
);
409 ADD(dst
, src1
, negate(src0
));
411 ADD(dst
, src0
, negate(src1
));
413 default_state
.access_mode
= BRW_ALIGN_1
;
417 gen8_fs_generator::generate_scratch_write(fs_inst
*ir
, struct brw_reg src
)
419 MOV(retype(brw_message_reg(ir
->base_mrf
+ 1), BRW_REGISTER_TYPE_UD
),
420 retype(src
, BRW_REGISTER_TYPE_UD
));
423 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
425 const int num_regs
= dispatch_width
/ 8;
427 uint32_t msg_control
;
429 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
431 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
433 /* Set up the message header. This is g0, with g0.2 filled with
434 * the offset. We don't want to leave our offset around in g0 or
435 * it'll screw up texture samples, so set it up inside the message
438 unsigned save_exec_size
= default_state
.exec_size
;
439 default_state
.exec_size
= BRW_EXECUTE_8
;
441 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
442 /* set message header global offset field (reg 0, element 2) */
443 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
446 if (dispatch_width
== 16)
447 dst
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
449 dst
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
451 default_state
.exec_size
= BRW_EXECUTE_16
;
453 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
454 gen8_set_dst(brw
, send
, dst
);
455 gen8_set_src0(brw
, send
, mrf
);
456 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
457 255, /* binding table index: stateless access */
458 GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
,
460 1 + num_regs
, /* mlen */
462 true, /* header present */
465 default_state
.exec_size
= save_exec_size
;
469 gen8_fs_generator::generate_scratch_read(fs_inst
*ir
, struct brw_reg dst
)
472 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
474 const int num_regs
= dispatch_width
/ 8;
476 uint32_t msg_control
;
478 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
480 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
482 unsigned save_exec_size
= default_state
.exec_size
;
483 default_state
.exec_size
= BRW_EXECUTE_8
;
485 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
486 /* set message header global offset field (reg 0, element 2) */
487 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
489 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
490 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
491 gen8_set_src0(brw
, send
, mrf
);
492 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
493 255, /* binding table index: stateless access */
494 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
498 true, /* header present */
501 default_state
.exec_size
= save_exec_size
;
505 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*ir
, struct brw_reg dst
)
507 unsigned save_exec_size
= default_state
.exec_size
;
508 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
510 int num_regs
= dispatch_width
/ 8;
512 /* According to the docs, offset is "A 12-bit HWord offset into the memory
513 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
514 * is 32 bytes, which happens to be the size of a register.
516 int offset
= ir
->offset
/ REG_SIZE
;
518 /* The HW requires that the header is present; this is to get the g0.5
521 gen8_set_src0(brw
, send
, brw_vec8_grf(0, 0));
522 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
523 gen8_set_dp_scratch_message(brw
, send
,
524 false, /* scratch read */
526 false, /* invalidate after read */
529 1, /* mlen - just g0 */
531 true, /* header present */
534 default_state
.exec_size
= save_exec_size
;
538 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
540 struct brw_reg index
,
541 struct brw_reg offset
)
543 assert(inst
->mlen
== 0);
545 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
546 index
.type
== BRW_REGISTER_TYPE_UD
);
547 uint32_t surf_index
= index
.dw1
.ud
;
549 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
550 /* Reference only the dword we need lest we anger validate_reg() with
551 * reg.width > reg.execsize.
553 offset
= brw_vec1_grf(offset
.nr
, 0);
555 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
556 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
558 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
559 * the destination loaded consecutively from the same offset (which appears
560 * in the first component, and the rest are ignored).
562 dst
.width
= BRW_WIDTH_4
;
563 gen8_set_dst(brw
, send
, dst
);
564 gen8_set_src0(brw
, send
, offset
);
565 gen8_set_sampler_message(brw
, send
,
567 0, /* The LD message ignores the sampler unit. */
568 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
571 false, /* no header */
572 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
574 brw_mark_surface_used(&prog_data
->base
, surf_index
);
578 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
580 struct brw_reg index
,
581 struct brw_reg offset
)
583 /* Varying-offset pull constant loads are treated as a normal expression on
584 * gen7, so the fact that it's a send message is hidden at the IR level.
586 assert(!ir
->header_present
);
589 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
590 index
.type
== BRW_REGISTER_TYPE_UD
);
591 uint32_t surf_index
= index
.dw1
.ud
;
593 uint32_t simd_mode
, rlen
, mlen
;
594 if (dispatch_width
== 16) {
597 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
601 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
604 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
605 gen8_set_dst(brw
, send
, dst
);
606 gen8_set_src0(brw
, send
, offset
);
607 gen8_set_sampler_message(brw
, send
,
609 0, /* The LD message ignore the sampler unit. */
610 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
613 false, /* no header */
616 brw_mark_surface_used(&prog_data
->base
, surf_index
);
620 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
621 * into the flags register (f0.0).
624 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
626 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
627 struct brw_reg dispatch_mask
=
628 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
630 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
631 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
635 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
637 /* This HALT will be patched up at FB write time to point UIP at the end of
638 * the program, and at brw_uip_jip() JIP will be set to the end of the
639 * current block (or the program).
641 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
647 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
649 if (discard_halt_patches
.is_empty())
652 /* There is a somewhat strange undocumented requirement of using
653 * HALT, according to the simulator. If some channel has HALTed to
654 * a particular UIP, then by the end of the program, every channel
655 * must have HALTed to that UIP. Furthermore, the tracking is a
656 * stack, so you can't do the final halt of a UIP after starting
657 * halting to a new UIP.
659 * Symptoms of not emitting this instruction on actual hardware
660 * included GPU hangs and sparkly rendering on the piglit discard
663 gen8_instruction
*last_halt
= HALT();
664 gen8_set_uip(last_halt
, 16);
665 gen8_set_jip(last_halt
, 16);
669 foreach_in_list(ip_record
, patch_ip
, &discard_halt_patches
) {
670 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
671 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
673 /* HALT takes an instruction distance from the pre-incremented IP. */
674 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
677 this->discard_halt_patches
.make_empty();
682 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
683 * sampler LD messages.
685 * We don't want to bake it into the send message's code generation because
686 * that means we don't get a chance to schedule the instruction.
689 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
691 struct brw_reg value
)
693 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
694 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
698 * Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
699 * (when mask is passed as a uniform) of register mask before moving it
703 gen8_fs_generator::generate_set_omask(fs_inst
*inst
,
707 assert(dst
.type
== BRW_REGISTER_TYPE_UW
);
709 if (dispatch_width
== 16)
712 if (mask
.vstride
== BRW_VERTICAL_STRIDE_8
&&
713 mask
.width
== BRW_WIDTH_8
&&
714 mask
.hstride
== BRW_HORIZONTAL_STRIDE_1
) {
715 mask
= stride(mask
, 16, 8, 2);
717 assert(mask
.vstride
== BRW_VERTICAL_STRIDE_0
&&
718 mask
.width
== BRW_WIDTH_1
&&
719 mask
.hstride
== BRW_HORIZONTAL_STRIDE_0
);
722 gen8_instruction
*mov
= MOV(dst
, retype(mask
, dst
.type
));
723 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
727 * Do a special ADD with vstride=1, width=4, hstride=0 for src1.
730 gen8_fs_generator::generate_set_sample_id(fs_inst
*ir
,
735 assert(dst
.type
== BRW_REGISTER_TYPE_D
|| dst
.type
== BRW_REGISTER_TYPE_UD
);
736 assert(src0
.type
== BRW_REGISTER_TYPE_D
|| src0
.type
== BRW_REGISTER_TYPE_UD
);
738 struct brw_reg reg
= retype(stride(src1
, 1, 4, 0), BRW_REGISTER_TYPE_UW
);
740 unsigned save_exec_size
= default_state
.exec_size
;
741 default_state
.exec_size
= BRW_EXECUTE_8
;
743 gen8_instruction
*add
= ADD(dst
, src0
, reg
);
744 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
745 if (dispatch_width
== 16) {
746 add
= ADD(offset(dst
, 1), offset(src0
, 1), suboffset(reg
, 2));
747 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
750 default_state
.exec_size
= save_exec_size
;
754 * Change the register's data type from UD to HF, doubling the strides in order
755 * to compensate for halving the data type width.
757 static struct brw_reg
758 ud_reg_to_hf(struct brw_reg r
)
760 assert(r
.type
== BRW_REGISTER_TYPE_UD
);
761 r
.type
= BRW_REGISTER_TYPE_HF
;
763 /* The BRW_*_STRIDE enums are defined so that incrementing the field
764 * doubles the real stride.
775 gen8_fs_generator::generate_pack_half_2x16_split(fs_inst
*inst
,
780 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
781 assert(x
.type
== BRW_REGISTER_TYPE_F
);
782 assert(y
.type
== BRW_REGISTER_TYPE_F
);
784 struct brw_reg dst_hf
= ud_reg_to_hf(dst
);
786 /* Give each 32-bit channel of dst the form below, where "." means
795 SHL(dst
, dst
, brw_imm_ud(16u));
797 /* And, finally the form of packHalf2x16's output:
804 gen8_fs_generator::generate_unpack_half_2x16_split(fs_inst
*inst
,
808 assert(dst
.type
== BRW_REGISTER_TYPE_F
);
809 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
811 struct brw_reg src_hf
= ud_reg_to_hf(src
);
813 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
814 * For the Y case, we wish to access only the upper word; therefore
815 * a 16-bit subregister offset is needed.
817 assert(inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
||
818 inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
);
819 if (inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
)
826 gen8_fs_generator::generate_untyped_atomic(fs_inst
*ir
,
828 struct brw_reg atomic_op
,
829 struct brw_reg surf_index
)
831 assert(atomic_op
.file
== BRW_IMMEDIATE_VALUE
&&
832 atomic_op
.type
== BRW_REGISTER_TYPE_UD
&&
833 surf_index
.file
== BRW_IMMEDIATE_VALUE
&&
834 surf_index
.type
== BRW_REGISTER_TYPE_UD
);
835 assert((atomic_op
.dw1
.ud
& ~0xf) == 0);
837 unsigned msg_control
=
838 atomic_op
.dw1
.ud
| /* Atomic Operation Type: BRW_AOP_* */
839 ((dispatch_width
== 16 ? 0 : 1) << 4) | /* SIMD Mode */
840 (1 << 5); /* Return data expected */
842 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
843 gen8_set_dst(brw
, inst
, retype(dst
, BRW_REGISTER_TYPE_UD
));
844 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
845 gen8_set_dp_message(brw
, inst
, HSW_SFID_DATAPORT_DATA_CACHE_1
,
847 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
,
854 brw_mark_surface_used(&prog_data
->base
, surf_index
.dw1
.ud
);
858 gen8_fs_generator::generate_untyped_surface_read(fs_inst
*ir
,
860 struct brw_reg surf_index
)
862 assert(surf_index
.file
== BRW_IMMEDIATE_VALUE
&&
863 surf_index
.type
== BRW_REGISTER_TYPE_UD
);
865 unsigned msg_control
= 0xe | /* Enable only the R channel */
866 ((dispatch_width
== 16 ? 1 : 2) << 4); /* SIMD Mode */
868 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
869 gen8_set_dst(brw
, inst
, retype(dst
, BRW_REGISTER_TYPE_UD
));
870 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
871 gen8_set_dp_message(brw
, inst
, HSW_SFID_DATAPORT_DATA_CACHE_1
,
873 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ
,
880 brw_mark_surface_used(&prog_data
->base
, surf_index
.dw1
.ud
);
884 gen8_fs_generator::generate_code(exec_list
*instructions
)
886 int start_offset
= next_inst_offset
;
888 struct annotation_info annotation
;
889 memset(&annotation
, 0, sizeof(annotation
));
892 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
893 cfg
= new(mem_ctx
) cfg_t(instructions
);
895 foreach_in_list(fs_inst
, ir
, instructions
) {
896 struct brw_reg src
[3], dst
;
898 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
899 annotate(brw
, &annotation
, cfg
, ir
, next_inst_offset
);
901 for (unsigned int i
= 0; i
< 3; i
++) {
902 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
904 /* The accumulator result appears to get used for the
905 * conditional modifier generation. When negating a UD
906 * value, there is a 33rd bit generated for the sign in the
907 * accumulator value, so now you can't check, for example,
908 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
910 assert(!ir
->conditional_mod
||
911 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
914 dst
= brw_reg_from_fs_reg(&ir
->dst
);
916 default_state
.conditional_mod
= ir
->conditional_mod
;
917 default_state
.predicate
= ir
->predicate
;
918 default_state
.predicate_inverse
= ir
->predicate_inverse
;
919 default_state
.saturate
= ir
->saturate
;
920 default_state
.mask_control
= ir
->force_writemask_all
;
921 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
923 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
924 default_state
.exec_size
= BRW_EXECUTE_16
;
926 default_state
.exec_size
= BRW_EXECUTE_8
;
928 if (ir
->force_uncompressed
|| dispatch_width
== 8)
929 default_state
.qtr_control
= GEN6_COMPRESSION_1Q
;
930 else if (ir
->force_sechalf
)
931 default_state
.qtr_control
= GEN6_COMPRESSION_2Q
;
933 default_state
.qtr_control
= GEN6_COMPRESSION_1H
;
935 switch (ir
->opcode
) {
940 ADD(dst
, src
[0], src
[1]);
943 MUL(dst
, src
[0], src
[1]);
945 case BRW_OPCODE_MACH
:
946 MACH(dst
, src
[0], src
[1]);
950 default_state
.access_mode
= BRW_ALIGN_16
;
951 MAD(dst
, src
[0], src
[1], src
[2]);
952 default_state
.access_mode
= BRW_ALIGN_1
;
956 default_state
.access_mode
= BRW_ALIGN_16
;
957 LRP(dst
, src
[0], src
[1], src
[2]);
958 default_state
.access_mode
= BRW_ALIGN_1
;
965 case BRW_OPCODE_RNDD
:
968 case BRW_OPCODE_RNDE
:
971 case BRW_OPCODE_RNDZ
:
976 AND(dst
, src
[0], src
[1]);
979 OR(dst
, src
[0], src
[1]);
982 XOR(dst
, src
[0], src
[1]);
988 ASR(dst
, src
[0], src
[1]);
991 SHR(dst
, src
[0], src
[1]);
994 SHL(dst
, src
[0], src
[1]);
997 case BRW_OPCODE_F32TO16
:
998 MOV(retype(dst
, BRW_REGISTER_TYPE_HF
), src
[0]);
1000 case BRW_OPCODE_F16TO32
:
1001 MOV(dst
, retype(src
[0], BRW_REGISTER_TYPE_HF
));
1004 case BRW_OPCODE_CMP
:
1005 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
1007 case BRW_OPCODE_SEL
:
1008 SEL(dst
, src
[0], src
[1]);
1011 case BRW_OPCODE_BFREV
:
1012 /* BFREV only supports UD type for src and dst. */
1013 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
1014 retype(src
[0], BRW_REGISTER_TYPE_UD
));
1017 case BRW_OPCODE_FBH
:
1018 /* FBH only supports UD type for dst. */
1019 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1022 case BRW_OPCODE_FBL
:
1023 /* FBL only supports UD type for dst. */
1024 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1027 case BRW_OPCODE_CBIT
:
1028 /* CBIT only supports UD type for dst. */
1029 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1032 case BRW_OPCODE_ADDC
:
1033 ADDC(dst
, src
[0], src
[1]);
1036 case BRW_OPCODE_SUBB
:
1037 SUBB(dst
, src
[0], src
[1]);
1040 case BRW_OPCODE_BFE
:
1041 default_state
.access_mode
= BRW_ALIGN_16
;
1042 BFE(dst
, src
[0], src
[1], src
[2]);
1043 default_state
.access_mode
= BRW_ALIGN_1
;
1046 case BRW_OPCODE_BFI1
:
1047 BFI1(dst
, src
[0], src
[1]);
1050 case BRW_OPCODE_BFI2
:
1051 default_state
.access_mode
= BRW_ALIGN_16
;
1052 BFI2(dst
, src
[0], src
[1], src
[2]);
1053 default_state
.access_mode
= BRW_ALIGN_1
;
1057 IF(BRW_PREDICATE_NORMAL
);
1060 case BRW_OPCODE_ELSE
:
1064 case BRW_OPCODE_ENDIF
:
1072 case BRW_OPCODE_BREAK
:
1076 case BRW_OPCODE_CONTINUE
:
1080 case BRW_OPCODE_WHILE
:
1084 case SHADER_OPCODE_RCP
:
1085 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
1088 case SHADER_OPCODE_RSQ
:
1089 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
1092 case SHADER_OPCODE_SQRT
:
1093 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
1096 case SHADER_OPCODE_EXP2
:
1097 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
1100 case SHADER_OPCODE_LOG2
:
1101 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
1104 case SHADER_OPCODE_SIN
:
1105 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
1108 case SHADER_OPCODE_COS
:
1109 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
1112 case SHADER_OPCODE_INT_QUOTIENT
:
1113 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
1116 case SHADER_OPCODE_INT_REMAINDER
:
1117 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
1120 case SHADER_OPCODE_POW
:
1121 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
1124 case FS_OPCODE_PIXEL_X
:
1125 case FS_OPCODE_PIXEL_Y
:
1126 unreachable("FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
1128 case FS_OPCODE_CINTERP
:
1131 case FS_OPCODE_LINTERP
:
1132 generate_linterp(ir
, dst
, src
);
1134 case SHADER_OPCODE_TEX
:
1136 case SHADER_OPCODE_TXD
:
1137 case SHADER_OPCODE_TXF
:
1138 case SHADER_OPCODE_TXF_CMS
:
1139 case SHADER_OPCODE_TXF_UMS
:
1140 case SHADER_OPCODE_TXF_MCS
:
1141 case SHADER_OPCODE_TXL
:
1142 case SHADER_OPCODE_TXS
:
1143 case SHADER_OPCODE_LOD
:
1144 case SHADER_OPCODE_TG4
:
1145 case SHADER_OPCODE_TG4_OFFSET
:
1146 generate_tex(ir
, dst
, src
[0], src
[1]);
1150 generate_ddx(ir
, dst
, src
[0]);
1153 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1154 * guarantee that key->render_to_fbo is set).
1156 assert(fp
->UsesDFdy
);
1157 generate_ddy(ir
, dst
, src
[0], key
->render_to_fbo
);
1160 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
1161 generate_scratch_write(ir
, src
[0]);
1164 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
1165 generate_scratch_read(ir
, dst
);
1168 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
1169 generate_scratch_read_gen7(ir
, dst
);
1172 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
1173 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1176 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
1177 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1180 case FS_OPCODE_FB_WRITE
:
1181 generate_fb_write(ir
);
1184 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
1185 generate_mov_dispatch_to_flags(ir
);
1188 case FS_OPCODE_DISCARD_JUMP
:
1189 generate_discard_jump(ir
);
1192 case SHADER_OPCODE_SHADER_TIME_ADD
:
1193 unreachable("XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
1195 case SHADER_OPCODE_UNTYPED_ATOMIC
:
1196 generate_untyped_atomic(ir
, dst
, src
[0], src
[1]);
1199 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
1200 generate_untyped_surface_read(ir
, dst
, src
[0]);
1203 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
1204 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
1207 case FS_OPCODE_SET_OMASK
:
1208 generate_set_omask(ir
, dst
, src
[0]);
1211 case FS_OPCODE_SET_SAMPLE_ID
:
1212 generate_set_sample_id(ir
, dst
, src
[0], src
[1]);
1215 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
1216 generate_pack_half_2x16_split(ir
, dst
, src
[0], src
[1]);
1219 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
1220 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
1221 generate_unpack_half_2x16_split(ir
, dst
, src
[0]);
1224 case FS_OPCODE_PLACEHOLDER_HALT
:
1225 /* This is the place where the final HALT needs to be inserted if
1226 * we've emitted any discards. If not, this will emit no code.
1228 if (!patch_discard_jumps_to_fb_writes()) {
1229 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1230 annotation
.ann_count
--;
1236 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
1237 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
1238 opcode_descs
[ir
->opcode
].name
);
1240 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
1246 patch_jump_targets();
1247 annotation_finalize(&annotation
, next_inst_offset
);
1249 int before_size
= next_inst_offset
- start_offset
;
1251 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1254 "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
1255 shader_prog
->Label
? shader_prog
->Label
: "unnamed",
1256 shader_prog
->Name
, dispatch_width
);
1259 "Native code for fragment program %d (SIMD%d dispatch):\n",
1260 prog
->Id
, dispatch_width
);
1262 fprintf(stderr
, "Native code for blorp program (SIMD%d dispatch):\n",
1265 fprintf(stderr
, "SIMD%d shader: %d instructions.\n",
1266 dispatch_width
, before_size
/ 16);
1268 dump_assembly(store
, annotation
.ann_count
, annotation
.ann
, brw
, prog
);
1269 ralloc_free(annotation
.ann
);
1274 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1275 exec_list
*simd16_instructions
,
1276 unsigned *assembly_size
)
1278 assert(simd8_instructions
|| simd16_instructions
);
1280 if (simd8_instructions
) {
1282 generate_code(simd8_instructions
);
1285 if (simd16_instructions
) {
1286 /* Align to a 64-byte boundary. */
1287 while (next_inst_offset
% 64)
1290 /* Save off the start of this SIMD16 program */
1291 prog_data
->prog_offset_16
= next_inst_offset
;
1293 dispatch_width
= 16;
1294 generate_code(simd16_instructions
);
1297 *assembly_size
= next_inst_offset
;
1298 return (const unsigned *) store
;