2 * Copyright © 2010, 2011, 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file gen8_fs_generate.cpp
26 * Code generation for Gen8+ hardware.
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
/* Constructor: forwards brw, shader_prog, the base gl_program of fp (or NULL
 * when fp is NULL), and the compile state to gen8_generator, then stores the
 * compile state, fragment program, and the dual-source-blend flag.
 * NOTE(review): this span is extraction-garbled — statements are split across
 * lines and interior source lines appear to be missing; code left untouched.
 */
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
39 struct brw_wm_compile
*c
,
40 struct gl_shader_program
*shader_prog
,
41 struct gl_fragment_program
*fp
,
42 bool dual_source_output
)
43 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, c
), c(c
), fp(fp
),
44 dual_source_output(dual_source_output
)
/* Destructor (body not visible in this garbled span). */
48 gen8_fs_generator::~gen8_fs_generator()
/* Records that binding-table slot surf_index is referenced by growing the
 * binding table's size_bytes to cover it (4 bytes per entry). */
53 gen8_fs_generator::mark_surface_used(unsigned surf_index
)
55 assert(surf_index
< BRW_MAX_SURFACES
);
57 c
->prog_data
.base
.binding_table
.size_bytes
=
58 MAX2(c
->prog_data
.base
.binding_table
.size_bytes
, (surf_index
+ 1) * 4);
/* Emits the render-target-write SENDC for FS_OPCODE_FB_WRITE: optionally
 * builds the message header (discard mask, replicated alpha bit, RT index),
 * picks the data-port message type by dispatch mode / dual-source output,
 * sets "Last Render Target Select" on the final RT, and marks the render
 * target surface as used.
 * NOTE(review): garbled span with interior lines missing (closing braces,
 * some SEND arguments); code left byte-identical.
 */
62 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
64 /* Disable the discard condition while setting up the header. */
65 default_state
.predicate
= BRW_PREDICATE_NONE
;
66 default_state
.predicate_inverse
= false;
67 default_state
.flag_subreg_nr
= 0;
69 if (ir
->header_present
) {
70 /* The GPU will use the predicate on SENDC, unless the header is present.
72 if (fp
&& fp
->UsesKill
) {
73 gen8_instruction
*mov
=
74 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
76 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
79 gen8_instruction
*mov
=
80 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
81 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
83 if (ir
->target
> 0 && c
->key
.replicate_alpha
) {
84 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
85 OR(vec1(retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
)),
86 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
91 /* Set the render target index for choosing BLEND_STATE. */
92 MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
93 BRW_REGISTER_TYPE_UD
),
94 brw_imm_ud(ir
->target
));
98 /* Set the predicate back to get the conditional write if necessary for
101 default_state
.predicate
= ir
->predicate
;
102 default_state
.predicate_inverse
= ir
->predicate_inverse
;
103 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
105 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
106 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
107 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
109 /* Set up the "Message Specific Control" bits for the Data Port Message
110 * Descriptor. These are documented in the "Render Target Write" message's
111 * "Message Descriptor" documentation (vol5c.2).
114 /* Set the Message Type */
115 if (this->dual_source_output
)
116 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
117 else if (dispatch_width
== 16)
118 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
120 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
122 uint32_t msg_control
= msg_type
;
124 /* "Last Render Target Select" must be set on all writes to the last of
125 * the render targets (if using MRT), or always for a single RT scenario.
127 if ((ir
->target
== c
->key
.nr_color_regions
- 1) || !c
->key
.nr_color_regions
)
128 msg_control
|= (1 << 4); /* Last Render Target Select */
130 uint32_t surf_index
=
131 c
->prog_data
.binding_table
.render_target_start
+ ir
->target
;
133 gen8_set_dp_message(brw
, inst
,
134 GEN6_SFID_DATAPORT_RENDER_CACHE
,
136 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
143 mark_surface_used(surf_index
);
/* Emits a PLN instruction for linear interpolation: interpolates the
 * per-attribute setup data (src[2]) against the barycentric deltas in
 * src[0]/src[1].  PLN requires delta_y to live in the register immediately
 * after delta_x, which the assert checks. */
147 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
151 struct brw_reg delta_x
= src
[0];
152 struct brw_reg delta_y
= src
[1];
153 struct brw_reg interp
= src
[2];
156 assert(delta_y
.nr
== delta_x
.nr
+ 1);
157 PLN(dst
, interp
, delta_x
);
/* Emits the sampler SEND for a texturing operation: selects the sampler
 * message type from ir->opcode and shadow_compare, chooses SIMD8/SIMD16
 * mode, optionally builds a message header (for texel offsets or sampler
 * indices >= 16), and marks the texture's binding-table surface as used.
 * NOTE(review): garbled span — braces, `break`s, and some SEND arguments
 * from the original are missing; code left byte-identical.
 */
161 gen8_fs_generator::generate_tex(fs_inst
*ir
,
167 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
169 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
171 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
172 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
174 switch (ir
->opcode
) {
175 case SHADER_OPCODE_TEX
:
176 if (ir
->shadow_compare
) {
177 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
179 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
183 if (ir
->shadow_compare
) {
184 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
186 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
189 case SHADER_OPCODE_TXL
:
190 if (ir
->shadow_compare
) {
191 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
193 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
196 case SHADER_OPCODE_TXS
:
197 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
199 case SHADER_OPCODE_TXD
:
200 if (ir
->shadow_compare
) {
201 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
203 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
206 case SHADER_OPCODE_TXF
:
207 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
209 case SHADER_OPCODE_TXF_CMS
:
210 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
212 case SHADER_OPCODE_TXF_UMS
:
213 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS
;
215 case SHADER_OPCODE_TXF_MCS
:
216 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
218 case SHADER_OPCODE_LOD
:
219 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
221 case SHADER_OPCODE_TG4
:
222 if (ir
->shadow_compare
) {
223 assert(brw
->gen
>= 7);
224 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
226 assert(brw
->gen
>= 6);
227 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
230 case SHADER_OPCODE_TG4_OFFSET
:
231 assert(brw
->gen
>= 7);
232 if (ir
->shadow_compare
) {
233 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
235 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
239 assert(!"not reached");
242 assert(msg_type
!= -1);
244 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
249 if (ir
->header_present
) {
250 /* The send-from-GRF for SIMD16 texturing with a header has an extra
251 * hardware register allocated to it, which we need to skip over (since
252 * our coordinates in the payload are in the even-numbered registers,
253 * and the header comes right before the first one.
255 if (dispatch_width
== 16)
258 unsigned save_exec_size
= default_state
.exec_size
;
259 default_state
.exec_size
= BRW_EXECUTE_8
;
261 MOV_RAW(src
, brw_vec8_grf(0, 0));
263 if (ir
->texture_offset
) {
264 /* Set the texel offset bits. */
265 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
266 brw_imm_ud(ir
->texture_offset
));
269 if (ir
->sampler
>= 16) {
270 /* The "Sampler Index" field can only store values between 0 and 15.
271 * However, we can add an offset to the "Sampler State Pointer"
272 * field, effectively selecting a different set of 16 samplers.
274 * The "Sampler State Pointer" needs to be aligned to a 32-byte
275 * offset, and each sampler state is only 16-bytes, so we can't
276 * exclusively use the offset - we have to use both.
278 gen8_instruction
*add
=
279 ADD(get_element_ud(src
, 3),
280 get_element_ud(brw_vec8_grf(0, 0), 3),
281 brw_imm_ud(16 * (ir
->sampler
/ 16) *
282 sizeof(gen7_sampler_state
)));
283 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
286 default_state
.exec_size
= save_exec_size
;
289 uint32_t surf_index
=
290 c
->prog_data
.base
.binding_table
.texture_start
+ ir
->sampler
;
292 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
293 gen8_set_dst(brw
, inst
, dst
);
294 gen8_set_src0(brw
, inst
, src
);
295 gen8_set_sampler_message(brw
, inst
,
304 mark_surface_used(surf_index
);
308 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
311 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
313 * and we're trying to produce:
316 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
317 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
318 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
319 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
320 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
321 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
322 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
323 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
325 * and add another set of two more subspans if in 16-pixel dispatch mode.
327 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
328 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
329 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
330 * between each other. We could probably do it like ddx and swizzle the right
331 * order later, but bail for now and just produce
332 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
/* Emits the ADD computing a horizontal derivative (dFdx): subtracts a
 * stride-view of the source shifted by one subregister.  vstride is chosen
 * by c->key.high_quality_derivatives (per-pair vs. replicated per subspan).
 * NOTE(review): garbled span with interior lines missing; code untouched. */
335 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
339 unsigned vstride
, width
;
341 if (c
->key
.high_quality_derivatives
) {
342 /* Produce accurate derivatives. */
343 vstride
= BRW_VERTICAL_STRIDE_2
;
346 /* Replicate the derivative at the top-left pixel to other pixels. */
347 vstride
= BRW_VERTICAL_STRIDE_4
;
351 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
355 BRW_HORIZONTAL_STRIDE_0
,
356 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
357 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
361 BRW_HORIZONTAL_STRIDE_0
,
362 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
363 ADD(dst
, src0
, negate(src1
));
366 /* The negate_value boolean is used to negate the derivative computation for
367 * FBOs, since they place the origin at the upper left instead of the lower
/* Emits the ADD computing a vertical derivative (dFdy).  High-quality mode
 * uses Align16 swizzles (XYXY vs. ZWZW) to subtract within each subspan;
 * otherwise the top-left derivative is replicated.  The operand order of the
 * final ADD is flipped based on the (not visible here) negate_value flag.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
371 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
377 unsigned src0_swizzle
;
378 unsigned src1_swizzle
;
381 if (c
->key
.high_quality_derivatives
) {
382 /* Produce accurate derivatives. */
383 hstride
= BRW_HORIZONTAL_STRIDE_1
;
384 src0_swizzle
= BRW_SWIZZLE_XYXY
;
385 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
388 default_state
.access_mode
= BRW_ALIGN_16
;
390 /* Replicate the derivative at the top-left pixel to other pixels. */
391 hstride
= BRW_HORIZONTAL_STRIDE_0
;
392 src0_swizzle
= BRW_SWIZZLE_XYZW
;
393 src1_swizzle
= BRW_SWIZZLE_XYZW
;
397 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
399 BRW_VERTICAL_STRIDE_4
,
402 src0_swizzle
, WRITEMASK_XYZW
);
403 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
405 BRW_VERTICAL_STRIDE_4
,
408 src1_swizzle
, WRITEMASK_XYZW
);
411 ADD(dst
, src1
, negate(src0
));
413 ADD(dst
, src0
, negate(src1
));
415 default_state
.access_mode
= BRW_ALIGN_1
;
/* Emits an OWORD block write to scratch space (register spilling): copies
 * src into the message payload, builds a header in the MRF with the scratch
 * offset in g0.2, and SENDs to the data cache with binding table index 255
 * (stateless).  Message size scales with dispatch width (2 or 4 OWORDs).
 * NOTE(review): garbled span with interior lines missing; code untouched. */
419 gen8_fs_generator::generate_scratch_write(fs_inst
*ir
, struct brw_reg src
)
421 MOV(retype(brw_message_reg(ir
->base_mrf
+ 1), BRW_REGISTER_TYPE_UD
),
422 retype(src
, BRW_REGISTER_TYPE_UD
));
425 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
427 const int num_regs
= dispatch_width
/ 8;
429 uint32_t msg_control
;
431 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
433 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
435 /* Set up the message header. This is g0, with g0.2 filled with
436 * the offset. We don't want to leave our offset around in g0 or
437 * it'll screw up texture samples, so set it up inside the message
440 unsigned save_exec_size
= default_state
.exec_size
;
441 default_state
.exec_size
= BRW_EXECUTE_8
;
443 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
444 /* set message header global offset field (reg 0, element 2) */
445 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
448 if (dispatch_width
== 16)
449 dst
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
451 dst
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
453 default_state
.exec_size
= BRW_EXECUTE_16
;
455 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
456 gen8_set_dst(brw
, send
, dst
);
457 gen8_set_src0(brw
, send
, mrf
);
458 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
459 255, /* binding table index: stateless access */
460 GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
,
462 1 + num_regs
, /* mlen */
464 true, /* header present */
467 default_state
.exec_size
= save_exec_size
;
/* Emits an OWORD block read from scratch space (register unspilling):
 * mirror image of generate_scratch_write — same header setup (offset in
 * g0.2), same stateless binding table index 255, reading into dst.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
471 gen8_fs_generator::generate_scratch_read(fs_inst
*ir
, struct brw_reg dst
)
474 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
476 const int num_regs
= dispatch_width
/ 8;
478 uint32_t msg_control
;
480 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
482 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
484 unsigned save_exec_size
= default_state
.exec_size
;
485 default_state
.exec_size
= BRW_EXECUTE_8
;
487 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
488 /* set message header global offset field (reg 0, element 2) */
489 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
491 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
492 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
493 gen8_set_src0(brw
, send
, mrf
);
494 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
495 255, /* binding table index: stateless access */
496 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
500 true, /* header present */
503 default_state
.exec_size
= save_exec_size
;
/* Emits a Gen7-style scratch block read using the dedicated scratch message:
 * no MRF header construction is needed — g0 is sent directly as the header
 * and the HWord (register-sized) offset goes in the message descriptor.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
507 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*ir
, struct brw_reg dst
)
509 unsigned save_exec_size
= default_state
.exec_size
;
510 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
512 int num_regs
= dispatch_width
/ 8;
514 /* According to the docs, offset is "A 12-bit HWord offset into the memory
515 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
516 * is 32 bytes, which happens to be the size of a register.
518 int offset
= ir
->offset
/ REG_SIZE
;
520 /* The HW requires that the header is present; this is to get the g0.5
523 gen8_set_src0(brw
, send
, brw_vec8_grf(0, 0));
524 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
525 gen8_set_dp_scratch_message(brw
, send
,
526 false, /* scratch read */
528 false, /* invalidate after read */
531 1, /* mlen - just g0 */
533 true, /* header present */
536 default_state
.exec_size
= save_exec_size
;
/* Loads 4 consecutive uniform constants via a SIMD4x2 sampler LD message:
 * the UBO surface index comes from the immediate `index`, the dword offset
 * from a single GRF element of `offset`.  WE_all (mask disable) because the
 * result is uniform across channels.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
540 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
542 struct brw_reg index
,
543 struct brw_reg offset
)
545 assert(inst
->mlen
== 0);
547 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
548 index
.type
== BRW_REGISTER_TYPE_UD
);
549 uint32_t surf_index
= index
.dw1
.ud
;
551 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
552 /* Reference only the dword we need lest we anger validate_reg() with
553 * reg.width > reg.execszie.
555 offset
= brw_vec1_grf(offset
.nr
, 0);
557 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
558 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
560 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
561 * the destination loaded consecutively from the same offset (which appears
562 * in the first component, and the rest are ignored).
564 dst
.width
= BRW_WIDTH_4
;
565 gen8_set_dst(brw
, send
, dst
);
566 gen8_set_src0(brw
, send
, offset
);
567 gen8_set_sampler_message(brw
, send
,
569 0, /* The LD message ignores the sampler unit. */
570 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
573 false, /* no header */
574 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
576 mark_surface_used(surf_index
);
/* Loads constants at per-channel offsets via a headerless sampler LD
 * message (SIMD8 or SIMD16 by dispatch width); the surface index comes
 * from the immediate `index`.
 * NOTE(review): garbled span — rlen/mlen assignments and SEND arguments
 * are missing from this extraction; code left byte-identical. */
580 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
582 struct brw_reg index
,
583 struct brw_reg offset
)
585 /* Varying-offset pull constant loads are treated as a normal expression on
586 * gen7, so the fact that it's a send message is hidden at the IR level.
588 assert(!ir
->header_present
);
591 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
592 index
.type
== BRW_REGISTER_TYPE_UD
);
593 uint32_t surf_index
= index
.dw1
.ud
;
595 uint32_t simd_mode
, rlen
, mlen
;
596 if (dispatch_width
== 16) {
599 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
603 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
606 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
607 gen8_set_dst(brw
, send
, dst
);
608 gen8_set_src0(brw
, send
, offset
);
609 gen8_set_sampler_message(brw
, send
,
611 0, /* The LD message ignore the sampler unit. */
612 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
615 false, /* no header */
618 mark_surface_used(surf_index
);
622 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
623 * into the flags register (f0.0).
/* Implementation: a WE_all MOV from the UW dispatch-mask element g1.7 into
 * the flag register selected by ir->flag_subreg. */
626 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
628 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
629 struct brw_reg dispatch_mask
=
630 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
632 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
633 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
/* Records the instruction index of a discard HALT so its UIP can be patched
 * later by patch_discard_jumps_to_fb_writes().
 * NOTE(review): the HALT() emission itself is not visible in this garbled
 * span — only the patch-list bookkeeping survives extraction. */
637 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
639 /* This HALT will be patched up at FB write time to point UIP at the end of
640 * the program, and at brw_uip_jip() JIP will be set to the end of the
641 * current block (or the program).
643 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
/* Emits a final rendezvous HALT and back-patches the UIP of every recorded
 * discard HALT to point at it (distances are in bytes: 16 per instruction),
 * then clears the patch list.  No-op when no discards were emitted. */
649 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
651 if (discard_halt_patches
.is_empty())
654 /* There is a somewhat strange undocumented requirement of using
655 * HALT, according to the simulator. If some channel has HALTed to
656 * a particular UIP, then by the end of the program, every channel
657 * must have HALTed to that UIP. Furthermore, the tracking is a
658 * stack, so you can't do the final halt of a UIP after starting
659 * halting to a new UIP.
661 * Symptoms of not emitting this instruction on actual hardware
662 * included GPU hangs and sparkly rendering on the piglit discard
665 gen8_instruction
*last_halt
= HALT();
666 gen8_set_uip(last_halt
, 16);
667 gen8_set_jip(last_halt
, 16);
671 foreach_list(node
, &discard_halt_patches
) {
672 ip_record
*patch_ip
= (ip_record
*) node
;
673 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
674 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
676 /* HALT takes an instruction distance from the pre-incremented IP. */
677 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
680 this->discard_halt_patches
.make_empty();
684 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
685 * sampler LD messages.
687 * We don't want to bake it into the send message's code generation because
688 * that means we don't get a chance to schedule the instruction.
/* Implementation: a single raw MOV of the immediate `value` into element 0
 * of the destination VGRF. */
691 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
693 struct brw_reg value
)
695 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
696 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
700 * Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
701 * (when mask is passed as a uniform) of register mask before moving it
/* Implementation: re-strides the sample-mask source so each UW destination
 * channel receives the right mask word, then moves it with a WE_all SIMD8
 * MOV (exec size restored afterwards). */
705 gen8_fs_generator::generate_set_omask(fs_inst
*inst
,
709 assert(dst
.type
== BRW_REGISTER_TYPE_UW
);
711 if (dispatch_width
== 16)
714 if (mask
.vstride
== BRW_VERTICAL_STRIDE_8
&&
715 mask
.width
== BRW_WIDTH_8
&&
716 mask
.hstride
== BRW_HORIZONTAL_STRIDE_1
) {
717 mask
= stride(mask
, 16, 8, 2);
719 assert(mask
.vstride
== BRW_VERTICAL_STRIDE_0
&&
720 mask
.width
== BRW_WIDTH_1
&&
721 mask
.hstride
== BRW_HORIZONTAL_STRIDE_0
);
724 unsigned save_exec_size
= default_state
.exec_size
;
725 default_state
.exec_size
= BRW_EXECUTE_8
;
727 gen8_instruction
*mov
= MOV(dst
, retype(mask
, dst
.type
));
728 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
730 default_state
.exec_size
= save_exec_size
;
734 * Do a special ADD with vstride=1, width=4, hstride=0 for src1.
/* Implementation: adds the re-strided UW sample-id vector (src1 viewed as
 * vstride=1/width=4/hstride=0) to src0 with WE_all SIMD8 ADDs; SIMD16 needs
 * a second ADD for the upper half, offset by 2 UW elements. */
737 gen8_fs_generator::generate_set_sample_id(fs_inst
*ir
,
742 assert(dst
.type
== BRW_REGISTER_TYPE_D
|| dst
.type
== BRW_REGISTER_TYPE_UD
);
743 assert(src0
.type
== BRW_REGISTER_TYPE_D
|| src0
.type
== BRW_REGISTER_TYPE_UD
);
745 struct brw_reg reg
= retype(stride(src1
, 1, 4, 0), BRW_REGISTER_TYPE_UW
);
747 unsigned save_exec_size
= default_state
.exec_size
;
748 default_state
.exec_size
= BRW_EXECUTE_8
;
750 gen8_instruction
*add
= ADD(dst
, src0
, reg
);
751 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
752 if (dispatch_width
== 16) {
753 add
= ADD(offset(dst
, 1), offset(src0
, 1), suboffset(reg
, 2));
754 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
757 default_state
.exec_size
= save_exec_size
;
761 * Change the register's data type from UD to HF, doubling the strides in order
762 * to compensate for halving the data type width.
764 static struct brw_reg
765 ud_reg_to_hf(struct brw_reg r
)
767 assert(r
.type
== BRW_REGISTER_TYPE_UD
);
768 r
.type
= BRW_REGISTER_TYPE_HF
;
770 /* The BRW_*_STRIDE enums are defined so that incrementing the field
771 * doubles the real stride.
/* NOTE(review): the stride-increment statements and `return r;` are missing
 * from this garbled extraction (original lines 772-780). */
/* Implements packHalf2x16: converts the float inputs to HF views of the UD
 * destination and assembles 0xhhhhllll per channel (y shifted into the
 * upper word via SHL by 16).
 * NOTE(review): the MOVs into dst_hf are missing from this garbled
 * extraction; code left byte-identical. */
782 gen8_fs_generator::generate_pack_half_2x16_split(fs_inst
*inst
,
787 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
788 assert(x
.type
== BRW_REGISTER_TYPE_F
);
789 assert(y
.type
== BRW_REGISTER_TYPE_F
);
791 struct brw_reg dst_hf
= ud_reg_to_hf(dst
);
793 /* Give each 32-bit channel of dst the form below , where "." means
802 SHL(dst
, dst
, brw_imm_ud(16u));
804 /* And, finally the form of packHalf2x16's output:
804 /* And, finally the form of packHalf2x16's output:
811 gen8_fs_generator::generate_unpack_half_2x16_split(fs_inst
*inst
,
815 assert(dst
.type
== BRW_REGISTER_TYPE_F
);
816 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
818 struct brw_reg src_hf
= ud_reg_to_hf(src
);
820 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
821 * For the Y case, we wish to access only the upper word; therefore
822 * a 16-bit subregister offset is needed.
824 assert(inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
||
825 inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
);
826 if (inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
)
/* Main emission loop: walks the fs_inst list, converting each IR instruction
 * into Gen8 machine code.  Per instruction it (a) prints CFG/IR annotations
 * under INTEL_DEBUG=wm, (b) converts src/dst fs_regs to brw_regs, (c) loads
 * default_state (predication, saturate, exec size, quarter control) from the
 * instruction, and (d) dispatches on ir->opcode to either a direct ALU/math
 * emission or one of the generate_* helpers above.  Finishes by patching
 * jump targets.
 * NOTE(review): heavily garbled span — `break`s, braces, several emission
 * lines, and some case labels are missing; code left byte-identical. */
833 gen8_fs_generator::generate_code(exec_list
*instructions
)
835 int last_native_inst_offset
= next_inst_offset
;
836 const char *last_annotation_string
= NULL
;
837 const void *last_annotation_ir
= NULL
;
839 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
842 "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
843 shader_prog
->Label
? shader_prog
->Label
: "unnamed",
844 shader_prog
->Name
, dispatch_width
);
847 "Native code for fragment program %d (SIMD%d dispatch):\n",
848 prog
->Id
, dispatch_width
);
850 fprintf(stderr
, "Native code for blorp program (SIMD%d dispatch):\n",
856 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
857 cfg
= new(mem_ctx
) cfg_t(instructions
);
859 foreach_list(node
, instructions
) {
860 fs_inst
*ir
= (fs_inst
*) node
;
861 struct brw_reg src
[3], dst
;
863 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
864 foreach_list(node
, &cfg
->block_list
) {
865 bblock_link
*link
= (bblock_link
*)node
;
866 bblock_t
*block
= link
->block
;
868 if (block
->start
== ir
) {
869 fprintf(stderr
, "   START B%d", block
->block_num
);
870 foreach_list(predecessor_node
, &block
->parents
) {
871 bblock_link
*predecessor_link
=
872 (bblock_link
*)predecessor_node
;
873 bblock_t
*predecessor_block
= predecessor_link
->block
;
874 fprintf(stderr
, " <-B%d", predecessor_block
->block_num
);
876 fprintf(stderr
, "\n");
880 if (last_annotation_ir
!= ir
->ir
) {
881 last_annotation_ir
= ir
->ir
;
882 if (last_annotation_ir
) {
883 fprintf(stderr
, "   ");
885 ((ir_instruction
*) ir
->ir
)->fprint(stderr
);
887 const prog_instruction
*fpi
;
888 fpi
= (const prog_instruction
*) ir
->ir
;
889 fprintf(stderr
, "%d: ", (int)(fpi
- prog
->Instructions
));
890 _mesa_fprint_instruction_opt(stderr
,
892 0, PROG_PRINT_DEBUG
, NULL
);
894 fprintf(stderr
, "\n");
897 if (last_annotation_string
!= ir
->annotation
) {
898 last_annotation_string
= ir
->annotation
;
899 if (last_annotation_string
)
900 fprintf(stderr
, "   %s\n", last_annotation_string
);
904 for (unsigned int i
= 0; i
< 3; i
++) {
905 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
907 /* The accumulator result appears to get used for the
908 * conditional modifier generation. When negating a UD
909 * value, there is a 33rd bit generated for the sign in the
910 * accumulator value, so now you can't check, for example,
911 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
913 assert(!ir
->conditional_mod
||
914 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
917 dst
= brw_reg_from_fs_reg(&ir
->dst
);
919 default_state
.conditional_mod
= ir
->conditional_mod
;
920 default_state
.predicate
= ir
->predicate
;
921 default_state
.predicate_inverse
= ir
->predicate_inverse
;
922 default_state
.saturate
= ir
->saturate
;
923 default_state
.mask_control
= ir
->force_writemask_all
;
924 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
926 if (dispatch_width
== 16 && !ir
->force_uncompressed
)
927 default_state
.exec_size
= BRW_EXECUTE_16
;
929 default_state
.exec_size
= BRW_EXECUTE_8
;
931 if (ir
->force_uncompressed
|| dispatch_width
== 8)
932 default_state
.qtr_control
= GEN6_COMPRESSION_1Q
;
933 else if (ir
->force_sechalf
)
934 default_state
.qtr_control
= GEN6_COMPRESSION_2Q
;
936 default_state
.qtr_control
= GEN6_COMPRESSION_1H
;
938 switch (ir
->opcode
) {
943 ADD(dst
, src
[0], src
[1]);
946 MUL(dst
, src
[0], src
[1]);
948 case BRW_OPCODE_MACH
:
949 MACH(dst
, src
[0], src
[1]);
953 default_state
.access_mode
= BRW_ALIGN_16
;
954 MAD(dst
, src
[0], src
[1], src
[2]);
955 default_state
.access_mode
= BRW_ALIGN_1
;
959 default_state
.access_mode
= BRW_ALIGN_16
;
960 LRP(dst
, src
[0], src
[1], src
[2]);
961 default_state
.access_mode
= BRW_ALIGN_1
;
968 case BRW_OPCODE_RNDD
:
971 case BRW_OPCODE_RNDE
:
974 case BRW_OPCODE_RNDZ
:
979 AND(dst
, src
[0], src
[1]);
982 OR(dst
, src
[0], src
[1]);
985 XOR(dst
, src
[0], src
[1]);
991 ASR(dst
, src
[0], src
[1]);
994 SHR(dst
, src
[0], src
[1]);
997 SHL(dst
, src
[0], src
[1]);
1000 case BRW_OPCODE_F32TO16
:
1001 MOV(retype(dst
, BRW_REGISTER_TYPE_HF
), src
[0]);
1003 case BRW_OPCODE_F16TO32
:
1004 MOV(dst
, retype(src
[0], BRW_REGISTER_TYPE_HF
));
1007 case BRW_OPCODE_CMP
:
1008 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
1010 case BRW_OPCODE_SEL
:
1011 SEL(dst
, src
[0], src
[1]);
1014 case BRW_OPCODE_BFREV
:
1015 /* BFREV only supports UD type for src and dst. */
1016 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
1017 retype(src
[0], BRW_REGISTER_TYPE_UD
));
1020 case BRW_OPCODE_FBH
:
1021 /* FBH only supports UD type for dst. */
1022 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1025 case BRW_OPCODE_FBL
:
1026 /* FBL only supports UD type for dst. */
1027 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1030 case BRW_OPCODE_CBIT
:
1031 /* CBIT only supports UD type for dst. */
1032 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1035 case BRW_OPCODE_ADDC
:
1036 ADDC(dst
, src
[0], src
[1]);
1039 case BRW_OPCODE_SUBB
:
1040 SUBB(dst
, src
[0], src
[1]);
1043 case BRW_OPCODE_BFE
:
1044 default_state
.access_mode
= BRW_ALIGN_16
;
1045 BFE(dst
, src
[0], src
[1], src
[2]);
1046 default_state
.access_mode
= BRW_ALIGN_1
;
1049 case BRW_OPCODE_BFI1
:
1050 BFI1(dst
, src
[0], src
[1]);
1053 case BRW_OPCODE_BFI2
:
1054 default_state
.access_mode
= BRW_ALIGN_16
;
1055 BFI2(dst
, src
[0], src
[1], src
[2]);
1056 default_state
.access_mode
= BRW_ALIGN_1
;
1060 IF(BRW_PREDICATE_NORMAL
);
1063 case BRW_OPCODE_ELSE
:
1067 case BRW_OPCODE_ENDIF
:
1075 case BRW_OPCODE_BREAK
:
1079 case BRW_OPCODE_CONTINUE
:
1083 case BRW_OPCODE_WHILE
:
1087 case SHADER_OPCODE_RCP
:
1088 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
1091 case SHADER_OPCODE_RSQ
:
1092 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
1095 case SHADER_OPCODE_SQRT
:
1096 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
1099 case SHADER_OPCODE_EXP2
:
1100 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
1103 case SHADER_OPCODE_LOG2
:
1104 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
1107 case SHADER_OPCODE_SIN
:
1108 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
1111 case SHADER_OPCODE_COS
:
1112 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
1115 case SHADER_OPCODE_INT_QUOTIENT
:
1116 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
1119 case SHADER_OPCODE_INT_REMAINDER
:
1120 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
1123 case SHADER_OPCODE_POW
:
1124 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
1127 case FS_OPCODE_PIXEL_X
:
1128 case FS_OPCODE_PIXEL_Y
:
1129 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
1132 case FS_OPCODE_CINTERP
:
1135 case FS_OPCODE_LINTERP
:
1136 generate_linterp(ir
, dst
, src
);
1138 case SHADER_OPCODE_TEX
:
1140 case SHADER_OPCODE_TXD
:
1141 case SHADER_OPCODE_TXF
:
1142 case SHADER_OPCODE_TXF_CMS
:
1143 case SHADER_OPCODE_TXF_UMS
:
1144 case SHADER_OPCODE_TXF_MCS
:
1145 case SHADER_OPCODE_TXL
:
1146 case SHADER_OPCODE_TXS
:
1147 case SHADER_OPCODE_LOD
:
1148 case SHADER_OPCODE_TG4
:
1149 case SHADER_OPCODE_TG4_OFFSET
:
1150 generate_tex(ir
, dst
, src
[0]);
1154 generate_ddx(ir
, dst
, src
[0]);
1157 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1158 * guarantee that c->key.render_to_fbo is set).
1160 assert(fp
->UsesDFdy
);
1161 generate_ddy(ir
, dst
, src
[0], c
->key
.render_to_fbo
);
1164 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
1165 generate_scratch_write(ir
, src
[0]);
1168 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
1169 generate_scratch_read(ir
, dst
);
1172 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
1173 generate_scratch_read_gen7(ir
, dst
);
1176 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
1177 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1180 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
1181 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1184 case FS_OPCODE_FB_WRITE
:
1185 generate_fb_write(ir
);
1188 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
1189 generate_mov_dispatch_to_flags(ir
);
1192 case FS_OPCODE_DISCARD_JUMP
:
1193 generate_discard_jump(ir
);
1196 case SHADER_OPCODE_SHADER_TIME_ADD
:
1197 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
1200 case SHADER_OPCODE_UNTYPED_ATOMIC
:
1201 assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
1204 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
1205 assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
1208 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
1209 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
1212 case FS_OPCODE_SET_OMASK
:
1213 generate_set_omask(ir
, dst
, src
[0]);
1216 case FS_OPCODE_SET_SAMPLE_ID
:
1217 generate_set_sample_id(ir
, dst
, src
[0], src
[1]);
1220 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
1221 generate_pack_half_2x16_split(ir
, dst
, src
[0], src
[1]);
1224 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
1225 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
1226 generate_unpack_half_2x16_split(ir
, dst
, src
[0]);
1229 case FS_OPCODE_PLACEHOLDER_HALT
:
1230 /* This is the place where the final HALT needs to be inserted if
1231 * we've emitted any discards. If not, this will emit no code.
1233 patch_discard_jumps_to_fb_writes();
1237 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
1238 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
1239 opcode_descs
[ir
->opcode
].name
);
1241 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
1246 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1247 disassemble(stderr
, last_native_inst_offset
, next_inst_offset
);
1249 foreach_list(node
, &cfg
->block_list
) {
1250 bblock_link
*link
= (bblock_link
*)node
;
1251 bblock_t
*block
= link
->block
;
1253 if (block
->end
== ir
) {
1254 fprintf(stderr
, "   END B%d", block
->block_num
);
1255 foreach_list(successor_node
, &block
->children
) {
1256 bblock_link
*successor_link
=
1257 (bblock_link
*)successor_node
;
1258 bblock_t
*successor_block
= successor_link
->block
;
1259 fprintf(stderr
, " ->B%d", successor_block
->block_num
);
1261 fprintf(stderr
, "\n");
1266 last_native_inst_offset
= next_inst_offset
;
1269 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1270 fprintf(stderr
, "\n");
1273 patch_jump_targets();
1275 /* OK, while the INTEL_DEBUG=fs above is very nice for debugging FS
1276 * emit issues, it doesn't get the jump distances into the output,
1277 * which is often something we want to debug. So this is here in
1278 * case you're doing that.
1280 if (0 && unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1281 disassemble(stderr
, 0, next_inst_offset
);
/* Top-level entry point: generates the SIMD8 program (if any), then pads to
 * a 64-byte boundary, records prog_offset_16, and generates the SIMD16
 * program (if any).  Returns the instruction store and its size in bytes
 * via *assembly_size.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
1286 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1287 exec_list
*simd16_instructions
,
1288 unsigned *assembly_size
)
1290 assert(simd8_instructions
|| simd16_instructions
);
1292 if (simd8_instructions
) {
1294 generate_code(simd8_instructions
);
1297 if (simd16_instructions
) {
1298 /* Align to a 64-byte boundary. */
1299 while ((nr_inst
* sizeof(gen8_instruction
)) % 64)
1302 /* Save off the start of this SIMD16 program */
1303 c
->prog_data
.prog_offset_16
= nr_inst
* sizeof(gen8_instruction
);
1305 dispatch_width
= 16;
1306 generate_code(simd16_instructions
);
1309 *assembly_size
= next_inst_offset
;
1310 return (const unsigned *) store
;