2 * Copyright © 2010, 2011, 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file gen8_fs_generate.cpp
26 * Code generation for Gen8+ hardware.
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
39 struct brw_wm_compile
*c
,
40 struct gl_shader_program
*shader_prog
,
41 struct gl_fragment_program
*fp
,
42 bool dual_source_output
)
43 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, c
), c(c
), fp(fp
),
44 dual_source_output(dual_source_output
)
48 gen8_fs_generator::~gen8_fs_generator()
53 gen8_fs_generator::mark_surface_used(unsigned surf_index
)
55 assert(surf_index
< BRW_MAX_SURFACES
);
57 c
->prog_data
.base
.binding_table
.size_bytes
=
58 MAX2(c
->prog_data
.base
.binding_table
.size_bytes
, (surf_index
+ 1) * 4);
62 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
64 if (fp
&& fp
->UsesKill
) {
65 gen8_instruction
*mov
=
66 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
68 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
71 if (ir
->header_present
) {
72 gen8_instruction
*mov
=
73 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
74 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
76 if (ir
->target
> 0 && c
->key
.replicate_alpha
) {
77 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
78 OR(vec1(retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
)),
79 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
84 /* Set the render target index for choosing BLEND_STATE. */
85 MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
86 BRW_REGISTER_TYPE_UD
),
87 brw_imm_ud(ir
->target
));
91 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
92 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
93 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
95 /* Set up the "Message Specific Control" bits for the Data Port Message
96 * Descriptor. These are documented in the "Render Target Write" message's
97 * "Message Descriptor" documentation (vol5c.2).
100 /* Set the Message Type */
101 if (this->dual_source_output
)
102 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
103 else if (dispatch_width
== 16)
104 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
106 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
108 uint32_t msg_control
= msg_type
;
110 /* "Last Render Target Select" must be set on all writes to the last of
111 * the render targets (if using MRT), or always for a single RT scenario.
113 if ((ir
->target
== c
->key
.nr_color_regions
- 1) || !c
->key
.nr_color_regions
)
114 msg_control
|= (1 << 4); /* Last Render Target Select */
116 uint32_t surf_index
=
117 c
->prog_data
.binding_table
.render_target_start
+ ir
->target
;
119 gen8_set_dp_message(brw
, inst
,
120 GEN6_SFID_DATAPORT_RENDER_CACHE
,
122 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
129 mark_surface_used(surf_index
);
133 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
137 struct brw_reg delta_x
= src
[0];
138 struct brw_reg delta_y
= src
[1];
139 struct brw_reg interp
= src
[2];
142 assert(delta_y
.nr
== delta_x
.nr
+ 1);
143 PLN(dst
, interp
, delta_x
);
147 gen8_fs_generator::generate_tex(fs_inst
*ir
,
153 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
155 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
157 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
158 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
160 switch (ir
->opcode
) {
161 case SHADER_OPCODE_TEX
:
162 if (ir
->shadow_compare
) {
163 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
165 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
169 if (ir
->shadow_compare
) {
170 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
172 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
175 case SHADER_OPCODE_TXL
:
176 if (ir
->shadow_compare
) {
177 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
179 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
182 case SHADER_OPCODE_TXS
:
183 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
185 case SHADER_OPCODE_TXD
:
186 if (ir
->shadow_compare
) {
187 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
189 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
192 case SHADER_OPCODE_TXF
:
193 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
195 case SHADER_OPCODE_TXF_CMS
:
196 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
198 case SHADER_OPCODE_TXF_UMS
:
199 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS
;
201 case SHADER_OPCODE_TXF_MCS
:
202 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
204 case SHADER_OPCODE_LOD
:
205 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
207 case SHADER_OPCODE_TG4
:
208 if (ir
->shadow_compare
) {
209 assert(brw
->gen
>= 7);
210 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
212 assert(brw
->gen
>= 6);
213 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
216 case SHADER_OPCODE_TG4_OFFSET
:
217 assert(brw
->gen
>= 7);
218 if (ir
->shadow_compare
) {
219 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
221 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
225 assert(!"not reached");
228 assert(msg_type
!= -1);
230 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
235 if (ir
->header_present
) {
236 /* The send-from-GRF for SIMD16 texturing with a header has an extra
237 * hardware register allocated to it, which we need to skip over (since
238 * our coordinates in the payload are in the even-numbered registers,
239 * and the header comes right before the first one.
241 if (dispatch_width
== 16)
244 MOV_RAW(src
, brw_vec8_grf(0, 0));
246 if (ir
->texture_offset
) {
247 /* Set the texel offset bits. */
248 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
249 brw_imm_ud(ir
->texture_offset
));
253 uint32_t surf_index
=
254 c
->prog_data
.base
.binding_table
.texture_start
+ ir
->sampler
;
256 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
257 gen8_set_dst(brw
, inst
, dst
);
258 gen8_set_src0(brw
, inst
, src
);
259 gen8_set_sampler_message(brw
, inst
,
268 mark_surface_used(surf_index
);
272 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
275 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
277 * and we're trying to produce:
280 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
281 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
282 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
283 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
284 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
285 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
286 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
287 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
289 * and add another set of two more subspans if in 16-pixel dispatch mode.
291 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
292 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
293 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
294 * between each other. We could probably do it like ddx and swizzle the right
295 * order later, but bail for now and just produce
296 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
299 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
303 unsigned vstride
, width
;
305 if (c
->key
.high_quality_derivatives
) {
306 /* Produce accurate derivatives. */
307 vstride
= BRW_VERTICAL_STRIDE_2
;
310 /* Replicate the derivative at the top-left pixel to other pixels. */
311 vstride
= BRW_VERTICAL_STRIDE_4
;
315 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
319 BRW_HORIZONTAL_STRIDE_0
,
320 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
321 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
325 BRW_HORIZONTAL_STRIDE_0
,
326 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
327 ADD(dst
, src0
, negate(src1
));
330 /* The negate_value boolean is used to negate the derivative computation for
331 * FBOs, since they place the origin at the upper left instead of the lower
335 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
341 unsigned src0_swizzle
;
342 unsigned src1_swizzle
;
345 if (c
->key
.high_quality_derivatives
) {
346 /* Produce accurate derivatives. */
347 hstride
= BRW_HORIZONTAL_STRIDE_1
;
348 src0_swizzle
= BRW_SWIZZLE_XYXY
;
349 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
352 default_state
.access_mode
= BRW_ALIGN_16
;
354 /* Replicate the derivative at the top-left pixel to other pixels. */
355 hstride
= BRW_HORIZONTAL_STRIDE_0
;
356 src0_swizzle
= BRW_SWIZZLE_XYZW
;
357 src1_swizzle
= BRW_SWIZZLE_XYZW
;
361 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
363 BRW_VERTICAL_STRIDE_4
,
366 src0_swizzle
, WRITEMASK_XYZW
);
367 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
369 BRW_VERTICAL_STRIDE_4
,
372 src1_swizzle
, WRITEMASK_XYZW
);
375 ADD(dst
, src1
, negate(src0
));
377 ADD(dst
, src0
, negate(src1
));
379 default_state
.access_mode
= BRW_ALIGN_1
;
383 gen8_fs_generator::generate_scratch_write(fs_inst
*inst
, struct brw_reg dst
)
385 assert(inst
->mlen
!= 0);
386 assert(!"TODO: Implement generate_scratch_write.");
390 gen8_fs_generator::generate_scratch_read(fs_inst
*inst
, struct brw_reg dst
)
392 assert(inst
->mlen
!= 0);
393 assert(!"TODO: Implement generate_scratch_read.");
397 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*inst
, struct brw_reg dst
)
399 assert(inst
->mlen
!= 0);
400 assert(!"TODO: Implement generate_scratch_read_gen7.");
404 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
406 struct brw_reg index
,
407 struct brw_reg offset
)
409 assert(inst
->mlen
== 0);
411 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
412 index
.type
== BRW_REGISTER_TYPE_UD
);
413 uint32_t surf_index
= index
.dw1
.ud
;
415 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
416 /* Reference only the dword we need lest we anger validate_reg() with
417 * reg.width > reg.execsize.
419 offset
= brw_vec1_grf(offset
.nr
, 0);
421 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
422 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
424 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
425 * the destination loaded consecutively from the same offset (which appears
426 * in the first component, and the rest are ignored).
428 dst
.width
= BRW_WIDTH_4
;
429 gen8_set_dst(brw
, send
, dst
);
430 gen8_set_src0(brw
, send
, offset
);
431 gen8_set_sampler_message(brw
, send
,
433 0, /* The LD message ignores the sampler unit. */
434 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
437 false, /* no header */
438 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
440 mark_surface_used(surf_index
);
444 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
446 struct brw_reg index
,
447 struct brw_reg offset
)
449 /* Varying-offset pull constant loads are treated as a normal expression on
450 * gen7, so the fact that it's a send message is hidden at the IR level.
452 assert(!ir
->header_present
);
455 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
456 index
.type
== BRW_REGISTER_TYPE_UD
);
457 uint32_t surf_index
= index
.dw1
.ud
;
459 uint32_t simd_mode
, rlen
, mlen
;
460 if (dispatch_width
== 16) {
463 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
467 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
470 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
471 gen8_set_dst(brw
, send
, dst
);
472 gen8_set_src0(brw
, send
, offset
);
473 gen8_set_sampler_message(brw
, send
,
475 0, /* The LD message ignores the sampler unit. */
476 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
479 false, /* no header */
482 mark_surface_used(surf_index
);
486 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
487 * into the flags register (f0.0).
490 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
492 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
493 struct brw_reg dispatch_mask
=
494 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
496 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
497 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
501 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
503 /* This HALT will be patched up at FB write time to point UIP at the end of
504 * the program, and at brw_uip_jip() JIP will be set to the end of the
505 * current block (or the program).
507 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
513 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
515 if (discard_halt_patches
.is_empty())
518 /* There is a somewhat strange undocumented requirement of using
519 * HALT, according to the simulator. If some channel has HALTed to
520 * a particular UIP, then by the end of the program, every channel
521 * must have HALTed to that UIP. Furthermore, the tracking is a
522 * stack, so you can't do the final halt of a UIP after starting
523 * halting to a new UIP.
525 * Symptoms of not emitting this instruction on actual hardware
526 * included GPU hangs and sparkly rendering on the piglit discard
529 gen8_instruction
*last_halt
= HALT();
530 gen8_set_uip(last_halt
, 16);
531 gen8_set_jip(last_halt
, 16);
535 foreach_list(node
, &discard_halt_patches
) {
536 ip_record
*patch_ip
= (ip_record
*) node
;
537 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
538 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
540 /* HALT takes an instruction distance from the pre-incremented IP. */
541 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
544 this->discard_halt_patches
.make_empty();
548 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
549 * sampler LD messages.
551 * We don't want to bake it into the send message's code generation because
552 * that means we don't get a chance to schedule the instruction.
555 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
557 struct brw_reg value
)
559 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
560 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
564 gen8_fs_generator::generate_code(exec_list
*instructions
)
566 int last_native_inst_offset
= next_inst_offset
;
567 const char *last_annotation_string
= NULL
;
568 const void *last_annotation_ir
= NULL
;
570 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
572 printf("Native code for fragment shader %d (SIMD%d dispatch):\n",
573 shader_prog
->Name
, dispatch_width
);
575 printf("Native code for fragment program %d (SIMD%d dispatch):\n",
576 prog
->Id
, dispatch_width
);
578 printf("Native code for blorp program (SIMD%d dispatch):\n",
584 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
585 cfg
= new(mem_ctx
) cfg_t(instructions
);
587 foreach_list(node
, instructions
) {
588 fs_inst
*ir
= (fs_inst
*) node
;
589 struct brw_reg src
[3], dst
;
591 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
592 foreach_list(node
, &cfg
->block_list
) {
593 bblock_link
*link
= (bblock_link
*)node
;
594 bblock_t
*block
= link
->block
;
596 if (block
->start
== ir
) {
597 printf(" START B%d", block
->block_num
);
598 foreach_list(predecessor_node
, &block
->parents
) {
599 bblock_link
*predecessor_link
=
600 (bblock_link
*)predecessor_node
;
601 bblock_t
*predecessor_block
= predecessor_link
->block
;
602 printf(" <-B%d", predecessor_block
->block_num
);
608 if (last_annotation_ir
!= ir
->ir
) {
609 last_annotation_ir
= ir
->ir
;
610 if (last_annotation_ir
) {
613 ((ir_instruction
*) ir
->ir
)->print();
615 const prog_instruction
*fpi
;
616 fpi
= (const prog_instruction
*) ir
->ir
;
617 printf("%d: ", (int)(fpi
- prog
->Instructions
));
618 _mesa_fprint_instruction_opt(stdout
,
620 0, PROG_PRINT_DEBUG
, NULL
);
625 if (last_annotation_string
!= ir
->annotation
) {
626 last_annotation_string
= ir
->annotation
;
627 if (last_annotation_string
)
628 printf(" %s\n", last_annotation_string
);
632 for (unsigned int i
= 0; i
< 3; i
++) {
633 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
635 /* The accumulator result appears to get used for the
636 * conditional modifier generation. When negating a UD
637 * value, there is a 33rd bit generated for the sign in the
638 * accumulator value, so now you can't check, for example,
639 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
641 assert(!ir
->conditional_mod
||
642 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
645 dst
= brw_reg_from_fs_reg(&ir
->dst
);
647 default_state
.conditional_mod
= ir
->conditional_mod
;
648 default_state
.predicate
= ir
->predicate
;
649 default_state
.predicate_inverse
= ir
->predicate_inverse
;
650 default_state
.saturate
= ir
->saturate
;
651 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
653 if (dispatch_width
== 16 && !ir
->force_uncompressed
)
654 default_state
.exec_size
= BRW_EXECUTE_16
;
656 default_state
.exec_size
= BRW_EXECUTE_8
;
658 /* fs_inst::force_sechalf is only used for original Gen4 code, so we
659 * don't handle it. Add qtr_control to default_state if that changes.
661 assert(!ir
->force_sechalf
);
663 switch (ir
->opcode
) {
668 ADD(dst
, src
[0], src
[1]);
671 MUL(dst
, src
[0], src
[1]);
673 case BRW_OPCODE_MACH
:
674 MACH(dst
, src
[0], src
[1]);
678 default_state
.access_mode
= BRW_ALIGN_16
;
679 MAD(dst
, src
[0], src
[1], src
[2]);
680 default_state
.access_mode
= BRW_ALIGN_1
;
684 default_state
.access_mode
= BRW_ALIGN_16
;
685 LRP(dst
, src
[0], src
[1], src
[2]);
686 default_state
.access_mode
= BRW_ALIGN_1
;
693 case BRW_OPCODE_RNDD
:
696 case BRW_OPCODE_RNDE
:
699 case BRW_OPCODE_RNDZ
:
704 AND(dst
, src
[0], src
[1]);
707 OR(dst
, src
[0], src
[1]);
710 XOR(dst
, src
[0], src
[1]);
716 ASR(dst
, src
[0], src
[1]);
719 SHR(dst
, src
[0], src
[1]);
722 SHL(dst
, src
[0], src
[1]);
725 case BRW_OPCODE_F32TO16
:
726 F32TO16(dst
, src
[0]);
728 case BRW_OPCODE_F16TO32
:
729 F16TO32(dst
, src
[0]);
733 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
736 SEL(dst
, src
[0], src
[1]);
739 case BRW_OPCODE_BFREV
:
740 /* BFREV only supports UD type for src and dst. */
741 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
742 retype(src
[0], BRW_REGISTER_TYPE_UD
));
746 /* FBH only supports UD type for dst. */
747 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
751 /* FBL only supports UD type for dst. */
752 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
755 case BRW_OPCODE_CBIT
:
756 /* CBIT only supports UD type for dst. */
757 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
760 case BRW_OPCODE_ADDC
:
761 ADDC(dst
, src
[0], src
[1]);
764 case BRW_OPCODE_SUBB
:
765 SUBB(dst
, src
[0], src
[1]);
769 default_state
.access_mode
= BRW_ALIGN_16
;
770 BFE(dst
, src
[0], src
[1], src
[2]);
771 default_state
.access_mode
= BRW_ALIGN_1
;
774 case BRW_OPCODE_BFI1
:
775 BFI1(dst
, src
[0], src
[1]);
778 case BRW_OPCODE_BFI2
:
779 default_state
.access_mode
= BRW_ALIGN_16
;
780 BFI2(dst
, src
[0], src
[1], src
[2]);
781 default_state
.access_mode
= BRW_ALIGN_1
;
785 IF(BRW_PREDICATE_NORMAL
);
788 case BRW_OPCODE_ELSE
:
792 case BRW_OPCODE_ENDIF
:
800 case BRW_OPCODE_BREAK
:
804 case BRW_OPCODE_CONTINUE
:
808 case BRW_OPCODE_WHILE
:
812 case SHADER_OPCODE_RCP
:
813 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
816 case SHADER_OPCODE_RSQ
:
817 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
820 case SHADER_OPCODE_SQRT
:
821 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
824 case SHADER_OPCODE_EXP2
:
825 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
828 case SHADER_OPCODE_LOG2
:
829 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
832 case SHADER_OPCODE_SIN
:
833 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
836 case SHADER_OPCODE_COS
:
837 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
840 case SHADER_OPCODE_INT_QUOTIENT
:
841 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
844 case SHADER_OPCODE_INT_REMAINDER
:
845 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
848 case SHADER_OPCODE_POW
:
849 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
852 case FS_OPCODE_PIXEL_X
:
853 case FS_OPCODE_PIXEL_Y
:
854 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
857 case FS_OPCODE_CINTERP
:
860 case FS_OPCODE_LINTERP
:
861 generate_linterp(ir
, dst
, src
);
863 case SHADER_OPCODE_TEX
:
865 case SHADER_OPCODE_TXD
:
866 case SHADER_OPCODE_TXF
:
867 case SHADER_OPCODE_TXF_CMS
:
868 case SHADER_OPCODE_TXF_UMS
:
869 case SHADER_OPCODE_TXF_MCS
:
870 case SHADER_OPCODE_TXL
:
871 case SHADER_OPCODE_TXS
:
872 case SHADER_OPCODE_LOD
:
873 case SHADER_OPCODE_TG4
:
874 case SHADER_OPCODE_TG4_OFFSET
:
875 generate_tex(ir
, dst
, src
[0]);
879 generate_ddx(ir
, dst
, src
[0]);
882 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
883 * guarantee that c->key.render_to_fbo is set).
885 assert(fp
->UsesDFdy
);
886 generate_ddy(ir
, dst
, src
[0], c
->key
.render_to_fbo
);
889 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
890 generate_scratch_write(ir
, src
[0]);
893 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
894 generate_scratch_read(ir
, dst
);
897 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
898 generate_scratch_read_gen7(ir
, dst
);
901 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
902 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
905 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
906 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
909 case FS_OPCODE_FB_WRITE
:
910 generate_fb_write(ir
);
913 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
914 generate_mov_dispatch_to_flags(ir
);
917 case FS_OPCODE_DISCARD_JUMP
:
918 generate_discard_jump(ir
);
921 case SHADER_OPCODE_SHADER_TIME_ADD
:
922 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
925 case SHADER_OPCODE_UNTYPED_ATOMIC
:
926 assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
929 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
930 assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
933 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
934 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
937 case FS_OPCODE_SET_OMASK
:
938 assert(!"XXX: Missing Gen8 scalar support for SET_OMASK");
941 case FS_OPCODE_SET_SAMPLE_ID
:
942 assert(!"XXX: Missing Gen8 scalar support for SET_SAMPLE_ID");
945 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
946 assert(!"XXX: Missing Gen8 scalar support for PACK_HALF_2x16_SPLIT");
949 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
950 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
951 assert(!"XXX: Missing Gen8 scalar support for UNPACK_HALF_2x16_SPLIT");
954 case FS_OPCODE_PLACEHOLDER_HALT
:
955 /* This is the place where the final HALT needs to be inserted if
956 * we've emitted any discards. If not, this will emit no code.
958 patch_discard_jumps_to_fb_writes();
962 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
963 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
964 opcode_descs
[ir
->opcode
].name
);
966 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
971 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
972 disassemble(stdout
, last_native_inst_offset
, next_inst_offset
);
974 foreach_list(node
, &cfg
->block_list
) {
975 bblock_link
*link
= (bblock_link
*)node
;
976 bblock_t
*block
= link
->block
;
978 if (block
->end
== ir
) {
979 printf(" END B%d", block
->block_num
);
980 foreach_list(successor_node
, &block
->children
) {
981 bblock_link
*successor_link
=
982 (bblock_link
*)successor_node
;
983 bblock_t
*successor_block
= successor_link
->block
;
984 printf(" ->B%d", successor_block
->block_num
);
991 last_native_inst_offset
= next_inst_offset
;
994 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
998 patch_jump_targets();
1002 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1003 exec_list
*simd16_instructions
,
1004 unsigned *assembly_size
)
1006 assert(simd8_instructions
|| simd16_instructions
);
1008 if (simd8_instructions
) {
1010 generate_code(simd8_instructions
);
1013 if (simd16_instructions
) {
1014 /* Align to a 64-byte boundary. */
1015 while ((nr_inst
* sizeof(gen8_instruction
)) % 64)
1018 /* Save off the start of this SIMD16 program */
1019 c
->prog_data
.prog_offset_16
= nr_inst
* sizeof(gen8_instruction
);
1021 dispatch_width
= 16;
1022 generate_code(simd16_instructions
);
1025 *assembly_size
= next_inst_offset
;
1026 return (const unsigned *) store
;