/*
 * Copyright © 2010, 2011, 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/** @file gen8_fs_generate.cpp
 *
 * Code generation for Gen8+ hardware.
 */
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
39 struct brw_wm_compile
*c
,
40 struct gl_shader_program
*shader_prog
,
41 struct gl_fragment_program
*fp
,
42 bool dual_source_output
)
43 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, c
), c(c
), fp(fp
),
44 dual_source_output(dual_source_output
)
47 shader_prog
? shader_prog
->_LinkedShaders
[MESA_SHADER_FRAGMENT
] : NULL
;
50 gen8_fs_generator::~gen8_fs_generator()
55 gen8_fs_generator::mark_surface_used(unsigned surf_index
)
57 assert(surf_index
< BRW_MAX_SURFACES
);
59 c
->prog_data
.base
.binding_table
.size_bytes
=
60 MAX2(c
->prog_data
.base
.binding_table
.size_bytes
, (surf_index
+ 1) * 4);
64 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
66 if (fp
&& fp
->UsesKill
) {
67 gen8_instruction
*mov
=
68 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
70 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
73 if (ir
->header_present
) {
74 gen8_instruction
*mov
=
75 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
76 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
78 if (ir
->target
> 0 && c
->key
.replicate_alpha
) {
79 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
80 OR(vec1(retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
)),
81 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
86 /* Set the render target index for choosing BLEND_STATE. */
87 MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
88 BRW_REGISTER_TYPE_UD
),
89 brw_imm_ud(ir
->target
));
93 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
94 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
95 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
97 /* Set up the "Message Specific Control" bits for the Data Port Message
98 * Descriptor. These are documented in the "Render Target Write" message's
99 * "Message Descriptor" documentation (vol5c.2).
102 /* Set the Message Type */
103 if (this->dual_source_output
)
104 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
105 else if (dispatch_width
== 16)
106 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
108 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
110 uint32_t msg_control
= msg_type
;
112 /* "Last Render Target Select" must be set on all writes to the last of
113 * the render targets (if using MRT), or always for a single RT scenario.
115 if ((ir
->target
== c
->key
.nr_color_regions
- 1) || !c
->key
.nr_color_regions
)
116 msg_control
|= (1 << 4); /* Last Render Target Select */
118 uint32_t surf_index
=
119 c
->prog_data
.binding_table
.render_target_start
+ ir
->target
;
121 gen8_set_dp_message(brw
, inst
,
122 GEN6_SFID_DATAPORT_RENDER_CACHE
,
124 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
131 mark_surface_used(surf_index
);
135 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
139 struct brw_reg delta_x
= src
[0];
140 struct brw_reg delta_y
= src
[1];
141 struct brw_reg interp
= src
[2];
144 assert(delta_y
.nr
== delta_x
.nr
+ 1);
145 PLN(dst
, interp
, delta_x
);
149 gen8_fs_generator::generate_tex(fs_inst
*ir
,
155 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
157 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
159 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
160 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
162 switch (ir
->opcode
) {
163 case SHADER_OPCODE_TEX
:
164 if (ir
->shadow_compare
) {
165 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
167 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
171 if (ir
->shadow_compare
) {
172 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
174 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
177 case SHADER_OPCODE_TXL
:
178 if (ir
->shadow_compare
) {
179 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
181 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
184 case SHADER_OPCODE_TXS
:
185 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
187 case SHADER_OPCODE_TXD
:
188 if (ir
->shadow_compare
) {
189 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
191 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
194 case SHADER_OPCODE_TXF
:
195 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
197 case SHADER_OPCODE_TXF_CMS
:
198 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
200 case SHADER_OPCODE_TXF_MCS
:
201 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
203 case SHADER_OPCODE_LOD
:
204 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
206 case SHADER_OPCODE_TG4
:
207 if (ir
->shadow_compare
) {
208 assert(brw
->gen
>= 7);
209 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
211 assert(brw
->gen
>= 6);
212 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
215 case SHADER_OPCODE_TG4_OFFSET
:
216 assert(brw
->gen
>= 7);
217 if (ir
->shadow_compare
) {
218 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
220 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
224 assert(!"not reached");
227 assert(msg_type
!= -1);
229 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
234 if (ir
->header_present
) {
235 /* The send-from-GRF for SIMD16 texturing with a header has an extra
236 * hardware register allocated to it, which we need to skip over (since
237 * our coordinates in the payload are in the even-numbered registers,
238 * and the header comes right before the first one.
240 if (dispatch_width
== 16)
243 MOV_RAW(src
, brw_vec8_grf(0, 0));
245 if (ir
->texture_offset
) {
246 /* Set the texel offset bits. */
247 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
248 brw_imm_ud(ir
->texture_offset
));
252 uint32_t surf_index
=
253 c
->prog_data
.base
.binding_table
.texture_start
+ ir
->sampler
;
255 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
256 gen8_set_dst(brw
, inst
, dst
);
257 gen8_set_src0(brw
, inst
, src
);
258 gen8_set_sampler_message(brw
, inst
,
267 mark_surface_used(surf_index
);
271 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
274 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
276 * and we're trying to produce:
279 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
280 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
281 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
282 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
283 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
284 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
285 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
286 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
288 * and add another set of two more subspans if in 16-pixel dispatch mode.
290 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
291 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
292 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
293 * between each other. We could probably do it like ddx and swizzle the right
294 * order later, but bail for now and just produce
295 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
298 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
302 unsigned vstride
, width
;
304 if (c
->key
.high_quality_derivatives
) {
305 /* Produce accurate derivatives. */
306 vstride
= BRW_VERTICAL_STRIDE_2
;
309 /* Replicate the derivative at the top-left pixel to other pixels. */
310 vstride
= BRW_VERTICAL_STRIDE_4
;
314 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
318 BRW_HORIZONTAL_STRIDE_0
,
319 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
320 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
324 BRW_HORIZONTAL_STRIDE_0
,
325 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
326 ADD(dst
, src0
, negate(src1
));
329 /* The negate_value boolean is used to negate the derivative computation for
330 * FBOs, since they place the origin at the upper left instead of the lower
334 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
340 unsigned src0_swizzle
;
341 unsigned src1_swizzle
;
344 if (c
->key
.high_quality_derivatives
) {
345 /* Produce accurate derivatives. */
346 hstride
= BRW_HORIZONTAL_STRIDE_1
;
347 src0_swizzle
= BRW_SWIZZLE_XYXY
;
348 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
351 default_state
.access_mode
= BRW_ALIGN_16
;
353 /* Replicate the derivative at the top-left pixel to other pixels. */
354 hstride
= BRW_HORIZONTAL_STRIDE_0
;
355 src0_swizzle
= BRW_SWIZZLE_XYZW
;
356 src1_swizzle
= BRW_SWIZZLE_XYZW
;
360 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
362 BRW_VERTICAL_STRIDE_4
,
365 src0_swizzle
, WRITEMASK_XYZW
);
366 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
368 BRW_VERTICAL_STRIDE_4
,
371 src1_swizzle
, WRITEMASK_XYZW
);
374 ADD(dst
, src1
, negate(src0
));
376 ADD(dst
, src0
, negate(src1
));
378 default_state
.access_mode
= BRW_ALIGN_1
;
382 gen8_fs_generator::generate_scratch_write(fs_inst
*inst
, struct brw_reg dst
)
384 assert(inst
->mlen
!= 0);
385 assert(!"TODO: Implement generate_scratch_write.");
389 gen8_fs_generator::generate_scratch_read(fs_inst
*inst
, struct brw_reg dst
)
391 assert(inst
->mlen
!= 0);
392 assert(!"TODO: Implement generate_scratch_read.");
396 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*inst
, struct brw_reg dst
)
398 assert(inst
->mlen
!= 0);
399 assert(!"TODO: Implement generate_scratch_read_gen7.");
403 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
405 struct brw_reg index
,
406 struct brw_reg offset
)
408 assert(inst
->mlen
== 0);
410 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
411 index
.type
== BRW_REGISTER_TYPE_UD
);
412 uint32_t surf_index
= index
.dw1
.ud
;
414 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
415 /* Reference only the dword we need lest we anger validate_reg() with
416 * reg.width > reg.execszie.
418 offset
= brw_vec1_grf(offset
.nr
, 0);
420 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
421 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
423 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
424 * the destination loaded consecutively from the same offset (which appears
425 * in the first component, and the rest are ignored).
427 dst
.width
= BRW_WIDTH_4
;
428 gen8_set_dst(brw
, send
, dst
);
429 gen8_set_src0(brw
, send
, offset
);
430 gen8_set_sampler_message(brw
, send
,
432 0, /* The LD message ignores the sampler unit. */
433 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
436 false, /* no header */
437 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
439 mark_surface_used(surf_index
);
443 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
445 struct brw_reg index
,
446 struct brw_reg offset
)
448 /* Varying-offset pull constant loads are treated as a normal expression on
449 * gen7, so the fact that it's a send message is hidden at the IR level.
451 assert(!ir
->header_present
);
454 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
455 index
.type
== BRW_REGISTER_TYPE_UD
);
456 uint32_t surf_index
= index
.dw1
.ud
;
458 uint32_t simd_mode
, rlen
, mlen
;
459 if (dispatch_width
== 16) {
462 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
466 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
469 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
470 gen8_set_dst(brw
, send
, dst
);
471 gen8_set_src0(brw
, send
, offset
);
472 gen8_set_sampler_message(brw
, send
,
474 0, /* The LD message ignore the sampler unit. */
475 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
478 false, /* no header */
481 mark_surface_used(surf_index
);
485 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
486 * into the flags register (f0.0).
489 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
491 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
492 struct brw_reg dispatch_mask
=
493 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
495 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
496 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
500 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
502 /* This HALT will be patched up at FB write time to point UIP at the end of
503 * the program, and at brw_uip_jip() JIP will be set to the end of the
504 * current block (or the program).
506 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
512 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
514 if (discard_halt_patches
.is_empty())
517 /* There is a somewhat strange undocumented requirement of using
518 * HALT, according to the simulator. If some channel has HALTed to
519 * a particular UIP, then by the end of the program, every channel
520 * must have HALTed to that UIP. Furthermore, the tracking is a
521 * stack, so you can't do the final halt of a UIP after starting
522 * halting to a new UIP.
524 * Symptoms of not emitting this instruction on actual hardware
525 * included GPU hangs and sparkly rendering on the piglit discard
528 gen8_instruction
*last_halt
= HALT();
529 gen8_set_uip(last_halt
, 16);
530 gen8_set_jip(last_halt
, 16);
534 foreach_list(node
, &discard_halt_patches
) {
535 ip_record
*patch_ip
= (ip_record
*) node
;
536 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
537 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
539 /* HALT takes an instruction distance from the pre-incremented IP. */
540 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
543 this->discard_halt_patches
.make_empty();
547 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
548 * sampler LD messages.
550 * We don't want to bake it into the send message's code generation because
551 * that means we don't get a chance to schedule the instruction.
554 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
556 struct brw_reg value
)
558 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
559 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
563 gen8_fs_generator::generate_code(exec_list
*instructions
)
565 int last_native_inst_offset
= next_inst_offset
;
566 const char *last_annotation_string
= NULL
;
567 const void *last_annotation_ir
= NULL
;
569 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
571 printf("Native code for fragment shader %d (SIMD%d dispatch):\n",
572 shader_prog
->Name
, dispatch_width
);
574 printf("Native code for fragment program %d (SIMD%d dispatch):\n",
575 prog
->Id
, dispatch_width
);
577 printf("Native code for blorp program (SIMD%d dispatch):\n",
583 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
584 cfg
= new(mem_ctx
) cfg_t(instructions
);
586 foreach_list(node
, instructions
) {
587 fs_inst
*ir
= (fs_inst
*) node
;
588 struct brw_reg src
[3], dst
;
590 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
591 foreach_list(node
, &cfg
->block_list
) {
592 bblock_link
*link
= (bblock_link
*)node
;
593 bblock_t
*block
= link
->block
;
595 if (block
->start
== ir
) {
596 printf(" START B%d", block
->block_num
);
597 foreach_list(predecessor_node
, &block
->parents
) {
598 bblock_link
*predecessor_link
=
599 (bblock_link
*)predecessor_node
;
600 bblock_t
*predecessor_block
= predecessor_link
->block
;
601 printf(" <-B%d", predecessor_block
->block_num
);
607 if (last_annotation_ir
!= ir
->ir
) {
608 last_annotation_ir
= ir
->ir
;
609 if (last_annotation_ir
) {
612 ((ir_instruction
*) ir
->ir
)->print();
614 const prog_instruction
*fpi
;
615 fpi
= (const prog_instruction
*) ir
->ir
;
616 printf("%d: ", (int)(fpi
- prog
->Instructions
));
617 _mesa_fprint_instruction_opt(stdout
,
619 0, PROG_PRINT_DEBUG
, NULL
);
624 if (last_annotation_string
!= ir
->annotation
) {
625 last_annotation_string
= ir
->annotation
;
626 if (last_annotation_string
)
627 printf(" %s\n", last_annotation_string
);
631 for (unsigned int i
= 0; i
< 3; i
++) {
632 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
634 /* The accumulator result appears to get used for the
635 * conditional modifier generation. When negating a UD
636 * value, there is a 33rd bit generated for the sign in the
637 * accumulator value, so now you can't check, for example,
638 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
640 assert(!ir
->conditional_mod
||
641 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
644 dst
= brw_reg_from_fs_reg(&ir
->dst
);
646 default_state
.conditional_mod
= ir
->conditional_mod
;
647 default_state
.predicate
= ir
->predicate
;
648 default_state
.predicate_inverse
= ir
->predicate_inverse
;
649 default_state
.saturate
= ir
->saturate
;
650 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
652 if (dispatch_width
== 16 && !ir
->force_uncompressed
)
653 default_state
.exec_size
= BRW_EXECUTE_16
;
655 default_state
.exec_size
= BRW_EXECUTE_8
;
657 /* fs_inst::force_sechalf is only used for original Gen4 code, so we
658 * don't handle it. Add qtr_control to default_state if that changes.
660 assert(!ir
->force_sechalf
);
662 switch (ir
->opcode
) {
667 ADD(dst
, src
[0], src
[1]);
670 MUL(dst
, src
[0], src
[1]);
672 case BRW_OPCODE_MACH
:
673 MACH(dst
, src
[0], src
[1]);
677 default_state
.access_mode
= BRW_ALIGN_16
;
678 MAD(dst
, src
[0], src
[1], src
[2]);
679 default_state
.access_mode
= BRW_ALIGN_1
;
683 default_state
.access_mode
= BRW_ALIGN_16
;
684 LRP(dst
, src
[0], src
[1], src
[2]);
685 default_state
.access_mode
= BRW_ALIGN_1
;
692 case BRW_OPCODE_RNDD
:
695 case BRW_OPCODE_RNDE
:
698 case BRW_OPCODE_RNDZ
:
703 AND(dst
, src
[0], src
[1]);
706 OR(dst
, src
[0], src
[1]);
709 XOR(dst
, src
[0], src
[1]);
715 ASR(dst
, src
[0], src
[1]);
718 SHR(dst
, src
[0], src
[1]);
721 SHL(dst
, src
[0], src
[1]);
724 case BRW_OPCODE_F32TO16
:
725 F32TO16(dst
, src
[0]);
727 case BRW_OPCODE_F16TO32
:
728 F16TO32(dst
, src
[0]);
732 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
735 SEL(dst
, src
[0], src
[1]);
738 case BRW_OPCODE_BFREV
:
739 /* BFREV only supports UD type for src and dst. */
740 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
741 retype(src
[0], BRW_REGISTER_TYPE_UD
));
745 /* FBH only supports UD type for dst. */
746 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
750 /* FBL only supports UD type for dst. */
751 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
754 case BRW_OPCODE_CBIT
:
755 /* CBIT only supports UD type for dst. */
756 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
759 case BRW_OPCODE_ADDC
:
760 ADDC(dst
, src
[0], src
[1]);
763 case BRW_OPCODE_SUBB
:
764 SUBB(dst
, src
[0], src
[1]);
768 default_state
.access_mode
= BRW_ALIGN_16
;
769 BFE(dst
, src
[0], src
[1], src
[2]);
770 default_state
.access_mode
= BRW_ALIGN_1
;
773 case BRW_OPCODE_BFI1
:
774 BFI1(dst
, src
[0], src
[1]);
777 case BRW_OPCODE_BFI2
:
778 default_state
.access_mode
= BRW_ALIGN_16
;
779 BFI2(dst
, src
[0], src
[1], src
[2]);
780 default_state
.access_mode
= BRW_ALIGN_1
;
784 IF(BRW_PREDICATE_NORMAL
);
787 case BRW_OPCODE_ELSE
:
791 case BRW_OPCODE_ENDIF
:
799 case BRW_OPCODE_BREAK
:
803 case BRW_OPCODE_CONTINUE
:
807 case BRW_OPCODE_WHILE
:
811 case SHADER_OPCODE_RCP
:
812 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
815 case SHADER_OPCODE_RSQ
:
816 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
819 case SHADER_OPCODE_SQRT
:
820 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
823 case SHADER_OPCODE_EXP2
:
824 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
827 case SHADER_OPCODE_LOG2
:
828 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
831 case SHADER_OPCODE_SIN
:
832 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
835 case SHADER_OPCODE_COS
:
836 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
839 case SHADER_OPCODE_INT_QUOTIENT
:
840 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
843 case SHADER_OPCODE_INT_REMAINDER
:
844 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
847 case SHADER_OPCODE_POW
:
848 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
851 case FS_OPCODE_PIXEL_X
:
852 case FS_OPCODE_PIXEL_Y
:
853 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
856 case FS_OPCODE_CINTERP
:
859 case FS_OPCODE_LINTERP
:
860 generate_linterp(ir
, dst
, src
);
862 case SHADER_OPCODE_TEX
:
864 case SHADER_OPCODE_TXD
:
865 case SHADER_OPCODE_TXF
:
866 case SHADER_OPCODE_TXF_CMS
:
867 case SHADER_OPCODE_TXF_MCS
:
868 case SHADER_OPCODE_TXL
:
869 case SHADER_OPCODE_TXS
:
870 case SHADER_OPCODE_LOD
:
871 case SHADER_OPCODE_TG4
:
872 case SHADER_OPCODE_TG4_OFFSET
:
873 generate_tex(ir
, dst
, src
[0]);
877 generate_ddx(ir
, dst
, src
[0]);
880 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
881 * guarantee that c->key.render_to_fbo is set).
883 assert(fp
->UsesDFdy
);
884 generate_ddy(ir
, dst
, src
[0], c
->key
.render_to_fbo
);
887 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
888 generate_scratch_write(ir
, src
[0]);
891 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
892 generate_scratch_read(ir
, dst
);
895 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
896 generate_scratch_read_gen7(ir
, dst
);
899 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
900 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
903 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
904 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
907 case FS_OPCODE_FB_WRITE
:
908 generate_fb_write(ir
);
911 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
912 generate_mov_dispatch_to_flags(ir
);
915 case FS_OPCODE_DISCARD_JUMP
:
916 generate_discard_jump(ir
);
919 case SHADER_OPCODE_SHADER_TIME_ADD
:
920 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
923 case SHADER_OPCODE_UNTYPED_ATOMIC
:
924 assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
927 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
928 assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
931 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
932 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
935 case FS_OPCODE_SET_OMASK
:
936 assert(!"XXX: Missing Gen8 scalar support for SET_OMASK");
939 case FS_OPCODE_SET_SAMPLE_ID
:
940 assert(!"XXX: Missing Gen8 scalar support for SET_SAMPLE_ID");
943 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
944 assert(!"XXX: Missing Gen8 scalar support for PACK_HALF_2x16_SPLIT");
947 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
948 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
949 assert(!"XXX: Missing Gen8 scalar support for UNPACK_HALF_2x16_SPLIT");
952 case FS_OPCODE_PLACEHOLDER_HALT
:
953 /* This is the place where the final HALT needs to be inserted if
954 * we've emitted any discards. If not, this will emit no code.
956 patch_discard_jumps_to_fb_writes();
960 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
961 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
962 opcode_descs
[ir
->opcode
].name
);
964 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
969 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
970 disassemble(stdout
, last_native_inst_offset
, next_inst_offset
);
972 foreach_list(node
, &cfg
->block_list
) {
973 bblock_link
*link
= (bblock_link
*)node
;
974 bblock_t
*block
= link
->block
;
976 if (block
->end
== ir
) {
977 printf(" END B%d", block
->block_num
);
978 foreach_list(successor_node
, &block
->children
) {
979 bblock_link
*successor_link
=
980 (bblock_link
*)successor_node
;
981 bblock_t
*successor_block
= successor_link
->block
;
982 printf(" ->B%d", successor_block
->block_num
);
989 last_native_inst_offset
= next_inst_offset
;
992 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
996 patch_jump_targets();
1000 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1001 exec_list
*simd16_instructions
,
1002 unsigned *assembly_size
)
1004 assert(simd8_instructions
|| simd16_instructions
);
1006 if (simd8_instructions
) {
1008 generate_code(simd8_instructions
);
1011 if (simd16_instructions
) {
1012 /* Align to a 64-byte boundary. */
1013 while ((nr_inst
* sizeof(gen8_instruction
)) % 64)
1016 /* Save off the start of this SIMD16 program */
1017 c
->prog_data
.prog_offset_16
= nr_inst
* sizeof(gen8_instruction
);
1019 dispatch_width
= 16;
1020 generate_code(simd16_instructions
);
1023 *assembly_size
= next_inst_offset
;
1024 return (const unsigned *) store
;