2 * Copyright © 2010, 2011, 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file gen8_fs_generate.cpp
26 * Code generation for Gen8+ hardware.
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
/* Constructor: forwards brw, shader_prog, &fp->Base (when fp is non-NULL) and
 * the compile state to the gen8_generator base, then caches c, fp, and the
 * dual-source-blending flag.
 * NOTE(review): this chunk is line-mangled and some physical lines are missing
 * — e.g. the assignment target of the trailing ?: expression (presumably a
 * `shader =` member assignment) is not visible. Restore from upstream before
 * building; tokens below are preserved verbatim.
 */
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
39 struct brw_wm_compile
*c
,
40 struct gl_shader_program
*shader_prog
,
41 struct gl_fragment_program
*fp
,
42 bool dual_source_output
)
43 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, c
), c(c
), fp(fp
),
44 dual_source_output(dual_source_output
)
/* Looks up the linked fragment shader stage when a GLSL program is present;
 * NULL for fixed-function/ARB programs. */
47 shader_prog
? shader_prog
->_LinkedShaders
[MESA_SHADER_FRAGMENT
] : NULL
;
/* Destructor — body not visible in this chunk (presumably empty; the class
 * owns no resources beyond what the base class manages). TODO confirm. */
50 gen8_fs_generator::~gen8_fs_generator()
55 gen8_fs_generator::mark_surface_used(unsigned surf_index
)
57 assert(surf_index
< BRW_MAX_SURFACES
);
59 c
->prog_data
.base
.binding_table
.size_bytes
=
60 MAX2(c
->prog_data
.base
.binding_table
.size_bytes
, (surf_index
+ 1) * 4);
/* Emit the framebuffer-write (render target write) SENDC message.
 * Visible steps: optionally refresh the dispatch mask in g1.7 when the shader
 * uses KIL, build the message header from g0 when present (replicate-alpha
 * bit, BLEND_STATE render-target index), then emit SENDC with the dataport
 * render-cache message descriptor.
 * NOTE(review): chunk is line-mangled; several physical lines are missing
 * (second MOV operand, OR immediate operand, gen8_set_dp_message argument
 * list, closing braces). Tokens below are preserved verbatim — restore from
 * upstream before building.
 */
64 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
66 if (fp
&& fp
->UsesKill
) {
67 gen8_instruction
*mov
=
68 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
70 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
73 if (ir
->header_present
) {
74 gen8_instruction
*mov
=
75 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
76 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
78 if (ir
->target
> 0 && c
->key
.replicate_alpha
) {
79 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
80 OR(vec1(retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
)),
81 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
86 /* Set the render target index for choosing BLEND_STATE. */
87 MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
88 BRW_REGISTER_TYPE_UD
),
89 brw_imm_ud(ir
->target
));
93 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
94 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
95 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
97 /* Set up the "Message Specific Control" bits for the Data Port Message
98 * Descriptor. These are documented in the "Render Target Write" message's
99 * "Message Descriptor" documentation (vol5c.2).
102 /* Set the Message Type */
103 if (this->dual_source_output
)
104 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
105 else if (dispatch_width
== 16)
106 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
108 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
110 uint32_t msg_control
= msg_type
;
112 /* "Last Render Target Select" must be set on all writes to the last of
113 * the render targets (if using MRT), or always for a single RT scenario.
115 if ((ir
->target
== c
->key
.nr_color_regions
- 1) || !c
->key
.nr_color_regions
)
116 msg_control
|= (1 << 4); /* Last Render Target Select */
118 uint32_t surf_index
=
119 c
->prog_data
.binding_table
.render_target_start
+ ir
->target
;
121 gen8_set_dp_message(brw
, inst
,
122 GEN6_SFID_DATAPORT_RENDER_CACHE
,
124 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
/* Record that this render-target surface slot is referenced. */
131 mark_surface_used(surf_index
);
/* Emit a PLN (plane) instruction for linear varying interpolation:
 * src[0]/src[1] are the barycentric delta_x/delta_y pair (which PLN requires
 * to live in adjacent registers — asserted below), src[2] is the per-varying
 * interpolation setup data.
 * NOTE(review): chunk is line-mangled; the dst/src parameter lines of the
 * signature are missing. Tokens below are preserved verbatim.
 */
135 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
139 struct brw_reg delta_x
= src
[0];
140 struct brw_reg delta_y
= src
[1];
141 struct brw_reg interp
= src
[2];
144 assert(delta_y
.nr
== delta_x
.nr
+ 1);
145 PLN(dst
, interp
, delta_x
);
/* Emit a sampler SEND for a texture operation: choose the SIMD mode, map the
 * IR opcode (plus shadow-compare) to a hardware sampler message type, patch
 * the optional header (texel offsets), then emit the send and record the
 * surface used.
 * NOTE(review): chunk is line-mangled; `break;` statements, else-branch
 * braces, and the gen8_set_sampler_message argument list are among the
 * missing physical lines. Tokens below are preserved verbatim — restore from
 * upstream before building.
 */
149 gen8_fs_generator::generate_tex(fs_inst
*ir
,
155 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
157 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
159 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
160 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
/* Map the IR texturing opcode to a hardware sampler message type;
 * shadow-compare variants use the *_COMPARE message forms. */
162 switch (ir
->opcode
) {
163 case SHADER_OPCODE_TEX
:
164 if (ir
->shadow_compare
) {
165 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
167 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
171 if (ir
->shadow_compare
) {
172 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
174 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
177 case SHADER_OPCODE_TXL
:
178 if (ir
->shadow_compare
) {
179 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
181 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
184 case SHADER_OPCODE_TXS
:
185 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
187 case SHADER_OPCODE_TXD
:
188 if (ir
->shadow_compare
) {
189 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
191 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
194 case SHADER_OPCODE_TXF
:
195 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
197 case SHADER_OPCODE_TXF_CMS
:
198 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
200 case SHADER_OPCODE_TXF_UMS
:
201 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS
;
203 case SHADER_OPCODE_TXF_MCS
:
204 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
206 case SHADER_OPCODE_LOD
:
207 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
209 case SHADER_OPCODE_TG4
:
210 if (ir
->shadow_compare
) {
211 assert(brw
->gen
>= 7);
212 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
214 assert(brw
->gen
>= 6);
215 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
218 case SHADER_OPCODE_TG4_OFFSET
:
219 assert(brw
->gen
>= 7);
220 if (ir
->shadow_compare
) {
221 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
223 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
227 assert(!"not reached");
230 assert(msg_type
!= -1);
232 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
237 if (ir
->header_present
) {
238 /* The send-from-GRF for SIMD16 texturing with a header has an extra
239 * hardware register allocated to it, which we need to skip over (since
240 * our coordinates in the payload are in the even-numbered registers,
241 * and the header comes right before the first one.
243 if (dispatch_width
== 16)
246 MOV_RAW(src
, brw_vec8_grf(0, 0));
248 if (ir
->texture_offset
) {
249 /* Set the texel offset bits. */
250 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
251 brw_imm_ud(ir
->texture_offset
));
255 uint32_t surf_index
=
256 c
->prog_data
.base
.binding_table
.texture_start
+ ir
->sampler
;
258 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
259 gen8_set_dst(brw
, inst
, dst
);
260 gen8_set_src0(brw
, inst
, src
);
261 gen8_set_sampler_message(brw
, inst
,
270 mark_surface_used(surf_index
);
/* NOTE(review): the explanatory comment and function below are line-mangled;
 * the dst/src signature parameters and the vstride/width argument positions
 * in the brw_reg() calls are among the missing physical lines. Tokens are
 * preserved verbatim — restore from upstream before building.
 */
274 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
277 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
279 * and we're trying to produce:
282 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
283 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
284 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
285 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
286 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
287 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
288 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
289 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
291 * and add another set of two more subspans if in 16-pixel dispatch mode.
293 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
294 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
295 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
296 * between each other. We could probably do it like ddx and swizzle the right
297 * order later, but bail for now and just produce
298 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
/* Emit dFdx as an ADD of two shifted views of the same register: subreg 1
 * minus subreg 0, with strides chosen by the derivative-quality key. */
301 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
305 unsigned vstride
, width
;
307 if (c
->key
.high_quality_derivatives
) {
308 /* Produce accurate derivatives. */
309 vstride
= BRW_VERTICAL_STRIDE_2
;
312 /* Replicate the derivative at the top-left pixel to other pixels. */
313 vstride
= BRW_VERTICAL_STRIDE_4
;
317 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
321 BRW_HORIZONTAL_STRIDE_0
,
322 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
323 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
327 BRW_HORIZONTAL_STRIDE_0
,
328 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
329 ADD(dst
, src0
, negate(src1
));
/* Emit dFdy: subtract vertically-adjacent values. High-quality mode uses
 * Align16 swizzles (XYXY/ZWZW) to produce per-pair derivatives; the cheap
 * mode replicates the top-left derivative. One of the two ADDs below is
 * selected by negate_value (for FBO vs. window origin) — the selecting `if`
 * line is among the physical lines missing from this mangled chunk, as are
 * the signature's dst/src/negate_value parameters and the src1_subnr setup.
 * Tokens are preserved verbatim — restore from upstream before building.
 */
332 /* The negate_value boolean is used to negate the derivative computation for
333 * FBOs, since they place the origin at the upper left instead of the lower
337 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
343 unsigned src0_swizzle
;
344 unsigned src1_swizzle
;
347 if (c
->key
.high_quality_derivatives
) {
348 /* Produce accurate derivatives. */
349 hstride
= BRW_HORIZONTAL_STRIDE_1
;
350 src0_swizzle
= BRW_SWIZZLE_XYXY
;
351 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
/* Swizzles require Align16 access mode for the ADDs below. */
354 default_state
.access_mode
= BRW_ALIGN_16
;
356 /* Replicate the derivative at the top-left pixel to other pixels. */
357 hstride
= BRW_HORIZONTAL_STRIDE_0
;
358 src0_swizzle
= BRW_SWIZZLE_XYZW
;
359 src1_swizzle
= BRW_SWIZZLE_XYZW
;
363 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
365 BRW_VERTICAL_STRIDE_4
,
368 src0_swizzle
, WRITEMASK_XYZW
);
369 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
371 BRW_VERTICAL_STRIDE_4
,
374 src1_swizzle
, WRITEMASK_XYZW
);
377 ADD(dst
, src1
, negate(src0
));
379 ADD(dst
, src0
, negate(src1
));
/* Restore the default Align1 access mode. */
381 default_state
.access_mode
= BRW_ALIGN_1
;
385 gen8_fs_generator::generate_scratch_write(fs_inst
*inst
, struct brw_reg dst
)
387 assert(inst
->mlen
!= 0);
388 assert(!"TODO: Implement generate_scratch_write.");
392 gen8_fs_generator::generate_scratch_read(fs_inst
*inst
, struct brw_reg dst
)
394 assert(inst
->mlen
!= 0);
395 assert(!"TODO: Implement generate_scratch_read.");
399 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*inst
, struct brw_reg dst
)
401 assert(inst
->mlen
!= 0);
402 assert(!"TODO: Implement generate_scratch_read_gen7.");
/* Load a block of uniform (push-constant-overflow) data via a SIMD4x2
 * sampler LD message: the constant-buffer surface index comes from the
 * immediate `index` operand, the dword offset from a GRF. The send is
 * emitted with mask control disabled so it executes regardless of channel
 * enables.
 * NOTE(review): chunk is line-mangled; the gen8_set_sampler_message
 * surf_index/msg-length arguments and closing braces are among the missing
 * physical lines. Tokens preserved verbatim.
 */
406 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
408 struct brw_reg index
,
409 struct brw_reg offset
)
411 assert(inst
->mlen
== 0);
413 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
414 index
.type
== BRW_REGISTER_TYPE_UD
);
415 uint32_t surf_index
= index
.dw1
.ud
;
417 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
418 /* Reference only the dword we need lest we anger validate_reg() with
419 * reg.width > reg.exec_size.
421 offset
= brw_vec1_grf(offset
.nr
, 0);
423 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
424 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
426 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
427 * the destination loaded consecutively from the same offset (which appears
428 * in the first component, and the rest are ignored).
430 dst
.width
= BRW_WIDTH_4
;
431 gen8_set_dst(brw
, send
, dst
);
432 gen8_set_src0(brw
, send
, offset
);
433 gen8_set_sampler_message(brw
, send
,
435 0, /* The LD message ignores the sampler unit. */
436 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
439 false, /* no header */
440 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
442 mark_surface_used(surf_index
);
/* Load pull constants at a per-channel (varying) offset via a sampler LD
 * send. SIMD16 dispatch selects the SIMD16 sampler mode (with the larger
 * rlen/mlen); SIMD8 otherwise.
 * NOTE(review): chunk is line-mangled; the rlen/mlen assignments and parts of
 * the gen8_set_sampler_message argument list are among the missing physical
 * lines. Tokens preserved verbatim.
 */
446 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
448 struct brw_reg index
,
449 struct brw_reg offset
)
451 /* Varying-offset pull constant loads are treated as a normal expression on
452 * gen7, so the fact that it's a send message is hidden at the IR level.
454 assert(!ir
->header_present
);
457 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
458 index
.type
== BRW_REGISTER_TYPE_UD
);
459 uint32_t surf_index
= index
.dw1
.ud
;
461 uint32_t simd_mode
, rlen
, mlen
;
462 if (dispatch_width
== 16) {
465 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
469 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
472 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
473 gen8_set_dst(brw
, send
, dst
);
474 gen8_set_src0(brw
, send
, offset
);
475 gen8_set_sampler_message(brw
, send
,
477 0, /* The LD message ignores the sampler unit. */
478 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
481 false, /* no header */
484 mark_surface_used(surf_index
);
488 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
489 * into the flags register (f0.0).
492 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
494 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
495 struct brw_reg dispatch_mask
=
496 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
498 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
499 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
/* Record the instruction index of a discard's HALT so it can be retargeted
 * later by patch_discard_jumps_to_fb_writes().
 * NOTE(review): chunk is line-mangled; the line(s) that actually emit the
 * HALT instruction are among the missing physical lines. Tokens preserved
 * verbatim.
 */
503 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
505 /* This HALT will be patched up at FB write time to point UIP at the end of
506 * the program, and at brw_uip_jip() JIP will be set to the end of the
507 * current block (or the program).
509 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
/* Emit the final HALT that every discarded channel must reach, then rewrite
 * each recorded discard HALT's UIP to point at it (distances are in units of
 * 16 bytes — one gen8_instruction). Clears the patch list when done.
 * NOTE(review): chunk is line-mangled; the early-return for an empty patch
 * list and several closing braces are among the missing physical lines.
 * Tokens preserved verbatim.
 */
515 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
517 if (discard_halt_patches
.is_empty())
520 /* There is a somewhat strange undocumented requirement of using
521 * HALT, according to the simulator. If some channel has HALTed to
522 * a particular UIP, then by the end of the program, every channel
523 * must have HALTed to that UIP. Furthermore, the tracking is a
524 * stack, so you can't do the final halt of a UIP after starting
525 * halting to a new UIP.
527 * Symptoms of not emitting this instruction on actual hardware
528 * included GPU hangs and sparkly rendering on the piglit discard
531 gen8_instruction
*last_halt
= HALT();
532 gen8_set_uip(last_halt
, 16);
533 gen8_set_jip(last_halt
, 16);
537 foreach_list(node
, &discard_halt_patches
) {
538 ip_record
*patch_ip
= (ip_record
*) node
;
539 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
540 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
542 /* HALT takes an instruction distance from the pre-incremented IP. */
543 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
546 this->discard_halt_patches
.make_empty();
550 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
551 * sampler LD messages.
553 * We don't want to bake it into the send message's code generation because
554 * that means we don't get a chance to schedule the instruction.
557 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
559 struct brw_reg value
)
561 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
562 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
/* Main per-instruction code-generation loop: walks the IR list, sets up the
 * default instruction state (conditional mod, predicate, saturate, flag
 * subreg, exec size) for each fs_inst, then dispatches on the opcode to emit
 * the corresponding Gen8 instruction(s). Also prints annotated disassembly
 * and basic-block boundaries under INTEL_DEBUG=wm.
 * NOTE(review): chunk is line-mangled; many physical lines are missing —
 * `break;` statements, several case labels (ADD/MUL/SEL/CMP etc. appear only
 * by their emitted bodies), else/brace lines, and the final jump patching
 * context. Tokens below are preserved verbatim — restore this function from
 * upstream before building; do not hand-reconstruct the switch.
 */
566 gen8_fs_generator::generate_code(exec_list
*instructions
)
568 int last_native_inst_offset
= next_inst_offset
;
569 const char *last_annotation_string
= NULL
;
570 const void *last_annotation_ir
= NULL
;
572 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
574 printf("Native code for fragment shader %d (SIMD%d dispatch):\n",
575 shader_prog
->Name
, dispatch_width
);
577 printf("Native code for fragment program %d (SIMD%d dispatch):\n",
578 prog
->Id
, dispatch_width
);
580 printf("Native code for blorp program (SIMD%d dispatch):\n",
586 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
587 cfg
= new(mem_ctx
) cfg_t(instructions
);
589 foreach_list(node
, instructions
) {
590 fs_inst
*ir
= (fs_inst
*) node
;
591 struct brw_reg src
[3], dst
;
593 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
594 foreach_list(node
, &cfg
->block_list
) {
595 bblock_link
*link
= (bblock_link
*)node
;
596 bblock_t
*block
= link
->block
;
598 if (block
->start
== ir
) {
599 printf("   START B%d", block
->block_num
);
600 foreach_list(predecessor_node
, &block
->parents
) {
601 bblock_link
*predecessor_link
=
602 (bblock_link
*)predecessor_node
;
603 bblock_t
*predecessor_block
= predecessor_link
->block
;
604 printf(" <-B%d", predecessor_block
->block_num
);
610 if (last_annotation_ir
!= ir
->ir
) {
611 last_annotation_ir
= ir
->ir
;
612 if (last_annotation_ir
) {
615 ((ir_instruction
*) ir
->ir
)->print();
617 const prog_instruction
*fpi
;
618 fpi
= (const prog_instruction
*) ir
->ir
;
619 printf("%d: ", (int)(fpi
- prog
->Instructions
));
620 _mesa_fprint_instruction_opt(stdout
,
622 0, PROG_PRINT_DEBUG
, NULL
);
627 if (last_annotation_string
!= ir
->annotation
) {
628 last_annotation_string
= ir
->annotation
;
629 if (last_annotation_string
)
630 printf("   %s\n", last_annotation_string
);
634 for (unsigned int i
= 0; i
< 3; i
++) {
635 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
637 /* The accumulator result appears to get used for the
638 * conditional modifier generation. When negating a UD
639 * value, there is a 33rd bit generated for the sign in the
640 * accumulator value, so now you can't check, for example,
641 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
643 assert(!ir
->conditional_mod
||
644 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
647 dst
= brw_reg_from_fs_reg(&ir
->dst
);
649 default_state
.conditional_mod
= ir
->conditional_mod
;
650 default_state
.predicate
= ir
->predicate
;
651 default_state
.predicate_inverse
= ir
->predicate_inverse
;
652 default_state
.saturate
= ir
->saturate
;
653 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
655 if (dispatch_width
== 16 && !ir
->force_uncompressed
)
656 default_state
.exec_size
= BRW_EXECUTE_16
;
658 default_state
.exec_size
= BRW_EXECUTE_8
;
660 /* fs_inst::force_sechalf is only used for original Gen4 code, so we
661 * don't handle it. Add qtr_control to default_state if that changes.
663 assert(!ir
->force_sechalf
);
665 switch (ir
->opcode
) {
670 ADD(dst
, src
[0], src
[1]);
673 MUL(dst
, src
[0], src
[1]);
675 case BRW_OPCODE_MACH
:
676 MACH(dst
, src
[0], src
[1]);
680 default_state
.access_mode
= BRW_ALIGN_16
;
681 MAD(dst
, src
[0], src
[1], src
[2]);
682 default_state
.access_mode
= BRW_ALIGN_1
;
686 default_state
.access_mode
= BRW_ALIGN_16
;
687 LRP(dst
, src
[0], src
[1], src
[2]);
688 default_state
.access_mode
= BRW_ALIGN_1
;
695 case BRW_OPCODE_RNDD
:
698 case BRW_OPCODE_RNDE
:
701 case BRW_OPCODE_RNDZ
:
706 AND(dst
, src
[0], src
[1]);
709 OR(dst
, src
[0], src
[1]);
712 XOR(dst
, src
[0], src
[1]);
718 ASR(dst
, src
[0], src
[1]);
721 SHR(dst
, src
[0], src
[1]);
724 SHL(dst
, src
[0], src
[1]);
727 case BRW_OPCODE_F32TO16
:
728 F32TO16(dst
, src
[0]);
730 case BRW_OPCODE_F16TO32
:
731 F16TO32(dst
, src
[0]);
735 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
738 SEL(dst
, src
[0], src
[1]);
741 case BRW_OPCODE_BFREV
:
742 /* BFREV only supports UD type for src and dst. */
743 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
744 retype(src
[0], BRW_REGISTER_TYPE_UD
));
748 /* FBH only supports UD type for dst. */
749 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
753 /* FBL only supports UD type for dst. */
754 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
757 case BRW_OPCODE_CBIT
:
758 /* CBIT only supports UD type for dst. */
759 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
762 case BRW_OPCODE_ADDC
:
763 ADDC(dst
, src
[0], src
[1]);
766 case BRW_OPCODE_SUBB
:
767 SUBB(dst
, src
[0], src
[1]);
771 default_state
.access_mode
= BRW_ALIGN_16
;
772 BFE(dst
, src
[0], src
[1], src
[2]);
773 default_state
.access_mode
= BRW_ALIGN_1
;
776 case BRW_OPCODE_BFI1
:
777 BFI1(dst
, src
[0], src
[1]);
780 case BRW_OPCODE_BFI2
:
781 default_state
.access_mode
= BRW_ALIGN_16
;
782 BFI2(dst
, src
[0], src
[1], src
[2]);
783 default_state
.access_mode
= BRW_ALIGN_1
;
787 IF(BRW_PREDICATE_NORMAL
);
790 case BRW_OPCODE_ELSE
:
794 case BRW_OPCODE_ENDIF
:
802 case BRW_OPCODE_BREAK
:
806 case BRW_OPCODE_CONTINUE
:
810 case BRW_OPCODE_WHILE
:
814 case SHADER_OPCODE_RCP
:
815 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
818 case SHADER_OPCODE_RSQ
:
819 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
822 case SHADER_OPCODE_SQRT
:
823 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
826 case SHADER_OPCODE_EXP2
:
827 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
830 case SHADER_OPCODE_LOG2
:
831 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
834 case SHADER_OPCODE_SIN
:
835 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
838 case SHADER_OPCODE_COS
:
839 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
842 case SHADER_OPCODE_INT_QUOTIENT
:
843 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
846 case SHADER_OPCODE_INT_REMAINDER
:
847 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
850 case SHADER_OPCODE_POW
:
851 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
854 case FS_OPCODE_PIXEL_X
:
855 case FS_OPCODE_PIXEL_Y
:
856 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
859 case FS_OPCODE_CINTERP
:
862 case FS_OPCODE_LINTERP
:
863 generate_linterp(ir
, dst
, src
);
865 case SHADER_OPCODE_TEX
:
867 case SHADER_OPCODE_TXD
:
868 case SHADER_OPCODE_TXF
:
869 case SHADER_OPCODE_TXF_CMS
:
870 case SHADER_OPCODE_TXF_UMS
:
871 case SHADER_OPCODE_TXF_MCS
:
872 case SHADER_OPCODE_TXL
:
873 case SHADER_OPCODE_TXS
:
874 case SHADER_OPCODE_LOD
:
875 case SHADER_OPCODE_TG4
:
876 case SHADER_OPCODE_TG4_OFFSET
:
877 generate_tex(ir
, dst
, src
[0]);
881 generate_ddx(ir
, dst
, src
[0]);
884 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
885 * guarantee that c->key.render_to_fbo is set).
887 assert(fp
->UsesDFdy
);
888 generate_ddy(ir
, dst
, src
[0], c
->key
.render_to_fbo
);
891 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
892 generate_scratch_write(ir
, src
[0]);
895 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
896 generate_scratch_read(ir
, dst
);
899 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
900 generate_scratch_read_gen7(ir
, dst
);
903 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
904 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
907 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
908 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
911 case FS_OPCODE_FB_WRITE
:
912 generate_fb_write(ir
);
915 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
916 generate_mov_dispatch_to_flags(ir
);
919 case FS_OPCODE_DISCARD_JUMP
:
920 generate_discard_jump(ir
);
923 case SHADER_OPCODE_SHADER_TIME_ADD
:
924 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
927 case SHADER_OPCODE_UNTYPED_ATOMIC
:
928 assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
931 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
932 assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
935 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
936 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
939 case FS_OPCODE_SET_OMASK
:
940 assert(!"XXX: Missing Gen8 scalar support for SET_OMASK");
943 case FS_OPCODE_SET_SAMPLE_ID
:
944 assert(!"XXX: Missing Gen8 scalar support for SET_SAMPLE_ID");
947 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
948 assert(!"XXX: Missing Gen8 scalar support for PACK_HALF_2x16_SPLIT");
951 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
952 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
953 assert(!"XXX: Missing Gen8 scalar support for UNPACK_HALF_2x16_SPLIT");
956 case FS_OPCODE_PLACEHOLDER_HALT
:
957 /* This is the place where the final HALT needs to be inserted if
958 * we've emitted any discards. If not, this will emit no code.
960 patch_discard_jumps_to_fb_writes();
964 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
965 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
966 opcode_descs
[ir
->opcode
].name
);
968 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
973 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
974 disassemble(stdout
, last_native_inst_offset
, next_inst_offset
);
976 foreach_list(node
, &cfg
->block_list
) {
977 bblock_link
*link
= (bblock_link
*)node
;
978 bblock_t
*block
= link
->block
;
980 if (block
->end
== ir
) {
981 printf("   END B%d", block
->block_num
);
982 foreach_list(successor_node
, &block
->children
) {
983 bblock_link
*successor_link
=
984 (bblock_link
*)successor_node
;
985 bblock_t
*successor_block
= successor_link
->block
;
986 printf(" ->B%d", successor_block
->block_num
);
993 last_native_inst_offset
= next_inst_offset
;
996 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1000 patch_jump_targets();
/* Top-level entry point: generates the SIMD8 program (if any), then pads to a
 * 64-byte boundary, records the SIMD16 program's start offset in prog_data,
 * and generates the SIMD16 program (if any). Returns the assembled
 * instruction store and its size via *assembly_size.
 * NOTE(review): chunk is line-mangled and the function's closing lines fall
 * outside this view; the padding NOP emission inside the alignment loop is
 * also missing. Tokens preserved verbatim.
 */
1004 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1005 exec_list
*simd16_instructions
,
1006 unsigned *assembly_size
)
1008 assert(simd8_instructions
|| simd16_instructions
);
1010 if (simd8_instructions
) {
1012 generate_code(simd8_instructions
);
1015 if (simd16_instructions
) {
1016 /* Align to a 64-byte boundary. */
1017 while ((nr_inst
* sizeof(gen8_instruction
)) % 64)
1020 /* Save off the start of this SIMD16 program */
1021 c
->prog_data
.prog_offset_16
= nr_inst
* sizeof(gen8_instruction
);
1023 dispatch_width
= 16;
1024 generate_code(simd16_instructions
);
1027 *assembly_size
= next_inst_offset
;
1028 return (const unsigned *) store
;