/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
42 /** Type used in this IR to represent a source of an instruction. */
43 typedef fs_reg src_reg
;
45 /** Type used in this IR to represent the destination of an instruction. */
46 typedef fs_reg dst_reg
;
48 /** Type used in this IR to represent an instruction. */
49 typedef fs_inst instruction
;
52 * Construct an fs_builder that inserts instructions into \p shader.
53 * \p dispatch_width gives the native execution width of the program.
55 fs_builder(backend_shader
*shader
,
56 unsigned dispatch_width
) :
57 shader(shader
), block(NULL
), cursor(NULL
),
58 _dispatch_width(dispatch_width
),
60 force_writemask_all(false),
66 * Construct an fs_builder that inserts instructions into \p shader
67 * before instruction \p inst in basic block \p block. The default
68 * execution controls and debug annotation are initialized from the
69 * instruction passed as argument.
71 fs_builder(backend_shader
*shader
, bblock_t
*block
, fs_inst
*inst
) :
72 shader(shader
), block(block
), cursor(inst
),
73 _dispatch_width(inst
->exec_size
),
75 force_writemask_all(inst
->force_writemask_all
)
77 annotation
.str
= inst
->annotation
;
78 annotation
.ir
= inst
->ir
;
82 * Construct an fs_builder that inserts instructions before \p cursor in
83 * basic block \p block, inheriting other code generation parameters
87 at(bblock_t
*block
, exec_node
*cursor
) const
89 fs_builder bld
= *this;
96 * Construct an fs_builder appending instructions at the end of the
97 * instruction list of the shader, inheriting other code generation
98 * parameters from this.
103 return at(NULL
, (exec_node
*)&shader
->instructions
.tail_sentinel
);
107 * Construct a builder specifying the default SIMD width and group of
108 * channel enable signals, inheriting other code generation parameters
111 * \p n gives the default SIMD width, \p i gives the slot group used for
112 * predication and control flow masking in multiples of \p n channels.
115 group(unsigned n
, unsigned i
) const
117 fs_builder bld
= *this;
119 if (n
<= dispatch_width() && i
< dispatch_width() / n
) {
122 /* The requested channel group isn't a subset of the channel group
123 * of this builder, which means that the resulting instructions
124 * would use (potentially undefined) channel enable signals not
125 * specified by the parent builder. That's only valid if the
126 * instruction doesn't have per-channel semantics, in which case
127 * we should clear off the default group index in order to prevent
128 * emitting instructions with channel group not aligned to their
129 * own execution size.
131 assert(force_writemask_all
);
135 bld
._dispatch_width
= n
;
140 * Alias for group() with width equal to eight.
143 half(unsigned i
) const
149 * Construct a builder with per-channel control flow execution masking
150 * disabled if \p b is true. If control flow execution masking is
151 * already disabled this has no effect.
154 exec_all(bool b
= true) const
156 fs_builder bld
= *this;
158 bld
.force_writemask_all
= true;
163 * Construct a builder with the given debug annotation info.
166 annotate(const char *str
, const void *ir
= NULL
) const
168 fs_builder bld
= *this;
169 bld
.annotation
.str
= str
;
170 bld
.annotation
.ir
= ir
;
175 * Get the SIMD width in use.
178 dispatch_width() const
180 return _dispatch_width
;
184 * Get the channel group in use.
193 * Allocate a virtual register of natural vector size (one for this IR)
194 * and SIMD width. \p n gives the amount of space to allocate in
195 * dispatch_width units (which is just enough space for one logical
196 * component in this IR).
199 vgrf(enum brw_reg_type type
, unsigned n
= 1) const
201 assert(dispatch_width() <= 32);
204 return dst_reg(VGRF
, shader
->alloc
.allocate(
205 DIV_ROUND_UP(n
* type_sz(type
) * dispatch_width(),
209 return retype(null_reg_ud(), type
);
213 * Create a null register of floating type.
218 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F
));
224 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF
));
228 * Create a null register of signed integer type.
233 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
237 * Create a null register of unsigned integer type.
242 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD
));
246 * Get the mask of SIMD channels enabled by dispatch and not yet
247 * disabled by discard.
250 sample_mask_reg() const
252 if (shader
->stage
!= MESA_SHADER_FRAGMENT
) {
253 return brw_imm_d(0xffffffff);
254 } else if (brw_wm_prog_data(shader
->stage_prog_data
)->uses_kill
) {
255 return brw_flag_reg(0, 1);
257 assert(shader
->devinfo
->gen
>= 6 && dispatch_width() <= 16);
258 return retype(brw_vec1_grf((_group
>= 16 ? 2 : 1), 7),
259 BRW_REGISTER_TYPE_UD
);
264 * Insert an instruction into the program.
267 emit(const instruction
&inst
) const
269 return emit(new(shader
->mem_ctx
) instruction(inst
));
273 * Create and insert a nullary control instruction into the program.
276 emit(enum opcode opcode
) const
278 return emit(instruction(opcode
, dispatch_width()));
282 * Create and insert a nullary instruction into the program.
285 emit(enum opcode opcode
, const dst_reg
&dst
) const
287 return emit(instruction(opcode
, dispatch_width(), dst
));
291 * Create and insert a unary instruction into the program.
294 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg
&src0
) const
297 case SHADER_OPCODE_RCP
:
298 case SHADER_OPCODE_RSQ
:
299 case SHADER_OPCODE_SQRT
:
300 case SHADER_OPCODE_EXP2
:
301 case SHADER_OPCODE_LOG2
:
302 case SHADER_OPCODE_SIN
:
303 case SHADER_OPCODE_COS
:
304 return emit(instruction(opcode
, dispatch_width(), dst
,
305 fix_math_operand(src0
)));
308 return emit(instruction(opcode
, dispatch_width(), dst
, src0
));
313 * Create and insert a binary instruction into the program.
316 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg
&src0
,
317 const src_reg
&src1
) const
320 case SHADER_OPCODE_POW
:
321 case SHADER_OPCODE_INT_QUOTIENT
:
322 case SHADER_OPCODE_INT_REMAINDER
:
323 return emit(instruction(opcode
, dispatch_width(), dst
,
324 fix_math_operand(src0
),
325 fix_math_operand(fix_byte_src(src1
))));
328 return emit(instruction(opcode
, dispatch_width(), dst
,
329 src0
, fix_byte_src(src1
)));
335 * Create and insert a ternary instruction into the program.
338 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg
&src0
,
339 const src_reg
&src1
, const src_reg
&src2
) const
343 case BRW_OPCODE_BFI2
:
346 return emit(instruction(opcode
, dispatch_width(), dst
,
347 fix_3src_operand(src0
),
348 fix_3src_operand(fix_byte_src(src1
)),
349 fix_3src_operand(fix_byte_src(src2
))));
352 return emit(instruction(opcode
, dispatch_width(), dst
,
353 src0
, fix_byte_src(src1
), fix_byte_src(src2
)));
358 * Create and insert an instruction with a variable number of sources
362 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg srcs
[],
365 return emit(instruction(opcode
, dispatch_width(), dst
, srcs
, n
));
369 * Insert a preallocated instruction into the program.
372 emit(instruction
*inst
) const
374 assert(inst
->exec_size
<= 32);
375 assert(inst
->exec_size
== dispatch_width() ||
376 force_writemask_all
);
378 inst
->group
= _group
;
379 inst
->force_writemask_all
= force_writemask_all
;
380 inst
->annotation
= annotation
.str
;
381 inst
->ir
= annotation
.ir
;
384 static_cast<instruction
*>(cursor
)->insert_before(block
, inst
);
386 cursor
->insert_before(inst
);
392 * Select \p src0 if the comparison of both sources with the given
393 * conditional mod evaluates to true, otherwise select \p src1.
395 * Generally useful to get the minimum or maximum of two values.
398 emit_minmax(const dst_reg
&dst
, const src_reg
&src0
,
399 const src_reg
&src1
, brw_conditional_mod mod
) const
401 assert(mod
== BRW_CONDITIONAL_GE
|| mod
== BRW_CONDITIONAL_L
);
403 /* In some cases we can't have bytes as operand for src1, so use the
404 * same type for both operand.
406 return set_condmod(mod
, SEL(dst
, fix_unsigned_negate(fix_byte_src(src0
)),
407 fix_unsigned_negate(fix_byte_src(src1
))));
411 * Copy any live channel from \p src to the first channel of the result.
414 emit_uniformize(const src_reg
&src
) const
416 /* FIXME: We use a vector chan_index and dst to allow constant and
417 * copy propagration to move result all the way into the consuming
418 * instruction (typically a surface index or sampler index for a
419 * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
420 * dispatch. Once we teach const/copy propagation about scalars we
421 * should go back to scalar destinations here.
423 const fs_builder ubld
= exec_all();
424 const dst_reg chan_index
= vgrf(BRW_REGISTER_TYPE_UD
);
425 const dst_reg dst
= vgrf(src
.type
);
427 ubld
.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL
, chan_index
)->flag_subreg
= 2;
428 ubld
.emit(SHADER_OPCODE_BROADCAST
, dst
, src
, component(chan_index
, 0));
430 return src_reg(component(dst
, 0));
434 move_to_vgrf(const src_reg
&src
, unsigned num_components
) const
436 src_reg
*const src_comps
= new src_reg
[num_components
];
437 for (unsigned i
= 0; i
< num_components
; i
++)
438 src_comps
[i
] = offset(src
, dispatch_width(), i
);
440 const dst_reg dst
= vgrf(src
.type
, num_components
);
441 LOAD_PAYLOAD(dst
, src_comps
, num_components
, 0);
449 emit_scan(enum opcode opcode
, const dst_reg
&tmp
,
450 unsigned cluster_size
, brw_conditional_mod mod
) const
452 assert(dispatch_width() >= 8);
454 /* The instruction splitting code isn't advanced enough to split
455 * these so we need to handle that ourselves.
457 if (dispatch_width() * type_sz(tmp
.type
) > 2 * REG_SIZE
) {
458 const unsigned half_width
= dispatch_width() / 2;
459 const fs_builder ubld
= exec_all().group(half_width
, 0);
461 dst_reg right
= horiz_offset(tmp
, half_width
);
462 ubld
.emit_scan(opcode
, left
, cluster_size
, mod
);
463 ubld
.emit_scan(opcode
, right
, cluster_size
, mod
);
464 if (cluster_size
> half_width
) {
465 src_reg left_comp
= component(left
, half_width
- 1);
466 set_condmod(mod
, ubld
.emit(opcode
, right
, left_comp
, right
));
471 if (cluster_size
> 1) {
472 const fs_builder ubld
= exec_all().group(dispatch_width() / 2, 0);
473 const dst_reg left
= horiz_stride(tmp
, 2);
474 const dst_reg right
= horiz_stride(horiz_offset(tmp
, 1), 2);
475 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
478 if (cluster_size
> 2) {
479 if (type_sz(tmp
.type
) <= 4) {
480 const fs_builder ubld
=
481 exec_all().group(dispatch_width() / 4, 0);
482 src_reg left
= horiz_stride(horiz_offset(tmp
, 1), 4);
484 dst_reg right
= horiz_stride(horiz_offset(tmp
, 2), 4);
485 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
487 right
= horiz_stride(horiz_offset(tmp
, 3), 4);
488 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
490 /* For 64-bit types, we have to do things differently because
491 * the code above would land us with destination strides that
492 * the hardware can't handle. Fortunately, we'll only be
493 * 8-wide in that case and it's the same number of
496 const fs_builder ubld
= exec_all().group(2, 0);
498 for (unsigned i
= 0; i
< dispatch_width(); i
+= 4) {
499 src_reg left
= component(tmp
, i
+ 1);
500 dst_reg right
= horiz_offset(tmp
, i
+ 2);
501 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
507 i
< MIN2(cluster_size
, dispatch_width());
509 const fs_builder ubld
= exec_all().group(i
, 0);
510 src_reg left
= component(tmp
, i
- 1);
511 dst_reg right
= horiz_offset(tmp
, i
);
512 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
514 if (dispatch_width() > i
* 2) {
515 left
= component(tmp
, i
* 3 - 1);
516 right
= horiz_offset(tmp
, i
* 3);
517 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
520 if (dispatch_width() > i
* 4) {
521 left
= component(tmp
, i
* 5 - 1);
522 right
= horiz_offset(tmp
, i
* 5);
523 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
525 left
= component(tmp
, i
* 7 - 1);
526 right
= horiz_offset(tmp
, i
* 7);
527 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
/**
 * Assorted arithmetic ops.
 * @{
 */
/* NOTE(review): the #define lines for ALU1/ALU2/ALU3 were dropped by the
 * garbled extraction (only ALU2_ACC's survived); the macro skeletons are
 * reconstructed from their surviving bodies — confirm against upstream.
 * The list of macro invocations (ADD, MUL, MOV, SEL, ...) that followed
 * these definitions is missing from this chunk entirely.
 */
#define ALU1(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0) const                    \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0);                          \
   }

#define ALU2(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1);                    \
   }

#define ALU2_ACC(op)                                                    \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);       \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,     \
      const src_reg &src2) const                                        \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1, src2);              \
   }
619 * CMP: Sets the low bit of the destination channels with the result
620 * of the comparison, while the upper bits are undefined, and updates
621 * the flag register with the packed 16 bits of the result.
624 CMP(const dst_reg
&dst
, const src_reg
&src0
, const src_reg
&src1
,
625 brw_conditional_mod condition
) const
627 /* Take the instruction:
629 * CMP null<d> src0<f> src1<f>
631 * Original gen4 does type conversion to the destination type
632 * before comparison, producing garbage results for floating
635 * The destination type doesn't matter on newer generations,
636 * so we set the type to match src0 so we can compact the
639 return set_condmod(condition
,
640 emit(BRW_OPCODE_CMP
, retype(dst
, src0
.type
),
641 fix_unsigned_negate(src0
),
642 fix_unsigned_negate(src1
)));
646 * Gen4 predicated IF.
649 IF(brw_predicate predicate
) const
651 return set_predicate(predicate
, emit(BRW_OPCODE_IF
));
655 * CSEL: dst = src2 <op> 0.0f ? src0 : src1
658 CSEL(const dst_reg
&dst
, const src_reg
&src0
, const src_reg
&src1
,
659 const src_reg
&src2
, brw_conditional_mod condition
) const
661 /* CSEL only operates on floats, so we can't do integer </<=/>=/>
662 * comparisons. Zero/non-zero (== and !=) comparisons almost work.
663 * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
665 assert(src2
.type
== BRW_REGISTER_TYPE_F
);
667 return set_condmod(condition
,
668 emit(BRW_OPCODE_CSEL
,
669 retype(dst
, BRW_REGISTER_TYPE_F
),
670 retype(src0
, BRW_REGISTER_TYPE_F
),
671 retype(fix_byte_src(src1
), BRW_REGISTER_TYPE_F
),
672 fix_byte_src(src2
)));
676 * Emit a linear interpolation instruction.
679 LRP(const dst_reg
&dst
, const src_reg
&x
, const src_reg
&y
,
680 const src_reg
&a
) const
682 if (shader
->devinfo
->gen
>= 6 && shader
->devinfo
->gen
<= 10) {
683 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
684 * we need to reorder the operands.
686 return emit(BRW_OPCODE_LRP
, dst
, a
, y
, x
);
689 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
690 const dst_reg y_times_a
= vgrf(dst
.type
);
691 const dst_reg one_minus_a
= vgrf(dst
.type
);
692 const dst_reg x_times_one_minus_a
= vgrf(dst
.type
);
694 MUL(y_times_a
, y
, a
);
695 ADD(one_minus_a
, negate(a
), brw_imm_f(1.0f
));
696 MUL(x_times_one_minus_a
, x
, src_reg(one_minus_a
));
697 return ADD(dst
, src_reg(x_times_one_minus_a
), src_reg(y_times_a
));
702 * Collect a number of registers in a contiguous range of registers.
705 LOAD_PAYLOAD(const dst_reg
&dst
, const src_reg
*src
,
706 unsigned sources
, unsigned header_size
) const
708 instruction
*inst
= emit(SHADER_OPCODE_LOAD_PAYLOAD
, dst
, src
, sources
);
709 inst
->header_size
= header_size
;
710 inst
->size_written
= header_size
* REG_SIZE
;
711 for (unsigned i
= header_size
; i
< sources
; i
++) {
712 inst
->size_written
+=
713 ALIGN(dispatch_width() * type_sz(src
[i
].type
) * dst
.stride
,
721 UNDEF(const dst_reg
&dst
) const
723 assert(dst
.file
== VGRF
);
724 instruction
*inst
= emit(SHADER_OPCODE_UNDEF
,
725 retype(dst
, BRW_REGISTER_TYPE_UD
));
726 inst
->size_written
= shader
->alloc
.sizes
[dst
.nr
] * REG_SIZE
;
731 backend_shader
*shader
;
734 * Byte sized operands are not supported for src1 on Gen11+.
737 fix_byte_src(const src_reg
&src
) const
739 if (shader
->devinfo
->gen
< 11 || type_sz(src
.type
) != 1)
742 dst_reg temp
= vgrf(src
.type
== BRW_REGISTER_TYPE_UB
?
743 BRW_REGISTER_TYPE_UD
: BRW_REGISTER_TYPE_D
);
745 return src_reg(temp
);
750 * Workaround for negation of UD registers. See comment in
751 * fs_generator::generate_code() for more details.
754 fix_unsigned_negate(const src_reg
&src
) const
756 if (src
.type
== BRW_REGISTER_TYPE_UD
&&
758 dst_reg temp
= vgrf(BRW_REGISTER_TYPE_UD
);
760 return src_reg(temp
);
767 * Workaround for source register modes not supported by the ternary
768 * instruction encoding.
771 fix_3src_operand(const src_reg
&src
) const
775 /* FINISHME: Could handle scalar region, other stride=1 regions */
776 if (src
.vstride
!= BRW_VERTICAL_STRIDE_8
||
777 src
.width
!= BRW_WIDTH_8
||
778 src
.hstride
!= BRW_HORIZONTAL_STRIDE_1
)
790 dst_reg expanded
= vgrf(src
.type
);
796 * Workaround for source register modes not supported by the math
800 fix_math_operand(const src_reg
&src
) const
802 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
803 * might be able to do better by doing execsize = 1 math and then
804 * expanding that result out, but we would need to be careful with
807 * Gen6 hardware ignores source modifiers (negate and abs) on math
808 * instructions, so we also move to a temp to set those up.
810 * Gen7 relaxes most of the above restrictions, but still can't use IMM
813 if ((shader
->devinfo
->gen
== 6 &&
814 (src
.file
== IMM
|| src
.file
== UNIFORM
||
815 src
.abs
|| src
.negate
)) ||
816 (shader
->devinfo
->gen
== 7 && src
.file
== IMM
)) {
817 const dst_reg tmp
= vgrf(src
.type
);
/** Native execution width of the instructions emitted by default. */
unsigned _dispatch_width;

/* NOTE(review): the _group declaration and the annotation struct body
 * were dropped by the garbled extraction; both are reconstructed from
 * their uses (group(), emit(), annotate()) — confirm against upstream.
 */
/** Default channel group offset, in channels. */
unsigned _group;

/** Whether emitted instructions bypass per-channel execution masking. */
bool force_writemask_all;

/** Debug annotation info. */
struct {
   const char *str;
   const void *ir;
} annotation;