3 * Copyright © 2010-2015 Intel Corporation
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 #include "brw_shader.h"
32 class fs_reg
: public backend_reg
{
34 DECLARE_RALLOC_CXX_OPERATORS(fs_reg
)
39 fs_reg(struct ::brw_reg reg
);
40 fs_reg(enum brw_reg_file file
, int nr
);
41 fs_reg(enum brw_reg_file file
, int nr
, enum brw_reg_type type
);
43 bool equals(const fs_reg
&r
) const;
44 bool negative_equals(const fs_reg
&r
) const;
45 bool is_contiguous() const;
48 * Return the size in bytes of a single logical component of the
49 * register assuming the given execution width.
51 unsigned component_size(unsigned width
) const;
53 /** Register region horizontal stride */
60 assert(reg
.file
!= IMM
);
61 reg
.negate
= !reg
.negate
;
66 retype(fs_reg reg
, enum brw_reg_type type
)
73 byte_offset(fs_reg reg
, unsigned delta
)
84 const unsigned suboffset
= reg
.offset
+ delta
;
85 reg
.nr
+= suboffset
/ REG_SIZE
;
86 reg
.offset
= suboffset
% REG_SIZE
;
91 const unsigned suboffset
= reg
.subnr
+ delta
;
92 reg
.nr
+= suboffset
/ REG_SIZE
;
93 reg
.subnr
= suboffset
% REG_SIZE
;
104 horiz_offset(const fs_reg
®
, unsigned delta
)
110 /* These only have a single component that is implicitly splatted. A
111 * horizontal offset should be a harmless no-op.
112 * XXX - Handle vector immediates correctly.
118 return byte_offset(reg
, delta
* reg
.stride
* type_sz(reg
.type
));
124 const unsigned stride
= reg
.hstride
? 1 << (reg
.hstride
- 1) : 0;
125 return byte_offset(reg
, delta
* stride
* type_sz(reg
.type
));
128 unreachable("Invalid register file");
132 offset(fs_reg reg
, unsigned width
, unsigned delta
)
143 return byte_offset(reg
, delta
* reg
.component_size(width
));
151 * Get the scalar channel of \p reg given by \p idx and replicate it to all
152 * channels of the result.
155 component(fs_reg reg
, unsigned idx
)
157 reg
= horiz_offset(reg
, idx
);
163 * Return an integer identifying the discrete address space a register is
164 * contained in. A register is by definition fully contained in the single
165 * reg_space it belongs to, so two registers with different reg_space ids are
166 * guaranteed not to overlap. Most register files are a single reg_space of
167 * its own, only the VGRF file is composed of multiple discrete address
168 * spaces, one for each VGRF allocation.
170 static inline uint32_t
171 reg_space(const fs_reg
&r
)
173 return r
.file
<< 16 | (r
.file
== VGRF
? r
.nr
: 0);
177 * Return the base offset in bytes of a register relative to the start of its
180 static inline unsigned
181 reg_offset(const fs_reg
&r
)
183 return (r
.file
== VGRF
|| r
.file
== IMM
? 0 : r
.nr
) *
184 (r
.file
== UNIFORM
? 4 : REG_SIZE
) + r
.offset
+
185 (r
.file
== ARF
|| r
.file
== FIXED_GRF
? r
.subnr
: 0);
189 * Return the amount of padding in bytes left unused between individual
190 * components of register \p r due to a (horizontal) stride value greater than
191 * one, or zero if components are tightly packed in the register file.
193 static inline unsigned
194 reg_padding(const fs_reg
&r
)
196 const unsigned stride
= ((r
.file
!= ARF
&& r
.file
!= FIXED_GRF
) ? r
.stride
:
198 1 << (r
.hstride
- 1));
199 return (MAX2(1, stride
) - 1) * type_sz(r
.type
);
203 * Return whether the register region starting at \p r and spanning \p dr
204 * bytes could potentially overlap the register region starting at \p s and
205 * spanning \p ds bytes.
208 regions_overlap(const fs_reg
&r
, unsigned dr
, const fs_reg
&s
, unsigned ds
)
210 if (r
.file
== MRF
&& (r
.nr
& BRW_MRF_COMPR4
)) {
212 t
.nr
&= ~BRW_MRF_COMPR4
;
213 /* COMPR4 regions are translated by the hardware during decompression
214 * into two separate half-regions 4 MRFs apart from each other.
216 return regions_overlap(t
, dr
/ 2, s
, ds
) ||
217 regions_overlap(byte_offset(t
, 4 * REG_SIZE
), dr
/ 2, s
, ds
);
219 } else if (s
.file
== MRF
&& (s
.nr
& BRW_MRF_COMPR4
)) {
220 return regions_overlap(s
, ds
, r
, dr
);
223 return reg_space(r
) == reg_space(s
) &&
224 !(reg_offset(r
) + dr
<= reg_offset(s
) ||
225 reg_offset(s
) + ds
<= reg_offset(r
));
230 * Check that the register region given by r [r.offset, r.offset + dr[
231 * is fully contained inside the register region given by s
232 * [s.offset, s.offset + ds[.
235 region_contained_in(const fs_reg
&r
, unsigned dr
, const fs_reg
&s
, unsigned ds
)
237 return reg_space(r
) == reg_space(s
) &&
238 reg_offset(r
) >= reg_offset(s
) &&
239 reg_offset(r
) + dr
<= reg_offset(s
) + ds
;
243 * Return whether the given register region is n-periodic, i.e. whether the
244 * original region remains invariant after shifting it by \p n scalar
248 is_periodic(const fs_reg
®
, unsigned n
)
250 if (reg
.file
== BAD_FILE
|| reg
.is_null()) {
253 } else if (reg
.file
== IMM
) {
254 const unsigned period
= (reg
.type
== BRW_REGISTER_TYPE_UV
||
255 reg
.type
== BRW_REGISTER_TYPE_V
? 8 :
256 reg
.type
== BRW_REGISTER_TYPE_VF
? 4 :
258 return n
% period
== 0;
260 } else if (reg
.file
== ARF
|| reg
.file
== FIXED_GRF
) {
261 const unsigned period
= (reg
.hstride
== 0 && reg
.vstride
== 0 ? 1 :
262 reg
.vstride
== 0 ? 1 << reg
.width
:
264 return n
% period
== 0;
267 return reg
.stride
== 0;
272 is_uniform(const fs_reg
®
)
274 return is_periodic(reg
, 1);
278 * Get the specified 8-component quarter of a register.
279 * XXX - Maybe come up with a less misleading name for this (e.g. quarter())?
282 half(const fs_reg
®
, unsigned idx
)
285 return horiz_offset(reg
, 8 * idx
);
289 * Reinterpret each channel of register \p reg as a vector of values of the
290 * given smaller type and take the i-th subcomponent from each.
293 subscript(fs_reg reg
, brw_reg_type type
, unsigned i
)
295 assert((i
+ 1) * type_sz(type
) <= type_sz(reg
.type
));
297 if (reg
.file
== ARF
|| reg
.file
== FIXED_GRF
) {
298 /* The stride is encoded inconsistently for fixed GRF and ARF registers
299 * as the log2 of the actual vertical and horizontal strides.
301 const int delta
= _mesa_logbase2(type_sz(reg
.type
)) -
302 _mesa_logbase2(type_sz(type
));
303 reg
.hstride
+= (reg
.hstride
? delta
: 0);
304 reg
.vstride
+= (reg
.vstride
? delta
: 0);
306 } else if (reg
.file
== IMM
) {
307 assert(reg
.type
== type
);
310 reg
.stride
*= type_sz(reg
.type
) / type_sz(type
);
313 return byte_offset(retype(reg
, type
), i
* type_sz(type
));
317 horiz_stride(fs_reg reg
, unsigned s
)
323 static const fs_reg reg_undef
;
325 class fs_inst
: public backend_instruction
{
326 fs_inst
&operator=(const fs_inst
&);
328 void init(enum opcode opcode
, uint8_t exec_width
, const fs_reg
&dst
,
329 const fs_reg
*src
, unsigned sources
);
332 DECLARE_RALLOC_CXX_OPERATORS(fs_inst
)
335 fs_inst(enum opcode opcode
, uint8_t exec_size
);
336 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
);
337 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
339 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
340 const fs_reg
&src0
, const fs_reg
&src1
);
341 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
342 const fs_reg
&src0
, const fs_reg
&src1
, const fs_reg
&src2
);
343 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
344 const fs_reg src
[], unsigned sources
);
345 fs_inst(const fs_inst
&that
);
348 void resize_sources(uint8_t num_sources
);
350 bool is_send_from_grf() const;
351 bool is_payload(unsigned arg
) const;
352 bool is_partial_write() const;
353 unsigned components_read(unsigned i
) const;
354 unsigned size_read(int arg
) const;
355 bool can_do_source_mods(const struct gen_device_info
*devinfo
) const;
357 bool can_change_types() const;
358 bool has_source_and_destination_hazard() const;
359 unsigned implied_mrf_writes() const;
362 * Return whether \p arg is a control source of a virtual instruction which
363 * shouldn't contribute to the execution type and usual regioning
364 * restriction calculations of arithmetic instructions.
366 bool is_control_source(unsigned arg
) const;
369 * Return the subset of flag registers read by the instruction as a bitset
370 * with byte granularity.
372 unsigned flags_read(const gen_device_info
*devinfo
) const;
375 * Return the subset of flag registers updated by the instruction (either
376 * partially or fully) as a bitset with byte granularity.
378 unsigned flags_written() const;
383 uint8_t sources
; /**< Number of fs_reg sources. */
386 bool pi_noperspective
:1; /**< Pixel interpolator noperspective flag */
388 tgl_swsb sched
; /**< Scheduling info. */
392 * Make the execution of \p inst dependent on the evaluation of a possibly
393 * inverted predicate.
395 static inline fs_inst
*
396 set_predicate_inv(enum brw_predicate pred
, bool inverse
,
399 inst
->predicate
= pred
;
400 inst
->predicate_inverse
= inverse
;
405 * Make the execution of \p inst dependent on the evaluation of a predicate.
407 static inline fs_inst
*
408 set_predicate(enum brw_predicate pred
, fs_inst
*inst
)
410 return set_predicate_inv(pred
, false, inst
);
414 * Write the result of evaluating the condition given by \p mod to a flag
417 static inline fs_inst
*
418 set_condmod(enum brw_conditional_mod mod
, fs_inst
*inst
)
420 inst
->conditional_mod
= mod
;
425 * Clamp the result of \p inst to the saturation range of its destination
428 static inline fs_inst
*
429 set_saturate(bool saturate
, fs_inst
*inst
)
431 inst
->saturate
= saturate
;
436 * Return the number of dataflow registers written by the instruction (either
437 * fully or partially) counted from 'floor(reg_offset(inst->dst) /
438 * register_size)'. The somewhat arbitrary register size unit is 4B for the
439 * UNIFORM and IMM files and 32B for all other files.
442 regs_written(const fs_inst
*inst
)
444 assert(inst
->dst
.file
!= UNIFORM
&& inst
->dst
.file
!= IMM
);
445 return DIV_ROUND_UP(reg_offset(inst
->dst
) % REG_SIZE
+
447 MIN2(inst
->size_written
, reg_padding(inst
->dst
)),
452 * Return the number of dataflow registers read by the instruction (either
453 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
454 * register_size)'. The somewhat arbitrary register size unit is 4B for the
455 * UNIFORM and IMM files and 32B for all other files.
458 regs_read(const fs_inst
*inst
, unsigned i
)
460 const unsigned reg_size
=
461 inst
->src
[i
].file
== UNIFORM
|| inst
->src
[i
].file
== IMM
? 4 : REG_SIZE
;
462 return DIV_ROUND_UP(reg_offset(inst
->src
[i
]) % reg_size
+
464 MIN2(inst
->size_read(i
), reg_padding(inst
->src
[i
])),
468 static inline enum brw_reg_type
469 get_exec_type(const fs_inst
*inst
)
471 brw_reg_type exec_type
= BRW_REGISTER_TYPE_B
;
473 for (int i
= 0; i
< inst
->sources
; i
++) {
474 if (inst
->src
[i
].file
!= BAD_FILE
&&
475 !inst
->is_control_source(i
)) {
476 const brw_reg_type t
= get_exec_type(inst
->src
[i
].type
);
477 if (type_sz(t
) > type_sz(exec_type
))
479 else if (type_sz(t
) == type_sz(exec_type
) &&
480 brw_reg_type_is_floating_point(t
))
485 if (exec_type
== BRW_REGISTER_TYPE_B
)
486 exec_type
= inst
->dst
.type
;
488 assert(exec_type
!= BRW_REGISTER_TYPE_B
);
490 /* Promotion of the execution type to 32-bit for conversions from or to
491 * half-float seems to be consistent with the following text from the
492 * Cherryview PRM Vol. 7, "Execution Data Type":
494 * "When single precision and half precision floats are mixed between
495 * source operands or between source and destination operand [..] single
496 * precision float is the execution datatype."
498 * and from "Register Region Restrictions":
500 * "Conversion between Integer and HF (Half Float) must be DWord aligned
501 * and strided by a DWord on the destination."
503 if (type_sz(exec_type
) == 2 &&
504 inst
->dst
.type
!= exec_type
) {
505 if (exec_type
== BRW_REGISTER_TYPE_HF
)
506 exec_type
= BRW_REGISTER_TYPE_F
;
507 else if (inst
->dst
.type
== BRW_REGISTER_TYPE_HF
)
508 exec_type
= BRW_REGISTER_TYPE_D
;
514 static inline unsigned
515 get_exec_type_size(const fs_inst
*inst
)
517 return type_sz(get_exec_type(inst
));
521 is_send(const fs_inst
*inst
)
523 return inst
->mlen
|| inst
->is_send_from_grf();
527 * Return whether the instruction isn't an ALU instruction and cannot be
528 * assumed to complete in-order.
531 is_unordered(const fs_inst
*inst
)
533 return is_send(inst
) || inst
->is_math();
537 * Return whether the following regioning restriction applies to the specified
538 * instruction. From the Cherryview PRM Vol 7. "Register Region
541 * "When source or destination datatype is 64b or operation is integer DWord
542 * multiply, regioning in Align1 must follow these rules:
544 * 1. Source and Destination horizontal stride must be aligned to the same qword.
545 * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
546 * 3. Source and Destination offset must be the same, except the case of
550 has_dst_aligned_region_restriction(const gen_device_info
*devinfo
,
553 const brw_reg_type exec_type
= get_exec_type(inst
);
554 /* Even though the hardware spec claims that "integer DWord multiply"
555 * operations are restricted, empirical evidence and the behavior of the
556 * simulator suggest that only 32x32-bit integer multiplication is
559 const bool is_dword_multiply
= !brw_reg_type_is_floating_point(exec_type
) &&
560 ((inst
->opcode
== BRW_OPCODE_MUL
&&
561 MIN2(type_sz(inst
->src
[0].type
), type_sz(inst
->src
[1].type
)) >= 4) ||
562 (inst
->opcode
== BRW_OPCODE_MAD
&&
563 MIN2(type_sz(inst
->src
[1].type
), type_sz(inst
->src
[2].type
)) >= 4));
565 if (type_sz(inst
->dst
.type
) > 4 || type_sz(exec_type
) > 4 ||
566 (type_sz(exec_type
) == 4 && is_dword_multiply
))
567 return devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
);
573 * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
574 * the specified register file into a VGRF.
576 * This implies identity register regions without any source-destination
577 * overlap, but otherwise has no implications on the location of sources and
578 * destination in the register file: Gathering any number of portions from
579 * multiple virtual registers in any order is allowed.
582 is_copy_payload(brw_reg_file file
, const fs_inst
*inst
)
584 if (inst
->opcode
!= SHADER_OPCODE_LOAD_PAYLOAD
||
585 inst
->is_partial_write() || inst
->saturate
||
586 inst
->dst
.file
!= VGRF
)
589 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
590 if (inst
->src
[i
].file
!= file
||
591 inst
->src
[i
].abs
|| inst
->src
[i
].negate
)
594 if (!inst
->src
[i
].is_contiguous())
597 if (regions_overlap(inst
->dst
, inst
->size_written
,
598 inst
->src
[i
], inst
->size_read(i
)))
606 * Like is_copy_payload(), but the instruction is required to copy a single
607 * contiguous block of registers from the given register file into the
608 * destination without any reordering.
611 is_identity_payload(brw_reg_file file
, const fs_inst
*inst
) {
612 if (is_copy_payload(file
, inst
)) {
613 fs_reg reg
= inst
->src
[0];
615 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
616 reg
.type
= inst
->src
[i
].type
;
617 if (!inst
->src
[i
].equals(reg
))
620 reg
= byte_offset(reg
, inst
->size_read(i
));
630 * Like is_copy_payload(), but the instruction is required to source data from
631 * at least two disjoint VGRFs.
633 * This doesn't necessarily rule out the elimination of this instruction
634 * through register coalescing, but due to limitations of the register
635 * coalesce pass it might be impossible to do so directly until a later stage,
636 * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
640 is_multi_copy_payload(const fs_inst
*inst
) {
641 if (is_copy_payload(VGRF
, inst
)) {
642 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
643 if (inst
->src
[i
].nr
!= inst
->src
[0].nr
)
652 * Like is_identity_payload(), but the instruction is required to copy the
653 * whole contents of a single VGRF into the destination.
655 * This means that there is a good chance that the instruction will be
656 * eliminated through register coalescing, but it's neither a necessary nor a
657 * sufficient condition for that to happen -- E.g. consider the case where
658 * source and destination registers diverge due to other instructions in the
659 * program overwriting part of their contents, which isn't something we can
660 * predict up front based on a cheap strictly local test of the copy
664 is_coalescing_payload(const brw::simple_allocator
&alloc
, const fs_inst
*inst
)
666 return is_identity_payload(VGRF
, inst
) &&
667 inst
->src
[0].offset
== 0 &&
668 alloc
.sizes
[inst
->src
[0].nr
] * REG_SIZE
== inst
->size_written
;