3 * Copyright © 2010-2015 Intel Corporation
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
28 #include "brw_shader.h"
/* NOTE(review): this extraction is garbled -- statements are split across
 * physical lines, stray original line numbers are fused into the text and
 * several declarations (default constructor, access specifiers, data
 * members such as offset/stride) are missing.  Code tokens are left
 * byte-identical; comments only.
 */
/* Scalar-backend register: extends backend_reg with FS-specific helpers. */
32 class fs_reg
: public backend_reg
{
/* ralloc-based new/delete so instances live on a ralloc memory context. */
34 DECLARE_RALLOC_CXX_OPERATORS(fs_reg
)
/* Construct from a fixed hardware register description. */
39 fs_reg(struct ::brw_reg reg
);
/* Construct a register of the given file and number; the three-argument
 * overload additionally sets the register type explicitly. */
40 fs_reg(enum brw_reg_file file
, int nr
);
41 fs_reg(enum brw_reg_file file
, int nr
, enum brw_reg_type type
);
/* Exact comparison of two registers. */
43 bool equals(const fs_reg
&r
) const;
/* Comparison up to negation -- presumably equality modulo the negate
 * source modifier; TODO confirm against the definition. */
44 bool negative_equals(const fs_reg
&r
) const;
45 bool is_contiguous() const;
48 * Return the size in bytes of a single logical component of the
49 * register assuming the given execution width.
51 unsigned component_size(unsigned width
) const;
53 /** Register region horizontal stride */
/* Body of a negate(fs_reg) helper (its signature and trailing "return reg;"
 * are missing from this extraction): toggles the negate source-modifier
 * flag.  Immediates are rejected by the assert -- presumably because an
 * immediate would need its payload negated rather than a modifier flipped;
 * TODO confirm.
 */
60 assert(reg
.file
!= IMM
);
61 reg
.negate
= !reg
.negate
;
/* Signature of retype(): return a copy of \p reg reinterpreted with the
 * given type.  The body is missing from this extraction. */
66 retype(fs_reg reg
, enum brw_reg_type type
)
/* byte_offset(): advance a register region by \p delta bytes.  Only two
 * branches of the underlying switch over reg.file survive in this
 * extraction (the switch labels, braces and return are missing).  Both
 * split the byte distance into whole REG_SIZE registers carried into
 * reg.nr, plus a sub-register remainder.
 */
73 byte_offset(fs_reg reg
, unsigned delta
)
/* First visible case: the remainder is kept in reg.offset. */
84 const unsigned suboffset
= reg
.offset
+ delta
;
85 reg
.nr
+= suboffset
/ REG_SIZE
;
86 reg
.offset
= suboffset
% REG_SIZE
;
/* Second visible case: the remainder lives in the hardware sub-register
 * number reg.subnr instead. */
91 const unsigned suboffset
= reg
.subnr
+ delta
;
92 reg
.nr
+= suboffset
/ REG_SIZE
;
93 reg
.subnr
= suboffset
% REG_SIZE
;
/* horiz_offset(): offset a region by \p delta logical channels.  Several
 * lines (return type, switch labels, braces, the "®"-damaged "&reg"
 * parameter) are garbled or missing in this extraction.
 */
104 horiz_offset(const fs_reg
®
, unsigned delta
)
110 /* These only have a single component that is implicitly splatted. A
111 * horizontal offset should be a harmless no-op.
112 * XXX - Handle vector immediates correctly.
/* General case: a channel step is stride components of type_sz bytes. */
118 return byte_offset(reg
, delta
* reg
.stride
* type_sz(reg
.type
));
/* ARF/FIXED_GRF case (presumably -- labels missing): hstride is
 * log2-encoded, with 0 meaning a stride of zero. */
124 const unsigned stride
= reg
.hstride
? 1 << (reg
.hstride
- 1) : 0;
125 return byte_offset(reg
, delta
* stride
* type_sz(reg
.type
));
128 unreachable("Invalid register file");
/* offset(): advance a region by \p delta logical components assuming the
 * given execution width.  Only the general case survives here: the byte
 * distance is delta times the per-component size.  Return type, braces and
 * any other cases are missing from this extraction.
 */
132 offset(fs_reg reg
, unsigned width
, unsigned delta
)
143 return byte_offset(reg
, delta
* reg
.component_size(width
));
151 * Get the scalar channel of \p reg given by \p idx and replicate it to all
152 * channels of the result.
155 component(fs_reg reg
, unsigned idx
)
/* Step to the requested channel; the stride-0 splat and "return reg;"
 * that presumably follow are missing from this extraction. */
157 reg
= horiz_offset(reg
, idx
);
163 * Return an integer identifying the discrete address space a register is
164 * contained in. A register is by definition fully contained in the single
165 * reg_space it belongs to, so two registers with different reg_space ids are
166 * guaranteed not to overlap. Most register files are a single reg_space of
167 * its own, only the VGRF file is composed of multiple discrete address
168 * spaces, one for each VGRF allocation.
170 static inline uint32_t
171 reg_space(const fs_reg
&r
)
173 return r
.file
<< 16 | (r
.file
== VGRF
? r
.nr
: 0);
177 * Return the base offset in bytes of a register relative to the start of its
180 static inline unsigned
181 reg_offset(const fs_reg
&r
)
183 return (r
.file
== VGRF
|| r
.file
== IMM
? 0 : r
.nr
) *
184 (r
.file
== UNIFORM
? 4 : REG_SIZE
) + r
.offset
+
185 (r
.file
== ARF
|| r
.file
== FIXED_GRF
? r
.subnr
: 0);
189 * Return the amount of padding in bytes left unused between individual
190 * components of register \p r due to a (horizontal) stride value greater than
191 * one, or zero if components are tightly packed in the register file.
193 static inline unsigned
194 reg_padding(const fs_reg
&r
)
/* ARF/FIXED_GRF encode hstride as log2; a middle branch of this ternary
 * (presumably the hstride == 0 case) is missing from this extraction. */
196 const unsigned stride
= ((r
.file
!= ARF
&& r
.file
!= FIXED_GRF
) ? r
.stride
:
198 1 << (r
.hstride
- 1));
/* (stride - 1) skipped components, each type_sz(r.type) bytes wide. */
199 return (MAX2(1, stride
) - 1) * type_sz(r
.type
);
203 * Return whether the register region starting at \p r and spanning \p dr
204 * bytes could potentially overlap the register region starting at \p s and
205 * spanning \p ds bytes.
208 regions_overlap(const fs_reg
&r
, unsigned dr
, const fs_reg
&s
, unsigned ds
)
/* Compressed COMPR4 MRF region on the first operand: strip the flag from a
 * working copy "t" (its declaration, presumably "fs_reg t = r;", is
 * missing from this extraction) and test both decompressed halves. */
210 if (r
.file
== MRF
&& (r
.nr
& BRW_MRF_COMPR4
)) {
212 t
.nr
&= ~BRW_MRF_COMPR4
;
213 /* COMPR4 regions are translated by the hardware during decompression
214 * into two separate half-regions 4 MRFs apart from each other.
216 return regions_overlap(t
, dr
/ 2, s
, ds
) ||
217 regions_overlap(byte_offset(t
, 4 * REG_SIZE
), dr
/ 2, s
, ds
);
/* COMPR4 on the second operand: recurse with the operands swapped so the
 * branch above handles it. */
219 } else if (s
.file
== MRF
&& (s
.nr
& BRW_MRF_COMPR4
)) {
220 return regions_overlap(s
, ds
, r
, dr
);
/* General case: same address space and the two half-open byte intervals
 * are not disjoint. */
223 return reg_space(r
) == reg_space(s
) &&
224 !(reg_offset(r
) + dr
<= reg_offset(s
) ||
225 reg_offset(s
) + ds
<= reg_offset(r
));
230 * Check that the register region given by r [r.offset, r.offset + dr[
231 * is fully contained inside the register region given by s
232 * [s.offset, s.offset + ds[.
235 region_contained_in(const fs_reg
&r
, unsigned dr
, const fs_reg
&s
, unsigned ds
)
237 return reg_space(r
) == reg_space(s
) &&
238 reg_offset(r
) >= reg_offset(s
) &&
239 reg_offset(r
) + dr
<= reg_offset(s
) + ds
;
243 * Return whether the given register region is n-periodic, i.e. whether the
244 * original region remains invariant after shifting it by \p n scalar
248 is_periodic(const fs_reg
®
, unsigned n
)
/* BAD_FILE and null registers: the return for this branch is missing from
 * this extraction. */
250 if (reg
.file
== BAD_FILE
|| reg
.is_null()) {
/* Vector immediates repeat after 8 (V/UV) or 4 (VF) elements; the final
 * "1" fallback of this ternary appears to be missing. */
253 } else if (reg
.file
== IMM
) {
254 const unsigned period
= (reg
.type
== BRW_REGISTER_TYPE_UV
||
255 reg
.type
== BRW_REGISTER_TYPE_V
? 8 :
256 reg
.type
== BRW_REGISTER_TYPE_VF
? 4 :
258 return n
% period
== 0;
/* Hardware-described regions: period derived from the vstride/hstride/
 * width encoding (a trailing branch of this ternary is also missing). */
260 } else if (reg
.file
== ARF
|| reg
.file
== FIXED_GRF
) {
261 const unsigned period
= (reg
.hstride
== 0 && reg
.vstride
== 0 ? 1 :
262 reg
.vstride
== 0 ? 1 << reg
.width
:
264 return n
% period
== 0;
/* Remaining files: only a stride-0 (splatted) region is periodic. */
267 return reg
.stride
== 0;
272 is_uniform(const fs_reg
®
)
274 return is_periodic(reg
, 1);
278 * Get the specified 8-component quarter of a register.
/* NOTE(review): the return type line and presumably an assertion bounding
 * \p idx are missing from this extraction. */
281 quarter(const fs_reg
®
, unsigned idx
)
/* Each quarter is 8 channels wide. */
284 return horiz_offset(reg
, 8 * idx
);
288 * Reinterpret each channel of register \p reg as a vector of values of the
289 * given smaller type and take the i-th subcomponent from each.
292 subscript(fs_reg reg
, brw_reg_type type
, unsigned i
)
/* The requested subcomponent must fit inside one original channel. */
294 assert((i
+ 1) * type_sz(type
) <= type_sz(reg
.type
));
/* ARF/FIXED_GRF: strides are log2-encoded, so widen them by the log2
 * ratio of the old and new type sizes (zero stays zero). */
296 if (reg
.file
== ARF
|| reg
.file
== FIXED_GRF
) {
297 /* The stride is encoded inconsistently for fixed GRF and ARF registers
298 * as the log2 of the actual vertical and horizontal strides.
300 const int delta
= util_logbase2(type_sz(reg
.type
)) -
301 util_logbase2(type_sz(type
));
302 reg
.hstride
+= (reg
.hstride
? delta
: 0);
303 reg
.vstride
+= (reg
.vstride
? delta
: 0);
/* Immediates cannot be subdivided -- only the identity subscript works. */
305 } else if (reg
.file
== IMM
) {
306 assert(reg
.type
== type
);
/* Remaining files: linear stride scales by the type-size ratio (an
 * intervening branch appears to be missing from this extraction). */
309 reg
.stride
*= type_sz(reg
.type
) / type_sz(type
);
/* Retype and step to the i-th subcomponent within the channel. */
312 return byte_offset(retype(reg
, type
), i
* type_sz(type
));
/* horiz_stride(): signature only -- the body that presumably overrides the
 * region's horizontal stride is missing from this extraction. */
316 horiz_stride(fs_reg reg
, unsigned s
)
/* Canonical default-constructed register used as a "no register" value. */
322 static const fs_reg reg_undef
;
/* NOTE(review): garbled extraction -- several members (data fields, a
 * constructor body, access specifiers) are missing; code tokens are left
 * byte-identical, comments only. */
/* Scalar-backend IR instruction. */
324 class fs_inst
: public backend_instruction
{
/* Copy assignment deliberately unimplemented/private. */
325 fs_inst
&operator=(const fs_inst
&);
/* Shared constructor helper: opcode, execution width, destination and a
 * source array of the given length. */
327 void init(enum opcode opcode
, uint8_t exec_width
, const fs_reg
&dst
,
328 const fs_reg
*src
, unsigned sources
);
/* ralloc-based new/delete. */
331 DECLARE_RALLOC_CXX_OPERATORS(fs_inst
)
/* Constructors for 0..3 sources, an explicit source array, and copy. */
334 fs_inst(enum opcode opcode
, uint8_t exec_size
);
335 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
);
336 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
338 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
339 const fs_reg
&src0
, const fs_reg
&src1
);
340 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
341 const fs_reg
&src0
, const fs_reg
&src1
, const fs_reg
&src2
);
342 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
343 const fs_reg src
[], unsigned sources
);
344 fs_inst(const fs_inst
&that
);
/* Grow or shrink the source array in place. */
347 void resize_sources(uint8_t num_sources
);
349 bool is_send_from_grf() const;
350 bool is_payload(unsigned arg
) const;
351 bool is_partial_write() const;
352 unsigned components_read(unsigned i
) const;
/* Number of bytes read from source \p arg. */
353 unsigned size_read(int arg
) const;
354 bool can_do_source_mods(const struct gen_device_info
*devinfo
) const;
356 bool can_change_types() const;
357 bool has_source_and_destination_hazard() const;
358 unsigned implied_mrf_writes() const;
361 * Return whether \p arg is a control source of a virtual instruction which
362 * shouldn't contribute to the execution type and usual regioning
363 * restriction calculations of arithmetic instructions.
365 bool is_control_source(unsigned arg
) const;
368 * Return the subset of flag registers read by the instruction as a bitset
369 * with byte granularity.
371 unsigned flags_read(const gen_device_info
*devinfo
) const;
374 * Return the subset of flag registers updated by the instruction (either
375 * partially or fully) as a bitset with byte granularity.
377 unsigned flags_written() const;
382 uint8_t sources
; /**< Number of fs_reg sources. */
385 bool pi_noperspective
:1; /**< Pixel interpolator noperspective flag */
387 tgl_swsb sched
; /**< Scheduling info. */
391 * Make the execution of \p inst dependent on the evaluation of a possibly
392 * inverted predicate.
394 static inline fs_inst
*
/* NOTE(review): the final parameter line (presumably "fs_inst *inst)")
 * and the trailing "return inst;" are missing from this extraction. */
395 set_predicate_inv(enum brw_predicate pred
, bool inverse
,
398 inst
->predicate
= pred
;
399 inst
->predicate_inverse
= inverse
;
404 * Make the execution of \p inst dependent on the evaluation of a predicate.
406 static inline fs_inst
*
407 set_predicate(enum brw_predicate pred
, fs_inst
*inst
)
409 return set_predicate_inv(pred
, false, inst
);
413 * Write the result of evaluating the condition given by \p mod to a flag
416 static inline fs_inst
*
417 set_condmod(enum brw_conditional_mod mod
, fs_inst
*inst
)
/* Stores the conditional mod; the trailing "return inst;" that makes this
 * chainable is missing from this extraction. */
419 inst
->conditional_mod
= mod
;
424 * Clamp the result of \p inst to the saturation range of its destination
427 static inline fs_inst
*
428 set_saturate(bool saturate
, fs_inst
*inst
)
/* Sets the saturate flag; the trailing "return inst;" is missing from
 * this extraction. */
430 inst
->saturate
= saturate
;
435 * Return the number of dataflow registers written by the instruction (either
436 * fully or partially) counted from 'floor(reg_offset(inst->dst) /
437 * register_size)'. The somewhat arbitrary register size unit is 4B for the
438 * UNIFORM and IMM files and 32B for all other files.
441 regs_written(const fs_inst
*inst
)
/* Destinations in the 4B-unit files are not expected here. */
443 assert(inst
->dst
.file
!= UNIFORM
&& inst
->dst
.file
!= IMM
);
/* Round the written byte span (misalignment + size, less trailing stride
 * padding) up to whole registers.  NOTE(review): the middle term and the
 * final REG_SIZE divisor argument are missing from this extraction. */
444 return DIV_ROUND_UP(reg_offset(inst
->dst
) % REG_SIZE
+
446 MIN2(inst
->size_written
, reg_padding(inst
->dst
)),
451 * Return the number of dataflow registers read by the instruction (either
452 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
453 * register_size)'. The somewhat arbitrary register size unit is 4B for the
454 * UNIFORM and IMM files and 32B for all other files.
457 regs_read(const fs_inst
*inst
, unsigned i
)
/* Unit is 4B for UNIFORM/IMM sources, a full 32B register otherwise. */
459 const unsigned reg_size
=
460 inst
->src
[i
].file
== UNIFORM
|| inst
->src
[i
].file
== IMM
? 4 : REG_SIZE
;
/* Round the read byte span up to whole reg_size units.  NOTE(review): the
 * middle term and the final reg_size divisor argument are missing from
 * this extraction. */
461 return DIV_ROUND_UP(reg_offset(inst
->src
[i
]) % reg_size
+
463 MIN2(inst
->size_read(i
), reg_padding(inst
->src
[i
])),
/* Compute the execution data type of the instruction: the widest non-control
 * source type, preferring floating point on ties, falling back to the
 * destination type.  NOTE(review): several lines (the bodies of the two
 * ties/wins branches, braces, final return) are missing from this
 * extraction. */
467 static inline enum brw_reg_type
468 get_exec_type(const fs_inst
*inst
)
/* Start below any real type so any source can replace it. */
470 brw_reg_type exec_type
= BRW_REGISTER_TYPE_B
;
472 for (int i
= 0; i
< inst
->sources
; i
++) {
/* Only real data sources participate, not control sources. */
473 if (inst
->src
[i
].file
!= BAD_FILE
&&
474 !inst
->is_control_source(i
)) {
475 const brw_reg_type t
= get_exec_type(inst
->src
[i
].type
);
/* Wider type wins; on equal width a floating-point type wins
 * (assignment statements missing from this extraction). */
476 if (type_sz(t
) > type_sz(exec_type
))
478 else if (type_sz(t
) == type_sz(exec_type
) &&
479 brw_reg_type_is_floating_point(t
))
/* No data sources at all: fall back to the destination type. */
484 if (exec_type
== BRW_REGISTER_TYPE_B
)
485 exec_type
= inst
->dst
.type
;
487 assert(exec_type
!= BRW_REGISTER_TYPE_B
);
489 /* Promotion of the execution type to 32-bit for conversions from or to
490 * half-float seems to be consistent with the following text from the
491 * Cherryview PRM Vol. 7, "Execution Data Type":
493 * "When single precision and half precision floats are mixed between
494 * source operands or between source and destination operand [..] single
495 * precision float is the execution datatype."
497 * and from "Register Region Restrictions":
499 * "Conversion between Integer and HF (Half Float) must be DWord aligned
500 * and strided by a DWord on the destination."
502 if (type_sz(exec_type
) == 2 &&
503 inst
->dst
.type
!= exec_type
) {
504 if (exec_type
== BRW_REGISTER_TYPE_HF
)
505 exec_type
= BRW_REGISTER_TYPE_F
;
506 else if (inst
->dst
.type
== BRW_REGISTER_TYPE_HF
)
507 exec_type
= BRW_REGISTER_TYPE_D
;
513 static inline unsigned
514 get_exec_type_size(const fs_inst
*inst
)
516 return type_sz(get_exec_type(inst
));
520 is_send(const fs_inst
*inst
)
522 return inst
->mlen
|| inst
->is_send_from_grf();
526 * Return whether the instruction isn't an ALU instruction and cannot be
527 * assumed to complete in-order.
530 is_unordered(const fs_inst
*inst
)
532 return is_send(inst
) || inst
->is_math();
536 * Return whether the following regioning restriction applies to the specified
537 * instruction. From the Cherryview PRM Vol 7. "Register Region
540 * "When source or destination datatype is 64b or operation is integer DWord
541 * multiply, regioning in Align1 must follow these rules:
543 * 1. Source and Destination horizontal stride must be aligned to the same qword.
544 * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
545 * 3. Source and Destination offset must be the same, except the case of
/* NOTE(review): the second parameter line (the fs_inst pointer), braces
 * and the final "return false" branch are missing from this extraction. */
549 has_dst_aligned_region_restriction(const gen_device_info
*devinfo
,
552 const brw_reg_type exec_type
= get_exec_type(inst
);
553 /* Even though the hardware spec claims that "integer DWord multiply"
554 * operations are restricted, empirical evidence and the behavior of the
555 * simulator suggest that only 32x32-bit integer multiplication is
/* Integer MUL with both factors >= 4 bytes, or MAD with both multiplied
 * operands >= 4 bytes. */
558 const bool is_dword_multiply
= !brw_reg_type_is_floating_point(exec_type
) &&
559 ((inst
->opcode
== BRW_OPCODE_MUL
&&
560 MIN2(type_sz(inst
->src
[0].type
), type_sz(inst
->src
[1].type
)) >= 4) ||
561 (inst
->opcode
== BRW_OPCODE_MAD
&&
562 MIN2(type_sz(inst
->src
[1].type
), type_sz(inst
->src
[2].type
)) >= 4));
/* Restriction applies on CHV and gen9 LP parts for 64-bit data or DWord
 * integer multiplies. */
564 if (type_sz(inst
->dst
.type
) > 4 || type_sz(exec_type
) > 4 ||
565 (type_sz(exec_type
) == 4 && is_dword_multiply
))
566 return devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
);
572 * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
573 * the specified register file into a VGRF.
575 * This implies identity register regions without any source-destination
576 * overlap, but otherwise has no implications on the location of sources and
577 * destination in the register file: Gathering any number of portions from
578 * multiple virtual registers in any order is allowed.
581 is_copy_payload(brw_reg_file file
, const fs_inst
*inst
)
/* Must be a full, unsaturated LOAD_PAYLOAD into a VGRF (the "return
 * false" bodies of these guards are missing from this extraction). */
583 if (inst
->opcode
!= SHADER_OPCODE_LOAD_PAYLOAD
||
584 inst
->is_partial_write() || inst
->saturate
||
585 inst
->dst
.file
!= VGRF
)
/* Every source must come from the requested file with no modifiers... */
588 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
589 if (inst
->src
[i
].file
!= file
||
590 inst
->src
[i
].abs
|| inst
->src
[i
].negate
)
/* ...be a contiguous region... */
593 if (!inst
->src
[i
].is_contiguous())
/* ...and not overlap the bytes written to the destination. */
596 if (regions_overlap(inst
->dst
, inst
->size_written
,
597 inst
->src
[i
], inst
->size_read(i
)))
605 * Like is_copy_payload(), but the instruction is required to copy a single
606 * contiguous block of registers from the given register file into the
607 * destination without any reordering.
610 is_identity_payload(brw_reg_file file
, const fs_inst
*inst
) {
611 if (is_copy_payload(file
, inst
)) {
/* Walk the sources checking that each one continues exactly where the
 * previous one ended (the early-exit and final returns are missing from
 * this extraction). */
612 fs_reg reg
= inst
->src
[0];
614 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
/* Types may differ per source; only the location must match. */
615 reg
.type
= inst
->src
[i
].type
;
616 if (!inst
->src
[i
].equals(reg
))
/* Advance the expected position by the bytes this source reads. */
619 reg
= byte_offset(reg
, inst
->size_read(i
));
629 * Like is_copy_payload(), but the instruction is required to source data from
630 * at least two disjoint VGRFs.
632 * This doesn't necessarily rule out the elimination of this instruction
633 * through register coalescing, but due to limitations of the register
634 * coalesce pass it might be impossible to do so directly until a later stage,
635 * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
639 is_multi_copy_payload(const fs_inst
*inst
) {
640 if (is_copy_payload(VGRF
, inst
)) {
/* Look for any source drawn from a different VGRF than source 0 (the
 * branch body and returns are missing from this extraction). */
641 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
642 if (inst
->src
[i
].nr
!= inst
->src
[0].nr
)
651 * Like is_identity_payload(), but the instruction is required to copy the
652 * whole contents of a single VGRF into the destination.
654 * This means that there is a good chance that the instruction will be
655 * eliminated through register coalescing, but it's neither a necessary nor a
656 * sufficient condition for that to happen -- E.g. consider the case where
657 * source and destination registers diverge due to other instructions in the
658 * program overwriting part of their contents, which isn't something we can
659 * predict up front based on a cheap strictly local test of the copy
663 is_coalescing_payload(const brw::simple_allocator
&alloc
, const fs_inst
*inst
)
665 return is_identity_payload(VGRF
, inst
) &&
666 inst
->src
[0].offset
== 0 &&
667 alloc
.sizes
[inst
->src
[0].nr
] * REG_SIZE
== inst
->size_written
;
/* Declaration only (defined elsewhere): return whether the instruction
 * suffers a GRF bank conflict on the given hardware. */
671 has_bank_conflict(const gen_device_info
*devinfo
, const fs_inst
*inst
);