3 * Copyright © 2010-2015 Intel Corporation
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
28 #include "brw_shader.h"
/*
 * fs_reg: register operand class derived from backend_reg.
 *
 * Declarations visible below: constructors taking a brw_reg, a
 * (file, nr) pair and a (file, nr, type) triple; the const queries equals(),
 * negative_equals() and is_contiguous(); and component_size(width).  The
 * trailing doc comment announces a horizontal-stride member.
 *
 * NOTE(review): this listing has lost lines -- the embedded numbering jumps
 * 34 -> 39 and ends at 53 -- so access specifiers, any further members
 * (including the member the last comment documents) and the closing brace
 * are not shown.  Restore from upstream before editing this class.
 */
32 class fs_reg
: public backend_reg
{
34 DECLARE_RALLOC_CXX_OPERATORS(fs_reg
)
39 fs_reg(struct ::brw_reg reg
);
40 fs_reg(enum brw_reg_file file
, int nr
);
41 fs_reg(enum brw_reg_file file
, int nr
, enum brw_reg_type type
);
43 bool equals(const fs_reg
&r
) const;
44 bool negative_equals(const fs_reg
&r
) const;
45 bool is_contiguous() const;
48 * Return the size in bytes of a single logical component of the
49 * register assuming the given execution width.
51 unsigned component_size(unsigned width
) const;
53 /** Register region horizontal stride */
60 assert(reg
.file
!= IMM
);
61 reg
.negate
= !reg
.negate
;
/*
 * retype(reg, type): takes a register by value and a brw_reg_type.
 *
 * NOTE(review): only the parameter list survives in this listing; the
 * return type and the whole body are missing, so the behaviour cannot be
 * documented from here.  subscript() passes the result straight to
 * byte_offset(), so it presumably returns an fs_reg -- confirm upstream.
 */
66 retype(fs_reg reg
, enum brw_reg_type type
)
/*
 * byte_offset(reg, delta): takes a register by value and a byte count.
 *
 * Two code paths are visible.  Both add \p delta to a byte sub-offset, carry
 * every whole REG_SIZE of the sum into reg.nr and keep the remainder as the
 * new sub-offset.  They differ only in the field that holds the sub-offset:
 * reg.offset in the first path, reg.subnr in the second.
 *
 * NOTE(review): the return type, the branching that selects between the two
 * paths, any other paths and the return statement are missing from this
 * listing (the embedded numbering jumps 73 -> 84 and 86 -> 91).  Confirm
 * against upstream which register files take which path.
 */
73 byte_offset(fs_reg reg
, unsigned delta
)
84 const unsigned suboffset
= reg
.offset
+ delta
;
85 reg
.nr
+= suboffset
/ REG_SIZE
;
86 reg
.offset
= suboffset
% REG_SIZE
;
91 const unsigned suboffset
= reg
.subnr
+ delta
;
92 reg
.nr
+= suboffset
/ REG_SIZE
;
93 reg
.subnr
= suboffset
% REG_SIZE
;
/*
 * horiz_offset(reg, delta): offset a register by \p delta components.
 *
 * The visible returns convert the component count into bytes and defer to
 * byte_offset().  One path multiplies \p delta by reg.stride and the type
 * size.  The other first decodes reg.hstride (0 means stride 0, any other
 * value means 1 << (hstride - 1)) and uses that stride instead.  The
 * surviving comment describes a third case in which the offset is a no-op.
 * Control reaching the end hits unreachable("Invalid register file").
 *
 * NOTE(review): the return type, the branching on the register file and the
 * no-op return are missing from this listing, and the "®" in the parameter
 * list is a mis-decoded "&reg".  Restore from upstream before editing.
 */
104 horiz_offset(const fs_reg
®
, unsigned delta
)
110 /* These only have a single component that is implicitly splatted. A
111 * horizontal offset should be a harmless no-op.
112 * XXX - Handle vector immediates correctly.
118 return byte_offset(reg
, delta
* reg
.stride
* type_sz(reg
.type
));
124 const unsigned stride
= reg
.hstride
? 1 << (reg
.hstride
- 1) : 0;
125 return byte_offset(reg
, delta
* stride
* type_sz(reg
.type
));
128 unreachable("Invalid register file");
/*
 * offset(reg, width, delta): offset a register by \p delta logical
 * components at execution width \p width.
 *
 * The one visible statement advances the register by
 * delta * reg.component_size(width) bytes through byte_offset().
 *
 * NOTE(review): the return type, the branching around this return (the
 * embedded numbering jumps 132 -> 143) and any other paths are missing from
 * this listing -- confirm upstream which register files reach this return.
 */
132 offset(fs_reg reg
, unsigned width
, unsigned delta
)
143 return byte_offset(reg
, delta
* reg
.component_size(width
));
/*
 * NOTE(review): only the first statement of component() survives in this
 * listing: it replaces \p reg with horiz_offset(reg, idx), i.e. the register
 * advanced by \p idx components.  The return type and the rest of the body
 * (presumably whatever makes the channel replicate, and the return) are
 * missing -- restore from upstream.
 */
151 * Get the scalar channel of \p reg given by \p idx and replicate it to all
152 * channels of the result.
155 component(fs_reg reg
, unsigned idx
)
157 reg
= horiz_offset(reg
, idx
);
163 * Return an integer identifying the discrete address space a register is
164 * contained in. A register is by definition fully contained in the single
165 * reg_space it belongs to, so two registers with different reg_space ids are
166 * guaranteed not to overlap. Most register files are a single reg_space of
167 * its own, only the VGRF file is composed of multiple discrete address
168 * spaces, one for each VGRF allocation.
170 static inline uint32_t
171 reg_space(const fs_reg
&r
)
173 return r
.file
<< 16 | (r
.file
== VGRF
? r
.nr
: 0);
177 * Return the base offset in bytes of a register relative to the start of its
180 static inline unsigned
181 reg_offset(const fs_reg
&r
)
183 return (r
.file
== VGRF
|| r
.file
== IMM
? 0 : r
.nr
) *
184 (r
.file
== UNIFORM
? 4 : REG_SIZE
) + r
.offset
+
185 (r
.file
== ARF
|| r
.file
== FIXED_GRF
? r
.subnr
: 0);
189 * Return the amount of padding in bytes left unused between individual
190 * components of register \p r due to a (horizontal) stride value greater than
191 * one, or zero if components are tightly packed in the register file.
193 static inline unsigned
194 reg_padding(const fs_reg
&r
)
196 const unsigned stride
= ((r
.file
!= ARF
&& r
.file
!= FIXED_GRF
) ? r
.stride
:
198 1 << (r
.hstride
- 1));
199 return (MAX2(1, stride
) - 1) * type_sz(r
.type
);
203 * Return whether the register region starting at \p r and spanning \p dr
204 * bytes could potentially overlap the register region starting at \p s and
205 * spanning \p ds bytes.
208 regions_overlap(const fs_reg
&r
, unsigned dr
, const fs_reg
&s
, unsigned ds
)
210 if (r
.file
== MRF
&& (r
.nr
& BRW_MRF_COMPR4
)) {
212 t
.nr
&= ~BRW_MRF_COMPR4
;
213 /* COMPR4 regions are translated by the hardware during decompression
214 * into two separate half-regions 4 MRFs apart from each other.
216 return regions_overlap(t
, dr
/ 2, s
, ds
) ||
217 regions_overlap(byte_offset(t
, 4 * REG_SIZE
), dr
/ 2, s
, ds
);
219 } else if (s
.file
== MRF
&& (s
.nr
& BRW_MRF_COMPR4
)) {
220 return regions_overlap(s
, ds
, r
, dr
);
223 return reg_space(r
) == reg_space(s
) &&
224 !(reg_offset(r
) + dr
<= reg_offset(s
) ||
225 reg_offset(s
) + ds
<= reg_offset(r
));
230 * Check that the register region given by r [r.offset, r.offset + dr[
231 * is fully contained inside the register region given by s
232 * [s.offset, s.offset + ds[.
235 region_contained_in(const fs_reg
&r
, unsigned dr
, const fs_reg
&s
, unsigned ds
)
237 return reg_space(r
) == reg_space(s
) &&
238 reg_offset(r
) >= reg_offset(s
) &&
239 reg_offset(r
) + dr
<= reg_offset(s
) + ds
;
/*
 * NOTE(review): is_periodic() is incomplete in this listing.  Visible
 * structure: a branch for BAD_FILE or null registers (result missing), a
 * branch for IMM that derives a period from the immediate type (8 for UV and
 * V, 4 for VF, fallback missing) and returns n % period == 0, a branch for
 * ARF/FIXED_GRF that derives a period from hstride, vstride and width
 * (fallback missing) and returns n % period == 0, and a final return of
 * reg.stride == 0.  The return type, the missing values and the closing
 * braces must be restored from upstream; the "®" in the parameter list is a
 * mis-decoded "&reg".
 */
243 * Return whether the given register region is n-periodic, i.e. whether the
244 * original region remains invariant after shifting it by \p n scalar
248 is_periodic(const fs_reg
®
, unsigned n
)
250 if (reg
.file
== BAD_FILE
|| reg
.is_null()) {
253 } else if (reg
.file
== IMM
) {
254 const unsigned period
= (reg
.type
== BRW_REGISTER_TYPE_UV
||
255 reg
.type
== BRW_REGISTER_TYPE_V
? 8 :
256 reg
.type
== BRW_REGISTER_TYPE_VF
? 4 :
258 return n
% period
== 0;
260 } else if (reg
.file
== ARF
|| reg
.file
== FIXED_GRF
) {
261 const unsigned period
= (reg
.hstride
== 0 && reg
.vstride
== 0 ? 1 :
262 reg
.vstride
== 0 ? 1 << reg
.width
:
264 return n
% period
== 0;
267 return reg
.stride
== 0;
272 is_uniform(const fs_reg
®
)
274 return is_periodic(reg
, 1);
/*
 * NOTE(review): the visible statement returns \p reg advanced by 8 * idx
 * components through horiz_offset().  The return type and at least one body
 * line are missing from this listing (the embedded numbering jumps
 * 282 -> 285), and the "®" in the parameter list is a mis-decoded "&reg".
 * Restore from upstream before editing.
 */
278 * Get the specified 8-component quarter of a register.
279 * XXX - Maybe come up with a less misleading name for this (e.g. quarter())?
282 half(const fs_reg
®
, unsigned idx
)
285 return horiz_offset(reg
, 8 * idx
);
289 * Reinterpret each channel of register \p reg as a vector of values of the
290 * given smaller type and take the i-th subcomponent from each.
293 subscript(fs_reg reg
, brw_reg_type type
, unsigned i
)
295 assert((i
+ 1) * type_sz(type
) <= type_sz(reg
.type
));
297 if (reg
.file
== ARF
|| reg
.file
== FIXED_GRF
) {
298 /* The stride is encoded inconsistently for fixed GRF and ARF registers
299 * as the log2 of the actual vertical and horizontal strides.
301 const int delta
= _mesa_logbase2(type_sz(reg
.type
)) -
302 _mesa_logbase2(type_sz(type
));
303 reg
.hstride
+= (reg
.hstride
? delta
: 0);
304 reg
.vstride
+= (reg
.vstride
? delta
: 0);
306 } else if (reg
.file
== IMM
) {
307 assert(reg
.type
== type
);
310 reg
.stride
*= type_sz(reg
.type
) / type_sz(type
);
313 return byte_offset(retype(reg
, type
), i
* type_sz(type
));
/*
 * horiz_stride(reg, s): takes a register by value and an unsigned value.
 *
 * NOTE(review): only the parameter list survives in this listing; the
 * return type and the whole body are missing, so the behaviour cannot be
 * documented from here.  Restore from upstream.
 */
317 horiz_stride(fs_reg reg
, unsigned s
)
323 static const fs_reg reg_undef
;
/*
 * fs_inst: instruction class derived from backend_instruction.
 *
 * Declarations visible below: a copy-assignment operator and init() (no
 * definitions here); constructors taking an opcode and execution size with an
 * optional destination and either individual source registers or an array of
 * \c sources registers; a copy constructor; resize_sources(); and a set of
 * const queries (is_send_from_grf(), is_payload(), size_read(),
 * flags_read(), flags_written(), ...).  The visible data members are the
 * source count, the pi_noperspective bit-field and the scheduling info.
 *
 * NOTE(review): this listing has lost lines.  The constructor that starts at
 * listing line 337 has no closing parameter line, and access specifiers,
 * further members (e.g. the dst/src fields used by the helpers later in this
 * file), the comment delimiters and the closing brace of the class are not
 * shown.  Restore from upstream before editing.
 */
325 class fs_inst
: public backend_instruction
{
326 fs_inst
&operator=(const fs_inst
&);
328 void init(enum opcode opcode
, uint8_t exec_width
, const fs_reg
&dst
,
329 const fs_reg
*src
, unsigned sources
);
332 DECLARE_RALLOC_CXX_OPERATORS(fs_inst
)
335 fs_inst(enum opcode opcode
, uint8_t exec_size
);
336 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
);
337 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
339 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
340 const fs_reg
&src0
, const fs_reg
&src1
);
341 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
342 const fs_reg
&src0
, const fs_reg
&src1
, const fs_reg
&src2
);
343 fs_inst(enum opcode opcode
, uint8_t exec_size
, const fs_reg
&dst
,
344 const fs_reg src
[], unsigned sources
);
345 fs_inst(const fs_inst
&that
);
348 void resize_sources(uint8_t num_sources
);
350 bool is_send_from_grf() const;
351 bool is_payload(unsigned arg
) const;
352 bool is_partial_write() const;
353 bool is_copy_payload(const brw::simple_allocator
&grf_alloc
) const;
354 unsigned components_read(unsigned i
) const;
355 unsigned size_read(int arg
) const;
356 bool can_do_source_mods(const struct gen_device_info
*devinfo
) const;
358 bool can_change_types() const;
359 bool has_source_and_destination_hazard() const;
360 unsigned implied_mrf_writes() const;
363 * Return whether \p arg is a control source of a virtual instruction which
364 * shouldn't contribute to the execution type and usual regioning
365 * restriction calculations of arithmetic instructions.
367 bool is_control_source(unsigned arg
) const;
370 * Return the subset of flag registers read by the instruction as a bitset
371 * with byte granularity.
373 unsigned flags_read(const gen_device_info
*devinfo
) const;
376 * Return the subset of flag registers updated by the instruction (either
377 * partially or fully) as a bitset with byte granularity.
379 unsigned flags_written() const;
384 uint8_t sources
; /**< Number of fs_reg sources. */
387 bool pi_noperspective
:1; /**< Pixel interpolator noperspective flag */
389 tgl_swsb sched
; /**< Scheduling info. */
393 * Make the execution of \p inst dependent on the evaluation of a possibly
394 * inverted predicate.
396 static inline fs_inst
*
397 set_predicate_inv(enum brw_predicate pred
, bool inverse
,
400 inst
->predicate
= pred
;
401 inst
->predicate_inverse
= inverse
;
406 * Make the execution of \p inst dependent on the evaluation of a predicate.
408 static inline fs_inst
*
409 set_predicate(enum brw_predicate pred
, fs_inst
*inst
)
411 return set_predicate_inv(pred
, false, inst
);
415 * Write the result of evaluating the condition given by \p mod to a flag
418 static inline fs_inst
*
419 set_condmod(enum brw_conditional_mod mod
, fs_inst
*inst
)
421 inst
->conditional_mod
= mod
;
426 * Clamp the result of \p inst to the saturation range of its destination
429 static inline fs_inst
*
430 set_saturate(bool saturate
, fs_inst
*inst
)
432 inst
->saturate
= saturate
;
/*
 * NOTE(review): regs_written() is incomplete in this listing.  What is
 * visible: it asserts that the destination is neither a UNIFORM nor an IMM
 * register (so only the REG_SIZE unit applies here), then returns a
 * DIV_ROUND_UP() whose first argument starts with
 * reg_offset(inst->dst) % REG_SIZE and involves
 * MIN2(inst->size_written, reg_padding(inst->dst)).  The return type, one
 * operand line (the embedded numbering jumps 446 -> 448) and the divisor
 * argument are missing -- restore from upstream before editing.
 */
437 * Return the number of dataflow registers written by the instruction (either
438 * fully or partially) counted from 'floor(reg_offset(inst->dst) /
439 * register_size)'. The somewhat arbitrary register size unit is 4B for the
440 * UNIFORM and IMM files and 32B for all other files.
443 regs_written(const fs_inst
*inst
)
445 assert(inst
->dst
.file
!= UNIFORM
&& inst
->dst
.file
!= IMM
);
446 return DIV_ROUND_UP(reg_offset(inst
->dst
) % REG_SIZE
+
448 MIN2(inst
->size_written
, reg_padding(inst
->dst
)),
/*
 * NOTE(review): regs_read() is incomplete in this listing.  What is visible:
 * the register size unit is 4 bytes when source \p i is a UNIFORM or IMM
 * register and REG_SIZE otherwise, and the result is a DIV_ROUND_UP() whose
 * first argument starts with reg_offset(inst->src[i]) % reg_size and
 * involves MIN2(inst->size_read(i), reg_padding(inst->src[i])).  The return
 * type, one operand line (the embedded numbering jumps 463 -> 465) and the
 * divisor argument are missing -- restore from upstream before editing.
 */
453 * Return the number of dataflow registers read by the instruction (either
454 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
455 * register_size)'. The somewhat arbitrary register size unit is 4B for the
456 * UNIFORM and IMM files and 32B for all other files.
459 regs_read(const fs_inst
*inst
, unsigned i
)
461 const unsigned reg_size
=
462 inst
->src
[i
].file
== UNIFORM
|| inst
->src
[i
].file
== IMM
? 4 : REG_SIZE
;
463 return DIV_ROUND_UP(reg_offset(inst
->src
[i
]) % reg_size
+
465 MIN2(inst
->size_read(i
), reg_padding(inst
->src
[i
])),
469 static inline enum brw_reg_type
470 get_exec_type(const fs_inst
*inst
)
472 brw_reg_type exec_type
= BRW_REGISTER_TYPE_B
;
474 for (int i
= 0; i
< inst
->sources
; i
++) {
475 if (inst
->src
[i
].file
!= BAD_FILE
&&
476 !inst
->is_control_source(i
)) {
477 const brw_reg_type t
= get_exec_type(inst
->src
[i
].type
);
478 if (type_sz(t
) > type_sz(exec_type
))
480 else if (type_sz(t
) == type_sz(exec_type
) &&
481 brw_reg_type_is_floating_point(t
))
486 if (exec_type
== BRW_REGISTER_TYPE_B
)
487 exec_type
= inst
->dst
.type
;
489 assert(exec_type
!= BRW_REGISTER_TYPE_B
);
491 /* Promotion of the execution type to 32-bit for conversions from or to
492 * half-float seems to be consistent with the following text from the
493 * Cherryview PRM Vol. 7, "Execution Data Type":
495 * "When single precision and half precision floats are mixed between
496 * source operands or between source and destination operand [..] single
497 * precision float is the execution datatype."
499 * and from "Register Region Restrictions":
501 * "Conversion between Integer and HF (Half Float) must be DWord aligned
502 * and strided by a DWord on the destination."
504 if (type_sz(exec_type
) == 2 &&
505 inst
->dst
.type
!= exec_type
) {
506 if (exec_type
== BRW_REGISTER_TYPE_HF
)
507 exec_type
= BRW_REGISTER_TYPE_F
;
508 else if (inst
->dst
.type
== BRW_REGISTER_TYPE_HF
)
509 exec_type
= BRW_REGISTER_TYPE_D
;
515 static inline unsigned
516 get_exec_type_size(const fs_inst
*inst
)
518 return type_sz(get_exec_type(inst
));
522 is_send(const fs_inst
*inst
)
524 return inst
->mlen
|| inst
->is_send_from_grf();
528 * Return whether the instruction isn't an ALU instruction and cannot be
529 * assumed to complete in-order.
532 is_unordered(const fs_inst
*inst
)
534 return is_send(inst
) || inst
->is_math();
538 * Return whether the following regioning restriction applies to the specified
539 * instruction. From the Cherryview PRM Vol 7. "Register Region
542 * "When source or destination datatype is 64b or operation is integer DWord
543 * multiply, regioning in Align1 must follow these rules:
545 * 1. Source and Destination horizontal stride must be aligned to the same qword.
546 * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
547 * 3. Source and Destination offset must be the same, except the case of
551 has_dst_aligned_region_restriction(const gen_device_info
*devinfo
,
554 const brw_reg_type exec_type
= get_exec_type(inst
);
555 /* Even though the hardware spec claims that "integer DWord multiply"
556 * operations are restricted, empirical evidence and the behavior of the
557 * simulator suggest that only 32x32-bit integer multiplication is
560 const bool is_dword_multiply
= !brw_reg_type_is_floating_point(exec_type
) &&
561 ((inst
->opcode
== BRW_OPCODE_MUL
&&
562 MIN2(type_sz(inst
->src
[0].type
), type_sz(inst
->src
[1].type
)) >= 4) ||
563 (inst
->opcode
== BRW_OPCODE_MAD
&&
564 MIN2(type_sz(inst
->src
[1].type
), type_sz(inst
->src
[2].type
)) >= 4));
566 if (type_sz(inst
->dst
.type
) > 4 || type_sz(exec_type
) > 4 ||
567 (type_sz(exec_type
) == 4 && is_dword_multiply
))
568 return devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
);