/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
26 #include "brw_fs_builder.h"
31 /* From the SKL PRM Vol 2a, "Move":
33 * "A mov with the same source and destination type, no source modifier,
34 * and no saturation is a raw move. A packed byte destination region (B
35 * or UB type with HorzStride == 1 and ExecSize > 1) can only be written
39 is_byte_raw_mov(const fs_inst
*inst
)
41 return type_sz(inst
->dst
.type
) == 1 &&
42 inst
->opcode
== BRW_OPCODE_MOV
&&
43 inst
->src
[0].type
== inst
->dst
.type
&&
45 !inst
->src
[0].negate
&&
50 * Return an acceptable byte stride for the destination of an instruction
51 * that requires it to have some particular alignment.
54 required_dst_byte_stride(const fs_inst
*inst
)
56 if (inst
->dst
.is_accumulator()) {
57 /* If the destination is an accumulator, insist that we leave the
58 * stride alone. We cannot "fix" accumulator destinations by writing
59 * to a temporary and emitting a MOV into the original destination.
60 * For multiply instructions (our one use of the accumulator), the
61 * MUL writes the full 66 bits of the accumulator whereas the MOV we
62 * would emit only writes 33 bits and leaves the top 33 bits
65 * It's safe to just require the original stride here because the
66 * lowering pass will detect the mismatch in has_invalid_src_region
67 * and fix the sources of the multiply instead of the destination.
69 return inst
->dst
.stride
* type_sz(inst
->dst
.type
);
70 } else if (type_sz(inst
->dst
.type
) < get_exec_type_size(inst
) &&
71 !is_byte_raw_mov(inst
)) {
72 return get_exec_type_size(inst
);
74 /* Calculate the maximum byte stride and the minimum/maximum type
75 * size across all source and destination operands we are required to
78 unsigned max_stride
= inst
->dst
.stride
* type_sz(inst
->dst
.type
);
79 unsigned min_size
= type_sz(inst
->dst
.type
);
80 unsigned max_size
= type_sz(inst
->dst
.type
);
82 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
83 if (!is_uniform(inst
->src
[i
]) && !inst
->is_control_source(i
)) {
84 const unsigned size
= type_sz(inst
->src
[i
].type
);
85 max_stride
= MAX2(max_stride
, inst
->src
[i
].stride
* size
);
86 min_size
= MIN2(min_size
, size
);
87 max_size
= MAX2(max_size
, size
);
91 /* All operands involved in lowering need to fit in the calculated
94 assert(max_size
<= 4 * min_size
);
96 /* Attempt to use the largest byte stride among all present operands,
97 * but never exceed a stride of 4 since that would lead to illegal
98 * destination regions during lowering.
100 return MIN2(max_stride
, 4 * min_size
);
105 * Return an acceptable byte sub-register offset for the destination of an
106 * instruction that requires it to be aligned to the sub-register offset of
110 required_dst_byte_offset(const fs_inst
*inst
)
112 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
113 if (!is_uniform(inst
->src
[i
]) && !inst
->is_control_source(i
))
114 if (reg_offset(inst
->src
[i
]) % REG_SIZE
!=
115 reg_offset(inst
->dst
) % REG_SIZE
)
119 return reg_offset(inst
->dst
) % REG_SIZE
;
123 * Return whether the instruction has an unsupported channel bit layout
124 * specified for the i-th source region.
127 has_invalid_src_region(const gen_device_info
*devinfo
, const fs_inst
*inst
,
130 if (is_unordered(inst
) || inst
->is_control_source(i
))
133 /* Empirical testing shows that Broadwell has a bug affecting half-float
134 * MAD instructions when any of its sources has a non-zero offset, such
137 * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
139 * We used to generate code like this for SIMD8 executions where we
140 * used to pack components Y and W of a vector at offset 16B of a SIMD
141 * register. The problem doesn't occur if the stride of the source is 0.
143 if (devinfo
->gen
== 8 &&
144 inst
->opcode
== BRW_OPCODE_MAD
&&
145 inst
->src
[i
].type
== BRW_REGISTER_TYPE_HF
&&
146 reg_offset(inst
->src
[i
]) % REG_SIZE
> 0 &&
147 inst
->src
[i
].stride
!= 0) {
151 const unsigned dst_byte_stride
= inst
->dst
.stride
* type_sz(inst
->dst
.type
);
152 const unsigned src_byte_stride
= inst
->src
[i
].stride
*
153 type_sz(inst
->src
[i
].type
);
154 const unsigned dst_byte_offset
= reg_offset(inst
->dst
) % REG_SIZE
;
155 const unsigned src_byte_offset
= reg_offset(inst
->src
[i
]) % REG_SIZE
;
157 return has_dst_aligned_region_restriction(devinfo
, inst
) &&
158 !is_uniform(inst
->src
[i
]) &&
159 (src_byte_stride
!= dst_byte_stride
||
160 src_byte_offset
!= dst_byte_offset
);
164 * Return whether the instruction has an unsupported channel bit layout
165 * specified for the destination region.
168 has_invalid_dst_region(const gen_device_info
*devinfo
,
171 if (is_unordered(inst
)) {
174 const brw_reg_type exec_type
= get_exec_type(inst
);
175 const unsigned dst_byte_offset
= reg_offset(inst
->dst
) % REG_SIZE
;
176 const unsigned dst_byte_stride
= inst
->dst
.stride
* type_sz(inst
->dst
.type
);
177 const bool is_narrowing_conversion
= !is_byte_raw_mov(inst
) &&
178 type_sz(inst
->dst
.type
) < type_sz(exec_type
);
180 return (has_dst_aligned_region_restriction(devinfo
, inst
) &&
181 (required_dst_byte_stride(inst
) != dst_byte_stride
||
182 required_dst_byte_offset(inst
) != dst_byte_offset
)) ||
183 (is_narrowing_conversion
&&
184 required_dst_byte_stride(inst
) != dst_byte_stride
);
189 * Return whether the instruction has unsupported source modifiers
190 * specified for the i-th source region.
193 has_invalid_src_modifiers(const gen_device_info
*devinfo
, const fs_inst
*inst
,
196 return !inst
->can_do_source_mods(devinfo
) &&
197 (inst
->src
[i
].negate
|| inst
->src
[i
].abs
);
201 * Return whether the instruction has an unsupported type conversion
202 * specified for the destination.
205 has_invalid_conversion(const gen_device_info
*devinfo
, const fs_inst
*inst
)
207 switch (inst
->opcode
) {
211 return inst
->dst
.type
!= get_exec_type(inst
);
212 case SHADER_OPCODE_BROADCAST
:
213 case SHADER_OPCODE_MOV_INDIRECT
:
214 /* The source and destination types of these may be hard-coded to
215 * integer at codegen time due to hardware limitations of 64-bit
218 return ((devinfo
->gen
== 7 && !devinfo
->is_haswell
) ||
219 devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
)) &&
220 type_sz(inst
->src
[0].type
) > 4 &&
221 inst
->dst
.type
!= inst
->src
[0].type
;
223 /* FIXME: We assume the opcodes don't explicitly mentioned before
224 * just work fine with arbitrary conversions.
231 * Return whether the instruction has non-standard semantics for the
232 * conditional mod which don't cause the flag register to be updated with
233 * the comparison result.
236 has_inconsistent_cmod(const fs_inst
*inst
)
238 return inst
->opcode
== BRW_OPCODE_SEL
||
239 inst
->opcode
== BRW_OPCODE_CSEL
||
240 inst
->opcode
== BRW_OPCODE_IF
||
241 inst
->opcode
== BRW_OPCODE_WHILE
;
245 lower_instruction(fs_visitor
*v
, bblock_t
*block
, fs_inst
*inst
);
250 * Remove any modifiers from the \p i-th source region of the instruction,
251 * including negate, abs and any implicit type conversion to the execution
252 * type. Instead any source modifiers will be implemented as a separate
253 * MOV instruction prior to the original instruction.
256 lower_src_modifiers(fs_visitor
*v
, bblock_t
*block
, fs_inst
*inst
, unsigned i
)
258 assert(inst
->components_read(i
) == 1);
259 const fs_builder
ibld(v
, block
, inst
);
260 const fs_reg tmp
= ibld
.vgrf(get_exec_type(inst
));
262 lower_instruction(v
, block
, ibld
.MOV(tmp
, inst
->src
[i
]));
271 * Remove any modifiers from the destination region of the instruction,
272 * including saturate, conditional mod and any implicit type conversion
273 * from the execution type. Instead any destination modifiers will be
274 * implemented as a separate MOV instruction after the original
278 lower_dst_modifiers(fs_visitor
*v
, bblock_t
*block
, fs_inst
*inst
)
280 const fs_builder
ibld(v
, block
, inst
);
281 const brw_reg_type type
= get_exec_type(inst
);
282 /* Not strictly necessary, but if possible use a temporary with the same
283 * channel alignment as the current destination in order to avoid
284 * violating the restrictions enforced later on by lower_src_region()
285 * and lower_dst_region(), which would introduce additional copy
286 * instructions into the program unnecessarily.
288 const unsigned stride
=
289 type_sz(inst
->dst
.type
) * inst
->dst
.stride
<= type_sz(type
) ? 1 :
290 type_sz(inst
->dst
.type
) * inst
->dst
.stride
/ type_sz(type
);
291 fs_reg tmp
= ibld
.vgrf(type
, stride
);
293 tmp
= horiz_stride(tmp
, stride
);
295 /* Emit a MOV taking care of all the destination modifiers. */
296 fs_inst
*mov
= ibld
.at(block
, inst
->next
).MOV(inst
->dst
, tmp
);
297 mov
->saturate
= inst
->saturate
;
298 if (!has_inconsistent_cmod(inst
))
299 mov
->conditional_mod
= inst
->conditional_mod
;
300 if (inst
->opcode
!= BRW_OPCODE_SEL
) {
301 mov
->predicate
= inst
->predicate
;
302 mov
->predicate_inverse
= inst
->predicate_inverse
;
304 mov
->flag_subreg
= inst
->flag_subreg
;
305 lower_instruction(v
, block
, mov
);
307 /* Point the original instruction at the temporary, and clean up any
308 * destination modifiers.
310 assert(inst
->size_written
== inst
->dst
.component_size(inst
->exec_size
));
312 inst
->size_written
= inst
->dst
.component_size(inst
->exec_size
);
313 inst
->saturate
= false;
314 if (!has_inconsistent_cmod(inst
))
315 inst
->conditional_mod
= BRW_CONDITIONAL_NONE
;
317 assert(!inst
->flags_written() || !mov
->predicate
);
322 * Remove any non-trivial shuffling of data from the \p i-th source region
323 * of the instruction. Instead implement the region as a series of integer
324 * copies into a temporary with the same channel layout as the destination.
327 lower_src_region(fs_visitor
*v
, bblock_t
*block
, fs_inst
*inst
, unsigned i
)
329 assert(inst
->components_read(i
) == 1);
330 const fs_builder
ibld(v
, block
, inst
);
331 const unsigned stride
= type_sz(inst
->dst
.type
) * inst
->dst
.stride
/
332 type_sz(inst
->src
[i
].type
);
334 fs_reg tmp
= ibld
.vgrf(inst
->src
[i
].type
, stride
);
336 tmp
= horiz_stride(tmp
, stride
);
338 /* Emit a series of 32-bit integer copies with any source modifiers
339 * cleaned up (because their semantics are dependent on the type).
341 const brw_reg_type raw_type
= brw_int_type(MIN2(type_sz(tmp
.type
), 4),
343 const unsigned n
= type_sz(tmp
.type
) / type_sz(raw_type
);
344 fs_reg raw_src
= inst
->src
[i
];
345 raw_src
.negate
= false;
348 for (unsigned j
= 0; j
< n
; j
++)
349 ibld
.MOV(subscript(tmp
, raw_type
, j
), subscript(raw_src
, raw_type
, j
));
351 /* Point the original instruction at the temporary, making sure to keep
352 * any source modifiers in the instruction.
354 fs_reg lower_src
= tmp
;
355 lower_src
.negate
= inst
->src
[i
].negate
;
356 lower_src
.abs
= inst
->src
[i
].abs
;
357 inst
->src
[i
] = lower_src
;
363 * Remove any non-trivial shuffling of data from the destination region of
364 * the instruction. Instead implement the region as a series of integer
365 * copies from a temporary with a channel layout compatible with the
369 lower_dst_region(fs_visitor
*v
, bblock_t
*block
, fs_inst
*inst
)
371 /* We cannot replace the result of an integer multiply which writes the
372 * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
373 * value whereas the MOV will act on only 32 or 33 bits of the
376 assert(inst
->opcode
!= BRW_OPCODE_MUL
|| !inst
->dst
.is_accumulator() ||
377 brw_reg_type_is_floating_point(inst
->dst
.type
));
379 const fs_builder
ibld(v
, block
, inst
);
380 const unsigned stride
= required_dst_byte_stride(inst
) /
381 type_sz(inst
->dst
.type
);
383 fs_reg tmp
= ibld
.vgrf(inst
->dst
.type
, stride
);
385 tmp
= horiz_stride(tmp
, stride
);
387 /* Emit a series of 32-bit integer copies from the temporary into the
388 * original destination.
390 const brw_reg_type raw_type
= brw_int_type(MIN2(type_sz(tmp
.type
), 4),
392 const unsigned n
= type_sz(tmp
.type
) / type_sz(raw_type
);
394 if (inst
->predicate
&& inst
->opcode
!= BRW_OPCODE_SEL
) {
395 /* Note that in general we cannot simply predicate the copies on the
396 * same flag register as the original instruction, since it may have
397 * been overwritten by the instruction itself. Instead initialize
398 * the temporary with the previous contents of the destination
401 for (unsigned j
= 0; j
< n
; j
++)
402 ibld
.MOV(subscript(tmp
, raw_type
, j
),
403 subscript(inst
->dst
, raw_type
, j
));
406 for (unsigned j
= 0; j
< n
; j
++)
407 ibld
.at(block
, inst
->next
).MOV(subscript(inst
->dst
, raw_type
, j
),
408 subscript(tmp
, raw_type
, j
));
410 /* Point the original instruction at the temporary, making sure to keep
411 * any destination modifiers in the instruction.
413 assert(inst
->size_written
== inst
->dst
.component_size(inst
->exec_size
));
415 inst
->size_written
= inst
->dst
.component_size(inst
->exec_size
);
421 * Legalize the source and destination regioning controls of the specified
425 lower_instruction(fs_visitor
*v
, bblock_t
*block
, fs_inst
*inst
)
427 const gen_device_info
*devinfo
= v
->devinfo
;
428 bool progress
= false;
430 if (has_invalid_conversion(devinfo
, inst
))
431 progress
|= lower_dst_modifiers(v
, block
, inst
);
433 if (has_invalid_dst_region(devinfo
, inst
))
434 progress
|= lower_dst_region(v
, block
, inst
);
436 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
437 if (has_invalid_src_modifiers(devinfo
, inst
, i
))
438 progress
|= lower_src_modifiers(v
, block
, inst
, i
);
440 if (has_invalid_src_region(devinfo
, inst
, i
))
441 progress
|= lower_src_region(v
, block
, inst
, i
);
449 fs_visitor::lower_regioning()
451 bool progress
= false;
453 foreach_block_and_inst_safe(block
, fs_inst
, inst
, cfg
)
454 progress
|= lower_instruction(this, block
, inst
);
457 invalidate_live_intervals();