intel/fs: Don't touch accumulator destination while applying regioning alignment...
[mesa.git] / src / intel / compiler / brw_fs_lower_regioning.cpp
1 /*
2 * Copyright © 2018 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_fs.h"
25 #include "brw_cfg.h"
26 #include "brw_fs_builder.h"
27
28 using namespace brw;
29
30 namespace {
31 /* From the SKL PRM Vol 2a, "Move":
32 *
33 * "A mov with the same source and destination type, no source modifier,
34 * and no saturation is a raw move. A packed byte destination region (B
35 * or UB type with HorzStride == 1 and ExecSize > 1) can only be written
36 * using raw move."
37 */
38 bool
39 is_byte_raw_mov(const fs_inst *inst)
40 {
41 return type_sz(inst->dst.type) == 1 &&
42 inst->opcode == BRW_OPCODE_MOV &&
43 inst->src[0].type == inst->dst.type &&
44 !inst->saturate &&
45 !inst->src[0].negate &&
46 !inst->src[0].abs;
47 }
48
49 /*
50 * Return an acceptable byte stride for the destination of an instruction
51 * that requires it to have some particular alignment.
52 */
53 unsigned
54 required_dst_byte_stride(const fs_inst *inst)
55 {
56 if (inst->dst.is_accumulator()) {
57 /* If the destination is an accumulator, insist that we leave the
58 * stride alone. We cannot "fix" accumulator destinations by writing
59 * to a temporary and emitting a MOV into the original destination.
60 * For multiply instructions (our one use of the accumulator), the
61 * MUL writes the full 66 bits of the accumulator whereas the MOV we
62 * would emit only writes 33 bits and leaves the top 33 bits
63 * undefined.
64 *
65 * It's safe to just require the original stride here because the
66 * lowering pass will detect the mismatch in has_invalid_src_region
67 * and fix the sources of the multiply instead of the destination.
68 */
69 return inst->dst.stride * type_sz(inst->dst.type);
70 } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
71 !is_byte_raw_mov(inst)) {
72 return get_exec_type_size(inst);
73 } else {
74 unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
75
76 for (unsigned i = 0; i < inst->sources; i++) {
77 if (!is_uniform(inst->src[i]))
78 stride = MAX2(stride, inst->src[i].stride *
79 type_sz(inst->src[i].type));
80 }
81
82 return stride;
83 }
84 }
85
86 /*
87 * Return an acceptable byte sub-register offset for the destination of an
88 * instruction that requires it to be aligned to the sub-register offset of
89 * the sources.
90 */
91 unsigned
92 required_dst_byte_offset(const fs_inst *inst)
93 {
94 for (unsigned i = 0; i < inst->sources; i++) {
95 if (!is_uniform(inst->src[i]))
96 if (reg_offset(inst->src[i]) % REG_SIZE !=
97 reg_offset(inst->dst) % REG_SIZE)
98 return 0;
99 }
100
101 return reg_offset(inst->dst) % REG_SIZE;
102 }
103
104 /*
105 * Return whether the instruction has an unsupported channel bit layout
106 * specified for the i-th source region.
107 */
108 bool
109 has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
110 unsigned i)
111 {
112 if (is_unordered(inst)) {
113 return false;
114 } else {
115 const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
116 const unsigned src_byte_stride = inst->src[i].stride *
117 type_sz(inst->src[i].type);
118 const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
119 const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
120
121 return has_dst_aligned_region_restriction(devinfo, inst) &&
122 !is_uniform(inst->src[i]) &&
123 (src_byte_stride != dst_byte_stride ||
124 src_byte_offset != dst_byte_offset);
125 }
126 }
127
128 /*
129 * Return whether the instruction has an unsupported channel bit layout
130 * specified for the destination region.
131 */
132 bool
133 has_invalid_dst_region(const gen_device_info *devinfo,
134 const fs_inst *inst)
135 {
136 if (is_unordered(inst)) {
137 return false;
138 } else {
139 const brw_reg_type exec_type = get_exec_type(inst);
140 const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
141 const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
142 const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
143 type_sz(inst->dst.type) < type_sz(exec_type);
144
145 return (has_dst_aligned_region_restriction(devinfo, inst) &&
146 (required_dst_byte_stride(inst) != dst_byte_stride ||
147 required_dst_byte_offset(inst) != dst_byte_offset)) ||
148 (is_narrowing_conversion &&
149 required_dst_byte_stride(inst) != dst_byte_stride);
150 }
151 }
152
153 /*
154 * Return whether the instruction has unsupported source modifiers
155 * specified for the i-th source region.
156 */
157 bool
158 has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
159 unsigned i)
160 {
161 return !inst->can_do_source_mods(devinfo) &&
162 (inst->src[i].negate || inst->src[i].abs);
163 }
164
165 /*
166 * Return whether the instruction has an unsupported type conversion
167 * specified for the destination.
168 */
169 bool
170 has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
171 {
172 switch (inst->opcode) {
173 case BRW_OPCODE_MOV:
174 return false;
175 case BRW_OPCODE_SEL:
176 return inst->dst.type != get_exec_type(inst);
177 case SHADER_OPCODE_BROADCAST:
178 case SHADER_OPCODE_MOV_INDIRECT:
179 /* The source and destination types of these may be hard-coded to
180 * integer at codegen time due to hardware limitations of 64-bit
181 * types.
182 */
183 return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
184 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
185 type_sz(inst->src[0].type) > 4 &&
186 inst->dst.type != inst->src[0].type;
187 default:
188 /* FIXME: We assume the opcodes don't explicitly mentioned before
189 * just work fine with arbitrary conversions.
190 */
191 return false;
192 }
193 }
194
195 /**
196 * Return whether the instruction has non-standard semantics for the
197 * conditional mod which don't cause the flag register to be updated with
198 * the comparison result.
199 */
200 bool
201 has_inconsistent_cmod(const fs_inst *inst)
202 {
203 return inst->opcode == BRW_OPCODE_SEL ||
204 inst->opcode == BRW_OPCODE_CSEL ||
205 inst->opcode == BRW_OPCODE_IF ||
206 inst->opcode == BRW_OPCODE_WHILE;
207 }
208
209 bool
210 lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
211 }
212
213 namespace brw {
214 /**
215 * Remove any modifiers from the \p i-th source region of the instruction,
216 * including negate, abs and any implicit type conversion to the execution
217 * type. Instead any source modifiers will be implemented as a separate
218 * MOV instruction prior to the original instruction.
219 */
220 bool
221 lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
222 {
223 assert(inst->components_read(i) == 1);
224 const fs_builder ibld(v, block, inst);
225 const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
226
227 lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
228 inst->src[i] = tmp;
229
230 return true;
231 }
232 }
233
234 namespace {
235 /**
236 * Remove any modifiers from the destination region of the instruction,
237 * including saturate, conditional mod and any implicit type conversion
238 * from the execution type. Instead any destination modifiers will be
239 * implemented as a separate MOV instruction after the original
240 * instruction.
241 */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);

      /* Emit a MOV taking care of all the destination modifiers.  It is
       * inserted *after* the original instruction (at inst->next) so it can
       * apply saturation, conditional mod and the type conversion to the
       * result.  The MOV may itself need lowering, hence the recursive call
       * below.
       */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      /* Only transfer the cmod when it has standard flag-write semantics;
       * SEL/CSEL/IF/WHILE use it differently (see has_inconsistent_cmod).
       */
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      /* NOTE(review): for SEL the predicate stays on the original
       * instruction only — presumably because SEL consumes it to choose
       * between its sources rather than to mask the write.
       */
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers (they are now handled by the trailing MOV).
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      /* A predicated MOV reading flags the original instruction writes
       * would see clobbered values — assert that cannot happen.
       */
      assert(!inst->flags_written() || !mov->predicate);
      return true;
   }
283
284 /**
285 * Remove any non-trivial shuffling of data from the \p i-th source region
286 * of the instruction. Instead implement the region as a series of integer
287 * copies into a temporary with the same channel layout as the destination.
288 */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      /* Allocate a temporary whose channel layout matches the destination's
       * byte stride when expressed in units of the source type.
       */
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
                                      stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       * Types wider than 4 bytes are copied in n sub-dword chunks via
       * subscript().
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      /* The copies are emitted before the original instruction (ibld's
       * insertion point), so tmp is fully written when inst reads it.
       */
      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction (the raw copies above
       * deliberately dropped them).
       */
      fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }
323
324 /**
325 * Remove any non-trivial shuffling of data from the destination region of
326 * the instruction. Instead implement the region as a series of integer
327 * copies from a temporary with a channel layout compatible with the
328 * sources.
329 */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      /* Allocate a temporary with the required (legal) destination stride,
       * expressed in units of the destination type.
       */
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
                                      stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.  Types wider than 4 bytes are copied in n
       * sub-dword chunks via subscript().
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      /* Copy the result back into the original destination, after the
       * original instruction (at inst->next).
       */
      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }
380
381 /**
382 * Legalize the source and destination regioning controls of the specified
383 * instruction.
384 */
385 bool
386 lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
387 {
388 const gen_device_info *devinfo = v->devinfo;
389 bool progress = false;
390
391 if (has_invalid_conversion(devinfo, inst))
392 progress |= lower_dst_modifiers(v, block, inst);
393
394 if (has_invalid_dst_region(devinfo, inst))
395 progress |= lower_dst_region(v, block, inst);
396
397 for (unsigned i = 0; i < inst->sources; i++) {
398 if (has_invalid_src_modifiers(devinfo, inst, i))
399 progress |= lower_src_modifiers(v, block, inst, i);
400
401 if (has_invalid_src_region(devinfo, inst, i))
402 progress |= lower_src_region(v, block, inst, i);
403 }
404
405 return progress;
406 }
407 }
408
/* Legalize the regioning controls of every instruction in the program.
 * Returns whether any instruction was changed.
 */
bool
fs_visitor::lower_regioning()
{
   bool progress = false;

   /* The _safe iterator is required since lowering inserts new MOVs before
    * and after the instruction being processed.
    */
   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   /* Newly allocated temporaries invalidate the liveness information. */
   if (progress)
      invalidate_live_intervals();

   return progress;
}