intel/fs: Introduce regioning lowering pass.
[mesa.git] src/intel/compiler/brw_fs_lower_regioning.cpp
/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

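   /* Illustration (hypothetical IR, not from the PRM): a packed byte MOV
    * like
    *
    *   mov(16) dst<1>:UB src<8,8,1>:UB
    *
    * qualifies as a raw move, while adding saturate, a source modifier or a
    * type conversion (e.g. a :W source) makes it non-raw and therefore
    * subject to the packed byte destination restriction quoted above.
    */
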
   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
          !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         unsigned stride = inst->dst.stride * type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]))
               stride = MAX2(stride, inst->src[i].stride *
                             type_sz(inst->src[i].type));
         }

         return stride;
      }
   }

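   /* Worked example (hypothetical IR, for illustration only): for a
    * narrowing conversion like
    *
    *   mov(8) dst<1>:B src<8,8,1>:W
    *
    * the execution type is W (2 B) while the destination type is B (1 B),
    * so the required destination byte stride is 2.  In the non-narrowing
    * case the result is instead the maximum byte stride among the
    * destination and the non-uniform sources, e.g. 8 for an add(8) with a
    * packed :D destination and a <2>-strided :D source.
    */
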
   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]))
            if (reg_offset(inst->src[i]) % REG_SIZE !=
                reg_offset(inst->dst) % REG_SIZE)
               return 0;
      }

      return reg_offset(inst->dst) % REG_SIZE;
   }

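   /* E.g. (illustration, assuming 32 B GRFs): if every non-uniform source
    * starts at byte 16 of its GRF and the destination also starts at byte
    * 16, the current offset of 16 is acceptable; if any source disagrees
    * with the destination, the only safe choice is a register-aligned
    * offset of 0.
    */
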
   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      if (is_unordered(inst)) {
         return false;
      } else {
         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
         const unsigned src_byte_stride = inst->src[i].stride *
            type_sz(inst->src[i].type);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
         const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;

         return has_dst_aligned_region_restriction(devinfo, inst) &&
                !is_uniform(inst->src[i]) &&
                (src_byte_stride != dst_byte_stride ||
                 src_byte_offset != dst_byte_offset);
      }
   }

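   /* For instance (sketch, assuming a platform where
    * has_dst_aligned_region_restriction() is true): in
    *
    *   add(8) dst<1>:HF src0<8,8,1>:HF src1<16,8,2>:HF
    *
    * src1 walks the register at a byte stride of 4 while the destination
    * walks it at a byte stride of 2, so src1 would be flagged as invalid
    * and repacked through a temporary by lower_src_region() below.
    */
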
   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const gen_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_unordered(inst)) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != dst_byte_stride ||
                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != dst_byte_stride);
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
                             unsigned i)
   {
      return !inst->can_do_source_mods(devinfo) &&
             (inst->src[i].negate || inst->src[i].abs);
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      case SHADER_OPCODE_BROADCAST:
      case SHADER_OPCODE_MOV_INDIRECT:
         /* The source and destination types of these may be hard-coded to
          * integer at codegen time due to hardware limitations of 64-bit
          * types.
          */
         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
                 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
                type_sz(inst->src[0].type) > 4 &&
                inst->dst.type != inst->src[0].type;
      default:
         /* FIXME: We assume that any opcodes not explicitly mentioned above
          * just work fine with arbitrary conversions.
          */
         return false;
      }
   }

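   /* Illustration (hypothetical IR): on IVB, CHV and gen9lp parts a
    * BROADCAST with a 64-bit source and a converting destination, e.g.
    * :Q source and :D destination, is flagged here, since codegen may
    * retype the 64-bit operands to integer and silently drop the implicit
    * conversion; the conversion is peeled off into a separate MOV by
    * lower_dst_modifiers().
    */
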
   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

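   /* For example, the conditional mod of SEL selects between the sources
    * (e.g. sel.ge implements a maximum) instead of writing a comparison
    * result to the flag register, so it has to stay on the original
    * instruction rather than be moved to the trailing MOV emitted by
    * lower_dst_modifiers().
    */
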
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

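/* Before/after sketch (hypothetical IR, for illustration): an instruction
 * that cannot take a negate on src0 is rewritten from
 *
 *   op(8) dst:F -src0:F src1:F
 *
 * into
 *
 *   mov(8) tmp:F -src0:F
 *   op(8)  dst:F  tmp:F  src1:F
 *
 * where the new MOV, which does support source modifiers, absorbs the
 * negate along with any implicit conversion to the execution type.
 */
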
namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written() || !mov->predicate);
      return true;
   }

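   /* Before/after sketch (hypothetical IR): a saturating SEL with an
    * implicit F -> W conversion, e.g.
    *
    *   (+f0) sel.sat(8) dst:W src0:F src1:F
    *
    * has an invalid conversion for SEL and is split into
    *
    *   (+f0) sel(8) tmp:F src0:F src1:F
    *         mov.sat(8) dst:W tmp:F
    *
    * where the trailing MOV performs the conversion and the saturate, while
    * the predicate stays on the SEL since that is what selects its result.
    */
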
   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
         type_sz(inst->src[i].type);
      assert(stride > 0);
      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
                                      stride);

      /* Emit a series of integer copies of at most 32 bits each, with any
       * source modifiers cleaned up (because their semantics are dependent on
       * the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

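   /* Sketch of the rewrite (hypothetical IR): continuing the example from
    * has_invalid_src_region(), the mismatched region of
    *
    *   add(8) dst<1>:HF src0<8,8,1>:HF src1<16,8,2>:HF
    *
    * is repacked through a temporary with the destination's layout:
    *
    *   mov(8) tmp<1>:UW src1<16,8,2>:UW
    *   add(8) dst<1>:HF src0<8,8,1>:HF tmp<8,8,1>:HF
    *
    * using raw UW copies so no type conversion happens along the way.
    */
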
   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
         type_sz(inst->dst.type);
      assert(stride > 0);
      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
                                      stride);

      /* Emit a series of integer copies of at most 32 bits each from the
       * temporary into the original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

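   /* Sketch of the rewrite (hypothetical IR): the narrowing conversion
    *
    *   mov(8) dst<1>:B src<8,8,1>:D
    *
    * needs a destination byte stride of 4 (the execution type size), so it
    * is rewritten to go through a strided temporary:
    *
    *   mov(8) tmp<4>:B src<8,8,1>:D
    *   mov(8) dst<1>:UB tmp<32,8,4>:UB
    *
    * where the final copy is a raw UB move and therefore exempt from the
    * packed byte destination restriction.
    */
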
   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const gen_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_conversion(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      return progress;
   }
}

bool
fs_visitor::lower_regioning()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   if (progress)
      invalidate_live_intervals();

   return progress;
}