i965/vec4: Use swizzle() to swizzle immediates during constant propagation.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_copy_propagation.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * @file brw_vec4_copy_propagation.cpp
26 *
27 * Implements tracking of values copied between registers, and
28 * optimizations based on that: copy propagation and constant
29 * propagation.
30 */
31
32 #include "brw_vec4.h"
33 #include "brw_cfg.h"
34 #include "brw_eu.h"
35
36 namespace brw {
37
38 struct copy_entry {
39 src_reg *value[4];
40 int saturatemask;
41 };
42
43 static bool
44 is_direct_copy(vec4_instruction *inst)
45 {
46 return (inst->opcode == BRW_OPCODE_MOV &&
47 !inst->predicate &&
48 inst->dst.file == VGRF &&
49 !inst->dst.reladdr &&
50 !inst->src[0].reladdr &&
51 (inst->dst.type == inst->src[0].type ||
52 (inst->dst.type == BRW_REGISTER_TYPE_F &&
53 inst->src[0].type == BRW_REGISTER_TYPE_VF)));
54 }
55
56 static bool
57 is_dominated_by_previous_instruction(vec4_instruction *inst)
58 {
59 return (inst->opcode != BRW_OPCODE_DO &&
60 inst->opcode != BRW_OPCODE_WHILE &&
61 inst->opcode != BRW_OPCODE_ELSE &&
62 inst->opcode != BRW_OPCODE_ENDIF);
63 }
64
65 static bool
66 is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
67 {
68 const src_reg *src = values[ch];
69
70 /* consider GRF only */
71 assert(inst->dst.file == VGRF);
72 if (!src || src->file != VGRF)
73 return false;
74
75 return (src->in_range(inst->dst, inst->regs_written) &&
76 inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
77 }
78
79 static bool
80 is_logic_op(enum opcode opcode)
81 {
82 return (opcode == BRW_OPCODE_AND ||
83 opcode == BRW_OPCODE_OR ||
84 opcode == BRW_OPCODE_XOR ||
85 opcode == BRW_OPCODE_NOT);
86 }
87
88 static bool
89 try_constant_propagate(const struct brw_device_info *devinfo,
90 vec4_instruction *inst,
91 int arg, struct copy_entry *entry)
92 {
93 /* For constant propagation, we only handle the same constant
94 * across all 4 channels. Some day, we should handle the 8-bit
95 * float vector format, which would let us constant propagate
96 * vectors better.
97 */
98 src_reg value = *entry->value[0];
99 for (int i = 1; i < 4; i++) {
100 if (!value.equals(*entry->value[i]))
101 return false;
102 }
103
104 if (value.file != IMM)
105 return false;
106
107 if (value.type == BRW_REGISTER_TYPE_VF) {
108 /* The result of bit-casting the component values of a vector float
109 * cannot in general be represented as an immediate.
110 */
111 if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
112 return false;
113 } else {
114 value.type = inst->src[arg].type;
115 }
116
117 if (inst->src[arg].abs) {
118 if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
119 !brw_abs_immediate(value.type, &value.as_brw_reg())) {
120 return false;
121 }
122 }
123
124 if (inst->src[arg].negate) {
125 if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
126 !brw_negate_immediate(value.type, &value.as_brw_reg())) {
127 return false;
128 }
129 }
130
131 value = swizzle(value, inst->src[arg].swizzle);
132
133 switch (inst->opcode) {
134 case BRW_OPCODE_MOV:
135 case SHADER_OPCODE_BROADCAST:
136 inst->src[arg] = value;
137 return true;
138
139 case SHADER_OPCODE_POW:
140 case SHADER_OPCODE_INT_QUOTIENT:
141 case SHADER_OPCODE_INT_REMAINDER:
142 if (devinfo->gen < 8)
143 break;
144 /* fallthrough */
145 case BRW_OPCODE_DP2:
146 case BRW_OPCODE_DP3:
147 case BRW_OPCODE_DP4:
148 case BRW_OPCODE_DPH:
149 case BRW_OPCODE_BFI1:
150 case BRW_OPCODE_ASR:
151 case BRW_OPCODE_SHL:
152 case BRW_OPCODE_SHR:
153 case BRW_OPCODE_SUBB:
154 if (arg == 1) {
155 inst->src[arg] = value;
156 return true;
157 }
158 break;
159
160 case BRW_OPCODE_MACH:
161 case BRW_OPCODE_MUL:
162 case SHADER_OPCODE_MULH:
163 case BRW_OPCODE_ADD:
164 case BRW_OPCODE_OR:
165 case BRW_OPCODE_AND:
166 case BRW_OPCODE_XOR:
167 case BRW_OPCODE_ADDC:
168 if (arg == 1) {
169 inst->src[arg] = value;
170 return true;
171 } else if (arg == 0 && inst->src[1].file != IMM) {
172 /* Fit this constant in by commuting the operands. Exception: we
173 * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
174 */
175 if ((inst->opcode == BRW_OPCODE_MUL ||
176 inst->opcode == BRW_OPCODE_MACH) &&
177 (inst->src[1].type == BRW_REGISTER_TYPE_D ||
178 inst->src[1].type == BRW_REGISTER_TYPE_UD))
179 break;
180 inst->src[0] = inst->src[1];
181 inst->src[1] = value;
182 return true;
183 }
184 break;
185 case GS_OPCODE_SET_WRITE_OFFSET:
186 /* This is just a multiply by a constant with special strides.
187 * The generator will handle immediates in both arguments (generating
188 * a single MOV of the product). So feel free to propagate in src0.
189 */
190 inst->src[arg] = value;
191 return true;
192
193 case BRW_OPCODE_CMP:
194 if (arg == 1) {
195 inst->src[arg] = value;
196 return true;
197 } else if (arg == 0 && inst->src[1].file != IMM) {
198 enum brw_conditional_mod new_cmod;
199
200 new_cmod = brw_swap_cmod(inst->conditional_mod);
201 if (new_cmod != BRW_CONDITIONAL_NONE) {
202 /* Fit this constant in by swapping the operands and
203 * flipping the test.
204 */
205 inst->src[0] = inst->src[1];
206 inst->src[1] = value;
207 inst->conditional_mod = new_cmod;
208 return true;
209 }
210 }
211 break;
212
213 case BRW_OPCODE_SEL:
214 if (arg == 1) {
215 inst->src[arg] = value;
216 return true;
217 } else if (arg == 0 && inst->src[1].file != IMM) {
218 inst->src[0] = inst->src[1];
219 inst->src[1] = value;
220
221 /* If this was predicated, flipping operands means
222 * we also need to flip the predicate.
223 */
224 if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
225 inst->predicate_inverse = !inst->predicate_inverse;
226 }
227 return true;
228 }
229 break;
230
231 default:
232 break;
233 }
234
235 return false;
236 }
237
238 static bool
239 try_copy_propagate(const struct brw_device_info *devinfo,
240 vec4_instruction *inst, int arg,
241 struct copy_entry *entry, int attributes_per_reg)
242 {
243 /* Build up the value we are propagating as if it were the source of a
244 * single MOV
245 */
246 /* For constant propagation, we only handle the same constant
247 * across all 4 channels. Some day, we should handle the 8-bit
248 * float vector format, which would let us constant propagate
249 * vectors better.
250 */
251 src_reg value = *entry->value[0];
252 for (int i = 1; i < 4; i++) {
253 /* This is equals() except we don't care about the swizzle. */
254 if (value.file != entry->value[i]->file ||
255 value.nr != entry->value[i]->nr ||
256 value.reg_offset != entry->value[i]->reg_offset ||
257 value.type != entry->value[i]->type ||
258 value.negate != entry->value[i]->negate ||
259 value.abs != entry->value[i]->abs) {
260 return false;
261 }
262 }
263
264 /* Compute the swizzle of the original register by swizzling the
265 * component loaded from each value according to the swizzle of
266 * operand we're going to change.
267 */
268 int s[4];
269 for (int i = 0; i < 4; i++) {
270 s[i] = BRW_GET_SWZ(entry->value[i]->swizzle, i);
271 }
272 value.swizzle = BRW_SWIZZLE4(s[0], s[1], s[2], s[3]);
273
274 /* Check that we can propagate that value */
275 if (value.file != UNIFORM &&
276 value.file != VGRF &&
277 value.file != ATTR)
278 return false;
279
280 if (devinfo->gen >= 8 && (value.negate || value.abs) &&
281 is_logic_op(inst->opcode)) {
282 return false;
283 }
284
285 bool has_source_modifiers = value.negate || value.abs;
286
287 /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on
288 * instructions.
289 */
290 if ((has_source_modifiers || value.file == UNIFORM ||
291 value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo))
292 return false;
293
294 if (has_source_modifiers &&
295 value.type != inst->src[arg].type &&
296 !inst->can_change_types())
297 return false;
298
299 if (has_source_modifiers &&
300 inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
301 return false;
302
303 unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
304 value.swizzle);
305 if (inst->is_3src() &&
306 (value.file == UNIFORM ||
307 (value.file == ATTR && attributes_per_reg != 1)) &&
308 !brw_is_single_value_swizzle(composed_swizzle))
309 return false;
310
311 if (inst->is_send_from_grf())
312 return false;
313
314 /* we can't generally copy-propagate UD negations becuse we
315 * end up accessing the resulting values as signed integers
316 * instead. See also resolve_ud_negate().
317 */
318 if (value.negate &&
319 value.type == BRW_REGISTER_TYPE_UD)
320 return false;
321
322 /* Don't report progress if this is a noop. */
323 if (value.equals(inst->src[arg]))
324 return false;
325
326 const unsigned dst_saturate_mask = inst->dst.writemask &
327 brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);
328
329 if (dst_saturate_mask) {
330 /* We either saturate all or nothing. */
331 if (dst_saturate_mask != inst->dst.writemask)
332 return false;
333
334 /* Limit saturate propagation only to SEL with src1 bounded within 0.0
335 * and 1.0, otherwise skip copy propagate altogether.
336 */
337 switch(inst->opcode) {
338 case BRW_OPCODE_SEL:
339 if (arg != 0 ||
340 inst->src[0].type != BRW_REGISTER_TYPE_F ||
341 inst->src[1].file != IMM ||
342 inst->src[1].type != BRW_REGISTER_TYPE_F ||
343 inst->src[1].f < 0.0 ||
344 inst->src[1].f > 1.0) {
345 return false;
346 }
347 if (!inst->saturate)
348 inst->saturate = true;
349 break;
350 default:
351 return false;
352 }
353 }
354
355 /* Build the final value */
356 if (inst->src[arg].abs) {
357 value.negate = false;
358 value.abs = true;
359 }
360 if (inst->src[arg].negate)
361 value.negate = !value.negate;
362
363 value.swizzle = composed_swizzle;
364 if (has_source_modifiers &&
365 value.type != inst->src[arg].type) {
366 assert(inst->can_change_types());
367 for (int i = 0; i < 3; i++) {
368 inst->src[i].type = value.type;
369 }
370 inst->dst.type = value.type;
371 } else {
372 value.type = inst->src[arg].type;
373 }
374
375 inst->src[arg] = value;
376 return true;
377 }
378
379 bool
380 vec4_visitor::opt_copy_propagation(bool do_constant_prop)
381 {
382 /* If we are in dual instanced or single mode, then attributes are going
383 * to be interleaved, so one register contains two attribute slots.
384 */
385 const int attributes_per_reg =
386 prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
387 bool progress = false;
388 struct copy_entry entries[alloc.total_size];
389
390 memset(&entries, 0, sizeof(entries));
391
392 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
393 /* This pass only works on basic blocks. If there's flow
394 * control, throw out all our information and start from
395 * scratch.
396 *
397 * This should really be fixed by using a structure like in
398 * src/glsl/opt_copy_propagation.cpp to track available copies.
399 */
400 if (!is_dominated_by_previous_instruction(inst)) {
401 memset(&entries, 0, sizeof(entries));
402 continue;
403 }
404
405 /* For each source arg, see if each component comes from a copy
406 * from the same type file (IMM, VGRF, UNIFORM), and try
407 * optimizing out access to the copy result
408 */
409 for (int i = 2; i >= 0; i--) {
410 /* Copied values end up in GRFs, and we don't track reladdr
411 * accesses.
412 */
413 if (inst->src[i].file != VGRF ||
414 inst->src[i].reladdr)
415 continue;
416
417 /* We only handle single-register copies. */
418 if (inst->regs_read(i) != 1)
419 continue;
420
421 int reg = (alloc.offsets[inst->src[i].nr] +
422 inst->src[i].reg_offset);
423
424 /* Find the regs that each swizzle component came from.
425 */
426 struct copy_entry entry;
427 memset(&entry, 0, sizeof(copy_entry));
428 int c;
429 for (c = 0; c < 4; c++) {
430 int channel = BRW_GET_SWZ(inst->src[i].swizzle, c);
431 entry.value[c] = entries[reg].value[channel];
432
433 /* If there's no available copy for this channel, bail.
434 * We could be more aggressive here -- some channels might
435 * not get used based on the destination writemask.
436 */
437 if (!entry.value[c])
438 break;
439
440 entry.saturatemask |=
441 (entries[reg].saturatemask & (1 << channel) ? 1 : 0) << c;
442
443 /* We'll only be able to copy propagate if the sources are
444 * all from the same file -- there's no ability to swizzle
445 * 0 or 1 constants in with source registers like in i915.
446 */
447 if (c > 0 && entry.value[c - 1]->file != entry.value[c]->file)
448 break;
449 }
450
451 if (c != 4)
452 continue;
453
454 if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry))
455 progress = true;
456
457 if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
458 progress = true;
459 }
460
461 /* Track available source registers. */
462 if (inst->dst.file == VGRF) {
463 const int reg =
464 alloc.offsets[inst->dst.nr] + inst->dst.reg_offset;
465
466 /* Update our destination's current channel values. For a direct copy,
467 * the value is the newly propagated source. Otherwise, we don't know
468 * the new value, so clear it.
469 */
470 bool direct_copy = is_direct_copy(inst);
471 entries[reg].saturatemask &= ~inst->dst.writemask;
472 for (int i = 0; i < 4; i++) {
473 if (inst->dst.writemask & (1 << i)) {
474 entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
475 entries[reg].saturatemask |=
476 inst->saturate && direct_copy ? 1 << i : 0;
477 }
478 }
479
480 /* Clear the records for any registers whose current value came from
481 * our destination's updated channels, as the two are no longer equal.
482 */
483 if (inst->dst.reladdr)
484 memset(&entries, 0, sizeof(entries));
485 else {
486 for (unsigned i = 0; i < alloc.total_size; i++) {
487 for (int j = 0; j < 4; j++) {
488 if (is_channel_updated(inst, entries[i].value, j)) {
489 entries[i].value[j] = NULL;
490 entries[i].saturatemask &= ~(1 << j);
491 }
492 }
493 }
494 }
495 }
496 }
497
498 if (progress)
499 invalidate_live_intervals();
500
501 return progress;
502 }
503
504 } /* namespace brw */