anv: Move the physical device dispatch table to anv_instance
[mesa.git] / src / intel / compiler / brw_fs_combine_constants.cpp
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_combine_constants.cpp
25 *
26 * This file contains the opt_combine_constants() pass that runs after the
27 * regular optimization loop. It passes over the instruction list and
28 * selectively promotes immediate values to registers by emitting a mov(1)
29 * instruction.
30 *
31 * This is useful on Gen 7 particularly, because a few instructions can be
32 * coissued (i.e., issued in the same cycle as another thread on the same EU
33 * issues an instruction) under some circumstances, one of which is that they
34 * cannot use immediate values.
35 */
36
37 #include "brw_fs.h"
38 #include "brw_cfg.h"
39 #include "util/half_float.h"
40
41 using namespace brw;
42
43 static const bool debug = false;
44
45 /* Returns whether an instruction could co-issue if its immediate source were
46 * replaced with a GRF source.
47 */
48 static bool
49 could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst)
50 {
51 if (devinfo->gen != 7)
52 return false;
53
54 switch (inst->opcode) {
55 case BRW_OPCODE_MOV:
56 case BRW_OPCODE_CMP:
57 case BRW_OPCODE_ADD:
58 case BRW_OPCODE_MUL:
59 return true;
60 default:
61 return false;
62 }
63 }
64
65 /**
66 * Returns true for instructions that don't support immediate sources.
67 */
68 static bool
69 must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
70 {
71 switch (inst->opcode) {
72 case SHADER_OPCODE_POW:
73 return devinfo->gen < 8;
74 case BRW_OPCODE_MAD:
75 case BRW_OPCODE_LRP:
76 return true;
77 default:
78 return false;
79 }
80 }
81
82 /** A box for putting fs_regs in a linked list. */
83 struct reg_link {
84 DECLARE_RALLOC_CXX_OPERATORS(reg_link)
85
86 reg_link(fs_reg *reg) : reg(reg) {}
87
88 struct exec_node link;
89 fs_reg *reg;
90 };
91
92 static struct exec_node *
93 link(void *mem_ctx, fs_reg *reg)
94 {
95 reg_link *l = new(mem_ctx) reg_link(reg);
96 return &l->link;
97 }
98
99 /**
100 * Information about an immediate value.
101 */
102 struct imm {
103 /** The common ancestor of all blocks using this immediate value. */
104 bblock_t *block;
105
106 /**
107 * The instruction generating the immediate value, if all uses are contained
108 * within a single basic block. Otherwise, NULL.
109 */
110 fs_inst *inst;
111
112 /**
113 * A list of fs_regs that refer to this immediate. If we promote it, we'll
114 * have to patch these up to refer to the new GRF.
115 */
116 exec_list *uses;
117
118 /** The immediate value */
119 union {
120 char bytes[8];
121 double df;
122 int64_t d64;
123 float f;
124 int32_t d;
125 int16_t w;
126 };
127 uint8_t size;
128
129 /** When promoting half-float we need to account for certain restrictions */
130 bool is_half_float;
131
132 /**
133 * The GRF register and subregister number where we've decided to store the
134 * constant value.
135 */
136 uint8_t subreg_offset;
137 uint16_t nr;
138
139 /** The number of coissuable instructions using this immediate. */
140 uint16_t uses_by_coissue;
141
142 /**
143 * Whether this constant is used by an instruction that can't handle an
144 * immediate source (and already has to be promoted to a GRF).
145 */
146 bool must_promote;
147
148 uint16_t first_use_ip;
149 uint16_t last_use_ip;
150 };
151
/**
 * Growable array of imm entries the pass works on.
 */
struct table {
   /** Backing storage for the entries. */
   struct imm *imm;

   /** Number of entries the storage can currently hold. */
   int size;

   /** Number of entries currently in use. */
   int len;
};
158
159 static struct imm *
160 find_imm(struct table *table, void *data, uint8_t size)
161 {
162 for (int i = 0; i < table->len; i++) {
163 if (table->imm[i].size == size &&
164 !memcmp(table->imm[i].bytes, data, size)) {
165 return &table->imm[i];
166 }
167 }
168 return NULL;
169 }
170
171 static struct imm *
172 new_imm(struct table *table, void *mem_ctx)
173 {
174 if (table->len == table->size) {
175 table->size *= 2;
176 table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
177 }
178 return &table->imm[table->len++];
179 }
180
181 /**
182 * Comparator used for sorting an array of imm structures.
183 *
184 * We sort by basic block number, then last use IP, then first use IP (least
185 * to greatest). This sorting causes immediates live in the same area to be
186 * allocated to the same register in the hopes that all values will be dead
187 * about the same time and the register can be reused.
188 */
189 static int
190 compare(const void *_a, const void *_b)
191 {
192 const struct imm *a = (const struct imm *)_a,
193 *b = (const struct imm *)_b;
194
195 int block_diff = a->block->num - b->block->num;
196 if (block_diff)
197 return block_diff;
198
199 int end_diff = a->last_use_ip - b->last_use_ip;
200 if (end_diff)
201 return end_diff;
202
203 return a->first_use_ip - b->first_use_ip;
204 }
205
206 static bool
207 get_constant_value(const struct gen_device_info *devinfo,
208 const fs_inst *inst, uint32_t src_idx,
209 void *out, brw_reg_type *out_type)
210 {
211 const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
212 const fs_reg *src = &inst->src[src_idx];
213
214 *out_type = src->type;
215
216 switch (*out_type) {
217 case BRW_REGISTER_TYPE_DF: {
218 double val = !can_do_source_mods ? src->df : fabs(src->df);
219 memcpy(out, &val, 8);
220 break;
221 }
222 case BRW_REGISTER_TYPE_F: {
223 float val = !can_do_source_mods ? src->f : fabsf(src->f);
224 memcpy(out, &val, 4);
225 break;
226 }
227 case BRW_REGISTER_TYPE_HF: {
228 uint16_t val = src->d & 0xffffu;
229 if (can_do_source_mods)
230 val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
231 memcpy(out, &val, 2);
232 break;
233 }
234 case BRW_REGISTER_TYPE_Q: {
235 int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64);
236 memcpy(out, &val, 8);
237 break;
238 }
239 case BRW_REGISTER_TYPE_UQ:
240 memcpy(out, &src->u64, 8);
241 break;
242 case BRW_REGISTER_TYPE_D: {
243 int32_t val = !can_do_source_mods ? src->d : abs(src->d);
244 memcpy(out, &val, 4);
245 break;
246 }
247 case BRW_REGISTER_TYPE_UD:
248 memcpy(out, &src->ud, 4);
249 break;
250 case BRW_REGISTER_TYPE_W: {
251 int16_t val = src->d & 0xffffu;
252 if (can_do_source_mods)
253 val = abs(val);
254 memcpy(out, &val, 2);
255 break;
256 }
257 case BRW_REGISTER_TYPE_UW:
258 memcpy(out, &src->ud, 2);
259 break;
260 default:
261 return false;
262 };
263
264 return true;
265 }
266
267 static struct brw_reg
268 build_imm_reg_for_copy(struct imm *imm)
269 {
270 switch (imm->size) {
271 case 8:
272 return brw_imm_d(imm->d64);
273 case 4:
274 return brw_imm_d(imm->d);
275 case 2:
276 return brw_imm_w(imm->w);
277 default:
278 unreachable("not implemented");
279 }
280 }
281
282 static inline uint32_t
283 get_alignment_for_imm(const struct imm *imm)
284 {
285 if (imm->is_half_float)
286 return 4; /* At least MAD seems to require this */
287 else
288 return imm->size;
289 }
290
291 static bool
292 needs_negate(const fs_reg *reg, const struct imm *imm)
293 {
294 switch (reg->type) {
295 case BRW_REGISTER_TYPE_DF:
296 return signbit(reg->df) != signbit(imm->df);
297 case BRW_REGISTER_TYPE_F:
298 return signbit(reg->f) != signbit(imm->f);
299 case BRW_REGISTER_TYPE_Q:
300 return (reg->d64 < 0) != (imm->d64 < 0);
301 case BRW_REGISTER_TYPE_D:
302 return (reg->d < 0) != (imm->d < 0);
303 case BRW_REGISTER_TYPE_HF:
304 return (reg->d & 0x8000u) != (imm->w & 0x8000u);
305 case BRW_REGISTER_TYPE_W:
306 return ((int16_t)reg->d < 0) != (imm->w < 0);
307 case BRW_REGISTER_TYPE_UQ:
308 case BRW_REGISTER_TYPE_UD:
309 case BRW_REGISTER_TYPE_UW:
310 return false;
311 default:
312 unreachable("not implemented");
313 };
314 }
315
316 static bool
317 representable_as_hf(float f, uint16_t *hf)
318 {
319 union fi u;
320 uint16_t h = _mesa_float_to_half(f);
321 u.f = _mesa_half_to_float(h);
322
323 if (u.f == f) {
324 *hf = h;
325 return true;
326 }
327
328 return false;
329 }
330
331 static bool
332 represent_src_as_imm(const struct gen_device_info *devinfo,
333 fs_reg *src)
334 {
335 /* TODO : consider specific platforms also */
336 if (devinfo->gen == 12) {
337 uint16_t hf;
338 if (representable_as_hf(src->f, &hf)) {
339 *src = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF);
340 return true;
341 }
342 }
343 return false;
344 }
345
346 bool
347 fs_visitor::opt_combine_constants()
348 {
349 void *const_ctx = ralloc_context(NULL);
350
351 struct table table;
352 table.size = 8;
353 table.len = 0;
354 table.imm = ralloc_array(const_ctx, struct imm, table.size);
355
356 cfg->calculate_idom();
357 unsigned ip = -1;
358
359 /* Make a pass through all instructions and count the number of times each
360 * constant is used by coissueable instructions or instructions that cannot
361 * take immediate arguments.
362 */
363 foreach_block_and_inst(block, fs_inst, inst, cfg) {
364 ip++;
365
366 if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
367 continue;
368
369 bool represented_as_imm = false;
370 for (int i = 0; i < inst->sources; i++) {
371 if (inst->src[i].file != IMM)
372 continue;
373
374 if (!represented_as_imm && i == 0 &&
375 inst->opcode == BRW_OPCODE_MAD &&
376 represent_src_as_imm(devinfo, &inst->src[i])) {
377 represented_as_imm = true;
378 continue;
379 }
380
381 char data[8];
382 brw_reg_type type;
383 if (!get_constant_value(devinfo, inst, i, data, &type))
384 continue;
385
386 uint8_t size = type_sz(type);
387
388 struct imm *imm = find_imm(&table, data, size);
389
390 if (imm) {
391 bblock_t *intersection = cfg_t::intersect(block, imm->block);
392 if (intersection != imm->block)
393 imm->inst = NULL;
394 imm->block = intersection;
395 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
396 imm->uses_by_coissue += could_coissue(devinfo, inst);
397 imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);
398 imm->last_use_ip = ip;
399 if (type == BRW_REGISTER_TYPE_HF)
400 imm->is_half_float = true;
401 } else {
402 imm = new_imm(&table, const_ctx);
403 imm->block = block;
404 imm->inst = inst;
405 imm->uses = new(const_ctx) exec_list();
406 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
407 memcpy(imm->bytes, data, size);
408 imm->size = size;
409 imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
410 imm->uses_by_coissue = could_coissue(devinfo, inst);
411 imm->must_promote = must_promote_imm(devinfo, inst);
412 imm->first_use_ip = ip;
413 imm->last_use_ip = ip;
414 }
415 }
416 }
417
418 /* Remove constants from the table that don't have enough uses to make them
419 * profitable to store in a register.
420 */
421 for (int i = 0; i < table.len;) {
422 struct imm *imm = &table.imm[i];
423
424 if (!imm->must_promote && imm->uses_by_coissue < 4) {
425 table.imm[i] = table.imm[table.len - 1];
426 table.len--;
427 continue;
428 }
429 i++;
430 }
431 if (table.len == 0) {
432 ralloc_free(const_ctx);
433 return false;
434 }
435 if (cfg->num_blocks != 1)
436 qsort(table.imm, table.len, sizeof(struct imm), compare);
437
438 /* Insert MOVs to load the constant values into GRFs. */
439 fs_reg reg(VGRF, alloc.allocate(1));
440 reg.stride = 0;
441 for (int i = 0; i < table.len; i++) {
442 struct imm *imm = &table.imm[i];
443 /* Insert it either before the instruction that generated the immediate
444 * or after the last non-control flow instruction of the common ancestor.
445 */
446 exec_node *n = (imm->inst ? imm->inst :
447 imm->block->last_non_control_flow_inst()->next);
448
449 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
450 *
451 * "In Align16 mode, the channel selects and channel enables apply to a
452 * pair of half-floats, because these parameters are defined for DWord
453 * elements ONLY. This is applicable when both source and destination
454 * are half-floats."
455 *
456 * This means that Align16 instructions that use promoted HF immediates
457 * and use a <0,1,0>:HF region would read 2 HF slots instead of
458 * replicating the single one we want. To avoid this, we always populate
459 * both HF slots within a DWord with the constant.
460 */
461 const uint32_t width = devinfo->gen == 8 && imm->is_half_float ? 2 : 1;
462 const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);
463
464 /* Put the immediate in an offset aligned to its size. Some instructions
465 * seem to have additional alignment requirements, so account for that
466 * too.
467 */
468 reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
469
470 /* Ensure we have enough space in the register to copy the immediate */
471 struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
472 if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) {
473 reg.nr = alloc.allocate(1);
474 reg.offset = 0;
475 }
476
477 ibld.MOV(retype(reg, imm_reg.type), imm_reg);
478 imm->nr = reg.nr;
479 imm->subreg_offset = reg.offset;
480
481 reg.offset += imm->size * width;
482 }
483 shader_stats.promoted_constants = table.len;
484
485 /* Rewrite the immediate sources to refer to the new GRFs. */
486 for (int i = 0; i < table.len; i++) {
487 foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
488 fs_reg *reg = link->reg;
489 #ifdef DEBUG
490 switch (reg->type) {
491 case BRW_REGISTER_TYPE_DF:
492 assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
493 (fabs(reg->df) == fabs(table.imm[i].df)));
494 break;
495 case BRW_REGISTER_TYPE_F:
496 assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
497 (fabsf(reg->f) == fabsf(table.imm[i].f)));
498 break;
499 case BRW_REGISTER_TYPE_HF:
500 assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
501 isnan(_mesa_half_to_float(table.imm[i].w))) ||
502 (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
503 fabsf(_mesa_half_to_float(table.imm[i].w))));
504 break;
505 case BRW_REGISTER_TYPE_Q:
506 assert(abs(reg->d64) == abs(table.imm[i].d64));
507 break;
508 case BRW_REGISTER_TYPE_UQ:
509 assert(reg->d64 == table.imm[i].d64);
510 break;
511 case BRW_REGISTER_TYPE_D:
512 assert(abs(reg->d) == abs(table.imm[i].d));
513 break;
514 case BRW_REGISTER_TYPE_UD:
515 assert(reg->d == table.imm[i].d);
516 break;
517 case BRW_REGISTER_TYPE_W:
518 assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
519 break;
520 case BRW_REGISTER_TYPE_UW:
521 assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
522 break;
523 default:
524 break;
525 }
526 #endif
527
528 reg->file = VGRF;
529 reg->offset = table.imm[i].subreg_offset;
530 reg->stride = 0;
531 reg->negate = needs_negate(reg, &table.imm[i]);
532 reg->nr = table.imm[i].nr;
533 }
534 }
535
536 if (debug) {
537 for (int i = 0; i < table.len; i++) {
538 struct imm *imm = &table.imm[i];
539
540 printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
541 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
542 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
543 imm->block->num,
544 imm->nr,
545 imm->subreg_offset,
546 imm->must_promote,
547 imm->uses_by_coissue,
548 imm->first_use_ip,
549 imm->last_use_ip,
550 imm->last_use_ip - imm->first_use_ip);
551 }
552 }
553
554 ralloc_free(const_ctx);
555 invalidate_live_intervals();
556
557 return true;
558 }