intel/fs: Add SLM size to brw_cs_prog_data
[mesa.git] / src / intel / compiler / brw_fs_combine_constants.cpp
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_combine_constants.cpp
25 *
26 * This file contains the opt_combine_constants() pass that runs after the
27 * regular optimization loop. It passes over the instruction list and
28 * selectively promotes immediate values to registers by emitting a mov(1)
29 * instruction.
30 *
31 * This is useful on Gen 7 particularly, because a few instructions can be
32 * coissued (i.e., issued in the same cycle as another thread on the same EU
33 * issues an instruction) under some circumstances, one of which is that they
34 * cannot use immediate values.
35 */
36
37 #include "brw_fs.h"
38 #include "brw_cfg.h"
39 #include "util/half_float.h"
40
41 using namespace brw;
42
/* Compile-time switch: when set to true, opt_combine_constants() prints one
 * line describing every promoted constant to stdout once the pass is done.
 */
43 static const bool debug = false;
44
45 /* Returns whether an instruction could co-issue if its immediate source were
46 * replaced with a GRF source.
47 */
48 static bool
49 could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst)
50 {
51 if (devinfo->gen != 7)
52 return false;
53
54 switch (inst->opcode) {
55 case BRW_OPCODE_MOV:
56 case BRW_OPCODE_CMP:
57 case BRW_OPCODE_ADD:
58 case BRW_OPCODE_MUL:
59 return true;
60 default:
61 return false;
62 }
63 }
64
65 /**
66 * Returns true for instructions that don't support immediate sources.
67 */
68 static bool
69 must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
70 {
71 switch (inst->opcode) {
72 case SHADER_OPCODE_POW:
73 return devinfo->gen < 8;
74 case BRW_OPCODE_MAD:
75 case BRW_OPCODE_LRP:
76 return true;
77 default:
78 return false;
79 }
80 }
81
82 /** A box for putting fs_regs in a linked list. */
83 struct reg_link {
84 DECLARE_RALLOC_CXX_OPERATORS(reg_link)
85
86 reg_link(fs_reg *reg) : reg(reg) {}
87
88 struct exec_node link;
89 fs_reg *reg;
90 };
91
92 static struct exec_node *
93 link(void *mem_ctx, fs_reg *reg)
94 {
95 reg_link *l = new(mem_ctx) reg_link(reg);
96 return &l->link;
97 }
98
99 /**
100 * Information about an immediate value.
101 */
102 struct imm {
103 /** The common ancestor of all blocks using this immediate value. */
104 bblock_t *block;
105
106 /**
107 * The instruction generating the immediate value, if all uses are contained
108 * within a single basic block. Otherwise, NULL.
109 */
110 fs_inst *inst;
111
112 /**
113 * A list of fs_regs that refer to this immediate. If we promote it, we'll
114 * have to patch these up to refer to the new GRF.
115 */
116 exec_list *uses;
117
118 /** The immediate value */
119 union {
120 char bytes[8];
121 double df;
122 int64_t d64;
123 float f;
124 int32_t d;
125 int16_t w;
126 };
127 uint8_t size;
128
129 /** When promoting half-float we need to account for certain restrictions */
130 bool is_half_float;
131
132 /**
133 * The GRF register and subregister number where we've decided to store the
134 * constant value.
135 */
136 uint8_t subreg_offset;
137 uint16_t nr;
138
139 /** The number of coissuable instructions using this immediate. */
140 uint16_t uses_by_coissue;
141
142 /**
143 * Whether this constant is used by an instruction that can't handle an
144 * immediate source (and already has to be promoted to a GRF).
145 */
146 bool must_promote;
147
148 uint16_t first_use_ip;
149 uint16_t last_use_ip;
150 };
151
/**
 * Growable array holding the immediates collected so far.
 */
struct table {
   /** Backing storage for the entries. */
   struct imm *imm;

   /** Number of entries the storage has room for. */
   int size;

   /** Number of entries currently in use. */
   int len;
};
158
159 static struct imm *
160 find_imm(struct table *table, void *data, uint8_t size)
161 {
162 for (int i = 0; i < table->len; i++) {
163 if (table->imm[i].size == size &&
164 !memcmp(table->imm[i].bytes, data, size)) {
165 return &table->imm[i];
166 }
167 }
168 return NULL;
169 }
170
171 static struct imm *
172 new_imm(struct table *table, void *mem_ctx)
173 {
174 if (table->len == table->size) {
175 table->size *= 2;
176 table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
177 }
178 return &table->imm[table->len++];
179 }
180
181 /**
182 * Comparator used for sorting an array of imm structures.
183 *
184 * We sort by basic block number, then last use IP, then first use IP (least
185 * to greatest). This sorting causes immediates live in the same area to be
186 * allocated to the same register in the hopes that all values will be dead
187 * about the same time and the register can be reused.
188 */
189 static int
190 compare(const void *_a, const void *_b)
191 {
192 const struct imm *a = (const struct imm *)_a,
193 *b = (const struct imm *)_b;
194
195 int block_diff = a->block->num - b->block->num;
196 if (block_diff)
197 return block_diff;
198
199 int end_diff = a->last_use_ip - b->last_use_ip;
200 if (end_diff)
201 return end_diff;
202
203 return a->first_use_ip - b->first_use_ip;
204 }
205
206 static bool
207 get_constant_value(const struct gen_device_info *devinfo,
208 const fs_inst *inst, uint32_t src_idx,
209 void *out, brw_reg_type *out_type)
210 {
211 const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
212 const fs_reg *src = &inst->src[src_idx];
213
214 *out_type = src->type;
215
216 switch (*out_type) {
217 case BRW_REGISTER_TYPE_DF: {
218 double val = !can_do_source_mods ? src->df : fabs(src->df);
219 memcpy(out, &val, 8);
220 break;
221 }
222 case BRW_REGISTER_TYPE_F: {
223 float val = !can_do_source_mods ? src->f : fabsf(src->f);
224 memcpy(out, &val, 4);
225 break;
226 }
227 case BRW_REGISTER_TYPE_HF: {
228 uint16_t val = src->d & 0xffffu;
229 if (can_do_source_mods)
230 val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
231 memcpy(out, &val, 2);
232 break;
233 }
234 case BRW_REGISTER_TYPE_Q: {
235 int64_t val = !can_do_source_mods ? src->d64 : abs(src->d64);
236 memcpy(out, &val, 8);
237 break;
238 }
239 case BRW_REGISTER_TYPE_UQ:
240 memcpy(out, &src->u64, 8);
241 break;
242 case BRW_REGISTER_TYPE_D: {
243 int32_t val = !can_do_source_mods ? src->d : abs(src->d);
244 memcpy(out, &val, 4);
245 break;
246 }
247 case BRW_REGISTER_TYPE_UD:
248 memcpy(out, &src->ud, 4);
249 break;
250 case BRW_REGISTER_TYPE_W: {
251 int16_t val = src->d & 0xffffu;
252 if (can_do_source_mods)
253 val = abs(val);
254 memcpy(out, &val, 2);
255 break;
256 }
257 case BRW_REGISTER_TYPE_UW:
258 memcpy(out, &src->ud, 2);
259 break;
260 default:
261 return false;
262 };
263
264 return true;
265 }
266
267 static struct brw_reg
268 build_imm_reg_for_copy(struct imm *imm)
269 {
270 switch (imm->size) {
271 case 8:
272 return brw_imm_d(imm->d64);
273 case 4:
274 return brw_imm_d(imm->d);
275 case 2:
276 return brw_imm_w(imm->w);
277 default:
278 unreachable("not implemented");
279 }
280 }
281
282 static inline uint32_t
283 get_alignment_for_imm(const struct imm *imm)
284 {
285 if (imm->is_half_float)
286 return 4; /* At least MAD seems to require this */
287 else
288 return imm->size;
289 }
290
291 static bool
292 needs_negate(const fs_reg *reg, const struct imm *imm)
293 {
294 switch (reg->type) {
295 case BRW_REGISTER_TYPE_DF:
296 return signbit(reg->df) != signbit(imm->df);
297 case BRW_REGISTER_TYPE_F:
298 return signbit(reg->f) != signbit(imm->f);
299 case BRW_REGISTER_TYPE_Q:
300 return (reg->d64 < 0) != (imm->d64 < 0);
301 case BRW_REGISTER_TYPE_D:
302 return (reg->d < 0) != (imm->d < 0);
303 case BRW_REGISTER_TYPE_HF:
304 return (reg->d & 0x8000u) != (imm->w & 0x8000u);
305 case BRW_REGISTER_TYPE_W:
306 return ((int16_t)reg->d < 0) != (imm->w < 0);
307 case BRW_REGISTER_TYPE_UQ:
308 case BRW_REGISTER_TYPE_UD:
309 case BRW_REGISTER_TYPE_UW:
310 return false;
311 default:
312 unreachable("not implemented");
313 };
314 }
315
316 bool
317 fs_visitor::opt_combine_constants()
318 {
319 void *const_ctx = ralloc_context(NULL);
320
321 struct table table;
322 table.size = 8;
323 table.len = 0;
324 table.imm = ralloc_array(const_ctx, struct imm, table.size);
325
326 cfg->calculate_idom();
327 unsigned ip = -1;
328
329 /* Make a pass through all instructions and count the number of times each
330 * constant is used by coissueable instructions or instructions that cannot
331 * take immediate arguments.
332 */
333 foreach_block_and_inst(block, fs_inst, inst, cfg) {
334 ip++;
335
336 if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
337 continue;
338
339 for (int i = 0; i < inst->sources; i++) {
340 if (inst->src[i].file != IMM)
341 continue;
342
343 char data[8];
344 brw_reg_type type;
345 if (!get_constant_value(devinfo, inst, i, data, &type))
346 continue;
347
348 uint8_t size = type_sz(type);
349
350 struct imm *imm = find_imm(&table, data, size);
351
352 if (imm) {
353 bblock_t *intersection = cfg_t::intersect(block, imm->block);
354 if (intersection != imm->block)
355 imm->inst = NULL;
356 imm->block = intersection;
357 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
358 imm->uses_by_coissue += could_coissue(devinfo, inst);
359 imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);
360 imm->last_use_ip = ip;
361 if (type == BRW_REGISTER_TYPE_HF)
362 imm->is_half_float = true;
363 } else {
364 imm = new_imm(&table, const_ctx);
365 imm->block = block;
366 imm->inst = inst;
367 imm->uses = new(const_ctx) exec_list();
368 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
369 memcpy(imm->bytes, data, size);
370 imm->size = size;
371 imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
372 imm->uses_by_coissue = could_coissue(devinfo, inst);
373 imm->must_promote = must_promote_imm(devinfo, inst);
374 imm->first_use_ip = ip;
375 imm->last_use_ip = ip;
376 }
377 }
378 }
379
380 /* Remove constants from the table that don't have enough uses to make them
381 * profitable to store in a register.
382 */
383 for (int i = 0; i < table.len;) {
384 struct imm *imm = &table.imm[i];
385
386 if (!imm->must_promote && imm->uses_by_coissue < 4) {
387 table.imm[i] = table.imm[table.len - 1];
388 table.len--;
389 continue;
390 }
391 i++;
392 }
393 if (table.len == 0) {
394 ralloc_free(const_ctx);
395 return false;
396 }
397 if (cfg->num_blocks != 1)
398 qsort(table.imm, table.len, sizeof(struct imm), compare);
399
400 /* Insert MOVs to load the constant values into GRFs. */
401 fs_reg reg(VGRF, alloc.allocate(1));
402 reg.stride = 0;
403 for (int i = 0; i < table.len; i++) {
404 struct imm *imm = &table.imm[i];
405 /* Insert it either before the instruction that generated the immediate
406 * or after the last non-control flow instruction of the common ancestor.
407 */
408 exec_node *n = (imm->inst ? imm->inst :
409 imm->block->last_non_control_flow_inst()->next);
410
411 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
412 *
413 * "In Align16 mode, the channel selects and channel enables apply to a
414 * pair of half-floats, because these parameters are defined for DWord
415 * elements ONLY. This is applicable when both source and destination
416 * are half-floats."
417 *
418 * This means that Align16 instructions that use promoted HF immediates
419 * and use a <0,1,0>:HF region would read 2 HF slots instead of
420 * replicating the single one we want. To avoid this, we always populate
421 * both HF slots within a DWord with the constant.
422 */
423 const uint32_t width = devinfo->gen == 8 && imm->is_half_float ? 2 : 1;
424 const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);
425
426 /* Put the immediate in an offset aligned to its size. Some instructions
427 * seem to have additional alignment requirements, so account for that
428 * too.
429 */
430 reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
431
432 /* Ensure we have enough space in the register to copy the immediate */
433 struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
434 if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) {
435 reg.nr = alloc.allocate(1);
436 reg.offset = 0;
437 }
438
439 ibld.MOV(retype(reg, imm_reg.type), imm_reg);
440 imm->nr = reg.nr;
441 imm->subreg_offset = reg.offset;
442
443 reg.offset += imm->size * width;
444 }
445 shader_stats.promoted_constants = table.len;
446
447 /* Rewrite the immediate sources to refer to the new GRFs. */
448 for (int i = 0; i < table.len; i++) {
449 foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
450 fs_reg *reg = link->reg;
451 #ifdef DEBUG
452 switch (reg->type) {
453 case BRW_REGISTER_TYPE_DF:
454 assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
455 (fabs(reg->df) == fabs(table.imm[i].df)));
456 break;
457 case BRW_REGISTER_TYPE_F:
458 assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
459 (fabsf(reg->f) == fabsf(table.imm[i].f)));
460 break;
461 case BRW_REGISTER_TYPE_HF:
462 assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
463 isnan(_mesa_half_to_float(table.imm[i].w))) ||
464 (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
465 fabsf(_mesa_half_to_float(table.imm[i].w))));
466 break;
467 case BRW_REGISTER_TYPE_Q:
468 assert(abs(reg->d64) == abs(table.imm[i].d64));
469 break;
470 case BRW_REGISTER_TYPE_UQ:
471 assert(reg->d64 == table.imm[i].d64);
472 break;
473 case BRW_REGISTER_TYPE_D:
474 assert(abs(reg->d) == abs(table.imm[i].d));
475 break;
476 case BRW_REGISTER_TYPE_UD:
477 assert(reg->d == table.imm[i].d);
478 break;
479 case BRW_REGISTER_TYPE_W:
480 assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
481 break;
482 case BRW_REGISTER_TYPE_UW:
483 assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
484 break;
485 default:
486 break;
487 }
488 #endif
489
490 reg->file = VGRF;
491 reg->offset = table.imm[i].subreg_offset;
492 reg->stride = 0;
493 reg->negate = needs_negate(reg, &table.imm[i]);
494 reg->nr = table.imm[i].nr;
495 }
496 }
497
498 if (debug) {
499 for (int i = 0; i < table.len; i++) {
500 struct imm *imm = &table.imm[i];
501
502 printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
503 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
504 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
505 imm->block->num,
506 imm->nr,
507 imm->subreg_offset,
508 imm->must_promote,
509 imm->uses_by_coissue,
510 imm->first_use_ip,
511 imm->last_use_ip,
512 imm->last_use_ip - imm->first_use_ip);
513 }
514 }
515
516 ralloc_free(const_ctx);
517 invalidate_live_intervals();
518
519 return true;
520 }