/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/** @file brw_fs_combine_constants.cpp
 *
 * This file contains the opt_combine_constants() pass that runs after the
 * regular optimization loop. It passes over the instruction list and
 * selectively promotes immediate values to registers by emitting a mov(1)
 * instruction.
 *
 * This is useful on Gen 7 particularly, because a few instructions can be
 * coissued (i.e., issued in the same cycle as another thread on the same EU
 * issues an instruction) under some circumstances, one of which is that they
 * cannot use immediate values.
 */
39 #include "util/half_float.h"
/* Compile-time switch for this pass's diagnostic output; off by default. */
static const bool debug = false;
45 /* Returns whether an instruction could co-issue if its immediate source were
46 * replaced with a GRF source.
49 could_coissue(const struct gen_device_info
*devinfo
, const fs_inst
*inst
)
51 if (devinfo
->gen
!= 7)
54 switch (inst
->opcode
) {
59 /* Only float instructions can coissue. We don't have a great
60 * understanding of whether or not something like float(int(a) + int(b))
61 * would be considered float (based on the destination type) or integer
62 * (based on the source types), so we take the conservative choice of
63 * only promoting when both destination and source are float.
65 return inst
->dst
.type
== BRW_REGISTER_TYPE_F
&&
66 inst
->src
[0].type
== BRW_REGISTER_TYPE_F
;
73 * Returns true for instructions that don't support immediate sources.
76 must_promote_imm(const struct gen_device_info
*devinfo
, const fs_inst
*inst
)
78 switch (inst
->opcode
) {
79 case SHADER_OPCODE_POW
:
80 return devinfo
->gen
< 8;
89 /** A box for putting fs_regs in a linked list. */
91 DECLARE_RALLOC_CXX_OPERATORS(reg_link
)
93 reg_link(fs_reg
*reg
) : reg(reg
) {}
95 struct exec_node link
;
99 static struct exec_node
*
100 link(void *mem_ctx
, fs_reg
*reg
)
102 reg_link
*l
= new(mem_ctx
) reg_link(reg
);
107 * Information about an immediate value.
110 /** The common ancestor of all blocks using this immediate value. */
114 * The instruction generating the immediate value, if all uses are contained
115 * within a single basic block. Otherwise, NULL.
120 * A list of fs_regs that refer to this immediate. If we promote it, we'll
121 * have to patch these up to refer to the new GRF.
125 /** The immediate value */
136 /** When promoting half-float we need to account for certain restrictions */
140 * The GRF register and subregister number where we've decided to store the
143 uint8_t subreg_offset
;
146 /** The number of coissuable instructions using this immediate. */
147 uint16_t uses_by_coissue
;
150 * Whether this constant is used by an instruction that can't handle an
151 * immediate source (and already has to be promoted to a GRF).
155 uint16_t first_use_ip
;
156 uint16_t last_use_ip
;
/** The working set of information about immediates. */
struct table {
   /** Growable array of entries; the first `len` are in use. */
   struct imm *imm;

   /** Number of entries the array currently has room for. */
   int size;

   /** Number of entries currently in use. */
   int len;
};
167 find_imm(struct table
*table
, void *data
, uint8_t size
)
169 for (int i
= 0; i
< table
->len
; i
++) {
170 if (table
->imm
[i
].size
== size
&&
171 !memcmp(table
->imm
[i
].bytes
, data
, size
)) {
172 return &table
->imm
[i
];
179 new_imm(struct table
*table
, void *mem_ctx
)
181 if (table
->len
== table
->size
) {
183 table
->imm
= reralloc(mem_ctx
, table
->imm
, struct imm
, table
->size
);
185 return &table
->imm
[table
->len
++];
189 * Comparator used for sorting an array of imm structures.
191 * We sort by basic block number, then last use IP, then first use IP (least
192 * to greatest). This sorting causes immediates live in the same area to be
193 * allocated to the same register in the hopes that all values will be dead
194 * about the same time and the register can be reused.
197 compare(const void *_a
, const void *_b
)
199 const struct imm
*a
= (const struct imm
*)_a
,
200 *b
= (const struct imm
*)_b
;
202 int block_diff
= a
->block
->num
- b
->block
->num
;
206 int end_diff
= a
->last_use_ip
- b
->last_use_ip
;
210 return a
->first_use_ip
- b
->first_use_ip
;
214 get_constant_value(const struct gen_device_info
*devinfo
,
215 const fs_inst
*inst
, uint32_t src_idx
,
216 void *out
, brw_reg_type
*out_type
)
218 const bool can_do_source_mods
= inst
->can_do_source_mods(devinfo
);
219 const fs_reg
*src
= &inst
->src
[src_idx
];
221 *out_type
= src
->type
;
224 case BRW_REGISTER_TYPE_DF
: {
225 double val
= !can_do_source_mods
? src
->df
: fabs(src
->df
);
226 memcpy(out
, &val
, 8);
229 case BRW_REGISTER_TYPE_F
: {
230 float val
= !can_do_source_mods
? src
->f
: fabsf(src
->f
);
231 memcpy(out
, &val
, 4);
234 case BRW_REGISTER_TYPE_HF
: {
235 uint16_t val
= src
->d
& 0xffffu
;
236 if (can_do_source_mods
)
237 val
= _mesa_float_to_half(fabsf(_mesa_half_to_float(val
)));
238 memcpy(out
, &val
, 2);
241 case BRW_REGISTER_TYPE_Q
: {
242 int64_t val
= !can_do_source_mods
? src
->d64
: llabs(src
->d64
);
243 memcpy(out
, &val
, 8);
246 case BRW_REGISTER_TYPE_UQ
:
247 memcpy(out
, &src
->u64
, 8);
249 case BRW_REGISTER_TYPE_D
: {
250 int32_t val
= !can_do_source_mods
? src
->d
: abs(src
->d
);
251 memcpy(out
, &val
, 4);
254 case BRW_REGISTER_TYPE_UD
:
255 memcpy(out
, &src
->ud
, 4);
257 case BRW_REGISTER_TYPE_W
: {
258 int16_t val
= src
->d
& 0xffffu
;
259 if (can_do_source_mods
)
261 memcpy(out
, &val
, 2);
264 case BRW_REGISTER_TYPE_UW
:
265 memcpy(out
, &src
->ud
, 2);
274 static struct brw_reg
275 build_imm_reg_for_copy(struct imm
*imm
)
279 return brw_imm_d(imm
->d64
);
281 return brw_imm_d(imm
->d
);
283 return brw_imm_w(imm
->w
);
285 unreachable("not implemented");
289 static inline uint32_t
290 get_alignment_for_imm(const struct imm
*imm
)
292 if (imm
->is_half_float
)
293 return 4; /* At least MAD seems to require this */
299 needs_negate(const fs_reg
*reg
, const struct imm
*imm
)
302 case BRW_REGISTER_TYPE_DF
:
303 return signbit(reg
->df
) != signbit(imm
->df
);
304 case BRW_REGISTER_TYPE_F
:
305 return signbit(reg
->f
) != signbit(imm
->f
);
306 case BRW_REGISTER_TYPE_Q
:
307 return (reg
->d64
< 0) != (imm
->d64
< 0);
308 case BRW_REGISTER_TYPE_D
:
309 return (reg
->d
< 0) != (imm
->d
< 0);
310 case BRW_REGISTER_TYPE_HF
:
311 return (reg
->d
& 0x8000u
) != (imm
->w
& 0x8000u
);
312 case BRW_REGISTER_TYPE_W
:
313 return ((int16_t)reg
->d
< 0) != (imm
->w
< 0);
314 case BRW_REGISTER_TYPE_UQ
:
315 case BRW_REGISTER_TYPE_UD
:
316 case BRW_REGISTER_TYPE_UW
:
319 unreachable("not implemented");
324 representable_as_hf(float f
, uint16_t *hf
)
327 uint16_t h
= _mesa_float_to_half(f
);
328 u
.f
= _mesa_half_to_float(h
);
339 represent_src_as_imm(const struct gen_device_info
*devinfo
,
342 /* TODO : consider specific platforms also */
343 if (devinfo
->gen
== 12) {
345 if (representable_as_hf(src
->f
, &hf
)) {
346 *src
= retype(brw_imm_uw(hf
), BRW_REGISTER_TYPE_HF
);
354 fs_visitor::opt_combine_constants()
356 void *const_ctx
= ralloc_context(NULL
);
361 table
.imm
= ralloc_array(const_ctx
, struct imm
, table
.size
);
363 const brw::idom_tree
&idom
= idom_analysis
.require();
366 /* Make a pass through all instructions and count the number of times each
367 * constant is used by coissueable instructions or instructions that cannot
368 * take immediate arguments.
370 foreach_block_and_inst(block
, fs_inst
, inst
, cfg
) {
373 if (!could_coissue(devinfo
, inst
) && !must_promote_imm(devinfo
, inst
))
376 bool represented_as_imm
= false;
377 for (int i
= 0; i
< inst
->sources
; i
++) {
378 if (inst
->src
[i
].file
!= IMM
)
381 if (!represented_as_imm
&& i
== 0 &&
382 inst
->opcode
== BRW_OPCODE_MAD
&&
383 represent_src_as_imm(devinfo
, &inst
->src
[i
])) {
384 represented_as_imm
= true;
390 if (!get_constant_value(devinfo
, inst
, i
, data
, &type
))
393 uint8_t size
= type_sz(type
);
395 struct imm
*imm
= find_imm(&table
, data
, size
);
398 bblock_t
*intersection
= idom
.intersect(block
, imm
->block
);
399 if (intersection
!= imm
->block
)
401 imm
->block
= intersection
;
402 imm
->uses
->push_tail(link(const_ctx
, &inst
->src
[i
]));
403 imm
->uses_by_coissue
+= could_coissue(devinfo
, inst
);
404 imm
->must_promote
= imm
->must_promote
|| must_promote_imm(devinfo
, inst
);
405 imm
->last_use_ip
= ip
;
406 if (type
== BRW_REGISTER_TYPE_HF
)
407 imm
->is_half_float
= true;
409 imm
= new_imm(&table
, const_ctx
);
412 imm
->uses
= new(const_ctx
) exec_list();
413 imm
->uses
->push_tail(link(const_ctx
, &inst
->src
[i
]));
414 memcpy(imm
->bytes
, data
, size
);
416 imm
->is_half_float
= type
== BRW_REGISTER_TYPE_HF
;
417 imm
->uses_by_coissue
= could_coissue(devinfo
, inst
);
418 imm
->must_promote
= must_promote_imm(devinfo
, inst
);
419 imm
->first_use_ip
= ip
;
420 imm
->last_use_ip
= ip
;
425 /* Remove constants from the table that don't have enough uses to make them
426 * profitable to store in a register.
428 for (int i
= 0; i
< table
.len
;) {
429 struct imm
*imm
= &table
.imm
[i
];
431 if (!imm
->must_promote
&& imm
->uses_by_coissue
< 4) {
432 table
.imm
[i
] = table
.imm
[table
.len
- 1];
438 if (table
.len
== 0) {
439 ralloc_free(const_ctx
);
442 if (cfg
->num_blocks
!= 1)
443 qsort(table
.imm
, table
.len
, sizeof(struct imm
), compare
);
445 /* Insert MOVs to load the constant values into GRFs. */
446 fs_reg
reg(VGRF
, alloc
.allocate(1));
448 for (int i
= 0; i
< table
.len
; i
++) {
449 struct imm
*imm
= &table
.imm
[i
];
450 /* Insert it either before the instruction that generated the immediate
451 * or after the last non-control flow instruction of the common ancestor.
453 exec_node
*n
= (imm
->inst
? imm
->inst
:
454 imm
->block
->last_non_control_flow_inst()->next
);
456 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
458 * "In Align16 mode, the channel selects and channel enables apply to a
459 * pair of half-floats, because these parameters are defined for DWord
460 * elements ONLY. This is applicable when both source and destination
463 * This means that Align16 instructions that use promoted HF immediates
464 * and use a <0,1,0>:HF region would read 2 HF slots instead of
465 * replicating the single one we want. To avoid this, we always populate
466 * both HF slots within a DWord with the constant.
468 const uint32_t width
= devinfo
->gen
== 8 && imm
->is_half_float
? 2 : 1;
469 const fs_builder ibld
= bld
.at(imm
->block
, n
).exec_all().group(width
, 0);
471 /* Put the immediate in an offset aligned to its size. Some instructions
472 * seem to have additional alignment requirements, so account for that
475 reg
.offset
= ALIGN(reg
.offset
, get_alignment_for_imm(imm
));
477 /* Ensure we have enough space in the register to copy the immediate */
478 struct brw_reg imm_reg
= build_imm_reg_for_copy(imm
);
479 if (reg
.offset
+ type_sz(imm_reg
.type
) * width
> REG_SIZE
) {
480 reg
.nr
= alloc
.allocate(1);
484 ibld
.MOV(retype(reg
, imm_reg
.type
), imm_reg
);
486 imm
->subreg_offset
= reg
.offset
;
488 reg
.offset
+= imm
->size
* width
;
490 shader_stats
.promoted_constants
= table
.len
;
492 /* Rewrite the immediate sources to refer to the new GRFs. */
493 for (int i
= 0; i
< table
.len
; i
++) {
494 foreach_list_typed(reg_link
, link
, link
, table
.imm
[i
].uses
) {
495 fs_reg
*reg
= link
->reg
;
498 case BRW_REGISTER_TYPE_DF
:
499 assert((isnan(reg
->df
) && isnan(table
.imm
[i
].df
)) ||
500 (fabs(reg
->df
) == fabs(table
.imm
[i
].df
)));
502 case BRW_REGISTER_TYPE_F
:
503 assert((isnan(reg
->f
) && isnan(table
.imm
[i
].f
)) ||
504 (fabsf(reg
->f
) == fabsf(table
.imm
[i
].f
)));
506 case BRW_REGISTER_TYPE_HF
:
507 assert((isnan(_mesa_half_to_float(reg
->d
& 0xffffu
)) &&
508 isnan(_mesa_half_to_float(table
.imm
[i
].w
))) ||
509 (fabsf(_mesa_half_to_float(reg
->d
& 0xffffu
)) ==
510 fabsf(_mesa_half_to_float(table
.imm
[i
].w
))));
512 case BRW_REGISTER_TYPE_Q
:
513 assert(abs(reg
->d64
) == abs(table
.imm
[i
].d64
));
515 case BRW_REGISTER_TYPE_UQ
:
516 assert(reg
->d64
== table
.imm
[i
].d64
);
518 case BRW_REGISTER_TYPE_D
:
519 assert(abs(reg
->d
) == abs(table
.imm
[i
].d
));
521 case BRW_REGISTER_TYPE_UD
:
522 assert(reg
->d
== table
.imm
[i
].d
);
524 case BRW_REGISTER_TYPE_W
:
525 assert(abs((int16_t) (reg
->d
& 0xffff)) == table
.imm
[i
].w
);
527 case BRW_REGISTER_TYPE_UW
:
528 assert((reg
->ud
& 0xffffu
) == (uint16_t) table
.imm
[i
].w
);
536 reg
->offset
= table
.imm
[i
].subreg_offset
;
538 reg
->negate
= needs_negate(reg
, &table
.imm
[i
]);
539 reg
->nr
= table
.imm
[i
].nr
;
544 for (int i
= 0; i
< table
.len
; i
++) {
545 struct imm
*imm
= &table
.imm
[i
];
547 printf("0x%016" PRIx64
" - block %3d, reg %3d sub %2d, "
548 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
549 (uint64_t)(imm
->d
& BITFIELD64_MASK(imm
->size
* 8)),
554 imm
->uses_by_coissue
,
557 imm
->last_use_ip
- imm
->first_use_ip
);
561 ralloc_free(const_ctx
);
562 invalidate_analysis(DEPENDENCY_INSTRUCTIONS
| DEPENDENCY_VARIABLES
);