/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define QPU_R(i) { .magic = false, .index = i }

#define ACC_INDEX     0
#define ACC_COUNT     6
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT    64

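/* Returns true if no other ldtmu appears between "inst" and the next
 * instruction that writes the TMU, i.e. "inst" is the last ldtmu of its TMU
 * sequence.
 */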
static bool
is_last_ldtmu(struct qinst *inst, struct qblock *block)
{
        list_for_each_entry_from(struct qinst, scan_inst, inst,
                                 &block->instructions, link) {
                /* The scan starts at inst itself, which we already know is
                 * an ldtmu, so only consider the instructions after it.
                 */
                if (scan_inst == inst)
                        continue;

                if (scan_inst->qpu.sig.ldtmu)
                        return false;
                if (v3d_qpu_writes_tmu(&scan_inst->qpu))
                        return true;
        }

        return true;
}

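/* Returns whether the temp is defined by a uniform load (a ldunif signal or
 * a raw MOV from QFILE_UNIF), which can be rematerialized at its uses
 * instead of being spilled to scratch memory.
 */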
static bool
vir_is_mov_uniform(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];

        return (def &&
                (def->qpu.sig.ldunif ||
                 (vir_is_raw_mov(def) &&
                  def->src[0].file == QFILE_UNIF)));
}

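/* Estimates a spill cost for every spillable temp (cheap for rematerializable
 * uniforms, scaled up by tmu_scale for spills that need real TMU traffic) and
 * asks the register allocator for the best candidate node.
 */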
static int
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                      uint32_t *temp_to_node)
{
        const float tmu_scale = 5;
        float block_scale = 1.0;
        float spill_costs[c->num_temps];
        bool in_tmu_operation = false;
        bool started_last_seg = false;

        for (unsigned i = 0; i < c->num_temps; i++)
                spill_costs[i] = 0.0;

        /* XXX: Scale the cost up when inside of a loop. */
        vir_for_each_block(block, c) {
                vir_for_each_inst(inst, block) {
                        /* We can't insert a new TMU operation while currently
                         * in a TMU operation, and we can't insert new thread
                         * switches after starting output writes.
                         */
                        bool no_spilling =
                                (in_tmu_operation ||
                                 (c->threads > 1 && started_last_seg));

                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP)
                                        continue;

                                int temp = inst->src[i].index;
                                if (vir_is_mov_uniform(c, temp)) {
                                        spill_costs[temp] += block_scale;
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        if (inst->dst.file == QFILE_TEMP) {
                                int temp = inst->dst.index;

                                if (vir_is_mov_uniform(c, temp)) {
                                        /* We just rematerialize the uniform
                                         * later.
                                         */
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        /* Refuse to spill a ldvary's dst, because that means
                         * that ldvary's r5 would end up being used across a
                         * thrsw.
                         */
                        if (inst->qpu.sig.ldvary) {
                                assert(inst->dst.file == QFILE_TEMP);
                                BITSET_CLEAR(c->spillable, inst->dst.index);
                        }

                        if (inst->is_last_thrsw)
                                started_last_seg = true;

                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
                            v3d_qpu_uses_tlb(&inst->qpu))
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup.  We can't
                         * spill/fill any temps during that time, because that
                         * involves inserting a new TMU setup/LDTMU sequence.
                         */
                        if (inst->qpu.sig.ldtmu &&
                            is_last_ldtmu(inst, block))
                                in_tmu_operation = false;

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
                                in_tmu_operation = false;

                        if (v3d_qpu_writes_tmu(&inst->qpu))
                                in_tmu_operation = true;
                }
        }

        for (unsigned i = 0; i < c->num_temps; i++) {
                int node = temp_to_node[i];

                if (BITSET_TEST(c->spillable, i))
                        ra_set_node_spill_cost(g, node, spill_costs[i]);
        }

        return ra_get_best_spill_node(g);
}

/* The spill offset for this thread takes a bit of setup, so do it once at
 * program start.
 */
static void
v3d_setup_spill_base(struct v3d_compile *c)
{
        c->cursor = vir_before_block(vir_entry_block(c));

        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines.  We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);
}

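/* Adds the given offset to the per-channel spill base address and writes the
 * result to the TMUA magic register, kicking off a general TMU access to the
 * scratch space.
 */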
static void
v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
{
        vir_ADD_dest(c, vir_reg(QFILE_MAGIC,
                                V3D_QPU_WADDR_TMUA),
                     c->spill_base,
                     vir_uniform_ui(c, spill_offset));
}

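/* Rewrites every use of spill_temp: uniforms are rematerialized with a fresh
 * uniform load at each read, while other temps get a TMU store to scratch
 * after each write and a TMU load before each read.
 */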
static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
        bool is_uniform = vir_is_mov_uniform(c, spill_temp);

        uint32_t spill_offset = 0;

        if (!is_uniform) {
                spill_offset = c->spill_size;
                c->spill_size += 16 * sizeof(uint32_t);

                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(!last_thrsw || last_thrsw->is_last_thrsw);

        int start_num_temps = c->num_temps;

        int uniform_index = ~0;
        if (is_uniform) {
                struct qinst *orig_unif = c->defs[spill_temp];
                if (orig_unif->qpu.sig.ldunif) {
                        uniform_index = orig_unif->uniform;
                } else {
                        assert(orig_unif->src[0].file == QFILE_UNIF);
                        uniform_index = orig_unif->src[0].index;
                }
        }

        vir_for_each_inst_inorder_safe(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file != QFILE_TEMP ||
                            inst->src[i].index != spill_temp) {
                                continue;
                        }

                        c->cursor = vir_before_inst(inst);

                        if (is_uniform) {
                                inst->src[i] =
                                        vir_MOV(c, vir_uniform(c,
                                                c->uniform_contents[uniform_index],
                                                c->uniform_data[uniform_index]));
                        } else {
                                v3d_emit_spill_tmua(c, spill_offset);
                                inst->src[i] = vir_LDTMU(c);
                                c->fills++;
                        }
                }

                if (inst->dst.file == QFILE_TEMP &&
                    inst->dst.index == spill_temp) {
                        if (is_uniform) {
                                c->cursor.link = NULL;
                                vir_remove_instruction(c, inst);
                        } else {
                                c->cursor = vir_after_inst(inst);

                                inst->dst.index = c->num_temps++;
                                vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
                                                        V3D_QPU_WADDR_TMUD),
                                             inst->dst);
                                v3d_emit_spill_tmua(c, spill_offset);
                                c->spills++;
                        }
                }

                /* If we didn't have a last-thrsw inserted by nir_to_vir and
                 * we've been inserting thrsws, then insert a new last_thrsw
                 * right before we start the vpm/tlb sequence for the last
                 * thread segment.
                 */
                if (!is_uniform && !last_thrsw && c->last_thrsw &&
                    (v3d_qpu_writes_vpm(&inst->qpu) ||
                     v3d_qpu_uses_tlb(&inst->qpu))) {
                        c->cursor = vir_before_inst(inst);
                        vir_emit_thrsw(c);

                        last_thrsw = c->last_thrsw;
                        last_thrsw->is_last_thrsw = true;
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        if (last_thrsw)
                c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions.  There's no way
         * they can help get things colored.
         */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);
}

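/* Round-robin state for v3d_ra_select_callback() below. */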
struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
};

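/* Callback the register allocator uses to pick a register for a node: prefer
 * r5, then round-robin through the accumulators, then through the physical
 * register file.
 */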
static unsigned int
v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *v3d_ra = data;
        int r5 = ACC_INDEX + 5;

        /* Choose r5 for our ldunifs if possible (nobody else can load to that
         * reg, and it keeps the QPU cond field free from being occupied by
         * ldunifrf).
         */
        if (BITSET_TEST(regs, r5))
                return r5;

        /* Choose an accumulator if possible (I think it's lower power than
         * phys regs), but round-robin through them to give post-RA
         * instruction selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        v3d_ra->next_acc = acc_off + 1;
                        return acc;
                }
        }

        for (int i = 0; i < PHYS_COUNT; i++) {
                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
                int phys = PHYS_INDEX + phys_off;

                if (BITSET_TEST(regs, phys)) {
                        v3d_ra->next_phys = phys_off + 1;
                        return phys;
                }
        }

        unreachable("RA must pass us at least one possible reg.");
}

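/* Builds the register file description shared by all compiles: one set of
 * register classes per fragment shader threading mode.
 */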
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
        /* Allocate up to 3 regfile classes, for the ways the physical
         * register file can be divided up for fragment shader threading.
         */
        int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);

        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                          true);
        if (!compiler->regs)
                return false;

        for (int threads = 0; threads < max_thread_index; threads++) {
                compiler->reg_class_any[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_r5[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_phys_or_acc[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_phys[threads] =
                        ra_alloc_reg_class(compiler->regs);

                for (int i = PHYS_INDEX;
                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_any[threads], i);
                }

                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_any[threads], i);
                }

                /* r5 can only store a single 32-bit value, so not much can
                 * use it.
                 */
                ra_class_add_reg(compiler->regs,
                                 compiler->reg_class_r5[threads],
                                 ACC_INDEX + 5);
                ra_class_add_reg(compiler->regs,
                                 compiler->reg_class_any[threads],
                                 ACC_INDEX + 5);
        }

        ra_set_finalize(compiler->regs, NULL);

        return true;
}

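/* Associates a temp with the length of its live range, which is used as its
 * sort priority when assigning RA node numbers.
 */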
struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

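/* qsort comparator: orders entries by increasing live-range length, so the
 * shortest-lived temps end up with the lowest node numbers.
 */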
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}

#define CLASS_BIT_PHYS          (1 << 0)
#define CLASS_BIT_ACC           (1 << 1)
#define CLASS_BIT_R5            (1 << 4)
#define CLASS_BITS_ANY          (CLASS_BIT_PHYS | \
                                 CLASS_BIT_ACC | \
                                 CLASS_BIT_R5)

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        int acc_nodes[ACC_COUNT];
        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
        };

        *spilled = false;

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        int thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (thread_index >= 1)
                        thread_index--;
        }

*g
= ra_alloc_interference_graph(c
->compiler
->regs
,
461 ARRAY_SIZE(acc_nodes
));
462 ra_set_select_reg_callback(g
, v3d_ra_select_callback
, &callback_data
);
464 /* Make some fixed nodes for the accumulators, which we will need to
465 * interfere with when ops have implied r3/r4 writes or for the thread
466 * switches. We could represent these as classes for the nodes to
467 * live in, but the classes take up a lot of memory to set up, so we
468 * don't want to make too many.
470 for (int i
= 0; i
< ARRAY_SIZE(acc_nodes
); i
++) {
471 acc_nodes
[i
] = c
->num_temps
+ i
;
472 ra_set_node_reg(g
, acc_nodes
[i
], ACC_INDEX
+ i
);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers.  We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));

        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                /* If the instruction writes r3/r4 (and optionally moves its
                 * result to a temp), nothing else can be stored in r3/r4 across
                 * it.
                 */
                if (vir_writes_r3(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[3]);
                                }
                        }
                }

                if (vir_writes_r4(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[4]);
                                }
                        }
                }

                if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                        switch (inst->qpu.alu.add.op) {
                        case V3D_QPU_A_LDVPMV_IN:
                        case V3D_QPU_A_LDVPMV_OUT:
                        case V3D_QPU_A_LDVPMD_IN:
                        case V3D_QPU_A_LDVPMD_OUT:
                        case V3D_QPU_A_LDVPMP:
                        case V3D_QPU_A_LDVPMG_IN:
                        case V3D_QPU_A_LDVPMG_OUT:
                                /* LDVPMs only store to temps (the MA flag
                                 * decides whether the LDVPM is in or out)
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        case V3D_QPU_A_RECIP:
                        case V3D_QPU_A_RSQRT:
                        case V3D_QPU_A_EXP:
                        case V3D_QPU_A_LOG:
                        case V3D_QPU_A_SIN:
                        case V3D_QPU_A_RSQRT2:
                                /* The SFU instructions write directly to the
                                 * phys regfile.
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        default:
                                break;
                        }
                }

                if (inst->src[0].file == QFILE_REG) {
                        switch (inst->src[0].index) {
                        case 0:
                        case 1:
                        case 2:
                        case 3:
                                /* Payload setup instructions: Force allocate
                                 * the dst to the given register (so the MOV
                                 * will disappear).
                                 */
                                assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                                assert(inst->dst.file == QFILE_TEMP);
                                ra_set_node_reg(g,
                                                temp_to_node[inst->dst.index],
                                                PHYS_INDEX +
                                                inst->src[0].index);
                                break;
                        }
                }

                if (inst->dst.file == QFILE_TEMP) {
                        /* Only a ldunif gets to write to R5, which only has a
                         * single 32-bit channel of storage.
                         */
                        if (!inst->qpu.sig.ldunif) {
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
                        } else {
                                /* Until V3D 4.x, we could only load a uniform
                                 * to r5, so we'll need to spill if uniform
                                 * loads interfere with each other.
                                 */
                                if (c->devinfo->ver < 40) {
                                        class_bits[inst->dst.index] &=
                                                CLASS_BIT_R5;
                                }
                        }
                }

                if (inst->qpu.sig.thrsw) {
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= CLASS_BIT_PHYS;
                        }
                }

                ip++;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (class_bits[i] == CLASS_BIT_PHYS) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_R5)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_r5[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys_or_acc[thread_index]);
                } else {
                        assert(class_bits[i] == CLASS_BITS_ANY);
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_any[thread_index]);
                }
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        /* Debug code to force a bit of register spilling, for running across
         * conformance tests to make sure that spilling works.
         */
        int force_register_spills = 0;
        if (c->spill_size < 16 * sizeof(uint32_t) * force_register_spills) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);
                if (node != -1) {
                        v3d_spill_reg(c, map[node].temp);
                        ralloc_free(g);
                        *spilled = true;
                        return NULL;
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);

                /* Don't emit spills using the TMU until we've dropped thread
                 * count first.
                 */
                if (node != -1 &&
                    (vir_is_mov_uniform(c, map[node].temp) ||
                     thread_index == 0)) {
                        v3d_spill_reg(c, map[node].temp);

                        /* Ask the outer loop to call back in. */
                        *spilled = true;
                }

                ralloc_free(g);
                return NULL;
        }

        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));

        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }

                /* If the value's never used, just write to the NOP register
                 * for clarity in debug output.
                 */
                if (c->temp_start[i] == c->temp_end[i]) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = V3D_QPU_WADDR_NOP;
                }
        }

        ralloc_free(g);

        if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id,
                        c->spills);

                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id,
                        c->fills);
        }

        return temp_registers;
}