src/gallium/drivers/vc4/vc4_register_allocate.c

   1 /*
   2  * Copyright © 2014 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "util/ralloc.h"
  25 #include "util/register_allocate.h"
  26 #include "vc4_context.h"
  27 #include "vc4_qir.h"
  28 #include "vc4_qpu.h"
  29
  30 #define QPU_R(file, index) { QPU_MUX_##file, index }
  31
  32 static const struct qpu_reg vc4_regs[] = {
  33         { QPU_MUX_R0, 0},
  34         { QPU_MUX_R1, 0},
  35         { QPU_MUX_R2, 0},
  36         { QPU_MUX_R3, 0},
  37         { QPU_MUX_R4, 0},
  38         QPU_R(A, 0),
  39         QPU_R(B, 0),
  40         QPU_R(A, 1),
  41         QPU_R(B, 1),
  42         QPU_R(A, 2),
  43         QPU_R(B, 2),
  44         QPU_R(A, 3),
  45         QPU_R(B, 3),
  46         QPU_R(A, 4),
  47         QPU_R(B, 4),
  48         QPU_R(A, 5),
  49         QPU_R(B, 5),
  50         QPU_R(A, 6),
  51         QPU_R(B, 6),
  52         QPU_R(A, 7),
  53         QPU_R(B, 7),
  54         QPU_R(A, 8),
  55         QPU_R(B, 8),
  56         QPU_R(A, 9),
  57         QPU_R(B, 9),
  58         QPU_R(A, 10),
  59         QPU_R(B, 10),
  60         QPU_R(A, 11),
  61         QPU_R(B, 11),
  62         QPU_R(A, 12),
  63         QPU_R(B, 12),
  64         QPU_R(A, 13),
  65         QPU_R(B, 13),
  66         QPU_R(A, 14),
  67         QPU_R(B, 14),
  68         QPU_R(A, 15),
  69         QPU_R(B, 15),
  70         QPU_R(A, 16),
  71         QPU_R(B, 16),
  72         QPU_R(A, 17),
  73         QPU_R(B, 17),
  74         QPU_R(A, 18),
  75         QPU_R(B, 18),
  76         QPU_R(A, 19),
  77         QPU_R(B, 19),
  78         QPU_R(A, 20),
  79         QPU_R(B, 20),
  80         QPU_R(A, 21),
  81         QPU_R(B, 21),
  82         QPU_R(A, 22),
  83         QPU_R(B, 22),
  84         QPU_R(A, 23),
  85         QPU_R(B, 23),
  86         QPU_R(A, 24),
  87         QPU_R(B, 24),
  88         QPU_R(A, 25),
  89         QPU_R(B, 25),
  90         QPU_R(A, 26),
  91         QPU_R(B, 26),
  92         QPU_R(A, 27),
  93         QPU_R(B, 27),
  94         QPU_R(A, 28),
  95         QPU_R(B, 28),
  96         QPU_R(A, 29),
  97         QPU_R(B, 29),
  98         QPU_R(A, 30),
  99         QPU_R(B, 30),
 100         QPU_R(A, 31),
 101         QPU_R(B, 31),
 102 };
 103 #define ACC_INDEX     0
 104 #define AB_INDEX      (ACC_INDEX + 5)
 105
 106 static void
 107 vc4_alloc_reg_set(struct vc4_context *vc4)
 108 {
 109         assert(vc4_regs[AB_INDEX].addr == 0);
 110         assert(vc4_regs[AB_INDEX + 1].addr == 0);
 111         STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == AB_INDEX + 64);
 112
 113         if (vc4->regs)
 114                 return;
 115
 116         vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
 117
 118         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
 119         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
 120                 /* Reserve r3 for now, since we're using it for spilling-like
 121                  * operations in vc4_qpu_emit.c
 122                  */
 123                 if (vc4_regs[i].mux == QPU_MUX_R3)
 124                         continue;
 125
 126                 /* R4 can't be written as a general purpose register. (it's
 127                  * TMU_NOSWAP as a write address).
 128                  */
 129                 if (vc4_regs[i].mux == QPU_MUX_R4)
 130                         continue;
 131
 132                 ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
 133         }
 134
 135         vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
 136         for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2)
 137                 ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
 138
 139         ra_set_finalize(vc4->regs, NULL);
 140 }
 141
 142 struct node_to_temp_map {
 143         uint32_t temp;
 144         uint32_t priority;
 145 };
 146
 147 static int
 148 node_to_temp_priority(const void *in_a, const void *in_b)
 149 {
 150         const struct node_to_temp_map *a = in_a;
 151         const struct node_to_temp_map *b = in_b;
 152
 153         return a->priority - b->priority;
 154 }
 155
 156 /**
 157  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 158  *
 159  * The return value should be freed by the caller.
 160  */
 161 struct qpu_reg *
 162 vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
 163 {
 164         struct simple_node *node;
 165         struct node_to_temp_map map[c->num_temps];
 166         uint32_t temp_to_node[c->num_temps];
 167         uint32_t def[c->num_temps];
 168         uint32_t use[c->num_temps];
 169         struct qpu_reg *temp_registers = calloc(c->num_temps,
 170                                                 sizeof(*temp_registers));
 171         memset(def, 0, sizeof(def));
 172         memset(use, 0, sizeof(use));
 173
 174         /* If things aren't ever written (undefined values), just read from
 175          * r0.
 176          */
 177         for (uint32_t i = 0; i < c->num_temps; i++)
 178                 temp_registers[i] = qpu_rn(0);
 179
 180         vc4_alloc_reg_set(vc4);
 181
 182         struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
 183                                                          c->num_temps);
 184
 185         for (uint32_t i = 0; i < c->num_temps; i++) {
 186                 ra_set_node_class(g, i, vc4->reg_class_any);
 187         }
 188
 189         /* Compute the live ranges so we can figure out interference.
 190          */
 191         uint32_t ip = 0;
 192         foreach(node, &c->instructions) {
 193                 struct qinst *inst = (struct qinst *)node;
 194
 195                 if (inst->dst.file == QFILE_TEMP) {
 196                         def[inst->dst.index] = ip;
 197                         use[inst->dst.index] = ip;
 198                 }
 199
 200                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
 201                         if (inst->src[i].file == QFILE_TEMP)
 202                                 use[inst->src[i].index] = ip;
 203                 }
 204
 205                 switch (inst->op) {
 206                 case QOP_FRAG_Z:
 207                 case QOP_FRAG_W:
 208                         /* The payload registers have values implicitly loaded
 209                          * at the start of the program.
 210                          */
 211                         def[inst->dst.index] = 0;
 212                         break;
 213                 default:
 214                         break;
 215                 }
 216
 217                 ip++;
 218         }
 219
 220         for (uint32_t i = 0; i < c->num_temps; i++) {
 221                 map[i].temp = i;
 222                 map[i].priority = use[i] - def[i];
 223         }
 224         qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
 225         for (uint32_t i = 0; i < c->num_temps; i++) {
 226                 temp_to_node[map[i].temp] = i;
 227         }
 228
 229         /* Figure out our register classes and preallocated registers*/
 230         foreach(node, &c->instructions) {
 231                 struct qinst *inst = (struct qinst *)node;
 232
 233                 switch (inst->op) {
 234                 case QOP_FRAG_Z:
 235                         ra_set_node_reg(g, temp_to_node[inst->dst.index],
 236                                         AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1);
 237                         break;
 238
 239                 case QOP_FRAG_W:
 240                         ra_set_node_reg(g, temp_to_node[inst->dst.index],
 241                                         AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
 242                         break;
 243
 244                 case QOP_TEX_RESULT:
 245                 case QOP_TLB_COLOR_READ:
 246                         assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
 247                         ra_set_node_reg(g, temp_to_node[inst->dst.index],
 248                                         ACC_INDEX + 4);
 249                         break;
 250
 251                 case QOP_PACK_SCALED:
 252                         /* The pack flags require an A-file dst register. */
 253                         ra_set_node_class(g, temp_to_node[inst->dst.index],
 254                                           vc4->reg_class_a);
 255                         break;
 256
 257                 case QOP_UNPACK_8A:
 258                 case QOP_UNPACK_8B:
 259                 case QOP_UNPACK_8C:
 260                 case QOP_UNPACK_8D:
 261                         /* The unpack flags require an A-file src register. */
 262                         ra_set_node_class(g, temp_to_node[inst->src[0].index],
 263                                           vc4->reg_class_a);
 264                         break;
 265
 266                 default:
 267                         break;
 268                 }
 269         }
 270
 271         for (uint32_t i = 0; i < c->num_temps; i++) {
 272                 for (uint32_t j = i + 1; j < c->num_temps; j++) {
 273                         if (!(def[i] >= use[j] || def[j] >= use[i])) {
 274                                 ra_add_node_interference(g,
 275                                                          temp_to_node[i],
 276                                                          temp_to_node[j]);
 277                         }
 278                 }
 279         }
 280
 281         bool ok = ra_allocate(g);
 282         assert(ok);
 283
 284         for (uint32_t i = 0; i < c->num_temps; i++) {
 285                 temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];
 286
 287                 /* If the value's never used, just write to the NOP register
 288                  * for clarity in debug output.
 289                  */
 290                 if (def[i] == use[i])
 291                         temp_registers[i] = qpu_ra(QPU_W_NOP);
 292         }
 293
 294         ralloc_free(g);
 295
 296         return temp_registers;
 297 }