/*
 * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 #include "util/register_allocate.h"
25 #include "compiler_defines.h"
26 #include "bifrost_sched.h"
27 #include "bifrost_compile.h"
28 #include "bifrost_print.h"
/* Register-class geometry for the allocator.
 *
 * The scalar, vec2, vec3 and vec4 classes all alias one physical register
 * file; the *_base values index each class's first node in a flat space of
 * max_registers RA nodes, and vec4_end marks one past the last node. */
const unsigned max_primary_reg = 64; /* Overestimate because of special regs */
const unsigned max_vec2_reg = 32;
const unsigned max_vec3_reg = 16; // XXX: Do we need to align vec3 to vec4 boundary?
const unsigned max_vec4_reg = 16;
const unsigned max_registers = 128; /* Sum of classes */
const unsigned primary_base = 0;
const unsigned vec2_base = 64;
const unsigned vec3_base = 96; /* above base + max_class_reg */
const unsigned vec4_base = 112;
const unsigned vec4_end = 128;
43 find_or_allocate_temp(compiler_context
*ctx
, unsigned hash
)
45 if (hash
>= SSA_FIXED_MINIMUM
)
48 unsigned temp
= (uintptr_t) _mesa_hash_table_u64_search(ctx
->hash_to_temp
, hash
+ 1);
53 /* If no temp is find, allocate one */
54 temp
= ctx
->num_temps
++;
55 ctx
->max_hash
= MAX2(ctx
->max_hash
, hash
);
57 _mesa_hash_table_u64_insert(ctx
->hash_to_temp
, hash
+ 1, (void *) ((uintptr_t) temp
+ 1));
63 is_live_in_instr(bifrost_instruction
*instr
, unsigned temp
)
65 if (instr
->ssa_args
.src0
== temp
) return true;
66 if (instr
->ssa_args
.src1
== temp
) return true;
67 if (instr
->ssa_args
.src2
== temp
) return true;
68 if (instr
->ssa_args
.src3
== temp
) return true;
74 is_live_after_instr(compiler_context
*ctx
, bifrost_block
*blk
, bifrost_instruction
*instr
, unsigned temp
)
76 // Scan forward in the block from this location to see if we are still live.
78 mir_foreach_instr_in_block_from(blk
, ins
, mir_next_instr(instr
)) {
79 if (is_live_in_instr(ins
, temp
))
83 // XXX: Walk all successor blocks and ensure the value isn't used there
89 ra_select_callback(struct ra_graph
*g
, BITSET_WORD
*regs
, void *data
)
91 for (int i
= primary_base
; i
< vec4_end
; ++i
) {
92 if (BITSET_TEST(regs
, i
)) {
102 ra_get_phys_reg(compiler_context
*ctx
, struct ra_graph
*g
, unsigned temp
, unsigned max_reg
)
104 if (temp
== SSA_INVALID_VALUE
||
105 temp
>= SSA_FIXED_UREG_MINIMUM
||
106 temp
== SSA_FIXED_CONST_0
)
109 if (temp
>= SSA_FIXED_MINIMUM
)
110 return SSA_REG_FROM_FIXED(temp
);
112 assert(temp
< max_reg
);
113 uint32_t r
= ra_get_node_reg(g
, temp
);
115 return (r
- vec4_base
) * 4;
116 else if (r
>= vec3_base
)
117 return (r
- vec3_base
) * 4;
118 else if (r
>= vec2_base
)
119 return (r
- vec2_base
) * 2;
125 allocate_registers(compiler_context
*ctx
)
127 struct ra_regs
*regs
= ra_alloc_reg_set(NULL
, max_registers
, true);
129 int primary_class
= ra_alloc_reg_class(regs
);
130 int vec2_class
= ra_alloc_reg_class(regs
);
131 int vec3_class
= ra_alloc_reg_class(regs
);
132 int vec4_class
= ra_alloc_reg_class(regs
);
134 // Allocate our register classes and conflicts
137 unsigned primary_base
= 0;
139 // Add all of our primary scalar registers
140 for (unsigned i
= 0; i
< max_primary_reg
; ++i
) {
141 ra_class_add_reg(regs
, primary_class
, reg
);
145 // Add all of our vec2 class registers
146 // These alias with the scalar registers
147 for (unsigned i
= 0; i
< max_vec2_reg
; ++i
) {
148 ra_class_add_reg(regs
, vec2_class
, reg
);
150 // Tell RA that this conflicts with primary class registers
151 // Make sure to tell the RA utility all conflict slots
152 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*2 + 0);
153 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*2 + 1);
158 // Add all of our vec3 class registers
159 // These alias with the scalar registers
160 for (unsigned i
= 0; i
< max_vec3_reg
; ++i
) {
161 ra_class_add_reg(regs
, vec3_class
, reg
);
163 // Tell RA that this conflicts with primary class registers
164 // Make sure to tell the RA utility all conflict slots
165 // These are aligned to vec4 even though they only conflict with a vec3 wide slot
166 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*4 + 0);
167 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*4 + 1);
168 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*4 + 2);
170 // State that this class conflicts with the vec2 class
171 ra_add_reg_conflict(regs
, reg
, vec2_base
+ i
*2 + 0);
172 ra_add_reg_conflict(regs
, reg
, vec2_base
+ i
*2 + 1);
177 // Add all of our vec4 class registers
178 // These alias with the scalar registers
179 for (unsigned i
= 0; i
< max_vec4_reg
; ++i
) {
180 ra_class_add_reg(regs
, vec4_class
, reg
);
182 // Tell RA that this conflicts with primary class registers
183 // Make sure to tell the RA utility all conflict slots
184 // These are aligned to vec4 even though they only conflict with a vec3 wide slot
185 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*4 + 0);
186 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*4 + 1);
187 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*4 + 2);
188 ra_add_reg_conflict(regs
, reg
, primary_base
+ i
*4 + 3);
190 // State that this class conflicts with the vec2 class
191 ra_add_reg_conflict(regs
, reg
, vec2_base
+ i
*2 + 0);
192 ra_add_reg_conflict(regs
, reg
, vec2_base
+ i
*2 + 1);
194 // State that this class conflicts with the vec3 class
195 // They conflict on the exact same location due to alignments
196 ra_add_reg_conflict(regs
, reg
, vec3_base
+ i
);
202 ra_set_finalize(regs
, NULL
);
203 mir_foreach_block(ctx
, block
) {
204 mir_foreach_instr_in_block(block
, instr
) {
205 instr
->ssa_args
.src0
= find_or_allocate_temp(ctx
, instr
->ssa_args
.src0
);
206 instr
->ssa_args
.src1
= find_or_allocate_temp(ctx
, instr
->ssa_args
.src1
);
207 instr
->ssa_args
.src2
= find_or_allocate_temp(ctx
, instr
->ssa_args
.src2
);
208 instr
->ssa_args
.src3
= find_or_allocate_temp(ctx
, instr
->ssa_args
.src3
);
209 instr
->ssa_args
.dest
= find_or_allocate_temp(ctx
, instr
->ssa_args
.dest
);
213 uint32_t nodes
= ctx
->num_temps
;
214 struct ra_graph
*g
= ra_alloc_interference_graph(regs
, nodes
);
216 mir_foreach_block(ctx
, block
) {
217 mir_foreach_instr_in_block(block
, instr
) {
218 if (instr
->ssa_args
.dest
>= SSA_FIXED_MINIMUM
) continue;
219 if (instr
->dest_components
== 4)
220 ra_set_node_class(g
, instr
->ssa_args
.dest
, vec4_class
);
221 else if (instr
->dest_components
== 3)
222 ra_set_node_class(g
, instr
->ssa_args
.dest
, vec3_class
);
223 else if (instr
->dest_components
== 2)
224 ra_set_node_class(g
, instr
->ssa_args
.dest
, vec2_class
);
226 ra_set_node_class(g
, instr
->ssa_args
.dest
, primary_class
);
230 uint32_t *live_start
= malloc(nodes
* sizeof(uint32_t));
231 uint32_t *live_end
= malloc(nodes
* sizeof(uint32_t));
233 memset(live_start
, 0xFF, nodes
* sizeof(uint32_t));
234 memset(live_end
, 0xFF, nodes
* sizeof(uint32_t));
236 uint32_t location
= 0;
237 mir_foreach_block(ctx
, block
) {
238 mir_foreach_instr_in_block(block
, instr
) {
239 if (instr
->ssa_args
.dest
< SSA_FIXED_MINIMUM
) {
240 // If the destination isn't yet live before this point
241 // then this is the point it becomes live since we wrote to it
242 if (live_start
[instr
->ssa_args
.dest
] == ~0U) {
243 live_start
[instr
->ssa_args
.dest
] = location
;
247 uint32_t sources
[4] = {
248 instr
->ssa_args
.src0
,
249 instr
->ssa_args
.src1
,
250 instr
->ssa_args
.src2
,
251 instr
->ssa_args
.src3
,
254 for (unsigned i
= 0; i
< 4; ++i
) {
255 if (sources
[i
] >= SSA_FIXED_MINIMUM
)
258 // If the source is no longer live after this instruction then we can end its liveness
259 if (!is_live_after_instr(ctx
, block
, instr
, sources
[i
])) {
260 live_end
[sources
[i
]] = location
;
267 // Spin through the nodes quick and ensure they are all killed by the end of the program
268 for (unsigned i
= 0; i
< nodes
; ++i
) {
269 if (live_end
[i
] == ~0U)
270 live_end
[i
] = location
;
273 for (int i
= 0; i
< nodes
; ++i
) {
274 for (int j
= i
+ 1; j
< nodes
; ++j
) {
275 if (!(live_start
[i
] >= live_end
[j
] || live_start
[j
] >= live_end
[i
])) {
276 ra_add_node_interference(g
, i
, j
);
281 ra_set_select_reg_callback(g
, ra_select_callback
, NULL
);
283 if (!ra_allocate(g
)) {
290 mir_foreach_block(ctx
, block
) {
291 mir_foreach_instr_in_block(block
, instr
) {
292 instr
->args
.src0
= ra_get_phys_reg(ctx
, g
, instr
->ssa_args
.src0
, nodes
);
293 instr
->args
.src1
= ra_get_phys_reg(ctx
, g
, instr
->ssa_args
.src1
, nodes
);
294 instr
->args
.src2
= ra_get_phys_reg(ctx
, g
, instr
->ssa_args
.src2
, nodes
);
295 instr
->args
.src3
= ra_get_phys_reg(ctx
, g
, instr
->ssa_args
.src3
, nodes
);
296 instr
->args
.dest
= ra_get_phys_reg(ctx
, g
, instr
->ssa_args
.dest
, nodes
);
302 bundle_block(compiler_context
*ctx
, bifrost_block
*block
)
307 remove_create_vectors(compiler_context
*ctx
, bifrost_block
*block
)
309 mir_foreach_instr_in_block_safe(block
, instr
) {
310 if (instr
->op
!= op_create_vector
) continue;
312 uint32_t vector_ssa_sources
[4] = {
313 instr
->ssa_args
.src0
,
314 instr
->ssa_args
.src1
,
315 instr
->ssa_args
.src2
,
316 instr
->ssa_args
.src3
,
319 mir_foreach_instr_in_block_from_rev(block
, next_instr
, instr
) {
320 // Walk our block backwards and find the creators of this vector creation instruction
321 for (unsigned i
= 0; i
< instr
->dest_components
; ++i
) {
322 // If this instruction is ther one that writes this register then forward it to the real register
323 if (vector_ssa_sources
[i
] == next_instr
->ssa_args
.dest
) {
324 next_instr
->ssa_args
.dest
= vector_ssa_sources
[i
];
325 // Source instruction destination is a vector register of size dest_components
326 // So dest + i gets the components of it
327 next_instr
->args
.dest
= instr
->args
.dest
+ i
;
332 // Remove the instruction now that we have copied over all the sources
333 mir_remove_instr(instr
);
338 remove_extract_elements(compiler_context
*ctx
, bifrost_block
*block
)
340 mir_foreach_instr_in_block_safe(block
, instr
) {
341 if (instr
->op
!= op_extract_element
) continue;
343 mir_foreach_instr_in_block_from(block
, next_instr
, instr
) {
344 // Walk our block forward to replace uses of this register with a real register
346 // src1 = index in to vector
347 uint32_t vector_ssa_sources
[4] = {
348 next_instr
->ssa_args
.src0
,
349 next_instr
->ssa_args
.src1
,
350 next_instr
->ssa_args
.src2
,
351 next_instr
->ssa_args
.src3
,
353 uint32_t *vector_sources
[4] = {
354 &next_instr
->args
.src0
,
355 &next_instr
->args
.src1
,
356 &next_instr
->args
.src2
,
357 &next_instr
->args
.src3
,
360 for (unsigned i
= 0; i
< 4; ++i
) {
361 if (vector_ssa_sources
[i
] == instr
->ssa_args
.dest
) {
362 // This source uses this vector extraction
363 // Replace its usage with the real register
364 // src0 is a vector register and src1 is the constant element of the vector
365 *vector_sources
[i
] = instr
->args
.src0
+ instr
->literal_args
[0];
371 // Remove the instruction now that we have copied over all the sources
372 mir_remove_instr(instr
);
377 void schedule_program(compiler_context
*ctx
)
379 // XXX: we should move instructions together before RA that can feed in to each other and be scheduled in the same clause
380 allocate_registers(ctx
);
382 mir_foreach_block(ctx
, block
) {
383 remove_create_vectors(ctx
, block
);
384 remove_extract_elements(ctx
, block
);
387 mir_foreach_block(ctx
, block
) {
389 print_mir_block(block
, true);
392 bundle_block(ctx
, block
);