src/gallium/drivers/freedreno/a2xx/ir2.c

   1 /*
   2  * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors:
  24  *    Jonathan Marek <jonathan@marek.ca>
  25  */
  26
  27 #include "ir2_private.h"
  28
  29 static bool scalar_possible(struct ir2_instr *instr)
  30 {
  31         if (instr->alu.scalar_opc == SCALAR_NONE)
  32                 return false;
  33
  34         return src_ncomp(instr) == 1;
  35 }
  36
  37 static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
  38 {
  39         if (!a)
  40                 return true;
  41
  42         /* dont use same instruction twice */
  43         if (a == b)
  44                 return false;
  45
  46         /* PRED_SET must be alone */
  47         if (b->alu.scalar_opc >= PRED_SETEs &&
  48                 b->alu.scalar_opc <= PRED_SET_RESTOREs)
  49                 return false;
  50
  51         /* must write to same export (issues otherwise?) */
  52         return a->alu.export == b->alu.export;
  53 }
  54
  55 /* priority of vector instruction for scheduling (lower=higher prio) */
  56 static unsigned alu_vector_prio(struct ir2_instr *instr)
  57 {
  58         if (instr->alu.vector_opc == VECTOR_NONE)
  59                 return ~0u;
  60
  61         if (is_export(instr))
  62                 return 4;
  63
  64         /* TODO check src type and ncomps */
  65         if (instr->src_count == 3)
  66                 return 0;
  67
  68         if (!scalar_possible(instr))
  69                 return 1;
  70
  71         return instr->src_count == 2 ? 2 : 3;
  72 }
  73
  74 /* priority of scalar instruction for scheduling (lower=higher prio) */
  75 static unsigned alu_scalar_prio(struct ir2_instr *instr)
  76 {
  77         if (!scalar_possible(instr))
  78                 return ~0u;
  79
  80         /* this case is dealt with later */
  81         if (instr->src_count > 1)
  82                 return ~0u;
  83
  84         if (is_export(instr))
  85                 return 4;
  86
  87         /* PRED to end of block */
  88         if (instr->alu.scalar_opc >= PRED_SETEs &&
  89                 instr->alu.scalar_opc <= PRED_SET_RESTOREs)
  90                 return 5;
  91
  92         /* scalar only have highest priority */
  93         return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
  94 }
  95
  96 /* this is a bit messy:
  97  * we want to find a slot where we can insert a scalar MOV with
  98  * a vector instruction that was already scheduled
  99  */
 100 static struct ir2_sched_instr*
 101 insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
 102         struct ir2_src src1, unsigned *comp)
 103 {
 104         struct ir2_sched_instr *sched = NULL, *s;
 105         unsigned i, mask = 0xf;
 106
 107         /* go first earliest point where the mov can be inserted */
 108         for (i = ctx->instr_sched_count-1; i > 0; i--) {
 109                 s = &ctx->instr_sched[i - 1];
 110
 111                 if (s->instr && s->instr->block_idx != block_idx)
 112                         break;
 113                 if (s->instr_s && s->instr_s->block_idx != block_idx)
 114                         break;
 115
 116                 if (src1.type == IR2_SRC_SSA) {
 117                         if ((s->instr && s->instr->idx == src1.num) ||
 118                                 (s->instr_s && s->instr_s->idx == src1.num))
 119                                 break;
 120                 }
 121
 122                 unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf);
 123                 if ((mask & mr) == 0)
 124                         break;
 125
 126                 mask &= mr;
 127                 if (s->instr_s || s->instr->src_count == 3)
 128                         continue;
 129
 130                 if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
 131                         continue;
 132
 133                 sched = s;
 134         }
 135         *comp = ffs(mask) - 1;
 136
 137         if (sched) {
 138                 for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
 139                         s->reg_state[reg_idx/8] |= 1 << (*comp+reg_idx%8*4);
 140         }
 141
 142         return sched;
 143 }
 144
 145 /* case1:
 146  * in this case, insert a mov to place the 2nd src into to same reg
 147  * (scalar sources come from the same register)
 148  *
 149  * this is a common case which works when one of the srcs is input/const
 150  * but for instrs which have 2 ssa/reg srcs, then its not ideal
 151  */
 152 static bool
 153 scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
 154 {
 155         struct ir2_src src0 = instr->src[ order];
 156         struct ir2_src src1 = instr->src[!order];
 157         struct ir2_sched_instr *sched;
 158         struct ir2_instr *ins;
 159         struct ir2_reg *reg;
 160         unsigned idx, comp;
 161
 162         switch (src0.type) {
 163         case IR2_SRC_CONST:
 164         case IR2_SRC_INPUT:
 165                 return false;
 166         default:
 167                 break;
 168         }
 169
 170         /* TODO, insert needs logic for this */
 171         if (src1.type == IR2_SRC_REG)
 172                 return false;
 173
 174         /* we could do something if they match src1.. */
 175         if (src0.negate || src0.abs)
 176                 return false;
 177
 178         reg = get_reg_src(ctx, &src0);
 179
 180         /* result not used more since we will overwrite */
 181         for (int i = 0; i < 4; i++)
 182                 if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
 183                         return false;
 184
 185         /* find a place to insert the mov */
 186         sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
 187         if (!sched)
 188                 return false;
 189
 190         ins = &ctx->instr[idx = ctx->instr_count++];
 191         ins->idx = idx;
 192         ins->type = IR2_ALU;
 193         ins->src[0] = src1;
 194         ins->src_count = 1;
 195         ins->is_ssa = true;
 196         ins->ssa.idx = reg->idx;
 197         ins->ssa.ncomp = 1;
 198         ins->ssa.comp[0].c = comp;
 199         ins->alu.scalar_opc = MAXs;
 200         ins->alu.export = -1;
 201         ins->alu.write_mask = 1;
 202         ins->pred = instr->pred;
 203         ins->block_idx = instr->block_idx;
 204
 205         instr->src[0] = src0;
 206         instr->alu.src1_swizzle = comp;
 207
 208         sched->instr_s = ins;
 209         return true;
 210 }
 211
 212 /* fill sched with next fetch or (vector and/or scalar) alu instruction */
 213 static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
 214 {
 215         struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
 216         unsigned avail_count = 0;
 217
 218         instr_alloc_type_t export = ~0u;
 219         int block_idx = -1;
 220
 221         /* XXX merge this loop with the other one somehow? */
 222         ir2_foreach_instr(instr, ctx) {
 223                 if (!instr->need_emit)
 224                         continue;
 225                 if (is_export(instr))
 226                         export = MIN2(export, export_buf(instr->alu.export));
 227         }
 228
 229         ir2_foreach_instr(instr, ctx) {
 230                 if (!instr->need_emit)
 231                         continue;
 232
 233                 /* dont mix exports */
 234                 if (is_export(instr) && export_buf(instr->alu.export) != export)
 235                         continue;
 236
 237                 if (block_idx < 0)
 238                         block_idx = instr->block_idx;
 239                 else if (block_idx != instr->block_idx || /* must be same block */
 240                         instr->type == IR2_CF || /* CF/MEM must be alone */
 241                         (is_export(instr) && export == SQ_MEMORY))
 242                         break;
 243                 /* it works because IR2_CF is always at end of block
 244                  * and somewhat same idea with MEM exports, which might not be alone
 245                  * but will end up in-order at least
 246                  */
 247
 248                 /* check if dependencies are satisfied */
 249                 bool is_ok = true;
 250                 ir2_foreach_src(src, instr) {
 251                         if (src->type == IR2_SRC_REG) {
 252                                 /* need to check if all previous instructions in the block
 253                                  * which write the reg have been emitted
 254                                  * slow..
 255                                  * XXX: check components instead of whole register
 256                                  */
 257                                 struct ir2_reg *reg = get_reg_src(ctx, src);
 258                                 ir2_foreach_instr(p, ctx) {
 259                                         if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
 260                                                 is_ok &= !p->need_emit;
 261                                 }
 262                         } else if (src->type == IR2_SRC_SSA) {
 263                                 /* in this case its easy, just check need_emit */
 264                                 is_ok &= !ctx->instr[src->num].need_emit;
 265                         }
 266                 }
 267                 /* don't reorder non-ssa write before read */
 268                 if (!instr->is_ssa) {
 269                         ir2_foreach_instr(p, ctx) {
 270                                 if (!p->need_emit || p->idx >= instr->idx)
 271                                         continue;
 272
 273                                 ir2_foreach_src(src, p) {
 274                                         if (get_reg_src(ctx, src) == instr->reg)
 275                                                 is_ok = false;
 276                                 }
 277                         }
 278                 }
 279                 /* don't reorder across predicates */
 280                 if (avail_count && instr->pred != avail[0]->pred)
 281                         is_ok = false;
 282
 283                 if (!is_ok)
 284                         continue;
 285
 286                 avail[avail_count++] = instr;
 287         }
 288
 289         if (!avail_count) {
 290                 assert(block_idx == -1);
 291                 return -1;
 292         }
 293
 294         /* priority to FETCH instructions */
 295         ir2_foreach_avail(instr) {
 296                 if (instr->type == IR2_ALU)
 297                         continue;
 298
 299                 ra_src_free(ctx, instr);
 300                 ra_reg(ctx, get_reg(instr), -1, false, 0);
 301
 302                 instr->need_emit = false;
 303                 sched->instr = instr;
 304                 sched->instr_s = NULL;
 305                 return block_idx;
 306         }
 307
 308         /* TODO precompute priorities */
 309
 310         unsigned prio_v = ~0u, prio_s = ~0u, prio;
 311         ir2_foreach_avail(instr) {
 312                 prio = alu_vector_prio(instr);
 313                 if (prio < prio_v) {
 314                         instr_v = instr;
 315                         prio_v = prio;
 316                 }
 317         }
 318
 319         /* TODO can still insert scalar if src_count=3, if smart about it */
 320         if (!instr_v || instr_v->src_count < 3) {
 321                 ir2_foreach_avail(instr) {
 322                         bool compat = is_alu_compatible(instr_v, instr);
 323
 324                         prio = alu_scalar_prio(instr);
 325                         if (prio >= prio_v && !compat)
 326                                 continue;
 327
 328                         if (prio < prio_s) {
 329                                 instr_s = instr;
 330                                 prio_s = prio;
 331                                 if (!compat)
 332                                         instr_v = NULL;
 333                         }
 334                 }
 335         }
 336
 337         assert(instr_v || instr_s);
 338
 339         /* now, we try more complex insertion of vector instruction as scalar
 340          * TODO: if we are smart we can still insert if instr_v->src_count==3
 341          */
 342         if (!instr_s && instr_v->src_count < 3) {
 343                 ir2_foreach_avail(instr) {
 344                         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
 345                                 continue;
 346
 347                         /* at this point, src_count should always be 2 */
 348                         assert(instr->src_count == 2);
 349
 350                         if (scalarize_case1(ctx, instr, 0)) {
 351                                 instr_s = instr;
 352                                 break;
 353                         }
 354                         if (scalarize_case1(ctx, instr, 1)) {
 355                                 instr_s = instr;
 356                                 break;
 357                         }
 358                 }
 359         }
 360
 361         /* free src registers */
 362         if (instr_v) {
 363                 instr_v->need_emit = false;
 364                 ra_src_free(ctx, instr_v);
 365         }
 366
 367         if (instr_s) {
 368                 instr_s->need_emit = false;
 369                 ra_src_free(ctx, instr_s);
 370         }
 371
 372         /* allocate dst registers */
 373         if (instr_v)
 374                 ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);
 375
 376         if (instr_s)
 377                 ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);
 378
 379         sched->instr = instr_v;
 380         sched->instr_s = instr_s;
 381         return block_idx;
 382 }
 383
 384 /* scheduling: determine order of instructions */
 385 static void schedule_instrs(struct ir2_context *ctx)
 386 {
 387         struct ir2_sched_instr *sched;
 388         int block_idx;
 389
 390         /* allocate input registers */
 391         for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
 392                 if (ctx->input[idx].initialized)
 393                         ra_reg(ctx, &ctx->input[idx], idx, false, 0);
 394
 395         for (;;) {
 396                 sched = &ctx->instr_sched[ctx->instr_sched_count++];
 397                 block_idx = sched_next(ctx, sched);
 398                 if (block_idx < 0)
 399                         break;
 400                 memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));
 401
 402                 /* catch texture fetch after scheduling and insert the
 403                  * SET_TEX_LOD right before it if necessary
 404                  * TODO clean this up
 405                  */
 406                 struct ir2_instr *instr = sched->instr, *tex_lod;
 407                 if (instr && instr->type == IR2_FETCH &&
 408                         instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
 409                         /* generate the SET_LOD instruction */
 410                         tex_lod = &ctx->instr[ctx->instr_count++];
 411                         tex_lod->type = IR2_FETCH;
 412                         tex_lod->block_idx = instr->block_idx;
 413                         tex_lod->pred = instr->pred;
 414                         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
 415                         tex_lod->src[0] = instr->src[1];
 416                         tex_lod->src_count = 1;
 417
 418                         sched[1] = sched[0];
 419                         sched->instr = tex_lod;
 420                         ctx->instr_sched_count++;
 421                 }
 422
 423                 bool free_block = true;
 424                 ir2_foreach_instr(instr, ctx)
 425                         free_block &= instr->block_idx != block_idx;
 426                 if (free_block)
 427                         ra_block_free(ctx, block_idx);
 428         };
 429         ctx->instr_sched_count--;
 430 }
 431
 432 void
 433 ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
 434                 struct fd2_shader_stateobj *fp)
 435 {
 436         struct ir2_context ctx = { };
 437         bool binning = !fp && so->type == MESA_SHADER_VERTEX;
 438
 439         if (fp)
 440                 so->variant[variant].f = fp->variant[0].f;
 441
 442         ctx.so = so;
 443         ctx.info = &so->variant[variant].info;
 444         ctx.f = &so->variant[variant].f;
 445         ctx.info->max_reg = -1;
 446
 447         /* convert nir to internal representation */
 448         ir2_nir_compile(&ctx, binning);
 449
 450         /* copy propagate srcs */
 451         cp_src(&ctx);
 452
 453         /* get ref_counts and kill non-needed instructions */
 454         ra_count_refs(&ctx);
 455
 456         /* remove movs used to write outputs */
 457         cp_export(&ctx);
 458
 459         /* instruction order.. and vector->scalar conversions */
 460         schedule_instrs(&ctx);
 461
 462         /* finally, assemble to bitcode */
 463         assemble(&ctx, binning);
 464 }