v3d: Fix copy-propagation of input unpacks.
[mesa.git] / src / broadcom / compiler / vir.c
/*
 * Copyright © 2016-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"

int
vir_get_non_sideband_nsrc(struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return 0;
        case V3D_QPU_INSTR_TYPE_ALU:
                if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
                        return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
                else
                        return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
        }

        return 0;
}

int
vir_get_nsrc(struct qinst *inst)
{
        int nsrc = vir_get_non_sideband_nsrc(inst);

        if (vir_has_implicit_uniform(inst))
                nsrc++;

        return nsrc;
}

bool
vir_has_implicit_uniform(struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                switch (inst->dst.file) {
                case QFILE_TLBU:
                        return true;
                case QFILE_MAGIC:
                        switch (inst->dst.index) {
                        case V3D_QPU_WADDR_TLBU:
                        case V3D_QPU_WADDR_TMUAU:
                        case V3D_QPU_WADDR_SYNCU:
                                return true;
                        default:
                                break;
                        }
                        break;
                default:
                        return inst->has_implicit_uniform;
                }
        }
        return false;
}

/* The sideband uniform for textures gets stored after the normal ALU
 * arguments.
 */
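/* For example, a MOV whose destination is the magic TMUAU register has one
 * ALU source plus the sideband uniform, so vir_get_nsrc() reports 2 and the
 * implicit uniform is read from src[1].
 */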
int
vir_get_implicit_uniform_src(struct qinst *inst)
{
        if (!vir_has_implicit_uniform(inst))
                return -1;
        return vir_get_nsrc(inst) - 1;
}

/**
 * Returns whether the instruction has any side effects that must be
 * preserved.
 */
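/* Note: besides the explicit store/setup opcodes below, the ldtmu, ldvary,
 * wrtmuc and thrsw signals are treated as side effects, since they pop or
 * advance per-thread FIFO and thread state even when their result goes
 * unused.
 */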
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_SETREVF:
                case V3D_QPU_A_SETMSF:
                case V3D_QPU_A_VPMSETUP:
                case V3D_QPU_A_STVPMV:
                case V3D_QPU_A_STVPMD:
                case V3D_QPU_A_STVPMP:
                case V3D_QPU_A_VPMWT:
                case V3D_QPU_A_TMUWT:
                        return true;
                default:
                        break;
                }

                switch (inst->qpu.alu.mul.op) {
                case V3D_QPU_M_MULTOP:
                        return true;
                default:
                        break;
                }
        }

        if (inst->qpu.sig.ldtmu ||
            inst->qpu.sig.ldvary ||
            inst->qpu.sig.wrtmuc ||
            inst->qpu.sig.thrsw) {
                return true;
        }

        return false;
}

bool
vir_is_raw_mov(struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
                return false;
        }

        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
                return false;
        }

        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
            inst->qpu.flags.mc != V3D_QPU_COND_NONE)
                return false;

        return true;
}

bool
vir_is_add(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.add.op != V3D_QPU_A_NOP);
}

bool
vir_is_mul(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
}

bool
vir_is_tex(struct qinst *inst)
{
        if (inst->dst.file == QFILE_MAGIC)
                return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);

        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                return true;
        }

        return false;
}

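/* vir_writes_r3()/vir_writes_r4() report implicit clobbers of the r3/r4
 * accumulators: instructions with a VPM source (and, pre-4.1, the
 * ldvary/ldtlb/ldtlbu/ldvpm signals) count as writing r3, while the SFU
 * magic destinations (and pre-4.1 ldtmu) count as writing r4.
 */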
bool
vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                switch (inst->src[i].file) {
                case QFILE_VPM:
                        return true;
                default:
                        break;
                }
        }

        if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
                                  inst->qpu.sig.ldtlb ||
                                  inst->qpu.sig.ldtlbu ||
                                  inst->qpu.sig.ldvpm)) {
                return true;
        }

        return false;
}

bool
vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        switch (inst->dst.file) {
        case QFILE_MAGIC:
                switch (inst->dst.index) {
                case V3D_QPU_WADDR_RECIP:
                case V3D_QPU_WADDR_RSQRT:
                case V3D_QPU_WADDR_EXP:
                case V3D_QPU_WADDR_LOG:
                case V3D_QPU_WADDR_SIN:
                        return true;
                }
                break;
        default:
                break;
        }

        if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
                return true;

        return false;
}

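/* Sets the input unpack (e.g. absolute value, or selecting an f16 half) on
 * one of the two source operands of the instruction's add or mul ALU op.
 */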
void
vir_set_unpack(struct qinst *inst, int src,
               enum v3d_qpu_input_unpack unpack)
{
        assert(src == 0 || src == 1);

        if (vir_is_add(inst)) {
                if (src == 0)
                        inst->qpu.alu.add.a_unpack = unpack;
                else
                        inst->qpu.alu.add.b_unpack = unpack;
        } else {
                assert(vir_is_mul(inst));
                if (src == 0)
                        inst->qpu.alu.mul.a_unpack = unpack;
                else
                        inst->qpu.alu.mul.b_unpack = unpack;
        }
}

void
vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.ac = cond;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mc = cond;
        }
}

void
vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.apf = pf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mpf = pf;
        }
}

void
vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.auf = uf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.muf = uf;
        }
}

#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
        if (vir_is_mul(inst)) {
                switch (inst->dst.pack) {
                case QPU_PACK_MUL_NOP:
                case QPU_PACK_MUL_8888:
                        return 0xf;
                case QPU_PACK_MUL_8A:
                        return 0x1;
                case QPU_PACK_MUL_8B:
                        return 0x2;
                case QPU_PACK_MUL_8C:
                        return 0x4;
                case QPU_PACK_MUL_8D:
                        return 0x8;
                }
        } else {
                switch (inst->dst.pack) {
                case QPU_PACK_A_NOP:
                case QPU_PACK_A_8888:
                case QPU_PACK_A_8888_SAT:
                case QPU_PACK_A_32_SAT:
                        return 0xf;
                case QPU_PACK_A_8A:
                case QPU_PACK_A_8A_SAT:
                        return 0x1;
                case QPU_PACK_A_8B:
                case QPU_PACK_A_8B_SAT:
                        return 0x2;
                case QPU_PACK_A_8C:
                case QPU_PACK_A_8C_SAT:
                        return 0x4;
                case QPU_PACK_A_8D:
                case QPU_PACK_A_8D_SAT:
                        return 0x8;
                case QPU_PACK_A_16A:
                case QPU_PACK_A_16A_SAT:
                        return 0x3;
                case QPU_PACK_A_16B:
                case QPU_PACK_A_16B_SAT:
                        return 0xc;
                }
        }
        unreachable("Bad pack field");
}
#endif

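/* Allocates a fresh temporary register, growing the defs[] and spillable
 * bookkeeping arrays as needed.  New temps start out marked as spillable.
 */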
struct qreg
vir_get_temp(struct v3d_compile *c)
{
        struct qreg reg;

        reg.file = QFILE_TEMP;
        reg.index = c->num_temps++;

        if (c->num_temps > c->defs_array_size) {
                uint32_t old_size = c->defs_array_size;
                c->defs_array_size = MAX2(old_size * 2, 16);

                c->defs = reralloc(c, c->defs, struct qinst *,
                                   c->defs_array_size);
                memset(&c->defs[old_size], 0,
                       sizeof(c->defs[0]) * (c->defs_array_size - old_size));

                c->spillable = reralloc(c, c->spillable,
                                        BITSET_WORD,
                                        BITSET_WORDS(c->defs_array_size));
                for (int i = old_size; i < c->defs_array_size; i++)
                        BITSET_SET(c->spillable, i);
        }

        return reg;
}

struct qinst *
vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.add.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

struct qinst *
vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.mul.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

struct qinst *
vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
        inst->qpu.branch.cond = cond;
        inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
        inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
        inst->qpu.branch.ub = true;
        inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;

        inst->dst = vir_reg(QFILE_NULL, 0);
        inst->src[0] = src;
        inst->uniform = ~0;

        return inst;
}

static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
        switch (c->cursor.mode) {
        case vir_cursor_add:
                list_add(&inst->link, c->cursor.link);
                break;
        case vir_cursor_addtail:
                list_addtail(&inst->link, c->cursor.link);
                break;
        }

        c->cursor = vir_after_inst(inst);
        c->live_intervals_valid = false;
}

/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
        assert(inst->dst.file == QFILE_NULL);

        /* If we're emitting an instruction that's a def, it had better be
         * writing a register.
         */
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
        }

        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = inst;

        vir_emit(c, inst);

        return inst->dst;
}

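/* Emits an instruction whose destination was chosen by the caller.  If it
 * writes a temp, that temp no longer has a unique defining instruction, so
 * its defs[] entry is dropped.
 */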
struct qinst *
vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = NULL;

        vir_emit(c, inst);

        return inst;
}

struct qblock *
vir_new_block(struct v3d_compile *c)
{
        struct qblock *block = rzalloc(c, struct qblock);

        list_inithead(&block->instructions);

        block->predecessors = _mesa_set_create(block,
                                               _mesa_hash_pointer,
                                               _mesa_key_pointer_equal);

        block->index = c->next_block_index++;

        return block;
}

void
vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
{
        c->cur_block = block;
        c->cursor = vir_after_block(block);
        list_addtail(&block->link, &c->blocks);
}

struct qblock *
vir_entry_block(struct v3d_compile *c)
{
        return list_first_entry(&c->blocks, struct qblock, link);
}

struct qblock *
vir_exit_block(struct v3d_compile *c)
{
        return list_last_entry(&c->blocks, struct qblock, link);
}

void
vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
{
        _mesa_set_add(successor->predecessors, predecessor);
        if (predecessor->successors[0]) {
                assert(!predecessor->successors[1]);
                predecessor->successors[1] = successor;
        } else {
                predecessor->successors[0] = successor;
        }
}

const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo)
{
        struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
        if (!compiler)
                return NULL;

        compiler->devinfo = devinfo;

        if (!vir_init_reg_sets(compiler)) {
                ralloc_free(compiler);
                return NULL;
        }

        return compiler;
}

void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
        ralloc_free((void *)compiler);
}

static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
                 struct v3d_key *key,
                 nir_shader *s,
                 void (*debug_output)(const char *msg,
                                      void *debug_output_data),
                 void *debug_output_data,
                 int program_id, int variant_id)
{
        struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);

        c->compiler = compiler;
        c->devinfo = compiler->devinfo;
        c->key = key;
        c->program_id = program_id;
        c->variant_id = variant_id;
        c->threads = 4;
        c->debug_output = debug_output;
        c->debug_output_data = debug_output_data;

        s = nir_shader_clone(c, s);
        c->s = s;

        list_inithead(&c->blocks);
        vir_set_emit_block(c, vir_new_block(c));

        c->output_position_index = -1;
        c->output_point_size_index = -1;
        c->output_sample_mask_index = -1;

        c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);

        return c;
}

static int
type_size_vec4(const struct glsl_type *type)
{
        return glsl_count_attribute_slots(type, false);
}

static void
v3d_lower_nir(struct v3d_compile *c)
{
        struct nir_lower_tex_options tex_options = {
                .lower_txd = true,
                .lower_tg4_broadcom_swizzle = true,

                .lower_rect = false, /* XXX: Use this on V3D 3.x */
                .lower_txp = ~0,
                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
        };

        /* Lower the format swizzle and (for 32-bit returns)
         * ARB_texture_swizzle-style swizzle.
         */
        for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) {
                for (int j = 0; j < 4; j++)
                        tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];

                if (c->key->tex[i].clamp_s)
                        tex_options.saturate_s |= 1 << i;
                if (c->key->tex[i].clamp_t)
                        tex_options.saturate_t |= 1 << i;
                if (c->key->tex[i].clamp_r)
                        tex_options.saturate_r |= 1 << i;
                if (c->key->tex[i].return_size == 16) {
                        tex_options.lower_tex_packing[i] =
                                nir_lower_tex_packing_16;
                }
        }

        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
        NIR_PASS_V(c->s, nir_lower_system_values);
}

static void
v3d_set_prog_data_uniforms(struct v3d_compile *c,
                           struct v3d_prog_data *prog_data)
{
        int count = c->num_uniforms;
        struct v3d_uniform_list *ulist = &prog_data->uniforms;

        ulist->count = count;
        ulist->data = ralloc_array(prog_data, uint32_t, count);
        memcpy(ulist->data, c->uniform_data,
               count * sizeof(*ulist->data));
        ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
        memcpy(ulist->contents, c->uniform_contents,
               count * sizeof(*ulist->contents));
}

/* Copy the compiler UBO range state to the compiled shader, dropping out
 * arrays that were never referenced by an indirect load.
 *
 * (Note that QIR dead code elimination of an array access still leaves that
 * array alive, though)
 */
static void
v3d_set_prog_data_ubo(struct v3d_compile *c,
                      struct v3d_prog_data *prog_data)
{
        if (!c->num_ubo_ranges)
                return;

        prog_data->num_ubo_ranges = 0;
        prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range,
                                             c->num_ubo_ranges);
        for (int i = 0; i < c->num_ubo_ranges; i++) {
                if (!c->ubo_range_used[i])
                        continue;

                struct v3d_ubo_range *range = &c->ubo_ranges[i];
                prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range;
                prog_data->ubo_size += range->size;
        }

        if (prog_data->ubo_size) {
                if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
                        fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
                                vir_get_stage_name(c),
                                c->program_id, c->variant_id,
                                prog_data->ubo_size / 4);
                }
        }
}

static void
v3d_vs_set_prog_data(struct v3d_compile *c,
                     struct v3d_vs_prog_data *prog_data)
{
        prog_data->base.num_inputs = c->num_inputs;

        /* The vertex data gets format converted by the VPM so that
         * each attribute channel takes up a VPM column.  Precompute
         * the sizes for the shader record.
         */
        for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
                prog_data->vattr_sizes[i] = c->vattr_sizes[i];
                prog_data->vpm_input_size += c->vattr_sizes[i];
        }

        prog_data->uses_vid = (c->s->info.system_values_read &
                               (1ull << SYSTEM_VALUE_VERTEX_ID));
        prog_data->uses_iid = (c->s->info.system_values_read &
                               (1ull << SYSTEM_VALUE_INSTANCE_ID));

        if (prog_data->uses_vid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

        /* Input/output segment sizes are in sectors (8 rows of 32 bits per
         * channel).
         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;

        /* Set us up for shared input/output segments.  This is apparently
         * necessary for our VCM setup to avoid varying corruption.
         */
        prog_data->separate_segments = false;
        prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
                                          prog_data->vpm_input_size);
        prog_data->vpm_input_size = 0;

        /* Compute VCM cache size.  We set up our program to take up less than
         * half of the VPM, so that any set of bin and render programs won't
         * run out of space.  We need space for at least one input segment,
         * and then allocate the rest to output segments (one for the current
         * program, the rest to VCM).  The valid range of the VCM cache size
         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
         * batches.
         */
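        /* Rough worked example (numbers are illustrative, not a statement of
         * actual hardware configuration): with an 8 KB VPM, sector_size is
         * 16 * 4 * 8 = 512 bytes, so vpm_size_in_sectors = 16 and half_vpm =
         * 8.  With shared segments vpm_input_size is 0, so a one-sector
         * output segment gives 8 output batches and a VCM cache size of
         * CLAMP(8 - 1, 2, 4) = 4.
         */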
        assert(c->devinfo->vpm_size);
        int sector_size = 16 * sizeof(uint32_t) * 8;
        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
        int half_vpm = vpm_size_in_sectors / 2;
        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
        assert(vpm_output_batches >= 2);
        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
}

static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                            struct v3d_fs_prog_data *prog_data)
{
        prog_data->base.num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
                      (V3D_MAX_FS_INPUTS - 1) / 24);
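        /* Each flags word covers 24 inputs: input i sets bit (i % 24) of
         * word (i / 24), so e.g. input 30 would set bit 6 of word 1.
         */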
        for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
                if (BITSET_TEST(c->flat_shade_flags, i))
                        prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->noperspective_flags, i))
                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->centroid_flags, i))
                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
        }
}

static void
v3d_fs_set_prog_data(struct v3d_compile *c,
                     struct v3d_fs_prog_data *prog_data)
{
        v3d_set_fs_prog_data_inputs(c, prog_data);
        prog_data->writes_z = (c->s->info.outputs_written &
                               (1 << FRAG_RESULT_DEPTH));
        prog_data->discard = (c->s->info.fs.uses_discard ||
                              c->fs_key->sample_alpha_to_coverage);
        prog_data->uses_center_w = c->uses_center_w;

        /* If the shader has some side effects and hasn't allowed early
         * fragment tests, disable them.
         */
        if (!c->s->info.fs.early_fragment_tests &&
            (c->s->info.num_images ||
             c->s->info.num_ssbos ||
             c->s->info.num_abos)) {
                prog_data->discard = true;
        }
}

static void
v3d_set_prog_data(struct v3d_compile *c,
                  struct v3d_prog_data *prog_data)
{
        prog_data->threads = c->threads;
        prog_data->single_seg = !c->last_thrsw;
        prog_data->spill_size = c->spill_size;

        v3d_set_prog_data_uniforms(c, prog_data);
        v3d_set_prog_data_ubo(c, prog_data);

        if (c->s->info.stage == MESA_SHADER_VERTEX) {
                v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
        } else {
                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
                v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
        }
}

static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
        *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);

        uint64_t *qpu_insts = malloc(*final_assembly_size);
        if (!qpu_insts)
                return NULL;

        memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);

        vir_compile_destroy(c);

        return qpu_insts;
}

static void
v3d_nir_lower_vs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
                int slot = v3d_slot_get_slot(c->vs_key->fs_inputs[i]);
                int comp = v3d_slot_get_component(c->vs_key->fs_inputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   &c->s->outputs, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
}

static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
        nir_foreach_variable(var, &c->s->outputs) {
                uint32_t mask = 0;

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        mask = ~0;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
                        break;
                }

                if (c->fs_key->int_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_INT,
                                                 glsl_get_components(var->type));
                } else if (c->fs_key->uint_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_UINT,
                                                 glsl_get_components(var->type));
                }
        }
}

static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
                v3d_fixup_fs_output_types(c);
}

static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
        if (c->vs_key->clamp_color)
                NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);

        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
                           false);
                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                           nir_var_shader_out);
        }

        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
        if (c->fs_key->light_twoside)
                NIR_PASS_V(c->s, nir_lower_two_sided_color);

        if (c->fs_key->clamp_color)
                NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);

        if (c->fs_key->alpha_test) {
                NIR_PASS_V(c->s, nir_lower_alpha_test,
                           c->fs_key->alpha_test_func,
                           false);
        }

        if (c->key->ucp_enables)
                NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);

        /* Note: FS input scalarizing must happen after
         * nir_lower_two_sided_color, which only handles a vec4 at a time.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}

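/* Main compiler entry point: clones the NIR shader, runs the stage-specific
 * and common NIR lowering, translates to VIR and then to QPU instructions,
 * fills out prog_data, and returns a malloc'd buffer of QPU instructions
 * (the v3d_compile context is destroyed on the way out).
 */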
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
                      struct v3d_prog_data **out_prog_data,
                      nir_shader *s,
                      void (*debug_output)(const char *msg,
                                           void *debug_output_data),
                      void *debug_output_data,
                      int program_id, int variant_id,
                      uint32_t *final_assembly_size)
{
        struct v3d_prog_data *prog_data;
        struct v3d_compile *c = vir_compile_init(compiler, key, s,
                                                 debug_output, debug_output_data,
                                                 program_id, variant_id);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                c->vs_key = (struct v3d_vs_key *)key;
                prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
                break;
        case MESA_SHADER_FRAGMENT:
                c->fs_key = (struct v3d_fs_key *)key;
                prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
                break;
        default:
                unreachable("unsupported shader stage");
        }

        if (c->s->info.stage == MESA_SHADER_VERTEX) {
                v3d_nir_lower_vs_early(c);
        } else {
                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
                v3d_nir_lower_fs_early(c);
        }

        v3d_lower_nir(c);

        if (c->s->info.stage == MESA_SHADER_VERTEX) {
                v3d_nir_lower_vs_late(c);
        } else {
                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
                v3d_nir_lower_fs_late(c);
        }

        NIR_PASS_V(c->s, v3d_nir_lower_io, c);
        NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
        NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
        NIR_PASS_V(c->s, nir_lower_idiv);

        v3d_optimize_nir(c->s);
        NIR_PASS_V(c->s, nir_lower_bool_to_int32);
        NIR_PASS_V(c->s, nir_convert_from_ssa, true);

        v3d_nir_to_vir(c);

        v3d_set_prog_data(c, prog_data);

        *out_prog_data = prog_data;

        char *shaderdb;
        int ret = asprintf(&shaderdb,
                           "%s shader: %d inst, %d threads, %d loops, "
                           "%d uniforms, %d:%d spills:fills",
                           vir_get_stage_name(c),
                           c->qpu_inst_count,
                           c->threads,
                           c->loops,
                           c->num_uniforms,
                           c->spills,
                           c->fills);
        if (ret >= 0) {
                c->debug_output(shaderdb, c->debug_output_data);
                free(shaderdb);
        }

        return v3d_return_qpu_insts(c, final_assembly_size);
}

void
vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
{
        if (qinst->dst.file == QFILE_TEMP)
                c->defs[qinst->dst.index] = NULL;

        assert(&qinst->link != c->cursor.link);

        list_del(&qinst->link);
        free(qinst);

        c->live_intervals_valid = false;
}

struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
        /* XXX
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}

void
vir_compile_destroy(struct v3d_compile *c)
{
        /* Defuse the assert that we aren't removing the cursor's instruction.
         */
        c->cursor.link = NULL;

        vir_for_each_block(block, c) {
                while (!list_empty(&block->instructions)) {
                        struct qinst *qinst =
                                list_first_entry(&block->instructions,
                                                 struct qinst, link);
                        vir_remove_instruction(c, qinst);
                }
        }

        ralloc_free(c);
}

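/* Returns a QFILE_UNIF register for the given uniform stream entry, reusing
 * an existing slot when an identical (contents, data) pair has already been
 * requested; otherwise a new slot is appended, growing the arrays as needed.
 */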
struct qreg
vir_uniform(struct v3d_compile *c,
            enum quniform_contents contents,
            uint32_t data)
{
        for (int i = 0; i < c->num_uniforms; i++) {
                if (c->uniform_contents[i] == contents &&
                    c->uniform_data[i] == data) {
                        return vir_reg(QFILE_UNIF, i);
                }
        }

        uint32_t uniform = c->num_uniforms++;

        if (uniform >= c->uniform_array_size) {
                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
                                             c->uniform_array_size * 2);

                c->uniform_data = reralloc(c, c->uniform_data,
                                           uint32_t,
                                           c->uniform_array_size);
                c->uniform_contents = reralloc(c, c->uniform_contents,
                                               enum quniform_contents,
                                               c->uniform_array_size);
        }

        c->uniform_contents[uniform] = contents;
        c->uniform_data[uniform] = data;

        return vir_reg(QFILE_UNIF, uniform);
}

static bool
vir_can_set_flags(struct v3d_compile *c, struct qinst *inst)
{
        if (c->devinfo->ver >= 40 && (v3d_qpu_reads_vpm(&inst->qpu) ||
                                      v3d_qpu_uses_sfu(&inst->qpu))) {
                return false;
        }

        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            (inst->qpu.alu.add.op == V3D_QPU_A_NOP &&
             inst->qpu.alu.mul.op == V3D_QPU_M_NOP)) {
                return false;
        }

        return true;
}

void
vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
{
        struct qinst *last_inst = NULL;

        if (!list_empty(&c->cur_block->instructions)) {
                last_inst = (struct qinst *)c->cur_block->instructions.prev;

                /* Can't stuff the PF into the last inst if our cursor
                 * isn't pointing after it.
                 */
                struct vir_cursor after_inst = vir_after_inst(last_inst);
                if (c->cursor.mode != after_inst.mode ||
                    c->cursor.link != after_inst.link)
                        last_inst = NULL;
        }

        if (src.file != QFILE_TEMP ||
            !c->defs[src.index] ||
            last_inst != c->defs[src.index] ||
            !vir_can_set_flags(c, last_inst)) {
                /* XXX: Make the MOV be the appropriate type */
                last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
        }

        vir_set_pf(last_inst, pf);
}

#define OPTPASS(func)                                                   \
        do {                                                            \
                bool stage_progress = func(c);                          \
                if (stage_progress) {                                   \
                        progress = true;                                \
                        if (print_opt_debug) {                          \
                                fprintf(stderr,                         \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                   \
                        }                                               \
                        /*XXX vir_validate(c);*/                        \
                }                                                       \
        } while (0)

void
vir_optimize(struct v3d_compile *c)
{
        bool print_opt_debug = false;
        int pass = 1;

        while (true) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);

                if (!progress)
                        break;

                pass++;
        }
}

const char *
vir_get_stage_name(struct v3d_compile *c)
{
        if (c->vs_key && c->vs_key->is_coord)
                return "MESA_SHADER_COORD";
        else
                return gl_shader_stage_name(c->s->info.stage);
}