src/broadcom/compiler/vir.c

   1 /*
   2  * Copyright © 2016-2017 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "broadcom/common/v3d_device_info.h"
  25 #include "v3d_compiler.h"
  26
  27 int
  28 vir_get_nsrc(struct qinst *inst)
  29 {
  30         switch (inst->qpu.type) {
  31         case V3D_QPU_INSTR_TYPE_BRANCH:
  32                 return 0;
  33         case V3D_QPU_INSTR_TYPE_ALU:
  34                 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
  35                         return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
  36                 else
  37                         return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
  38         }
  39
  40         return 0;
  41 }
  42
  43 /**
  44  * Returns whether the instruction has any side effects that must be
  45  * preserved.
  46  */
  47 bool
  48 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
  49 {
  50         switch (inst->qpu.type) {
  51         case V3D_QPU_INSTR_TYPE_BRANCH:
  52                 return true;
  53         case V3D_QPU_INSTR_TYPE_ALU:
  54                 switch (inst->qpu.alu.add.op) {
  55                 case V3D_QPU_A_SETREVF:
  56                 case V3D_QPU_A_SETMSF:
  57                 case V3D_QPU_A_VPMSETUP:
  58                 case V3D_QPU_A_STVPMV:
  59                 case V3D_QPU_A_STVPMD:
  60                 case V3D_QPU_A_STVPMP:
  61                 case V3D_QPU_A_VPMWT:
  62                 case V3D_QPU_A_TMUWT:
  63                         return true;
  64                 default:
  65                         break;
  66                 }
  67
  68                 switch (inst->qpu.alu.mul.op) {
  69                 case V3D_QPU_M_MULTOP:
  70                         return true;
  71                 default:
  72                         break;
  73                 }
  74         }
  75
  76         if (inst->qpu.sig.ldtmu ||
  77             inst->qpu.sig.ldvary ||
  78             inst->qpu.sig.ldtlbu ||
  79             inst->qpu.sig.ldtlb ||
  80             inst->qpu.sig.wrtmuc ||
  81             inst->qpu.sig.thrsw) {
  82                 return true;
  83         }
  84
  85         return false;
  86 }
  87
  88 bool
  89 vir_is_raw_mov(struct qinst *inst)
  90 {
  91         if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
  92             (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
  93              inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
  94                 return false;
  95         }
  96
  97         if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
  98             inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
  99                 return false;
 100         }
 101
 102         if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
 103             inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
 104             inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
 105             inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
 106                 return false;
 107         }
 108
 109         if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
 110             inst->qpu.flags.mc != V3D_QPU_COND_NONE)
 111                 return false;
 112
 113         return true;
 114 }
 115
 116 bool
 117 vir_is_add(struct qinst *inst)
 118 {
 119         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
 120                 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
 121 }
 122
 123 bool
 124 vir_is_mul(struct qinst *inst)
 125 {
 126         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
 127                 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
 128 }
 129
 130 bool
 131 vir_is_tex(struct qinst *inst)
 132 {
 133         if (inst->dst.file == QFILE_MAGIC)
 134                 return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);
 135
 136         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
 137             inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
 138                 return true;
 139         }
 140
 141         return false;
 142 }
 143
 144 bool
 145 vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
 146 {
 147         for (int i = 0; i < vir_get_nsrc(inst); i++) {
 148                 switch (inst->src[i].file) {
 149                 case QFILE_VPM:
 150                         return true;
 151                 default:
 152                         break;
 153                 }
 154         }
 155
 156         if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
 157                                   inst->qpu.sig.ldtlb ||
 158                                   inst->qpu.sig.ldtlbu ||
 159                                   inst->qpu.sig.ldvpm)) {
 160                 return true;
 161         }
 162
 163         return false;
 164 }
 165
 166 bool
 167 vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
 168 {
 169         switch (inst->dst.file) {
 170         case QFILE_MAGIC:
 171                 switch (inst->dst.index) {
 172                 case V3D_QPU_WADDR_RECIP:
 173                 case V3D_QPU_WADDR_RSQRT:
 174                 case V3D_QPU_WADDR_EXP:
 175                 case V3D_QPU_WADDR_LOG:
 176                 case V3D_QPU_WADDR_SIN:
 177                         return true;
 178                 }
 179                 break;
 180         default:
 181                 break;
 182         }
 183
 184         if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
 185                 return true;
 186
 187         return false;
 188 }
 189
 190 void
 191 vir_set_unpack(struct qinst *inst, int src,
 192                enum v3d_qpu_input_unpack unpack)
 193 {
 194         assert(src == 0 || src == 1);
 195
 196         if (vir_is_add(inst)) {
 197                 if (src == 0)
 198                         inst->qpu.alu.add.a_unpack = unpack;
 199                 else
 200                         inst->qpu.alu.add.b_unpack = unpack;
 201         } else {
 202                 assert(vir_is_mul(inst));
 203                 if (src == 0)
 204                         inst->qpu.alu.mul.a_unpack = unpack;
 205                 else
 206                         inst->qpu.alu.mul.b_unpack = unpack;
 207         }
 208 }
 209
 210 void
 211 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
 212 {
 213         if (vir_is_add(inst)) {
 214                 inst->qpu.flags.ac = cond;
 215         } else {
 216                 assert(vir_is_mul(inst));
 217                 inst->qpu.flags.mc = cond;
 218         }
 219 }
 220
 221 void
 222 vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf)
 223 {
 224         if (vir_is_add(inst)) {
 225                 inst->qpu.flags.apf = pf;
 226         } else {
 227                 assert(vir_is_mul(inst));
 228                 inst->qpu.flags.mpf = pf;
 229         }
 230 }
 231
 232 void
 233 vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf)
 234 {
 235         if (vir_is_add(inst)) {
 236                 inst->qpu.flags.auf = uf;
 237         } else {
 238                 assert(vir_is_mul(inst));
 239                 inst->qpu.flags.muf = uf;
 240         }
 241 }
 242
/* NOTE(review): everything between this "#if 0" and the matching "#endif" is
 * compiled out.  The disabled function maps the destination pack mode of an
 * instruction to a 4-bit mask (one bit per 8-bit channel; the 16-bit packs
 * set two bits).  It refers to QPU_PACK_* names and a dst.pack field that the
 * rest of this file does not use -- presumably left over from an older
 * backend; confirm before re-enabling or deleting.
 */
#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
        if (vir_is_mul(inst)) {
                switch (inst->dst.pack) {
                case QPU_PACK_MUL_NOP:
                case QPU_PACK_MUL_8888:
                        return 0xf;
                case QPU_PACK_MUL_8A:
                        return 0x1;
                case QPU_PACK_MUL_8B:
                        return 0x2;
                case QPU_PACK_MUL_8C:
                        return 0x4;
                case QPU_PACK_MUL_8D:
                        return 0x8;
                }
        } else {
                switch (inst->dst.pack) {
                case QPU_PACK_A_NOP:
                case QPU_PACK_A_8888:
                case QPU_PACK_A_8888_SAT:
                case QPU_PACK_A_32_SAT:
                        return 0xf;
                case QPU_PACK_A_8A:
                case QPU_PACK_A_8A_SAT:
                        return 0x1;
                case QPU_PACK_A_8B:
                case QPU_PACK_A_8B_SAT:
                        return 0x2;
                case QPU_PACK_A_8C:
                case QPU_PACK_A_8C_SAT:
                        return 0x4;
                case QPU_PACK_A_8D:
                case QPU_PACK_A_8D_SAT:
                        return 0x8;
                case QPU_PACK_A_16A:
                case QPU_PACK_A_16A_SAT:
                        return 0x3;
                case QPU_PACK_A_16B:
                case QPU_PACK_A_16B_SAT:
                        return 0xc;
                }
        }
        unreachable("Bad pack field");
}
#endif
 291
 292 struct qreg
 293 vir_get_temp(struct v3d_compile *c)
 294 {
 295         struct qreg reg;
 296
 297         reg.file = QFILE_TEMP;
 298         reg.index = c->num_temps++;
 299
 300         if (c->num_temps > c->defs_array_size) {
 301                 uint32_t old_size = c->defs_array_size;
 302                 c->defs_array_size = MAX2(old_size * 2, 16);
 303
 304                 c->defs = reralloc(c, c->defs, struct qinst *,
 305                                    c->defs_array_size);
 306                 memset(&c->defs[old_size], 0,
 307                        sizeof(c->defs[0]) * (c->defs_array_size - old_size));
 308
 309                 c->spillable = reralloc(c, c->spillable,
 310                                         BITSET_WORD,
 311                                         BITSET_WORDS(c->defs_array_size));
 312                 for (int i = old_size; i < c->defs_array_size; i++)
 313                         BITSET_SET(c->spillable, i);
 314         }
 315
 316         return reg;
 317 }
 318
 319 struct qinst *
 320 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
 321 {
 322         struct qinst *inst = calloc(1, sizeof(*inst));
 323
 324         inst->qpu = v3d_qpu_nop();
 325         inst->qpu.alu.add.op = op;
 326
 327         inst->dst = dst;
 328         inst->src[0] = src0;
 329         inst->src[1] = src1;
 330         inst->uniform = ~0;
 331
 332         return inst;
 333 }
 334
 335 struct qinst *
 336 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
 337 {
 338         struct qinst *inst = calloc(1, sizeof(*inst));
 339
 340         inst->qpu = v3d_qpu_nop();
 341         inst->qpu.alu.mul.op = op;
 342
 343         inst->dst = dst;
 344         inst->src[0] = src0;
 345         inst->src[1] = src1;
 346         inst->uniform = ~0;
 347
 348         return inst;
 349 }
 350
 351 struct qinst *
 352 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
 353 {
 354         struct qinst *inst = calloc(1, sizeof(*inst));
 355
 356         inst->qpu = v3d_qpu_nop();
 357         inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
 358         inst->qpu.branch.cond = cond;
 359         inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
 360         inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
 361         inst->qpu.branch.ub = true;
 362         inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
 363
 364         inst->dst = vir_nop_reg();
 365         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
 366
 367         return inst;
 368 }
 369
 370 static void
 371 vir_emit(struct v3d_compile *c, struct qinst *inst)
 372 {
 373         switch (c->cursor.mode) {
 374         case vir_cursor_add:
 375                 list_add(&inst->link, c->cursor.link);
 376                 break;
 377         case vir_cursor_addtail:
 378                 list_addtail(&inst->link, c->cursor.link);
 379                 break;
 380         }
 381
 382         c->cursor = vir_after_inst(inst);
 383         c->live_intervals_valid = false;
 384 }
 385
 386 /* Updates inst to write to a new temporary, emits it, and notes the def. */
 387 struct qreg
 388 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
 389 {
 390         assert(inst->dst.file == QFILE_NULL);
 391
 392         /* If we're emitting an instruction that's a def, it had better be
 393          * writing a register.
 394          */
 395         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
 396                 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
 397                        v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
 398                 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
 399                        v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
 400         }
 401
 402         inst->dst = vir_get_temp(c);
 403
 404         if (inst->dst.file == QFILE_TEMP)
 405                 c->defs[inst->dst.index] = inst;
 406
 407         vir_emit(c, inst);
 408
 409         return inst->dst;
 410 }
 411
 412 struct qinst *
 413 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
 414 {
 415         if (inst->dst.file == QFILE_TEMP)
 416                 c->defs[inst->dst.index] = NULL;
 417
 418         vir_emit(c, inst);
 419
 420         return inst;
 421 }
 422
 423 struct qblock *
 424 vir_new_block(struct v3d_compile *c)
 425 {
 426         struct qblock *block = rzalloc(c, struct qblock);
 427
 428         list_inithead(&block->instructions);
 429
 430         block->predecessors = _mesa_set_create(block,
 431                                                _mesa_hash_pointer,
 432                                                _mesa_key_pointer_equal);
 433
 434         block->index = c->next_block_index++;
 435
 436         return block;
 437 }
 438
 439 void
 440 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
 441 {
 442         c->cur_block = block;
 443         c->cursor = vir_after_block(block);
 444         list_addtail(&block->link, &c->blocks);
 445 }
 446
 447 struct qblock *
 448 vir_entry_block(struct v3d_compile *c)
 449 {
 450         return list_first_entry(&c->blocks, struct qblock, link);
 451 }
 452
 453 struct qblock *
 454 vir_exit_block(struct v3d_compile *c)
 455 {
 456         return list_last_entry(&c->blocks, struct qblock, link);
 457 }
 458
 459 void
 460 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
 461 {
 462         _mesa_set_add(successor->predecessors, predecessor);
 463         if (predecessor->successors[0]) {
 464                 assert(!predecessor->successors[1]);
 465                 predecessor->successors[1] = successor;
 466         } else {
 467                 predecessor->successors[0] = successor;
 468         }
 469 }
 470
 471 const struct v3d_compiler *
 472 v3d_compiler_init(const struct v3d_device_info *devinfo)
 473 {
 474         struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
 475         if (!compiler)
 476                 return NULL;
 477
 478         compiler->devinfo = devinfo;
 479
 480         if (!vir_init_reg_sets(compiler)) {
 481                 ralloc_free(compiler);
 482                 return NULL;
 483         }
 484
 485         return compiler;
 486 }
 487
 488 void
 489 v3d_compiler_free(const struct v3d_compiler *compiler)
 490 {
 491         ralloc_free((void *)compiler);
 492 }
 493
 494 static struct v3d_compile *
 495 vir_compile_init(const struct v3d_compiler *compiler,
 496                  struct v3d_key *key,
 497                  nir_shader *s,
 498                  void (*debug_output)(const char *msg,
 499                                       void *debug_output_data),
 500                  void *debug_output_data,
 501                  int program_id, int variant_id)
 502 {
 503         struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
 504
 505         c->compiler = compiler;
 506         c->devinfo = compiler->devinfo;
 507         c->key = key;
 508         c->program_id = program_id;
 509         c->variant_id = variant_id;
 510         c->threads = 4;
 511         c->debug_output = debug_output;
 512         c->debug_output_data = debug_output_data;
 513
 514         s = nir_shader_clone(c, s);
 515         c->s = s;
 516
 517         list_inithead(&c->blocks);
 518         vir_set_emit_block(c, vir_new_block(c));
 519
 520         c->output_position_index = -1;
 521         c->output_sample_mask_index = -1;
 522
 523         c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
 524                                             _mesa_key_pointer_equal);
 525
 526         return c;
 527 }
 528
 529 static int
 530 type_size_vec4(const struct glsl_type *type, bool bindless)
 531 {
 532         return glsl_count_attribute_slots(type, false);
 533 }
 534
 535 static void
 536 v3d_lower_nir(struct v3d_compile *c)
 537 {
 538         struct nir_lower_tex_options tex_options = {
 539                 .lower_txd = true,
 540                 .lower_tg4_broadcom_swizzle = true,
 541
 542                 .lower_rect = false, /* XXX: Use this on V3D 3.x */
 543                 .lower_txp = ~0,
 544                 /* Apply swizzles to all samplers. */
 545                 .swizzle_result = ~0,
 546         };
 547
 548         /* Lower the format swizzle and (for 32-bit returns)
 549          * ARB_texture_swizzle-style swizzle.
 550          */
 551         for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) {
 552                 for (int j = 0; j < 4; j++)
 553                         tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
 554
 555                 if (c->key->tex[i].clamp_s)
 556                         tex_options.saturate_s |= 1 << i;
 557                 if (c->key->tex[i].clamp_t)
 558                         tex_options.saturate_t |= 1 << i;
 559                 if (c->key->tex[i].clamp_r)
 560                         tex_options.saturate_r |= 1 << i;
 561                 if (c->key->tex[i].return_size == 16) {
 562                         tex_options.lower_tex_packing[i] =
 563                                 nir_lower_tex_packing_16;
 564                 }
 565         }
 566
 567         /* CS textures may not have return_size reflecting the shadow state. */
 568         nir_foreach_variable(var, &c->s->uniforms) {
 569                 const struct glsl_type *type = glsl_without_array(var->type);
 570                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
 571
 572                 if (!glsl_type_is_sampler(type) ||
 573                     !glsl_sampler_type_is_shadow(type))
 574                         continue;
 575
 576                 for (int i = 0; i < array_len; i++) {
 577                         tex_options.lower_tex_packing[var->data.binding + i] =
 578                                 nir_lower_tex_packing_16;
 579                 }
 580         }
 581
 582         NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
 583         NIR_PASS_V(c->s, nir_lower_system_values);
 584
 585         NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
 586                    nir_var_function_temp,
 587                    0,
 588                    glsl_get_natural_size_align_bytes);
 589         NIR_PASS_V(c->s, v3d_nir_lower_scratch);
 590 }
 591
 592 static void
 593 v3d_set_prog_data_uniforms(struct v3d_compile *c,
 594                            struct v3d_prog_data *prog_data)
 595 {
 596         int count = c->num_uniforms;
 597         struct v3d_uniform_list *ulist = &prog_data->uniforms;
 598
 599         ulist->count = count;
 600         ulist->data = ralloc_array(prog_data, uint32_t, count);
 601         memcpy(ulist->data, c->uniform_data,
 602                count * sizeof(*ulist->data));
 603         ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
 604         memcpy(ulist->contents, c->uniform_contents,
 605                count * sizeof(*ulist->contents));
 606 }
 607
 608 static void
 609 v3d_vs_set_prog_data(struct v3d_compile *c,
 610                      struct v3d_vs_prog_data *prog_data)
 611 {
 612         /* The vertex data gets format converted by the VPM so that
 613          * each attribute channel takes up a VPM column.  Precompute
 614          * the sizes for the shader record.
 615          */
 616         for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
 617                 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
 618                 prog_data->vpm_input_size += c->vattr_sizes[i];
 619         }
 620
 621         prog_data->uses_vid = (c->s->info.system_values_read &
 622                                (1ull << SYSTEM_VALUE_VERTEX_ID));
 623         prog_data->uses_iid = (c->s->info.system_values_read &
 624                                (1ull << SYSTEM_VALUE_INSTANCE_ID));
 625
 626         if (prog_data->uses_vid)
 627                 prog_data->vpm_input_size++;
 628         if (prog_data->uses_iid)
 629                 prog_data->vpm_input_size++;
 630
 631         /* Input/output segment size are in sectors (8 rows of 32 bits per
 632          * channel).
 633          */
 634         prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
 635         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
 636
 637         /* Set us up for shared input/output segments.  This is apparently
 638          * necessary for our VCM setup to avoid varying corruption.
 639          */
 640         prog_data->separate_segments = false;
 641         prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
 642                                           prog_data->vpm_input_size);
 643         prog_data->vpm_input_size = 0;
 644
 645         /* Compute VCM cache size.  We set up our program to take up less than
 646          * half of the VPM, so that any set of bin and render programs won't
 647          * run out of space.  We need space for at least one input segment,
 648          * and then allocate the rest to output segments (one for the current
 649          * program, the rest to VCM).  The valid range of the VCM cache size
 650          * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
 651          * batches.
 652          */
 653         assert(c->devinfo->vpm_size);
 654         int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
 655         int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
 656         int half_vpm = vpm_size_in_sectors / 2;
 657         int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
 658         int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
 659         assert(vpm_output_batches >= 2);
 660         prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
 661 }
 662
 663 static void
 664 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
 665                             struct v3d_fs_prog_data *prog_data)
 666 {
 667         prog_data->num_inputs = c->num_inputs;
 668         memcpy(prog_data->input_slots, c->input_slots,
 669                c->num_inputs * sizeof(*c->input_slots));
 670
 671         STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
 672                       (V3D_MAX_FS_INPUTS - 1) / 24);
 673         for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
 674                 if (BITSET_TEST(c->flat_shade_flags, i))
 675                         prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
 676
 677                 if (BITSET_TEST(c->noperspective_flags, i))
 678                         prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
 679
 680                 if (BITSET_TEST(c->centroid_flags, i))
 681                         prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
 682         }
 683 }
 684
 685 static void
 686 v3d_fs_set_prog_data(struct v3d_compile *c,
 687                      struct v3d_fs_prog_data *prog_data)
 688 {
 689         v3d_set_fs_prog_data_inputs(c, prog_data);
 690         prog_data->writes_z = c->writes_z;
 691         prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
 692         prog_data->uses_center_w = c->uses_center_w;
 693         prog_data->uses_implicit_point_line_varyings =
 694                 c->uses_implicit_point_line_varyings;
 695         prog_data->lock_scoreboard_on_first_thrsw =
 696                 c->lock_scoreboard_on_first_thrsw;
 697 }
 698
 699 static void
 700 v3d_cs_set_prog_data(struct v3d_compile *c,
 701                      struct v3d_compute_prog_data *prog_data)
 702 {
 703         prog_data->shared_size = c->s->info.cs.shared_size;
 704 }
 705
 706 static void
 707 v3d_set_prog_data(struct v3d_compile *c,
 708                   struct v3d_prog_data *prog_data)
 709 {
 710         prog_data->threads = c->threads;
 711         prog_data->single_seg = !c->last_thrsw;
 712         prog_data->spill_size = c->spill_size;
 713
 714         v3d_set_prog_data_uniforms(c, prog_data);
 715
 716         if (c->s->info.stage == MESA_SHADER_COMPUTE) {
 717                 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
 718         } else if (c->s->info.stage == MESA_SHADER_VERTEX) {
 719                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
 720         } else {
 721                 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
 722                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
 723         }
 724 }
 725
 726 static uint64_t *
 727 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
 728 {
 729         *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
 730
 731         uint64_t *qpu_insts = malloc(*final_assembly_size);
 732         if (!qpu_insts)
 733                 return NULL;
 734
 735         memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
 736
 737         vir_compile_destroy(c);
 738
 739         return qpu_insts;
 740 }
 741
 742 static void
 743 v3d_nir_lower_vs_early(struct v3d_compile *c)
 744 {
 745         /* Split our I/O vars and dead code eliminate the unused
 746          * components.
 747          */
 748         NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
 749                    nir_var_shader_in | nir_var_shader_out);
 750         uint64_t used_outputs[4] = {0};
 751         for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
 752                 int slot = v3d_slot_get_slot(c->vs_key->fs_inputs[i]);
 753                 int comp = v3d_slot_get_component(c->vs_key->fs_inputs[i]);
 754                 used_outputs[comp] |= 1ull << slot;
 755         }
 756         NIR_PASS_V(c->s, nir_remove_unused_io_vars,
 757                    &c->s->outputs, used_outputs, NULL); /* demotes to globals */
 758         NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
 759         v3d_optimize_nir(c->s);
 760         NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
 761
 762         /* This must go before nir_lower_io */
 763         if (c->vs_key->per_vertex_point_size)
 764                 NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
 765
 766         NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
 767                    type_size_vec4,
 768                    (nir_lower_io_options)0);
 769         /* clean up nir_lower_io's deref_var remains */
 770         NIR_PASS_V(c->s, nir_opt_dce);
 771 }
 772
 773 static void
 774 v3d_fixup_fs_output_types(struct v3d_compile *c)
 775 {
 776         nir_foreach_variable(var, &c->s->outputs) {
 777                 uint32_t mask = 0;
 778
 779                 switch (var->data.location) {
 780                 case FRAG_RESULT_COLOR:
 781                         mask = ~0;
 782                         break;
 783                 case FRAG_RESULT_DATA0:
 784                 case FRAG_RESULT_DATA1:
 785                 case FRAG_RESULT_DATA2:
 786                 case FRAG_RESULT_DATA3:
 787                         mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
 788                         break;
 789                 }
 790
 791                 if (c->fs_key->int_color_rb & mask) {
 792                         var->type =
 793                                 glsl_vector_type(GLSL_TYPE_INT,
 794                                                  glsl_get_components(var->type));
 795                 } else if (c->fs_key->uint_color_rb & mask) {
 796                         var->type =
 797                                 glsl_vector_type(GLSL_TYPE_UINT,
 798                                                  glsl_get_components(var->type));
 799                 }
 800         }
 801 }
 802
 803 static void
 804 v3d_nir_lower_fs_early(struct v3d_compile *c)
 805 {
 806         if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
 807                 v3d_fixup_fs_output_types(c);
 808
 809         NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
 810
 811         /* If the shader has no non-TLB side effects, we can promote it to
 812          * enabling early_fragment_tests even if the user didn't.
 813          */
 814         if (!(c->s->info.num_images ||
 815               c->s->info.num_ssbos ||
 816               c->s->info.num_abos)) {
 817                 c->s->info.fs.early_fragment_tests = true;
 818         }
 819 }
 820
 821 static void
 822 v3d_nir_lower_vs_late(struct v3d_compile *c)
 823 {
 824         if (c->vs_key->clamp_color)
 825                 NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
 826
 827         if (c->key->ucp_enables) {
 828                 NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
 829                            false);
 830                 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
 831                            nir_var_shader_out);
 832         }
 833
 834         /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
 835         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
 836 }
 837
 838 static void
 839 v3d_nir_lower_fs_late(struct v3d_compile *c)
 840 {
 841         if (c->fs_key->light_twoside)
 842                 NIR_PASS_V(c->s, nir_lower_two_sided_color);
 843
 844         if (c->fs_key->clamp_color)
 845                 NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
 846
 847         if (c->fs_key->alpha_test) {
 848                 NIR_PASS_V(c->s, nir_lower_alpha_test,
 849                            c->fs_key->alpha_test_func,
 850                            false);
 851         }
 852
 853         if (c->key->ucp_enables)
 854                 NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);
 855
 856         /* Note: FS input scalarizing must happen after
 857          * nir_lower_two_sided_color, which only handles a vec4 at a time.
 858          */
 859         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
 860 }
 861
 862 static uint32_t
 863 vir_get_max_temps(struct v3d_compile *c)
 864 {
 865         int max_ip = 0;
 866         vir_for_each_inst_inorder(inst, c)
 867                 max_ip++;
 868
 869         uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
 870
 871         for (int t = 0; t < c->num_temps; t++) {
 872                 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
 873                                                 i < max_ip); i++) {
 874                         if (i > max_ip)
 875                                 break;
 876                         pressure[i]++;
 877                 }
 878         }
 879
 880         uint32_t max_temps = 0;
 881         for (int i = 0; i < max_ip; i++)
 882                 max_temps = MAX2(max_temps, pressure[i]);
 883
 884         ralloc_free(pressure);
 885
 886         return max_temps;
 887 }
 888
 889 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
 890                       struct v3d_key *key,
 891                       struct v3d_prog_data **out_prog_data,
 892                       nir_shader *s,
 893                       void (*debug_output)(const char *msg,
 894                                            void *debug_output_data),
 895                       void *debug_output_data,
 896                       int program_id, int variant_id,
 897                       uint32_t *final_assembly_size)
 898 {
 899         struct v3d_prog_data *prog_data;
 900         struct v3d_compile *c = vir_compile_init(compiler, key, s,
 901                                                  debug_output, debug_output_data,
 902                                                  program_id, variant_id);
 903
 904         switch (c->s->info.stage) {
 905         case MESA_SHADER_VERTEX:
 906                 c->vs_key = (struct v3d_vs_key *)key;
 907                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
 908                 break;
 909         case MESA_SHADER_FRAGMENT:
 910                 c->fs_key = (struct v3d_fs_key *)key;
 911                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
 912                 break;
 913         case MESA_SHADER_COMPUTE:
 914                 prog_data = rzalloc_size(NULL,
 915                                          sizeof(struct v3d_compute_prog_data));
 916                 break;
 917         default:
 918                 unreachable("unsupported shader stage");
 919         }
 920
 921         if (c->s->info.stage == MESA_SHADER_VERTEX) {
 922                 v3d_nir_lower_vs_early(c);
 923         } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
 924                 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
 925                 v3d_nir_lower_fs_early(c);
 926         }
 927
 928         v3d_lower_nir(c);
 929
 930         if (c->s->info.stage == MESA_SHADER_VERTEX) {
 931                 v3d_nir_lower_vs_late(c);
 932         } else if (c->s->info.stage != MESA_SHADER_COMPUTE)  {
 933                 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
 934                 v3d_nir_lower_fs_late(c);
 935         }
 936
 937         NIR_PASS_V(c->s, v3d_nir_lower_io, c);
 938         NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
 939         NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
 940         NIR_PASS_V(c->s, nir_lower_idiv);
 941
 942         v3d_optimize_nir(c->s);
 943         NIR_PASS_V(c->s, nir_lower_bool_to_int32);
 944         NIR_PASS_V(c->s, nir_convert_from_ssa, true);
 945
 946         v3d_nir_to_vir(c);
 947
 948         v3d_set_prog_data(c, prog_data);
 949
 950         *out_prog_data = prog_data;
 951
 952         char *shaderdb;
 953         int ret = asprintf(&shaderdb,
 954                            "%s shader: %d inst, %d threads, %d loops, "
 955                            "%d uniforms, %d max-temps, %d:%d spills:fills, "
 956                            "%d sfu-stalls, %d inst-and-stalls",
 957                            vir_get_stage_name(c),
 958                            c->qpu_inst_count,
 959                            c->threads,
 960                            c->loops,
 961                            c->num_uniforms,
 962                            vir_get_max_temps(c),
 963                            c->spills,
 964                            c->fills,
 965                            c->qpu_inst_stalled_count,
 966                            c->qpu_inst_count + c->qpu_inst_stalled_count);
 967         if (ret >= 0) {
 968                 if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
 969                         fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
 970
 971                 c->debug_output(shaderdb, c->debug_output_data);
 972                 free(shaderdb);
 973         }
 974
 975        return v3d_return_qpu_insts(c, final_assembly_size);
 976 }
 977
 978 void
 979 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
 980 {
 981         if (qinst->dst.file == QFILE_TEMP)
 982                 c->defs[qinst->dst.index] = NULL;
 983
 984         assert(&qinst->link != c->cursor.link);
 985
 986         list_del(&qinst->link);
 987         free(qinst);
 988
 989         c->live_intervals_valid = false;
 990 }
 991
/**
 * Currently a no-op: returns reg unchanged and does not look at c.
 *
 * The disabled body below would, for a temp whose c->defs[] entry is a
 * QOP_MOV/QOP_FMOV with no pack on its destination or first source, replace
 * reg with that instruction's source (repeatedly), while keeping the
 * original reg.pack.
 */
struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
        /* XXX: disabled implementation, not compiled.
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}
1011
1012 void
1013 vir_compile_destroy(struct v3d_compile *c)
1014 {
1015         /* Defuse the assert that we aren't removing the cursor's instruction.
1016          */
1017         c->cursor.link = NULL;
1018
1019         vir_for_each_block(block, c) {
1020                 while (!list_empty(&block->instructions)) {
1021                         struct qinst *qinst =
1022                                 list_first_entry(&block->instructions,
1023                                                  struct qinst, link);
1024                         vir_remove_instruction(c, qinst);
1025                 }
1026         }
1027
1028         ralloc_free(c);
1029 }
1030
1031 uint32_t
1032 vir_get_uniform_index(struct v3d_compile *c,
1033                       enum quniform_contents contents,
1034                       uint32_t data)
1035 {
1036         for (int i = 0; i < c->num_uniforms; i++) {
1037                 if (c->uniform_contents[i] == contents &&
1038                     c->uniform_data[i] == data) {
1039                         return i;
1040                 }
1041         }
1042
1043         uint32_t uniform = c->num_uniforms++;
1044
1045         if (uniform >= c->uniform_array_size) {
1046                 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
1047                                              c->uniform_array_size * 2);
1048
1049                 c->uniform_data = reralloc(c, c->uniform_data,
1050                                            uint32_t,
1051                                            c->uniform_array_size);
1052                 c->uniform_contents = reralloc(c, c->uniform_contents,
1053                                                enum quniform_contents,
1054                                                c->uniform_array_size);
1055         }
1056
1057         c->uniform_contents[uniform] = contents;
1058         c->uniform_data[uniform] = data;
1059
1060         return uniform;
1061 }
1062
1063 struct qreg
1064 vir_uniform(struct v3d_compile *c,
1065             enum quniform_contents contents,
1066             uint32_t data)
1067 {
1068         struct qinst *inst = vir_NOP(c);
1069         inst->qpu.sig.ldunif = true;
1070         inst->uniform = vir_get_uniform_index(c, contents, data);
1071         inst->dst = vir_get_temp(c);
1072         c->defs[inst->dst.index] = inst;
1073         return inst->dst;
1074 }
1075
/* Runs one optimization pass from inside an optimization loop.
 *
 * The expansion relies on these names being in scope at the call site: c
 * (passed to func), progress (set to true when func(c) reports progress),
 * print_opt_debug and pass (used for the optional stderr message).
 *
 * XXX: vir_validate(c) is not run after a pass that made progress.
 */
#define OPTPASS(func)                                                   \
        do {                                                            \
                if (func(c)) {                                          \
                        progress = true;                                \
                        if (print_opt_debug) {                          \
                                fprintf(stderr,                         \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                   \
                        }                                               \
                }                                                       \
        } while (0)
1089
1090 void
1091 vir_optimize(struct v3d_compile *c)
1092 {
1093         bool print_opt_debug = false;
1094         int pass = 1;
1095
1096         while (true) {
1097                 bool progress = false;
1098
1099                 OPTPASS(vir_opt_copy_propagate);
1100                 OPTPASS(vir_opt_redundant_flags);
1101                 OPTPASS(vir_opt_dead_code);
1102                 OPTPASS(vir_opt_small_immediates);
1103
1104                 if (!progress)
1105                         break;
1106
1107                 pass++;
1108         }
1109 }
1110
1111 const char *
1112 vir_get_stage_name(struct v3d_compile *c)
1113 {
1114         if (c->vs_key && c->vs_key->is_coord)
1115                 return "MESA_SHADER_COORD";
1116         else
1117                 return gl_shader_stage_name(c->s->info.stage);
1118 }