src/broadcom/compiler/vir.c

   1 /*
   2  * Copyright © 2016-2017 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "broadcom/common/v3d_device_info.h"
  25 #include "v3d_compiler.h"
  26
  27 int
  28 vir_get_nsrc(struct qinst *inst)
  29 {
  30         switch (inst->qpu.type) {
  31         case V3D_QPU_INSTR_TYPE_BRANCH:
  32                 return 0;
  33         case V3D_QPU_INSTR_TYPE_ALU:
  34                 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
  35                         return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
  36                 else
  37                         return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
  38         }
  39
  40         return 0;
  41 }
  42
  43 /**
  44  * Returns whether the instruction has any side effects that must be
  45  * preserved.
  46  */
  47 bool
  48 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
  49 {
  50         switch (inst->qpu.type) {
  51         case V3D_QPU_INSTR_TYPE_BRANCH:
  52                 return true;
  53         case V3D_QPU_INSTR_TYPE_ALU:
  54                 switch (inst->qpu.alu.add.op) {
  55                 case V3D_QPU_A_SETREVF:
  56                 case V3D_QPU_A_SETMSF:
  57                 case V3D_QPU_A_VPMSETUP:
  58                 case V3D_QPU_A_STVPMV:
  59                 case V3D_QPU_A_STVPMD:
  60                 case V3D_QPU_A_STVPMP:
  61                 case V3D_QPU_A_VPMWT:
  62                 case V3D_QPU_A_TMUWT:
  63                         return true;
  64                 default:
  65                         break;
  66                 }
  67
  68                 switch (inst->qpu.alu.mul.op) {
  69                 case V3D_QPU_M_MULTOP:
  70                         return true;
  71                 default:
  72                         break;
  73                 }
  74         }
  75
  76         if (inst->qpu.sig.ldtmu ||
  77             inst->qpu.sig.ldvary ||
  78             inst->qpu.sig.ldtlbu ||
  79             inst->qpu.sig.ldtlb ||
  80             inst->qpu.sig.wrtmuc ||
  81             inst->qpu.sig.thrsw) {
  82                 return true;
  83         }
  84
  85         return false;
  86 }
  87
  88 bool
  89 vir_is_raw_mov(struct qinst *inst)
  90 {
  91         if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
  92             (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
  93              inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
  94                 return false;
  95         }
  96
  97         if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
  98             inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
  99                 return false;
 100         }
 101
 102         if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
 103             inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
 104             inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
 105             inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
 106                 return false;
 107         }
 108
 109         if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
 110             inst->qpu.flags.mc != V3D_QPU_COND_NONE)
 111                 return false;
 112
 113         return true;
 114 }
 115
 116 bool
 117 vir_is_add(struct qinst *inst)
 118 {
 119         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
 120                 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
 121 }
 122
 123 bool
 124 vir_is_mul(struct qinst *inst)
 125 {
 126         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
 127                 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
 128 }
 129
 130 bool
 131 vir_is_tex(struct qinst *inst)
 132 {
 133         if (inst->dst.file == QFILE_MAGIC)
 134                 return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);
 135
 136         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
 137             inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
 138                 return true;
 139         }
 140
 141         return false;
 142 }
 143
 144 bool
 145 vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
 146 {
 147         for (int i = 0; i < vir_get_nsrc(inst); i++) {
 148                 switch (inst->src[i].file) {
 149                 case QFILE_VPM:
 150                         return true;
 151                 default:
 152                         break;
 153                 }
 154         }
 155
 156         if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
 157                                   inst->qpu.sig.ldtlb ||
 158                                   inst->qpu.sig.ldtlbu ||
 159                                   inst->qpu.sig.ldvpm)) {
 160                 return true;
 161         }
 162
 163         return false;
 164 }
 165
 166 bool
 167 vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
 168 {
 169         switch (inst->dst.file) {
 170         case QFILE_MAGIC:
 171                 switch (inst->dst.index) {
 172                 case V3D_QPU_WADDR_RECIP:
 173                 case V3D_QPU_WADDR_RSQRT:
 174                 case V3D_QPU_WADDR_EXP:
 175                 case V3D_QPU_WADDR_LOG:
 176                 case V3D_QPU_WADDR_SIN:
 177                         return true;
 178                 }
 179                 break;
 180         default:
 181                 break;
 182         }
 183
 184         if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
 185                 return true;
 186
 187         return false;
 188 }
 189
 190 void
 191 vir_set_unpack(struct qinst *inst, int src,
 192                enum v3d_qpu_input_unpack unpack)
 193 {
 194         assert(src == 0 || src == 1);
 195
 196         if (vir_is_add(inst)) {
 197                 if (src == 0)
 198                         inst->qpu.alu.add.a_unpack = unpack;
 199                 else
 200                         inst->qpu.alu.add.b_unpack = unpack;
 201         } else {
 202                 assert(vir_is_mul(inst));
 203                 if (src == 0)
 204                         inst->qpu.alu.mul.a_unpack = unpack;
 205                 else
 206                         inst->qpu.alu.mul.b_unpack = unpack;
 207         }
 208 }
 209
 210 void
 211 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
 212 {
 213         if (vir_is_add(inst)) {
 214                 inst->qpu.flags.ac = cond;
 215         } else {
 216                 assert(vir_is_mul(inst));
 217                 inst->qpu.flags.mc = cond;
 218         }
 219 }
 220
 221 void
 222 vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf)
 223 {
 224         if (vir_is_add(inst)) {
 225                 inst->qpu.flags.apf = pf;
 226         } else {
 227                 assert(vir_is_mul(inst));
 228                 inst->qpu.flags.mpf = pf;
 229         }
 230 }
 231
 232 void
 233 vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf)
 234 {
 235         if (vir_is_add(inst)) {
 236                 inst->qpu.flags.auf = uf;
 237         } else {
 238                 assert(vir_is_mul(inst));
 239                 inst->qpu.flags.muf = uf;
 240         }
 241 }
 242
 243 #if 0
 244 uint8_t
 245 vir_channels_written(struct qinst *inst)
 246 {
 247         if (vir_is_mul(inst)) {
 248                 switch (inst->dst.pack) {
 249                 case QPU_PACK_MUL_NOP:
 250                 case QPU_PACK_MUL_8888:
 251                         return 0xf;
 252                 case QPU_PACK_MUL_8A:
 253                         return 0x1;
 254                 case QPU_PACK_MUL_8B:
 255                         return 0x2;
 256                 case QPU_PACK_MUL_8C:
 257                         return 0x4;
 258                 case QPU_PACK_MUL_8D:
 259                         return 0x8;
 260                 }
 261         } else {
 262                 switch (inst->dst.pack) {
 263                 case QPU_PACK_A_NOP:
 264                 case QPU_PACK_A_8888:
 265                 case QPU_PACK_A_8888_SAT:
 266                 case QPU_PACK_A_32_SAT:
 267                         return 0xf;
 268                 case QPU_PACK_A_8A:
 269                 case QPU_PACK_A_8A_SAT:
 270                         return 0x1;
 271                 case QPU_PACK_A_8B:
 272                 case QPU_PACK_A_8B_SAT:
 273                         return 0x2;
 274                 case QPU_PACK_A_8C:
 275                 case QPU_PACK_A_8C_SAT:
 276                         return 0x4;
 277                 case QPU_PACK_A_8D:
 278                 case QPU_PACK_A_8D_SAT:
 279                         return 0x8;
 280                 case QPU_PACK_A_16A:
 281                 case QPU_PACK_A_16A_SAT:
 282                         return 0x3;
 283                 case QPU_PACK_A_16B:
 284                 case QPU_PACK_A_16B_SAT:
 285                         return 0xc;
 286                 }
 287         }
 288         unreachable("Bad pack field");
 289 }
 290 #endif
 291
 292 struct qreg
 293 vir_get_temp(struct v3d_compile *c)
 294 {
 295         struct qreg reg;
 296
 297         reg.file = QFILE_TEMP;
 298         reg.index = c->num_temps++;
 299
 300         if (c->num_temps > c->defs_array_size) {
 301                 uint32_t old_size = c->defs_array_size;
 302                 c->defs_array_size = MAX2(old_size * 2, 16);
 303
 304                 c->defs = reralloc(c, c->defs, struct qinst *,
 305                                    c->defs_array_size);
 306                 memset(&c->defs[old_size], 0,
 307                        sizeof(c->defs[0]) * (c->defs_array_size - old_size));
 308
 309                 c->spillable = reralloc(c, c->spillable,
 310                                         BITSET_WORD,
 311                                         BITSET_WORDS(c->defs_array_size));
 312                 for (int i = old_size; i < c->defs_array_size; i++)
 313                         BITSET_SET(c->spillable, i);
 314         }
 315
 316         return reg;
 317 }
 318
 319 struct qinst *
 320 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
 321 {
 322         struct qinst *inst = calloc(1, sizeof(*inst));
 323
 324         inst->qpu = v3d_qpu_nop();
 325         inst->qpu.alu.add.op = op;
 326
 327         inst->dst = dst;
 328         inst->src[0] = src0;
 329         inst->src[1] = src1;
 330         inst->uniform = ~0;
 331
 332         return inst;
 333 }
 334
 335 struct qinst *
 336 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
 337 {
 338         struct qinst *inst = calloc(1, sizeof(*inst));
 339
 340         inst->qpu = v3d_qpu_nop();
 341         inst->qpu.alu.mul.op = op;
 342
 343         inst->dst = dst;
 344         inst->src[0] = src0;
 345         inst->src[1] = src1;
 346         inst->uniform = ~0;
 347
 348         return inst;
 349 }
 350
 351 struct qinst *
 352 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
 353 {
 354         struct qinst *inst = calloc(1, sizeof(*inst));
 355
 356         inst->qpu = v3d_qpu_nop();
 357         inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
 358         inst->qpu.branch.cond = cond;
 359         inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
 360         inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
 361         inst->qpu.branch.ub = true;
 362         inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
 363
 364         inst->dst = vir_nop_reg();
 365         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
 366
 367         return inst;
 368 }
 369
 370 static void
 371 vir_emit(struct v3d_compile *c, struct qinst *inst)
 372 {
 373         switch (c->cursor.mode) {
 374         case vir_cursor_add:
 375                 list_add(&inst->link, c->cursor.link);
 376                 break;
 377         case vir_cursor_addtail:
 378                 list_addtail(&inst->link, c->cursor.link);
 379                 break;
 380         }
 381
 382         c->cursor = vir_after_inst(inst);
 383         c->live_intervals_valid = false;
 384 }
 385
 386 /* Updates inst to write to a new temporary, emits it, and notes the def. */
 387 struct qreg
 388 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
 389 {
 390         assert(inst->dst.file == QFILE_NULL);
 391
 392         /* If we're emitting an instruction that's a def, it had better be
 393          * writing a register.
 394          */
 395         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
 396                 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
 397                        v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
 398                 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
 399                        v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
 400         }
 401
 402         inst->dst = vir_get_temp(c);
 403
 404         if (inst->dst.file == QFILE_TEMP)
 405                 c->defs[inst->dst.index] = inst;
 406
 407         vir_emit(c, inst);
 408
 409         return inst->dst;
 410 }
 411
 412 struct qinst *
 413 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
 414 {
 415         if (inst->dst.file == QFILE_TEMP)
 416                 c->defs[inst->dst.index] = NULL;
 417
 418         vir_emit(c, inst);
 419
 420         return inst;
 421 }
 422
 423 struct qblock *
 424 vir_new_block(struct v3d_compile *c)
 425 {
 426         struct qblock *block = rzalloc(c, struct qblock);
 427
 428         list_inithead(&block->instructions);
 429
 430         block->predecessors = _mesa_set_create(block,
 431                                                _mesa_hash_pointer,
 432                                                _mesa_key_pointer_equal);
 433
 434         block->index = c->next_block_index++;
 435
 436         return block;
 437 }
 438
 439 void
 440 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
 441 {
 442         c->cur_block = block;
 443         c->cursor = vir_after_block(block);
 444         list_addtail(&block->link, &c->blocks);
 445 }
 446
 447 struct qblock *
 448 vir_entry_block(struct v3d_compile *c)
 449 {
 450         return list_first_entry(&c->blocks, struct qblock, link);
 451 }
 452
 453 struct qblock *
 454 vir_exit_block(struct v3d_compile *c)
 455 {
 456         return list_last_entry(&c->blocks, struct qblock, link);
 457 }
 458
 459 void
 460 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
 461 {
 462         _mesa_set_add(successor->predecessors, predecessor);
 463         if (predecessor->successors[0]) {
 464                 assert(!predecessor->successors[1]);
 465                 predecessor->successors[1] = successor;
 466         } else {
 467                 predecessor->successors[0] = successor;
 468         }
 469 }
 470
 471 const struct v3d_compiler *
 472 v3d_compiler_init(const struct v3d_device_info *devinfo)
 473 {
 474         struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
 475         if (!compiler)
 476                 return NULL;
 477
 478         compiler->devinfo = devinfo;
 479
 480         if (!vir_init_reg_sets(compiler)) {
 481                 ralloc_free(compiler);
 482                 return NULL;
 483         }
 484
 485         return compiler;
 486 }
 487
 488 void
 489 v3d_compiler_free(const struct v3d_compiler *compiler)
 490 {
 491         ralloc_free((void *)compiler);
 492 }
 493
 494 static struct v3d_compile *
 495 vir_compile_init(const struct v3d_compiler *compiler,
 496                  struct v3d_key *key,
 497                  nir_shader *s,
 498                  void (*debug_output)(const char *msg,
 499                                       void *debug_output_data),
 500                  void *debug_output_data,
 501                  int program_id, int variant_id)
 502 {
 503         struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
 504
 505         c->compiler = compiler;
 506         c->devinfo = compiler->devinfo;
 507         c->key = key;
 508         c->program_id = program_id;
 509         c->variant_id = variant_id;
 510         c->threads = 4;
 511         c->debug_output = debug_output;
 512         c->debug_output_data = debug_output_data;
 513
 514         s = nir_shader_clone(c, s);
 515         c->s = s;
 516
 517         list_inithead(&c->blocks);
 518         vir_set_emit_block(c, vir_new_block(c));
 519
 520         c->output_position_index = -1;
 521         c->output_sample_mask_index = -1;
 522
 523         c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
 524                                             _mesa_key_pointer_equal);
 525
 526         return c;
 527 }
 528
 529 static int
 530 type_size_vec4(const struct glsl_type *type, bool bindless)
 531 {
 532         return glsl_count_attribute_slots(type, false);
 533 }
 534
 535 static void
 536 v3d_lower_nir(struct v3d_compile *c)
 537 {
 538         struct nir_lower_tex_options tex_options = {
 539                 .lower_txd = true,
 540                 .lower_tg4_broadcom_swizzle = true,
 541
 542                 .lower_rect = false, /* XXX: Use this on V3D 3.x */
 543                 .lower_txp = ~0,
 544                 /* Apply swizzles to all samplers. */
 545                 .swizzle_result = ~0,
 546         };
 547
 548         /* Lower the format swizzle and (for 32-bit returns)
 549          * ARB_texture_swizzle-style swizzle.
 550          */
 551         for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) {
 552                 for (int j = 0; j < 4; j++)
 553                         tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
 554
 555                 if (c->key->tex[i].clamp_s)
 556                         tex_options.saturate_s |= 1 << i;
 557                 if (c->key->tex[i].clamp_t)
 558                         tex_options.saturate_t |= 1 << i;
 559                 if (c->key->tex[i].clamp_r)
 560                         tex_options.saturate_r |= 1 << i;
 561                 if (c->key->tex[i].return_size == 16) {
 562                         tex_options.lower_tex_packing[i] =
 563                                 nir_lower_tex_packing_16;
 564                 }
 565         }
 566
 567         /* CS textures may not have return_size reflecting the shadow state. */
 568         nir_foreach_variable(var, &c->s->uniforms) {
 569                 const struct glsl_type *type = glsl_without_array(var->type);
 570                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
 571
 572                 if (!glsl_type_is_sampler(type) ||
 573                     !glsl_sampler_type_is_shadow(type))
 574                         continue;
 575
 576                 for (int i = 0; i < array_len; i++) {
 577                         tex_options.lower_tex_packing[var->data.binding + i] =
 578                                 nir_lower_tex_packing_16;
 579                 }
 580         }
 581
 582         NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
 583         NIR_PASS_V(c->s, nir_lower_system_values);
 584
 585         NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
 586                    nir_var_function_temp,
 587                    0,
 588                    glsl_get_natural_size_align_bytes);
 589         NIR_PASS_V(c->s, v3d_nir_lower_scratch);
 590 }
 591
 592 static void
 593 v3d_set_prog_data_uniforms(struct v3d_compile *c,
 594                            struct v3d_prog_data *prog_data)
 595 {
 596         int count = c->num_uniforms;
 597         struct v3d_uniform_list *ulist = &prog_data->uniforms;
 598
 599         ulist->count = count;
 600         ulist->data = ralloc_array(prog_data, uint32_t, count);
 601         memcpy(ulist->data, c->uniform_data,
 602                count * sizeof(*ulist->data));
 603         ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
 604         memcpy(ulist->contents, c->uniform_contents,
 605                count * sizeof(*ulist->contents));
 606 }
 607
 608 static void
 609 v3d_vs_set_prog_data(struct v3d_compile *c,
 610                      struct v3d_vs_prog_data *prog_data)
 611 {
 612         /* The vertex data gets format converted by the VPM so that
 613          * each attribute channel takes up a VPM column.  Precompute
 614          * the sizes for the shader record.
 615          */
 616         for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
 617                 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
 618                 prog_data->vpm_input_size += c->vattr_sizes[i];
 619         }
 620
 621         prog_data->uses_vid = (c->s->info.system_values_read &
 622                                (1ull << SYSTEM_VALUE_VERTEX_ID));
 623         prog_data->uses_iid = (c->s->info.system_values_read &
 624                                (1ull << SYSTEM_VALUE_INSTANCE_ID));
 625
 626         if (prog_data->uses_vid)
 627                 prog_data->vpm_input_size++;
 628         if (prog_data->uses_iid)
 629                 prog_data->vpm_input_size++;
 630
 631         /* Input/output segment size are in sectors (8 rows of 32 bits per
 632          * channel).
 633          */
 634         prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
 635         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
 636
 637         /* Set us up for shared input/output segments.  This is apparently
 638          * necessary for our VCM setup to avoid varying corruption.
 639          */
 640         prog_data->separate_segments = false;
 641         prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
 642                                           prog_data->vpm_input_size);
 643         prog_data->vpm_input_size = 0;
 644
 645         /* Compute VCM cache size.  We set up our program to take up less than
 646          * half of the VPM, so that any set of bin and render programs won't
 647          * run out of space.  We need space for at least one input segment,
 648          * and then allocate the rest to output segments (one for the current
 649          * program, the rest to VCM).  The valid range of the VCM cache size
 650          * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
 651          * batches.
 652          */
 653         assert(c->devinfo->vpm_size);
 654         int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
 655         int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
 656         int half_vpm = vpm_size_in_sectors / 2;
 657         int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
 658         int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
 659         assert(vpm_output_batches >= 2);
 660         prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
 661 }
 662
 663 static void
 664 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
 665                             struct v3d_fs_prog_data *prog_data)
 666 {
 667         prog_data->num_inputs = c->num_inputs;
 668         memcpy(prog_data->input_slots, c->input_slots,
 669                c->num_inputs * sizeof(*c->input_slots));
 670
 671         STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
 672                       (V3D_MAX_FS_INPUTS - 1) / 24);
 673         for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
 674                 if (BITSET_TEST(c->flat_shade_flags, i))
 675                         prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
 676
 677                 if (BITSET_TEST(c->noperspective_flags, i))
 678                         prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
 679
 680                 if (BITSET_TEST(c->centroid_flags, i))
 681                         prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
 682         }
 683 }
 684
 685 static void
 686 v3d_fs_set_prog_data(struct v3d_compile *c,
 687                      struct v3d_fs_prog_data *prog_data)
 688 {
 689         v3d_set_fs_prog_data_inputs(c, prog_data);
 690         prog_data->writes_z = c->writes_z;
 691         prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
 692         prog_data->uses_center_w = c->uses_center_w;
 693         prog_data->uses_implicit_point_line_varyings =
 694                 c->uses_implicit_point_line_varyings;
 695         prog_data->lock_scoreboard_on_first_thrsw =
 696                 c->lock_scoreboard_on_first_thrsw;
 697 }
 698
 699 static void
 700 v3d_cs_set_prog_data(struct v3d_compile *c,
 701                      struct v3d_compute_prog_data *prog_data)
 702 {
 703         prog_data->shared_size = c->s->info.cs.shared_size;
 704 }
 705
 706 static void
 707 v3d_set_prog_data(struct v3d_compile *c,
 708                   struct v3d_prog_data *prog_data)
 709 {
 710         prog_data->threads = c->threads;
 711         prog_data->single_seg = !c->last_thrsw;
 712         prog_data->spill_size = c->spill_size;
 713
 714         v3d_set_prog_data_uniforms(c, prog_data);
 715
 716         if (c->s->info.stage == MESA_SHADER_COMPUTE) {
 717                 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
 718         } else if (c->s->info.stage == MESA_SHADER_VERTEX) {
 719                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
 720         } else {
 721                 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
 722                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
 723         }
 724 }
 725
 726 static uint64_t *
 727 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
 728 {
 729         *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
 730
 731         uint64_t *qpu_insts = malloc(*final_assembly_size);
 732         if (!qpu_insts)
 733                 return NULL;
 734
 735         memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
 736
 737         vir_compile_destroy(c);
 738
 739         return qpu_insts;
 740 }
 741
 742 static void
 743 v3d_nir_lower_vs_early(struct v3d_compile *c)
 744 {
 745         /* Split our I/O vars and dead code eliminate the unused
 746          * components.
 747          */
 748         NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
 749                    nir_var_shader_in | nir_var_shader_out);
 750         uint64_t used_outputs[4] = {0};
 751         for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
 752                 int slot = v3d_slot_get_slot(c->vs_key->fs_inputs[i]);
 753                 int comp = v3d_slot_get_component(c->vs_key->fs_inputs[i]);
 754                 used_outputs[comp] |= 1ull << slot;
 755         }
 756         NIR_PASS_V(c->s, nir_remove_unused_io_vars,
 757                    &c->s->outputs, used_outputs, NULL); /* demotes to globals */
 758         NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
 759         v3d_optimize_nir(c->s);
 760         NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
 761
 762         /* This must go before nir_lower_io */
 763         if (c->vs_key->per_vertex_point_size)
 764                 NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
 765
 766         NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
 767                    type_size_vec4,
 768                    (nir_lower_io_options)0);
 769         /* clean up nir_lower_io's deref_var remains */
 770         NIR_PASS_V(c->s, nir_opt_dce);
 771 }
 772
 773 static void
 774 v3d_fixup_fs_output_types(struct v3d_compile *c)
 775 {
 776         nir_foreach_variable(var, &c->s->outputs) {
 777                 uint32_t mask = 0;
 778
 779                 switch (var->data.location) {
 780                 case FRAG_RESULT_COLOR:
 781                         mask = ~0;
 782                         break;
 783                 case FRAG_RESULT_DATA0:
 784                 case FRAG_RESULT_DATA1:
 785                 case FRAG_RESULT_DATA2:
 786                 case FRAG_RESULT_DATA3:
 787                         mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
 788                         break;
 789                 }
 790
 791                 if (c->fs_key->int_color_rb & mask) {
 792                         var->type =
 793                                 glsl_vector_type(GLSL_TYPE_INT,
 794                                                  glsl_get_components(var->type));
 795                 } else if (c->fs_key->uint_color_rb & mask) {
 796                         var->type =
 797                                 glsl_vector_type(GLSL_TYPE_UINT,
 798                                                  glsl_get_components(var->type));
 799                 }
 800         }
 801 }
 802
 803 static void
 804 v3d_nir_lower_fs_early(struct v3d_compile *c)
 805 {
 806         if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
 807                 v3d_fixup_fs_output_types(c);
 808
 809         NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
 810
 811         /* If the shader has no non-TLB side effects, we can promote it to
 812          * enabling early_fragment_tests even if the user didn't.
 813          */
 814         if (!(c->s->info.num_images ||
 815               c->s->info.num_ssbos)) {
 816                 c->s->info.fs.early_fragment_tests = true;
 817         }
 818 }
 819
 820 static void
 821 v3d_nir_lower_vs_late(struct v3d_compile *c)
 822 {
 823         if (c->vs_key->clamp_color)
 824                 NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
 825
 826         if (c->key->ucp_enables) {
 827                 NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
 828                            false);
 829                 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
 830                            nir_var_shader_out);
 831         }
 832
 833         /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
 834         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
 835 }
 836
 837 static void
 838 v3d_nir_lower_fs_late(struct v3d_compile *c)
 839 {
 840         if (c->fs_key->light_twoside)
 841                 NIR_PASS_V(c->s, nir_lower_two_sided_color);
 842
 843         if (c->fs_key->clamp_color)
 844                 NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
 845
 846         if (c->fs_key->alpha_test) {
 847                 NIR_PASS_V(c->s, nir_lower_alpha_test,
 848                            c->fs_key->alpha_test_func,
 849                            false);
 850         }
 851
 852         if (c->key->ucp_enables)
 853                 NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);
 854
 855         /* Note: FS input scalarizing must happen after
 856          * nir_lower_two_sided_color, which only handles a vec4 at a time.
 857          */
 858         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
 859 }
 860
 861 static uint32_t
 862 vir_get_max_temps(struct v3d_compile *c)
 863 {
 864         int max_ip = 0;
 865         vir_for_each_inst_inorder(inst, c)
 866                 max_ip++;
 867
 868         uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
 869
 870         for (int t = 0; t < c->num_temps; t++) {
 871                 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
 872                                                 i < max_ip); i++) {
 873                         if (i > max_ip)
 874                                 break;
 875                         pressure[i]++;
 876                 }
 877         }
 878
 879         uint32_t max_temps = 0;
 880         for (int i = 0; i < max_ip; i++)
 881                 max_temps = MAX2(max_temps, pressure[i]);
 882
 883         ralloc_free(pressure);
 884
 885         return max_temps;
 886 }
 887
 888 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
 889                       struct v3d_key *key,
 890                       struct v3d_prog_data **out_prog_data,
 891                       nir_shader *s,
 892                       void (*debug_output)(const char *msg,
 893                                            void *debug_output_data),
 894                       void *debug_output_data,
 895                       int program_id, int variant_id,
 896                       uint32_t *final_assembly_size)
 897 {
 898         struct v3d_prog_data *prog_data;
 899         struct v3d_compile *c = vir_compile_init(compiler, key, s,
 900                                                  debug_output, debug_output_data,
 901                                                  program_id, variant_id);
 902
 903         switch (c->s->info.stage) {
 904         case MESA_SHADER_VERTEX:
 905                 c->vs_key = (struct v3d_vs_key *)key;
 906                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
 907                 break;
 908         case MESA_SHADER_FRAGMENT:
 909                 c->fs_key = (struct v3d_fs_key *)key;
 910                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
 911                 break;
 912         case MESA_SHADER_COMPUTE:
 913                 prog_data = rzalloc_size(NULL,
 914                                          sizeof(struct v3d_compute_prog_data));
 915                 break;
 916         default:
 917                 unreachable("unsupported shader stage");
 918         }
 919
 920         if (c->s->info.stage == MESA_SHADER_VERTEX) {
 921                 v3d_nir_lower_vs_early(c);
 922         } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
 923                 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
 924                 v3d_nir_lower_fs_early(c);
 925         }
 926
 927         v3d_lower_nir(c);
 928
 929         if (c->s->info.stage == MESA_SHADER_VERTEX) {
 930                 v3d_nir_lower_vs_late(c);
 931         } else if (c->s->info.stage != MESA_SHADER_COMPUTE)  {
 932                 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
 933                 v3d_nir_lower_fs_late(c);
 934         }
 935
 936         NIR_PASS_V(c->s, v3d_nir_lower_io, c);
 937         NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
 938         NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
 939         NIR_PASS_V(c->s, nir_lower_idiv);
 940
 941         v3d_optimize_nir(c->s);
 942         NIR_PASS_V(c->s, nir_lower_bool_to_int32);
 943         NIR_PASS_V(c->s, nir_convert_from_ssa, true);
 944
 945         v3d_nir_to_vir(c);
 946
 947         v3d_set_prog_data(c, prog_data);
 948
 949         *out_prog_data = prog_data;
 950
 951         char *shaderdb;
 952         int ret = asprintf(&shaderdb,
 953                            "%s shader: %d inst, %d threads, %d loops, "
 954                            "%d uniforms, %d max-temps, %d:%d spills:fills, "
 955                            "%d sfu-stalls, %d inst-and-stalls",
 956                            vir_get_stage_name(c),
 957                            c->qpu_inst_count,
 958                            c->threads,
 959                            c->loops,
 960                            c->num_uniforms,
 961                            vir_get_max_temps(c),
 962                            c->spills,
 963                            c->fills,
 964                            c->qpu_inst_stalled_count,
 965                            c->qpu_inst_count + c->qpu_inst_stalled_count);
 966         if (ret >= 0) {
 967                 if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
 968                         fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
 969
 970                 c->debug_output(shaderdb, c->debug_output_data);
 971                 free(shaderdb);
 972         }
 973
 974        return v3d_return_qpu_insts(c, final_assembly_size);
 975 }
 976
 977 void
 978 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
 979 {
 980         if (qinst->dst.file == QFILE_TEMP)
 981                 c->defs[qinst->dst.index] = NULL;
 982
 983         assert(&qinst->link != c->cursor.link);
 984
 985         list_del(&qinst->link);
 986         free(qinst);
 987
 988         c->live_intervals_valid = false;
 989 }
 990
 991 struct qreg
 992 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
 993 {
 994         /* XXX
 995         int pack = reg.pack;
 996
 997         while (reg.file == QFILE_TEMP &&
 998                c->defs[reg.index] &&
 999                (c->defs[reg.index]->op == QOP_MOV ||
1000                 c->defs[reg.index]->op == QOP_FMOV) &&
1001                !c->defs[reg.index]->dst.pack &&
1002                !c->defs[reg.index]->src[0].pack) {
1003                 reg = c->defs[reg.index]->src[0];
1004         }
1005
1006         reg.pack = pack;
1007         */
1008         return reg;
1009 }
1010
1011 void
1012 vir_compile_destroy(struct v3d_compile *c)
1013 {
1014         /* Defuse the assert that we aren't removing the cursor's instruction.
1015          */
1016         c->cursor.link = NULL;
1017
1018         vir_for_each_block(block, c) {
1019                 while (!list_empty(&block->instructions)) {
1020                         struct qinst *qinst =
1021                                 list_first_entry(&block->instructions,
1022                                                  struct qinst, link);
1023                         vir_remove_instruction(c, qinst);
1024                 }
1025         }
1026
1027         ralloc_free(c);
1028 }
1029
1030 uint32_t
1031 vir_get_uniform_index(struct v3d_compile *c,
1032                       enum quniform_contents contents,
1033                       uint32_t data)
1034 {
1035         for (int i = 0; i < c->num_uniforms; i++) {
1036                 if (c->uniform_contents[i] == contents &&
1037                     c->uniform_data[i] == data) {
1038                         return i;
1039                 }
1040         }
1041
1042         uint32_t uniform = c->num_uniforms++;
1043
1044         if (uniform >= c->uniform_array_size) {
1045                 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
1046                                              c->uniform_array_size * 2);
1047
1048                 c->uniform_data = reralloc(c, c->uniform_data,
1049                                            uint32_t,
1050                                            c->uniform_array_size);
1051                 c->uniform_contents = reralloc(c, c->uniform_contents,
1052                                                enum quniform_contents,
1053                                                c->uniform_array_size);
1054         }
1055
1056         c->uniform_contents[uniform] = contents;
1057         c->uniform_data[uniform] = data;
1058
1059         return uniform;
1060 }
1061
1062 struct qreg
1063 vir_uniform(struct v3d_compile *c,
1064             enum quniform_contents contents,
1065             uint32_t data)
1066 {
1067         struct qinst *inst = vir_NOP(c);
1068         inst->qpu.sig.ldunif = true;
1069         inst->uniform = vir_get_uniform_index(c, contents, data);
1070         inst->dst = vir_get_temp(c);
1071         c->defs[inst->dst.index] = inst;
1072         return inst->dst;
1073 }
1074
/* Runs one optimization pass on `c`.  If the pass reports progress, sets
 * `progress` and, when `print_opt_debug` is enabled, logs the pass name.
 * Relies on `c`, `progress`, `pass` and `print_opt_debug` being in scope at
 * the point of expansion.
 */
#define OPTPASS(func)                                                   \
        do {                                                            \
                if (func(c)) {                                          \
                        progress = true;                                \
                        if (print_opt_debug) {                          \
                                fprintf(stderr,                         \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                   \
                        }                                               \
                        /*XXX vir_validate(c);*/                        \
                }                                                       \
        } while (0)

/**
 * Repeats the VIR optimization passes until a full round makes no progress.
 */
void
vir_optimize(struct v3d_compile *c)
{
        const bool print_opt_debug = false;

        /* `pass` counts rounds, starting at 1, for the debug output. */
        for (int pass = 1; ; pass++) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_redundant_flags);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);

                if (!progress)
                        break;
        }
}
1109
1110 const char *
1111 vir_get_stage_name(struct v3d_compile *c)
1112 {
1113         if (c->vs_key && c->vs_key->is_coord)
1114                 return "MESA_SHADER_COORD";
1115         else
1116                 return gl_shader_stage_name(c->s->info.stage);
1117 }