src/broadcom/compiler/nir_to_vir.c

   1 /*
   2  * Copyright © 2016 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <inttypes.h>
  25 #include "util/u_format.h"
  26 #include "util/u_math.h"
  27 #include "util/u_memory.h"
  28 #include "util/ralloc.h"
  29 #include "util/hash_table.h"
  30 #include "compiler/nir/nir.h"
  31 #include "compiler/nir/nir_builder.h"
  32 #include "common/v3d_device_info.h"
  33 #include "v3d_compiler.h"
  34
  35 static void
  36 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
  37
  38 static void
  39 resize_qreg_array(struct v3d_compile *c,
  40                   struct qreg **regs,
  41                   uint32_t *size,
  42                   uint32_t decl_size)
  43 {
  44         if (*size >= decl_size)
  45                 return;
  46
  47         uint32_t old_size = *size;
  48         *size = MAX2(*size * 2, decl_size);
  49         *regs = reralloc(c, *regs, struct qreg, *size);
  50         if (!*regs) {
  51                 fprintf(stderr, "Malloc failure\n");
  52                 abort();
  53         }
  54
  55         for (uint32_t i = old_size; i < *size; i++)
  56                 (*regs)[i] = c->undef;
  57 }
  58
  59 void
  60 vir_emit_thrsw(struct v3d_compile *c)
  61 {
  62         if (c->threads == 1)
  63                 return;
  64
  65         /* Always thread switch after each texture operation for now.
  66          *
  67          * We could do better by batching a bunch of texture fetches up and
  68          * then doing one thread switch and collecting all their results
  69          * afterward.
  70          */
  71         c->last_thrsw = vir_NOP(c);
  72         c->last_thrsw->qpu.sig.thrsw = true;
  73         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
  74 }
  75
  76 static struct qreg
  77 vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
  78 {
  79         vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src);
  80         return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
  81 }
  82
  83 static struct qreg
  84 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
  85 {
  86         struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
  87         uint32_t offset = nir_intrinsic_base(intr);
  88         struct v3d_ubo_range *range = NULL;
  89         unsigned i;
  90
  91         for (i = 0; i < c->num_ubo_ranges; i++) {
  92                 range = &c->ubo_ranges[i];
  93                 if (offset >= range->src_offset &&
  94                     offset < range->src_offset + range->size) {
  95                         break;
  96                 }
  97         }
  98         /* The driver-location-based offset always has to be within a declared
  99          * uniform range.
 100          */
 101         assert(i != c->num_ubo_ranges);
 102         if (!c->ubo_range_used[i]) {
 103                 c->ubo_range_used[i] = true;
 104                 range->dst_offset = c->next_ubo_dst_offset;
 105                 c->next_ubo_dst_offset += range->size;
 106         }
 107
 108         offset -= range->src_offset;
 109
 110         if (range->dst_offset + offset != 0) {
 111                 indirect_offset = vir_ADD(c, indirect_offset,
 112                                           vir_uniform_ui(c, range->dst_offset +
 113                                                          offset));
 114         }
 115
 116         /* Adjust for where we stored the TGSI register base. */
 117         vir_ADD_dest(c,
 118                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
 119                      vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
 120                      indirect_offset);
 121
 122         vir_emit_thrsw(c);
 123         return vir_LDTMU(c);
 124 }
 125
 126 static struct qreg *
 127 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
 128 {
 129         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
 130                                           def->num_components);
 131         _mesa_hash_table_insert(c->def_ht, def, qregs);
 132         return qregs;
 133 }
 134
 135 /**
 136  * This function is responsible for getting VIR results into the associated
 137  * storage for a NIR instruction.
 138  *
 139  * If it's a NIR SSA def, then we just set the associated hash table entry to
 140  * the new result.
 141  *
 142  * If it's a NIR reg, then we need to update the existing qreg assigned to the
 143  * NIR destination with the incoming value.  To do that without introducing
 144  * new MOVs, we require that the incoming qreg either be a uniform, or be
 145  * SSA-defined by the previous VIR instruction in the block and rewritable by
 146  * this function.  That lets us sneak ahead and insert the SF flag beforehand
 147  * (knowing that the previous instruction doesn't depend on flags) and rewrite
 148  * its destination to be the NIR reg's destination
 149  */
 150 void
 151 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
 152                struct qreg result)
 153 {
 154         struct qinst *last_inst = NULL;
 155         if (!list_empty(&c->cur_block->instructions))
 156                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
 157
 158         assert(result.file == QFILE_UNIF ||
 159                (result.file == QFILE_TEMP &&
 160                 last_inst && last_inst == c->defs[result.index]));
 161
 162         if (dest->is_ssa) {
 163                 assert(chan < dest->ssa.num_components);
 164
 165                 struct qreg *qregs;
 166                 struct hash_entry *entry =
 167                         _mesa_hash_table_search(c->def_ht, &dest->ssa);
 168
 169                 if (entry)
 170                         qregs = entry->data;
 171                 else
 172                         qregs = ntq_init_ssa_def(c, &dest->ssa);
 173
 174                 qregs[chan] = result;
 175         } else {
 176                 nir_register *reg = dest->reg.reg;
 177                 assert(dest->reg.base_offset == 0);
 178                 assert(reg->num_array_elems == 0);
 179                 struct hash_entry *entry =
 180                         _mesa_hash_table_search(c->def_ht, reg);
 181                 struct qreg *qregs = entry->data;
 182
 183                 /* Insert a MOV if the source wasn't an SSA def in the
 184                  * previous instruction.
 185                  */
 186                 if (result.file == QFILE_UNIF) {
 187                         result = vir_MOV(c, result);
 188                         last_inst = c->defs[result.index];
 189                 }
 190
 191                 /* We know they're both temps, so just rewrite index. */
 192                 c->defs[last_inst->dst.index] = NULL;
 193                 last_inst->dst.index = qregs[chan].index;
 194
 195                 /* If we're in control flow, then make this update of the reg
 196                  * conditional on the execution mask.
 197                  */
 198                 if (c->execute.file != QFILE_NULL) {
 199                         last_inst->dst.index = qregs[chan].index;
 200
 201                         /* Set the flags to the current exec mask.  To insert
 202                          * the flags push, we temporarily remove our SSA
 203                          * instruction.
 204                          */
 205                         list_del(&last_inst->link);
 206                         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
 207                         list_addtail(&last_inst->link,
 208                                      &c->cur_block->instructions);
 209
 210                         vir_set_cond(last_inst, V3D_QPU_COND_IFA);
 211                         last_inst->cond_is_exec_mask = true;
 212                 }
 213         }
 214 }
 215
 216 struct qreg
 217 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
 218 {
 219         struct hash_entry *entry;
 220         if (src.is_ssa) {
 221                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
 222                 assert(i < src.ssa->num_components);
 223         } else {
 224                 nir_register *reg = src.reg.reg;
 225                 entry = _mesa_hash_table_search(c->def_ht, reg);
 226                 assert(reg->num_array_elems == 0);
 227                 assert(src.reg.base_offset == 0);
 228                 assert(i < reg->num_components);
 229         }
 230
 231         struct qreg *qregs = entry->data;
 232         return qregs[i];
 233 }
 234
 235 static struct qreg
 236 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
 237                 unsigned src)
 238 {
 239         assert(util_is_power_of_two(instr->dest.write_mask));
 240         unsigned chan = ffs(instr->dest.write_mask) - 1;
 241         struct qreg r = ntq_get_src(c, instr->src[src].src,
 242                                     instr->src[src].swizzle[chan]);
 243
 244         assert(!instr->src[src].abs);
 245         assert(!instr->src[src].negate);
 246
 247         return r;
 248 };
 249
 250 static inline struct qreg
 251 vir_SAT(struct v3d_compile *c, struct qreg val)
 252 {
 253         return vir_FMAX(c,
 254                         vir_FMIN(c, val, vir_uniform_f(c, 1.0)),
 255                         vir_uniform_f(c, 0.0));
 256 }
 257
 258 static struct qreg
 259 ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1)
 260 {
 261         vir_MULTOP(c, src0, src1);
 262         return vir_UMUL24(c, src0, src1);
 263 }
 264
 265 static struct qreg
 266 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
 267 {
 268         return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
 269 }
 270
 271 static void
 272 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
 273 {
 274         unsigned unit = instr->texture_index;
 275         int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
 276         int dest_size = nir_tex_instr_dest_size(instr);
 277
 278         struct qreg lod = c->undef;
 279         if (lod_index != -1)
 280                 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
 281
 282         for (int i = 0; i < dest_size; i++) {
 283                 assert(i < 3);
 284                 enum quniform_contents contents;
 285
 286                 if (instr->is_array && i == dest_size - 1)
 287                         contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
 288                 else
 289                         contents = QUNIFORM_TEXTURE_WIDTH + i;
 290
 291                 struct qreg size = vir_uniform(c, contents, unit);
 292
 293                 switch (instr->sampler_dim) {
 294                 case GLSL_SAMPLER_DIM_1D:
 295                 case GLSL_SAMPLER_DIM_2D:
 296                 case GLSL_SAMPLER_DIM_3D:
 297                 case GLSL_SAMPLER_DIM_CUBE:
 298                         /* Don't minify the array size. */
 299                         if (!(instr->is_array && i == dest_size - 1)) {
 300                                 size = ntq_minify(c, size, lod);
 301                         }
 302                         break;
 303
 304                 case GLSL_SAMPLER_DIM_RECT:
 305                         /* There's no LOD field for rects */
 306                         break;
 307
 308                 default:
 309                         unreachable("Bad sampler type");
 310                 }
 311
 312                 ntq_store_dest(c, &instr->dest, i, size);
 313         }
 314 }
 315
 316 static void
 317 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 318 {
 319         unsigned unit = instr->texture_index;
 320
 321         /* Since each texture sampling op requires uploading uniforms to
 322          * reference the texture, there's no HW support for texture size and
 323          * you just upload uniforms containing the size.
 324          */
 325         switch (instr->op) {
 326         case nir_texop_query_levels:
 327                 ntq_store_dest(c, &instr->dest, 0,
 328                                vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
 329                 return;
 330         case nir_texop_txs:
 331                 ntq_emit_txs(c, instr);
 332                 return;
 333         default:
 334                 break;
 335         }
 336
 337         v3d33_vir_emit_tex(c, instr);
 338 }
 339
 340 static struct qreg
 341 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
 342 {
 343         struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
 344         if (is_cos)
 345                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
 346
 347         struct qreg periods = vir_FROUND(c, input);
 348         struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN,
 349                                          vir_FSUB(c, input, periods));
 350         return vir_XOR(c, sin_output, vir_SHL(c,
 351                                               vir_FTOIN(c, periods),
 352                                               vir_uniform_ui(c, -1)));
 353 }
 354
 355 static struct qreg
 356 ntq_fsign(struct v3d_compile *c, struct qreg src)
 357 {
 358         struct qreg t = vir_get_temp(c);
 359
 360         vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
 361         vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
 362         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
 363         vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
 364         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
 365         return vir_MOV(c, t);
 366 }
 367
 368 static struct qreg
 369 ntq_isign(struct v3d_compile *c, struct qreg src)
 370 {
 371         struct qreg t = vir_get_temp(c);
 372
 373         vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
 374         vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
 375         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
 376         vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
 377         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
 378         return vir_MOV(c, t);
 379 }
 380
 381 static void
 382 emit_fragcoord_input(struct v3d_compile *c, int attr)
 383 {
 384         c->inputs[attr * 4 + 0] = vir_FXCD(c);
 385         c->inputs[attr * 4 + 1] = vir_FYCD(c);
 386         c->inputs[attr * 4 + 2] = c->payload_z;
 387         c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP,
 388                                           c->payload_w);
 389 }
 390
 391 static struct qreg
 392 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
 393                       uint8_t swizzle)
 394 {
 395         struct qreg vary = vir_reg(QFILE_VARY, ~0);
 396         struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
 397
 398         /* For gl_PointCoord input or distance along a line, we'll be called
 399          * with no nir_variable, and we don't count toward VPM size so we
 400          * don't track an input slot.
 401          */
 402         if (!var) {
 403                 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
 404         }
 405
 406         int i = c->num_inputs++;
 407         c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location,
 408                                                              swizzle);
 409
 410         switch (var->data.interpolation) {
 411         case INTERP_MODE_NONE:
 412                 /* If a gl_FrontColor or gl_BackColor input has no interp
 413                  * qualifier, then if we're using glShadeModel(GL_FLAT) it
 414                  * needs to be flat shaded.
 415                  */
 416                 switch (var->data.location) {
 417                 case VARYING_SLOT_COL0:
 418                 case VARYING_SLOT_COL1:
 419                 case VARYING_SLOT_BFC0:
 420                 case VARYING_SLOT_BFC1:
 421                         if (c->fs_key->shade_model_flat) {
 422                                 BITSET_SET(c->flat_shade_flags, i);
 423                                 vir_MOV_dest(c, c->undef, vary);
 424                                 return vir_MOV(c, r5);
 425                         } else {
 426                                 return vir_FADD(c, vir_FMUL(c, vary,
 427                                                             c->payload_w), r5);
 428                         }
 429                 default:
 430                         break;
 431                 }
 432                 /* FALLTHROUGH */
 433         case INTERP_MODE_SMOOTH:
 434                 if (var->data.centroid) {
 435                         return vir_FADD(c, vir_FMUL(c, vary,
 436                                                     c->payload_w_centroid), r5);
 437                 } else {
 438                         return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
 439                 }
 440         case INTERP_MODE_NOPERSPECTIVE:
 441                 /* C appears after the mov from the varying.
 442                    XXX: improve ldvary setup.
 443                 */
 444                 return vir_FADD(c, vir_MOV(c, vary), r5);
 445         case INTERP_MODE_FLAT:
 446                 BITSET_SET(c->flat_shade_flags, i);
 447                 vir_MOV_dest(c, c->undef, vary);
 448                 return vir_MOV(c, r5);
 449         default:
 450                 unreachable("Bad interp mode");
 451         }
 452 }
 453
 454 static void
 455 emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var)
 456 {
 457         for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
 458                 int chan = var->data.location_frac + i;
 459                 c->inputs[attr * 4 + chan] =
 460                         emit_fragment_varying(c, var, chan);
 461         }
 462 }
 463
 464 static void
 465 add_output(struct v3d_compile *c,
 466            uint32_t decl_offset,
 467            uint8_t slot,
 468            uint8_t swizzle)
 469 {
 470         uint32_t old_array_size = c->outputs_array_size;
 471         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
 472                           decl_offset + 1);
 473
 474         if (old_array_size != c->outputs_array_size) {
 475                 c->output_slots = reralloc(c,
 476                                            c->output_slots,
 477                                            struct v3d_varying_slot,
 478                                            c->outputs_array_size);
 479         }
 480
 481         c->output_slots[decl_offset] =
 482                 v3d_slot_from_slot_and_component(slot, swizzle);
 483 }
 484
 485 static void
 486 declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
 487 {
 488         unsigned array_id = c->num_ubo_ranges++;
 489         if (array_id >= c->ubo_ranges_array_size) {
 490                 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
 491                                                 array_id + 1);
 492                 c->ubo_ranges = reralloc(c, c->ubo_ranges,
 493                                          struct v3d_ubo_range,
 494                                          c->ubo_ranges_array_size);
 495                 c->ubo_range_used = reralloc(c, c->ubo_range_used,
 496                                              bool,
 497                                              c->ubo_ranges_array_size);
 498         }
 499
 500         c->ubo_ranges[array_id].dst_offset = 0;
 501         c->ubo_ranges[array_id].src_offset = start;
 502         c->ubo_ranges[array_id].size = size;
 503         c->ubo_range_used[array_id] = false;
 504 }
 505
 506 /**
 507  * If compare_instr is a valid comparison instruction, emits the
 508  * compare_instr's comparison and returns the sel_instr's return value based
 509  * on the compare_instr's result.
 510  */
 511 static bool
 512 ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
 513                     nir_alu_instr *compare_instr,
 514                     nir_alu_instr *sel_instr)
 515 {
 516         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
 517         struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
 518         bool cond_invert = false;
 519
 520         switch (compare_instr->op) {
 521         case nir_op_feq:
 522         case nir_op_seq:
 523                 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
 524                 break;
 525         case nir_op_ieq:
 526                 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
 527                 break;
 528
 529         case nir_op_fne:
 530         case nir_op_sne:
 531                 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
 532                 cond_invert = true;
 533                 break;
 534         case nir_op_ine:
 535                 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
 536                 cond_invert = true;
 537                 break;
 538
 539         case nir_op_fge:
 540         case nir_op_sge:
 541                 vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC);
 542                 break;
 543         case nir_op_ige:
 544                 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
 545                 cond_invert = true;
 546                 break;
 547         case nir_op_uge:
 548                 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
 549                 cond_invert = true;
 550                 break;
 551
 552         case nir_op_slt:
 553         case nir_op_flt:
 554                 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN);
 555                 break;
 556         case nir_op_ilt:
 557                 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
 558                 break;
 559         case nir_op_ult:
 560                 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
 561                 break;
 562
 563         default:
 564                 return false;
 565         }
 566
 567         enum v3d_qpu_cond cond = (cond_invert ?
 568                                   V3D_QPU_COND_IFNA :
 569                                   V3D_QPU_COND_IFA);
 570
 571         switch (sel_instr->op) {
 572         case nir_op_seq:
 573         case nir_op_sne:
 574         case nir_op_sge:
 575         case nir_op_slt:
 576                 *dest = vir_SEL(c, cond,
 577                                 vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0));
 578                 break;
 579
 580         case nir_op_bcsel:
 581                 *dest = vir_SEL(c, cond,
 582                                 ntq_get_alu_src(c, sel_instr, 1),
 583                                 ntq_get_alu_src(c, sel_instr, 2));
 584                 break;
 585
 586         default:
 587                 *dest = vir_SEL(c, cond,
 588                                 vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0));
 589                 break;
 590         }
 591
 592         /* Make the temporary for nir_store_dest(). */
 593         *dest = vir_MOV(c, *dest);
 594
 595         return true;
 596 }
 597
 598 /**
 599  * Attempts to fold a comparison generating a boolean result into the
 600  * condition code for selecting between two values, instead of comparing the
 601  * boolean result against 0 to generate the condition code.
 602  */
 603 static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
 604                                   struct qreg *src)
 605 {
 606         if (!instr->src[0].src.is_ssa)
 607                 goto out;
 608         if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
 609                 goto out;
 610         nir_alu_instr *compare =
 611                 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 612         if (!compare)
 613                 goto out;
 614
 615         struct qreg dest;
 616         if (ntq_emit_comparison(c, &dest, compare, instr))
 617                 return dest;
 618
 619 out:
 620         vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
 621         return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
 622 }
 623
 624
 625 static void
 626 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
 627 {
 628         /* This should always be lowered to ALU operations for V3D. */
 629         assert(!instr->dest.saturate);
 630
 631         /* Vectors are special in that they have non-scalarized writemasks,
 632          * and just take the first swizzle channel for each argument in order
 633          * into each writemask channel.
 634          */
 635         if (instr->op == nir_op_vec2 ||
 636             instr->op == nir_op_vec3 ||
 637             instr->op == nir_op_vec4) {
 638                 struct qreg srcs[4];
 639                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
 640                         srcs[i] = ntq_get_src(c, instr->src[i].src,
 641                                               instr->src[i].swizzle[0]);
 642                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
 643                         ntq_store_dest(c, &instr->dest.dest, i,
 644                                        vir_MOV(c, srcs[i]));
 645                 return;
 646         }
 647
 648         /* General case: We can just grab the one used channel per src. */
 649         struct qreg src[nir_op_infos[instr->op].num_inputs];
 650         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 651                 src[i] = ntq_get_alu_src(c, instr, i);
 652         }
 653
 654         struct qreg result;
 655
 656         switch (instr->op) {
 657         case nir_op_fmov:
 658         case nir_op_imov:
 659                 result = vir_MOV(c, src[0]);
 660                 break;
 661
 662         case nir_op_fneg:
 663                 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
 664                 break;
 665         case nir_op_ineg:
 666                 result = vir_NEG(c, src[0]);
 667                 break;
 668
 669         case nir_op_fmul:
 670                 result = vir_FMUL(c, src[0], src[1]);
 671                 break;
 672         case nir_op_fadd:
 673                 result = vir_FADD(c, src[0], src[1]);
 674                 break;
 675         case nir_op_fsub:
 676                 result = vir_FSUB(c, src[0], src[1]);
 677                 break;
 678         case nir_op_fmin:
 679                 result = vir_FMIN(c, src[0], src[1]);
 680                 break;
 681         case nir_op_fmax:
 682                 result = vir_FMAX(c, src[0], src[1]);
 683                 break;
 684
 685         case nir_op_f2i32:
 686                 result = vir_FTOIZ(c, src[0]);
 687                 break;
 688         case nir_op_f2u32:
 689                 result = vir_FTOUZ(c, src[0]);
 690                 break;
 691         case nir_op_i2f32:
 692                 result = vir_ITOF(c, src[0]);
 693                 break;
 694         case nir_op_u2f32:
 695                 result = vir_UTOF(c, src[0]);
 696                 break;
 697         case nir_op_b2f:
 698                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
 699                 break;
 700         case nir_op_b2i:
 701                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
 702                 break;
 703         case nir_op_i2b:
 704         case nir_op_f2b:
 705                 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
 706                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
 707                                             vir_uniform_ui(c, ~0),
 708                                             vir_uniform_ui(c, 0)));
 709                 break;
 710
 711         case nir_op_iadd:
 712                 result = vir_ADD(c, src[0], src[1]);
 713                 break;
 714         case nir_op_ushr:
 715                 result = vir_SHR(c, src[0], src[1]);
 716                 break;
 717         case nir_op_isub:
 718                 result = vir_SUB(c, src[0], src[1]);
 719                 break;
 720         case nir_op_ishr:
 721                 result = vir_ASR(c, src[0], src[1]);
 722                 break;
 723         case nir_op_ishl:
 724                 result = vir_SHL(c, src[0], src[1]);
 725                 break;
 726         case nir_op_imin:
 727                 result = vir_MIN(c, src[0], src[1]);
 728                 break;
 729         case nir_op_umin:
 730                 result = vir_UMIN(c, src[0], src[1]);
 731                 break;
 732         case nir_op_imax:
 733                 result = vir_MAX(c, src[0], src[1]);
 734                 break;
 735         case nir_op_umax:
 736                 result = vir_UMAX(c, src[0], src[1]);
 737                 break;
 738         case nir_op_iand:
 739                 result = vir_AND(c, src[0], src[1]);
 740                 break;
 741         case nir_op_ior:
 742                 result = vir_OR(c, src[0], src[1]);
 743                 break;
 744         case nir_op_ixor:
 745                 result = vir_XOR(c, src[0], src[1]);
 746                 break;
 747         case nir_op_inot:
 748                 result = vir_NOT(c, src[0]);
 749                 break;
 750
 751         case nir_op_imul:
 752                 result = ntq_umul(c, src[0], src[1]);
 753                 break;
 754
 755         case nir_op_seq:
 756         case nir_op_sne:
 757         case nir_op_sge:
 758         case nir_op_slt:
 759         case nir_op_feq:
 760         case nir_op_fne:
 761         case nir_op_fge:
 762         case nir_op_flt:
 763         case nir_op_ieq:
 764         case nir_op_ine:
 765         case nir_op_ige:
 766         case nir_op_uge:
 767         case nir_op_ilt:
 768         case nir_op_ult:
 769                 if (!ntq_emit_comparison(c, &result, instr, instr)) {
 770                         fprintf(stderr, "Bad comparison instruction\n");
 771                 }
 772                 break;
 773
 774         case nir_op_bcsel:
 775                 result = ntq_emit_bcsel(c, instr, src);
 776                 break;
 777         case nir_op_fcsel:
 778                 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
 779                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
 780                                             src[1], src[2]));
 781                 break;
 782
 783         case nir_op_frcp:
 784                 result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]);
 785                 break;
 786         case nir_op_frsq:
 787                 result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]);
 788                 break;
 789         case nir_op_fexp2:
 790                 result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]);
 791                 break;
 792         case nir_op_flog2:
 793                 result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]);
 794                 break;
 795
 796         case nir_op_fceil:
 797                 result = vir_FCEIL(c, src[0]);
 798                 break;
 799         case nir_op_ffloor:
 800                 result = vir_FFLOOR(c, src[0]);
 801                 break;
 802         case nir_op_fround_even:
 803                 result = vir_FROUND(c, src[0]);
 804                 break;
 805         case nir_op_ftrunc:
 806                 result = vir_FTRUNC(c, src[0]);
 807                 break;
 808         case nir_op_ffract:
 809                 result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
 810                 break;
 811
 812         case nir_op_fsin:
 813                 result = ntq_fsincos(c, src[0], false);
 814                 break;
 815         case nir_op_fcos:
 816                 result = ntq_fsincos(c, src[0], true);
 817                 break;
 818
 819         case nir_op_fsign:
 820                 result = ntq_fsign(c, src[0]);
 821                 break;
 822         case nir_op_isign:
 823                 result = ntq_isign(c, src[0]);
 824                 break;
 825
 826         case nir_op_fabs: {
 827                 result = vir_FMOV(c, src[0]);
 828                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
 829                 break;
 830         }
 831
 832         case nir_op_iabs:
 833                 result = vir_MAX(c, src[0],
 834                                 vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
 835                 break;
 836
 837         case nir_op_fddx:
 838         case nir_op_fddx_coarse:
 839         case nir_op_fddx_fine:
 840                 result = vir_FDX(c, src[0]);
 841                 break;
 842
 843         case nir_op_fddy:
 844         case nir_op_fddy_coarse:
 845         case nir_op_fddy_fine:
 846                 result = vir_FDY(c, src[0]);
 847                 break;
 848
 849         default:
 850                 fprintf(stderr, "unknown NIR ALU inst: ");
 851                 nir_print_instr(&instr->instr, stderr);
 852                 fprintf(stderr, "\n");
 853                 abort();
 854         }
 855
 856         /* We have a scalar result, so the instruction should only have a
 857          * single channel written to.
 858          */
 859         assert(util_is_power_of_two(instr->dest.write_mask));
 860         ntq_store_dest(c, &instr->dest.dest,
 861                        ffs(instr->dest.write_mask) - 1, result);
 862 }
 863
/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
 * specifier.  They come from a register that's preloaded with 0xffffffff
 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
 */
#define TLB_TYPE_F16_COLOR         (3 << 6)
#define TLB_TYPE_I32_COLOR         (1 << 6)
#define TLB_TYPE_F32_COLOR         (0 << 6)
#define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
#define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
#define TLB_F16_SWAP_HI_LO         (1 << 1)
#define TLB_VEC_SIZE_4_F16         (1 << 0)
#define TLB_VEC_SIZE_2_F16         (0 << 0)
/* Shift for the (component count - 1) field used with the I32 and F32 color
 * types.
 */
#define TLB_VEC_SIZE_MINUS_1_SHIFT 0

/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
 * flag is set.
 */
#define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
#define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
#define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */

/* Stencil is a single 32-bit write. */
#define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
 889
 890 static void
 891 emit_frag_end(struct v3d_compile *c)
 892 {
 893         /* XXX
 894         if (c->output_sample_mask_index != -1) {
 895                 vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
 896         }
 897         */
 898
 899         bool has_any_tlb_color_write = false;
 900         for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
 901                 if (c->output_color_var[rt])
 902                         has_any_tlb_color_write = true;
 903         }
 904
 905         if (c->output_position_index != -1) {
 906                 struct qinst *inst = vir_MOV_dest(c,
 907                                                   vir_reg(QFILE_TLBU, 0),
 908                                                   c->outputs[c->output_position_index]);
 909
 910                 inst->src[vir_get_implicit_uniform_src(inst)] =
 911                         vir_uniform_ui(c,
 912                                        TLB_TYPE_DEPTH |
 913                                        TLB_DEPTH_TYPE_PER_PIXEL |
 914                                        0xffffff00);
 915         } else if (c->s->info.fs.uses_discard || !has_any_tlb_color_write) {
 916                 /* Emit passthrough Z if it needed to be delayed until shader
 917                  * end due to potential discards.
 918                  *
 919                  * Since (single-threaded) fragment shaders always need a TLB
 920                  * write, emit passthrouh Z if we didn't have any color
 921                  * buffers and flag us as potentially discarding, so that we
 922                  * can use Z as the TLB write.
 923                  */
 924                 c->s->info.fs.uses_discard = true;
 925
 926                 struct qinst *inst = vir_MOV_dest(c,
 927                                                   vir_reg(QFILE_TLBU, 0),
 928                                                   vir_reg(QFILE_NULL, 0));
 929
 930                 inst->src[vir_get_implicit_uniform_src(inst)] =
 931                         vir_uniform_ui(c,
 932                                        TLB_TYPE_DEPTH |
 933                                        TLB_DEPTH_TYPE_INVARIANT |
 934                                        0xffffff00);
 935         }
 936
 937         /* XXX: Performance improvement: Merge Z write and color writes TLB
 938          * uniform setup
 939          */
 940
 941         for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
 942                 if (!c->output_color_var[rt])
 943                         continue;
 944
 945                 nir_variable *var = c->output_color_var[rt];
 946                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
 947                 int num_components = glsl_get_vector_elements(var->type);
 948                 uint32_t conf = 0xffffff00;
 949                 struct qinst *inst;
 950
 951                 conf |= TLB_SAMPLE_MODE_PER_PIXEL;
 952                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
 953
 954                 assert(num_components != 0);
 955                 switch (glsl_get_base_type(var->type)) {
 956                 case GLSL_TYPE_UINT:
 957                 case GLSL_TYPE_INT:
 958                         conf |= TLB_TYPE_I32_COLOR;
 959                         conf |= ((num_components - 1) <<
 960                                  TLB_VEC_SIZE_MINUS_1_SHIFT);
 961
 962                         inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
 963                         inst->src[vir_get_implicit_uniform_src(inst)] =
 964                                 vir_uniform_ui(c, conf);
 965
 966                         for (int i = 1; i < num_components; i++) {
 967                                 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
 968                                                     color[i]);
 969                         }
 970                         break;
 971
 972                 default: {
 973                         struct qreg r = color[0];
 974                         struct qreg g = color[1];
 975                         struct qreg b = color[2];
 976                         struct qreg a = color[3];
 977
 978                         if (c->fs_key->f32_color_rb) {
 979                                 conf |= TLB_TYPE_F32_COLOR;
 980                                 conf |= ((num_components - 1) <<
 981                                          TLB_VEC_SIZE_MINUS_1_SHIFT);
 982                         } else {
 983                                 conf |= TLB_TYPE_F16_COLOR;
 984                                 conf |= TLB_F16_SWAP_HI_LO;
 985                                 if (num_components >= 3)
 986                                         conf |= TLB_VEC_SIZE_4_F16;
 987                                 else
 988                                         conf |= TLB_VEC_SIZE_2_F16;
 989                         }
 990
 991                         if (c->fs_key->swap_color_rb & (1 << rt))  {
 992                                 r = color[2];
 993                                 b = color[0];
 994                         }
 995
 996                         if (c->fs_key->f32_color_rb & (1 << rt)) {
 997                                 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
 998                                 inst->src[vir_get_implicit_uniform_src(inst)] =
 999                                         vir_uniform_ui(c, conf);
1000
1001                                 for (int i = 1; i < num_components; i++) {
1002                                         inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
1003                                                             color[i]);
1004                                 }
1005                         } else {
1006                                 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
1007                                 if (conf != ~0) {
1008                                         inst->dst.file = QFILE_TLBU;
1009                                         inst->src[vir_get_implicit_uniform_src(inst)] =
1010                                                 vir_uniform_ui(c, conf);
1011                                 }
1012
1013                                 if (num_components >= 3)
1014                                         inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
1015                         }
1016                         break;
1017                 }
1018                 }
1019         }
1020 }
1021
1022 static void
1023 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index)
1024 {
1025         if (c->devinfo->ver >= 40) {
1026                 vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val);
1027                 *vpm_index = *vpm_index + 1;
1028         } else {
1029                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1030         }
1031
1032         c->num_vpm_writes++;
1033 }
1034
1035 static void
1036 emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w,
1037                            uint32_t *vpm_index)
1038 {
1039         for (int i = 0; i < 2; i++) {
1040                 struct qreg coord = c->outputs[c->output_position_index + i];
1041                 coord = vir_FMUL(c, coord,
1042                                  vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
1043                                              0));
1044                 coord = vir_FMUL(c, coord, rcp_w);
1045                 vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index);
1046         }
1047
1048 }
1049
1050 static void
1051 emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1052 {
1053         struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1054         struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1055
1056         struct qreg z = c->outputs[c->output_position_index + 2];
1057         z = vir_FMUL(c, z, zscale);
1058         z = vir_FMUL(c, z, rcp_w);
1059         z = vir_FADD(c, z, zoffset);
1060         vir_VPM_WRITE(c, z, vpm_index);
1061 }
1062
1063 static void
1064 emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1065 {
1066         vir_VPM_WRITE(c, rcp_w, vpm_index);
1067 }
1068
1069 static void
1070 emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index)
1071 {
1072         struct qreg point_size;
1073
1074         if (c->output_point_size_index != -1)
1075                 point_size = c->outputs[c->output_point_size_index];
1076         else
1077                 point_size = vir_uniform_f(c, 1.0);
1078
1079         /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
1080          * BCM21553).
1081          */
1082         point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
1083
1084         vir_VPM_WRITE(c, point_size, vpm_index);
1085 }
1086
1087 static void
1088 emit_vpm_write_setup(struct v3d_compile *c)
1089 {
1090         if (c->devinfo->ver >= 40)
1091                 return;
1092
1093         v3d33_vir_vpm_write_setup(c);
1094 }
1095
1096 static void
1097 emit_vert_end(struct v3d_compile *c)
1098 {
1099         uint32_t vpm_index = 0;
1100         struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP,
1101                                     c->outputs[c->output_position_index + 3]);
1102
1103         emit_vpm_write_setup(c);
1104
1105         if (c->vs_key->is_coord) {
1106                 for (int i = 0; i < 4; i++)
1107                         vir_VPM_WRITE(c, c->outputs[c->output_position_index + i],
1108                                       &vpm_index);
1109                 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1110                 if (c->vs_key->per_vertex_point_size) {
1111                         emit_point_size_write(c, &vpm_index);
1112                         /* emit_rcp_wc_write(c, rcp_w); */
1113                 }
1114                 /* XXX: Z-only rendering */
1115                 if (0)
1116                         emit_zs_write(c, rcp_w, &vpm_index);
1117         } else {
1118                 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1119                 emit_zs_write(c, rcp_w, &vpm_index);
1120                 emit_rcp_wc_write(c, rcp_w, &vpm_index);
1121                 if (c->vs_key->per_vertex_point_size)
1122                         emit_point_size_write(c, &vpm_index);
1123         }
1124
1125         for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
1126                 struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
1127                 int j;
1128
1129                 for (j = 0; j < c->num_outputs; j++) {
1130                         struct v3d_varying_slot output = c->output_slots[j];
1131
1132                         if (!memcmp(&input, &output, sizeof(input))) {
1133                                 vir_VPM_WRITE(c, c->outputs[j],
1134                                               &vpm_index);
1135                                 break;
1136                         }
1137                 }
1138                 /* Emit padding if we didn't find a declared VS output for
1139                  * this FS input.
1140                  */
1141                 if (j == c->num_outputs)
1142                         vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
1143                                       &vpm_index);
1144         }
1145
1146         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1147          */
1148         if (c->devinfo->ver >= 40 && c->devinfo->ver <= 41)
1149                 vir_VPMWT(c);
1150 }
1151
1152 void
1153 v3d_optimize_nir(struct nir_shader *s)
1154 {
1155         bool progress;
1156
1157         do {
1158                 progress = false;
1159
1160                 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1161                 NIR_PASS(progress, s, nir_lower_alu_to_scalar);
1162                 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
1163                 NIR_PASS(progress, s, nir_copy_prop);
1164                 NIR_PASS(progress, s, nir_opt_remove_phis);
1165                 NIR_PASS(progress, s, nir_opt_dce);
1166                 NIR_PASS(progress, s, nir_opt_dead_cf);
1167                 NIR_PASS(progress, s, nir_opt_cse);
1168                 NIR_PASS(progress, s, nir_opt_peephole_select, 8);
1169                 NIR_PASS(progress, s, nir_opt_algebraic);
1170                 NIR_PASS(progress, s, nir_opt_constant_folding);
1171                 NIR_PASS(progress, s, nir_opt_undef);
1172         } while (progress);
1173 }
1174
1175 static int
1176 driver_location_compare(const void *in_a, const void *in_b)
1177 {
1178         const nir_variable *const *a = in_a;
1179         const nir_variable *const *b = in_b;
1180
1181         return (*a)->data.driver_location - (*b)->data.driver_location;
1182 }
1183
1184 static struct qreg
1185 ntq_emit_vpm_read(struct v3d_compile *c,
1186                   uint32_t *num_components_queued,
1187                   uint32_t *remaining,
1188                   uint32_t vpm_index)
1189 {
1190         struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1191
1192         if (c->devinfo->ver >= 40 ) {
1193                 return vir_LDVPMV_IN(c,
1194                                      vir_uniform_ui(c,
1195                                                     (*num_components_queued)++));
1196         }
1197
1198         if (*num_components_queued != 0) {
1199                 (*num_components_queued)--;
1200                 c->num_inputs++;
1201                 return vir_MOV(c, vpm);
1202         }
1203
1204         uint32_t num_components = MIN2(*remaining, 32);
1205
1206         v3d33_vir_vpm_read_setup(c, num_components);
1207
1208         *num_components_queued = num_components - 1;
1209         *remaining -= num_components;
1210         c->num_inputs++;
1211
1212         return vir_MOV(c, vpm);
1213 }
1214
1215 static void
1216 ntq_setup_inputs(struct v3d_compile *c)
1217 {
1218         unsigned num_entries = 0;
1219         unsigned num_components = 0;
1220         nir_foreach_variable(var, &c->s->inputs) {
1221                 num_entries++;
1222                 num_components += glsl_get_components(var->type);
1223         }
1224
1225         nir_variable *vars[num_entries];
1226
1227         unsigned i = 0;
1228         nir_foreach_variable(var, &c->s->inputs)
1229                 vars[i++] = var;
1230
1231         /* Sort the variables so that we emit the input setup in
1232          * driver_location order.  This is required for VPM reads, whose data
1233          * is fetched into the VPM in driver_location (TGSI register index)
1234          * order.
1235          */
1236         qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1237
1238         uint32_t vpm_components_queued = 0;
1239         if (c->s->info.stage == MESA_SHADER_VERTEX) {
1240                 bool uses_iid = c->s->info.system_values_read &
1241                         (1ull << SYSTEM_VALUE_INSTANCE_ID);
1242                 bool uses_vid = c->s->info.system_values_read &
1243                         (1ull << SYSTEM_VALUE_VERTEX_ID);
1244
1245                 num_components += uses_iid;
1246                 num_components += uses_vid;
1247
1248                 if (uses_iid) {
1249                         c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1250                                                    &num_components, ~0);
1251                 }
1252
1253                 if (uses_vid) {
1254                         c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1255                                                    &num_components, ~0);
1256                 }
1257         }
1258
1259         for (unsigned i = 0; i < num_entries; i++) {
1260                 nir_variable *var = vars[i];
1261                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1262                 unsigned loc = var->data.driver_location;
1263
1264                 assert(array_len == 1);
1265                 (void)array_len;
1266                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1267                                   (loc + 1) * 4);
1268
1269                 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1270                         if (var->data.location == VARYING_SLOT_POS) {
1271                                 emit_fragcoord_input(c, loc);
1272                         } else if (var->data.location == VARYING_SLOT_PNTC ||
1273                                    (var->data.location >= VARYING_SLOT_VAR0 &&
1274                                     (c->fs_key->point_sprite_mask &
1275                                      (1 << (var->data.location -
1276                                             VARYING_SLOT_VAR0))))) {
1277                                 c->inputs[loc * 4 + 0] = c->point_x;
1278                                 c->inputs[loc * 4 + 1] = c->point_y;
1279                         } else {
1280                                 emit_fragment_input(c, loc, var);
1281                         }
1282                 } else {
1283                         int var_components = glsl_get_components(var->type);
1284
1285                         for (int i = 0; i < var_components; i++) {
1286                                 c->inputs[loc * 4 + i] =
1287                                         ntq_emit_vpm_read(c,
1288                                                           &vpm_components_queued,
1289                                                           &num_components,
1290                                                           loc * 4 + i);
1291
1292                         }
1293                         c->vattr_sizes[loc] = var_components;
1294                 }
1295         }
1296
1297         if (c->s->info.stage == MESA_SHADER_VERTEX) {
1298                 if (c->devinfo->ver >= 40) {
1299                         assert(vpm_components_queued == num_components);
1300                 } else {
1301                         assert(vpm_components_queued == 0);
1302                         assert(num_components == 0);
1303                 }
1304         }
1305 }
1306
1307 static void
1308 ntq_setup_outputs(struct v3d_compile *c)
1309 {
1310         nir_foreach_variable(var, &c->s->outputs) {
1311                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1312                 unsigned loc = var->data.driver_location * 4;
1313
1314                 assert(array_len == 1);
1315                 (void)array_len;
1316
1317                 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
1318                         add_output(c, loc + var->data.location_frac + i,
1319                                    var->data.location,
1320                                    var->data.location_frac + i);
1321                 }
1322
1323                 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1324                         switch (var->data.location) {
1325                         case FRAG_RESULT_COLOR:
1326                                 c->output_color_var[0] = var;
1327                                 c->output_color_var[1] = var;
1328                                 c->output_color_var[2] = var;
1329                                 c->output_color_var[3] = var;
1330                                 break;
1331                         case FRAG_RESULT_DATA0:
1332                         case FRAG_RESULT_DATA1:
1333                         case FRAG_RESULT_DATA2:
1334                         case FRAG_RESULT_DATA3:
1335                                 c->output_color_var[var->data.location -
1336                                                     FRAG_RESULT_DATA0] = var;
1337                                 break;
1338                         case FRAG_RESULT_DEPTH:
1339                                 c->output_position_index = loc;
1340                                 break;
1341                         case FRAG_RESULT_SAMPLE_MASK:
1342                                 c->output_sample_mask_index = loc;
1343                                 break;
1344                         }
1345                 } else {
1346                         switch (var->data.location) {
1347                         case VARYING_SLOT_POS:
1348                                 c->output_position_index = loc;
1349                                 break;
1350                         case VARYING_SLOT_PSIZ:
1351                                 c->output_point_size_index = loc;
1352                                 break;
1353                         }
1354                 }
1355         }
1356 }
1357
1358 static void
1359 ntq_setup_uniforms(struct v3d_compile *c)
1360 {
1361         nir_foreach_variable(var, &c->s->uniforms) {
1362                 uint32_t vec4_count = glsl_count_attribute_slots(var->type,
1363                                                                  false);
1364                 unsigned vec4_size = 4 * sizeof(float);
1365
1366                 declare_uniform_range(c, var->data.driver_location * vec4_size,
1367                                       vec4_count * vec4_size);
1368
1369         }
1370 }
1371
1372 /**
1373  * Sets up the mapping from nir_register to struct qreg *.
1374  *
1375  * Each nir_register gets a struct qreg per 32-bit component being stored.
1376  */
1377 static void
1378 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
1379 {
1380         foreach_list_typed(nir_register, nir_reg, node, list) {
1381                 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1382                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1383                                                   array_len *
1384                                                   nir_reg->num_components);
1385
1386                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1387
1388                 for (int i = 0; i < array_len * nir_reg->num_components; i++)
1389                         qregs[i] = vir_get_temp(c);
1390         }
1391 }
1392
1393 static void
1394 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
1395 {
1396         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1397         for (int i = 0; i < instr->def.num_components; i++)
1398                 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
1399
1400         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1401 }
1402
1403 static void
1404 ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
1405 {
1406         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1407
1408         /* VIR needs there to be *some* value, so pick 0 (same as for
1409          * ntq_setup_registers().
1410          */
1411         for (int i = 0; i < instr->def.num_components; i++)
1412                 qregs[i] = vir_uniform_ui(c, 0);
1413 }
1414
1415 static void
1416 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
1417 {
1418         nir_const_value *const_offset;
1419         unsigned offset;
1420
1421         switch (instr->intrinsic) {
1422         case nir_intrinsic_load_uniform:
1423                 assert(instr->num_components == 1);
1424                 const_offset = nir_src_as_const_value(instr->src[0]);
1425                 if (const_offset) {
1426                         offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1427                         assert(offset % 4 == 0);
1428                         /* We need dwords */
1429                         offset = offset / 4;
1430                         ntq_store_dest(c, &instr->dest, 0,
1431                                        vir_uniform(c, QUNIFORM_UNIFORM,
1432                                                    offset));
1433                 } else {
1434                         ntq_store_dest(c, &instr->dest, 0,
1435                                        indirect_uniform_load(c, instr));
1436                 }
1437                 break;
1438
1439         case nir_intrinsic_load_ubo:
1440                 for (int i = 0; i < instr->num_components; i++) {
1441                         int ubo = nir_src_as_const_value(instr->src[0])->u32[0];
1442
1443                         /* Adjust for where we stored the TGSI register base. */
1444                         vir_ADD_dest(c,
1445                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
1446                                      vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
1447                                      vir_ADD(c,
1448                                              ntq_get_src(c, instr->src[1], 0),
1449                                              vir_uniform_ui(c, i * 4)));
1450
1451                         vir_emit_thrsw(c);
1452
1453                         ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
1454                 }
1455                 break;
1456
1457                 const_offset = nir_src_as_const_value(instr->src[0]);
1458                 if (const_offset) {
1459                         offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1460                         assert(offset % 4 == 0);
1461                         /* We need dwords */
1462                         offset = offset / 4;
1463                         ntq_store_dest(c, &instr->dest, 0,
1464                                        vir_uniform(c, QUNIFORM_UNIFORM,
1465                                                    offset));
1466                 } else {
1467                         ntq_store_dest(c, &instr->dest, 0,
1468                                        indirect_uniform_load(c, instr));
1469                 }
1470                 break;
1471
1472         case nir_intrinsic_load_user_clip_plane:
1473                 for (int i = 0; i < instr->num_components; i++) {
1474                         ntq_store_dest(c, &instr->dest, i,
1475                                        vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1476                                                    nir_intrinsic_ucp_id(instr) *
1477                                                    4 + i));
1478                 }
1479                 break;
1480
1481         case nir_intrinsic_load_alpha_ref_float:
1482                 ntq_store_dest(c, &instr->dest, 0,
1483                                vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
1484                 break;
1485
1486         case nir_intrinsic_load_sample_mask_in:
1487                 ntq_store_dest(c, &instr->dest, 0,
1488                                vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1489                 break;
1490
1491         case nir_intrinsic_load_front_face:
1492                 /* The register contains 0 (front) or 1 (back), and we need to
1493                  * turn it into a NIR bool where true means front.
1494                  */
1495                 ntq_store_dest(c, &instr->dest, 0,
1496                                vir_ADD(c,
1497                                        vir_uniform_ui(c, -1),
1498                                        vir_REVF(c)));
1499                 break;
1500
1501         case nir_intrinsic_load_instance_id:
1502                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
1503                 break;
1504
1505         case nir_intrinsic_load_vertex_id:
1506                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
1507                 break;
1508
1509         case nir_intrinsic_load_input:
1510                 const_offset = nir_src_as_const_value(instr->src[0]);
1511                 assert(const_offset && "v3d doesn't support indirect inputs");
1512                 for (int i = 0; i < instr->num_components; i++) {
1513                         offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1514                         int comp = nir_intrinsic_component(instr) + i;
1515                         ntq_store_dest(c, &instr->dest, i,
1516                                        vir_MOV(c, c->inputs[offset * 4 + comp]));
1517                 }
1518                 break;
1519
1520         case nir_intrinsic_store_output:
1521                 const_offset = nir_src_as_const_value(instr->src[1]);
1522                 assert(const_offset && "v3d doesn't support indirect outputs");
1523                 offset = ((nir_intrinsic_base(instr) +
1524                            const_offset->u32[0]) * 4 +
1525                           nir_intrinsic_component(instr));
1526
1527                 for (int i = 0; i < instr->num_components; i++) {
1528                         c->outputs[offset + i] =
1529                                 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
1530                 }
1531                 c->num_outputs = MAX2(c->num_outputs,
1532                                       offset + instr->num_components);
1533                 break;
1534
1535         case nir_intrinsic_discard:
1536                 if (c->execute.file != QFILE_NULL) {
1537                         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1538                         vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1539                                                      vir_uniform_ui(c, 0)),
1540                                 V3D_QPU_COND_IFA);
1541                 } else {
1542                         vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1543                                         vir_uniform_ui(c, 0));
1544                 }
1545                 break;
1546
1547         case nir_intrinsic_discard_if: {
1548                 /* true (~0) if we're discarding */
1549                 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1550
1551                 if (c->execute.file != QFILE_NULL) {
1552                         /* execute == 0 means the channel is active.  Invert
1553                          * the condition so that we can use zero as "executing
1554                          * and discarding."
1555                          */
1556                         vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
1557                                V3D_QPU_PF_PUSHZ);
1558                         vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1559                                                      vir_uniform_ui(c, 0)),
1560                                      V3D_QPU_COND_IFA);
1561                 } else {
1562                         vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
1563                         vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1564                                                      vir_uniform_ui(c, 0)),
1565                                      V3D_QPU_COND_IFNA);
1566                 }
1567
1568                 break;
1569         }
1570
1571         default:
1572                 fprintf(stderr, "Unknown intrinsic: ");
1573                 nir_print_instr(&instr->instr, stderr);
1574                 fprintf(stderr, "\n");
1575                 break;
1576         }
1577 }
1578
1579 /* Clears (activates) the execute flags for any channels whose jump target
1580  * matches this block.
1581  */
1582 static void
1583 ntq_activate_execute_for_block(struct v3d_compile *c)
1584 {
1585         vir_PF(c, vir_SUB(c, c->execute, vir_uniform_ui(c, c->cur_block->index)),
1586                V3D_QPU_PF_PUSHZ);
1587
1588         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1589 }
1590
1591 static void
1592 ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
1593 {
1594         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1595         bool empty_else_block =
1596                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1597                  exec_list_is_empty(&nir_else_block->instr_list));
1598
1599         struct qblock *then_block = vir_new_block(c);
1600         struct qblock *after_block = vir_new_block(c);
1601         struct qblock *else_block;
1602         if (empty_else_block)
1603                 else_block = after_block;
1604         else
1605                 else_block = vir_new_block(c);
1606
1607         bool was_top_level = false;
1608         if (c->execute.file == QFILE_NULL) {
1609                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1610                 was_top_level = true;
1611         }
1612
1613         /* Set A for executing (execute == 0) and jumping (if->condition ==
1614          * 0) channels, and then update execute flags for those to point to
1615          * the ELSE block.
1616          */
1617         vir_PF(c, vir_OR(c,
1618                          c->execute,
1619                          ntq_get_src(c, if_stmt->condition, 0)),
1620                 V3D_QPU_PF_PUSHZ);
1621         vir_MOV_cond(c, V3D_QPU_COND_IFA,
1622                      c->execute,
1623                      vir_uniform_ui(c, else_block->index));
1624
1625         /* Jump to ELSE if nothing is active for THEN, otherwise fall
1626          * through.
1627          */
1628         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1629         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
1630         vir_link_blocks(c->cur_block, else_block);
1631         vir_link_blocks(c->cur_block, then_block);
1632
1633         /* Process the THEN block. */
1634         vir_set_emit_block(c, then_block);
1635         ntq_emit_cf_list(c, &if_stmt->then_list);
1636
1637         if (!empty_else_block) {
1638                 /* Handle the end of the THEN block.  First, all currently
1639                  * active channels update their execute flags to point to
1640                  * ENDIF
1641                  */
1642                 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1643                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1644                              vir_uniform_ui(c, after_block->index));
1645
1646                 /* If everything points at ENDIF, then jump there immediately. */
1647                 vir_PF(c, vir_SUB(c, c->execute,
1648                                   vir_uniform_ui(c, after_block->index)),
1649                        V3D_QPU_PF_PUSHZ);
1650                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
1651                 vir_link_blocks(c->cur_block, after_block);
1652                 vir_link_blocks(c->cur_block, else_block);
1653
1654                 vir_set_emit_block(c, else_block);
1655                 ntq_activate_execute_for_block(c);
1656                 ntq_emit_cf_list(c, &if_stmt->else_list);
1657         }
1658
1659         vir_link_blocks(c->cur_block, after_block);
1660
1661         vir_set_emit_block(c, after_block);
1662         if (was_top_level)
1663                 c->execute = c->undef;
1664         else
1665                 ntq_activate_execute_for_block(c);
1666 }
1667
1668 static void
1669 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
1670 {
1671         switch (jump->type) {
1672         case nir_jump_break:
1673                 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1674                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1675                              vir_uniform_ui(c, c->loop_break_block->index));
1676                 break;
1677
1678         case nir_jump_continue:
1679                 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1680                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1681                              vir_uniform_ui(c, c->loop_cont_block->index));
1682                 break;
1683
1684         case nir_jump_return:
1685                 unreachable("All returns shouold be lowered\n");
1686         }
1687 }
1688
1689 static void
1690 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
1691 {
1692         switch (instr->type) {
1693         case nir_instr_type_alu:
1694                 ntq_emit_alu(c, nir_instr_as_alu(instr));
1695                 break;
1696
1697         case nir_instr_type_intrinsic:
1698                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1699                 break;
1700
1701         case nir_instr_type_load_const:
1702                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
1703                 break;
1704
1705         case nir_instr_type_ssa_undef:
1706                 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
1707                 break;
1708
1709         case nir_instr_type_tex:
1710                 ntq_emit_tex(c, nir_instr_as_tex(instr));
1711                 break;
1712
1713         case nir_instr_type_jump:
1714                 ntq_emit_jump(c, nir_instr_as_jump(instr));
1715                 break;
1716
1717         default:
1718                 fprintf(stderr, "Unknown NIR instr type: ");
1719                 nir_print_instr(instr, stderr);
1720                 fprintf(stderr, "\n");
1721                 abort();
1722         }
1723 }
1724
1725 static void
1726 ntq_emit_block(struct v3d_compile *c, nir_block *block)
1727 {
1728         nir_foreach_instr(instr, block) {
1729                 ntq_emit_instr(c, instr);
1730         }
1731 }
1732
/* Forward declaration: ntq_emit_loop() below calls ntq_emit_cf_list(), which
 * is only defined further down and in turn calls back into ntq_emit_loop().
 */
static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
1734
1735 static void
1736 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
1737 {
1738         bool was_top_level = false;
1739         if (c->execute.file == QFILE_NULL) {
1740                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1741                 was_top_level = true;
1742         }
1743
1744         struct qblock *save_loop_cont_block = c->loop_cont_block;
1745         struct qblock *save_loop_break_block = c->loop_break_block;
1746
1747         c->loop_cont_block = vir_new_block(c);
1748         c->loop_break_block = vir_new_block(c);
1749
1750         vir_link_blocks(c->cur_block, c->loop_cont_block);
1751         vir_set_emit_block(c, c->loop_cont_block);
1752         ntq_activate_execute_for_block(c);
1753
1754         ntq_emit_cf_list(c, &loop->body);
1755
1756         /* Re-enable any previous continues now, so our ANYA check below
1757          * works.
1758          *
1759          * XXX: Use the .ORZ flags update, instead.
1760          */
1761         vir_PF(c, vir_SUB(c,
1762                           c->execute,
1763                           vir_uniform_ui(c, c->loop_cont_block->index)),
1764                V3D_QPU_PF_PUSHZ);
1765         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1766
1767         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1768
1769         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
1770         vir_link_blocks(c->cur_block, c->loop_cont_block);
1771         vir_link_blocks(c->cur_block, c->loop_break_block);
1772
1773         vir_set_emit_block(c, c->loop_break_block);
1774         if (was_top_level)
1775                 c->execute = c->undef;
1776         else
1777                 ntq_activate_execute_for_block(c);
1778
1779         c->loop_break_block = save_loop_break_block;
1780         c->loop_cont_block = save_loop_cont_block;
1781 }
1782
1783 static void
1784 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
1785 {
1786         fprintf(stderr, "FUNCTIONS not handled.\n");
1787         abort();
1788 }
1789
1790 static void
1791 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
1792 {
1793         foreach_list_typed(nir_cf_node, node, node, list) {
1794                 switch (node->type) {
1795                 case nir_cf_node_block:
1796                         ntq_emit_block(c, nir_cf_node_as_block(node));
1797                         break;
1798
1799                 case nir_cf_node_if:
1800                         ntq_emit_if(c, nir_cf_node_as_if(node));
1801                         break;
1802
1803                 case nir_cf_node_loop:
1804                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
1805                         break;
1806
1807                 case nir_cf_node_function:
1808                         ntq_emit_function(c, nir_cf_node_as_function(node));
1809                         break;
1810
1811                 default:
1812                         fprintf(stderr, "Unknown NIR node type\n");
1813                         abort();
1814                 }
1815         }
1816 }
1817
1818 static void
1819 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
1820 {
1821         ntq_setup_registers(c, &impl->registers);
1822         ntq_emit_cf_list(c, &impl->body);
1823 }
1824
1825 static void
1826 nir_to_vir(struct v3d_compile *c)
1827 {
1828         if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1829                 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
1830                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
1831                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
1832
1833                 if (c->fs_key->is_points) {
1834                         c->point_x = emit_fragment_varying(c, NULL, 0);
1835                         c->point_y = emit_fragment_varying(c, NULL, 0);
1836                 } else if (c->fs_key->is_lines) {
1837                         c->line_x = emit_fragment_varying(c, NULL, 0);
1838                 }
1839         }
1840
1841         ntq_setup_inputs(c);
1842         ntq_setup_outputs(c);
1843         ntq_setup_uniforms(c);
1844         ntq_setup_registers(c, &c->s->registers);
1845
1846         /* Find the main function and emit the body. */
1847         nir_foreach_function(function, c->s) {
1848                 assert(strcmp(function->name, "main") == 0);
1849                 assert(function->impl);
1850                 ntq_emit_impl(c, function->impl);
1851         }
1852 }
1853
/* NIR compiler options for the V3D backend.
 *
 * NOTE(review): going by the field names, each lower_* flag presumably asks
 * NIR to rewrite that operation in terms of simpler ones so that this file
 * does not have to emit it directly, and native_integers presumably
 * advertises real integer support.  Confirm against the definition of
 * nir_shader_compiler_options in nir.h.
 */
const nir_shader_compiler_options v3d_nir_options = {
        .lower_extract_byte = true,
        .lower_extract_word = true,
        .lower_bitfield_insert = true,
        .lower_bitfield_extract = true,
        .lower_pack_unorm_2x16 = true,
        .lower_pack_snorm_2x16 = true,
        .lower_pack_unorm_4x8 = true,
        .lower_pack_snorm_4x8 = true,
        .lower_unpack_unorm_4x8 = true,
        .lower_unpack_snorm_4x8 = true,
        .lower_fdiv = true,
        .lower_ffma = true,
        .lower_flrp32 = true,
        .lower_fpow = true,
        .lower_fsat = true,
        .lower_fsqrt = true,
        .native_integers = true,
};
1873
1874
/* Compiled out with #if 0.  If re-enabled, returns the total number of
 * instructions in the shader, skipping functions that have no
 * implementation.
 */
#if 0
static int
count_nir_instrs(nir_shader *nir)
{
        int count = 0;
        nir_foreach_function(function, nir) {
                /* Functions without an implementation have no blocks. */
                if (!function->impl)
                        continue;
                nir_foreach_block(block, function->impl) {
                        nir_foreach_instr(instr, block)
                                count++;
                }
        }
        return count;
}
#endif
1891
1892 /**
1893  * When demoting a shader down to single-threaded, removes the THRSW
1894  * instructions (one will still be inserted at v3d_vir_to_qpu() for the
1895  * program end).
1896  */
1897 static void
1898 vir_remove_thrsw(struct v3d_compile *c)
1899 {
1900         vir_for_each_block(block, c) {
1901                 vir_for_each_inst_safe(inst, block) {
1902                         if (inst->qpu.sig.thrsw)
1903                                 vir_remove_instruction(c, inst);
1904                 }
1905         }
1906
1907         c->last_thrsw = NULL;
1908 }
1909
/* Finalizes thread-switch (THRSW) state once the shader body has been
 * emitted: either drops threading altogether (V3D before 4.1), or makes sure
 * a final THRSW exists where one is needed and flags c->last_thrsw through
 * is_last_thrsw.
 */
static void
vir_emit_last_thrsw(struct v3d_compile *c)
{
        /* On V3D before 4.1, we need a TMU op to be outstanding when thread
         * switching, so disable threads if we didn't do any TMU ops (each of
         * which would have emitted a THRSW).
         *
         * NOTE(review): the test below keys off last_thrsw_at_top_level
         * rather than off whether any THRSW exists, so it presumably also
         * fires when the most recent THRSW was emitted inside conditional
         * code -- confirm that this is intended.
         */
        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
                c->threads = 1;
                if (c->last_thrsw)
                        vir_remove_thrsw(c);
                return;
        }

        /* If we're threaded and the last THRSW was in conditional code, then
         * we need to emit another one so that we can flag it as the last
         * thrsw.
         */
        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
                /* Older versions already returned above. */
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we're threaded, then we need to mark the last THRSW instruction
         * so we can emit a pair of them at QPU emit time.
         *
         * For V3D 4.x, we can spawn the non-fragment shaders already in the
         * post-last-THRSW state, so we can skip this.
         */
        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
                /* Fragment shader with no THRSW at all: emit one. */
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* Presumably vir_emit_thrsw() updates c->last_thrsw, so this flags
         * whichever THRSW ended up last -- TODO confirm.
         */
        if (c->last_thrsw)
                c->last_thrsw->is_last_thrsw = true;
}
1947
1948 void
1949 v3d_nir_to_vir(struct v3d_compile *c)
1950 {
1951         if (V3D_DEBUG & (V3D_DEBUG_NIR |
1952                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
1953                 fprintf(stderr, "%s prog %d/%d NIR:\n",
1954                         vir_get_stage_name(c),
1955                         c->program_id, c->variant_id);
1956                 nir_print_shader(c->s, stderr);
1957         }
1958
1959         nir_to_vir(c);
1960
1961         /* Emit the last THRSW before STVPM and TLB writes. */
1962         vir_emit_last_thrsw(c);
1963
1964         switch (c->s->info.stage) {
1965         case MESA_SHADER_FRAGMENT:
1966                 emit_frag_end(c);
1967                 break;
1968         case MESA_SHADER_VERTEX:
1969                 emit_vert_end(c);
1970                 break;
1971         default:
1972                 unreachable("bad stage");
1973         }
1974
1975         if (V3D_DEBUG & (V3D_DEBUG_VIR |
1976                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
1977                 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
1978                         vir_get_stage_name(c),
1979                         c->program_id, c->variant_id);
1980                 vir_dump(c);
1981                 fprintf(stderr, "\n");
1982         }
1983
1984         vir_optimize(c);
1985         vir_lower_uniforms(c);
1986
1987         /* XXX: vir_schedule_instructions(c); */
1988
1989         if (V3D_DEBUG & (V3D_DEBUG_VIR |
1990                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
1991                 fprintf(stderr, "%s prog %d/%d VIR:\n",
1992                         vir_get_stage_name(c),
1993                         c->program_id, c->variant_id);
1994                 vir_dump(c);
1995                 fprintf(stderr, "\n");
1996         }
1997
1998         /* Compute the live ranges so we can figure out interference. */
1999         vir_calculate_live_intervals(c);
2000
2001         /* Attempt to allocate registers for the temporaries.  If we fail,
2002          * reduce thread count and try again.
2003          */
2004         int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
2005         struct qpu_reg *temp_registers;
2006         while (true) {
2007                 temp_registers = v3d_register_allocate(c);
2008
2009                 if (temp_registers)
2010                         break;
2011
2012                 if (c->threads == min_threads) {
2013                         fprintf(stderr, "Failed to register allocate at %d threads:\n",
2014                                 c->threads);
2015                         vir_dump(c);
2016                         c->failed = true;
2017                         return;
2018                 }
2019
2020                 c->threads /= 2;
2021
2022                 if (c->threads == 1)
2023                         vir_remove_thrsw(c);
2024         }
2025
2026         v3d_vir_to_qpu(c, temp_registers);
2027 }