src/broadcom/compiler/nir_to_vir.c

   1 /*
   2  * Copyright © 2016 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <inttypes.h>
  25 #include "util/u_format.h"
  26 #include "util/u_math.h"
  27 #include "util/u_memory.h"
  28 #include "util/ralloc.h"
  29 #include "util/hash_table.h"
  30 #include "compiler/nir/nir.h"
  31 #include "compiler/nir/nir_builder.h"
  32 #include "common/v3d_device_info.h"
  33 #include "v3d_compiler.h"
  34
  35 #define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
  36 #define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
  37 #define GENERAL_TMU_READ_OP_PREFETCH                (0 << 3)
  38 #define GENERAL_TMU_READ_OP_CACHE_CLEAR             (1 << 3)
  39 #define GENERAL_TMU_READ_OP_CACHE_FLUSH             (3 << 3)
  40 #define GENERAL_TMU_READ_OP_CACHE_CLEAN             (3 << 3)
  41 #define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR         (4 << 3)
  42 #define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
  43 #define GENERAL_TMU_READ_OP_ATOMIC_INC              (8 << 3)
  44 #define GENERAL_TMU_READ_OP_ATOMIC_DEC              (9 << 3)
  45 #define GENERAL_TMU_READ_OP_ATOMIC_NOT              (10 << 3)
  46 #define GENERAL_TMU_READ_OP_READ                    (15 << 3)
  47 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
  48 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
  49 #define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
  50 #define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
  51 #define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
  52 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
  53 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
  54 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)
  55
  56 #define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP         (0 << 3)
  57 #define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP         (1 << 3)
  58 #define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG             (2 << 3)
  59 #define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG          (3 << 3)
  60 #define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN             (4 << 3)
  61 #define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX             (5 << 3)
  62 #define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN             (6 << 3)
  63 #define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX             (7 << 3)
  64 #define GENERAL_TMU_WRITE_OP_ATOMIC_AND              (8 << 3)
  65 #define GENERAL_TMU_WRITE_OP_ATOMIC_OR               (9 << 3)
  66 #define GENERAL_TMU_WRITE_OP_ATOMIC_XOR              (10 << 3)
  67 #define GENERAL_TMU_WRITE_OP_WRITE                   (15 << 3)
  68
  69 static void
  70 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
  71
  72 static void
  73 resize_qreg_array(struct v3d_compile *c,
  74                   struct qreg **regs,
  75                   uint32_t *size,
  76                   uint32_t decl_size)
  77 {
  78         if (*size >= decl_size)
  79                 return;
  80
  81         uint32_t old_size = *size;
  82         *size = MAX2(*size * 2, decl_size);
  83         *regs = reralloc(c, *regs, struct qreg, *size);
  84         if (!*regs) {
  85                 fprintf(stderr, "Malloc failure\n");
  86                 abort();
  87         }
  88
  89         for (uint32_t i = old_size; i < *size; i++)
  90                 (*regs)[i] = c->undef;
  91 }
  92
  93 void
  94 vir_emit_thrsw(struct v3d_compile *c)
  95 {
  96         if (c->threads == 1)
  97                 return;
  98
  99         /* Always thread switch after each texture operation for now.
 100          *
 101          * We could do better by batching a bunch of texture fetches up and
 102          * then doing one thread switch and collecting all their results
 103          * afterward.
 104          */
 105         c->last_thrsw = vir_NOP(c);
 106         c->last_thrsw->qpu.sig.thrsw = true;
 107         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
 108 }
 109
 110 static uint32_t
 111 v3d_general_tmu_op(nir_intrinsic_instr *instr)
 112 {
 113         switch (instr->intrinsic) {
 114         case nir_intrinsic_load_ssbo:
 115         case nir_intrinsic_load_ubo:
 116         case nir_intrinsic_load_uniform:
 117                 return GENERAL_TMU_READ_OP_READ;
 118         case nir_intrinsic_store_ssbo:
 119                 return GENERAL_TMU_WRITE_OP_WRITE;
 120         case nir_intrinsic_ssbo_atomic_add:
 121                 return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
 122         case nir_intrinsic_ssbo_atomic_imin:
 123                 return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
 124         case nir_intrinsic_ssbo_atomic_umin:
 125                 return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
 126         case nir_intrinsic_ssbo_atomic_imax:
 127                 return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
 128         case nir_intrinsic_ssbo_atomic_umax:
 129                 return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
 130         case nir_intrinsic_ssbo_atomic_and:
 131                 return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
 132         case nir_intrinsic_ssbo_atomic_or:
 133                 return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
 134         case nir_intrinsic_ssbo_atomic_xor:
 135                 return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
 136         case nir_intrinsic_ssbo_atomic_exchange:
 137                 return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
 138         case nir_intrinsic_ssbo_atomic_comp_swap:
 139                 return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
 140         default:
 141                 unreachable("unknown intrinsic op");
 142         }
 143 }
 144
 145 /**
 146  * Implements indirect uniform loads and SSBO accesses through the TMU general
 147  * memory access interface.
 148  */
 149 static void
 150 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
 151 {
 152         /* XXX perf: We should turn add/sub of 1 to inc/dec.  Perhaps NIR
 153          * wants to have support for inc/dec?
 154          */
 155
 156         uint32_t tmu_op = v3d_general_tmu_op(instr);
 157         bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo;
 158
 159         int offset_src;
 160         int tmu_writes = 1; /* address */
 161         if (instr->intrinsic == nir_intrinsic_load_uniform) {
 162                 offset_src = 0;
 163         } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
 164                    instr->intrinsic == nir_intrinsic_load_ubo) {
 165                 offset_src = 1;
 166         } else if (is_store) {
 167                 offset_src = 2;
 168                 for (int i = 0; i < instr->num_components; i++) {
 169                         vir_MOV_dest(c,
 170                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
 171                                      ntq_get_src(c, instr->src[0], i));
 172                         tmu_writes++;
 173                 }
 174         } else {
 175                 offset_src = 1;
 176                 vir_MOV_dest(c,
 177                              vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
 178                              ntq_get_src(c, instr->src[2], 0));
 179                 tmu_writes++;
 180                 if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
 181                         vir_MOV_dest(c,
 182                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
 183                                      ntq_get_src(c, instr->src[3], 0));
 184                         tmu_writes++;
 185                 }
 186         }
 187
 188         /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
 189          * storing at the same time.
 190          */
 191         while (tmu_writes > 16 / c->threads)
 192                 c->threads /= 2;
 193
 194         struct qreg offset;
 195         if (instr->intrinsic == nir_intrinsic_load_uniform) {
 196                 offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 0);
 197
 198                 /* Find what variable in the default uniform block this
 199                  * uniform load is coming from.
 200                  */
 201                 uint32_t base = nir_intrinsic_base(instr);
 202                 int i;
 203                 struct v3d_ubo_range *range = NULL;
 204                 for (i = 0; i < c->num_ubo_ranges; i++) {
 205                         range = &c->ubo_ranges[i];
 206                         if (base >= range->src_offset &&
 207                             base < range->src_offset + range->size) {
 208                                 break;
 209                         }
 210                 }
 211                 /* The driver-location-based offset always has to be within a
 212                  * declared uniform range.
 213                  */
 214                 assert(i != c->num_ubo_ranges);
 215                 if (!c->ubo_range_used[i]) {
 216                         c->ubo_range_used[i] = true;
 217                         range->dst_offset = c->next_ubo_dst_offset;
 218                         c->next_ubo_dst_offset += range->size;
 219                 }
 220
 221                 base = base - range->src_offset + range->dst_offset;
 222
 223                 if (base != 0)
 224                         offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
 225         } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
 226                 /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
 227                  * 1 (0 is gallium's constant buffer 0).
 228                  */
 229                 offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
 230                                      nir_src_as_uint(instr->src[0]) + 1);
 231         } else {
 232                 offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
 233                                      nir_src_as_uint(instr->src[is_store ?
 234                                                                 1 : 0]));
 235         }
 236
 237         uint32_t config = (0xffffff00 |
 238                            tmu_op |
 239                            GENERAL_TMU_LOOKUP_PER_PIXEL);
 240         if (instr->num_components == 1) {
 241                 config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
 242         } else {
 243                 config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
 244                            instr->num_components - 2);
 245         }
 246
 247         if (c->execute.file != QFILE_NULL)
 248                 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
 249
 250         struct qreg dest;
 251         if (config == ~0)
 252                 dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
 253         else
 254                 dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
 255
 256         struct qinst *tmu;
 257         if (nir_src_is_const(instr->src[offset_src]) &&
 258             nir_src_as_uint(instr->src[offset_src]) == 0) {
 259                 tmu = vir_MOV_dest(c, dest, offset);
 260         } else {
 261                 tmu = vir_ADD_dest(c, dest,
 262                                    offset,
 263                                    ntq_get_src(c, instr->src[offset_src], 0));
 264         }
 265
 266         if (config != ~0) {
 267                 tmu->src[vir_get_implicit_uniform_src(tmu)] =
 268                         vir_uniform_ui(c, config);
 269         }
 270
 271         if (c->execute.file != QFILE_NULL)
 272                 vir_set_cond(tmu, V3D_QPU_COND_IFA);
 273
 274         vir_emit_thrsw(c);
 275
 276         /* Read the result, or wait for the TMU op to complete. */
 277         for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
 278                 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
 279
 280         if (nir_intrinsic_dest_components(instr) == 0)
 281                 vir_TMUWT(c);
 282 }
 283
 284 static struct qreg *
 285 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
 286 {
 287         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
 288                                           def->num_components);
 289         _mesa_hash_table_insert(c->def_ht, def, qregs);
 290         return qregs;
 291 }
 292
 293 /**
 294  * This function is responsible for getting VIR results into the associated
 295  * storage for a NIR instruction.
 296  *
 297  * If it's a NIR SSA def, then we just set the associated hash table entry to
 298  * the new result.
 299  *
 300  * If it's a NIR reg, then we need to update the existing qreg assigned to the
 301  * NIR destination with the incoming value.  To do that without introducing
 302  * new MOVs, we require that the incoming qreg either be a uniform, or be
 303  * SSA-defined by the previous VIR instruction in the block and rewritable by
 304  * this function.  That lets us sneak ahead and insert the SF flag beforehand
 305  * (knowing that the previous instruction doesn't depend on flags) and rewrite
 306  * its destination to be the NIR reg's destination
 307  */
 308 void
 309 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
 310                struct qreg result)
 311 {
 312         struct qinst *last_inst = NULL;
 313         if (!list_empty(&c->cur_block->instructions))
 314                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
 315
 316         assert(result.file == QFILE_UNIF ||
 317                (result.file == QFILE_TEMP &&
 318                 last_inst && last_inst == c->defs[result.index]));
 319
 320         if (dest->is_ssa) {
 321                 assert(chan < dest->ssa.num_components);
 322
 323                 struct qreg *qregs;
 324                 struct hash_entry *entry =
 325                         _mesa_hash_table_search(c->def_ht, &dest->ssa);
 326
 327                 if (entry)
 328                         qregs = entry->data;
 329                 else
 330                         qregs = ntq_init_ssa_def(c, &dest->ssa);
 331
 332                 qregs[chan] = result;
 333         } else {
 334                 nir_register *reg = dest->reg.reg;
 335                 assert(dest->reg.base_offset == 0);
 336                 assert(reg->num_array_elems == 0);
 337                 struct hash_entry *entry =
 338                         _mesa_hash_table_search(c->def_ht, reg);
 339                 struct qreg *qregs = entry->data;
 340
 341                 /* Insert a MOV if the source wasn't an SSA def in the
 342                  * previous instruction.
 343                  */
 344                 if (result.file == QFILE_UNIF) {
 345                         result = vir_MOV(c, result);
 346                         last_inst = c->defs[result.index];
 347                 }
 348
 349                 /* We know they're both temps, so just rewrite index. */
 350                 c->defs[last_inst->dst.index] = NULL;
 351                 last_inst->dst.index = qregs[chan].index;
 352
 353                 /* If we're in control flow, then make this update of the reg
 354                  * conditional on the execution mask.
 355                  */
 356                 if (c->execute.file != QFILE_NULL) {
 357                         last_inst->dst.index = qregs[chan].index;
 358
 359                         /* Set the flags to the current exec mask.
 360                          */
 361                         c->cursor = vir_before_inst(last_inst);
 362                         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
 363                         c->cursor = vir_after_inst(last_inst);
 364
 365                         vir_set_cond(last_inst, V3D_QPU_COND_IFA);
 366                         last_inst->cond_is_exec_mask = true;
 367                 }
 368         }
 369 }
 370
 371 struct qreg
 372 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
 373 {
 374         struct hash_entry *entry;
 375         if (src.is_ssa) {
 376                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
 377                 assert(i < src.ssa->num_components);
 378         } else {
 379                 nir_register *reg = src.reg.reg;
 380                 entry = _mesa_hash_table_search(c->def_ht, reg);
 381                 assert(reg->num_array_elems == 0);
 382                 assert(src.reg.base_offset == 0);
 383                 assert(i < reg->num_components);
 384         }
 385
 386         struct qreg *qregs = entry->data;
 387         return qregs[i];
 388 }
 389
 390 static struct qreg
 391 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
 392                 unsigned src)
 393 {
 394         assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
 395         unsigned chan = ffs(instr->dest.write_mask) - 1;
 396         struct qreg r = ntq_get_src(c, instr->src[src].src,
 397                                     instr->src[src].swizzle[chan]);
 398
 399         assert(!instr->src[src].abs);
 400         assert(!instr->src[src].negate);
 401
 402         return r;
 403 };
 404
 405 static struct qreg
 406 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
 407 {
 408         return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
 409 }
 410
 411 static void
 412 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
 413 {
 414         unsigned unit = instr->texture_index;
 415         int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
 416         int dest_size = nir_tex_instr_dest_size(instr);
 417
 418         struct qreg lod = c->undef;
 419         if (lod_index != -1)
 420                 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
 421
 422         for (int i = 0; i < dest_size; i++) {
 423                 assert(i < 3);
 424                 enum quniform_contents contents;
 425
 426                 if (instr->is_array && i == dest_size - 1)
 427                         contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
 428                 else
 429                         contents = QUNIFORM_TEXTURE_WIDTH + i;
 430
 431                 struct qreg size = vir_uniform(c, contents, unit);
 432
 433                 switch (instr->sampler_dim) {
 434                 case GLSL_SAMPLER_DIM_1D:
 435                 case GLSL_SAMPLER_DIM_2D:
 436                 case GLSL_SAMPLER_DIM_MS:
 437                 case GLSL_SAMPLER_DIM_3D:
 438                 case GLSL_SAMPLER_DIM_CUBE:
 439                         /* Don't minify the array size. */
 440                         if (!(instr->is_array && i == dest_size - 1)) {
 441                                 size = ntq_minify(c, size, lod);
 442                         }
 443                         break;
 444
 445                 case GLSL_SAMPLER_DIM_RECT:
 446                         /* There's no LOD field for rects */
 447                         break;
 448
 449                 default:
 450                         unreachable("Bad sampler type");
 451                 }
 452
 453                 ntq_store_dest(c, &instr->dest, i, size);
 454         }
 455 }
 456
 457 static void
 458 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 459 {
 460         unsigned unit = instr->texture_index;
 461
 462         /* Since each texture sampling op requires uploading uniforms to
 463          * reference the texture, there's no HW support for texture size and
 464          * you just upload uniforms containing the size.
 465          */
 466         switch (instr->op) {
 467         case nir_texop_query_levels:
 468                 ntq_store_dest(c, &instr->dest, 0,
 469                                vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
 470                 return;
 471         case nir_texop_txs:
 472                 ntq_emit_txs(c, instr);
 473                 return;
 474         default:
 475                 break;
 476         }
 477
 478         if (c->devinfo->ver >= 40)
 479                 v3d40_vir_emit_tex(c, instr);
 480         else
 481                 v3d33_vir_emit_tex(c, instr);
 482 }
 483
 484 static struct qreg
 485 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
 486 {
 487         struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
 488         if (is_cos)
 489                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
 490
 491         struct qreg periods = vir_FROUND(c, input);
 492         struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
 493         return vir_XOR(c, sin_output, vir_SHL(c,
 494                                               vir_FTOIN(c, periods),
 495                                               vir_uniform_ui(c, -1)));
 496 }
 497
 498 static struct qreg
 499 ntq_fsign(struct v3d_compile *c, struct qreg src)
 500 {
 501         struct qreg t = vir_get_temp(c);
 502
 503         vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
 504         vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
 505         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
 506         vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
 507         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
 508         return vir_MOV(c, t);
 509 }
 510
 511 static struct qreg
 512 ntq_isign(struct v3d_compile *c, struct qreg src)
 513 {
 514         struct qreg t = vir_get_temp(c);
 515
 516         vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
 517         vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
 518         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
 519         vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
 520         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
 521         return vir_MOV(c, t);
 522 }
 523
 524 static void
 525 emit_fragcoord_input(struct v3d_compile *c, int attr)
 526 {
 527         c->inputs[attr * 4 + 0] = vir_FXCD(c);
 528         c->inputs[attr * 4 + 1] = vir_FYCD(c);
 529         c->inputs[attr * 4 + 2] = c->payload_z;
 530         c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
 531 }
 532
 533 static struct qreg
 534 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
 535                       uint8_t swizzle, int array_index)
 536 {
 537         struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
 538         struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
 539
 540         struct qreg vary;
 541         if (c->devinfo->ver >= 41) {
 542                 struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
 543                                                     c->undef, c->undef);
 544                 ldvary->qpu.sig.ldvary = true;
 545                 vary = vir_emit_def(c, ldvary);
 546         } else {
 547                 vir_NOP(c)->qpu.sig.ldvary = true;
 548                 vary = r3;
 549         }
 550
 551         /* For gl_PointCoord input or distance along a line, we'll be called
 552          * with no nir_variable, and we don't count toward VPM size so we
 553          * don't track an input slot.
 554          */
 555         if (!var) {
 556                 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
 557         }
 558
 559         int i = c->num_inputs++;
 560         c->input_slots[i] =
 561                 v3d_slot_from_slot_and_component(var->data.location +
 562                                                  array_index, swizzle);
 563
 564         switch (var->data.interpolation) {
 565         case INTERP_MODE_NONE:
 566                 /* If a gl_FrontColor or gl_BackColor input has no interp
 567                  * qualifier, then if we're using glShadeModel(GL_FLAT) it
 568                  * needs to be flat shaded.
 569                  */
 570                 switch (var->data.location + array_index) {
 571                 case VARYING_SLOT_COL0:
 572                 case VARYING_SLOT_COL1:
 573                 case VARYING_SLOT_BFC0:
 574                 case VARYING_SLOT_BFC1:
 575                         if (c->fs_key->shade_model_flat) {
 576                                 BITSET_SET(c->flat_shade_flags, i);
 577                                 vir_MOV_dest(c, c->undef, vary);
 578                                 return vir_MOV(c, r5);
 579                         } else {
 580                                 return vir_FADD(c, vir_FMUL(c, vary,
 581                                                             c->payload_w), r5);
 582                         }
 583                 default:
 584                         break;
 585                 }
 586                 /* FALLTHROUGH */
 587         case INTERP_MODE_SMOOTH:
 588                 if (var->data.centroid) {
 589                         BITSET_SET(c->centroid_flags, i);
 590                         return vir_FADD(c, vir_FMUL(c, vary,
 591                                                     c->payload_w_centroid), r5);
 592                 } else {
 593                         return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
 594                 }
 595         case INTERP_MODE_NOPERSPECTIVE:
 596                 BITSET_SET(c->noperspective_flags, i);
 597                 return vir_FADD(c, vir_MOV(c, vary), r5);
 598         case INTERP_MODE_FLAT:
 599                 BITSET_SET(c->flat_shade_flags, i);
 600                 vir_MOV_dest(c, c->undef, vary);
 601                 return vir_MOV(c, r5);
 602         default:
 603                 unreachable("Bad interp mode");
 604         }
 605 }
 606
 607 static void
 608 emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
 609                     int array_index)
 610 {
 611         for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
 612                 int chan = var->data.location_frac + i;
 613                 c->inputs[attr * 4 + chan] =
 614                         emit_fragment_varying(c, var, chan, array_index);
 615         }
 616 }
 617
 618 static void
 619 add_output(struct v3d_compile *c,
 620            uint32_t decl_offset,
 621            uint8_t slot,
 622            uint8_t swizzle)
 623 {
 624         uint32_t old_array_size = c->outputs_array_size;
 625         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
 626                           decl_offset + 1);
 627
 628         if (old_array_size != c->outputs_array_size) {
 629                 c->output_slots = reralloc(c,
 630                                            c->output_slots,
 631                                            struct v3d_varying_slot,
 632                                            c->outputs_array_size);
 633         }
 634
 635         c->output_slots[decl_offset] =
 636                 v3d_slot_from_slot_and_component(slot, swizzle);
 637 }
 638
 639 static void
 640 declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
 641 {
 642         unsigned array_id = c->num_ubo_ranges++;
 643         if (array_id >= c->ubo_ranges_array_size) {
 644                 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
 645                                                 array_id + 1);
 646                 c->ubo_ranges = reralloc(c, c->ubo_ranges,
 647                                          struct v3d_ubo_range,
 648                                          c->ubo_ranges_array_size);
 649                 c->ubo_range_used = reralloc(c, c->ubo_range_used,
 650                                              bool,
 651                                              c->ubo_ranges_array_size);
 652         }
 653
 654         c->ubo_ranges[array_id].dst_offset = 0;
 655         c->ubo_ranges[array_id].src_offset = start;
 656         c->ubo_ranges[array_id].size = size;
 657         c->ubo_range_used[array_id] = false;
 658 }
 659
 660 /**
 661  * If compare_instr is a valid comparison instruction, emits the
 662  * compare_instr's comparison and returns the sel_instr's return value based
 663  * on the compare_instr's result.
 664  */
 665 static bool
 666 ntq_emit_comparison(struct v3d_compile *c,
 667                     nir_alu_instr *compare_instr,
 668                     enum v3d_qpu_cond *out_cond)
 669 {
 670         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
 671         struct qreg src1;
 672         if (nir_op_infos[compare_instr->op].num_inputs > 1)
 673                 src1 = ntq_get_alu_src(c, compare_instr, 1);
 674         bool cond_invert = false;
 675         struct qreg nop = vir_reg(QFILE_NULL, 0);
 676
 677         switch (compare_instr->op) {
 678         case nir_op_feq32:
 679         case nir_op_seq:
 680                 vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
 681                 break;
 682         case nir_op_ieq32:
 683                 vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
 684                 break;
 685
 686         case nir_op_fne32:
 687         case nir_op_sne:
 688                 vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
 689                 cond_invert = true;
 690                 break;
 691         case nir_op_ine32:
 692                 vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
 693                 cond_invert = true;
 694                 break;
 695
 696         case nir_op_fge32:
 697         case nir_op_sge:
 698                 vir_set_pf(vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
 699                 break;
 700         case nir_op_ige32:
 701                 vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
 702                 cond_invert = true;
 703                 break;
 704         case nir_op_uge32:
 705                 vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
 706                 cond_invert = true;
 707                 break;
 708
 709         case nir_op_slt:
 710         case nir_op_flt32:
 711                 vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
 712                 break;
 713         case nir_op_ilt32:
 714                 vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
 715                 break;
 716         case nir_op_ult32:
 717                 vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
 718                 break;
 719
 720         default:
 721                 return false;
 722         }
 723
 724         *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
 725
 726         return true;
 727 }
 728
 729 /* Finds an ALU instruction that generates our src value that could
 730  * (potentially) be greedily emitted in the consuming instruction.
 731  */
 732 static struct nir_alu_instr *
 733 ntq_get_alu_parent(nir_src src)
 734 {
 735         if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
 736                 return NULL;
 737         nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
 738         if (!instr)
 739                 return NULL;
 740
 741         /* If the ALU instr's srcs are non-SSA, then we would have to avoid
 742          * moving emission of the ALU instr down past another write of the
 743          * src.
 744          */
 745         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 746                 if (!instr->src[i].src.is_ssa)
 747                         return NULL;
 748         }
 749
 750         return instr;
 751 }
 752
 753 /**
 754  * Attempts to fold a comparison generating a boolean result into the
 755  * condition code for selecting between two values, instead of comparing the
 756  * boolean result against 0 to generate the condition code.
 757  */
 758 static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
 759                                   struct qreg *src)
 760 {
 761         nir_alu_instr *compare = ntq_get_alu_parent(instr->src[0].src);
 762         if (!compare)
 763                 goto out;
 764
 765         enum v3d_qpu_cond cond;
 766         if (ntq_emit_comparison(c, compare, &cond))
 767                 return vir_MOV(c, vir_SEL(c, cond, src[1], src[2]));
 768
 769 out:
 770         vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
 771         return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
 772 }
 773
 774
 775 static void
 776 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
 777 {
 778         /* This should always be lowered to ALU operations for V3D. */
 779         assert(!instr->dest.saturate);
 780
 781         /* Vectors are special in that they have non-scalarized writemasks,
 782          * and just take the first swizzle channel for each argument in order
 783          * into each writemask channel.
 784          */
 785         if (instr->op == nir_op_vec2 ||
 786             instr->op == nir_op_vec3 ||
 787             instr->op == nir_op_vec4) {
 788                 struct qreg srcs[4];
 789                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
 790                         srcs[i] = ntq_get_src(c, instr->src[i].src,
 791                                               instr->src[i].swizzle[0]);
 792                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
 793                         ntq_store_dest(c, &instr->dest.dest, i,
 794                                        vir_MOV(c, srcs[i]));
 795                 return;
 796         }
 797
 798         /* General case: We can just grab the one used channel per src. */
 799         struct qreg src[nir_op_infos[instr->op].num_inputs];
 800         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 801                 src[i] = ntq_get_alu_src(c, instr, i);
 802         }
 803
 804         struct qreg result;
 805
 806         switch (instr->op) {
 807         case nir_op_fmov:
 808         case nir_op_imov:
 809                 result = vir_MOV(c, src[0]);
 810                 break;
 811
 812         case nir_op_fneg:
 813                 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
 814                 break;
 815         case nir_op_ineg:
 816                 result = vir_NEG(c, src[0]);
 817                 break;
 818
 819         case nir_op_fmul:
 820                 result = vir_FMUL(c, src[0], src[1]);
 821                 break;
 822         case nir_op_fadd:
 823                 result = vir_FADD(c, src[0], src[1]);
 824                 break;
 825         case nir_op_fsub:
 826                 result = vir_FSUB(c, src[0], src[1]);
 827                 break;
 828         case nir_op_fmin:
 829                 result = vir_FMIN(c, src[0], src[1]);
 830                 break;
 831         case nir_op_fmax:
 832                 result = vir_FMAX(c, src[0], src[1]);
 833                 break;
 834
 835         case nir_op_f2i32:
 836                 result = vir_FTOIZ(c, src[0]);
 837                 break;
 838         case nir_op_f2u32:
 839                 result = vir_FTOUZ(c, src[0]);
 840                 break;
 841         case nir_op_i2f32:
 842                 result = vir_ITOF(c, src[0]);
 843                 break;
 844         case nir_op_u2f32:
 845                 result = vir_UTOF(c, src[0]);
 846                 break;
 847         case nir_op_b2f32:
 848                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
 849                 break;
 850         case nir_op_b2i32:
 851                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
 852                 break;
 853         case nir_op_i2b32:
 854         case nir_op_f2b32:
 855                 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
 856                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
 857                                             vir_uniform_ui(c, ~0),
 858                                             vir_uniform_ui(c, 0)));
 859                 break;
 860
 861         case nir_op_iadd:
 862                 result = vir_ADD(c, src[0], src[1]);
 863                 break;
 864         case nir_op_ushr:
 865                 result = vir_SHR(c, src[0], src[1]);
 866                 break;
 867         case nir_op_isub:
 868                 result = vir_SUB(c, src[0], src[1]);
 869                 break;
 870         case nir_op_ishr:
 871                 result = vir_ASR(c, src[0], src[1]);
 872                 break;
 873         case nir_op_ishl:
 874                 result = vir_SHL(c, src[0], src[1]);
 875                 break;
 876         case nir_op_imin:
 877                 result = vir_MIN(c, src[0], src[1]);
 878                 break;
 879         case nir_op_umin:
 880                 result = vir_UMIN(c, src[0], src[1]);
 881                 break;
 882         case nir_op_imax:
 883                 result = vir_MAX(c, src[0], src[1]);
 884                 break;
 885         case nir_op_umax:
 886                 result = vir_UMAX(c, src[0], src[1]);
 887                 break;
 888         case nir_op_iand:
 889                 result = vir_AND(c, src[0], src[1]);
 890                 break;
 891         case nir_op_ior:
 892                 result = vir_OR(c, src[0], src[1]);
 893                 break;
 894         case nir_op_ixor:
 895                 result = vir_XOR(c, src[0], src[1]);
 896                 break;
 897         case nir_op_inot:
 898                 result = vir_NOT(c, src[0]);
 899                 break;
 900
 901         case nir_op_ufind_msb:
 902                 result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
 903                 break;
 904
 905         case nir_op_imul:
 906                 result = vir_UMUL(c, src[0], src[1]);
 907                 break;
 908
 909         case nir_op_seq:
 910         case nir_op_sne:
 911         case nir_op_sge:
 912         case nir_op_slt: {
 913                 enum v3d_qpu_cond cond;
 914                 MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
 915                 assert(ok);
 916                 result = vir_MOV(c, vir_SEL(c, cond,
 917                                             vir_uniform_f(c, 1.0),
 918                                             vir_uniform_f(c, 0.0)));
 919                 break;
 920         }
 921
 922         case nir_op_feq32:
 923         case nir_op_fne32:
 924         case nir_op_fge32:
 925         case nir_op_flt32:
 926         case nir_op_ieq32:
 927         case nir_op_ine32:
 928         case nir_op_ige32:
 929         case nir_op_uge32:
 930         case nir_op_ilt32:
 931         case nir_op_ult32: {
 932                 enum v3d_qpu_cond cond;
 933                 MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
 934                 assert(ok);
 935                 result = vir_MOV(c, vir_SEL(c, cond,
 936                                             vir_uniform_ui(c, ~0),
 937                                             vir_uniform_ui(c, 0)));
 938                 break;
 939         }
 940
 941         case nir_op_b32csel:
 942                 result = ntq_emit_bcsel(c, instr, src);
 943                 break;
 944         case nir_op_fcsel:
 945                 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
 946                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
 947                                             src[1], src[2]));
 948                 break;
 949
 950         case nir_op_frcp:
 951                 result = vir_RECIP(c, src[0]);
 952                 break;
 953         case nir_op_frsq:
 954                 result = vir_RSQRT(c, src[0]);
 955                 break;
 956         case nir_op_fexp2:
 957                 result = vir_EXP(c, src[0]);
 958                 break;
 959         case nir_op_flog2:
 960                 result = vir_LOG(c, src[0]);
 961                 break;
 962
 963         case nir_op_fceil:
 964                 result = vir_FCEIL(c, src[0]);
 965                 break;
 966         case nir_op_ffloor:
 967                 result = vir_FFLOOR(c, src[0]);
 968                 break;
 969         case nir_op_fround_even:
 970                 result = vir_FROUND(c, src[0]);
 971                 break;
 972         case nir_op_ftrunc:
 973                 result = vir_FTRUNC(c, src[0]);
 974                 break;
 975         case nir_op_ffract:
 976                 result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
 977                 break;
 978
 979         case nir_op_fsin:
 980                 result = ntq_fsincos(c, src[0], false);
 981                 break;
 982         case nir_op_fcos:
 983                 result = ntq_fsincos(c, src[0], true);
 984                 break;
 985
 986         case nir_op_fsign:
 987                 result = ntq_fsign(c, src[0]);
 988                 break;
 989         case nir_op_isign:
 990                 result = ntq_isign(c, src[0]);
 991                 break;
 992
 993         case nir_op_fabs: {
 994                 result = vir_FMOV(c, src[0]);
 995                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
 996                 break;
 997         }
 998
 999         case nir_op_iabs:
1000                 result = vir_MAX(c, src[0],
1001                                 vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
1002                 break;
1003
1004         case nir_op_fddx:
1005         case nir_op_fddx_coarse:
1006         case nir_op_fddx_fine:
1007                 result = vir_FDX(c, src[0]);
1008                 break;
1009
1010         case nir_op_fddy:
1011         case nir_op_fddy_coarse:
1012         case nir_op_fddy_fine:
1013                 result = vir_FDY(c, src[0]);
1014                 break;
1015
1016         case nir_op_uadd_carry:
1017                 vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC);
1018                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1019                                             vir_uniform_ui(c, ~0),
1020                                             vir_uniform_ui(c, 0)));
1021                 break;
1022
1023         case nir_op_pack_half_2x16_split:
1024                 result = vir_VFPACK(c, src[0], src[1]);
1025                 break;
1026
1027         case nir_op_unpack_half_2x16_split_x:
1028                 /* XXX perf: It would be good to be able to merge this unpack
1029                  * with whatever uses our result.
1030                  */
1031                 result = vir_FMOV(c, src[0]);
1032                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1033                 break;
1034
1035         case nir_op_unpack_half_2x16_split_y:
1036                 result = vir_FMOV(c, src[0]);
1037                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1038                 break;
1039
1040         default:
1041                 fprintf(stderr, "unknown NIR ALU inst: ");
1042                 nir_print_instr(&instr->instr, stderr);
1043                 fprintf(stderr, "\n");
1044                 abort();
1045         }
1046
1047         /* We have a scalar result, so the instruction should only have a
1048          * single channel written to.
1049          */
1050         assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1051         ntq_store_dest(c, &instr->dest.dest,
1052                        ffs(instr->dest.write_mask) - 1, result);
1053 }
1054
1055 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1056  * specifier.  They come from a register that's preloaded with 0xffffffff
1057  * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low
1058  * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1059  */
1060 #define TLB_TYPE_F16_COLOR         (3 << 6)
1061 #define TLB_TYPE_I32_COLOR         (1 << 6)
1062 #define TLB_TYPE_F32_COLOR         (0 << 6)
1063 #define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
1064 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1065 #define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
1066 #define TLB_F16_SWAP_HI_LO         (1 << 1)
1067 #define TLB_VEC_SIZE_4_F16         (1 << 0)
1068 #define TLB_VEC_SIZE_2_F16         (0 << 0)
1069 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
1070
1071 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1072  * flag is set.
1073  */
1074 #define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
1075 #define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
1076 #define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
1077 #define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
1078 #define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */
1079
1080 /* Stencil is a single 32-bit write. */
1081 #define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
1082
1083 static void
1084 emit_frag_end(struct v3d_compile *c)
1085 {
1086         /* XXX
1087         if (c->output_sample_mask_index != -1) {
1088                 vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1089         }
1090         */
1091
1092         bool has_any_tlb_color_write = false;
1093         for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
1094                 if (c->output_color_var[rt])
1095                         has_any_tlb_color_write = true;
1096         }
1097
1098         if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1099                 struct nir_variable *var = c->output_color_var[0];
1100                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1101
1102                 vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1103                                 vir_AND(c,
1104                                         vir_MSF(c),
1105                                         vir_FTOC(c, color[3])));
1106         }
1107
1108         if (c->output_position_index != -1) {
1109                 struct qinst *inst = vir_MOV_dest(c,
1110                                                   vir_reg(QFILE_TLBU, 0),
1111                                                   c->outputs[c->output_position_index]);
1112                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1113
1114                 if (c->devinfo->ver >= 42) {
1115                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1116                                           TLB_SAMPLE_MODE_PER_PIXEL);
1117                 } else
1118                         tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
1119
1120                 inst->src[vir_get_implicit_uniform_src(inst)] =
1121                         vir_uniform_ui(c, tlb_specifier | 0xffffff00);
1122         } else if (c->s->info.fs.uses_discard ||
1123                    c->fs_key->sample_alpha_to_coverage ||
1124                    !has_any_tlb_color_write) {
1125                 /* Emit passthrough Z if it needed to be delayed until shader
1126                  * end due to potential discards.
1127                  *
1128                  * Since (single-threaded) fragment shaders always need a TLB
1129                  * write, emit passthrouh Z if we didn't have any color
1130                  * buffers and flag us as potentially discarding, so that we
1131                  * can use Z as the TLB write.
1132                  */
1133                 c->s->info.fs.uses_discard = true;
1134
1135                 struct qinst *inst = vir_MOV_dest(c,
1136                                                   vir_reg(QFILE_TLBU, 0),
1137                                                   vir_reg(QFILE_NULL, 0));
1138                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1139
1140                 if (c->devinfo->ver >= 42) {
1141                         /* The spec says the PER_PIXEL flag is ignored for
1142                          * invariant writes, but the simulator demands it.
1143                          */
1144                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
1145                                           TLB_SAMPLE_MODE_PER_PIXEL);
1146                 } else {
1147                         tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
1148                 }
1149
1150                 inst->src[vir_get_implicit_uniform_src(inst)] =
1151                         vir_uniform_ui(c, tlb_specifier | 0xffffff00);
1152         }
1153
1154         /* XXX: Performance improvement: Merge Z write and color writes TLB
1155          * uniform setup
1156          */
1157
1158         for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
1159                 if (!c->output_color_var[rt])
1160                         continue;
1161
1162                 nir_variable *var = c->output_color_var[rt];
1163                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1164                 int num_components = glsl_get_vector_elements(var->type);
1165                 uint32_t conf = 0xffffff00;
1166                 struct qinst *inst;
1167
1168                 conf |= TLB_SAMPLE_MODE_PER_PIXEL;
1169                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1170
1171                 if (c->fs_key->swap_color_rb & (1 << rt))
1172                         num_components = MAX2(num_components, 3);
1173
1174                 assert(num_components != 0);
1175                 switch (glsl_get_base_type(var->type)) {
1176                 case GLSL_TYPE_UINT:
1177                 case GLSL_TYPE_INT:
1178                         /* The F32 vs I32 distinction was dropped in 4.2. */
1179                         if (c->devinfo->ver < 42)
1180                                 conf |= TLB_TYPE_I32_COLOR;
1181                         else
1182                                 conf |= TLB_TYPE_F32_COLOR;
1183                         conf |= ((num_components - 1) <<
1184                                  TLB_VEC_SIZE_MINUS_1_SHIFT);
1185
1186                         inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
1187                         inst->src[vir_get_implicit_uniform_src(inst)] =
1188                                 vir_uniform_ui(c, conf);
1189
1190                         for (int i = 1; i < num_components; i++) {
1191                                 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
1192                                                     color[i]);
1193                         }
1194                         break;
1195
1196                 default: {
1197                         struct qreg r = color[0];
1198                         struct qreg g = color[1];
1199                         struct qreg b = color[2];
1200                         struct qreg a = color[3];
1201
1202                         if (c->fs_key->f32_color_rb & (1 << rt)) {
1203                                 conf |= TLB_TYPE_F32_COLOR;
1204                                 conf |= ((num_components - 1) <<
1205                                          TLB_VEC_SIZE_MINUS_1_SHIFT);
1206                         } else {
1207                                 conf |= TLB_TYPE_F16_COLOR;
1208                                 conf |= TLB_F16_SWAP_HI_LO;
1209                                 if (num_components >= 3)
1210                                         conf |= TLB_VEC_SIZE_4_F16;
1211                                 else
1212                                         conf |= TLB_VEC_SIZE_2_F16;
1213                         }
1214
1215                         if (c->fs_key->swap_color_rb & (1 << rt))  {
1216                                 r = color[2];
1217                                 b = color[0];
1218                         }
1219
1220                         if (c->fs_key->sample_alpha_to_one)
1221                                 a = vir_uniform_f(c, 1.0);
1222
1223                         if (c->fs_key->f32_color_rb & (1 << rt)) {
1224                                 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), r);
1225                                 inst->src[vir_get_implicit_uniform_src(inst)] =
1226                                         vir_uniform_ui(c, conf);
1227
1228                                 if (num_components >= 2)
1229                                         vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), g);
1230                                 if (num_components >= 3)
1231                                         vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), b);
1232                                 if (num_components >= 4)
1233                                         vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), a);
1234                         } else {
1235                                 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
1236                                 if (conf != ~0) {
1237                                         inst->dst.file = QFILE_TLBU;
1238                                         inst->src[vir_get_implicit_uniform_src(inst)] =
1239                                                 vir_uniform_ui(c, conf);
1240                                 }
1241
1242                                 if (num_components >= 3)
1243                                         inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
1244                         }
1245                         break;
1246                 }
1247                 }
1248         }
1249 }
1250
1251 static void
1252 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index)
1253 {
1254         if (c->devinfo->ver >= 40) {
1255                 vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val);
1256                 *vpm_index = *vpm_index + 1;
1257         } else {
1258                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1259         }
1260
1261         c->num_vpm_writes++;
1262 }
1263
1264 static void
1265 emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w,
1266                            uint32_t *vpm_index)
1267 {
1268         for (int i = 0; i < 2; i++) {
1269                 struct qreg coord = c->outputs[c->output_position_index + i];
1270                 coord = vir_FMUL(c, coord,
1271                                  vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
1272                                              0));
1273                 coord = vir_FMUL(c, coord, rcp_w);
1274                 vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index);
1275         }
1276
1277 }
1278
1279 static void
1280 emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1281 {
1282         struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1283         struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1284
1285         struct qreg z = c->outputs[c->output_position_index + 2];
1286         z = vir_FMUL(c, z, zscale);
1287         z = vir_FMUL(c, z, rcp_w);
1288         z = vir_FADD(c, z, zoffset);
1289         vir_VPM_WRITE(c, z, vpm_index);
1290 }
1291
1292 static void
1293 emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1294 {
1295         vir_VPM_WRITE(c, rcp_w, vpm_index);
1296 }
1297
1298 static void
1299 emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index)
1300 {
1301         struct qreg point_size;
1302
1303         if (c->output_point_size_index != -1)
1304                 point_size = c->outputs[c->output_point_size_index];
1305         else
1306                 point_size = vir_uniform_f(c, 1.0);
1307
1308         /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
1309          * BCM21553).
1310          */
1311         point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
1312
1313         vir_VPM_WRITE(c, point_size, vpm_index);
1314 }
1315
1316 static void
1317 emit_vpm_write_setup(struct v3d_compile *c)
1318 {
1319         if (c->devinfo->ver >= 40)
1320                 return;
1321
1322         v3d33_vir_vpm_write_setup(c);
1323 }
1324
1325 /**
1326  * Sets up c->outputs[c->output_position_index] for the vertex shader
1327  * epilogue, if an output vertex position wasn't specified in the user's
1328  * shader.  This may be the case for transform feedback with rasterizer
1329  * discard enabled.
1330  */
1331 static void
1332 setup_default_position(struct v3d_compile *c)
1333 {
1334         if (c->output_position_index != -1)
1335                 return;
1336
1337         c->output_position_index = c->outputs_array_size;
1338         for (int i = 0; i < 4; i++) {
1339                 add_output(c,
1340                            c->output_position_index + i,
1341                            VARYING_SLOT_POS, i);
1342         }
1343 }
1344
1345 static void
1346 emit_vert_end(struct v3d_compile *c)
1347 {
1348         setup_default_position(c);
1349
1350         uint32_t vpm_index = 0;
1351         struct qreg rcp_w = vir_RECIP(c,
1352                                       c->outputs[c->output_position_index + 3]);
1353
1354         emit_vpm_write_setup(c);
1355
1356         if (c->vs_key->is_coord) {
1357                 for (int i = 0; i < 4; i++)
1358                         vir_VPM_WRITE(c, c->outputs[c->output_position_index + i],
1359                                       &vpm_index);
1360                 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1361                 if (c->vs_key->per_vertex_point_size) {
1362                         emit_point_size_write(c, &vpm_index);
1363                         /* emit_rcp_wc_write(c, rcp_w); */
1364                 }
1365                 /* XXX: Z-only rendering */
1366                 if (0)
1367                         emit_zs_write(c, rcp_w, &vpm_index);
1368         } else {
1369                 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1370                 emit_zs_write(c, rcp_w, &vpm_index);
1371                 emit_rcp_wc_write(c, rcp_w, &vpm_index);
1372                 if (c->vs_key->per_vertex_point_size)
1373                         emit_point_size_write(c, &vpm_index);
1374         }
1375
1376         for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
1377                 struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
1378                 int j;
1379
1380                 for (j = 0; j < c->num_outputs; j++) {
1381                         struct v3d_varying_slot output = c->output_slots[j];
1382
1383                         if (!memcmp(&input, &output, sizeof(input))) {
1384                                 vir_VPM_WRITE(c, c->outputs[j],
1385                                               &vpm_index);
1386                                 break;
1387                         }
1388                 }
1389                 /* Emit padding if we didn't find a declared VS output for
1390                  * this FS input.
1391                  */
1392                 if (j == c->num_outputs)
1393                         vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
1394                                       &vpm_index);
1395         }
1396
1397         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1398          */
1399         if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1400                 vir_VPMWT(c);
1401 }
1402
1403 void
1404 v3d_optimize_nir(struct nir_shader *s)
1405 {
1406         bool progress;
1407
1408         do {
1409                 progress = false;
1410
1411                 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1412                 NIR_PASS(progress, s, nir_lower_alu_to_scalar);
1413                 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
1414                 NIR_PASS(progress, s, nir_copy_prop);
1415                 NIR_PASS(progress, s, nir_opt_remove_phis);
1416                 NIR_PASS(progress, s, nir_opt_dce);
1417                 NIR_PASS(progress, s, nir_opt_dead_cf);
1418                 NIR_PASS(progress, s, nir_opt_cse);
1419                 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1420                 NIR_PASS(progress, s, nir_opt_algebraic);
1421                 NIR_PASS(progress, s, nir_opt_constant_folding);
1422                 NIR_PASS(progress, s, nir_opt_undef);
1423         } while (progress);
1424
1425         NIR_PASS(progress, s, nir_opt_move_load_ubo);
1426 }
1427
1428 static int
1429 driver_location_compare(const void *in_a, const void *in_b)
1430 {
1431         const nir_variable *const *a = in_a;
1432         const nir_variable *const *b = in_b;
1433
1434         return (*a)->data.driver_location - (*b)->data.driver_location;
1435 }
1436
1437 static struct qreg
1438 ntq_emit_vpm_read(struct v3d_compile *c,
1439                   uint32_t *num_components_queued,
1440                   uint32_t *remaining,
1441                   uint32_t vpm_index)
1442 {
1443         struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1444
1445         if (c->devinfo->ver >= 40 ) {
1446                 return vir_LDVPMV_IN(c,
1447                                      vir_uniform_ui(c,
1448                                                     (*num_components_queued)++));
1449         }
1450
1451         if (*num_components_queued != 0) {
1452                 (*num_components_queued)--;
1453                 c->num_inputs++;
1454                 return vir_MOV(c, vpm);
1455         }
1456
1457         uint32_t num_components = MIN2(*remaining, 32);
1458
1459         v3d33_vir_vpm_read_setup(c, num_components);
1460
1461         *num_components_queued = num_components - 1;
1462         *remaining -= num_components;
1463         c->num_inputs++;
1464
1465         return vir_MOV(c, vpm);
1466 }
1467
1468 static void
1469 ntq_setup_vpm_inputs(struct v3d_compile *c)
1470 {
1471         /* Figure out how many components of each vertex attribute the shader
1472          * uses.  Each variable should have been split to individual
1473          * components and unused ones DCEed.  The vertex fetcher will load
1474          * from the start of the attribute to the number of components we
1475          * declare we need in c->vattr_sizes[].
1476          */
1477         nir_foreach_variable(var, &c->s->inputs) {
1478                 /* No VS attribute array support. */
1479                 assert(MAX2(glsl_get_length(var->type), 1) == 1);
1480
1481                 unsigned loc = var->data.driver_location;
1482                 int start_component = var->data.location_frac;
1483                 int num_components = glsl_get_components(var->type);
1484
1485                 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
1486                                            start_component + num_components);
1487         }
1488
1489         unsigned num_components = 0;
1490         uint32_t vpm_components_queued = 0;
1491         bool uses_iid = c->s->info.system_values_read &
1492                 (1ull << SYSTEM_VALUE_INSTANCE_ID);
1493         bool uses_vid = c->s->info.system_values_read &
1494                 (1ull << SYSTEM_VALUE_VERTEX_ID);
1495         num_components += uses_iid;
1496         num_components += uses_vid;
1497
1498         for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
1499                 num_components += c->vattr_sizes[i];
1500
1501         if (uses_iid) {
1502                 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1503                                            &num_components, ~0);
1504         }
1505
1506         if (uses_vid) {
1507                 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1508                                            &num_components, ~0);
1509         }
1510
1511         for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
1512                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1513                                   (loc + 1) * 4);
1514
1515                 for (int i = 0; i < c->vattr_sizes[loc]; i++) {
1516                         c->inputs[loc * 4 + i] =
1517                                 ntq_emit_vpm_read(c,
1518                                                   &vpm_components_queued,
1519                                                   &num_components,
1520                                                   loc * 4 + i);
1521
1522                 }
1523         }
1524
1525         if (c->devinfo->ver >= 40) {
1526                 assert(vpm_components_queued == num_components);
1527         } else {
1528                 assert(vpm_components_queued == 0);
1529                 assert(num_components == 0);
1530         }
1531 }
1532
1533 static void
1534 ntq_setup_fs_inputs(struct v3d_compile *c)
1535 {
1536         unsigned num_entries = 0;
1537         unsigned num_components = 0;
1538         nir_foreach_variable(var, &c->s->inputs) {
1539                 num_entries++;
1540                 num_components += glsl_get_components(var->type);
1541         }
1542
1543         nir_variable *vars[num_entries];
1544
1545         unsigned i = 0;
1546         nir_foreach_variable(var, &c->s->inputs)
1547                 vars[i++] = var;
1548
1549         /* Sort the variables so that we emit the input setup in
1550          * driver_location order.  This is required for VPM reads, whose data
1551          * is fetched into the VPM in driver_location (TGSI register index)
1552          * order.
1553          */
1554         qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1555
1556         for (unsigned i = 0; i < num_entries; i++) {
1557                 nir_variable *var = vars[i];
1558                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1559                 unsigned loc = var->data.driver_location;
1560
1561                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1562                                   (loc + array_len) * 4);
1563
1564                 if (var->data.location == VARYING_SLOT_POS) {
1565                         emit_fragcoord_input(c, loc);
1566                 } else if (var->data.location == VARYING_SLOT_PNTC ||
1567                            (var->data.location >= VARYING_SLOT_VAR0 &&
1568                             (c->fs_key->point_sprite_mask &
1569                              (1 << (var->data.location -
1570                                     VARYING_SLOT_VAR0))))) {
1571                         c->inputs[loc * 4 + 0] = c->point_x;
1572                         c->inputs[loc * 4 + 1] = c->point_y;
1573                 } else {
1574                         for (int j = 0; j < array_len; j++)
1575                                 emit_fragment_input(c, loc + j, var, j);
1576                 }
1577         }
1578 }
1579
1580 static void
1581 ntq_setup_outputs(struct v3d_compile *c)
1582 {
1583         nir_foreach_variable(var, &c->s->outputs) {
1584                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1585                 unsigned loc = var->data.driver_location * 4;
1586
1587                 assert(array_len == 1);
1588                 (void)array_len;
1589
1590                 for (int i = 0; i < 4 - var->data.location_frac; i++) {
1591                         add_output(c, loc + var->data.location_frac + i,
1592                                    var->data.location,
1593                                    var->data.location_frac + i);
1594                 }
1595
1596                 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1597                         switch (var->data.location) {
1598                         case FRAG_RESULT_COLOR:
1599                                 c->output_color_var[0] = var;
1600                                 c->output_color_var[1] = var;
1601                                 c->output_color_var[2] = var;
1602                                 c->output_color_var[3] = var;
1603                                 break;
1604                         case FRAG_RESULT_DATA0:
1605                         case FRAG_RESULT_DATA1:
1606                         case FRAG_RESULT_DATA2:
1607                         case FRAG_RESULT_DATA3:
1608                                 c->output_color_var[var->data.location -
1609                                                     FRAG_RESULT_DATA0] = var;
1610                                 break;
1611                         case FRAG_RESULT_DEPTH:
1612                                 c->output_position_index = loc;
1613                                 break;
1614                         case FRAG_RESULT_SAMPLE_MASK:
1615                                 c->output_sample_mask_index = loc;
1616                                 break;
1617                         }
1618                 } else {
1619                         switch (var->data.location) {
1620                         case VARYING_SLOT_POS:
1621                                 c->output_position_index = loc;
1622                                 break;
1623                         case VARYING_SLOT_PSIZ:
1624                                 c->output_point_size_index = loc;
1625                                 break;
1626                         }
1627                 }
1628         }
1629 }
1630
1631 static void
1632 ntq_setup_uniforms(struct v3d_compile *c)
1633 {
1634         nir_foreach_variable(var, &c->s->uniforms) {
1635                 uint32_t vec4_count = glsl_count_attribute_slots(var->type,
1636                                                                  false);
1637                 unsigned vec4_size = 4 * sizeof(float);
1638
1639                 if (var->data.mode != nir_var_uniform)
1640                         continue;
1641
1642                 declare_uniform_range(c, var->data.driver_location * vec4_size,
1643                                       vec4_count * vec4_size);
1644
1645         }
1646 }
1647
1648 /**
1649  * Sets up the mapping from nir_register to struct qreg *.
1650  *
1651  * Each nir_register gets a struct qreg per 32-bit component being stored.
1652  */
1653 static void
1654 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
1655 {
1656         foreach_list_typed(nir_register, nir_reg, node, list) {
1657                 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1658                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1659                                                   array_len *
1660                                                   nir_reg->num_components);
1661
1662                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1663
1664                 for (int i = 0; i < array_len * nir_reg->num_components; i++)
1665                         qregs[i] = vir_get_temp(c);
1666         }
1667 }
1668
1669 static void
1670 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
1671 {
1672         /* XXX perf: Experiment with using immediate loads to avoid having
1673          * these end up in the uniform stream.  Watch out for breaking the
1674          * small immediates optimization in the process!
1675          */
1676         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1677         for (int i = 0; i < instr->def.num_components; i++)
1678                 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
1679
1680         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1681 }
1682
1683 static void
1684 ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
1685 {
1686         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1687
1688         /* VIR needs there to be *some* value, so pick 0 (same as for
1689          * ntq_setup_registers().
1690          */
1691         for (int i = 0; i < instr->def.num_components; i++)
1692                 qregs[i] = vir_uniform_ui(c, 0);
1693 }
1694
1695 static void
1696 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
1697 {
1698         assert(instr->intrinsic == nir_intrinsic_image_deref_size);
1699         nir_variable *var = nir_intrinsic_get_var(instr, 0);
1700         unsigned image_index = var->data.driver_location;
1701         const struct glsl_type *sampler_type = glsl_without_array(var->type);
1702         bool is_array = glsl_sampler_type_is_array(sampler_type);
1703
1704         ntq_store_dest(c, &instr->dest, 0,
1705                        vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
1706         if (instr->num_components > 1) {
1707                 ntq_store_dest(c, &instr->dest, 1,
1708                                vir_uniform(c, QUNIFORM_IMAGE_HEIGHT,
1709                                            image_index));
1710         }
1711         if (instr->num_components > 2) {
1712                 ntq_store_dest(c, &instr->dest, 2,
1713                                vir_uniform(c,
1714                                            is_array ?
1715                                            QUNIFORM_IMAGE_ARRAY_SIZE :
1716                                            QUNIFORM_IMAGE_DEPTH,
1717                                            image_index));
1718         }
1719 }
1720
1721 static void
1722 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
1723 {
1724         unsigned offset;
1725
1726         switch (instr->intrinsic) {
1727         case nir_intrinsic_load_uniform:
1728                 if (nir_src_is_const(instr->src[0])) {
1729                         int offset = (nir_intrinsic_base(instr) +
1730                                       nir_src_as_uint(instr->src[0]));
1731                         assert(offset % 4 == 0);
1732                         /* We need dwords */
1733                         offset = offset / 4;
1734                         for (int i = 0; i < instr->num_components; i++) {
1735                                 ntq_store_dest(c, &instr->dest, i,
1736                                                vir_uniform(c, QUNIFORM_UNIFORM,
1737                                                            offset + i));
1738                         }
1739                 } else {
1740                         ntq_emit_tmu_general(c, instr);
1741                 }
1742                 break;
1743
1744         case nir_intrinsic_load_ubo:
1745                 ntq_emit_tmu_general(c, instr);
1746                 break;
1747
1748         case nir_intrinsic_ssbo_atomic_add:
1749         case nir_intrinsic_ssbo_atomic_imin:
1750         case nir_intrinsic_ssbo_atomic_umin:
1751         case nir_intrinsic_ssbo_atomic_imax:
1752         case nir_intrinsic_ssbo_atomic_umax:
1753         case nir_intrinsic_ssbo_atomic_and:
1754         case nir_intrinsic_ssbo_atomic_or:
1755         case nir_intrinsic_ssbo_atomic_xor:
1756         case nir_intrinsic_ssbo_atomic_exchange:
1757         case nir_intrinsic_ssbo_atomic_comp_swap:
1758         case nir_intrinsic_load_ssbo:
1759         case nir_intrinsic_store_ssbo:
1760                 ntq_emit_tmu_general(c, instr);
1761                 break;
1762
1763         case nir_intrinsic_image_deref_load:
1764         case nir_intrinsic_image_deref_store:
1765         case nir_intrinsic_image_deref_atomic_add:
1766         case nir_intrinsic_image_deref_atomic_min:
1767         case nir_intrinsic_image_deref_atomic_max:
1768         case nir_intrinsic_image_deref_atomic_and:
1769         case nir_intrinsic_image_deref_atomic_or:
1770         case nir_intrinsic_image_deref_atomic_xor:
1771         case nir_intrinsic_image_deref_atomic_exchange:
1772         case nir_intrinsic_image_deref_atomic_comp_swap:
1773                 v3d40_vir_emit_image_load_store(c, instr);
1774                 break;
1775
1776         case nir_intrinsic_get_buffer_size:
1777                 ntq_store_dest(c, &instr->dest, 0,
1778                                vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE,
1779                                            nir_src_as_uint(instr->src[0])));
1780                 break;
1781
1782         case nir_intrinsic_load_user_clip_plane:
1783                 for (int i = 0; i < instr->num_components; i++) {
1784                         ntq_store_dest(c, &instr->dest, i,
1785                                        vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1786                                                    nir_intrinsic_ucp_id(instr) *
1787                                                    4 + i));
1788                 }
1789                 break;
1790
1791         case nir_intrinsic_load_alpha_ref_float:
1792                 ntq_store_dest(c, &instr->dest, 0,
1793                                vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
1794                 break;
1795
1796         case nir_intrinsic_load_sample_mask_in:
1797                 ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
1798                 break;
1799
1800         case nir_intrinsic_load_helper_invocation:
1801                 vir_PF(c, vir_MSF(c), V3D_QPU_PF_PUSHZ);
1802                 ntq_store_dest(c, &instr->dest, 0,
1803                                vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1804                                                   vir_uniform_ui(c, ~0),
1805                                                   vir_uniform_ui(c, 0))));
1806                 break;
1807
1808         case nir_intrinsic_load_front_face:
1809                 /* The register contains 0 (front) or 1 (back), and we need to
1810                  * turn it into a NIR bool where true means front.
1811                  */
1812                 ntq_store_dest(c, &instr->dest, 0,
1813                                vir_ADD(c,
1814                                        vir_uniform_ui(c, -1),
1815                                        vir_REVF(c)));
1816                 break;
1817
1818         case nir_intrinsic_load_instance_id:
1819                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
1820                 break;
1821
1822         case nir_intrinsic_load_vertex_id:
1823                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
1824                 break;
1825
1826         case nir_intrinsic_load_input:
1827                 for (int i = 0; i < instr->num_components; i++) {
1828                         offset = (nir_intrinsic_base(instr) +
1829                                   nir_src_as_uint(instr->src[0]));
1830                         int comp = nir_intrinsic_component(instr) + i;
1831                         ntq_store_dest(c, &instr->dest, i,
1832                                        vir_MOV(c, c->inputs[offset * 4 + comp]));
1833                 }
1834                 break;
1835
1836         case nir_intrinsic_store_output:
1837                 offset = ((nir_intrinsic_base(instr) +
1838                            nir_src_as_uint(instr->src[1])) * 4 +
1839                           nir_intrinsic_component(instr));
1840
1841                 for (int i = 0; i < instr->num_components; i++) {
1842                         c->outputs[offset + i] =
1843                                 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
1844                 }
1845                 c->num_outputs = MAX2(c->num_outputs,
1846                                       offset + instr->num_components);
1847                 break;
1848
1849         case nir_intrinsic_image_deref_size:
1850                 ntq_emit_image_size(c, instr);
1851                 break;
1852
1853         case nir_intrinsic_discard:
1854                 if (c->execute.file != QFILE_NULL) {
1855                         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1856                         vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1857                                                      vir_uniform_ui(c, 0)),
1858                                 V3D_QPU_COND_IFA);
1859                 } else {
1860                         vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1861                                         vir_uniform_ui(c, 0));
1862                 }
1863                 break;
1864
1865         case nir_intrinsic_discard_if: {
1866                 /* true (~0) if we're discarding */
1867                 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1868
1869                 if (c->execute.file != QFILE_NULL) {
1870                         /* execute == 0 means the channel is active.  Invert
1871                          * the condition so that we can use zero as "executing
1872                          * and discarding."
1873                          */
1874                         vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
1875                                V3D_QPU_PF_PUSHZ);
1876                         vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1877                                                      vir_uniform_ui(c, 0)),
1878                                      V3D_QPU_COND_IFA);
1879                 } else {
1880                         vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
1881                         vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1882                                                      vir_uniform_ui(c, 0)),
1883                                      V3D_QPU_COND_IFNA);
1884                 }
1885
1886                 break;
1887         }
1888
1889         case nir_intrinsic_memory_barrier:
1890         case nir_intrinsic_memory_barrier_atomic_counter:
1891         case nir_intrinsic_memory_barrier_buffer:
1892         case nir_intrinsic_memory_barrier_image:
1893                 /* We don't do any instruction scheduling of these NIR
1894                  * instructions between each other, so we just need to make
1895                  * sure that the TMU operations before the barrier are flushed
1896                  * before the ones after the barrier.  That is currently
1897                  * handled by having a THRSW in each of them and a LDTMU
1898                  * series or a TMUWT after.
1899                  */
1900                 break;
1901
1902         default:
1903                 fprintf(stderr, "Unknown intrinsic: ");
1904                 nir_print_instr(&instr->instr, stderr);
1905                 fprintf(stderr, "\n");
1906                 break;
1907         }
1908 }
1909
1910 /* Clears (activates) the execute flags for any channels whose jump target
1911  * matches this block.
1912  *
1913  * XXX perf: Could we be using flpush/flpop somehow for our execution channel
1914  * enabling?
1915  *
1916  * XXX perf: For uniform control flow, we should be able to skip c->execute
1917  * handling entirely.
1918  */
1919 static void
1920 ntq_activate_execute_for_block(struct v3d_compile *c)
1921 {
1922         vir_set_pf(vir_XOR_dest(c, vir_reg(QFILE_NULL, 0),
1923                                 c->execute, vir_uniform_ui(c, c->cur_block->index)),
1924                    V3D_QPU_PF_PUSHZ);
1925
1926         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1927 }
1928
1929 static void
1930 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
1931 {
1932         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1933         bool empty_else_block =
1934                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1935                  exec_list_is_empty(&nir_else_block->instr_list));
1936
1937         struct qblock *then_block = vir_new_block(c);
1938         struct qblock *after_block = vir_new_block(c);
1939         struct qblock *else_block;
1940         if (empty_else_block)
1941                 else_block = after_block;
1942         else
1943                 else_block = vir_new_block(c);
1944
1945         /* Set up the flags for the IF condition (taking the THEN branch). */
1946         nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
1947         enum v3d_qpu_cond cond;
1948         if (!if_condition_alu ||
1949             !ntq_emit_comparison(c, if_condition_alu, &cond)) {
1950                 vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
1951                        V3D_QPU_PF_PUSHZ);
1952                 cond = V3D_QPU_COND_IFNA;
1953         }
1954
1955         /* Jump to ELSE. */
1956         vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
1957                    V3D_QPU_BRANCH_COND_ALLNA :
1958                    V3D_QPU_BRANCH_COND_ALLA);
1959         vir_link_blocks(c->cur_block, else_block);
1960         vir_link_blocks(c->cur_block, then_block);
1961
1962         /* Process the THEN block. */
1963         vir_set_emit_block(c, then_block);
1964         ntq_emit_cf_list(c, &if_stmt->then_list);
1965
1966         if (!empty_else_block) {
1967                 /* At the end of the THEN block, jump to ENDIF */
1968                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
1969                 vir_link_blocks(c->cur_block, after_block);
1970
1971                 /* Emit the else block. */
1972                 vir_set_emit_block(c, else_block);
1973                 ntq_activate_execute_for_block(c);
1974                 ntq_emit_cf_list(c, &if_stmt->else_list);
1975         }
1976
1977         vir_link_blocks(c->cur_block, after_block);
1978
1979         vir_set_emit_block(c, after_block);
1980 }
1981
1982 static void
1983 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
1984 {
1985         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1986         bool empty_else_block =
1987                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1988                  exec_list_is_empty(&nir_else_block->instr_list));
1989
1990         struct qblock *then_block = vir_new_block(c);
1991         struct qblock *after_block = vir_new_block(c);
1992         struct qblock *else_block;
1993         if (empty_else_block)
1994                 else_block = after_block;
1995         else
1996                 else_block = vir_new_block(c);
1997
1998         bool was_top_level = false;
1999         if (c->execute.file == QFILE_NULL) {
2000                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
2001                 was_top_level = true;
2002         }
2003
2004         /* Set up the flags for the IF condition (taking the THEN branch). */
2005         nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
2006         enum v3d_qpu_cond cond;
2007         if (!if_condition_alu ||
2008             !ntq_emit_comparison(c, if_condition_alu, &cond)) {
2009                 vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
2010                        V3D_QPU_PF_PUSHZ);
2011                 cond = V3D_QPU_COND_IFNA;
2012         }
2013
2014         /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
2015          * was previously active (execute Z) for updating the exec flags.
2016          */
2017         if (was_top_level) {
2018                 cond = v3d_qpu_cond_invert(cond);
2019         } else {
2020                 struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0),
2021                                                   c->execute);
2022                 if (cond == V3D_QPU_COND_IFA) {
2023                         vir_set_uf(inst, V3D_QPU_UF_NORNZ);
2024                 } else {
2025                         vir_set_uf(inst, V3D_QPU_UF_ANDZ);
2026                         cond = V3D_QPU_COND_IFA;
2027                 }
2028         }
2029
2030         vir_MOV_cond(c, cond,
2031                      c->execute,
2032                      vir_uniform_ui(c, else_block->index));
2033
2034         /* Jump to ELSE if nothing is active for THEN, otherwise fall
2035          * through.
2036          */
2037         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
2038         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
2039         vir_link_blocks(c->cur_block, else_block);
2040         vir_link_blocks(c->cur_block, then_block);
2041
2042         /* Process the THEN block. */
2043         vir_set_emit_block(c, then_block);
2044         ntq_emit_cf_list(c, &if_stmt->then_list);
2045
2046         if (!empty_else_block) {
2047                 /* Handle the end of the THEN block.  First, all currently
2048                  * active channels update their execute flags to point to
2049                  * ENDIF
2050                  */
2051                 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
2052                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
2053                              vir_uniform_ui(c, after_block->index));
2054
2055                 /* If everything points at ENDIF, then jump there immediately. */
2056                 vir_PF(c, vir_XOR(c, c->execute,
2057                                   vir_uniform_ui(c, after_block->index)),
2058                        V3D_QPU_PF_PUSHZ);
2059                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
2060                 vir_link_blocks(c->cur_block, after_block);
2061                 vir_link_blocks(c->cur_block, else_block);
2062
2063                 vir_set_emit_block(c, else_block);
2064                 ntq_activate_execute_for_block(c);
2065                 ntq_emit_cf_list(c, &if_stmt->else_list);
2066         }
2067
2068         vir_link_blocks(c->cur_block, after_block);
2069
2070         vir_set_emit_block(c, after_block);
2071         if (was_top_level)
2072                 c->execute = c->undef;
2073         else
2074                 ntq_activate_execute_for_block(c);
2075 }
2076
2077 static void
2078 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
2079 {
2080         if (c->execute.file == QFILE_NULL &&
2081             nir_src_is_dynamically_uniform(nif->condition)) {
2082                 ntq_emit_uniform_if(c, nif);
2083         } else {
2084                 ntq_emit_nonuniform_if(c, nif);
2085         }
2086 }
2087
2088 static void
2089 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
2090 {
2091         switch (jump->type) {
2092         case nir_jump_break:
2093                 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
2094                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
2095                              vir_uniform_ui(c, c->loop_break_block->index));
2096                 break;
2097
2098         case nir_jump_continue:
2099                 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
2100                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
2101                              vir_uniform_ui(c, c->loop_cont_block->index));
2102                 break;
2103
2104         case nir_jump_return:
2105                 unreachable("All returns shouold be lowered\n");
2106         }
2107 }
2108
2109 static void
2110 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
2111 {
2112         switch (instr->type) {
2113         case nir_instr_type_deref:
2114                 /* ignored, will be walked by the intrinsic using it. */
2115                 break;
2116
2117         case nir_instr_type_alu:
2118                 ntq_emit_alu(c, nir_instr_as_alu(instr));
2119                 break;
2120
2121         case nir_instr_type_intrinsic:
2122                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2123                 break;
2124
2125         case nir_instr_type_load_const:
2126                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
2127                 break;
2128
2129         case nir_instr_type_ssa_undef:
2130                 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2131                 break;
2132
2133         case nir_instr_type_tex:
2134                 ntq_emit_tex(c, nir_instr_as_tex(instr));
2135                 break;
2136
2137         case nir_instr_type_jump:
2138                 ntq_emit_jump(c, nir_instr_as_jump(instr));
2139                 break;
2140
2141         default:
2142                 fprintf(stderr, "Unknown NIR instr type: ");
2143                 nir_print_instr(instr, stderr);
2144                 fprintf(stderr, "\n");
2145                 abort();
2146         }
2147 }
2148
2149 static void
2150 ntq_emit_block(struct v3d_compile *c, nir_block *block)
2151 {
2152         nir_foreach_instr(instr, block) {
2153                 ntq_emit_instr(c, instr);
2154         }
2155 }
2156
2157 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
2158
2159 static void
2160 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
2161 {
2162         bool was_top_level = false;
2163         if (c->execute.file == QFILE_NULL) {
2164                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
2165                 was_top_level = true;
2166         }
2167
2168         struct qblock *save_loop_cont_block = c->loop_cont_block;
2169         struct qblock *save_loop_break_block = c->loop_break_block;
2170
2171         c->loop_cont_block = vir_new_block(c);
2172         c->loop_break_block = vir_new_block(c);
2173
2174         vir_link_blocks(c->cur_block, c->loop_cont_block);
2175         vir_set_emit_block(c, c->loop_cont_block);
2176         ntq_activate_execute_for_block(c);
2177
2178         ntq_emit_cf_list(c, &loop->body);
2179
2180         /* Re-enable any previous continues now, so our ANYA check below
2181          * works.
2182          *
2183          * XXX: Use the .ORZ flags update, instead.
2184          */
2185         vir_PF(c, vir_XOR(c,
2186                           c->execute,
2187                           vir_uniform_ui(c, c->loop_cont_block->index)),
2188                V3D_QPU_PF_PUSHZ);
2189         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
2190
2191         vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
2192
2193         struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
2194         /* Pixels that were not dispatched or have been discarded should not
2195          * contribute to looping again.
2196          */
2197         branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
2198         vir_link_blocks(c->cur_block, c->loop_cont_block);
2199         vir_link_blocks(c->cur_block, c->loop_break_block);
2200
2201         vir_set_emit_block(c, c->loop_break_block);
2202         if (was_top_level)
2203                 c->execute = c->undef;
2204         else
2205                 ntq_activate_execute_for_block(c);
2206
2207         c->loop_break_block = save_loop_break_block;
2208         c->loop_cont_block = save_loop_cont_block;
2209
2210         c->loops++;
2211 }
2212
2213 static void
2214 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
2215 {
2216         fprintf(stderr, "FUNCTIONS not handled.\n");
2217         abort();
2218 }
2219
2220 static void
2221 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
2222 {
2223         foreach_list_typed(nir_cf_node, node, node, list) {
2224                 switch (node->type) {
2225                 case nir_cf_node_block:
2226                         ntq_emit_block(c, nir_cf_node_as_block(node));
2227                         break;
2228
2229                 case nir_cf_node_if:
2230                         ntq_emit_if(c, nir_cf_node_as_if(node));
2231                         break;
2232
2233                 case nir_cf_node_loop:
2234                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
2235                         break;
2236
2237                 case nir_cf_node_function:
2238                         ntq_emit_function(c, nir_cf_node_as_function(node));
2239                         break;
2240
2241                 default:
2242                         fprintf(stderr, "Unknown NIR node type\n");
2243                         abort();
2244                 }
2245         }
2246 }
2247
2248 static void
2249 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
2250 {
2251         ntq_setup_registers(c, &impl->registers);
2252         ntq_emit_cf_list(c, &impl->body);
2253 }
2254
2255 static void
2256 nir_to_vir(struct v3d_compile *c)
2257 {
2258         if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
2259                 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
2260                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
2261                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
2262
2263                 /* XXX perf: We could set the "disable implicit point/line
2264                  * varyings" field in the shader record and not emit these, if
2265                  * they're not going to be used.
2266                  */
2267                 if (c->fs_key->is_points) {
2268                         c->point_x = emit_fragment_varying(c, NULL, 0, 0);
2269                         c->point_y = emit_fragment_varying(c, NULL, 0, 0);
2270                 } else if (c->fs_key->is_lines) {
2271                         c->line_x = emit_fragment_varying(c, NULL, 0, 0);
2272                 }
2273         }
2274
2275         if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2276                 ntq_setup_fs_inputs(c);
2277         else
2278                 ntq_setup_vpm_inputs(c);
2279
2280         ntq_setup_outputs(c);
2281         ntq_setup_uniforms(c);
2282         ntq_setup_registers(c, &c->s->registers);
2283
2284         /* Find the main function and emit the body. */
2285         nir_foreach_function(function, c->s) {
2286                 assert(strcmp(function->name, "main") == 0);
2287                 assert(function->impl);
2288                 ntq_emit_impl(c, function->impl);
2289         }
2290 }
2291
2292 const nir_shader_compiler_options v3d_nir_options = {
2293         .lower_all_io_to_temps = true,
2294         .lower_extract_byte = true,
2295         .lower_extract_word = true,
2296         .lower_bfm = true,
2297         .lower_bitfield_insert_to_shifts = true,
2298         .lower_bitfield_extract_to_shifts = true,
2299         .lower_bitfield_reverse = true,
2300         .lower_bit_count = true,
2301         .lower_pack_unorm_2x16 = true,
2302         .lower_pack_snorm_2x16 = true,
2303         .lower_pack_unorm_4x8 = true,
2304         .lower_pack_snorm_4x8 = true,
2305         .lower_unpack_unorm_4x8 = true,
2306         .lower_unpack_snorm_4x8 = true,
2307         .lower_pack_half_2x16 = true,
2308         .lower_unpack_half_2x16 = true,
2309         .lower_fdiv = true,
2310         .lower_find_lsb = true,
2311         .lower_ffma = true,
2312         .lower_flrp32 = true,
2313         .lower_fpow = true,
2314         .lower_fsat = true,
2315         .lower_fsqrt = true,
2316         .lower_ifind_msb = true,
2317         .lower_ldexp = true,
2318         .lower_mul_high = true,
2319         .lower_wpos_pntc = true,
2320         .native_integers = true,
2321 };
2322
2323 /**
2324  * When demoting a shader down to single-threaded, removes the THRSW
2325  * instructions (one will still be inserted at v3d_vir_to_qpu() for the
2326  * program end).
2327  */
2328 static void
2329 vir_remove_thrsw(struct v3d_compile *c)
2330 {
2331         vir_for_each_block(block, c) {
2332                 vir_for_each_inst_safe(inst, block) {
2333                         if (inst->qpu.sig.thrsw)
2334                                 vir_remove_instruction(c, inst);
2335                 }
2336         }
2337
2338         c->last_thrsw = NULL;
2339 }
2340
2341 void
2342 vir_emit_last_thrsw(struct v3d_compile *c)
2343 {
2344         /* On V3D before 4.1, we need a TMU op to be outstanding when thread
2345          * switching, so disable threads if we didn't do any TMU ops (each of
2346          * which would have emitted a THRSW).
2347          */
2348         if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
2349                 c->threads = 1;
2350                 if (c->last_thrsw)
2351                         vir_remove_thrsw(c);
2352                 return;
2353         }
2354
2355         /* If we're threaded and the last THRSW was in conditional code, then
2356          * we need to emit another one so that we can flag it as the last
2357          * thrsw.
2358          */
2359         if (c->last_thrsw && !c->last_thrsw_at_top_level) {
2360                 assert(c->devinfo->ver >= 41);
2361                 vir_emit_thrsw(c);
2362         }
2363
2364         /* If we're threaded, then we need to mark the last THRSW instruction
2365          * so we can emit a pair of them at QPU emit time.
2366          *
2367          * For V3D 4.x, we can spawn the non-fragment shaders already in the
2368          * post-last-THRSW state, so we can skip this.
2369          */
2370         if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
2371                 assert(c->devinfo->ver >= 41);
2372                 vir_emit_thrsw(c);
2373         }
2374
2375         if (c->last_thrsw)
2376                 c->last_thrsw->is_last_thrsw = true;
2377 }
2378
2379 /* There's a flag in the shader for "center W is needed for reasons other than
2380  * non-centroid varyings", so we just walk the program after VIR optimization
2381  * to see if it's used.  It should be harmless to set even if we only use
2382  * center W for varyings.
2383  */
2384 static void
2385 vir_check_payload_w(struct v3d_compile *c)
2386 {
2387         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2388                 return;
2389
2390         vir_for_each_inst_inorder(inst, c) {
2391                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
2392                         if (inst->src[i].file == QFILE_REG &&
2393                             inst->src[i].index == 0) {
2394                                 c->uses_center_w = true;
2395                                 return;
2396                         }
2397                 }
2398         }
2399
2400 }
2401
2402 void
2403 v3d_nir_to_vir(struct v3d_compile *c)
2404 {
2405         if (V3D_DEBUG & (V3D_DEBUG_NIR |
2406                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2407                 fprintf(stderr, "%s prog %d/%d NIR:\n",
2408                         vir_get_stage_name(c),
2409                         c->program_id, c->variant_id);
2410                 nir_print_shader(c->s, stderr);
2411         }
2412
2413         nir_to_vir(c);
2414
2415         /* Emit the last THRSW before STVPM and TLB writes. */
2416         vir_emit_last_thrsw(c);
2417
2418         switch (c->s->info.stage) {
2419         case MESA_SHADER_FRAGMENT:
2420                 emit_frag_end(c);
2421                 break;
2422         case MESA_SHADER_VERTEX:
2423                 emit_vert_end(c);
2424                 break;
2425         default:
2426                 unreachable("bad stage");
2427         }
2428
2429         if (V3D_DEBUG & (V3D_DEBUG_VIR |
2430                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2431                 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
2432                         vir_get_stage_name(c),
2433                         c->program_id, c->variant_id);
2434                 vir_dump(c);
2435                 fprintf(stderr, "\n");
2436         }
2437
2438         vir_optimize(c);
2439         vir_lower_uniforms(c);
2440
2441         vir_check_payload_w(c);
2442
2443         /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
2444          * We used that on that platform to pipeline TMU writes and reduce the
2445          * number of thread switches, as well as try (mostly successfully) to
2446          * reduce maximum register pressure to allow more threads.  We should
2447          * do something of that sort for V3D -- either instruction scheduling
2448          * here, or delay the the THRSW and LDTMUs from our texture
2449          * instructions until the results are needed.
2450          */
2451
2452         if (V3D_DEBUG & (V3D_DEBUG_VIR |
2453                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2454                 fprintf(stderr, "%s prog %d/%d VIR:\n",
2455                         vir_get_stage_name(c),
2456                         c->program_id, c->variant_id);
2457                 vir_dump(c);
2458                 fprintf(stderr, "\n");
2459         }
2460
2461         /* Attempt to allocate registers for the temporaries.  If we fail,
2462          * reduce thread count and try again.
2463          */
2464         int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
2465         struct qpu_reg *temp_registers;
2466         while (true) {
2467                 bool spilled;
2468                 temp_registers = v3d_register_allocate(c, &spilled);
2469                 if (spilled)
2470                         continue;
2471
2472                 if (temp_registers)
2473                         break;
2474
2475                 if (c->threads == min_threads) {
2476                         fprintf(stderr, "Failed to register allocate at %d threads:\n",
2477                                 c->threads);
2478                         vir_dump(c);
2479                         c->failed = true;
2480                         return;
2481                 }
2482
2483                 c->threads /= 2;
2484
2485                 if (c->threads == 1)
2486                         vir_remove_thrsw(c);
2487         }
2488
2489         v3d_vir_to_qpu(c, temp_registers);
2490 }