src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c

   1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
   2
   3 /*
   4  * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice (including the next
  14  * paragraph) shall be included in all copies or substantial portions of the
  15  * Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23  * SOFTWARE.
  24  *
  25  * Authors:
  26  *    Rob Clark <robclark@freedesktop.org>
  27  */
  28
  29 #include <stdarg.h>
  30
  31 #include "pipe/p_state.h"
  32 #include "util/u_string.h"
  33 #include "util/u_memory.h"
  34 #include "util/u_inlines.h"
  35 #include "tgsi/tgsi_lowering.h"
  36 #include "tgsi/tgsi_strings.h"
  37
  38 #include "nir/tgsi_to_nir.h"
  39 #include "glsl/shader_enums.h"
  40
  41 #include "freedreno_util.h"
  42
  43 #include "ir3_compiler.h"
  44 #include "ir3_shader.h"
  45 #include "ir3_nir.h"
  46
  47 #include "instr-a3xx.h"
  48 #include "ir3.h"
  49
  50
  51 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
  52
  53 struct ir3_compile {
  54         const struct tgsi_token *tokens;
  55         struct nir_shader *s;
  56
  57         struct ir3 *ir;
  58         struct ir3_shader_variant *so;
  59
  60         /* bitmask of which samplers are integer: */
  61         uint16_t integer_s;
  62
  63         struct ir3_block *block;
  64
  65         /* For fragment shaders, from the hw perspective the only
  66          * actual input is r0.xy position register passed to bary.f.
  67          * But TGSI doesn't know that, it still declares things as
  68          * IN[] registers.  So we do all the input tracking normally
  69          * and fix things up after compile_instructions()
  70          *
  71          * NOTE that frag_pos is the hardware position (possibly it
  72          * is actually an index or tag or some such.. it is *not*
  73          * values that can be directly used for gl_FragCoord..)
  74          */
  75         struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
  76
  77         /* For vertex shaders, keep track of the system values sources */
  78         struct ir3_instruction *vertex_id, *basevertex, *instance_id;
  79
  80         /* mapping from nir_register to defining instruction: */
  81         struct hash_table *def_ht;
  82
  83         /* mapping from nir_variable to ir3_array: */
  84         struct hash_table *var_ht;
  85         unsigned num_arrays;
  86
  87         /* a common pattern for indirect addressing is to request the
  88          * same address register multiple times.  To avoid generating
  89          * duplicate instruction sequences (which our backend does not
  90          * try to clean up, since that should be done as the NIR stage)
  91          * we cache the address value generated for a given src value:
  92          */
  93         struct hash_table *addr_ht;
  94
  95         /* for calculating input/output positions/linkages: */
  96         unsigned next_inloc;
  97
  98         /* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
  99          * so we need to use ldlv.u32 to load the varying directly:
 100          */
 101         bool flat_bypass;
 102
 103         /* on a3xx, we need to add one to # of array levels:
 104          */
 105         bool levels_add_one;
 106
 107         /* for looking up which system value is which */
 108         unsigned sysval_semantics[8];
 109
 110         /* list of kill instructions: */
 111         struct ir3_instruction *kill[16];
 112         unsigned int kill_count;
 113
 114         /* set if we encounter something we can't handle yet, so we
 115          * can bail cleanly and fallback to TGSI compiler f/e
 116          */
 117         bool error;
 118 };
 119
 120
 121 static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 122 {
 123         struct nir_shader_compiler_options options = {
 124                         .lower_fpow = true,
 125                         .lower_fsat = true,
 126                         .lower_scmp = true,
 127                         .lower_flrp = true,
 128                         .native_integers = true,
 129         };
 130         bool progress;
 131
 132         struct nir_shader *s = tgsi_to_nir(tokens, &options);
 133
 134         if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 135                 debug_printf("----------------------\n");
 136                 nir_print_shader(s, stdout);
 137                 debug_printf("----------------------\n");
 138         }
 139
 140         nir_opt_global_to_local(s);
 141         nir_convert_to_ssa(s);
 142         nir_lower_idiv(s);
 143
 144         do {
 145                 progress = false;
 146
 147                 nir_lower_vars_to_ssa(s);
 148                 nir_lower_alu_to_scalar(s);
 149
 150                 progress |= nir_copy_prop(s);
 151                 progress |= nir_opt_dce(s);
 152                 progress |= nir_opt_cse(s);
 153                 progress |= ir3_nir_lower_if_else(s);
 154                 progress |= nir_opt_algebraic(s);
 155                 progress |= nir_opt_constant_folding(s);
 156
 157         } while (progress);
 158
 159         nir_remove_dead_variables(s);
 160         nir_validate_shader(s);
 161
 162         if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 163                 debug_printf("----------------------\n");
 164                 nir_print_shader(s, stdout);
 165                 debug_printf("----------------------\n");
 166         }
 167
 168         return s;
 169 }
 170
 171 /* TODO nir doesn't lower everything for us yet, but ideally it would: */
 172 static const struct tgsi_token *
 173 lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
 174 {
 175         struct tgsi_shader_info info;
 176         struct tgsi_lowering_config lconfig = {
 177                         .color_two_side = so->key.color_two_side,
 178                         .lower_FRC = true,
 179         };
 180
 181         switch (so->type) {
 182         case SHADER_FRAGMENT:
 183         case SHADER_COMPUTE:
 184                 lconfig.saturate_s = so->key.fsaturate_s;
 185                 lconfig.saturate_t = so->key.fsaturate_t;
 186                 lconfig.saturate_r = so->key.fsaturate_r;
 187                 break;
 188         case SHADER_VERTEX:
 189                 lconfig.saturate_s = so->key.vsaturate_s;
 190                 lconfig.saturate_t = so->key.vsaturate_t;
 191                 lconfig.saturate_r = so->key.vsaturate_r;
 192                 break;
 193         }
 194
 195         if (so->ir->compiler->gpu_id >= 400) {
 196                 /* a4xx seems to have *no* sam.p */
 197                 lconfig.lower_TXP = ~0;  /* lower all txp */
 198         } else {
 199                 /* a3xx just needs to avoid sam.p for 3d tex */
 200                 lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
 201         }
 202
 203         return tgsi_transform_lowering(&lconfig, tokens, &info);
 204 }
 205
 206 static struct ir3_compile *
 207 compile_init(struct ir3_shader_variant *so,
 208                 const struct tgsi_token *tokens)
 209 {
 210         struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 211         const struct tgsi_token *lowered_tokens;
 212
 213         if (so->ir->compiler->gpu_id >= 400) {
 214                 /* need special handling for "flat" */
 215                 ctx->flat_bypass = true;
 216                 ctx->levels_add_one = false;
 217         } else {
 218                 /* no special handling for "flat" */
 219                 ctx->flat_bypass = false;
 220                 ctx->levels_add_one = true;
 221         }
 222
 223         switch (so->type) {
 224         case SHADER_FRAGMENT:
 225         case SHADER_COMPUTE:
 226                 ctx->integer_s = so->key.finteger_s;
 227                 break;
 228         case SHADER_VERTEX:
 229                 ctx->integer_s = so->key.vinteger_s;
 230                 break;
 231         }
 232
 233         ctx->ir = so->ir;
 234         ctx->so = so;
 235         ctx->next_inloc = 8;
 236         ctx->def_ht = _mesa_hash_table_create(ctx,
 237                         _mesa_hash_pointer, _mesa_key_pointer_equal);
 238         ctx->var_ht = _mesa_hash_table_create(ctx,
 239                         _mesa_hash_pointer, _mesa_key_pointer_equal);
 240         ctx->addr_ht = _mesa_hash_table_create(ctx,
 241                         _mesa_hash_pointer, _mesa_key_pointer_equal);
 242
 243         lowered_tokens = lower_tgsi(tokens, so);
 244         if (!lowered_tokens)
 245                 lowered_tokens = tokens;
 246         ctx->s = to_nir(lowered_tokens);
 247
 248         if (lowered_tokens != tokens)
 249                 free((void *)lowered_tokens);
 250
 251         so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 252
 253         /* one (vec4) slot for vertex id base: */
 254         if (so->type == SHADER_VERTEX)
 255                 so->first_immediate++;
 256
 257         /* reserve 4 (vec4) slots for ubo base addresses: */
 258         so->first_immediate += 4;
 259
 260         return ctx;
 261 }
 262
 263 static void
 264 compile_error(struct ir3_compile *ctx, const char *format, ...)
 265 {
 266         va_list ap;
 267         va_start(ap, format);
 268         _debug_vprintf(format, ap);
 269         va_end(ap);
 270         nir_print_shader(ctx->s, stdout);
 271         ctx->error = true;
 272         debug_assert(0);
 273 }
 274
 275 #define compile_assert(ctx, cond) do { \
 276                 if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
 277         } while (0)
 278
 279 static void
 280 compile_free(struct ir3_compile *ctx)
 281 {
 282         ralloc_free(ctx);
 283 }
 284
 285
 286 struct ir3_array {
 287         unsigned length, aid;
 288         struct ir3_instruction *arr[];
 289 };
 290
 291 static void
 292 declare_var(struct ir3_compile *ctx, nir_variable *var)
 293 {
 294         unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
 295         struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
 296                         (length * sizeof(arr->arr[0])));
 297         arr->length = length;
 298         arr->aid = ++ctx->num_arrays;
 299         /* Some shaders end up reading array elements without first writing..
 300          * so initialize things to prevent null instr ptrs later:
 301          */
 302         for (unsigned i = 0; i < length; i++)
 303                 arr->arr[i] = create_immed(ctx->block, 0);
 304         _mesa_hash_table_insert(ctx->var_ht, var, arr);
 305 }
 306
 307 static struct ir3_array *
 308 get_var(struct ir3_compile *ctx, nir_variable *var)
 309 {
 310         struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
 311         return entry->data;
 312 }
 313
 314 /* allocate a n element value array (to be populated by caller) and
 315  * insert in def_ht
 316  */
 317 static struct ir3_instruction **
 318 __get_dst(struct ir3_compile *ctx, void *key, unsigned n)
 319 {
 320         struct ir3_instruction **value =
 321                 ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
 322         _mesa_hash_table_insert(ctx->def_ht, key, value);
 323         return value;
 324 }
 325
 326 static struct ir3_instruction **
 327 get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
 328 {
 329         if (dst->is_ssa) {
 330                 return __get_dst(ctx, &dst->ssa, n);
 331         } else {
 332                 return __get_dst(ctx, dst->reg.reg, n);
 333         }
 334 }
 335
 336 static struct ir3_instruction **
 337 get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
 338 {
 339         return __get_dst(ctx, dst, n);
 340 }
 341
 342 static struct ir3_instruction **
 343 get_src(struct ir3_compile *ctx, nir_src *src)
 344 {
 345         struct hash_entry *entry;
 346         if (src->is_ssa) {
 347                 entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
 348         } else {
 349                 entry = _mesa_hash_table_search(ctx->def_ht, src->reg.reg);
 350         }
 351         compile_assert(ctx, entry);
 352         return entry->data;
 353 }
 354
 355 static struct ir3_instruction *
 356 create_immed(struct ir3_block *block, uint32_t val)
 357 {
 358         struct ir3_instruction *mov;
 359
 360         mov = ir3_instr_create(block, 1, 0);
 361         mov->cat1.src_type = TYPE_U32;
 362         mov->cat1.dst_type = TYPE_U32;
 363         ir3_reg_create(mov, 0, 0);
 364         ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
 365
 366         return mov;
 367 }
 368
 369 static struct ir3_instruction *
 370 create_addr(struct ir3_block *block, struct ir3_instruction *src)
 371 {
 372         struct ir3_instruction *instr, *immed;
 373
 374         /* TODO in at least some cases, the backend could probably be
 375          * made clever enough to propagate IR3_REG_HALF..
 376          */
 377         instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
 378         instr->regs[0]->flags |= IR3_REG_HALF;
 379
 380         immed = create_immed(block, 2);
 381         immed->regs[0]->flags |= IR3_REG_HALF;
 382
 383         instr = ir3_SHL_B(block, instr, 0, immed, 0);
 384         instr->regs[0]->flags |= IR3_REG_HALF;
 385         instr->regs[1]->flags |= IR3_REG_HALF;
 386
 387         instr = ir3_MOV(block, instr, TYPE_S16);
 388         instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
 389         instr->regs[1]->flags |= IR3_REG_HALF;
 390
 391         return instr;
 392 }
 393
 394 /* caches addr values to avoid generating multiple cov/shl/mova
 395  * sequences for each use of a given NIR level src as address
 396  */
 397 static struct ir3_instruction *
 398 get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
 399 {
 400         struct ir3_instruction *addr;
 401         struct hash_entry *entry;
 402         entry = _mesa_hash_table_search(ctx->addr_ht, src);
 403         if (entry)
 404                 return entry->data;
 405
 406         /* TODO do we need to cache per block? */
 407         addr = create_addr(ctx->block, src);
 408         _mesa_hash_table_insert(ctx->addr_ht, src, addr);
 409
 410         return addr;
 411 }
 412
 413 static struct ir3_instruction *
 414 create_uniform(struct ir3_compile *ctx, unsigned n)
 415 {
 416         struct ir3_instruction *mov;
 417
 418         mov = ir3_instr_create(ctx->block, 1, 0);
 419         /* TODO get types right? */
 420         mov->cat1.src_type = TYPE_F32;
 421         mov->cat1.dst_type = TYPE_F32;
 422         ir3_reg_create(mov, 0, 0);
 423         ir3_reg_create(mov, n, IR3_REG_CONST);
 424
 425         return mov;
 426 }
 427
 428 static struct ir3_instruction *
 429 create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
 430                 struct ir3_instruction *address)
 431 {
 432         struct ir3_instruction *mov;
 433
 434         mov = ir3_instr_create(ctx->block, 1, 0);
 435         mov->cat1.src_type = TYPE_U32;
 436         mov->cat1.dst_type = TYPE_U32;
 437         ir3_reg_create(mov, 0, 0);
 438         ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
 439         mov->address = address;
 440
 441         array_insert(ctx->ir->indirects, mov);
 442
 443         return mov;
 444 }
 445
 446 static struct ir3_instruction *
 447 create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 448                 unsigned arrsz)
 449 {
 450         struct ir3_instruction *collect;
 451
 452         if (arrsz == 0)
 453                 return NULL;
 454
 455         collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
 456         ir3_reg_create(collect, 0, 0);
 457         for (unsigned i = 0; i < arrsz; i++)
 458                 ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
 459
 460         return collect;
 461 }
 462
 463 static struct ir3_instruction *
 464 create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 465                 struct ir3_instruction *address, struct ir3_instruction *collect)
 466 {
 467         struct ir3_block *block = ctx->block;
 468         struct ir3_instruction *mov;
 469         struct ir3_register *src;
 470
 471         mov = ir3_instr_create(block, 1, 0);
 472         mov->cat1.src_type = TYPE_U32;
 473         mov->cat1.dst_type = TYPE_U32;
 474         ir3_reg_create(mov, 0, 0);
 475         src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
 476         src->instr = collect;
 477         src->size  = arrsz;
 478         src->offset = n;
 479         mov->address = address;
 480
 481         array_insert(ctx->ir->indirects, mov);
 482
 483         return mov;
 484 }
 485
 486 static struct ir3_instruction *
 487 create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 488                 struct ir3_instruction *src, struct ir3_instruction *address,
 489                 struct ir3_instruction *collect)
 490 {
 491         struct ir3_block *block = ctx->block;
 492         struct ir3_instruction *mov;
 493         struct ir3_register *dst;
 494
 495         mov = ir3_instr_create(block, 1, 0);
 496         mov->cat1.src_type = TYPE_U32;
 497         mov->cat1.dst_type = TYPE_U32;
 498         dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
 499         dst->size  = arrsz;
 500         dst->offset = n;
 501         ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
 502         mov->address = address;
 503         mov->fanin = collect;
 504
 505         array_insert(ctx->ir->indirects, mov);
 506
 507         return mov;
 508 }
 509
 510 static struct ir3_instruction *
 511 create_input(struct ir3_block *block, struct ir3_instruction *instr,
 512                 unsigned n)
 513 {
 514         struct ir3_instruction *in;
 515
 516         in = ir3_instr_create(block, -1, OPC_META_INPUT);
 517         in->inout.block = block;
 518         ir3_reg_create(in, n, 0);
 519         if (instr)
 520                 ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
 521
 522         return in;
 523 }
 524
 525 static struct ir3_instruction *
 526 create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv)
 527 {
 528         struct ir3_block *block = ctx->block;
 529         struct ir3_instruction *instr;
 530         struct ir3_instruction *inloc = create_immed(block, n);
 531
 532         if (use_ldlv) {
 533                 instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
 534                 instr->cat6.type = TYPE_U32;
 535                 instr->cat6.iim_val = 1;
 536         } else {
 537                 instr = ir3_BARY_F(block, inloc, 0, ctx->frag_pos, 0);
 538                 instr->regs[2]->wrmask = 0x3;
 539         }
 540
 541         return instr;
 542 }
 543
 544 static struct ir3_instruction *
 545 create_frag_coord(struct ir3_compile *ctx, unsigned comp)
 546 {
 547         struct ir3_block *block = ctx->block;
 548         struct ir3_instruction *instr;
 549
 550         compile_assert(ctx, !ctx->frag_coord[comp]);
 551
 552         ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0);
 553
 554         switch (comp) {
 555         case 0: /* .x */
 556         case 1: /* .y */
 557                 /* for frag_coord, we get unsigned values.. we need
 558                  * to subtract (integer) 8 and divide by 16 (right-
 559                  * shift by 4) then convert to float:
 560                  *
 561                  *    sub.s tmp, src, 8
 562                  *    shr.b tmp, tmp, 4
 563                  *    mov.u32f32 dst, tmp
 564                  *
 565                  */
 566                 instr = ir3_SUB_S(block, ctx->frag_coord[comp], 0,
 567                                 create_immed(block, 8), 0);
 568                 instr = ir3_SHR_B(block, instr, 0,
 569                                 create_immed(block, 4), 0);
 570                 instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
 571
 572                 return instr;
 573         case 2: /* .z */
 574         case 3: /* .w */
 575         default:
 576                 /* seems that we can use these as-is: */
 577                 return ctx->frag_coord[comp];
 578         }
 579 }
 580
 581 static struct ir3_instruction *
 582 create_frag_face(struct ir3_compile *ctx, unsigned comp)
 583 {
 584         struct ir3_block *block = ctx->block;
 585         struct ir3_instruction *instr;
 586
 587         switch (comp) {
 588         case 0: /* .x */
 589                 compile_assert(ctx, !ctx->frag_face);
 590
 591                 ctx->frag_face = create_input(block, NULL, 0);
 592
 593                 /* for faceness, we always get -1 or 0 (int).. but TGSI expects
 594                  * positive vs negative float.. and piglit further seems to
 595                  * expect -1.0 or 1.0:
 596                  *
 597                  *    mul.s tmp, hr0.x, 2
 598                  *    add.s tmp, tmp, 1
 599                  *    mov.s32f32, dst, tmp
 600                  *
 601                  */
 602                 instr = ir3_MUL_S(block, ctx->frag_face, 0,
 603                                 create_immed(block, 2), 0);
 604                 instr = ir3_ADD_S(block, instr, 0,
 605                                 create_immed(block, 1), 0);
 606                 instr = ir3_COV(block, instr, TYPE_S32, TYPE_F32);
 607
 608                 return instr;
 609         case 1: /* .y */
 610         case 2: /* .z */
 611                 return create_immed(block, fui(0.0));
 612         default:
 613         case 3: /* .w */
 614                 return create_immed(block, fui(1.0));
 615         }
 616 }
 617
 618 /* helper for instructions that produce multiple consecutive scalar
 619  * outputs which need to have a split/fanout meta instruction inserted
 620  */
 621 static void
 622 split_dest(struct ir3_block *block, struct ir3_instruction **dst,
 623                 struct ir3_instruction *src)
 624 {
 625         struct ir3_instruction *prev = NULL;
 626         for (int i = 0, j = 0; i < 4; i++) {
 627                 struct ir3_instruction *split =
 628                                 ir3_instr_create(block, -1, OPC_META_FO);
 629                 ir3_reg_create(split, 0, IR3_REG_SSA);
 630                 ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
 631                 split->fo.off = i;
 632
 633                 if (prev) {
 634                         split->cp.left = prev;
 635                         split->cp.left_cnt++;
 636                         prev->cp.right = split;
 637                         prev->cp.right_cnt++;
 638                 }
 639                 prev = split;
 640
 641                 if (src->regs[0]->wrmask & (1 << i))
 642                         dst[j++] = split;
 643         }
 644 }
 645
 646 /*
 647  * Adreno uses uint rather than having dedicated bool type,
 648  * which (potentially) requires some conversion, in particular
  649  * when using output of a bool instr to int input, or vice
 650  * versa.
 651  *
 652  *         | Adreno  |  NIR  |
 653  *  -------+---------+-------+-
 654  *   true  |    1    |  ~0   |
 655  *   false |    0    |   0   |
 656  *
 657  * To convert from an adreno bool (uint) to nir, use:
 658  *
 659  *    absneg.s dst, (neg)src
 660  *
 661  * To convert back in the other direction:
 662  *
  663  *    absneg.s dst, (abs)src
 664  *
 665  * The CP step can clean up the absneg.s that cancel each other
 666  * out, and with a slight bit of extra cleverness (to recognize
 667  * the instructions which produce either a 0 or 1) can eliminate
 668  * the absneg.s's completely when an instruction that wants
 669  * 0/1 consumes the result.  For example, when a nir 'bcsel'
 670  * consumes the result of 'feq'.  So we should be able to get by
  671  * without a boolean resolve step, and without incurring any
 672  * extra penalty in instruction count.
 673  */
 674
 675 /* NIR bool -> native (adreno): */
 676 static struct ir3_instruction *
 677 ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
 678 {
 679         return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
 680 }
 681
 682 /* native (adreno) -> NIR bool: */
 683 static struct ir3_instruction *
 684 ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
 685 {
 686         return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
 687 }
 688
 689 /*
 690  * alu/sfu instructions:
 691  */
 692
 693 static void
 694 emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
 695 {
 696         const nir_op_info *info = &nir_op_infos[alu->op];
 697         struct ir3_instruction **dst, *src[info->num_inputs];
 698         struct ir3_block *b = ctx->block;
 699
 700         dst = get_dst(ctx, &alu->dest.dest, MAX2(info->output_size, 1));
 701
 702         /* Vectors are special in that they have non-scalarized writemasks,
 703          * and just take the first swizzle channel for each argument in
 704          * order into each writemask channel.
 705          */
 706         if ((alu->op == nir_op_vec2) ||
 707                         (alu->op == nir_op_vec3) ||
 708                         (alu->op == nir_op_vec4)) {
 709
 710                 for (int i = 0; i < info->num_inputs; i++) {
 711                         nir_alu_src *asrc = &alu->src[i];
 712
 713                         compile_assert(ctx, !asrc->abs);
 714                         compile_assert(ctx, !asrc->negate);
 715
 716                         src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
 717                         if (!src[i])
 718                                 src[i] = create_immed(ctx->block, 0);
 719                         dst[i] = ir3_MOV(b, src[i], TYPE_U32);
 720                 }
 721
 722                 return;
 723         }
 724
 725         /* General case: We can just grab the one used channel per src. */
 726         for (int i = 0; i < info->num_inputs; i++) {
 727                 unsigned chan = ffs(alu->dest.write_mask) - 1;
 728                 nir_alu_src *asrc = &alu->src[i];
 729
 730                 compile_assert(ctx, !asrc->abs);
 731                 compile_assert(ctx, !asrc->negate);
 732
 733                 src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
 734
 735                 compile_assert(ctx, src[i]);
 736         }
 737
 738         switch (alu->op) {
 739         case nir_op_f2i:
 740                 dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32);
 741                 break;
 742         case nir_op_f2u:
 743                 dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32);
 744                 break;
 745         case nir_op_i2f:
 746                 dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32);
 747                 break;
 748         case nir_op_u2f:
 749                 dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
 750                 break;
 751         case nir_op_imov:
 752                 dst[0] = ir3_MOV(b, src[0], TYPE_S32);
 753                 break;
 754         case nir_op_fmov:
 755                 dst[0] = ir3_MOV(b, src[0], TYPE_F32);
 756                 break;
 757         case nir_op_f2b:
 758                 dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
 759                 dst[0]->cat2.condition = IR3_COND_NE;
 760                 dst[0] = ir3_n2b(b, dst[0]);
 761                 break;
 762         case nir_op_b2f:
 763                 dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
 764                 break;
 765         case nir_op_b2i:
 766                 dst[0] = ir3_b2n(b, src[0]);
 767                 break;
 768         case nir_op_i2b:
 769                 dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
 770                 dst[0]->cat2.condition = IR3_COND_NE;
 771                 dst[0] = ir3_n2b(b, dst[0]);
 772                 break;
 773
 774         case nir_op_fneg:
 775                 dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
 776                 break;
 777         case nir_op_fabs:
 778                 dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
 779                 break;
 780         case nir_op_fmax:
 781                 dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
 782                 break;
 783         case nir_op_fmin:
 784                 dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
 785                 break;
 786         case nir_op_fmul:
 787                 dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
 788                 break;
 789         case nir_op_fadd:
 790                 dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
 791                 break;
 792         case nir_op_fsub:
 793                 dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
 794                 break;
 795         case nir_op_ffma:
 796                 dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
 797                 break;
 798         case nir_op_fddx:
 799                 dst[0] = ir3_DSX(b, src[0], 0);
 800                 dst[0]->cat5.type = TYPE_F32;
 801                 break;
 802         case nir_op_fddy:
 803                 dst[0] = ir3_DSY(b, src[0], 0);
 804                 dst[0]->cat5.type = TYPE_F32;
 805                 break;
 806                 break;
 807         case nir_op_flt:
 808                 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
 809                 dst[0]->cat2.condition = IR3_COND_LT;
 810                 dst[0] = ir3_n2b(b, dst[0]);
 811                 break;
 812         case nir_op_fge:
 813                 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
 814                 dst[0]->cat2.condition = IR3_COND_GE;
 815                 dst[0] = ir3_n2b(b, dst[0]);
 816                 break;
 817         case nir_op_feq:
 818                 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
 819                 dst[0]->cat2.condition = IR3_COND_EQ;
 820                 dst[0] = ir3_n2b(b, dst[0]);
 821                 break;
 822         case nir_op_fne:
 823                 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
 824                 dst[0]->cat2.condition = IR3_COND_NE;
 825                 dst[0] = ir3_n2b(b, dst[0]);
 826                 break;
 827         case nir_op_fceil:
 828                 dst[0] = ir3_CEIL_F(b, src[0], 0);
 829                 break;
 830         case nir_op_ffloor:
 831                 dst[0] = ir3_FLOOR_F(b, src[0], 0);
 832                 break;
 833         case nir_op_ftrunc:
 834                 dst[0] = ir3_TRUNC_F(b, src[0], 0);
 835                 break;
 836         case nir_op_fround_even:
 837                 dst[0] = ir3_RNDNE_F(b, src[0], 0);
 838                 break;
 839         case nir_op_fsign:
 840                 dst[0] = ir3_SIGN_F(b, src[0], 0);
 841                 break;
 842
 843         case nir_op_fsin:
 844                 dst[0] = ir3_SIN(b, src[0], 0);
 845                 break;
 846         case nir_op_fcos:
 847                 dst[0] = ir3_COS(b, src[0], 0);
 848                 break;
 849         case nir_op_frsq:
 850                 dst[0] = ir3_RSQ(b, src[0], 0);
 851                 break;
 852         case nir_op_frcp:
 853                 dst[0] = ir3_RCP(b, src[0], 0);
 854                 break;
 855         case nir_op_flog2:
 856                 dst[0] = ir3_LOG2(b, src[0], 0);
 857                 break;
 858         case nir_op_fexp2:
 859                 dst[0] = ir3_EXP2(b, src[0], 0);
 860                 break;
 861         case nir_op_fsqrt:
 862                 dst[0] = ir3_SQRT(b, src[0], 0);
 863                 break;
 864
 865         case nir_op_iabs:
 866                 dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
 867                 break;
 868         case nir_op_iadd:
 869                 dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
 870                 break;
 871         case nir_op_iand:
 872                 dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
 873                 break;
 874         case nir_op_imax:
 875                 dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
 876                 break;
 877         case nir_op_imin:
 878                 dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
 879                 break;
 880         case nir_op_imul:
 881                 /*
 882                  * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
 883                  *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
 884                  *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
 885                  *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
 886                  */
 887                 dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
 888                                         ir3_MADSH_M16(b, src[0], 0, src[1], 0,
 889                                                 ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
 890                 break;
 891         case nir_op_ineg:
 892                 dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
 893                 break;
 894         case nir_op_inot:
 895                 dst[0] = ir3_NOT_B(b, src[0], 0);
 896                 break;
 897         case nir_op_ior:
 898                 dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
 899                 break;
 900         case nir_op_ishl:
 901                 dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
 902                 break;
 903         case nir_op_ishr:
 904                 dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
 905                 break;
 906         case nir_op_isign: {
 907                 /* maybe this would be sane to lower in nir.. */
 908                 struct ir3_instruction *neg, *pos;
 909
 910                 neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
 911                 neg->cat2.condition = IR3_COND_LT;
 912
 913                 pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
 914                 pos->cat2.condition = IR3_COND_GT;
 915
 916                 dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
 917
 918                 break;
 919         }
 920         case nir_op_isub:
 921                 dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
 922                 break;
 923         case nir_op_ixor:
 924                 dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
 925                 break;
 926         case nir_op_ushr:
 927                 dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
 928                 break;
 929         case nir_op_ilt:
 930                 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
 931                 dst[0]->cat2.condition = IR3_COND_LT;
 932                 dst[0] = ir3_n2b(b, dst[0]);
 933                 break;
 934         case nir_op_ige:
 935                 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
 936                 dst[0]->cat2.condition = IR3_COND_GE;
 937                 dst[0] = ir3_n2b(b, dst[0]);
 938                 break;
 939         case nir_op_ieq:
 940                 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
 941                 dst[0]->cat2.condition = IR3_COND_EQ;
 942                 dst[0] = ir3_n2b(b, dst[0]);
 943                 break;
 944         case nir_op_ine:
 945                 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
 946                 dst[0]->cat2.condition = IR3_COND_NE;
 947                 dst[0] = ir3_n2b(b, dst[0]);
 948                 break;
 949         case nir_op_ult:
 950                 dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
 951                 dst[0]->cat2.condition = IR3_COND_LT;
 952                 dst[0] = ir3_n2b(b, dst[0]);
 953                 break;
 954         case nir_op_uge:
 955                 dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
 956                 dst[0]->cat2.condition = IR3_COND_GE;
 957                 dst[0] = ir3_n2b(b, dst[0]);
 958                 break;
 959
 960         case nir_op_bcsel:
 961                 dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
 962                 break;
 963
 964         default:
 965                 compile_error(ctx, "Unhandled ALU op: %s\n",
 966                                 nir_op_infos[alu->op].name);
 967                 break;
 968         }
 969 }
 970
 971 /* handles direct/indirect UBO reads: */
 972 static void
 973 emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 974                 struct ir3_instruction **dst)
 975 {
 976         struct ir3_block *b = ctx->block;
 977         struct ir3_instruction *addr, *src0, *src1;
 978         /* UBO addresses are the first driver params: */
 979         unsigned ubo = regid(ctx->so->first_driver_param, 0);
 980         unsigned off = intr->const_index[0];
 981
 982         /* First src is ubo index, which could either be an immed or not: */
 983         src0 = get_src(ctx, &intr->src[0])[0];
 984         if (is_same_type_mov(src0) &&
 985                         (src0->regs[1]->flags & IR3_REG_IMMED)) {
 986                 addr = create_uniform(ctx, ubo + src0->regs[1]->iim_val);
 987         } else {
 988                 addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
 989         }
 990
 991         if (intr->intrinsic == nir_intrinsic_load_ubo_indirect) {
 992                 /* For load_ubo_indirect, second src is indirect offset: */
 993                 src1 = get_src(ctx, &intr->src[1])[0];
 994
 995                 /* and add offset to addr: */
 996                 addr = ir3_ADD_S(b, addr, 0, src1, 0);
 997         }
 998
 999         /* if offset is to large to encode in the ldg, split it out: */
1000         if ((off + (intr->num_components * 4)) > 1024) {
1001                 /* split out the minimal amount to improve the odds that
1002                  * cp can fit the immediate in the add.s instruction:
1003                  */
1004                 unsigned off2 = off + (intr->num_components * 4) - 1024;
1005                 addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
1006                 off -= off2;
1007         }
1008
1009         for (int i = 0; i < intr->num_components; i++) {
1010                 struct ir3_instruction *load =
1011                                 ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
1012                 load->cat6.type = TYPE_U32;
1013                 load->cat6.offset = off + i * 4;    /* byte offset */
1014                 dst[i] = load;
1015         }
1016 }
1017
1018 /* handles array reads: */
1019 static void
1020 emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
1021                 struct ir3_instruction **dst)
1022 {
1023         nir_deref_var *dvar = intr->variables[0];
1024         nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1025         struct ir3_array *arr = get_var(ctx, dvar->var);
1026
1027         compile_assert(ctx, dvar->deref.child &&
1028                 (dvar->deref.child->deref_type == nir_deref_type_array));
1029
1030         switch (darr->deref_array_type) {
1031         case nir_deref_array_type_direct:
1032                 /* direct access does not require anything special: */
1033                 for (int i = 0; i < intr->num_components; i++) {
1034                         unsigned n = darr->base_offset * 4 + i;
1035                         compile_assert(ctx, n < arr->length);
1036                         dst[i] = arr->arr[n];
1037                 }
1038                 break;
1039         case nir_deref_array_type_indirect: {
1040                 /* for indirect, we need to collect all the array elements: */
1041                 struct ir3_instruction *collect =
1042                                 create_collect(ctx->block, arr->arr, arr->length);
1043                 struct ir3_instruction *addr =
1044                                 get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1045                 for (int i = 0; i < intr->num_components; i++) {
1046                         unsigned n = darr->base_offset * 4 + i;
1047                         compile_assert(ctx, n < arr->length);
1048                         dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
1049                 }
1050                 break;
1051         }
1052         default:
1053                 compile_error(ctx, "Unhandled load deref type: %u\n",
1054                                 darr->deref_array_type);
1055                 break;
1056         }
1057 }
1058
1059 /* handles array writes: */
1060 static void
1061 emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1062 {
1063         nir_deref_var *dvar = intr->variables[0];
1064         nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1065         struct ir3_array *arr = get_var(ctx, dvar->var);
1066         struct ir3_instruction **src;
1067
1068         compile_assert(ctx, dvar->deref.child &&
1069                 (dvar->deref.child->deref_type == nir_deref_type_array));
1070
1071         src = get_src(ctx, &intr->src[0]);
1072
1073         switch (darr->deref_array_type) {
1074         case nir_deref_array_type_direct:
1075                 /* direct access does not require anything special: */
1076                 for (int i = 0; i < intr->num_components; i++) {
1077                         unsigned n = darr->base_offset * 4 + i;
1078                         compile_assert(ctx, n < arr->length);
1079                         arr->arr[n] = src[i];
1080                 }
1081                 break;
1082         case nir_deref_array_type_indirect: {
1083                 /* for indirect, create indirect-store and fan that out: */
1084                 struct ir3_instruction *collect =
1085                                 create_collect(ctx->block, arr->arr, arr->length);
1086                 struct ir3_instruction *addr =
1087                                 get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1088                 for (int i = 0; i < intr->num_components; i++) {
1089                         struct ir3_instruction *store;
1090                         unsigned n = darr->base_offset * 4 + i;
1091                         compile_assert(ctx, n < arr->length);
1092
1093                         store = create_indirect_store(ctx, arr->length,
1094                                         n, src[i], addr, collect);
1095
1096                         store->fanin->fi.aid = arr->aid;
1097
1098                         /* TODO: probably split this out to be used for
1099                          * store_output_indirect? or move this into
1100                          * create_indirect_store()?
1101                          */
1102                         for (int j = i; j < arr->length; j += 4) {
1103                                 struct ir3_instruction *split;
1104
1105                                 split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
1106                                 split->fo.off = j;
1107                                 ir3_reg_create(split, 0, 0);
1108                                 ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
1109
1110                                 arr->arr[j] = split;
1111                         }
1112                 }
1113                 break;
1114         }
1115         default:
1116                 compile_error(ctx, "Unhandled store deref type: %u\n",
1117                                 darr->deref_array_type);
1118                 break;
1119         }
1120 }
1121
1122 static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
1123                 struct ir3_instruction *instr)
1124 {
1125         struct ir3_shader_variant *so = ctx->so;
1126         unsigned r = regid(so->inputs_count, 0);
1127         unsigned n = so->inputs_count++;
1128
1129         so->inputs[n].semantic = ir3_semantic_name(name, 0);
1130         so->inputs[n].compmask = 1;
1131         so->inputs[n].regid = r;
1132         so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
1133         so->total_in++;
1134
1135         ctx->block->ninputs = MAX2(ctx->block->ninputs, r + 1);
1136         ctx->block->inputs[r] = instr;
1137 }
1138
1139 static void
1140 emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1141 {
1142         const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
1143         struct ir3_instruction **dst, **src;
1144         struct ir3_block *b = ctx->block;
1145         unsigned idx = intr->const_index[0];
1146
1147         if (info->has_dest) {
1148                 dst = get_dst(ctx, &intr->dest, intr->num_components);
1149         } else {
1150                 dst = NULL;
1151         }
1152
1153         switch (intr->intrinsic) {
1154         case nir_intrinsic_load_uniform:
1155                 for (int i = 0; i < intr->num_components; i++) {
1156                         unsigned n = idx * 4 + i;
1157                         dst[i] = create_uniform(ctx, n);
1158                 }
1159                 break;
1160         case nir_intrinsic_load_uniform_indirect:
1161                 src = get_src(ctx, &intr->src[0]);
1162                 for (int i = 0; i < intr->num_components; i++) {
1163                         unsigned n = idx * 4 + i;
1164                         dst[i] = create_uniform_indirect(ctx, n,
1165                                         get_addr(ctx, src[0]));
1166                 }
1167                 break;
1168         case nir_intrinsic_load_ubo:
1169         case nir_intrinsic_load_ubo_indirect:
1170                 emit_intrinsic_load_ubo(ctx, intr, dst);
1171                 break;
1172         case nir_intrinsic_load_input:
1173                 for (int i = 0; i < intr->num_components; i++) {
1174                         unsigned n = idx * 4 + i;
1175                         dst[i] = b->inputs[n];
1176                 }
1177                 break;
1178         case nir_intrinsic_load_input_indirect:
1179                 src = get_src(ctx, &intr->src[0]);
1180                 struct ir3_instruction *collect =
1181                                 create_collect(b, b->inputs, b->ninputs);
1182                 struct ir3_instruction *addr = get_addr(ctx, src[0]);
1183                 for (int i = 0; i < intr->num_components; i++) {
1184                         unsigned n = idx * 4 + i;
1185                         dst[i] = create_indirect_load(ctx, b->ninputs, n, addr, collect);
1186                 }
1187                 break;
1188         case nir_intrinsic_load_var:
1189                 emit_intrinisic_load_var(ctx, intr, dst);
1190                 break;
1191         case nir_intrinsic_store_var:
1192                 emit_intrinisic_store_var(ctx, intr);
1193                 break;
1194         case nir_intrinsic_store_output:
1195                 src = get_src(ctx, &intr->src[0]);
1196                 for (int i = 0; i < intr->num_components; i++) {
1197                         unsigned n = idx * 4 + i;
1198                         b->outputs[n] = src[i];
1199                 }
1200                 break;
1201         case nir_intrinsic_load_base_vertex:
1202                 if (!ctx->basevertex) {
1203                         /* first four vec4 sysval's reserved for UBOs: */
1204                         unsigned r = regid(ctx->so->first_driver_param + 4, 0);
1205                         ctx->basevertex = create_uniform(ctx, r);
1206                         add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
1207                                         ctx->basevertex);
1208                 }
1209                 dst[0] = ctx->basevertex;
1210                 break;
1211         case nir_intrinsic_load_vertex_id_zero_base:
1212                 if (!ctx->vertex_id) {
1213                         ctx->vertex_id = create_input(ctx->block, NULL, 0);
1214                         add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
1215                                         ctx->vertex_id);
1216                 }
1217                 dst[0] = ctx->vertex_id;
1218                 break;
1219         case nir_intrinsic_load_instance_id:
1220                 if (!ctx->instance_id) {
1221                         ctx->instance_id = create_input(ctx->block, NULL, 0);
1222                         add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
1223                                         ctx->instance_id);
1224                 }
1225                 dst[0] = ctx->instance_id;
1226                 break;
1227         case nir_intrinsic_discard_if:
1228         case nir_intrinsic_discard: {
1229                 struct ir3_instruction *cond, *kill;
1230
1231                 if (intr->intrinsic == nir_intrinsic_discard_if) {
1232                         /* conditional discard: */
1233                         src = get_src(ctx, &intr->src[0]);
1234                         cond = ir3_b2n(b, src[0]);
1235                 } else {
1236                         /* unconditional discard: */
1237                         cond = create_immed(b, 1);
1238                 }
1239
1240                 cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
1241                 cond->cat2.condition = IR3_COND_NE;
1242
1243                 /* condition always goes in predicate register: */
1244                 cond->regs[0]->num = regid(REG_P0, 0);
1245
1246                 kill = ir3_KILL(b, cond, 0);
1247                 array_insert(ctx->ir->predicates, kill);
1248
1249                 ctx->kill[ctx->kill_count++] = kill;
1250                 ctx->so->has_kill = true;
1251
1252                 break;
1253         }
1254         default:
1255                 compile_error(ctx, "Unhandled intrinsic type: %s\n",
1256                                 nir_intrinsic_infos[intr->intrinsic].name);
1257                 break;
1258         }
1259 }
1260
1261 static void
1262 emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
1263 {
1264         struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
1265                         instr->def.num_components);
1266         for (int i = 0; i < instr->def.num_components; i++)
1267                 dst[i] = create_immed(ctx->block, instr->value.u[i]);
1268 }
1269
1270 static void
1271 emit_undef(struct ir3_compile *ctx, nir_ssa_undef_instr *undef)
1272 {
1273         struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
1274                         undef->def.num_components);
1275         /* backend doesn't want undefined instructions, so just plug
1276          * in 0.0..
1277          */
1278         for (int i = 0; i < undef->def.num_components; i++)
1279                 dst[i] = create_immed(ctx->block, fui(0.0));
1280 }
1281
1282 /*
1283  * texture fetch/sample instructions:
1284  */
1285
1286 static void
1287 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
1288 {
1289         unsigned coords, flags = 0;
1290
1291         /* note: would use tex->coord_components.. except txs.. also,
1292          * since array index goes after shadow ref, we don't want to
1293          * count it:
1294          */
1295         switch (tex->sampler_dim) {
1296         case GLSL_SAMPLER_DIM_1D:
1297         case GLSL_SAMPLER_DIM_BUF:
1298                 coords = 1;
1299                 break;
1300         case GLSL_SAMPLER_DIM_2D:
1301         case GLSL_SAMPLER_DIM_RECT:
1302         case GLSL_SAMPLER_DIM_EXTERNAL:
1303         case GLSL_SAMPLER_DIM_MS:
1304                 coords = 2;
1305                 break;
1306         case GLSL_SAMPLER_DIM_3D:
1307         case GLSL_SAMPLER_DIM_CUBE:
1308                 coords = 3;
1309                 flags |= IR3_INSTR_3D;
1310                 break;
1311         default:
1312                 unreachable("bad sampler_dim");
1313         }
1314
1315         if (tex->is_shadow)
1316                 flags |= IR3_INSTR_S;
1317
1318         if (tex->is_array)
1319                 flags |= IR3_INSTR_A;
1320
1321         *flagsp = flags;
1322         *coordsp = coords;
1323 }
1324
1325 static void
1326 emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
1327 {
1328         struct ir3_block *b = ctx->block;
1329         struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
1330         struct ir3_instruction **coord, *lod, *compare, *proj, **off, **ddx, **ddy;
1331         bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
1332         unsigned i, coords, flags;
1333         unsigned nsrc0 = 0, nsrc1 = 0;
1334         type_t type;
1335         opc_t opc = 0;
1336
1337         coord = off = ddx = ddy = NULL;
1338         lod = proj = compare = NULL;
1339
1340         /* TODO: might just be one component for gathers? */
1341         dst = get_dst(ctx, &tex->dest, 4);
1342
1343         for (unsigned i = 0; i < tex->num_srcs; i++) {
1344                 switch (tex->src[i].src_type) {
1345                 case nir_tex_src_coord:
1346                         coord = get_src(ctx, &tex->src[i].src);
1347                         break;
1348                 case nir_tex_src_bias:
1349                         lod = get_src(ctx, &tex->src[i].src)[0];
1350                         has_bias = true;
1351                         break;
1352                 case nir_tex_src_lod:
1353                         lod = get_src(ctx, &tex->src[i].src)[0];
1354                         has_lod = true;
1355                         break;
1356                 case nir_tex_src_comparitor: /* shadow comparator */
1357                         compare = get_src(ctx, &tex->src[i].src)[0];
1358                         break;
1359                 case nir_tex_src_projector:
1360                         proj = get_src(ctx, &tex->src[i].src)[0];
1361                         has_proj = true;
1362                         break;
1363                 case nir_tex_src_offset:
1364                         off = get_src(ctx, &tex->src[i].src);
1365                         has_off = true;
1366                         break;
1367                 case nir_tex_src_ddx:
1368                         ddx = get_src(ctx, &tex->src[i].src);
1369                         break;
1370                 case nir_tex_src_ddy:
1371                         ddy = get_src(ctx, &tex->src[i].src);
1372                         break;
1373                 default:
1374                         compile_error(ctx, "Unhandled NIR tex serc type: %d\n",
1375                                         tex->src[i].src_type);
1376                         return;
1377                 }
1378         }
1379
1380         switch (tex->op) {
1381         case nir_texop_tex:      opc = OPC_SAM;      break;
1382         case nir_texop_txb:      opc = OPC_SAMB;     break;
1383         case nir_texop_txl:      opc = OPC_SAML;     break;
1384         case nir_texop_txd:      opc = OPC_SAMGQ;    break;
1385         case nir_texop_txf:      opc = OPC_ISAML;    break;
1386         case nir_texop_txf_ms:
1387         case nir_texop_txs:
1388         case nir_texop_lod:
1389         case nir_texop_tg4:
1390         case nir_texop_query_levels:
1391                 compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
1392                 return;
1393         }
1394
1395         tex_info(tex, &flags, &coords);
1396
1397         /* scale up integer coords for TXF based on the LOD */
1398         if (opc == OPC_ISAML) {
1399                 assert(has_lod);
1400                 for (i = 0; i < coords; i++)
1401                         coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
1402         }
1403         /*
1404          * lay out the first argument in the proper order:
1405          *  - actual coordinates first
1406          *  - shadow reference
1407          *  - array index
1408          *  - projection w
1409          *  - starting at offset 4, dpdx.xy, dpdy.xy
1410          *
1411          * bias/lod go into the second arg
1412          */
1413
1414         /* insert tex coords: */
1415         for (i = 0; i < coords; i++)
1416                 src0[nsrc0++] = coord[i];
1417
1418         if (coords == 1) {
1419                 /* hw doesn't do 1d, so we treat it as 2d with
1420                  * height of 1, and patch up the y coord.
1421                  * TODO: y coord should be (int)0 in some cases..
1422                  */
1423                 src0[nsrc0++] = create_immed(b, fui(0.5));
1424         }
1425
1426         if (tex->is_shadow)
1427                 src0[nsrc0++] = compare;
1428
1429         if (tex->is_array)
1430                 src0[nsrc0++] = coord[coords];
1431
1432         if (has_proj) {
1433                 src0[nsrc0++] = proj;
1434                 flags |= IR3_INSTR_P;
1435         }
1436
1437         /* pad to 4, then ddx/ddy: */
1438         if (tex->op == nir_texop_txd) {
1439                 while (nsrc0 < 4)
1440                         src0[nsrc0++] = create_immed(b, fui(0.0));
1441                 for (i = 0; i < coords; i++)
1442                         src0[nsrc0++] = ddx[i];
1443                 if (coords < 2)
1444                         src0[nsrc0++] = create_immed(b, fui(0.0));
1445                 for (i = 0; i < coords; i++)
1446                         src0[nsrc0++] = ddy[i];
1447                 if (coords < 2)
1448                         src0[nsrc0++] = create_immed(b, fui(0.0));
1449         }
1450
1451         /*
1452          * second argument (if applicable):
1453          *  - offsets
1454          *  - lod
1455          *  - bias
1456          */
1457         if (has_off | has_lod | has_bias) {
1458                 if (has_off) {
1459                         for (i = 0; i < coords; i++)
1460                                 src1[nsrc1++] = off[i];
1461                         if (coords < 2)
1462                                 src1[nsrc1++] = create_immed(b, fui(0.0));
1463                         flags |= IR3_INSTR_O;
1464                 }
1465
1466                 if (has_lod | has_bias)
1467                         src1[nsrc1++] = lod;
1468         }
1469
1470         switch (tex->dest_type) {
1471         case nir_type_invalid:
1472         case nir_type_float:
1473                 type = TYPE_F32;
1474                 break;
1475         case nir_type_int:
1476                 type = TYPE_S32;
1477                 break;
1478         case nir_type_unsigned:
1479         case nir_type_bool:
1480                 type = TYPE_U32;
1481                 break;
1482         default:
1483                 unreachable("bad dest_type");
1484         }
1485
1486         sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
1487                         flags, tex->sampler_index, tex->sampler_index,
1488                         create_collect(b, src0, nsrc0),
1489                         create_collect(b, src1, nsrc1));
1490
1491         split_dest(b, dst, sam);
1492 }
1493
1494 static void
1495 emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
1496 {
1497         struct ir3_block *b = ctx->block;
1498         struct ir3_instruction **dst, *sam;
1499
1500         dst = get_dst(ctx, &tex->dest, 1);
1501
1502         sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
1503                         tex->sampler_index, tex->sampler_index, NULL, NULL);
1504
1505         /* even though there is only one component, since it ends
1506          * up in .z rather than .x, we need a split_dest()
1507          */
1508         split_dest(b, dst, sam);
1509
1510         /* The # of levels comes from getinfo.z. We need to add 1 to it, since
1511          * the value in TEX_CONST_0 is zero-based.
1512          */
1513         if (ctx->levels_add_one)
1514                 dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
1515 }
1516
1517 static void
1518 emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
1519 {
1520         struct ir3_block *b = ctx->block;
1521         struct ir3_instruction **dst, *sam, *lod;
1522         unsigned flags, coords;
1523
1524         tex_info(tex, &flags, &coords);
1525
1526         dst = get_dst(ctx, &tex->dest, 4);
1527
1528         compile_assert(ctx, tex->num_srcs == 1);
1529         compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
1530
1531         lod = get_src(ctx, &tex->src[0].src)[0];
1532
1533         sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
1534                         tex->sampler_index, tex->sampler_index, lod, NULL);
1535
1536         split_dest(b, dst, sam);
1537
1538         /* Array size actually ends up in .w rather than .z. This doesn't
1539          * matter for miplevel 0, but for higher mips the value in z is
1540          * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1541          * returned, which means that we have to add 1 to it for arrays.
1542          */
1543         if (tex->is_array) {
1544                 if (ctx->levels_add_one) {
1545                         dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
1546                 } else {
1547                         dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
1548                 }
1549         }
1550 }
1551
1552 static void
1553 emit_instr(struct ir3_compile *ctx, nir_instr *instr)
1554 {
1555         switch (instr->type) {
1556         case nir_instr_type_alu:
1557                 emit_alu(ctx, nir_instr_as_alu(instr));
1558                 break;
1559         case nir_instr_type_intrinsic:
1560                 emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
1561                 break;
1562         case nir_instr_type_load_const:
1563                 emit_load_const(ctx, nir_instr_as_load_const(instr));
1564                 break;
1565         case nir_instr_type_ssa_undef:
1566                 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
1567                 break;
1568         case nir_instr_type_tex: {
1569                 nir_tex_instr *tex = nir_instr_as_tex(instr);
1570                 /* couple tex instructions get special-cased:
1571                  */
1572                 switch (tex->op) {
1573                 case nir_texop_txs:
1574                         emit_tex_txs(ctx, tex);
1575                         break;
1576                 case nir_texop_query_levels:
1577                         emit_tex_query_levels(ctx, tex);
1578                         break;
1579                 default:
1580                         emit_tex(ctx, tex);
1581                         break;
1582                 }
1583                 break;
1584         }
1585         case nir_instr_type_call:
1586         case nir_instr_type_jump:
1587         case nir_instr_type_phi:
1588         case nir_instr_type_parallel_copy:
1589                 compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
1590                 break;
1591         }
1592 }
1593
1594 static void
1595 emit_block(struct ir3_compile *ctx, nir_block *block)
1596 {
1597         nir_foreach_instr(block, instr) {
1598                 emit_instr(ctx, instr);
1599                 if (ctx->error)
1600                         return;
1601         }
1602 }
1603
1604 static void
1605 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
1606 {
1607         foreach_list_typed(nir_cf_node, node, node, &impl->body) {
1608                 switch (node->type) {
1609                 case nir_cf_node_block:
1610                         emit_block(ctx, nir_cf_node_as_block(node));
1611                         break;
1612                 case nir_cf_node_if:
1613                 case nir_cf_node_loop:
1614                 case nir_cf_node_function:
1615                         compile_error(ctx, "TODO\n");
1616                         break;
1617                 }
1618                 if (ctx->error)
1619                         return;
1620         }
1621 }
1622
1623 static void
1624 setup_input(struct ir3_compile *ctx, nir_variable *in)
1625 {
1626         struct ir3_shader_variant *so = ctx->so;
1627         unsigned array_len = MAX2(glsl_get_length(in->type), 1);
1628         unsigned ncomp = glsl_get_components(in->type);
1629         /* XXX: map loc slots to semantics */
1630         unsigned semantic_name = in->data.location;
1631         unsigned semantic_index = in->data.index;
1632         unsigned n = in->data.driver_location;
1633
1634         DBG("; in: %u:%u, len=%ux%u, loc=%u\n",
1635                         semantic_name, semantic_index, array_len,
1636                         ncomp, n);
1637
1638         so->inputs[n].semantic =
1639                         ir3_semantic_name(semantic_name, semantic_index);
1640         so->inputs[n].compmask = (1 << ncomp) - 1;
1641         so->inputs[n].inloc = ctx->next_inloc;
1642         so->inputs[n].interpolate = 0;
1643         so->inputs_count = MAX2(so->inputs_count, n + 1);
1644
1645         /* the fdN_program_emit() code expects tgsi consts here, so map
1646          * things back to tgsi for now:
1647          */
1648         switch (in->data.interpolation) {
1649         case INTERP_QUALIFIER_FLAT:
1650                 so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
1651                 break;
1652         case INTERP_QUALIFIER_NOPERSPECTIVE:
1653                 so->inputs[n].interpolate = TGSI_INTERPOLATE_LINEAR;
1654                 break;
1655         case INTERP_QUALIFIER_SMOOTH:
1656                 so->inputs[n].interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
1657                 break;
1658         }
1659
1660         for (int i = 0; i < ncomp; i++) {
1661                 struct ir3_instruction *instr = NULL;
1662                 unsigned idx = (n * 4) + i;
1663
1664                 if (ctx->so->type == SHADER_FRAGMENT) {
1665                         if (semantic_name == TGSI_SEMANTIC_POSITION) {
1666                                 so->inputs[n].bary = false;
1667                                 so->frag_coord = true;
1668                                 instr = create_frag_coord(ctx, i);
1669                         } else if (semantic_name == TGSI_SEMANTIC_FACE) {
1670                                 so->inputs[n].bary = false;
1671                                 so->frag_face = true;
1672                                 instr = create_frag_face(ctx, i);
1673                         } else {
1674                                 bool use_ldlv = false;
1675
1676                                 /* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
1677                                  * from the semantic name:
1678                                  */
1679                                 if ((in->data.interpolation == INTERP_QUALIFIER_NONE) &&
1680                                                 ((semantic_name == TGSI_SEMANTIC_COLOR) ||
1681                                                         (semantic_name == TGSI_SEMANTIC_BCOLOR)))
1682                                         so->inputs[n].interpolate = TGSI_INTERPOLATE_COLOR;
1683
1684                                 if (ctx->flat_bypass) {
1685                                         /* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
1686                                          * from the semantic name:
1687                                          */
1688                                         switch (so->inputs[n].interpolate) {
1689                                         case TGSI_INTERPOLATE_COLOR:
1690                                                 if (!ctx->so->key.rasterflat)
1691                                                         break;
1692                                                 /* fallthrough */
1693                                         case TGSI_INTERPOLATE_CONSTANT:
1694                                                 use_ldlv = true;
1695                                                 break;
1696                                         }
1697                                 }
1698
1699                                 so->inputs[n].bary = true;
1700
1701                                 instr = create_frag_input(ctx,
1702                                                 so->inputs[n].inloc + i - 8, use_ldlv);
1703                         }
1704                 } else {
1705                         instr = create_input(ctx->block, NULL, idx);
1706                 }
1707
1708                 ctx->block->inputs[idx] = instr;
1709         }
1710
1711         if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
1712                 ctx->next_inloc += ncomp;
1713                 so->total_in += ncomp;
1714         }
1715 }
1716
1717 static void
1718 setup_output(struct ir3_compile *ctx, nir_variable *out)
1719 {
1720         struct ir3_shader_variant *so = ctx->so;
1721         unsigned array_len = MAX2(glsl_get_length(out->type), 1);
1722         unsigned ncomp = glsl_get_components(out->type);
1723         /* XXX: map loc slots to semantics */
1724         unsigned semantic_name = out->data.location;
1725         unsigned semantic_index = out->data.index;
1726         unsigned n = out->data.driver_location;
1727         unsigned comp = 0;
1728
1729         DBG("; out: %u:%u, len=%ux%u, loc=%u\n",
1730                         semantic_name, semantic_index, array_len,
1731                         ncomp, n);
1732
1733         if (ctx->so->type == SHADER_VERTEX) {
1734                 switch (semantic_name) {
1735                 case TGSI_SEMANTIC_POSITION:
1736                         so->writes_pos = true;
1737                         break;
1738                 case TGSI_SEMANTIC_PSIZE:
1739                         so->writes_psize = true;
1740                         break;
1741                 case TGSI_SEMANTIC_COLOR:
1742                 case TGSI_SEMANTIC_BCOLOR:
1743                 case TGSI_SEMANTIC_GENERIC:
1744                 case TGSI_SEMANTIC_FOG:
1745                 case TGSI_SEMANTIC_TEXCOORD:
1746                         break;
1747                 default:
1748                         compile_error(ctx, "unknown VS semantic name: %s\n",
1749                                         tgsi_semantic_names[semantic_name]);
1750                 }
1751         } else {
1752                 switch (semantic_name) {
1753                 case TGSI_SEMANTIC_POSITION:
1754                         comp = 2;  /* tgsi will write to .z component */
1755                         so->writes_pos = true;
1756                         break;
1757                 case TGSI_SEMANTIC_COLOR:
1758                         break;
1759                 default:
1760                         compile_error(ctx, "unknown FS semantic name: %s\n",
1761                                         tgsi_semantic_names[semantic_name]);
1762                 }
1763         }
1764
1765         compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
1766
1767         so->outputs[n].semantic =
1768                         ir3_semantic_name(semantic_name, semantic_index);
1769         so->outputs[n].regid = regid(n, comp);
1770         so->outputs_count = MAX2(so->outputs_count, n + 1);
1771
1772         for (int i = 0; i < ncomp; i++) {
1773                 unsigned idx = (n * 4) + i;
1774
1775                 ctx->block->outputs[idx] = create_immed(ctx->block, fui(0.0));
1776         }
1777 }
1778
1779 static void
1780 emit_instructions(struct ir3_compile *ctx)
1781 {
1782         unsigned ninputs  = exec_list_length(&ctx->s->inputs) * 4;
1783         unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;
1784
1785         /* we need to allocate big enough outputs array so that
1786          * we can stuff the kill's at the end.  Likewise for vtx
1787          * shaders, we need to leave room for sysvals:
1788          */
1789         if (ctx->so->type == SHADER_FRAGMENT) {
1790                 noutputs += ARRAY_SIZE(ctx->kill);
1791         } else if (ctx->so->type == SHADER_VERTEX) {
1792                 ninputs += 8;
1793         }
1794
1795         ctx->block = ir3_block_create(ctx->ir, 0, ninputs, noutputs);
1796
1797         if (ctx->so->type == SHADER_FRAGMENT) {
1798                 ctx->block->noutputs -= ARRAY_SIZE(ctx->kill);
1799         } else if (ctx->so->type == SHADER_VERTEX) {
1800                 ctx->block->ninputs -= 8;
1801         }
1802
1803         /* for fragment shader, we have a single input register (usually
1804          * r0.xy) which is used as the base for bary.f varying fetch instrs:
1805          */
1806         if (ctx->so->type == SHADER_FRAGMENT) {
1807                 // TODO maybe a helper for fi since we need it a few places..
1808                 struct ir3_instruction *instr;
1809                 instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
1810                 ir3_reg_create(instr, 0, 0);
1811                 ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
1812                 ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
1813                 ctx->frag_pos = instr;
1814         }
1815
1816         /* Setup inputs: */
1817         foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
1818                 setup_input(ctx, var);
1819         }
1820
1821         /* Setup outputs: */
1822         foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
1823                 setup_output(ctx, var);
1824         }
1825
1826         /* Setup variables (which should only be arrays): */
1827         foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
1828                 declare_var(ctx, var);
1829         }
1830
1831         /* Find the main function and emit the body: */
1832         nir_foreach_overload(ctx->s, overload) {
1833                 compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
1834                 compile_assert(ctx, overload->impl);
1835                 emit_function(ctx, overload->impl);
1836                 if (ctx->error)
1837                         return;
1838         }
1839 }
1840
1841 /* from NIR perspective, we actually have inputs.  But most of the "inputs"
1842  * for a fragment shader are just bary.f instructions.  The *actual* inputs
1843  * from the hw perspective are the frag_pos and optionally frag_coord and
1844  * frag_face.
1845  */
1846 static void
1847 fixup_frag_inputs(struct ir3_compile *ctx)
1848 {
1849         struct ir3_shader_variant *so = ctx->so;
1850         struct ir3_block *block = ctx->block;
1851         struct ir3_instruction **inputs;
1852         struct ir3_instruction *instr;
1853         int n, regid = 0;
1854
1855         block->ninputs = 0;
1856
1857         n  = 4;  /* always have frag_pos */
1858         n += COND(so->frag_face, 4);
1859         n += COND(so->frag_coord, 4);
1860
1861         inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
1862
1863         if (so->frag_face) {
1864                 /* this ultimately gets assigned to hr0.x so doesn't conflict
1865                  * with frag_coord/frag_pos..
1866                  */
1867                 inputs[block->ninputs++] = ctx->frag_face;
1868                 ctx->frag_face->regs[0]->num = 0;
1869
1870                 /* remaining channels not used, but let's avoid confusing
1871                  * other parts that expect inputs to come in groups of vec4
1872                  */
1873                 inputs[block->ninputs++] = NULL;
1874                 inputs[block->ninputs++] = NULL;
1875                 inputs[block->ninputs++] = NULL;
1876         }
1877
1878         /* since we don't know where to set the regid for frag_coord,
1879          * we have to use r0.x for it.  But we don't want to *always*
1880          * use r1.x for frag_pos as that could increase the register
1881          * footprint on simple shaders:
1882          */
1883         if (so->frag_coord) {
1884                 ctx->frag_coord[0]->regs[0]->num = regid++;
1885                 ctx->frag_coord[1]->regs[0]->num = regid++;
1886                 ctx->frag_coord[2]->regs[0]->num = regid++;
1887                 ctx->frag_coord[3]->regs[0]->num = regid++;
1888
1889                 inputs[block->ninputs++] = ctx->frag_coord[0];
1890                 inputs[block->ninputs++] = ctx->frag_coord[1];
1891                 inputs[block->ninputs++] = ctx->frag_coord[2];
1892                 inputs[block->ninputs++] = ctx->frag_coord[3];
1893         }
1894
1895         /* we always have frag_pos: */
1896         so->pos_regid = regid;
1897
1898         /* r0.x */
1899         instr = create_input(block, NULL, block->ninputs);
1900         instr->regs[0]->num = regid++;
1901         inputs[block->ninputs++] = instr;
1902         ctx->frag_pos->regs[1]->instr = instr;
1903
1904         /* r0.y */
1905         instr = create_input(block, NULL, block->ninputs);
1906         instr->regs[0]->num = regid++;
1907         inputs[block->ninputs++] = instr;
1908         ctx->frag_pos->regs[2]->instr = instr;
1909
1910         block->inputs = inputs;
1911 }
1912
1913 int
1914 ir3_compile_shader_nir(struct ir3_compiler *compiler,
1915                 struct ir3_shader_variant *so,
1916                 const struct tgsi_token *tokens,
1917                 struct ir3_shader_key key)
1918 {
1919         struct ir3_compile *ctx;
1920         struct ir3_block *block;
1921         struct ir3_instruction **inputs;
1922         unsigned i, j, actual_in;
1923         int ret = 0, max_bary;
1924
1925         assert(!so->ir);
1926
1927         so->ir = ir3_create(compiler);
1928
1929         assert(so->ir);
1930
1931         ctx = compile_init(so, tokens);
1932         if (!ctx) {
1933                 DBG("INIT failed!");
1934                 ret = -1;
1935                 goto out;
1936         }
1937
1938         emit_instructions(ctx);
1939
1940         if (ctx->error) {
1941                 DBG("EMIT failed!");
1942                 ret = -1;
1943                 goto out;
1944         }
1945
1946         block = ctx->block;
1947         so->ir->block = block;
1948
1949         /* keep track of the inputs from TGSI perspective.. */
1950         inputs = block->inputs;
1951
1952         /* but fixup actual inputs for frag shader: */
1953         if (so->type == SHADER_FRAGMENT)
1954                 fixup_frag_inputs(ctx);
1955
1956         /* at this point, for binning pass, throw away unneeded outputs: */
1957         if (key.binning_pass) {
1958                 for (i = 0, j = 0; i < so->outputs_count; i++) {
1959                         unsigned name = sem2name(so->outputs[i].semantic);
1960                         unsigned idx = sem2idx(so->outputs[i].semantic);
1961
1962                         /* throw away everything but first position/psize */
1963                         if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
1964                                         (name == TGSI_SEMANTIC_PSIZE))) {
1965                                 if (i != j) {
1966                                         so->outputs[j] = so->outputs[i];
1967                                         block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
1968                                         block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
1969                                         block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
1970                                         block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
1971                                 }
1972                                 j++;
1973                         }
1974                 }
1975                 so->outputs_count = j;
1976                 block->noutputs = j * 4;
1977         }
1978
1979         /* if we want half-precision outputs, mark the output registers
1980          * as half:
1981          */
1982         if (key.half_precision) {
1983                 for (i = 0; i < block->noutputs; i++) {
1984                         if (!block->outputs[i])
1985                                 continue;
1986                         block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
1987                 }
1988         }
1989
1990         /* at this point, we want the kill's in the outputs array too,
1991          * so that they get scheduled (since they have no dst).. we've
1992          * already ensured that the array is big enough in push_block():
1993          */
1994         if (so->type == SHADER_FRAGMENT) {
1995                 for (i = 0; i < ctx->kill_count; i++)
1996                         block->outputs[block->noutputs++] = ctx->kill[i];
1997         }
1998
1999         if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2000                 printf("BEFORE CP:\n");
2001                 ir3_print(so->ir);
2002         }
2003
2004         ir3_block_depth(block);
2005
2006         ir3_block_cp(block);
2007
2008         if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2009                 printf("BEFORE GROUPING:\n");
2010                 ir3_print(so->ir);
2011         }
2012
2013         /* Group left/right neighbors, inserting mov's where needed to
2014          * solve conflicts:
2015          */
2016         ir3_block_group(block);
2017
2018         ir3_block_depth(block);
2019
2020         if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2021                 printf("AFTER DEPTH:\n");
2022                 ir3_print(so->ir);
2023         }
2024
2025         ret = ir3_block_sched(block);
2026         if (ret) {
2027                 DBG("SCHED failed!");
2028                 goto out;
2029         }
2030
2031         if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2032                 printf("AFTER SCHED:\n");
2033                 ir3_print(so->ir);
2034         }
2035
2036         ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
2037         if (ret) {
2038                 DBG("RA failed!");
2039                 goto out;
2040         }
2041
2042         if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2043                 printf("AFTER RA:\n");
2044                 ir3_print(so->ir);
2045         }
2046
2047         ir3_block_legalize(block, &so->has_samp, &max_bary);
2048
2049         /* fixup input/outputs: */
2050         for (i = 0; i < so->outputs_count; i++) {
2051                 so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
2052                 /* preserve hack for depth output.. tgsi writes depth to .z,
2053                  * but what we give the hw is the scalar register:
2054                  */
2055                 if ((so->type == SHADER_FRAGMENT) &&
2056                         (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
2057                         so->outputs[i].regid += 2;
2058         }
2059
2060         /* Note that some or all channels of an input may be unused: */
2061         actual_in = 0;
2062         for (i = 0; i < so->inputs_count; i++) {
2063                 unsigned j, regid = ~0, compmask = 0;
2064                 so->inputs[i].ncomp = 0;
2065                 for (j = 0; j < 4; j++) {
2066                         struct ir3_instruction *in = inputs[(i*4) + j];
2067                         if (in) {
2068                                 compmask |= (1 << j);
2069                                 regid = in->regs[0]->num - j;
2070                                 actual_in++;
2071                                 so->inputs[i].ncomp++;
2072                         }
2073                 }
2074                 so->inputs[i].regid = regid;
2075                 so->inputs[i].compmask = compmask;
2076         }
2077
2078         /* fragment shader always gets full vec4's even if it doesn't
2079          * fetch all components, but vertex shader we need to update
2080          * with the actual number of components fetch, otherwise thing
2081          * will hang due to mismaptch between VFD_DECODE's and
2082          * TOTALATTRTOVS
2083          */
2084         if (so->type == SHADER_VERTEX)
2085                 so->total_in = actual_in;
2086         else
2087                 so->total_in = align(max_bary + 1, 4);
2088
2089 out:
2090         if (ret) {
2091                 ir3_destroy(so->ir);
2092                 so->ir = NULL;
2093         }
2094         compile_free(ctx);
2095
2096         return ret;
2097 }