freedreno/a3xx: disable early-z when we have kill's
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include <stdarg.h>

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "tgsi/tgsi_lowering.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_scan.h"

#include "freedreno_util.h"

#include "ir3_compiler.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"

struct ir3_compile_context {
	const struct tgsi_token *tokens;
	bool free_tokens;
	struct ir3 *ir;
	struct ir3_shader_variant *so;

	struct ir3_block *block;
	struct ir3_instruction *current_instr;

	/* we need to defer updates to block->outputs[] until the end
	 * of an instruction (so we don't see the new value until *after*
	 * the src registers are processed)
	 */
	struct {
		struct ir3_instruction *instr, **instrp;
	} output_updates[16];
	unsigned num_output_updates;

	/* are we in a sequence of "atomic" instructions?
	 */
	bool atomic;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is the r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * a value that can be used directly for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	struct tgsi_parse_context parser;
	unsigned type;

	struct tgsi_shader_info info;

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	unsigned num_internal_temps;
	struct tgsi_src_register internal_temps[8];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	/* stack of branch instructions that mark (potentially nested)
	 * branch if/else/loop/etc
	 */
	struct {
		struct ir3_instruction *instr, *cond;
		bool inv;   /* true iff in else leg of branch */
	} branch[16];
	unsigned int branch_count;

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* used when dst is same as one of the src, to avoid overwriting a
	 * src element before the remaining scalar instructions that make
	 * up the vector operation
	 */
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	/* just for catching incorrect use of get_dst()/put_dst():
	 */
	bool using_tmp_dst;
};


static void vectorize(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
		int nsrcs, ...);
static void create_mov(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
static type_t get_ftype(struct ir3_compile_context *ctx);

static unsigned
compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
		const struct tgsi_token *tokens)
{
	unsigned ret;
	struct tgsi_shader_info *info = &ctx->info;
	struct tgsi_lowering_config lconfig = {
			.color_two_side = so->key.color_two_side,
			.lower_DST = true,
			.lower_XPD = true,
			.lower_SCS = true,
			.lower_LRP = true,
			.lower_FRC = true,
			.lower_POW = true,
			.lower_LIT = true,
			.lower_EXP = true,
			.lower_LOG = true,
			.lower_DP4 = true,
			.lower_DP3 = true,
			.lower_DPH = true,
			.lower_DP2 = true,
			.lower_DP2A = true,
	};

	switch (so->type) {
	case SHADER_FRAGMENT:
	case SHADER_COMPUTE:
		lconfig.saturate_s = so->key.fsaturate_s;
		lconfig.saturate_t = so->key.fsaturate_t;
		lconfig.saturate_r = so->key.fsaturate_r;
		break;
	case SHADER_VERTEX:
		lconfig.saturate_s = so->key.vsaturate_s;
		lconfig.saturate_t = so->key.vsaturate_t;
		lconfig.saturate_r = so->key.vsaturate_r;
		break;
	}

	ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
	ctx->free_tokens = !!ctx->tokens;
	if (!ctx->tokens) {
		/* no lowering */
		ctx->tokens = tokens;
	}
	ctx->ir = so->ir;
	ctx->so = so;
	ctx->next_inloc = 8;
	ctx->num_internal_temps = 0;
	ctx->branch_count = 0;
	ctx->kill_count = 0;
	ctx->block = NULL;
	ctx->current_instr = NULL;
	ctx->num_output_updates = 0;
	ctx->atomic = false;
	ctx->frag_pos = NULL;
	ctx->frag_face = NULL;
	ctx->tmp_src = NULL;
	ctx->using_tmp_dst = false;

	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));

#define FM(x) (1 << TGSI_FILE_##x)
	/* optimize can't deal with relative addressing: */
	if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
		return TGSI_PARSE_ERROR;

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	if (info->indirect_files & FM(CONSTANT))
		so->constlen = 4 * (ctx->info.file_max[TGSI_FILE_CONSTANT] + 1);

	/* Immediates go after constants: */
	so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);

	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;

	return ret;
}

static void
compile_error(struct ir3_compile_context *ctx, const char *format, ...)
{
	va_list ap;
	va_start(ap, format);
	_debug_vprintf(format, ap);
	va_end(ap);
	tgsi_dump(ctx->tokens, 0);
	debug_assert(0);
}

#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)

static void
compile_free(struct ir3_compile_context *ctx)
{
	if (ctx->free_tokens)
		free((void *)ctx->tokens);
	tgsi_parse_free(&ctx->parser);
}

struct instr_translater {
	void (*fxn)(const struct instr_translater *t,
			struct ir3_compile_context *ctx,
			struct tgsi_full_instruction *inst);
	unsigned tgsi_opc;
	opc_t opc;
	opc_t hopc;    /* opc to use for half_precision mode, if different */
	unsigned arg;
};

static void
instr_finish(struct ir3_compile_context *ctx)
{
	unsigned i;

	if (ctx->atomic)
		return;

	for (i = 0; i < ctx->num_output_updates; i++)
		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;

	ctx->num_output_updates = 0;
}
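
/* Illustrative example (not from the original source): for something
 * like "MOV TEMP[0].x, TEMP[0].x", the src is resolved while
 * block->temporaries[] still holds the previous writer of TEMP[0].x,
 * and only instr_finish() makes the mov itself visible as the new
 * value.  This is what keeps an instruction from consuming its own
 * result.
 */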

/* For "atomic" groups of instructions, for example the four scalar
 * instructions to perform a vec4 operation.  Basically this just
 * blocks out handling of output_updates so the next scalar instruction
 * still sees the result from before the start of the atomic group.
 *
 * NOTE: when used properly, this could probably replace get/put_dst()
 * stuff.
 */
static void
instr_atomic_start(struct ir3_compile_context *ctx)
{
	ctx->atomic = true;
}

static void
instr_atomic_end(struct ir3_compile_context *ctx)
{
	ctx->atomic = false;
	instr_finish(ctx);
}
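
/* Illustrative: vectorize() below brackets the per-channel clones of a
 * vec4 operation with instr_atomic_start()/instr_atomic_end(), so for
 * something like "ADD TEMP[0].xy, TEMP[0].yx, ..." the .y channel
 * still reads the pre-ADD value of TEMP[0].x even though the .x
 * channel's instruction was emitted first.
 */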

static struct ir3_instruction *
instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
}

static struct ir3_instruction *
instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_clone(instr));
}

static struct ir3_block *
push_block(struct ir3_compile_context *ctx)
{
	struct ir3_block *block;
	unsigned ntmp, nin, nout;

#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))

	/* hmm, give ourselves room to create 8 extra temporaries (vec4):
	 */
	ntmp = SCALAR_REGS(TEMPORARY);
	ntmp += 8 * 4;

	nout = SCALAR_REGS(OUTPUT);
	nin = SCALAR_REGS(INPUT);

	/* for the outermost block, 'inputs' are the actual shader INPUT
	 * register file.  Reads from INPUT registers always go back to
	 * the top block.  For nested blocks, 'inputs' is used to track any
	 * TEMPORARY file register from one of the enclosing blocks that
	 * is read in this block.
	 */
	if (!ctx->block) {
		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
		 * position)
		 */
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			int n = 2;
			if (ctx->info.reads_position)
				n += 4;
			if (ctx->info.uses_frontface)
				n += 4;
			nin = MAX2(n, nin);
			nout += ARRAY_SIZE(ctx->kill);
		}
	} else {
		nin = ntmp;
	}

	block = ir3_block_create(ctx->ir, ntmp, nin, nout);

	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
		block->noutputs -= ARRAY_SIZE(ctx->kill);

	block->parent = ctx->block;
	ctx->block = block;

	return block;
}

static void
pop_block(struct ir3_compile_context *ctx)
{
	ctx->block = ctx->block->parent;
	compile_assert(ctx, ctx->block);
}

static struct ir3_instruction *
create_output(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *out;

	out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
	out->inout.block = block;
	ir3_reg_create(out, n, 0);
	if (instr)
		ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;

	return out;
}

static struct ir3_instruction *
create_input(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *in;

	in = ir3_instr_create(block, -1, OPC_META_INPUT);
	in->inout.block = block;
	ir3_reg_create(in, n, 0);
	if (instr)
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;

	return in;
}

static struct ir3_instruction *
block_input(struct ir3_block *block, unsigned n)
{
	/* references to the INPUT register file always go back up to
	 * the top level:
	 */
	if (block->parent)
		return block_input(block->parent, n);
	return block->inputs[n];
}

/* return temporary in scope, creating a meta-input node
 * if needed to track block inputs
 */
static struct ir3_instruction *
block_temporary(struct ir3_block *block, unsigned n)
{
	/* for references to the TEMPORARY register file, find the nearest
	 * enclosing block which has already assigned this temporary,
	 * creating meta-input instructions along the way to keep
	 * track of block inputs
	 */
	if (block->parent && !block->temporaries[n]) {
		/* if already have input for this block, reuse: */
		if (!block->inputs[n])
			block->inputs[n] = block_temporary(block->parent, n);

		/* and create new input to return: */
		return create_input(block, block->inputs[n], n);
	}
	return block->temporaries[n];
}
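
/* Illustrative: a read of, say, TEMP[2].x from inside a nested
 * if-block walks up the parent chain to the block which last wrote
 * that temporary, recording a meta-input at each level along the way
 * to keep track of the values crossing each block boundary.
 */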

static struct ir3_instruction *
create_immed(struct ir3_compile_context *ctx, float val)
{
	/* NOTE: *don't* use instr_create() here!
	 */
	struct ir3_instruction *instr;
	instr = ir3_instr_create(ctx->block, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	ir3_reg_create(instr, 0, 0);
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
	return instr;
}

static void
ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	unsigned n = regid(dst->Index, chan);
	unsigned idx = ctx->num_output_updates;

	compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));

	/* NOTE: defer update of temporaries[idx] or output[idx]
	 * until instr_finish(), so that if the current instruction
	 * reads the same TEMP/OUT[] it gets the old value:
	 *
	 * bleh.. this might be a bit easier to just figure out
	 * in instr_finish().  But at that point we've already
	 * lost information about OUTPUT vs TEMPORARY register
	 * file..
	 */

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
		compile_assert(ctx, n < ctx->block->noutputs);
		ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_TEMPORARY:
		compile_assert(ctx, n < ctx->block->ntemporaries);
		ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_ADDRESS:
		compile_assert(ctx, n < 1);
		ctx->output_updates[idx].instrp = &ctx->block->address;
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	}
}

static void
ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
		const struct tgsi_src_register *src, unsigned chan)
{
	struct ir3_block *block = ctx->block;
	unsigned n = regid(src->Index, chan);

	switch (src->File) {
	case TGSI_FILE_INPUT:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_input(ctx->block, n);
		break;
	case TGSI_FILE_OUTPUT:
		/* really this should only happen in the case of
		 * 'MOV_SAT OUT[n], ..', for the clamp instructions
		 * which follow:
		 */
		reg->flags |= IR3_REG_SSA;
		reg->instr = block->outputs[n];
		/* we don't have to worry about a read from an OUTPUT that
		 * was assigned outside of the current block, because the
		 * _SAT clamp instructions will always be in the same block
		 * as the original instruction which wrote the OUTPUT
		 */
		compile_assert(ctx, reg->instr);
		break;
	case TGSI_FILE_TEMPORARY:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_temporary(ctx->block, n);
		break;
	}

	if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
		/* this can happen when registers (or components of a TGSI
		 * register) are used as src before they have been assigned
		 * (undefined contents).  To avoid confusing the rest of the
		 * compiler, and to generally keep things peachy, substitute
		 * an instruction that sets the src to 0.0.  Or to keep
		 * things undefined, I could plug in a random number? :-P
		 *
		 * NOTE: *don't* use instr_create() here!
		 */
		reg->instr = create_immed(ctx, 0.0);
	}
}

static struct ir3_register *
add_dst_reg_wrmask(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
		unsigned chan, unsigned wrmask)
{
	unsigned flags = 0, num = 0;
	struct ir3_register *reg;

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
	case TGSI_FILE_TEMPORARY:
		/* uses SSA */
		break;
	case TGSI_FILE_ADDRESS:
		flags |= IR3_REG_ADDR;
		/* uses SSA */
		break;
	default:
		compile_error(ctx, "unsupported dst register file: %s\n",
			tgsi_file_name(dst->File));
		break;
	}

	if (dst->Indirect)
		flags |= IR3_REG_RELATIV;

	reg = ir3_reg_create(instr, regid(num, chan), flags);

	/* NOTE: do not call ssa_dst() if atomic.. vectorize()
	 * itself will call ssa_dst().  This is to filter out
	 * the (initially bogus) .x component dst which is
	 * created (but not necessarily used, ie. if the net
	 * result of the vector operation does not write to
	 * the .x component)
	 */

	reg->wrmask = wrmask;
	if (wrmask == 0x1) {
		/* normal case */
		if (!ctx->atomic)
			ssa_dst(ctx, instr, dst, chan);
	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
			(dst->File == TGSI_FILE_OUTPUT) ||
			(dst->File == TGSI_FILE_ADDRESS)) {
		unsigned i;

		/* if the instruction writes multiple components, we need
		 * to create some place-holders to collect the registers:
		 */
		for (i = 0; i < 4; i++) {
			if (wrmask & (1 << i)) {
				struct ir3_instruction *collect =
						ir3_instr_create(ctx->block, -1, OPC_META_FO);
				collect->fo.off = i;
				/* unused dst reg: */
				ir3_reg_create(collect, 0, 0);
				/* and src reg used to hold original instr */
				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
				if (!ctx->atomic)
					ssa_dst(ctx, collect, dst, chan+i);
			}
		}
	}

	return reg;
}

static struct ir3_register *
add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
}

static struct ir3_register *
add_src_reg_wrmask(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, const struct tgsi_src_register *src,
		unsigned chan, unsigned wrmask)
{
	unsigned flags = 0, num = 0;
	struct ir3_register *reg;
	struct ir3_instruction *orig = NULL;

	switch (src->File) {
	case TGSI_FILE_IMMEDIATE:
		/* TODO if possible, use actual immediate instead of const.. but
		 * TGSI has vec4 immediates, we can only embed scalar (of limited
		 * size, depending on instruction..)
		 */
		flags |= IR3_REG_CONST;
		num = src->Index + ctx->so->first_immediate;
		break;
	case TGSI_FILE_CONSTANT:
		flags |= IR3_REG_CONST;
		num = src->Index;
		break;
	case TGSI_FILE_OUTPUT:
		/* NOTE: we should only end up w/ OUTPUT file for things like
		 * clamp()'ing saturated dst instructions
		 */
	case TGSI_FILE_INPUT:
	case TGSI_FILE_TEMPORARY:
		/* uses SSA */
		break;
	default:
		compile_error(ctx, "unsupported src register file: %s\n",
			tgsi_file_name(src->File));
		break;
	}

	/* We seem to have 8 bits (6.2) for the dst register always, so I
	 * think it is safe to assume GPR cannot be >=64
	 *
	 * cat3 instructions only have 8 bits for src2, but cannot take a
	 * const for src2
	 *
	 * cat5 and cat6 in some cases only have 8 bits, but cannot take a
	 * const for any src.
	 *
	 * Other than that we seem to have 12 bits to encode const src,
	 * except for cat1 which may only have 11 bits (but that seems like
	 * a bug)
	 */
	if (flags & IR3_REG_CONST)
		compile_assert(ctx, src->Index < (1 << 9));
	else
		compile_assert(ctx, src->Index < (1 << 6));

	if (src->Absolute)
		flags |= IR3_REG_ABS;
	if (src->Negate)
		flags |= IR3_REG_NEGATE;

	if (src->Indirect) {
		flags |= IR3_REG_RELATIV;

		/* shouldn't happen, and we can't cope with it below: */
		compile_assert(ctx, wrmask == 0x1);

		/* wrap in a meta-deref to track both the src and address: */
		orig = instr;

		instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
	}

	reg = ir3_reg_create(instr, regid(num, chan), flags);

	reg->wrmask = wrmask;
	if (wrmask == 0x1) {
		/* normal case */
		ssa_src(ctx, reg, src, chan);
	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
			(src->File == TGSI_FILE_OUTPUT) ||
			(src->File == TGSI_FILE_INPUT)) {
		struct ir3_instruction *collect;
		unsigned i;

		compile_assert(ctx, !src->Indirect);

		/* if the instruction reads multiple components, we need
		 * to create a place-holder to collect the registers:
		 */
		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(collect, 0, 0); /* unused dst reg */

		for (i = 0; i < 4; i++) {
			if (wrmask & (1 << i)) {
				/* and src regs used to point to the original instr */
				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
						src, chan + i);
			} else if (wrmask & ~((1 << i) - 1)) {
				/* if any remaining components, then dummy
				 * placeholder src reg to fill in the blanks:
				 */
				ir3_reg_create(collect, 0, 0);
			}
		}

		reg->flags |= IR3_REG_SSA;
		reg->instr = collect;
	}

	if (src->Indirect) {
		reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
		reg->instr = instr;
	}
	return reg;
}

static struct ir3_register *
add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_src_register *src, unsigned chan)
{
	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
}

static void
src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
{
	src->File = dst->File;
	src->Indirect = dst->Indirect;
	src->Dimension = dst->Dimension;
	src->Index = dst->Index;
	src->Absolute = 0;
	src->Negate = 0;
	src->SwizzleX = TGSI_SWIZZLE_X;
	src->SwizzleY = TGSI_SWIZZLE_Y;
	src->SwizzleZ = TGSI_SWIZZLE_Z;
	src->SwizzleW = TGSI_SWIZZLE_W;
}

/* Get internal-temp src/dst to use for a sequence of instructions
 * generated by a single TGSI op.
 */
static struct tgsi_src_register *
get_internal_temp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst)
{
	struct tgsi_src_register *tmp_src;
	int n;

	tmp_dst->File = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
	tmp_src = &ctx->internal_temps[n];

	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;

	src_from_dst(tmp_src, tmp_dst);

	return tmp_src;
}

static inline bool
is_const(struct tgsi_src_register *src)
{
	return (src->File == TGSI_FILE_CONSTANT) ||
			(src->File == TGSI_FILE_IMMEDIATE);
}

static inline bool
is_relative(struct tgsi_src_register *src)
{
	return src->Indirect;
}

static inline bool
is_rel_or_const(struct tgsi_src_register *src)
{
	return is_relative(src) || is_const(src);
}

static type_t
get_ftype(struct ir3_compile_context *ctx)
{
	return TYPE_F32;
}

static type_t
get_utype(struct ir3_compile_context *ctx)
{
	return TYPE_U32;
}

static type_t
get_stype(struct ir3_compile_context *ctx)
{
	return TYPE_S32;
}

static unsigned
src_swiz(struct tgsi_src_register *src, int chan)
{
	switch (chan) {
	case 0: return src->SwizzleX;
	case 1: return src->SwizzleY;
	case 2: return src->SwizzleZ;
	case 3: return src->SwizzleW;
	}
	assert(0);
	return 0;
}

/* for instructions that cannot take a const register as src, if needed
 * generate a move to a temporary gpr:
 */
static struct tgsi_src_register *
get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	compile_assert(ctx, is_rel_or_const(src));

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	create_mov(ctx, &tmp_dst, src);

	return tmp_src;
}
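
/* Illustrative: trans_cmp()/trans_icmp() below use this when both srcs
 * of a cmps are const/immediate (apparently a single instruction cannot
 * encode two const operands), and the texture paths use it for
 * derivative/offset srcs, since cat5 instructions cannot seem to handle
 * const or relative srcs at all.
 */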

static void
get_immediate(struct ir3_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx = i / 4;
		neg = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	reg->File = TGSI_FILE_IMMEDIATE;
	reg->Indirect = 0;
	reg->Dimension = 0;
	reg->Index = idx;
	reg->Absolute = 0;
	reg->Negate = neg;
	reg->SwizzleX = swiz2tgsi[swiz];
	reg->SwizzleY = swiz2tgsi[swiz];
	reg->SwizzleZ = swiz2tgsi[swiz];
	reg->SwizzleW = swiz2tgsi[swiz];
}

static void
create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	type_t type_mov = get_ftype(ctx);
	unsigned i;

	for (i = 0; i < 4; i++) {
		/* move to destination: */
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *instr;

			if (src->Absolute || src->Negate) {
				/* can't have abs or neg on a mov instr, so use
				 * absneg.f instead to handle these cases:
				 */
				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
			} else {
				instr = instr_create(ctx, 1, 0);
				instr->cat1.src_type = type_mov;
				instr->cat1.dst_type = type_mov;
			}

			add_dst_reg(ctx, instr, dst, i);
			add_src_reg(ctx, instr, src, src_swiz(src, i));
		}
	}
}
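
/* Illustrative: "MOV dst.xz, src" expands to two scalar movs (one for
 * dst.x, one for dst.z); a negated or absolute-value src turns each of
 * those into absneg.f instead, since plain mov cannot encode those
 * modifiers.
 */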

static void
create_clamp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
{
	struct ir3_instruction *instr;

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);

	instr = instr_create(ctx, 2, OPC_MIN_F);
	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
}

static void
create_clamp_imm(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst,
		uint32_t minval, uint32_t maxval)
{
	struct tgsi_src_register minconst, maxconst;
	struct tgsi_src_register src;

	src_from_dst(&src, dst);

	get_immediate(ctx, &minconst, minval);
	get_immediate(ctx, &maxconst, maxval);

	create_clamp(ctx, dst, &src, &minconst, &maxconst);
}

static struct tgsi_dst_register *
get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	unsigned i;

	compile_assert(ctx, !ctx->using_tmp_dst);
	ctx->using_tmp_dst = true;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_src_register *src = &inst->Src[i].Register;
		if ((src->File == dst->File) && (src->Index == dst->Index)) {
			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
					(src->SwizzleX == TGSI_SWIZZLE_X) &&
					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
					(src->SwizzleW == TGSI_SWIZZLE_W))
				continue;
			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
			ctx->tmp_dst.WriteMask = dst->WriteMask;
			dst = &ctx->tmp_dst;
			break;
		}
	}
	return dst;
}

static void
put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
		struct tgsi_dst_register *dst)
{
	compile_assert(ctx, ctx->using_tmp_dst);
	ctx->using_tmp_dst = false;

	/* if necessary, add mov back into original dst: */
	if (dst != &inst->Dst[0].Register) {
		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
	}
}
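
/* Illustrative: for "ADD TEMP[0].xy, TEMP[0].yx, ..." get_dst()
 * redirects the writes into an internal temporary (since .x would
 * otherwise be clobbered before the .y channel reads it), and
 * put_dst() then emits a mov from that temporary back into TEMP[0].
 */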

/* helper to generate the necessary repeat and/or additional instructions
 * to turn a scalar instruction into a vector operation:
 */
static void
vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		struct tgsi_dst_register *dst, int nsrcs, ...)
{
	va_list ap;
	int i, j, n = 0;

	instr_atomic_start(ctx);

	add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);

	va_start(ap, nsrcs);
	for (j = 0; j < nsrcs; j++) {
		struct tgsi_src_register *src =
				va_arg(ap, struct tgsi_src_register *);
		unsigned flags = va_arg(ap, unsigned);
		struct ir3_register *reg;
		if (flags & IR3_REG_IMMED) {
			reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
			/* this is an ugly cast.. should have put flags first! */
			reg->iim_val = *(int *)&src;
		} else {
			reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
		}
		reg->flags |= flags & ~IR3_REG_NEGATE;
		if (flags & IR3_REG_NEGATE)
			reg->flags ^= IR3_REG_NEGATE;
	}
	va_end(ap);

	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *cur;

			if (n++ == 0) {
				cur = instr;
			} else {
				cur = instr_clone(ctx, instr);
			}

			ssa_dst(ctx, cur, dst, i);

			/* fix-up dst register component: */
			cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);

			/* fix-up src register component: */
			va_start(ap, nsrcs);
			for (j = 0; j < nsrcs; j++) {
				struct ir3_register *reg = cur->regs[j+1];
				struct tgsi_src_register *src =
						va_arg(ap, struct tgsi_src_register *);
				unsigned flags = va_arg(ap, unsigned);
				if (reg->flags & IR3_REG_SSA) {
					ssa_src(ctx, reg, src, src_swiz(src, i));
				} else if (!(flags & IR3_REG_IMMED)) {
					reg->num = regid(reg->num >> 2, src_swiz(src, i));
				}
			}
			va_end(ap);
		}
	}

	instr_atomic_end(ctx);
}
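
/* Illustrative: given a scalar add.f and "ADD dst.xyz, a, b",
 * vectorize() emits the original instruction for the .x channel and
 * two clones for .y/.z, fixing up the dst component and src swizzles
 * of each clone, all inside one atomic group.
 */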

/*
 * Handlers for TGSI instructions which do not have a 1:1 mapping to
 * native instructions:
 */

static void
trans_clamp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct tgsi_src_register *src2 = &inst->Src[2].Register;

	create_clamp(ctx, dst, src0, src1, src2);

	put_dst(ctx, inst, dst);
}

/* ARL(x) = x, but mova from hrN.x to a0.. */
static void
trans_arl(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	unsigned chan = src->SwizzleX;

	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);

	/* NOTE: we allocate a temporary from a flat register
	 * namespace (ignoring half vs full).  It turns out
	 * not to really matter since registers get reassigned
	 * later in ir3_ra which (hopefully!) can deal a bit
	 * better with mixed half and full precision.
	 */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* cov.{u,f}{32,16}s16 Rtmp, Rsrc */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
			get_ftype(ctx) : get_utype(ctx);
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, src, chan);

	/* shl.b Rtmp, Rtmp, 2 */
	instr = instr_create(ctx, 2, OPC_SHL_B);
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
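	/* (the shift by 2, ie. multiply by 4, appears to convert the vec4
	 * ADDR index into the flat scalar register namespace, regid =
	 * idx*4 + chan, that relative addressing operates on)
	 */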

	/* mova a0, Rtmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_S16;
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}

/*
 * texture fetch/sample instructions:
 */

struct tex_info {
	int8_t order[4];
	int8_t args;
	unsigned src_wrmask, flags;
};

struct target_info {
	uint8_t dims;
	uint8_t cube;
	uint8_t array;
	uint8_t shadow;
};

static const struct target_info tex_targets[] = {
	[TGSI_TEXTURE_1D] = { 1, 0, 0, 0 },
	[TGSI_TEXTURE_2D] = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_3D] = { 3, 0, 0, 0 },
	[TGSI_TEXTURE_CUBE] = { 3, 1, 0, 0 },
	[TGSI_TEXTURE_RECT] = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_SHADOW1D] = { 1, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOW2D] = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOWRECT] = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_1D_ARRAY] = { 1, 0, 1, 0 },
	[TGSI_TEXTURE_2D_ARRAY] = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_SHADOW1D_ARRAY] = { 1, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOW2D_ARRAY] = { 2, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOWCUBE] = { 3, 1, 0, 1 },
	[TGSI_TEXTURE_2D_MSAA] = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_2D_ARRAY_MSAA] = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_CUBE_ARRAY] = { 3, 1, 1, 0 },
	[TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
};

static void
fill_tex_info(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		struct tex_info *info)
{
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];

	if (tgt->dims == 3)
		info->flags |= IR3_INSTR_3D;
	if (tgt->array)
		info->flags |= IR3_INSTR_A;
	if (tgt->shadow)
		info->flags |= IR3_INSTR_S;

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXF:
		info->args = 2;
		break;
	case TGSI_OPCODE_TXP:
		info->flags |= IR3_INSTR_P;
		/* fallthrough */
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXD:
		info->args = 1;
		break;
	}

	/*
	 * lay out the first argument in the proper order:
	 *  - actual coordinates first
	 *  - array index
	 *  - shadow reference
	 *  - projection w
	 *
	 * bias/lod go into the second arg
	 */
	int arg, pos = 0;
	for (arg = 0; arg < tgt->dims; arg++)
		info->order[arg] = pos++;
	if (tgt->dims == 1)
		info->order[pos++] = -1;
	if (tgt->shadow)
		info->order[pos++] = MAX2(arg + tgt->array, 2);
	if (tgt->array)
		info->order[pos++] = arg++;
	if (info->flags & IR3_INSTR_P)
		info->order[pos++] = 3;

	info->src_wrmask = (1 << pos) - 1;

	for (; pos < 4; pos++)
		info->order[pos] = -1;

	assert(pos <= 4);
}
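
/* Illustrative: for TXP on a plain 2D target this yields order =
 * {0, 1, 3, -1} and src_wrmask = 0x7, ie. the coord fan-in wants
 * src.x, src.y, and then the projection src.w packed into the third
 * component (which is why get_tex_coord() below may need to shuffle
 * things with movs).
 */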

static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
{
	unsigned i;
	for (i = 1; (i < 4) && order[i] >= 0; i++)
		if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
			return false;
	return true;
}

static bool is_1d(unsigned tex)
{
	return tex_targets[tex].dims == 1;
}

static struct tgsi_src_register *
get_tex_coord(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		const struct tex_info *tinf)
{
	struct tgsi_src_register *coord = &inst->Src[0].Register;
	struct ir3_instruction *instr;
	unsigned tex = inst->Texture.Texture;
	bool needs_mov = false;

	/* cat5 instructions cannot seem to handle const or relative: */
	if (is_rel_or_const(coord))
		needs_mov = true;

	/* 1D textures we fix up w/ 0.5 as the 2nd coord: */
	if (is_1d(tex))
		needs_mov = true;

	/* The texture sample instructions need the coord in successive
	 * registers/components (ie. src.xy but not src.yx).  And TXP
	 * needs the .w component in .z for 2D..  so in some cases we
	 * might need to emit some mov instructions to shuffle things
	 * around:
	 */
	if (!needs_mov)
		needs_mov = !check_swiz(coord, tinf->order);

	if (needs_mov) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		unsigned j;

		type_t type_mov = get_ftype(ctx);

		/* need to move things around: */
		tmp_src = get_internal_temp(ctx, &tmp_dst);

		for (j = 0; j < 4; j++) {
			if (tinf->order[j] < 0)
				continue;
			instr = instr_create(ctx, 1, 0); /* mov */
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, j);
			add_src_reg(ctx, instr, coord,
					src_swiz(coord, tinf->order[j]));
		}

		/* fix up .y coord: */
		if (is_1d(tex)) {
			struct ir3_register *imm;
			instr = instr_create(ctx, 1, 0); /* mov */
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */
			imm = ir3_reg_create(instr, 0, IR3_REG_IMMED);
			if (inst->Instruction.Opcode == TGSI_OPCODE_TXF)
				imm->iim_val = 0;
			else
				imm->fim_val = 0.5;
		}

		coord = tmp_src;
	}

	return coord;
}

static void
trans_samp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *collect;
	struct ir3_register *reg;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy;
	struct tgsi_src_register zero;
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
	struct tex_info tinf;
	int i;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	coord = get_tex_coord(ctx, inst, &tinf);
	get_immediate(ctx, &zero, 0);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB2:
		orig = &inst->Src[1].Register;
		samp = &inst->Src[2].Register;
		break;
	case TGSI_OPCODE_TXD:
		orig = &inst->Src[0].Register;
		dpdx = &inst->Src[1].Register;
		dpdy = &inst->Src[2].Register;
		samp = &inst->Src[3].Register;
		if (is_rel_or_const(dpdx))
			dpdx = get_unconst(ctx, dpdx);
		if (is_rel_or_const(dpdy))
			dpdy = get_unconst(ctx, dpdy);
		break;
	default:
		orig = &inst->Src[0].Register;
		samp = &inst->Src[1].Register;
		break;
	}
	if (tinf.args > 1 && is_rel_or_const(orig))
		orig = get_unconst(ctx, orig);

	/* scale up integer coords for TXF based on the LOD */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		type_t type_mov = get_utype(ctx);

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		for (i = 0; i < tgt->dims; i++) {
			instr = instr_create(ctx, 2, OPC_SHL_B);
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
			add_src_reg(ctx, instr, orig, orig->SwizzleW);
		}
		if (tgt->dims < 2) {
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, &zero, 0);
			i++;
		}
		if (tgt->array) {
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
		}
		coord = tmp_src;
	}

	if (inst->Texture.NumOffsets) {
		struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0];
		struct tgsi_src_register offset_src = {0};

		offset_src.File = tex_offset->File;
		offset_src.Index = tex_offset->Index;
		offset_src.SwizzleX = tex_offset->SwizzleX;
		offset_src.SwizzleY = tex_offset->SwizzleY;
		offset_src.SwizzleZ = tex_offset->SwizzleZ;
		offset = get_unconst(ctx, &offset_src);
		tinf.flags |= IR3_INSTR_O;
	}

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);

	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
	ir3_reg_create(collect, 0, 0);
	for (i = 0; i < 4; i++)
		if (tinf.src_wrmask & (1 << i))
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					coord, src_swiz(coord, i));
		else if (tinf.src_wrmask & ~((1 << i) - 1))
			ir3_reg_create(collect, 0, 0);

	/* Attach derivatives onto the end of the fan-in.  Derivatives start
	 * after the 4th argument, so make sure that fi is padded up to 4
	 * first.
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
		while (collect->regs_count < 5)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4;
	}

	reg->instr = collect;
	reg->wrmask = tinf.src_wrmask;

	/* The second argument contains the offsets, followed by the lod/bias
	 * argument.  This is constructed more manually due to the dynamic
	 * nature.
	 */
	if (inst->Texture.NumOffsets == 0 && tinf.args == 1)
		return;

	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
	ir3_reg_create(collect, 0, 0);

	if (inst->Texture.NumOffsets) {
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					offset, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
	}
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleX);
	else if (tinf.args > 1)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleW);

	reg->instr = collect;
	reg->wrmask = (1 << (collect->regs_count - 1)) - 1;
}

static void
trans_txq(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *level = &inst->Src[0].Register;
	struct tgsi_src_register *samp = &inst->Src[1].Register;
	struct tex_info tinf;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	if (is_rel_or_const(level))
		level = get_unconst(ctx, level);

	instr = instr_create(ctx, 5, OPC_GETSIZE);
	instr->cat5.type = get_utype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
	add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
}

/* DDX/DDY */
static void
trans_deriv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	static const int8_t order[4] = {0, 1, 2, 3};

	if (!check_swiz(src, order)) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		create_mov(ctx, &tmp_dst, src);

		src = tmp_src;
	}

	/* This might be a workaround for a hw bug?  The blob compiler
	 * always seems to work two components at a time for dsy/dsx.
	 * It does actually seem to work in some cases (or at least some
	 * piglit tests) for four components at a time.  But it seems
	 * more reliable to split this into two instructions like the
	 * blob compiler does:
	 */

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
}
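
/* Illustrative: "DDX dst.xyzw, src" thus becomes two dsx/dsy
 * instructions, one covering dst.xy and one covering dst.zw, matching
 * the two-components-at-a-time pattern described above.
 */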

/*
 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
 *   cmps.f.eq tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SNE(a,b) = (a != b) ? 1.0 : 0.0
 *   cmps.f.ne tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
 *   cmps.f.ge tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
 *   cmps.f.le tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGT(a,b) = (a > b) ? 1.0 : 0.0
 *   cmps.f.gt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLT(a,b) = (a < b) ? 1.0 : 0.0
 *   cmps.f.lt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * CMP(a,b,c) = (a < 0.0) ? b : c
 *   cmps.f.lt tmp0, a, {0.0}
 *   sel.b16 dst, b, tmp0, c
 */
static void
trans_cmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval0;
	/* final instruction for CMP() uses orig src1 and src2: */
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_SLE:
		condition = IR3_COND_LE;
		break;
	case TGSI_OPCODE_SGT:
		condition = IR3_COND_GT;
		break;
	case TGSI_OPCODE_CMP:
		get_immediate(ctx, &constval0, fui(0.0));
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0.0} */
		condition = IR3_COND_LT;
		break;
	default:
		compile_assert(ctx, 0);
		return;
	}

	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	/* cmps.f.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLT:
		/* cov.u16f16 dst, tmp0 */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_utype(ctx);
		instr->cat1.dst_type = get_ftype(ctx);
		vectorize(ctx, instr, dst, 1, tmp_src, 0);
		break;
	case TGSI_OPCODE_FSEQ:
	case TGSI_OPCODE_FSGE:
	case TGSI_OPCODE_FSNE:
	case TGSI_OPCODE_FSLT:
		/* absneg.s dst, (neg)tmp0 */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
		break;
	case TGSI_OPCODE_CMP:
		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
		break;
	}

	put_dst(ctx, inst, dst);
}

/*
 * USNE(a,b) = (a != b) ? ~0 : 0
 *   cmps.u32.ne dst, a, b
 *
 * USEQ(a,b) = (a == b) ? ~0 : 0
 *   cmps.u32.eq dst, a, b
 *
 * ISGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.s32.ge dst, a, b
 *
 * USGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.u32.ge dst, a, b
 *
 * ISLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.s32.lt dst, a, b
 *
 * USLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.u32.lt dst, a, b
 *
 */
static void
trans_icmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register *a0, *a1;
	unsigned condition;

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_USNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_USEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_ISGE:
	case TGSI_OPCODE_USGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_ISLT:
	case TGSI_OPCODE_USLT:
		condition = IR3_COND_LT;
		break;

	default:
		compile_assert(ctx, 0);
		return;
	}

	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	tmp_src = get_internal_temp(ctx, &tmp_dst);
	/* cmps.{u32,s32}.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, t->opc);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	/* absneg.s dst, (neg)tmp */
	instr = instr_create(ctx, 2, OPC_ABSNEG_S);
	vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);

	put_dst(ctx, inst, dst);
}

/*
 * UCMP(a,b,c) = a ? b : c
 *   sel.b16 dst, b, a, c
 */
static void
trans_ucmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */
	a2 = &inst->Src[2].Register;  /* c */

	if (is_rel_or_const(a0))
		a0 = get_unconst(ctx, a0);

	/* sel.{b32,b16} dst, b, a, c */
	instr = instr_create(ctx, 3, OPC_SEL_B32);
	vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
	put_dst(ctx, inst, dst);
}

/*
 * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0
 *   cmps.s.lt tmp_neg, a, 0  # 1 if a is negative
 *   cmps.s.gt tmp_pos, a, 0  # 1 if a is positive
 *   sub.u dst, tmp_pos, tmp_neg
 */
static void
trans_issg(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_dst_register neg_dst, pos_dst;
	struct tgsi_src_register *neg_src, *pos_src;

	neg_src = get_internal_temp(ctx, &neg_dst);
	pos_src = get_internal_temp(ctx, &pos_dst);

	/* cmps.s.lt neg, a, 0 */
	instr = instr_create(ctx, 2, OPC_CMPS_S);
	instr->cat2.condition = IR3_COND_LT;
	vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED);

	/* cmps.s.gt pos, a, 0 */
	instr = instr_create(ctx, 2, OPC_CMPS_S);
	instr->cat2.condition = IR3_COND_GT;
	vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED);

	/* sub.u dst, pos, neg */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0);

	put_dst(ctx, inst, dst);
}

/*
 * Conditional / Flow control
 */

static void
push_branch(struct ir3_compile_context *ctx, bool inv,
		struct ir3_instruction *instr, struct ir3_instruction *cond)
{
	unsigned int idx = ctx->branch_count++;
	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
	ctx->branch[idx].instr = instr;
	ctx->branch[idx].inv = inv;
	/* else side of branch has same condition: */
	if (!inv)
		ctx->branch[idx].cond = cond;
}

static struct ir3_instruction *
pop_branch(struct ir3_compile_context *ctx)
{
	unsigned int idx = --ctx->branch_count;
	return ctx->branch[idx].instr;
}

static void
trans_if(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *cond;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval;

	get_immediate(ctx, &constval, fui(0.0));
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	if (is_const(src))
		src = get_unconst(ctx, src);

	/* cmps.{f,u}.ne tmp0, b, {0.0} */
	instr = instr_create(ctx, 2, t->opc);
	add_dst_reg(ctx, instr, &tmp_dst, 0);
	add_src_reg(ctx, instr, src, src->SwizzleX);
	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
	instr->cat2.condition = IR3_COND_NE;

	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
	cond = instr->regs[1]->instr;

	/* meta:flow tmp0 */
	instr = instr_create(ctx, -1, OPC_META_FLOW);
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);

	push_branch(ctx, false, instr, cond);
	instr->flow.if_block = push_block(ctx);
}

static void
trans_else(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;

	pop_block(ctx);

	instr = pop_branch(ctx);

	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	push_branch(ctx, true, instr, NULL);
	instr->flow.else_block = push_block(ctx);
}

static struct ir3_instruction *
find_temporary(struct ir3_block *block, unsigned n)
{
	if (block->parent && !block->temporaries[n])
		return find_temporary(block->parent, n);
	return block->temporaries[n];
}

static struct ir3_instruction *
find_output(struct ir3_block *block, unsigned n)
{
	if (block->parent && !block->outputs[n])
		return find_output(block->parent, n);
	return block->outputs[n];
}

static struct ir3_instruction *
create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
		struct ir3_instruction *a, struct ir3_instruction *b)
{
	struct ir3_instruction *phi;

	compile_assert(ctx, cond);

	/* Either side of the condition could be null..  which
	 * indicates a variable written on only one side of the
	 * branch.  Normally this should only be variables not
	 * used outside of that side of the branch.  So we could
	 * just 'return a ? a : b;' in that case.  But for better
	 * defined undefined behavior we just stick in imm{0.0}.
	 * In the common case of a value only used within one
	 * side of the branch, the PHI instruction will not
	 * get scheduled
	 */
	if (!a)
		a = create_immed(ctx, 0.0);
	if (!b)
		b = create_immed(ctx, 0.0);

	phi = instr_create(ctx, -1, OPC_META_PHI);
	ir3_reg_create(phi, 0, 0);  /* dummy dst */
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;

	return phi;
}
1891
1892 static void
1893 trans_endif(const struct instr_translater *t,
1894 struct ir3_compile_context *ctx,
1895 struct tgsi_full_instruction *inst)
1896 {
1897 struct ir3_instruction *instr;
1898 struct ir3_block *ifb, *elseb;
1899 struct ir3_instruction **ifout, **elseout;
1900 unsigned i, ifnout = 0, elsenout = 0;
1901
1902 pop_block(ctx);
1903
1904 instr = pop_branch(ctx);
1905
1906 compile_assert(ctx, (instr->category == -1) &&
1907 (instr->opc == OPC_META_FLOW));
1908
1909 ifb = instr->flow.if_block;
1910 elseb = instr->flow.else_block;
1911 /* if there is no else block, the parent block is used for the
1912 * branch-not-taken src of the PHI instructions:
1913 */
1914 if (!elseb)
1915 elseb = ifb->parent;
1916
1917 /* worst case sizes: */
1918 ifnout = ifb->ntemporaries + ifb->noutputs;
1919 elsenout = elseb->ntemporaries + elseb->noutputs;
1920
1921 ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
1922 if (elseb != ifb->parent)
1923 elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
1924
1925 ifnout = 0;
1926 elsenout = 0;
1927
1928 /* generate PHI instructions for any temporaries written: */
1929 for (i = 0; i < ifb->ntemporaries; i++) {
1930 struct ir3_instruction *a = ifb->temporaries[i];
1931 struct ir3_instruction *b = elseb->temporaries[i];
1932
1933 /* if temporary written in if-block, or if else block
1934 * is present and temporary written in else-block:
1935 */
1936 if (a || ((elseb != ifb->parent) && b)) {
1937 struct ir3_instruction *phi;
1938
1939 /* if only written on one side, find the closest
1940 * enclosing update on other side:
1941 */
1942 if (!a)
1943 a = find_temporary(ifb, i);
1944 if (!b)
1945 b = find_temporary(elseb, i);
1946
1947 ifout[ifnout] = a;
1948 a = create_output(ifb, a, ifnout++);
1949
1950 if (elseb != ifb->parent) {
1951 elseout[elsenout] = b;
1952 b = create_output(elseb, b, elsenout++);
1953 }
1954
1955 phi = create_phi(ctx, instr, a, b);
1956 ctx->block->temporaries[i] = phi;
1957 }
1958 }
1959
1960 compile_assert(ctx, ifb->noutputs == elseb->noutputs);
1961
1962 /* .. and any outputs written: */
1963 for (i = 0; i < ifb->noutputs; i++) {
1964 struct ir3_instruction *a = ifb->outputs[i];
1965 struct ir3_instruction *b = elseb->outputs[i];
1966
1967 /* if output written in if-block, or if else block
1968 * is present and output written in else-block:
1969 */
1970 if (a || ((elseb != ifb->parent) && b)) {
1971 struct ir3_instruction *phi;
1972
1973 /* if only written on one side, find the closest
1974 * enclosing update on other side:
1975 */
1976 if (!a)
1977 a = find_output(ifb, i);
1978 if (!b)
1979 b = find_output(elseb, i);
1980
1981 ifout[ifnout] = a;
1982 a = create_output(ifb, a, ifnout++);
1983
1984 if (elseb != ifb->parent) {
1985 elseout[elsenout] = b;
1986 b = create_output(elseb, b, elsenout++);
1987 }
1988
1989 phi = create_phi(ctx, instr, a, b);
1990 ctx->block->outputs[i] = phi;
1991 }
1992 }
1993
1994 ifb->noutputs = ifnout;
1995 ifb->outputs = ifout;
1996
1997 if (elseb != ifb->parent) {
1998 elseb->noutputs = elsenout;
1999 elseb->outputs = elseout;
2000 }
2001
2002 // TODO maybe we want to compact block->inputs?
2003 }
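
/* Worked example (illustrative): for
 *
 *    IF c THEN t = a; ELSE t = b; ENDIF
 *
 * 'a' ends up as an output of the if-block and 'b' as an output of
 * the else-block, and the loops above stitch them back together in
 * the parent block as
 *
 *    t = meta:phi(flow, a, b)
 *
 * which later flattening can turn into a sel.b32 once both sides are
 * merged into straight-line code.
 */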
2004
2005 /*
2006 * Kill
2007 */
2008
2009 static void
2010 trans_kill(const struct instr_translater *t,
2011 struct ir3_compile_context *ctx,
2012 struct tgsi_full_instruction *inst)
2013 {
2014 struct ir3_instruction *instr, *immed, *cond = NULL;
2015 bool inv = false;
2016
2017 switch (t->tgsi_opc) {
2018 case TGSI_OPCODE_KILL:
2019 /* unconditional kill, use enclosing if condition: */
2020 if (ctx->branch_count > 0) {
2021 unsigned int idx = ctx->branch_count - 1;
2022 cond = ctx->branch[idx].cond;
2023 inv = ctx->branch[idx].inv;
2024 } else {
2025 cond = create_immed(ctx, 1.0);
2026 }
2027
2028 break;
2029 }
2030
2031 compile_assert(ctx, cond);
2032
2033 immed = create_immed(ctx, 0.0);
2034
2035 /* cmps.f.ne p0.x, cond, {0.0} */
2036 instr = instr_create(ctx, 2, OPC_CMPS_F);
2037 instr->cat2.condition = IR3_COND_NE;
2038 ir3_reg_create(instr, regid(REG_P0, 0), 0);
2039 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2040 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
2041 cond = instr;
2042
2043 /* kill p0.x */
2044 instr = instr_create(ctx, 0, OPC_KILL);
2045 instr->cat0.inv = inv;
2046 ir3_reg_create(instr, 0, 0); /* dummy dst */
2047 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2048
2049 ctx->kill[ctx->kill_count++] = instr;
2050
2051 ctx->so->has_kill = true;
2052 }
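
/* Worked example (illustrative): "IF c ... KILL ... ENDIF" ends up as
 * roughly:
 *
 *    cmps.f.ne p0.x, cond, {0.0}    ; re-test the enclosing IF's cond
 *    kill p0.x
 *
 * i.e. instead of real control flow, the unconditional TGSI KILL is
 * predicated on the innermost enclosing branch condition (with
 * cat0.inv set when it sits on the else side).
 */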
2053
2054 /*
2055 * Kill-If
2056 */
2057
2058 static void
2059 trans_killif(const struct instr_translater *t,
2060 struct ir3_compile_context *ctx,
2061 struct tgsi_full_instruction *inst)
2062 {
2063 struct tgsi_src_register *src = &inst->Src[0].Register;
2064 struct ir3_instruction *instr, *immed, *cond = NULL;
2065 bool inv = false;
2066
2067 immed = create_immed(ctx, 0.0);
2068
2069 /* cmps.f.ne p0.x, cond, {0.0} */
2070 instr = instr_create(ctx, 2, OPC_CMPS_F);
2071 instr->cat2.condition = IR3_COND_NE;
2072 ir3_reg_create(instr, regid(REG_P0, 0), 0);
2073 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
2074 add_src_reg(ctx, instr, src, src->SwizzleX);
2075
2076 cond = instr;
2077
2078 /* kill p0.x */
2079 instr = instr_create(ctx, 0, OPC_KILL);
2080 instr->cat0.inv = inv;
2081 ir3_reg_create(instr, 0, 0); /* dummy dst */
2082 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2083
2084 ctx->kill[ctx->kill_count++] = instr;
2085
2086 ctx->so->has_kill = true;
2087 }
2088
2089 /*
2090 * I2F / U2F / F2I / F2U
2091 */
2092
2093 static void
2094 trans_cov(const struct instr_translater *t,
2095 struct ir3_compile_context *ctx,
2096 struct tgsi_full_instruction *inst)
2097 {
2098 struct ir3_instruction *instr;
2099 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2100 struct tgsi_src_register *src = &inst->Src[0].Register;
2101
2102 /* cov.f32s32 dst, tmp0 */
2103 instr = instr_create(ctx, 1, 0);
2104 switch (t->tgsi_opc) {
2105 case TGSI_OPCODE_U2F:
2106 instr->cat1.src_type = TYPE_U32;
2107 instr->cat1.dst_type = TYPE_F32;
2108 break;
2109 case TGSI_OPCODE_I2F:
2110 instr->cat1.src_type = TYPE_S32;
2111 instr->cat1.dst_type = TYPE_F32;
2112 break;
2113 case TGSI_OPCODE_F2U:
2114 instr->cat1.src_type = TYPE_F32;
2115 instr->cat1.dst_type = TYPE_U32;
2116 break;
2117 case TGSI_OPCODE_F2I:
2118 instr->cat1.src_type = TYPE_F32;
2119 instr->cat1.dst_type = TYPE_S32;
2120 break;
2122 }
2123 vectorize(ctx, instr, dst, 1, src, 0);
2124 put_dst(ctx, inst, dst);
2125 }
2126
2127 /*
2128 * UMUL / UMAD
2129 *
2130 * There is no 32-bit multiply instruction, so we split a and b into
2131 * low and high 16-bit halves and compute
2132 *
2133 * dst = (al * bl) + ((ah * bl) << 16) + ((al * bh) << 16)
2134 *
2135 * mull.u tmp0, a, b (mul low, i.e. al * bl)
2136 * madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
2137 * madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
2138 *
2139 * For UMAD, replace first mull.u with mad.u16.
2140 */
2141 static void
2142 trans_umul(const struct instr_translater *t,
2143 struct ir3_compile_context *ctx,
2144 struct tgsi_full_instruction *inst)
2145 {
2146 struct ir3_instruction *instr;
2147 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2148 struct tgsi_src_register *a = &inst->Src[0].Register;
2149 struct tgsi_src_register *b = &inst->Src[1].Register;
2150
2151 struct tgsi_dst_register tmp0_dst, tmp1_dst;
2152 struct tgsi_src_register *tmp0_src, *tmp1_src;
2153
2154 tmp0_src = get_internal_temp(ctx, &tmp0_dst);
2155 tmp1_src = get_internal_temp(ctx, &tmp1_dst);
2156
2157 if (is_rel_or_const(a))
2158 a = get_unconst(ctx, a);
2159 if (is_rel_or_const(b))
2160 b = get_unconst(ctx, b);
2161
2162 if (t->tgsi_opc == TGSI_OPCODE_UMUL) {
2163 /* mull.u tmp0, a, b */
2164 instr = instr_create(ctx, 2, OPC_MULL_U);
2165 vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);
2166 } else {
2167 struct tgsi_src_register *c = &inst->Src[2].Register;
2168
2169 /* mad.u16 tmp0, a, b, c */
2170 instr = instr_create(ctx, 3, OPC_MAD_U16);
2171 vectorize(ctx, instr, &tmp0_dst, 3, a, 0, b, 0, c, 0);
2172 }
2173
2174 /* madsh.m16 tmp1, a, b, tmp0 */
2175 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2176 vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);
2177
2178 /* madsh.m16 dst, b, a, tmp1 */
2179 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2180 vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
2181 put_dst(ctx, inst, dst);
2182 }
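
/* Host-side sketch of the identity used above (illustrative only, the
 * helper is not part of the compiler; assumes 32-bit unsigned). The
 * ah*bh cross term lands entirely above bit 31, so three 16x16
 * multiplies suffice mod 2^32:
 */
static int
check_umul_decomposition(unsigned a, unsigned b)
{
	unsigned al = a & 0xffff, ah = a >> 16;
	unsigned bl = b & 0xffff, bh = b >> 16;
	unsigned r;

	r  = al * bl;              /* mull.u    tmp0, a, b */
	r += (ah * bl) << 16;      /* madsh.m16 tmp1, a, b, tmp0 */
	r += (al * bh) << 16;      /* madsh.m16 dst,  b, a, tmp1 */

	return r == a * b;         /* ah*bh << 32 vanishes mod 2^32 */
}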
2183
2184 /*
2185 * IDIV / UDIV / MOD / UMOD
2186 *
2187 * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
2188 * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
2189 */
2190 static void
2191 trans_idiv(const struct instr_translater *t,
2192 struct ir3_compile_context *ctx,
2193 struct tgsi_full_instruction *inst)
2194 {
2195 struct ir3_instruction *instr;
2196 struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
2197 struct tgsi_src_register *a = &inst->Src[0].Register;
2198 struct tgsi_src_register *b = &inst->Src[1].Register;
2199
2200 struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
2201 struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;
2202
2203 struct tgsi_src_register negative_2, thirty_one;
2204 type_t src_type;
2205
2206 if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
2207 src_type = get_stype(ctx);
2208 else
2209 src_type = get_utype(ctx);
2210
2211 af_src = get_internal_temp(ctx, &af_dst);
2212 bf_src = get_internal_temp(ctx, &bf_dst);
2213 q_src = get_internal_temp(ctx, &q_dst);
2214 r_src = get_internal_temp(ctx, &r_dst);
2215 a_src = get_internal_temp(ctx, &a_dst);
2216 b_src = get_internal_temp(ctx, &b_dst);
2217
2218 get_immediate(ctx, &negative_2, -2);
2219 get_immediate(ctx, &thirty_one, 31);
2220
2221 if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
2222 premod_dst = &q_dst;
2223
2224 /* cov.[us]32f32 af, numerator */
2225 instr = instr_create(ctx, 1, 0);
2226 instr->cat1.src_type = src_type;
2227 instr->cat1.dst_type = get_ftype(ctx);
2228 vectorize(ctx, instr, &af_dst, 1, a, 0);
2229
2230 /* cov.[us]32f32 bf, denominator */
2231 instr = instr_create(ctx, 1, 0);
2232 instr->cat1.src_type = src_type;
2233 instr->cat1.dst_type = get_ftype(ctx);
2234 vectorize(ctx, instr, &bf_dst, 1, b, 0);
2235
2236 /* Get the absolute values for IDIV */
2237 if (type_sint(src_type)) {
2238 /* absneg.f af, (abs)af */
2239 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
2240 vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_ABS);
2241
2242 /* absneg.f bf, (abs)bf */
2243 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
2244 vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_ABS);
2245
2246 /* absneg.s a, (abs)numerator */
2247 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2248 vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_ABS);
2249
2250 /* absneg.s b, (abs)denominator */
2251 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2252 vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_ABS);
2253 } else {
2254 /* mov.u32u32 a, numerator */
2255 instr = instr_create(ctx, 1, 0);
2256 instr->cat1.src_type = src_type;
2257 instr->cat1.dst_type = src_type;
2258 vectorize(ctx, instr, &a_dst, 1, a, 0);
2259
2260 /* mov.u32u32 b, denominator */
2261 instr = instr_create(ctx, 1, 0);
2262 instr->cat1.src_type = src_type;
2263 instr->cat1.dst_type = src_type;
2264 vectorize(ctx, instr, &b_dst, 1, b, 0);
2265 }
2266
2267 /* rcp.f bf, bf */
2268 instr = instr_create(ctx, 4, OPC_RCP);
2269 vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);
2270
2271 /* That's right, subtract 2 as an integer from the float */
2272 /* add.u bf, bf, -2 */
2273 instr = instr_create(ctx, 2, OPC_ADD_U);
2274 vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);
2275
2276 /* mul.f q, af, bf */
2277 instr = instr_create(ctx, 2, OPC_MUL_F);
2278 vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);
2279
2280 /* cov.f32[us]32 q, q */
2281 instr = instr_create(ctx, 1, 0);
2282 instr->cat1.src_type = get_ftype(ctx);
2283 instr->cat1.dst_type = src_type;
2284 vectorize(ctx, instr, &q_dst, 1, q_src, 0);
2285
2286 /* integer multiply q by b */
2287 /* mull.u r, q, b */
2288 instr = instr_create(ctx, 2, OPC_MULL_U);
2289 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
2290
2291 /* madsh.m16 r, q, b, r */
2292 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2293 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
2294
2295 /* madsh.m16 r, b, q, r */
2296 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2297 vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
2298
2299 /* sub.u r, a, r */
2300 instr = instr_create(ctx, 2, OPC_SUB_U);
2301 vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
2302
2303 /* cov.u32f32, r, r */
2304 instr = instr_create(ctx, 1, 0);
2305 instr->cat1.src_type = get_utype(ctx);
2306 instr->cat1.dst_type = get_ftype(ctx);
2307 vectorize(ctx, instr, &r_dst, 1, r_src, 0);
2308
2309 /* mul.f r, r, bf */
2310 instr = instr_create(ctx, 2, OPC_MUL_F);
2311 vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);
2312
2313 /* cov.f32u32 r, r */
2314 instr = instr_create(ctx, 1, 0);
2315 instr->cat1.src_type = get_ftype(ctx);
2316 instr->cat1.dst_type = get_utype(ctx);
2317 vectorize(ctx, instr, &r_dst, 1, r_src, 0);
2318
2319 /* add.u q, q, r */
2320 instr = instr_create(ctx, 2, OPC_ADD_U);
2321 vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
2322
2323 /* mull.u r, q, b */
2324 instr = instr_create(ctx, 2, OPC_MULL_U);
2325 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
2326
2327 /* madsh.m16 r, q, b, r */
2328 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2329 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
2330
2331 /* madsh.m16 r, b, q, r */
2332 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2333 vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
2334
2335 /* sub.u r, a, r */
2336 instr = instr_create(ctx, 2, OPC_SUB_U);
2337 vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
2338
2339 /* cmps.u.ge r, r, b */
2340 instr = instr_create(ctx, 2, OPC_CMPS_U);
2341 instr->cat2.condition = IR3_COND_GE;
2342 vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);
2343
2344 if (type_uint(src_type)) {
2345 /* add.u dst, q, r */
2346 instr = instr_create(ctx, 2, OPC_ADD_U);
2347 vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
2348 } else {
2349 /* add.u q, q, r */
2350 instr = instr_create(ctx, 2, OPC_ADD_U);
2351 vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
2352
2353 /* negate result based on the original arguments */
2354 if (is_const(a) && is_const(b))
2355 a = get_unconst(ctx, a);
2356
2357 /* xor.b r, numerator, denominator */
2358 instr = instr_create(ctx, 2, OPC_XOR_B);
2359 vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);
2360
2361 /* shr.b r, r, 31 */
2362 instr = instr_create(ctx, 2, OPC_SHR_B);
2363 vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);
2364
2365 /* absneg.s b, (neg)q */
2366 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2367 vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_NEGATE);
2368
2369 /* sel.b dst, b, r, q */
2370 instr = instr_create(ctx, 3, OPC_SEL_B32);
2371 vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
2372 }
2373
2374 if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
2375 /* The division result will have ended up in q. */
2376
2377 /* mull.u r, q, b */
2378 instr = instr_create(ctx, 2, OPC_MULL_U);
2379 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);
2380
2381 /* madsh.m16 r, q, b, r */
2382 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2383 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);
2384
2385 /* madsh.m16 r, b, q, r */
2386 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2387 vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);
2388
2389 /* sub.u dst, a, r */
2390 instr = instr_create(ctx, 2, OPC_SUB_U);
2391 vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
2392 }
2393
2394 put_dst(ctx, inst, dst);
2395 }
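
/* Host-side model of the unsigned path above (illustrative only; the
 * helper is not part of the compiler, C float stands in for the hw's
 * 32-bit float ops, and 32-bit unsigned is assumed). The reciprocal
 * is biased low by 2 ulps so the initial estimate errs low, then two
 * remainder-based corrections fix up the quotient. The signed path
 * runs the same core on |a| and |b| and patches the sign from
 * (a ^ b) >> 31:
 */
static unsigned
udiv_model(unsigned a, unsigned b)
{
	union { float f; unsigned u; } bf = { .f = 1.0f / (float)b };
	unsigned q, r;

	bf.u -= 2;                         /* add.u bf, bf, -2 */
	q = (unsigned)((float)a * bf.f);   /* low quotient estimate */
	r = a - q * b;                     /* mull.u/madsh.m16/sub.u */
	q += (unsigned)((float)r * bf.f);  /* first correction */
	r = a - q * b;
	if (r >= b)                        /* cmps.u.ge + add.u */
		q++;
	return q;                          /* UMOD: a - udiv_model(a, b) * b */
}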
2396
2397 /*
2398 * Handlers for TGSI instructions which do have 1:1 mapping to native
2399 * instructions:
2400 */
2401
2402 static void
2403 instr_cat0(const struct instr_translater *t,
2404 struct ir3_compile_context *ctx,
2405 struct tgsi_full_instruction *inst)
2406 {
2407 instr_create(ctx, 0, t->opc);
2408 }
2409
2410 static void
2411 instr_cat1(const struct instr_translater *t,
2412 struct ir3_compile_context *ctx,
2413 struct tgsi_full_instruction *inst)
2414 {
2415 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2416 struct tgsi_src_register *src = &inst->Src[0].Register;
2417 create_mov(ctx, dst, src);
2418 put_dst(ctx, inst, dst);
2419 }
2420
2421 static void
2422 instr_cat2(const struct instr_translater *t,
2423 struct ir3_compile_context *ctx,
2424 struct tgsi_full_instruction *inst)
2425 {
2426 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2427 struct tgsi_src_register *src0 = &inst->Src[0].Register;
2428 struct tgsi_src_register *src1 = &inst->Src[1].Register;
2429 struct ir3_instruction *instr;
2430 unsigned src0_flags = 0, src1_flags = 0;
2431
2432 switch (t->tgsi_opc) {
2433 case TGSI_OPCODE_ABS:
2434 case TGSI_OPCODE_IABS:
2435 src0_flags = IR3_REG_ABS;
2436 break;
2437 case TGSI_OPCODE_INEG:
2438 src0_flags = IR3_REG_NEGATE;
2439 break;
2440 case TGSI_OPCODE_SUB:
2441 src1_flags = IR3_REG_NEGATE;
2442 break;
2443 }
2444
2445 switch (t->opc) {
2446 case OPC_ABSNEG_F:
2447 case OPC_ABSNEG_S:
2448 case OPC_CLZ_B:
2449 case OPC_CLZ_S:
2450 case OPC_SIGN_F:
2451 case OPC_FLOOR_F:
2452 case OPC_CEIL_F:
2453 case OPC_RNDNE_F:
2454 case OPC_RNDAZ_F:
2455 case OPC_TRUNC_F:
2456 case OPC_NOT_B:
2457 case OPC_BFREV_B:
2458 case OPC_SETRM:
2459 case OPC_CBITS_B:
2460 /* these only have one src reg */
2461 instr = instr_create(ctx, 2, t->opc);
2462 vectorize(ctx, instr, dst, 1, src0, src0_flags);
2463 break;
2464 default:
2465 if (is_const(src0) && is_const(src1))
2466 src0 = get_unconst(ctx, src0);
2467
2468 instr = instr_create(ctx, 2, t->opc);
2469 vectorize(ctx, instr, dst, 2, src0, src0_flags,
2470 src1, src1_flags);
2471 break;
2472 }
2473
2474 put_dst(ctx, inst, dst);
2475 }
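
/* Worked example (illustrative): since there is no native subtract,
 * the src flags set above turn "SUB dst, a, b" into
 *
 *    add.f dst, a, (neg)b
 *
 * and similarly ABS/IABS become absneg.{f,s} with an (abs)'d source.
 */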
2476
2477 static void
2478 instr_cat3(const struct instr_translater *t,
2479 struct ir3_compile_context *ctx,
2480 struct tgsi_full_instruction *inst)
2481 {
2482 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2483 struct tgsi_src_register *src0 = &inst->Src[0].Register;
2484 struct tgsi_src_register *src1 = &inst->Src[1].Register;
2485 struct ir3_instruction *instr;
2486
2487 /* in particular, can't handle const for src1 for cat3..
2488 * for mad, we can swap first two src's if needed:
2489 */
2490 if (is_rel_or_const(src1)) {
2491 if (is_mad(t->opc) && !is_rel_or_const(src0)) {
2492 struct tgsi_src_register *tmp;
2493 tmp = src0;
2494 src0 = src1;
2495 src1 = tmp;
2496 } else {
2497 src1 = get_unconst(ctx, src1);
2498 }
2499 }
2500
2501 instr = instr_create(ctx, 3, t->opc);
2502 vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
2503 &inst->Src[2].Register, 0);
2504 put_dst(ctx, inst, dst);
2505 }
2506
2507 static void
2508 instr_cat4(const struct instr_translater *t,
2509 struct ir3_compile_context *ctx,
2510 struct tgsi_full_instruction *inst)
2511 {
2512 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2513 struct tgsi_src_register *src = &inst->Src[0].Register;
2514 struct ir3_instruction *instr;
2515 unsigned i;
2516
2517 /* seems like blob compiler avoids const as src.. */
2518 if (is_const(src))
2519 src = get_unconst(ctx, src);
2520
2521 /* we need to replicate into each component: */
2522 for (i = 0; i < 4; i++) {
2523 if (dst->WriteMask & (1 << i)) {
2524 instr = instr_create(ctx, 4, t->opc);
2525 add_dst_reg(ctx, instr, dst, i);
2526 add_src_reg(ctx, instr, src, src->SwizzleX);
2527 }
2528 }
2529
2530 put_dst(ctx, inst, dst);
2531 }
2532
2533 static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
2534 #define INSTR(n, f, ...) \
2535 [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
2536
2537 INSTR(MOV, instr_cat1),
2538 INSTR(RCP, instr_cat4, .opc = OPC_RCP),
2539 INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
2540 INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
2541 INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
2542 INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
2543 INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
2544 INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
2545 INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
2546 INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
2547 INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
2548 INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
2549 INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
2550 INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
2551 INSTR(AND, instr_cat2, .opc = OPC_AND_B),
2552 INSTR(OR, instr_cat2, .opc = OPC_OR_B),
2553 INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
2554 INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
2555 INSTR(UMUL, trans_umul),
2556 INSTR(UMAD, trans_umul),
2557 INSTR(UDIV, trans_idiv),
2558 INSTR(IDIV, trans_idiv),
2559 INSTR(MOD, trans_idiv),
2560 INSTR(UMOD, trans_idiv),
2561 INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
2562 INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
2563 INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
2564 INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
2565 INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
2567 INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
2568 INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
2569 INSTR(CLAMP, trans_clamp),
2570 INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
2571 INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
2572 INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
2573 INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F),
2574 INSTR(ARL, trans_arl),
2575 INSTR(UARL, trans_arl),
2576 INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
2577 INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
2578 INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
2579 INSTR(COS, instr_cat4, .opc = OPC_COS),
2580 INSTR(SIN, instr_cat4, .opc = OPC_SIN),
2581 INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
2582 INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
2583 INSTR(TXB, trans_samp, .opc = OPC_SAMB, .arg = TGSI_OPCODE_TXB),
2584 INSTR(TXB2, trans_samp, .opc = OPC_SAMB, .arg = TGSI_OPCODE_TXB2),
2585 INSTR(TXL, trans_samp, .opc = OPC_SAML, .arg = TGSI_OPCODE_TXL),
2586 INSTR(TXD, trans_samp, .opc = OPC_SAMGQ, .arg = TGSI_OPCODE_TXD),
2587 INSTR(TXF, trans_samp, .opc = OPC_ISAML, .arg = TGSI_OPCODE_TXF),
2588 INSTR(TXQ, trans_txq),
2589 INSTR(DDX, trans_deriv, .opc = OPC_DSX),
2590 INSTR(DDY, trans_deriv, .opc = OPC_DSY),
2591 INSTR(SGT, trans_cmp),
2592 INSTR(SLT, trans_cmp),
2593 INSTR(FSLT, trans_cmp),
2594 INSTR(SGE, trans_cmp),
2595 INSTR(FSGE, trans_cmp),
2596 INSTR(SLE, trans_cmp),
2597 INSTR(SNE, trans_cmp),
2598 INSTR(FSNE, trans_cmp),
2599 INSTR(SEQ, trans_cmp),
2600 INSTR(FSEQ, trans_cmp),
2601 INSTR(CMP, trans_cmp),
2602 INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
2603 INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
2604 INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
2605 INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
2606 INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
2607 INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
2608 INSTR(UCMP, trans_ucmp),
2609 INSTR(ISSG, trans_issg),
2610 INSTR(IF, trans_if, .opc = OPC_CMPS_F),
2611 INSTR(UIF, trans_if, .opc = OPC_CMPS_U),
2612 INSTR(ELSE, trans_else),
2613 INSTR(ENDIF, trans_endif),
2614 INSTR(END, instr_cat0, .opc = OPC_END),
2615 INSTR(KILL, trans_kill, .opc = OPC_KILL),
2616 INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
2617 INSTR(I2F, trans_cov),
2618 INSTR(U2F, trans_cov),
2619 INSTR(F2I, trans_cov),
2620 INSTR(F2U, trans_cov),
2621 };
2622
2623 static ir3_semantic
2624 decl_semantic(const struct tgsi_declaration_semantic *sem)
2625 {
2626 return ir3_semantic_name(sem->Name, sem->Index);
2627 }
2628
2629 static struct ir3_instruction *
2630 decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
2631 unsigned j, unsigned inloc)
2632 {
2633 struct ir3_instruction *instr;
2634 struct ir3_register *src;
2635
2636 /* bary.f dst, #inloc, r0.x */
2637 instr = instr_create(ctx, 2, OPC_BARY_F);
2638 ir3_reg_create(instr, regid, 0); /* dummy dst */
2639 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
2640 src = ir3_reg_create(instr, 0, IR3_REG_SSA);
2641 src->wrmask = 0x3;
2642 src->instr = ctx->frag_pos;
2643
2644 return instr;
2645 }
2646
2647 /* TGSI_SEMANTIC_POSITION
2648 * """"""""""""""""""""""
2649 *
2650 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
2651 * fragment shader input contains the fragment's window position. The X
2652 * component starts at zero and always increases from left to right.
2653 * The Y component starts at zero and always increases but Y=0 may either
2654 * indicate the top of the window or the bottom depending on the fragment
2655 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
2656 * The Z coordinate ranges from 0 to 1 to represent depth from the front
2657 * to the back of the Z buffer. The W component contains the reciprocal
2658 * of the interpolated vertex position W component.
2659 */
2660 static struct ir3_instruction *
2661 decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
2662 unsigned j)
2663 {
2664 struct ir3_instruction *instr, *src;
2665
2666 compile_assert(ctx, !ctx->frag_coord[j]);
2667
2668 ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
2669
2671 switch (j) {
2672 case 0: /* .x */
2673 case 1: /* .y */
2674 /* for frag_coord, we get unsigned values.. we need
2675 * to subtract (integer) 8 and divide by 16 (right-
2676 * shift by 4) then convert to float:
2677 */
2678
2679 /* add.s tmp, src, -8 */
2680 instr = instr_create(ctx, 2, OPC_ADD_S);
2681 ir3_reg_create(instr, regid, 0); /* dummy dst */
2682 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
2683 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
2684 src = instr;
2685
2686 /* shr.b tmp, tmp, 4 */
2687 instr = instr_create(ctx, 2, OPC_SHR_B);
2688 ir3_reg_create(instr, regid, 0); /* dummy dst */
2689 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2690 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
2691 src = instr;
2692
2693 /* mov.u32f32 dst, tmp */
2694 instr = instr_create(ctx, 1, 0);
2695 instr->cat1.src_type = TYPE_U32;
2696 instr->cat1.dst_type = TYPE_F32;
2697 ir3_reg_create(instr, regid, 0); /* dummy dst */
2698 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2699
2700 break;
2701 case 2: /* .z */
2702 case 3: /* .w */
2703 /* seems that we can use these as-is: */
2704 instr = ctx->frag_coord[j];
2705 break;
2706 default:
2707 compile_error(ctx, "invalid channel\n");
2708 instr = create_immed(ctx, 0.0);
2709 break;
2710 }
2711
2712 return instr;
2713 }
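
/* Host-side model of the x/y path above (illustrative; the exact
 * fixed-point format is an assumption inferred from the -8 / >>4
 * sequence, i.e. a .4 fixed-point coordinate with the half-pixel
 * bias of 8/16 at pixel centers):
 */
static float
frag_coord_model(unsigned raw)
{
	return (float)((raw - 8) >> 4);    /* 16*n + 8  ->  (float)n */
}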
2714
2715 /* TGSI_SEMANTIC_FACE
2716 * """"""""""""""""""
2717 *
2718 * This label applies to fragment shader inputs only and indicates that
2719 * the register contains front/back-face information of the form (F, 0,
2720 * 0, 1). The first component will be positive when the fragment belongs
2721 * to a front-facing polygon, and negative when the fragment belongs to a
2722 * back-facing polygon.
2723 */
2724 static struct ir3_instruction *
2725 decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
2726 unsigned j)
2727 {
2728 struct ir3_instruction *instr, *src;
2729
2730 switch (j) {
2731 case 0: /* .x */
2732 compile_assert(ctx, !ctx->frag_face);
2733
2734 ctx->frag_face = create_input(ctx->block, NULL, 0);
2735
2736 /* for faceness, we always get -1 or 0 (int).. but TGSI expects
2737 * positive vs negative float.. and piglit further seems to
2738 * expect -1.0 or 1.0:
2739 *
2740 * mul.s tmp, hr0.x, 2
2741 * add.s tmp, tmp, 1
2742 * mov.s32f32 dst, tmp
2743 *
2744 */
2745
2746 instr = instr_create(ctx, 2, OPC_MUL_S);
2747 ir3_reg_create(instr, regid, 0); /* dummy dst */
2748 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
2749 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
2750 src = instr;
2751
2752 instr = instr_create(ctx, 2, OPC_ADD_S);
2753 ir3_reg_create(instr, regid, 0); /* dummy dst */
2754 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2755 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
2756 src = instr;
2757
2758 instr = instr_create(ctx, 1, 0); /* mov */
2759 instr->cat1.src_type = TYPE_S32;
2760 instr->cat1.dst_type = TYPE_F32;
2761 ir3_reg_create(instr, regid, 0); /* dummy dst */
2762 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2763
2764 break;
2765 case 1: /* .y */
2766 case 2: /* .z */
2767 instr = create_immed(ctx, 0.0);
2768 break;
2769 case 3: /* .w */
2770 instr = create_immed(ctx, 1.0);
2771 break;
2772 default:
2773 compile_error(ctx, "invalid channel\n");
2774 instr = create_immed(ctx, 0.0);
2775 break;
2776 }
2777
2778 return instr;
2779 }
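
/* Host-side model of the faceness conversion above (illustrative; the
 * helper is not part of the compiler): the hw's -1 (back) / 0 (front)
 * becomes the negative/positive float TGSI wants, and the exact -1.0
 * or +1.0 piglit expects:
 */
static float
frag_face_model(int hw_face)
{
	return (float)(hw_face * 2 + 1);   /* -1 -> -1.0f, 0 -> +1.0f */
}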
2780
2781 static void
2782 decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
2783 {
2784 struct ir3_shader_variant *so = ctx->so;
2785 unsigned name = decl->Semantic.Name;
2786 unsigned i;
2787
2788 /* I don't think we should get frag shader input without
2789 * semantic info? Otherwise how do inputs get linked to
2790 * vert outputs?
2791 */
2792 compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
2793 decl->Declaration.Semantic);
2794
2795 for (i = decl->Range.First; i <= decl->Range.Last; i++) {
2796 unsigned n = so->inputs_count++;
2797 unsigned r = regid(i, 0);
2798 unsigned ncomp, j;
2799
2800 /* we'll figure out the actual components used after scheduling */
2801 ncomp = 4;
2802
2803 DBG("decl in -> r%d", i);
2804
2805 compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
2806
2807 so->inputs[n].semantic = decl_semantic(&decl->Semantic);
2808 so->inputs[n].compmask = (1 << ncomp) - 1;
2809 so->inputs[n].regid = r;
2810 so->inputs[n].inloc = ctx->next_inloc;
2811 so->inputs[n].interpolate = decl->Interp.Interpolate;
2812
2813 for (j = 0; j < ncomp; j++) {
2814 struct ir3_instruction *instr = NULL;
2815
2816 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
2817 /* for fragment shaders, POSITION and FACE are handled
2818 * specially, not using normal varying / bary.f
2819 */
2820 if (name == TGSI_SEMANTIC_POSITION) {
2821 so->inputs[n].bary = false;
2822 so->frag_coord = true;
2823 instr = decl_in_frag_coord(ctx, r + j, j);
2824 } else if (name == TGSI_SEMANTIC_FACE) {
2825 so->inputs[n].bary = false;
2826 so->frag_face = true;
2827 instr = decl_in_frag_face(ctx, r + j, j);
2828 } else {
2829 so->inputs[n].bary = true;
2830 instr = decl_in_frag_bary(ctx, r + j, j,
2831 so->inputs[n].inloc + j - 8);
2832 }
2833 } else {
2834 instr = create_input(ctx->block, NULL, (i * 4) + j);
2835 }
2836
2837 ctx->block->inputs[(i * 4) + j] = instr;
2838 }
2839
2840 if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
2841 ctx->next_inloc += ncomp;
2842 so->total_in += ncomp;
2843 }
2844 }
2845 }
2846
2847 static void
2848 decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
2849 {
2850 struct ir3_shader_variant *so = ctx->so;
2851 unsigned comp = 0;
2852 unsigned name = decl->Semantic.Name;
2853 unsigned i;
2854
2855 compile_assert(ctx, decl->Declaration.Semantic);
2856
2857 DBG("decl out[%d] -> r%d", name, decl->Range.First);
2858
2859 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
2860 switch (name) {
2861 case TGSI_SEMANTIC_POSITION:
2862 so->writes_pos = true;
2863 break;
2864 case TGSI_SEMANTIC_PSIZE:
2865 so->writes_psize = true;
2866 break;
2867 case TGSI_SEMANTIC_COLOR:
2868 case TGSI_SEMANTIC_BCOLOR:
2869 case TGSI_SEMANTIC_GENERIC:
2870 case TGSI_SEMANTIC_FOG:
2871 case TGSI_SEMANTIC_TEXCOORD:
2872 break;
2873 default:
2874 compile_error(ctx, "unknown VS semantic name: %s\n",
2875 tgsi_semantic_names[name]);
2876 }
2877 } else {
2878 switch (name) {
2879 case TGSI_SEMANTIC_POSITION:
2880 comp = 2; /* tgsi will write to .z component */
2881 so->writes_pos = true;
2882 break;
2883 case TGSI_SEMANTIC_COLOR:
2884 break;
2885 default:
2886 compile_error(ctx, "unknown FS semantic name: %s\n",
2887 tgsi_semantic_names[name]);
2888 }
2889 }
2890
2891 for (i = decl->Range.First; i <= decl->Range.Last; i++) {
2892 unsigned n = so->outputs_count++;
2893 unsigned ncomp, j;
2894
2895 ncomp = 4;
2896
2897 compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
2898
2899 so->outputs[n].semantic = decl_semantic(&decl->Semantic);
2900 so->outputs[n].regid = regid(i, comp);
2901
2902 /* avoid undefined outputs, stick a dummy mov from imm{0.0},
2903 * which if the output is actually assigned will be over-
2904 * written
2905 */
2906 for (j = 0; j < ncomp; j++)
2907 ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
2908 }
2909 }
2910
2911 /* from TGSI perspective, we actually have inputs. But most of the "inputs"
2912 * for a fragment shader are just bary.f instructions. The *actual* inputs
2913 * from the hw perspective are the frag_pos and optionally frag_coord and
2914 * frag_face.
2915 */
2916 static void
2917 fixup_frag_inputs(struct ir3_compile_context *ctx)
2918 {
2919 struct ir3_shader_variant *so = ctx->so;
2920 struct ir3_block *block = ctx->block;
2921 struct ir3_instruction **inputs;
2922 struct ir3_instruction *instr;
2923 int n, regid = 0;
2924
2925 block->ninputs = 0;
2926
2927 n = 4; /* always have frag_pos */
2928 n += COND(so->frag_face, 4);
2929 n += COND(so->frag_coord, 4);
2930
2931 inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
2932
2933 if (so->frag_face) {
2934 /* this ultimately gets assigned to hr0.x so doesn't conflict
2935 * with frag_coord/frag_pos..
2936 */
2937 inputs[block->ninputs++] = ctx->frag_face;
2938 ctx->frag_face->regs[0]->num = 0;
2939
2940 /* remaining channels not used, but let's avoid confusing
2941 * other parts that expect inputs to come in groups of vec4
2942 */
2943 inputs[block->ninputs++] = NULL;
2944 inputs[block->ninputs++] = NULL;
2945 inputs[block->ninputs++] = NULL;
2946 }
2947
2948 /* since we don't know where to set the regid for frag_coord,
2949 * we have to use r0.x for it. But we don't want to *always*
2950 * use r1.x for frag_pos as that could increase the register
2951 * footprint on simple shaders:
2952 */
2953 if (so->frag_coord) {
2954 ctx->frag_coord[0]->regs[0]->num = regid++;
2955 ctx->frag_coord[1]->regs[0]->num = regid++;
2956 ctx->frag_coord[2]->regs[0]->num = regid++;
2957 ctx->frag_coord[3]->regs[0]->num = regid++;
2958
2959 inputs[block->ninputs++] = ctx->frag_coord[0];
2960 inputs[block->ninputs++] = ctx->frag_coord[1];
2961 inputs[block->ninputs++] = ctx->frag_coord[2];
2962 inputs[block->ninputs++] = ctx->frag_coord[3];
2963 }
2964
2965 /* we always have frag_pos: */
2966 so->pos_regid = regid;
2967
2968 /* r0.x */
2969 instr = create_input(block, NULL, block->ninputs);
2970 instr->regs[0]->num = regid++;
2971 inputs[block->ninputs++] = instr;
2972 ctx->frag_pos->regs[1]->instr = instr;
2973
2974 /* r0.y */
2975 instr = create_input(block, NULL, block->ninputs);
2976 instr->regs[0]->num = regid++;
2977 inputs[block->ninputs++] = instr;
2978 ctx->frag_pos->regs[2]->instr = instr;
2979
2980 block->inputs = inputs;
2981 }
2982
2983 static void
2984 compile_instructions(struct ir3_compile_context *ctx)
2985 {
2986 push_block(ctx);
2987
2988 /* for fragment shader, we have a single input register (usually
2989 * r0.xy) which is used as the base for bary.f varying fetch instrs:
2990 */
2991 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
2992 struct ir3_instruction *instr;
2993 instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
2994 ir3_reg_create(instr, 0, 0);
2995 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */
2996 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */
2997 ctx->frag_pos = instr;
2998 }
2999
3000 while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
3001 tgsi_parse_token(&ctx->parser);
3002
3003 switch (ctx->parser.FullToken.Token.Type) {
3004 case TGSI_TOKEN_TYPE_DECLARATION: {
3005 struct tgsi_full_declaration *decl =
3006 &ctx->parser.FullToken.FullDeclaration;
3007 if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
3008 decl_out(ctx, decl);
3009 } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
3010 decl_in(ctx, decl);
3011 }
3012 break;
3013 }
3014 case TGSI_TOKEN_TYPE_IMMEDIATE: {
3015 /* TODO: if we know the immediate is small enough, and only
3016 * used with instructions that can embed an immediate, we
3017 * can skip this:
3018 */
3019 struct tgsi_full_immediate *imm =
3020 &ctx->parser.FullToken.FullImmediate;
3021 unsigned n = ctx->so->immediates_count++;
3022 compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
3023 memcpy(ctx->so->immediates[n].val, imm->u, 16);
3024 break;
3025 }
3026 case TGSI_TOKEN_TYPE_INSTRUCTION: {
3027 struct tgsi_full_instruction *inst =
3028 &ctx->parser.FullToken.FullInstruction;
3029 unsigned opc = inst->Instruction.Opcode;
3030 const struct instr_translater *t = &translaters[opc];
3031
3032 if (t->fxn) {
3033 t->fxn(t, ctx, inst);
3034 ctx->num_internal_temps = 0;
3035
3036 compile_assert(ctx, !ctx->using_tmp_dst);
3037 } else {
3038 compile_error(ctx, "unknown TGSI opc: %s\n",
3039 tgsi_get_opcode_name(opc));
3040 }
3041
3042 switch (inst->Instruction.Saturate) {
3043 case TGSI_SAT_ZERO_ONE:
3044 create_clamp_imm(ctx, &inst->Dst[0].Register,
3045 fui(0.0), fui(1.0));
3046 break;
3047 case TGSI_SAT_MINUS_PLUS_ONE:
3048 create_clamp_imm(ctx, &inst->Dst[0].Register,
3049 fui(-1.0), fui(1.0));
3050 break;
3051 }
3052
3053 instr_finish(ctx);
3054
3055 break;
3056 }
3057 default:
3058 break;
3059 }
3060 }
3061 }
3062
3063 static void
3064 compile_dump(struct ir3_compile_context *ctx)
3065 {
3066 const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
3067 static unsigned n = 0;
3068 char fname[16];
3069 FILE *f;
3070 snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
3071 f = fopen(fname, "w");
3072 if (!f)
3073 return;
3074 ir3_block_depth(ctx->block);
3075 ir3_dump(ctx->ir, name, ctx->block, f);
3076 fclose(f);
3077 }
3078
3079 int
3080 ir3_compile_shader(struct ir3_shader_variant *so,
3081 const struct tgsi_token *tokens, struct ir3_shader_key key,
3082 bool cp)
3083 {
3084 struct ir3_compile_context ctx;
3085 struct ir3_block *block;
3086 struct ir3_instruction **inputs;
3087 unsigned i, j, actual_in;
3088 int ret = 0, max_bary;
3089
3090 assert(!so->ir);
3091
3092 so->ir = ir3_create();
3093
3094 assert(so->ir);
3095
3096 if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
3097 DBG("INIT failed!");
3098 ret = -1;
3099 goto out;
3100 }
3101
3102 compile_instructions(&ctx);
3103
3104 block = ctx.block;
3105
3106 /* keep track of the inputs from TGSI perspective.. */
3107 inputs = block->inputs;
3108
3109 /* but fixup actual inputs for frag shader: */
3110 if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
3111 fixup_frag_inputs(&ctx);
3112
3113 /* at this point, for binning pass, throw away unneeded outputs: */
3114 if (key.binning_pass) {
3115 for (i = 0, j = 0; i < so->outputs_count; i++) {
3116 unsigned name = sem2name(so->outputs[i].semantic);
3117 unsigned idx = sem2idx(so->outputs[i].semantic);
3118
3119 /* throw away everything but first position/psize */
3120 if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
3121 (name == TGSI_SEMANTIC_PSIZE))) {
3122 if (i != j) {
3123 so->outputs[j] = so->outputs[i];
3124 block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
3125 block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
3126 block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
3127 block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
3128 }
3129 j++;
3130 }
3131 }
3132 so->outputs_count = j;
3133 block->noutputs = j * 4;
3134 }
3135
3136 /* for rendering to alpha format, we only need the .w component,
3137 * and we need it to be in the .x position:
3138 */
3139 if (key.alpha) {
3140 for (i = 0, j = 0; i < so->outputs_count; i++) {
3141 unsigned name = sem2name(so->outputs[i].semantic);
3142
3143 /* move .w component to .x and discard others: */
3144 if (name == TGSI_SEMANTIC_COLOR) {
3145 block->outputs[(i*4)+0] = block->outputs[(i*4)+3];
3146 block->outputs[(i*4)+1] = NULL;
3147 block->outputs[(i*4)+2] = NULL;
3148 block->outputs[(i*4)+3] = NULL;
3149 }
3150 }
3151 }
3152
3153 /* at this point, we want the kill's in the outputs array too,
3154 * so that they get scheduled (since they have no dst).. we've
3155 * already ensured that the array is big enough in push_block():
3156 */
3157 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
3158 for (i = 0; i < ctx.kill_count; i++)
3159 block->outputs[block->noutputs++] = ctx.kill[i];
3160 }
3161
3162 if (fd_mesa_debug & FD_DBG_OPTDUMP)
3163 compile_dump(&ctx);
3164
3165 ret = ir3_block_flatten(block);
3166 if (ret < 0) {
3167 DBG("FLATTEN failed!");
3168 goto out;
3169 }
3170 if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
3171 compile_dump(&ctx);
3172
3173 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3174 printf("BEFORE CP:\n");
3175 ir3_dump_instr_list(block->head);
3176 }
3177
3178 if (cp)
3179 ir3_block_cp(block);
3180
3181 if (fd_mesa_debug & FD_DBG_OPTDUMP)
3182 compile_dump(&ctx);
3183
3184 ir3_block_depth(block);
3185
3186 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3187 printf("AFTER DEPTH:\n");
3188 ir3_dump_instr_list(block->head);
3189 }
3190
3191 ret = ir3_block_sched(block);
3192 if (ret) {
3193 DBG("SCHED failed!");
3194 goto out;
3195 }
3196
3197 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3198 printf("AFTER SCHED:\n");
3199 ir3_dump_instr_list(block->head);
3200 }
3201
3202 ret = ir3_block_ra(block, so->type, key.half_precision,
3203 so->frag_coord, so->frag_face, &so->has_samp, &max_bary);
3204 if (ret) {
3205 DBG("RA failed!");
3206 goto out;
3207 }
3208
3209 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3210 printf("AFTER RA:\n");
3211 ir3_dump_instr_list(block->head);
3212 }
3213
3214 /* fixup input/outputs: */
3215 for (i = 0; i < so->outputs_count; i++) {
3216 so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
3217 /* preserve hack for depth output.. tgsi writes depth to .z,
3218 * but what we give the hw is the scalar register:
3219 */
3220 if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
3221 (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
3222 so->outputs[i].regid += 2;
3223 }
3224 /* Note that some or all channels of an input may be unused: */
3225 actual_in = 0;
3226 for (i = 0; i < so->inputs_count; i++) {
3227 unsigned j, regid = ~0, compmask = 0;
3228 so->inputs[i].ncomp = 0;
3229 for (j = 0; j < 4; j++) {
3230 struct ir3_instruction *in = inputs[(i*4) + j];
3231 if (in) {
3232 compmask |= (1 << j);
3233 regid = in->regs[0]->num - j;
3234 actual_in++;
3235 so->inputs[i].ncomp++;
3236 }
3237 }
3238 so->inputs[i].regid = regid;
3239 so->inputs[i].compmask = compmask;
3240 }
3241
3242 /* fragment shader always gets full vec4's even if it doesn't
3243 * fetch all components, but for the vertex shader we need to update
3244 * it with the actual number of components fetched, otherwise things
3245 * will hang due to a mismatch between VFD_DECODE's and
3246 * TOTALATTRTOVS
3247 */
3248 if (so->type == SHADER_VERTEX)
3249 so->total_in = actual_in;
3250 else
3251 so->total_in = align(max_bary + 1, 4);
3252
3253 out:
3254 if (ret) {
3255 ir3_destroy(so->ir);
3256 so->ir = NULL;
3257 }
3258 compile_free(&ctx);
3259
3260 return ret;
3261 }