src/gallium/drivers/panfrost/midgard/midgard_compile.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include <sys/types.h>
  25 #include <sys/stat.h>
  26 #include <sys/mman.h>
  27 #include <fcntl.h>
  28 #include <stdint.h>
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <err.h>
  32
  33 #include "main/mtypes.h"
  34 #include "compiler/glsl/glsl_to_nir.h"
  35 #include "compiler/nir_types.h"
  36 #include "main/imports.h"
  37 #include "compiler/nir/nir_builder.h"
  38 #include "util/half_float.h"
  39 #include "util/u_debug.h"
  40 #include "util/u_dynarray.h"
  41 #include "util/list.h"
  42 #include "main/mtypes.h"
  43
  44 #include "midgard.h"
  45 #include "midgard_nir.h"
  46 #include "midgard_compile.h"
  47 #include "midgard_ops.h"
  48 #include "helpers.h"
  49 #include "compiler.h"
  50
  51 #include "disassemble.h"
  52
/* Table of named debug flags: each entry pairs an option name with its
 * MIDGARD_DBG_* bit and a help string. */
static const struct debug_named_value debug_options[] = {
        {"msgs",      MIDGARD_DBG_MSGS,         "Print debug messages"},
        {"shaders",   MIDGARD_DBG_SHADERS,      "Dump shaders in NIR and MIR"},
        DEBUG_NAMED_VALUE_END
};

/* Instantiates the flag-option helper for "MIDGARD_MESA_DEBUG" using the
 * table above and a default of 0 (see util/u_debug.h for the exact
 * expansion of this macro) */
DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", debug_options, 0)

/* Debug flag mask consulted by DBG(); starts out with no flags set */
int midgard_debug = 0;

/* printf-style logging to stderr, prefixed with the calling function name and
 * line number. Prints only when MIDGARD_DBG_MSGS is set in midgard_debug. */
#define DBG(fmt, ...) \
                do { if (midgard_debug & MIDGARD_DBG_MSGS) \
                        fprintf(stderr, "%s:%d: "fmt, \
                                __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
  67
  68 static bool
  69 midgard_is_branch_unit(unsigned unit)
  70 {
  71         return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT);
  72 }
  73
  74 static void
  75 midgard_block_add_successor(midgard_block *block, midgard_block *successor)
  76 {
  77         block->successors[block->nr_successors++] = successor;
  78         assert(block->nr_successors <= ARRAY_SIZE(block->successors));
  79 }
  80
/* Helpers to generate midgard_instruction's using macro magic, since every
 * driver seems to do it that way */

/* EMIT(op, args...) calls the builder v_<op>(args...) and hands the result to
 * emit_mir_instruction. A variable named `ctx` must be in scope at the call
 * site. Note that the expansion itself already ends in a semicolon. */
#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
/* Swizzle selecting X, Y, Z, W in that order */
#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W)
  86
/* M_LOAD_STORE(name, rname, uname) defines a builder m_<name>(ssa, address)
 * returning a TAG_LOAD_STORE_4 instruction with opcode midgard_op_<name>.
 * The SSA index is written to the ssa_args field called `rname`; the field
 * called `uname` and src1 are both set to -1. The load/store word gets a
 * mask of 0xF, the XYZW swizzle and the given address. */
#define M_LOAD_STORE(name, rname, uname) \
        static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
                midgard_instruction i = { \
                        .type = TAG_LOAD_STORE_4, \
                        .ssa_args = { \
                                .rname = ssa, \
                                .uname = -1, \
                                .src1 = -1 \
                        }, \
                        .load_store = { \
                                .op = midgard_op_##name, \
                                .mask = 0xF, \
                                .swizzle = SWIZZLE_XYZW, \
                                .address = address \
                        } \
                }; \
                \
                return i; \
        }

/* A load puts the SSA index in dest (src0 is -1); a store puts it in src0
 * (dest is -1) */
#define M_LOAD(name) M_LOAD_STORE(name, dest, src0)
#define M_STORE(name) M_LOAD_STORE(name, src0, dest)
 109
/* Vector source with the XYZW swizzle and every other field zero */
const midgard_vector_alu_src blank_alu_src = {
        .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
};

/* Vector source that reads the X component in all four lanes */
const midgard_vector_alu_src blank_alu_src_xxxx = {
        .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X),
};

/* Scalar source with only the `full` flag set */
const midgard_scalar_alu_src blank_scalar_alu_src = {
        .full = true
};

/* Used for encoding the unused source of 1-op instructions */
const midgard_vector_alu_src zero_alu_src = { 0 };
 124
 125 /* Coerce structs to integer */
 126
 127 static unsigned
 128 vector_alu_srco_unsigned(midgard_vector_alu_src src)
 129 {
 130         unsigned u;
 131         memcpy(&u, &src, sizeof(src));
 132         return u;
 133 }
 134
 135 static midgard_vector_alu_src
 136 vector_alu_from_unsigned(unsigned u)
 137 {
 138         midgard_vector_alu_src s;
 139         memcpy(&s, &u, sizeof(s));
 140         return s;
 141 }
 142
 143 /* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs
 144  * the corresponding Midgard source */
 145
 146 static midgard_vector_alu_src
 147 vector_alu_modifiers(nir_alu_src *src, bool is_int)
 148 {
 149         if (!src) return blank_alu_src;
 150
 151         midgard_vector_alu_src alu_src = {
 152                 .rep_low = 0,
 153                 .rep_high = 0,
 154                 .half = 0, /* TODO */
 155                 .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle)
 156         };
 157
 158         if (is_int) {
 159                 /* TODO: sign-extend/zero-extend */
 160                 alu_src.mod = midgard_int_normal;
 161
 162                 /* These should have been lowered away */
 163                 assert(!(src->abs || src->negate));
 164         } else {
 165                 alu_src.mod = (src->abs << 0) | (src->negate << 1);
 166         }
 167
 168         return alu_src;
 169 }
 170
 171 /* 'Intrinsic' move for misc aliasing uses independent of actual NIR ALU code */
 172
 173 static midgard_instruction
 174 v_fmov(unsigned src, midgard_vector_alu_src mod, unsigned dest)
 175 {
 176         midgard_instruction ins = {
 177                 .type = TAG_ALU_4,
 178                 .ssa_args = {
 179                         .src0 = SSA_UNUSED_1,
 180                         .src1 = src,
 181                         .dest = dest,
 182                 },
 183                 .alu = {
 184                         .op = midgard_alu_op_fmov,
 185                         .reg_mode = midgard_reg_mode_32,
 186                         .dest_override = midgard_dest_override_none,
 187                         .mask = 0xFF,
 188                         .src1 = vector_alu_srco_unsigned(zero_alu_src),
 189                         .src2 = vector_alu_srco_unsigned(mod)
 190                 },
 191         };
 192
 193         return ins;
 194 }
 195
/* load/store instructions have both 32-bit and 16-bit variants, depending on
 * whether we are using vectors composed of highp or mediump. At the moment, we
 * don't support half-floats -- this requires changes in other parts of the
 * compiler -- therefore the 16-bit versions are commented out.
 *
 * Each active line below defines a static builder named m_<op>, for example
 * m_ld_attr_32(ssa, address). */

//M_LOAD(ld_attr_16);
M_LOAD(ld_attr_32);
//M_LOAD(ld_vary_16);
M_LOAD(ld_vary_32);
//M_LOAD(ld_uniform_16);
M_LOAD(ld_uniform_32);
M_LOAD(ld_color_buffer_8);
//M_STORE(st_vary_16);
M_STORE(st_vary_32);
M_STORE(st_cubemap_coords);
 211
 212 static midgard_instruction
 213 v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond)
 214 {
 215         midgard_branch_cond branch = {
 216                 .op = op,
 217                 .dest_tag = tag,
 218                 .offset = offset,
 219                 .cond = cond
 220         };
 221
 222         uint16_t compact;
 223         memcpy(&compact, &branch, sizeof(branch));
 224
 225         midgard_instruction ins = {
 226                 .type = TAG_ALU_4,
 227                 .unit = ALU_ENAB_BR_COMPACT,
 228                 .prepacked_branch = true,
 229                 .compact_branch = true,
 230                 .br_compact = compact
 231         };
 232
 233         if (op == midgard_jmp_writeout_op_writeout)
 234                 ins.writeout = true;
 235
 236         return ins;
 237 }
 238
 239 static midgard_instruction
 240 v_branch(bool conditional, bool invert)
 241 {
 242         midgard_instruction ins = {
 243                 .type = TAG_ALU_4,
 244                 .unit = ALU_ENAB_BRANCH,
 245                 .compact_branch = true,
 246                 .branch = {
 247                         .conditional = conditional,
 248                         .invert_conditional = invert
 249                 }
 250         };
 251
 252         return ins;
 253 }
 254
 255 static midgard_branch_extended
 256 midgard_create_branch_extended( midgard_condition cond,
 257                                 midgard_jmp_writeout_op op,
 258                                 unsigned dest_tag,
 259                                 signed quadword_offset)
 260 {
 261         /* For unclear reasons, the condition code is repeated 8 times */
 262         uint16_t duplicated_cond =
 263                 (cond << 14) |
 264                 (cond << 12) |
 265                 (cond << 10) |
 266                 (cond << 8) |
 267                 (cond << 6) |
 268                 (cond << 4) |
 269                 (cond << 2) |
 270                 (cond << 0);
 271
 272         midgard_branch_extended branch = {
 273                 .op = op,
 274                 .dest_tag = dest_tag,
 275                 .offset = quadword_offset,
 276                 .cond = duplicated_cond
 277         };
 278
 279         return branch;
 280 }
 281
 282 static void
 283 attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
 284 {
 285         ins->has_constants = true;
 286         memcpy(&ins->constants, constants, 16);
 287 }
 288
 289 static int
 290 glsl_type_size(const struct glsl_type *type, bool bindless)
 291 {
 292         return glsl_count_attribute_slots(type, false);
 293 }
 294
 295 /* Lower fdot2 to a vector multiplication followed by channel addition  */
 296 static void
 297 midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu)
 298 {
 299         if (alu->op != nir_op_fdot2)
 300                 return;
 301
 302         b->cursor = nir_before_instr(&alu->instr);
 303
 304         nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
 305         nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
 306
 307         nir_ssa_def *product = nir_fmul(b, src0, src1);
 308
 309         nir_ssa_def *sum = nir_fadd(b,
 310                         nir_channel(b, product, 0),
 311                         nir_channel(b, product, 1));
 312
 313         /* Replace the fdot2 with this sum */
 314         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));
 315 }
 316
 317 static int
 318 midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr)
 319 {
 320         switch (instr->intrinsic) {
 321         case nir_intrinsic_load_viewport_scale:
 322                 return PAN_SYSVAL_VIEWPORT_SCALE;
 323         case nir_intrinsic_load_viewport_offset:
 324                 return PAN_SYSVAL_VIEWPORT_OFFSET;
 325         default:
 326                 return -1;
 327         }
 328 }
 329
 330 static void
 331 midgard_nir_assign_sysval_body(compiler_context *ctx, nir_instr *instr)
 332 {
 333         int sysval = -1;
 334
 335         if (instr->type == nir_instr_type_intrinsic) {
 336                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 337                 sysval = midgard_nir_sysval_for_intrinsic(intr);
 338         }
 339
 340         if (sysval < 0)
 341                 return;
 342
 343         /* We have a sysval load; check if it's already been assigned */
 344
 345         if (_mesa_hash_table_u64_search(ctx->sysval_to_id, sysval))
 346                 return;
 347
 348         /* It hasn't -- so assign it now! */
 349
 350         unsigned id = ctx->sysval_count++;
 351         _mesa_hash_table_u64_insert(ctx->sysval_to_id, sysval, (void *) ((uintptr_t) id + 1));
 352         ctx->sysvals[id] = sysval;
 353 }
 354
 355 static void
 356 midgard_nir_assign_sysvals(compiler_context *ctx, nir_shader *shader)
 357 {
 358         ctx->sysval_count = 0;
 359
 360         nir_foreach_function(function, shader) {
 361                 if (!function->impl) continue;
 362
 363                 nir_foreach_block(block, function->impl) {
 364                         nir_foreach_instr_safe(instr, block) {
 365                                 midgard_nir_assign_sysval_body(ctx, instr);
 366                         }
 367                 }
 368         }
 369 }
 370
 371 static bool
 372 midgard_nir_lower_fdot2(nir_shader *shader)
 373 {
 374         bool progress = false;
 375
 376         nir_foreach_function(function, shader) {
 377                 if (!function->impl) continue;
 378
 379                 nir_builder _b;
 380                 nir_builder *b = &_b;
 381                 nir_builder_init(b, function->impl);
 382
 383                 nir_foreach_block(block, function->impl) {
 384                         nir_foreach_instr_safe(instr, block) {
 385                                 if (instr->type != nir_instr_type_alu) continue;
 386
 387                                 nir_alu_instr *alu = nir_instr_as_alu(instr);
 388                                 midgard_nir_lower_fdot2_body(b, alu);
 389
 390                                 progress |= true;
 391                         }
 392                 }
 393
 394                 nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance);
 395
 396         }
 397
 398         return progress;
 399 }
 400
/* Runs the NIR lowering and optimisation pipeline on `nir` in place. In
 * order: initial lowering (regs to SSA, fdot2, rect textures), a loop of
 * generic optimisations repeated until no pass reports progress, trig
 * scaling, a second cleanup loop, late algebraic passes and boolean
 * lowering, float source-modifier lowering, and finally conversion out of
 * SSA with vector write combining. The order of the passes matters. */
static void
optimise_nir(nir_shader *nir)
{
        bool progress;

        /* Bitmask of flrp bit sizes (16/32/64) that the backend options ask
         * to have lowered */
        unsigned lower_flrp =
                (nir->options->lower_flrp16 ? 16 : 0) |
                (nir->options->lower_flrp32 ? 32 : 0) |
                (nir->options->lower_flrp64 ? 64 : 0);

        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
        NIR_PASS(progress, nir, midgard_nir_lower_fdot2);

        nir_lower_tex_options lower_tex_options = {
                .lower_rect = true
        };

        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);

        /* Main optimisation loop: repeat until a full iteration makes no
         * progress */
        do {
                progress = false;

                NIR_PASS(progress, nir, nir_lower_var_copies);
                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);

                NIR_PASS(progress, nir, nir_copy_prop);
                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_dead_cf);
                NIR_PASS(progress, nir, nir_opt_cse);
                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
                NIR_PASS(progress, nir, nir_opt_algebraic);
                NIR_PASS(progress, nir, nir_opt_constant_folding);

                if (lower_flrp != 0) {
                        bool lower_flrp_progress = false;
                        NIR_PASS(lower_flrp_progress,
                                 nir,
                                 nir_lower_flrp,
                                 lower_flrp,
                                 false /* always_precise */,
                                 nir->options->lower_ffma);
                        if (lower_flrp_progress) {
                                NIR_PASS(progress, nir,
                                         nir_opt_constant_folding);
                                progress = true;
                        }

                        /* Nothing should rematerialize any flrps, so we only
                         * need to do this lowering once.
                         */
                        lower_flrp = 0;
                }

                NIR_PASS(progress, nir, nir_opt_undef);
                NIR_PASS(progress, nir, nir_opt_loop_unroll,
                         nir_var_shader_in |
                         nir_var_shader_out |
                         nir_var_function_temp);

                /* TODO: Enable vectorize when merged upstream */
                // NIR_PASS(progress, nir, nir_opt_vectorize);
        } while (progress);

        /* Must be run at the end to prevent creation of fsin/fcos ops */
        NIR_PASS(progress, nir, midgard_nir_scale_trig);

        /* Clean up after trig scaling, again until nothing changes */
        do {
                progress = false;

                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_algebraic);
                NIR_PASS(progress, nir, nir_opt_constant_folding);
                NIR_PASS(progress, nir, nir_copy_prop);
        } while (progress);

        NIR_PASS(progress, nir, nir_opt_algebraic_late);

        /* We implement booleans as 32-bit 0/~0 */
        NIR_PASS(progress, nir, nir_lower_bool_to_int32);

        /* Now that booleans are lowered, we can run our late opts */
        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);

        /* Lower mods for float ops only. Integer ops don't support modifiers
         * (saturate doesn't make sense on integers, neg/abs require dedicated
         * instructions) */

        NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods);
        NIR_PASS(progress, nir, nir_copy_prop);
        NIR_PASS(progress, nir, nir_opt_dce);

        /* Take us out of SSA */
        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
        NIR_PASS(progress, nir, nir_convert_from_ssa, true);

        /* We are a vector architecture; write combine where possible */
        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
        NIR_PASS(progress, nir, nir_lower_vec_to_movs);

        NIR_PASS(progress, nir, nir_opt_dce);
}
 501
 502 /* Front-half of aliasing the SSA slots, merely by inserting the flag in the
 503  * appropriate hash table. Intentional off-by-one to avoid confusing NULL with
 504  * r0. See the comments in compiler_context */
 505
 506 static void
 507 alias_ssa(compiler_context *ctx, int dest, int src)
 508 {
 509         _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1));
 510         _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1));
 511 }
 512
 513 /* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */
 514
 515 static void
 516 unalias_ssa(compiler_context *ctx, int dest)
 517 {
 518         _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1);
 519         /* TODO: Remove from leftover or no? */
 520 }
 521
 522 /* Do not actually emit a load; instead, cache the constant for inlining */
 523
 524 static void
 525 emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
 526 {
 527         nir_ssa_def def = instr->def;
 528
 529         float *v = rzalloc_array(NULL, float, 4);
 530         nir_const_load_to_arr(v, instr, f32);
 531         _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
 532 }
 533
 534 /* Duplicate bits to convert sane 4-bit writemask to obscure 8-bit format (or
 535  * do the inverse) */
 536
 537 static unsigned
 538 expand_writemask(unsigned mask)
 539 {
 540         unsigned o = 0;
 541
 542         for (int i = 0; i < 4; ++i)
 543                 if (mask & (1 << i))
 544                         o |= (3 << (2 * i));
 545
 546         return o;
 547 }
 548
 549 static unsigned
 550 squeeze_writemask(unsigned mask)
 551 {
 552         unsigned o = 0;
 553
 554         for (int i = 0; i < 4; ++i)
 555                 if (mask & (3 << (2 * i)))
 556                         o |= (1 << i);
 557
 558         return o;
 559
 560 }
 561
 562 /* Determines effective writemask, taking quirks and expansion into account */
 563 static unsigned
 564 effective_writemask(midgard_vector_alu *alu)
 565 {
 566         /* Channel count is off-by-one to fit in two-bits (0 channel makes no
 567          * sense) */
 568
 569         unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op].props);
 570
 571         /* If there is a fixed channel count, construct the appropriate mask */
 572
 573         if (channel_count)
 574                 return (1 << channel_count) - 1;
 575
 576         /* Otherwise, just squeeze the existing mask */
 577         return squeeze_writemask(alu->mask);
 578 }
 579
 580 static unsigned
 581 nir_src_index(compiler_context *ctx, nir_src *src)
 582 {
 583         if (src->is_ssa)
 584                 return src->ssa->index;
 585         else {
 586                 assert(!src->reg.indirect);
 587                 return ctx->func->impl->ssa_alloc + src->reg.reg->index;
 588         }
 589 }
 590
 591 static unsigned
 592 nir_dest_index(compiler_context *ctx, nir_dest *dst)
 593 {
 594         if (dst->is_ssa)
 595                 return dst->ssa.index;
 596         else {
 597                 assert(!dst->reg.indirect);
 598                 return ctx->func->impl->ssa_alloc + dst->reg.reg->index;
 599         }
 600 }
 601
 602 static unsigned
 603 nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
 604 {
 605         return nir_src_index(ctx, &src->src);
 606 }
 607
 608 static bool
 609 nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
 610 {
 611         unsigned comp = src->swizzle[0];
 612
 613         for (unsigned c = 1; c < nr_components; ++c) {
 614                 if (src->swizzle[c] != comp)
 615                         return true;
 616         }
 617
 618         return false;
 619 }
 620
 621 /* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the
 622  * output of a conditional test) into that register */
 623
 624 static void
 625 emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component)
 626 {
 627         int condition = nir_src_index(ctx, src);
 628
 629         /* Source to swizzle the desired component into w */
 630
 631         const midgard_vector_alu_src alu_src = {
 632                 .swizzle = SWIZZLE(component, component, component, component),
 633         };
 634
 635         /* There is no boolean move instruction. Instead, we simulate a move by
 636          * ANDing the condition with itself to get it into r31.w */
 637
 638         midgard_instruction ins = {
 639                 .type = TAG_ALU_4,
 640
 641                 /* We need to set the conditional as close as possible */
 642                 .precede_break = true,
 643                 .unit = for_branch ? UNIT_SMUL : UNIT_SADD,
 644
 645                 .ssa_args = {
 646
 647                         .src0 = condition,
 648                         .src1 = condition,
 649                         .dest = SSA_FIXED_REGISTER(31),
 650                 },
 651                 .alu = {
 652                         .op = midgard_alu_op_iand,
 653                         .reg_mode = midgard_reg_mode_32,
 654                         .dest_override = midgard_dest_override_none,
 655                         .mask = (0x3 << 6), /* w */
 656                         .src1 = vector_alu_srco_unsigned(alu_src),
 657                         .src2 = vector_alu_srco_unsigned(alu_src)
 658                 },
 659         };
 660
 661         emit_mir_instruction(ctx, ins);
 662 }
 663
 664 /* Or, for mixed conditions (with csel_v), here's a vector version using all of
 665  * r31 instead */
 666
 667 static void
 668 emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
 669 {
 670         int condition = nir_src_index(ctx, &src->src);
 671
 672         /* Source to swizzle the desired component into w */
 673
 674         const midgard_vector_alu_src alu_src = {
 675                 .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle),
 676         };
 677
 678         /* There is no boolean move instruction. Instead, we simulate a move by
 679          * ANDing the condition with itself to get it into r31.w */
 680
 681         midgard_instruction ins = {
 682                 .type = TAG_ALU_4,
 683                 .precede_break = true,
 684                 .ssa_args = {
 685                         .src0 = condition,
 686                         .src1 = condition,
 687                         .dest = SSA_FIXED_REGISTER(31),
 688                 },
 689                 .alu = {
 690                         .op = midgard_alu_op_iand,
 691                         .reg_mode = midgard_reg_mode_32,
 692                         .dest_override = midgard_dest_override_none,
 693                         .mask = expand_writemask((1 << nr_comp) - 1),
 694                         .src1 = vector_alu_srco_unsigned(alu_src),
 695                         .src2 = vector_alu_srco_unsigned(alu_src)
 696                 },
 697         };
 698
 699         emit_mir_instruction(ctx, ins);
 700 }
 701
 702
 703
 704 /* Likewise, indirect offsets are put in r27.w. TODO: Allow componentwise
 705  * pinning to eliminate this move in all known cases */
 706
 707 static void
 708 emit_indirect_offset(compiler_context *ctx, nir_src *src)
 709 {
 710         int offset = nir_src_index(ctx, src);
 711
 712         midgard_instruction ins = {
 713                 .type = TAG_ALU_4,
 714                 .ssa_args = {
 715                         .src0 = SSA_UNUSED_1,
 716                         .src1 = offset,
 717                         .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET),
 718                 },
 719                 .alu = {
 720                         .op = midgard_alu_op_imov,
 721                         .reg_mode = midgard_reg_mode_32,
 722                         .dest_override = midgard_dest_override_none,
 723                         .mask = (0x3 << 6), /* w */
 724                         .src1 = vector_alu_srco_unsigned(zero_alu_src),
 725                         .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx)
 726                 },
 727         };
 728
 729         emit_mir_instruction(ctx, ins);
 730 }
 731
/* Expands to a `case nir_op_<nir>:` label that assigns midgard_alu_op_<_op>
 * to a variable named `op` in the enclosing switch and then breaks. */
#define ALU_CASE(nir, _op) \
        case nir_op_##nir: \
                op = midgard_alu_op_##_op; \
                break;
 736 static bool
 737 nir_is_fzero_constant(nir_src src)
 738 {
 739         if (!nir_src_is_const(src))
 740                 return false;
 741
 742         for (unsigned c = 0; c < nir_src_num_components(src); ++c) {
 743                 if (nir_src_comp_as_float(src, c) != 0.0)
 744                         return false;
 745         }
 746
 747         return true;
 748 }
 749
 750 static void
 751 emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 752 {
 753         bool is_ssa = instr->dest.dest.is_ssa;
 754
 755         unsigned dest = nir_dest_index(ctx, &instr->dest.dest);
 756         unsigned nr_components = is_ssa ? instr->dest.dest.ssa.num_components : instr->dest.dest.reg.reg->num_components;
 757         unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
 758
 759         /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are
 760          * supported. A few do not and are commented for now. Also, there are a
 761          * number of NIR ops which Midgard does not support and need to be
 762          * lowered, also TODO. This switch block emits the opcode and calling
 763          * convention of the Midgard instruction; actual packing is done in
 764          * emit_alu below */
 765
 766         unsigned op;
 767
 768         switch (instr->op) {
 769                 ALU_CASE(fadd, fadd);
 770                 ALU_CASE(fmul, fmul);
 771                 ALU_CASE(fmin, fmin);
 772                 ALU_CASE(fmax, fmax);
 773                 ALU_CASE(imin, imin);
 774                 ALU_CASE(imax, imax);
 775                 ALU_CASE(umin, umin);
 776                 ALU_CASE(umax, umax);
 777                 ALU_CASE(ffloor, ffloor);
 778                 ALU_CASE(fround_even, froundeven);
 779                 ALU_CASE(ftrunc, ftrunc);
 780                 ALU_CASE(fceil, fceil);
 781                 ALU_CASE(fdot3, fdot3);
 782                 ALU_CASE(fdot4, fdot4);
 783                 ALU_CASE(iadd, iadd);
 784                 ALU_CASE(isub, isub);
 785                 ALU_CASE(imul, imul);
 786                 ALU_CASE(iabs, iabs);
 787                 ALU_CASE(mov, imov);
 788
 789                 ALU_CASE(feq32, feq);
 790                 ALU_CASE(fne32, fne);
 791                 ALU_CASE(flt32, flt);
 792                 ALU_CASE(ieq32, ieq);
 793                 ALU_CASE(ine32, ine);
 794                 ALU_CASE(ilt32, ilt);
 795                 ALU_CASE(ult32, ult);
 796
 797                 /* We don't have a native b2f32 instruction. Instead, like many
 798                  * GPUs, we exploit booleans as 0/~0 for false/true, and
 799                  * correspondingly AND
 800                  * by 1.0 to do the type conversion. For the moment, prime us
 801                  * to emit:
 802                  *
 803                  * iand [whatever], #0
 804                  *
 805                  * At the end of emit_alu (as MIR), we'll fix-up the constant
 806                  */
 807
 808                 ALU_CASE(b2f32, iand);
 809                 ALU_CASE(b2i32, iand);
 810
 811                 /* Likewise, we don't have a dedicated f2b32 instruction, but
 812                  * we can do a "not equal to 0.0" test. */
 813
 814                 ALU_CASE(f2b32, fne);
 815                 ALU_CASE(i2b32, ine);
 816
 817                 ALU_CASE(frcp, frcp);
 818                 ALU_CASE(frsq, frsqrt);
 819                 ALU_CASE(fsqrt, fsqrt);
 820                 ALU_CASE(fexp2, fexp2);
 821                 ALU_CASE(flog2, flog2);
 822
 823                 ALU_CASE(f2i32, f2i);
 824                 ALU_CASE(f2u32, f2u);
 825                 ALU_CASE(i2f32, i2f);
 826                 ALU_CASE(u2f32, u2f);
 827
 828                 ALU_CASE(fsin, fsin);
 829                 ALU_CASE(fcos, fcos);
 830
 831                 ALU_CASE(iand, iand);
 832                 ALU_CASE(ior, ior);
 833                 ALU_CASE(ixor, ixor);
 834                 ALU_CASE(inot, inand);
 835                 ALU_CASE(ishl, ishl);
 836                 ALU_CASE(ishr, iasr);
 837                 ALU_CASE(ushr, ilsr);
 838
 839                 ALU_CASE(b32all_fequal2, fball_eq);
 840                 ALU_CASE(b32all_fequal3, fball_eq);
 841                 ALU_CASE(b32all_fequal4, fball_eq);
 842
 843                 ALU_CASE(b32any_fnequal2, fbany_neq);
 844                 ALU_CASE(b32any_fnequal3, fbany_neq);
 845                 ALU_CASE(b32any_fnequal4, fbany_neq);
 846
 847                 ALU_CASE(b32all_iequal2, iball_eq);
 848                 ALU_CASE(b32all_iequal3, iball_eq);
 849                 ALU_CASE(b32all_iequal4, iball_eq);
 850
 851                 ALU_CASE(b32any_inequal2, ibany_neq);
 852                 ALU_CASE(b32any_inequal3, ibany_neq);
 853                 ALU_CASE(b32any_inequal4, ibany_neq);
 854
 855                 /* Source mods will be shoved in later */
 856                 ALU_CASE(fabs, fmov);
 857                 ALU_CASE(fneg, fmov);
 858                 ALU_CASE(fsat, fmov);
 859
 860         /* For greater-or-equal, we lower to less-or-equal and flip the
 861          * arguments */
 862
 863         case nir_op_fge:
 864         case nir_op_fge32:
 865         case nir_op_ige32:
 866         case nir_op_uge32: {
 867                 op =
 868                         instr->op == nir_op_fge   ? midgard_alu_op_fle :
 869                         instr->op == nir_op_fge32 ? midgard_alu_op_fle :
 870                         instr->op == nir_op_ige32 ? midgard_alu_op_ile :
 871                         instr->op == nir_op_uge32 ? midgard_alu_op_ule :
 872                         0;
 873
 874                 /* Swap via temporary */
 875                 nir_alu_src temp = instr->src[1];
 876                 instr->src[1] = instr->src[0];
 877                 instr->src[0] = temp;
 878
 879                 break;
 880         }
 881
 882         case nir_op_b32csel: {
 883                 /* Midgard features both fcsel and icsel, depending on
 884                  * the type of the arguments/output. However, as long
 885                  * as we're careful we can _always_ use icsel and
 886                  * _never_ need fcsel, since the latter does additional
 887                  * floating-point-specific processing whereas the
 888                  * former just moves bits on the wire. It's not obvious
 889                  * why these are separate opcodes, save for the ability
 890                  * to do things like sat/pos/abs/neg for free */
 891
 892                 bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
 893                 op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel;
 894
 895                 /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
 896                 nr_inputs = 2;
 897
 898                 /* Emit the condition into r31 */
 899
 900                 if (mixed)
 901                         emit_condition_mixed(ctx, &instr->src[0], nr_components);
 902                 else
 903                         emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]);
 904
 905                 /* The condition is the first argument; move the other
 906                  * arguments up one to be a binary instruction for
 907                  * Midgard */
 908
 909                 memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
 910                 break;
 911         }
 912
 913         default:
 914                 DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
 915                 assert(0);
 916                 return;
 917         }
 918
 919         /* Midgard can perform certain modifiers on output of an ALU op */
 920         midgard_outmod outmod =
 921                 midgard_is_integer_out_op(op) ? midgard_outmod_int :
 922                 instr->dest.saturate ? midgard_outmod_sat : midgard_outmod_none;
 923
 924         if (instr->op == nir_op_fsat)
 925                 outmod = midgard_outmod_sat;
 926
 927         /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */
 928
 929         if (instr->op == nir_op_fmax) {
 930                 if (nir_is_fzero_constant(instr->src[0].src)) {
 931                         op = midgard_alu_op_fmov;
 932                         nr_inputs = 1;
 933                         outmod = midgard_outmod_pos;
 934                         instr->src[0] = instr->src[1];
 935                 } else if (nir_is_fzero_constant(instr->src[1].src)) {
 936                         op = midgard_alu_op_fmov;
 937                         nr_inputs = 1;
 938                         outmod = midgard_outmod_pos;
 939                 }
 940         }
 941
 942         /* Fetch unit, quirks, etc information */
 943         unsigned opcode_props = alu_opcode_props[op].props;
 944         bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
 945
 946         /* src0 will always exist afaik, but src1 will not for 1-argument
 947          * instructions. The latter can only be fetched if the instruction
 948          * needs it, or else we may segfault. */
 949
 950         unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]);
 951         unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0;
 952
 953         /* Rather than use the instruction generation helpers, we do it
 954          * ourselves here to avoid the mess */
 955
 956         midgard_instruction ins = {
 957                 .type = TAG_ALU_4,
 958                 .ssa_args = {
 959                         .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0,
 960                         .src1 = quirk_flipped_r24 ? src0         : src1,
 961                         .dest = dest,
 962                 }
 963         };
 964
 965         nir_alu_src *nirmods[2] = { NULL };
 966
 967         if (nr_inputs == 2) {
 968                 nirmods[0] = &instr->src[0];
 969                 nirmods[1] = &instr->src[1];
 970         } else if (nr_inputs == 1) {
 971                 nirmods[quirk_flipped_r24] = &instr->src[0];
 972         } else {
 973                 assert(0);
 974         }
 975
 976         /* These were lowered to a move, so apply the corresponding mod */
 977
 978         if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
 979                 nir_alu_src *s = nirmods[quirk_flipped_r24];
 980
 981                 if (instr->op == nir_op_fneg)
 982                         s->negate = !s->negate;
 983
 984                 if (instr->op == nir_op_fabs)
 985                         s->abs = !s->abs;
 986         }
 987
 988         bool is_int = midgard_is_integer_op(op);
 989
 990         midgard_vector_alu alu = {
 991                 .op = op,
 992                 .reg_mode = midgard_reg_mode_32,
 993                 .dest_override = midgard_dest_override_none,
 994                 .outmod = outmod,
 995
 996                 /* Writemask only valid for non-SSA NIR */
 997                 .mask = expand_writemask((1 << nr_components) - 1),
 998
 999                 .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int)),
1000                 .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int)),
1001         };
1002
1003         /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */
1004
1005         if (!is_ssa)
1006                 alu.mask &= expand_writemask(instr->dest.write_mask);
1007
1008         ins.alu = alu;
1009
1010         /* Late fixup for emulated instructions */
1011
1012         if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {
1013                 /* Presently, our second argument is an inline #0 constant.
1014                  * Switch over to an embedded 1.0 constant (that can't fit
1015                  * inline, since we're 32-bit, not 16-bit like the inline
1016                  * constants) */
1017
1018                 ins.ssa_args.inline_constant = false;
1019                 ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1020                 ins.has_constants = true;
1021
1022                 if (instr->op == nir_op_b2f32) {
1023                         ins.constants[0] = 1.0f;
1024                 } else {
1025                         /* Type pun it into place */
1026                         uint32_t one = 0x1;
1027                         memcpy(&ins.constants[0], &one, sizeof(uint32_t));
1028                 }
1029
1030                 ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
1031         } else if (instr->op == nir_op_f2b32 || instr->op == nir_op_i2b32) {
1032                 ins.ssa_args.inline_constant = false;
1033                 ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1034                 ins.has_constants = true;
1035                 ins.constants[0] = 0.0f;
1036                 ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
1037         } else if (instr->op == nir_op_inot) {
1038                 /* ~b = ~(b & b), so duplicate the source */
1039                 ins.ssa_args.src1 = ins.ssa_args.src0;
1040                 ins.alu.src2 = ins.alu.src1;
1041         }
1042
1043         if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
1044                 /* To avoid duplicating the lookup tables (probably), true LUT
1045                  * instructions can only operate as if they were scalars. Lower
1046                  * them here by changing the component. */
1047
1048                 uint8_t original_swizzle[4];
1049                 memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle));
1050
1051                 for (int i = 0; i < nr_components; ++i) {
1052                         ins.alu.mask = (0x3) << (2 * i); /* Mask the associated component */
1053
1054                         for (int j = 0; j < 4; ++j)
1055                                 nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */
1056
1057                         ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int));
1058                         emit_mir_instruction(ctx, ins);
1059                 }
1060         } else {
1061                 emit_mir_instruction(ctx, ins);
1062         }
1063 }
1064
1065 #undef ALU_CASE
1066
1067 static void
1068 emit_uniform_read(compiler_context *ctx, unsigned dest, unsigned offset, nir_src *indirect_offset)
1069 {
1070         /* TODO: half-floats */
1071
1072         if (!indirect_offset && offset < ctx->uniform_cutoff) {
1073                 /* Fast path: For the first 16 uniforms, direct accesses are
1074                  * 0-cycle, since they're just a register fetch in the usual
1075                  * case.  So, we alias the registers while we're still in
1076                  * SSA-space */
1077
1078                 int reg_slot = 23 - offset;
1079                 alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot));
1080         } else {
1081                 /* Otherwise, read from the 'special' UBO to access
1082                  * higher-indexed uniforms, at a performance cost. More
1083                  * generally, we're emitting a UBO read instruction. */
1084
1085                 midgard_instruction ins = m_ld_uniform_32(dest, offset);
1086
1087                 /* TODO: Don't split */
1088                 ins.load_store.varying_parameters = (offset & 7) << 7;
1089                 ins.load_store.address = offset >> 3;
1090
1091                 if (indirect_offset) {
1092                         emit_indirect_offset(ctx, indirect_offset);
1093                         ins.load_store.unknown = 0x8700; /* xxx: what is this? */
1094                 } else {
1095                         ins.load_store.unknown = 0x1E00; /* xxx: what is this? */
1096                 }
1097
1098                 emit_mir_instruction(ctx, ins);
1099         }
1100 }
1101
1102 static void
1103 emit_sysval_read(compiler_context *ctx, nir_intrinsic_instr *instr)
1104 {
1105         /* First, pull out the destination */
1106         unsigned dest = nir_dest_index(ctx, &instr->dest);
1107
1108         /* Now, figure out which uniform this is */
1109         int sysval = midgard_nir_sysval_for_intrinsic(instr);
1110         void *val = _mesa_hash_table_u64_search(ctx->sysval_to_id, sysval);
1111
1112         /* Sysvals are prefix uniforms */
1113         unsigned uniform = ((uintptr_t) val) - 1;
1114
1115         /* Emit the read itself -- this is never indirect */
1116         emit_uniform_read(ctx, dest, uniform, NULL);
1117 }
1118
1119 /* Reads RGBA8888 value from the tilebuffer and converts to a RGBA32F register,
1120  * using scalar ops functional on earlier Midgard generations. Newer Midgard
1121  * generations have faster vectorized reads. This operation is for blend
1122  * shaders in particular; reading the tilebuffer from the fragment shader
1123  * remains an open problem. */
1124
1125 static void
1126 emit_fb_read_blend_scalar(compiler_context *ctx, unsigned reg)
1127 {
1128         midgard_instruction ins = m_ld_color_buffer_8(reg, 0);
1129         ins.load_store.swizzle = 0; /* xxxx */
1130
1131         /* Read each component sequentially */
1132
1133         for (unsigned c = 0; c < 4; ++c) {
1134                 ins.load_store.mask = (1 << c);
1135                 ins.load_store.unknown = c;
1136                 emit_mir_instruction(ctx, ins);
1137         }
1138
1139         /* vadd.u2f hr2, zext(hr2), #0 */
1140
1141         midgard_vector_alu_src alu_src = blank_alu_src;
1142         alu_src.mod = midgard_int_zero_extend;
1143         alu_src.half = true;
1144
1145         midgard_instruction u2f = {
1146                 .type = TAG_ALU_4,
1147                 .ssa_args = {
1148                         .src0 = reg,
1149                         .src1 = SSA_UNUSED_0,
1150                         .dest = reg,
1151                         .inline_constant = true
1152                 },
1153                 .alu = {
1154                         .op = midgard_alu_op_u2f,
1155                         .reg_mode = midgard_reg_mode_16,
1156                         .dest_override = midgard_dest_override_none,
1157                         .mask = 0xF,
1158                         .src1 = vector_alu_srco_unsigned(alu_src),
1159                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
1160                 }
1161         };
1162
1163         emit_mir_instruction(ctx, u2f);
1164
1165         /* vmul.fmul.sat r1, hr2, #0.00392151 */
1166
1167         alu_src.mod = 0;
1168
1169         midgard_instruction fmul = {
1170                 .type = TAG_ALU_4,
1171                 .inline_constant = _mesa_float_to_half(1.0 / 255.0),
1172                 .ssa_args = {
1173                         .src0 = reg,
1174                         .dest = reg,
1175                         .src1 = SSA_UNUSED_0,
1176                         .inline_constant = true
1177                 },
1178                 .alu = {
1179                         .op = midgard_alu_op_fmul,
1180                         .reg_mode = midgard_reg_mode_32,
1181                         .dest_override = midgard_dest_override_none,
1182                         .outmod = midgard_outmod_sat,
1183                         .mask = 0xFF,
1184                         .src1 = vector_alu_srco_unsigned(alu_src),
1185                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
1186                 }
1187         };
1188
1189         emit_mir_instruction(ctx, fmul);
1190 }
1191
1192 static void
1193 emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
1194 {
1195         unsigned offset, reg;
1196
1197         switch (instr->intrinsic) {
1198         case nir_intrinsic_discard_if:
1199                 emit_condition(ctx, &instr->src[0], true, COMPONENT_X);
1200
1201         /* fallthrough */
1202
1203         case nir_intrinsic_discard: {
1204                 bool conditional = instr->intrinsic == nir_intrinsic_discard_if;
1205                 struct midgard_instruction discard = v_branch(conditional, false);
1206                 discard.branch.target_type = TARGET_DISCARD;
1207                 emit_mir_instruction(ctx, discard);
1208
1209                 ctx->can_discard = true;
1210                 break;
1211         }
1212
1213         case nir_intrinsic_load_uniform:
1214         case nir_intrinsic_load_input:
1215                 offset = nir_intrinsic_base(instr);
1216
1217                 bool direct = nir_src_is_const(instr->src[0]);
1218
1219                 if (direct) {
1220                         offset += nir_src_as_uint(instr->src[0]);
1221                 }
1222
1223                 reg = nir_dest_index(ctx, &instr->dest);
1224
1225                 if (instr->intrinsic == nir_intrinsic_load_uniform && !ctx->is_blend) {
1226                         emit_uniform_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL);
1227                 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) {
1228                         /* XXX: Half-floats? */
1229                         /* TODO: swizzle, mask */
1230
1231                         midgard_instruction ins = m_ld_vary_32(reg, offset);
1232
1233                         midgard_varying_parameter p = {
1234                                 .is_varying = 1,
1235                                 .interpolation = midgard_interp_default,
1236                                 .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0
1237                         };
1238
1239                         unsigned u;
1240                         memcpy(&u, &p, sizeof(p));
1241                         ins.load_store.varying_parameters = u;
1242
1243                         if (direct) {
1244                                 /* We have the offset totally ready */
1245                                 ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
1246                         } else {
1247                                 /* We have it partially ready, but we need to
1248                                  * add in the dynamic index, moved to r27.w */
1249                                 emit_indirect_offset(ctx, &instr->src[0]);
1250                                 ins.load_store.unknown = 0x79e; /* xxx: what is this? */
1251                         }
1252
1253                         emit_mir_instruction(ctx, ins);
1254                 } else if (ctx->is_blend) {
1255                         /* For blend shaders, load the input color, which is
1256                          * preloaded to r0 */
1257
1258                         midgard_instruction move = v_fmov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
1259                         emit_mir_instruction(ctx, move);
1260                 }  else if (ctx->stage == MESA_SHADER_VERTEX) {
1261                         midgard_instruction ins = m_ld_attr_32(reg, offset);
1262                         ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */
1263                         ins.load_store.mask = (1 << instr->num_components) - 1;
1264                         emit_mir_instruction(ctx, ins);
1265                 } else {
1266                         DBG("Unknown load\n");
1267                         assert(0);
1268                 }
1269
1270                 break;
1271
1272         case nir_intrinsic_load_output:
1273                 assert(nir_src_is_const(instr->src[0]));
1274                 reg = nir_dest_index(ctx, &instr->dest);
1275
1276                 if (ctx->is_blend) {
1277                         /* TODO: MRT */
1278                         emit_fb_read_blend_scalar(ctx, reg);
1279                 } else {
1280                         DBG("Unknown output load\n");
1281                         assert(0);
1282                 }
1283
1284                 break;
1285
1286         case nir_intrinsic_load_blend_const_color_rgba: {
1287                 assert(ctx->is_blend);
1288                 reg = nir_dest_index(ctx, &instr->dest);
1289
1290                 /* Blend constants are embedded directly in the shader and
1291                  * patched in, so we use some magic routing */
1292
1293                 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg);
1294                 ins.has_constants = true;
1295                 ins.has_blend_constant = true;
1296                 emit_mir_instruction(ctx, ins);
1297                 break;
1298         }
1299
1300         case nir_intrinsic_store_output:
1301                 assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
1302
1303                 offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);
1304
1305                 reg = nir_src_index(ctx, &instr->src[0]);
1306
1307                 if (ctx->stage == MESA_SHADER_FRAGMENT) {
1308                         /* gl_FragColor is not emitted with load/store
1309                          * instructions. Instead, it gets plonked into
1310                          * r0 at the end of the shader and we do the
1311                          * framebuffer writeout dance. TODO: Defer
1312                          * writes */
1313
1314                         midgard_instruction move = v_fmov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
1315                         emit_mir_instruction(ctx, move);
1316
1317                         /* Save the index we're writing to for later reference
1318                          * in the epilogue */
1319
1320                         ctx->fragment_output = reg;
1321                 } else if (ctx->stage == MESA_SHADER_VERTEX) {
1322                         /* Varyings are written into one of two special
1323                          * varying register, r26 or r27. The register itself is selected as the register
1324                          * in the st_vary instruction, minus the base of 26. E.g. write into r27 and then call st_vary(1)
1325                          *
1326                          * Normally emitting fmov's is frowned upon,
1327                          * but due to unique constraints of
1328                          * REGISTER_VARYING, fmov emission + a
1329                          * dedicated cleanup pass is the only way to
1330                          * guarantee correctness when considering some
1331                          * (common) edge cases XXX: FIXME */
1332
1333                         /* If this varying corresponds to a constant (why?!),
1334                          * emit that now since it won't get picked up by
1335                          * hoisting (since there is no corresponding move
1336                          * emitted otherwise) */
1337
1338                         void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, reg + 1);
1339
1340                         if (constant_value) {
1341                                 /* Special case: emit the varying write
1342                                  * directly to r26 (looks funny in asm but it's
1343                                  * fine) and emit the store _now_. Possibly
1344                                  * slightly slower, but this is a really stupid
1345                                  * special case anyway (why on earth would you
1346                                  * have a constant varying? Your own fault for
1347                                  * slightly worse perf :P) */
1348
1349                                 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(26));
1350                                 attach_constants(ctx, &ins, constant_value, reg + 1);
1351                                 emit_mir_instruction(ctx, ins);
1352
1353                                 midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset);
1354                                 st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
1355                                 emit_mir_instruction(ctx, st);
1356                         } else {
1357                                 /* Do not emit the varying yet -- instead, just mark down that we need to later */
1358
1359                                 _mesa_hash_table_u64_insert(ctx->ssa_varyings, reg + 1, (void *) ((uintptr_t) (offset + 1)));
1360                         }
1361                 } else {
1362                         DBG("Unknown store\n");
1363                         assert(0);
1364                 }
1365
1366                 break;
1367
1368         case nir_intrinsic_load_alpha_ref_float:
1369                 assert(instr->dest.is_ssa);
1370
1371                 float ref_value = ctx->alpha_ref;
1372
1373                 float *v = ralloc_array(NULL, float, 4);
1374                 memcpy(v, &ref_value, sizeof(float));
1375                 _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v);
1376                 break;
1377
1378         case nir_intrinsic_load_viewport_scale:
1379         case nir_intrinsic_load_viewport_offset:
1380                 emit_sysval_read(ctx, instr);
1381                 break;
1382
1383         default:
1384                 printf ("Unhandled intrinsic\n");
1385                 assert(0);
1386                 break;
1387         }
1388 }
1389
1390 static unsigned
1391 midgard_tex_format(enum glsl_sampler_dim dim)
1392 {
1393         switch (dim) {
1394         case GLSL_SAMPLER_DIM_2D:
1395         case GLSL_SAMPLER_DIM_EXTERNAL:
1396                 return TEXTURE_2D;
1397
1398         case GLSL_SAMPLER_DIM_3D:
1399                 return TEXTURE_3D;
1400
1401         case GLSL_SAMPLER_DIM_CUBE:
1402                 return TEXTURE_CUBE;
1403
1404         default:
1405                 DBG("Unknown sampler dim type\n");
1406                 assert(0);
1407                 return 0;
1408         }
1409 }
1410
1411 static void
1412 emit_tex(compiler_context *ctx, nir_tex_instr *instr)
1413 {
1414         /* TODO */
1415         //assert (!instr->sampler);
1416         //assert (!instr->texture_array_size);
1417         assert (instr->op == nir_texop_tex);
1418
1419         /* Allocate registers via a round robin scheme to alternate between the two registers */
1420         int reg = ctx->texture_op_count & 1;
1421         int in_reg = reg, out_reg = reg;
1422
1423         /* Make room for the reg */
1424
1425         if (ctx->texture_index[reg] > -1)
1426                 unalias_ssa(ctx, ctx->texture_index[reg]);
1427
1428         int texture_index = instr->texture_index;
1429         int sampler_index = texture_index;
1430
1431         for (unsigned i = 0; i < instr->num_srcs; ++i) {
1432                 switch (instr->src[i].src_type) {
1433                 case nir_tex_src_coord: {
1434                         int index = nir_src_index(ctx, &instr->src[i].src);
1435
1436                         midgard_vector_alu_src alu_src = blank_alu_src;
1437
1438                         int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg);
1439
1440                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1441                                 /* For cubemaps, we need to load coords into
1442                                  * special r27, and then use a special ld/st op
1443                                  * to copy into the texture register */
1444
1445                                 alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X);
1446
1447                                 midgard_instruction move = v_fmov(index, alu_src, SSA_FIXED_REGISTER(27));
1448                                 emit_mir_instruction(ctx, move);
1449
1450                                 midgard_instruction st = m_st_cubemap_coords(reg, 0);
1451                                 st.load_store.unknown = 0x24; /* XXX: What is this? */
1452                                 st.load_store.mask = 0x3; /* xy? */
1453                                 st.load_store.swizzle = alu_src.swizzle;
1454                                 emit_mir_instruction(ctx, st);
1455
1456                         } else {
1457                                 alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X);
1458
1459                                 midgard_instruction ins = v_fmov(index, alu_src, reg);
1460                                 emit_mir_instruction(ctx, ins);
1461                         }
1462
1463                         break;
1464                 }
1465
1466                 default: {
1467                         DBG("Unknown source type\n");
1468                         //assert(0);
1469                         break;
1470                 }
1471                 }
1472         }
1473
1474         /* No helper to build texture words -- we do it all here */
1475         midgard_instruction ins = {
1476                 .type = TAG_TEXTURE_4,
1477                 .texture = {
1478                         .op = TEXTURE_OP_NORMAL,
1479                         .format = midgard_tex_format(instr->sampler_dim),
1480                         .texture_handle = texture_index,
1481                         .sampler_handle = sampler_index,
1482
1483                         /* TODO: Don't force xyzw */
1484                         .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
1485                         .mask = 0xF,
1486
1487                         /* TODO: half */
1488                         //.in_reg_full = 1,
1489                         .out_full = 1,
1490
1491                         .filter = 1,
1492
1493                         /* Always 1 */
1494                         .unknown7 = 1,
1495
1496                         /* Assume we can continue; hint it out later */
1497                         .cont = 1,
1498                 }
1499         };
1500
1501         /* Set registers to read and write from the same place */
1502         ins.texture.in_reg_select = in_reg;
1503         ins.texture.out_reg_select = out_reg;
1504
1505         /* TODO: Dynamic swizzle input selection, half-swizzles? */
1506         if (instr->sampler_dim == GLSL_SAMPLER_DIM_3D) {
1507                 ins.texture.in_reg_swizzle_right = COMPONENT_X;
1508                 ins.texture.in_reg_swizzle_left = COMPONENT_Y;
1509                 //ins.texture.in_reg_swizzle_third = COMPONENT_Z;
1510         } else {
1511                 ins.texture.in_reg_swizzle_left = COMPONENT_X;
1512                 ins.texture.in_reg_swizzle_right = COMPONENT_Y;
1513                 //ins.texture.in_reg_swizzle_third = COMPONENT_X;
1514         }
1515
1516         emit_mir_instruction(ctx, ins);
1517
1518         /* Simultaneously alias the destination and emit a move for it. The move will be eliminated if possible */
1519
1520         int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest);
1521         alias_ssa(ctx, o_index, SSA_FIXED_REGISTER(o_reg));
1522         ctx->texture_index[reg] = o_index;
1523
1524         midgard_instruction ins2 = v_fmov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index);
1525         emit_mir_instruction(ctx, ins2);
1526
1527         /* Used for .cont and .last hinting */
1528         ctx->texture_op_count++;
1529 }
1530
1531 static void
1532 emit_jump(compiler_context *ctx, nir_jump_instr *instr)
1533 {
1534         switch (instr->type) {
1535                 case nir_jump_break: {
1536                         /* Emit a branch out of the loop */
1537                         struct midgard_instruction br = v_branch(false, false);
1538                         br.branch.target_type = TARGET_BREAK;
1539                         br.branch.target_break = ctx->current_loop_depth;
1540                         emit_mir_instruction(ctx, br);
1541
1542                         DBG("break..\n");
1543                         break;
1544                 }
1545
1546                 default:
1547                         DBG("Unknown jump type %d\n", instr->type);
1548                         break;
1549         }
1550 }
1551
1552 static void
1553 emit_instr(compiler_context *ctx, struct nir_instr *instr)
1554 {
1555         switch (instr->type) {
1556         case nir_instr_type_load_const:
1557                 emit_load_const(ctx, nir_instr_as_load_const(instr));
1558                 break;
1559
1560         case nir_instr_type_intrinsic:
1561                 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
1562                 break;
1563
1564         case nir_instr_type_alu:
1565                 emit_alu(ctx, nir_instr_as_alu(instr));
1566                 break;
1567
1568         case nir_instr_type_tex:
1569                 emit_tex(ctx, nir_instr_as_tex(instr));
1570                 break;
1571
1572         case nir_instr_type_jump:
1573                 emit_jump(ctx, nir_instr_as_jump(instr));
1574                 break;
1575
1576         case nir_instr_type_ssa_undef:
1577                 /* Spurious */
1578                 break;
1579
1580         default:
1581                 DBG("Unhandled instruction type\n");
1582                 break;
1583         }
1584 }
1585
/* Midgard IR only knows vector ALU types, but we sometimes need to actually
 * use scalar ALU instructions, for functional or performance reasons. To do
 * this, we just demote vector ALU payloads to scalar. */

/* Returns the index (0-3) of the first component enabled in `mask`, where
 * each component owns two adjacent bits. An empty mask trips an assertion
 * (and yields 0 when assertions are compiled out). */

static int
component_from_mask(unsigned mask)
{
        int c = 0;

        /* Walk the 2-bit fields from the bottom until one has a bit set */
        while (c < 4 && !(mask & (3 << (2 * c))))
                c++;

        if (c < 4)
                return c;

        assert(0);
        return 0;
}
1601
/* True iff exactly one of the four 2-bit component fields of `mask` has any
 * bit set. Bits above the low eight are ignored. */

static bool
is_single_component_mask(unsigned mask)
{
        /* Fold each 2-bit field down to a single flag in its low bit */
        unsigned flags = (mask | (mask >> 1)) & 0x55;

        /* Exactly one flag set <=> non-zero power of two */
        return flags != 0 && (flags & (flags - 1)) == 0;
}
1613
/* Build a 4-bit mask of the components a swizzle reads, used to work out
 * vector dependencies. The swizzle packs four 2-bit selectors; bit N of the
 * result is set when any selector equals N. */

static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned accessed = 0;
        unsigned remaining = swizzle;

        for (int lane = 0; lane < 4; ++lane, remaining >>= 2)
                accessed |= 1u << (remaining & 3);

        return accessed;
}
1629
1630 static unsigned
1631 vector_to_scalar_source(unsigned u, bool is_int)
1632 {
1633         midgard_vector_alu_src v;
1634         memcpy(&v, &u, sizeof(v));
1635
1636         /* TODO: Integers */
1637
1638         midgard_scalar_alu_src s = {
1639                 .full = !v.half,
1640                 .component = (v.swizzle & 3) << 1
1641         };
1642
1643         if (is_int) {
1644                 /* TODO */
1645         } else {
1646                 s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS;
1647                 s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG;
1648         }
1649
1650         unsigned o;
1651         memcpy(&o, &s, sizeof(s));
1652
1653         return o & ((1 << 6) - 1);
1654 }
1655
1656 static midgard_scalar_alu
1657 vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
1658 {
1659         bool is_int = midgard_is_integer_op(v.op);
1660
1661         /* The output component is from the mask */
1662         midgard_scalar_alu s = {
1663                 .op = v.op,
1664                 .src1 = vector_to_scalar_source(v.src1, is_int),
1665                 .src2 = vector_to_scalar_source(v.src2, is_int),
1666                 .unknown = 0,
1667                 .outmod = v.outmod,
1668                 .output_full = 1, /* TODO: Half */
1669                 .output_component = component_from_mask(v.mask) << 1,
1670         };
1671
1672         /* Inline constant is passed along rather than trying to extract it
1673          * from v */
1674
1675         if (ins->ssa_args.inline_constant) {
1676                 uint16_t imm = 0;
1677                 int lower_11 = ins->inline_constant & ((1 << 12) - 1);
1678                 imm |= (lower_11 >> 9) & 3;
1679                 imm |= (lower_11 >> 6) & 4;
1680                 imm |= (lower_11 >> 2) & 0x38;
1681                 imm |= (lower_11 & 63) << 6;
1682
1683                 s.src2 = imm;
1684         }
1685
1686         return s;
1687 }
1688
/* Midgard prefetches instruction types, so during emission we need to
 * lookahead too. Unless this is the last instruction, in which case we return
 * 1. Or if this is the second to last and the last is an ALU, then it's also
 * 1... */

/* True when `tag` is any of the four ALU bundle tags. The argument is
 * evaluated more than once, so it must be free of side effects. */
#define IS_ALU(tag) ((tag) == TAG_ALU_4 || (tag) == TAG_ALU_8 ||  \
                     (tag) == TAG_ALU_12 || (tag) == TAG_ALU_16)

/* Append `val` (of type `type`) to the dynarray named `emission` and add its
 * size to the counter named `bytes_emitted`; both names must be in scope at
 * the point of use. Wrapped in do/while so it acts as a single statement. */
#define EMIT_AND_COUNT(type, val) do { \
                util_dynarray_append(emission, type, val); \
                bytes_emitted += sizeof(type); \
        } while (0)
1698
1699 static void
1700 emit_binary_vector_instruction(midgard_instruction *ains,
1701                                uint16_t *register_words, int *register_words_count,
1702                                uint64_t *body_words, size_t *body_size, int *body_words_count,
1703                                size_t *bytes_emitted)
1704 {
1705         memcpy(&register_words[(*register_words_count)++], &ains->registers, sizeof(ains->registers));
1706         *bytes_emitted += sizeof(midgard_reg_info);
1707
1708         body_size[*body_words_count] = sizeof(midgard_vector_alu);
1709         memcpy(&body_words[(*body_words_count)++], &ains->alu, sizeof(ains->alu));
1710         *bytes_emitted += sizeof(midgard_vector_alu);
1711 }
1712
1713 /* Checks for an SSA data hazard between two adjacent instructions, keeping in
1714  * mind that we are a vector architecture and we can write to different
1715  * components simultaneously */
1716
1717 static bool
1718 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
1719 {
1720         /* Each instruction reads some registers and writes to a register. See
1721          * where the first writes */
1722
1723         /* Figure out where exactly we wrote to */
1724         int source = first->ssa_args.dest;
1725         int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;
1726
1727         /* As long as the second doesn't read from the first, we're okay */
1728         if (second->ssa_args.src0 == source) {
1729                 if (first->type == TAG_ALU_4) {
1730                         /* Figure out which components we just read from */
1731
1732                         int q = second->alu.src1;
1733                         midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
1734
1735                         /* Check if there are components in common, and fail if so */
1736                         if (swizzle_to_access_mask(m->swizzle) & source_mask)
1737                                 return false;
1738                 } else
1739                         return false;
1740
1741         }
1742
1743         if (second->ssa_args.src1 == source)
1744                 return false;
1745
1746         /* Otherwise, it's safe in that regard. Another data hazard is both
1747          * writing to the same place, of course */
1748
1749         if (second->ssa_args.dest == source) {
1750                 /* ...but only if the components overlap */
1751                 int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;
1752
1753                 if (dest_mask & source_mask)
1754                         return false;
1755         }
1756
1757         /* ...That's it */
1758         return true;
1759 }
1760
1761 static bool
1762 midgard_has_hazard(
1763                 midgard_instruction **segment, unsigned segment_size,
1764                 midgard_instruction *ains)
1765 {
1766         for (int s = 0; s < segment_size; ++s)
1767                 if (!can_run_concurrent_ssa(segment[s], ains))
1768                         return true;
1769
1770         return false;
1771
1772
1773 }
1774
1775 /* Schedules, but does not emit, a single basic block. After scheduling, the
1776  * final tag and size of the block are known, which are necessary for branching
1777  * */
1778
1779 static midgard_bundle
1780 schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
1781 {
1782         int instructions_emitted = 0, instructions_consumed = -1;
1783         midgard_bundle bundle = { 0 };
1784
1785         uint8_t tag = ins->type;
1786
1787         /* Default to the instruction's tag */
1788         bundle.tag = tag;
1789
1790         switch (ins->type) {
1791         case TAG_ALU_4: {
1792                 uint32_t control = 0;
1793                 size_t bytes_emitted = sizeof(control);
1794
1795                 /* TODO: Constant combining */
1796                 int index = 0, last_unit = 0;
1797
1798                 /* Previous instructions, for the purpose of parallelism */
1799                 midgard_instruction *segment[4] = {0};
1800                 int segment_size = 0;
1801
1802                 instructions_emitted = -1;
1803                 midgard_instruction *pins = ins;
1804
1805                 for (;;) {
1806                         midgard_instruction *ains = pins;
1807
1808                         /* Advance instruction pointer */
1809                         if (index) {
1810                                 ains = mir_next_op(pins);
1811                                 pins = ains;
1812                         }
1813
1814                         /* Out-of-work condition */
1815                         if ((struct list_head *) ains == &block->instructions)
1816                                 break;
1817
1818                         /* Ensure that the chain can continue */
1819                         if (ains->type != TAG_ALU_4) break;
1820
1821                         /* If there's already something in the bundle and we
1822                          * have weird scheduler constraints, break now */
1823                         if (ains->precede_break && index) break;
1824
1825                         /* According to the presentation "The ARM
1826                          * Mali-T880 Mobile GPU" from HotChips 27,
1827                          * there are two pipeline stages. Branching
1828                          * position determined experimentally. Lines
1829                          * are executed in parallel:
1830                          *
1831                          * [ VMUL ] [ SADD ]
1832                          * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
1833                          *
1834                          * Verify that there are no ordering dependencies here.
1835                          *
1836                          * TODO: Allow for parallelism!!!
1837                          */
1838
1839                         /* Pick a unit for it if it doesn't force a particular unit */
1840
1841                         int unit = ains->unit;
1842
1843                         if (!unit) {
1844                                 int op = ains->alu.op;
1845                                 int units = alu_opcode_props[op].props;
1846
1847                                 /* TODO: Promotion of scalars to vectors */
1848                                 int vector = ((!is_single_component_mask(ains->alu.mask)) || ((units & UNITS_SCALAR) == 0)) && (units & UNITS_ANY_VECTOR);
1849
1850                                 if (!vector)
1851                                         assert(units & UNITS_SCALAR);
1852
1853                                 if (vector) {
1854                                         if (last_unit >= UNIT_VADD) {
1855                                                 if (units & UNIT_VLUT)
1856                                                         unit = UNIT_VLUT;
1857                                                 else
1858                                                         break;
1859                                         } else {
1860                                                 if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
1861                                                         unit = UNIT_VMUL;
1862                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
1863                                                         unit = UNIT_VADD;
1864                                                 else if (units & UNIT_VLUT)
1865                                                         unit = UNIT_VLUT;
1866                                                 else
1867                                                         break;
1868                                         }
1869                                 } else {
1870                                         if (last_unit >= UNIT_VADD) {
1871                                                 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
1872                                                         unit = UNIT_SMUL;
1873                                                 else if (units & UNIT_VLUT)
1874                                                         unit = UNIT_VLUT;
1875                                                 else
1876                                                         break;
1877                                         } else {
1878                                                 if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
1879                                                         unit = UNIT_SADD;
1880                                                 else if (units & UNIT_SMUL)
1881                                                         unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
1882                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
1883                                                         unit = UNIT_VADD;
1884                                                 else
1885                                                         break;
1886                                         }
1887                                 }
1888
1889                                 assert(unit & units);
1890                         }
1891
1892                         /* Late unit check, this time for encoding (not parallelism) */
1893                         if (unit <= last_unit) break;
1894
1895                         /* Clear the segment */
1896                         if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
1897                                 segment_size = 0;
1898
1899                         if (midgard_has_hazard(segment, segment_size, ains))
1900                                 break;
1901
1902                         /* We're good to go -- emit the instruction */
1903                         ains->unit = unit;
1904
1905                         segment[segment_size++] = ains;
1906
1907                         /* Only one set of embedded constants per
1908                          * bundle possible; if we have more, we must
1909                          * break the chain early, unfortunately */
1910
1911                         if (ains->has_constants) {
1912                                 if (bundle.has_embedded_constants) {
1913                                         /* The blend constant needs to be
1914                                          * alone, since it conflicts with
1915                                          * everything by definition*/
1916
1917                                         if (ains->has_blend_constant || bundle.has_blend_constant)
1918                                                 break;
1919
1920                                         /* ...but if there are already
1921                                          * constants but these are the
1922                                          * *same* constants, we let it
1923                                          * through */
1924
1925                                         if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants)))
1926                                                 break;
1927                                 } else {
1928                                         bundle.has_embedded_constants = true;
1929                                         memcpy(bundle.constants, ains->constants, sizeof(bundle.constants));
1930
1931                                         /* If this is a blend shader special constant, track it for patching */
1932                                         bundle.has_blend_constant |= ains->has_blend_constant;
1933                                 }
1934                         }
1935
1936                         if (ains->unit & UNITS_ANY_VECTOR) {
1937                                 emit_binary_vector_instruction(ains, bundle.register_words,
1938                                                                &bundle.register_words_count, bundle.body_words,
1939                                                                bundle.body_size, &bundle.body_words_count, &bytes_emitted);
1940                         } else if (ains->compact_branch) {
1941                                 /* All of r0 has to be written out
1942                                  * along with the branch writeout.
1943                                  * (slow!) */
1944
1945                                 if (ains->writeout) {
1946                                         if (index == 0) {
1947                                                 midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
1948                                                 ins.unit = UNIT_VMUL;
1949
1950                                                 control |= ins.unit;
1951
1952                                                 emit_binary_vector_instruction(&ins, bundle.register_words,
1953                                                                                &bundle.register_words_count, bundle.body_words,
1954                                                                                bundle.body_size, &bundle.body_words_count, &bytes_emitted);
1955                                         } else {
1956                                                 /* Analyse the group to see if r0 is written in full, on-time, without hanging dependencies*/
1957                                                 bool written_late = false;
1958                                                 bool components[4] = { 0 };
1959                                                 uint16_t register_dep_mask = 0;
1960                                                 uint16_t written_mask = 0;
1961
1962                                                 midgard_instruction *qins = ins;
1963                                                 for (int t = 0; t < index; ++t) {
1964                                                         if (qins->registers.out_reg != 0) {
1965                                                                 /* Mark down writes */
1966
1967                                                                 written_mask |= (1 << qins->registers.out_reg);
1968                                                         } else {
1969                                                                 /* Mark down the register dependencies for errata check */
1970
1971                                                                 if (qins->registers.src1_reg < 16)
1972                                                                         register_dep_mask |= (1 << qins->registers.src1_reg);
1973
1974                                                                 if (qins->registers.src2_reg < 16)
1975                                                                         register_dep_mask |= (1 << qins->registers.src2_reg);
1976
1977                                                                 int mask = qins->alu.mask;
1978
1979                                                                 for (int c = 0; c < 4; ++c)
1980                                                                         if (mask & (0x3 << (2 * c)))
1981                                                                                 components[c] = true;
1982
1983                                                                 /* ..but if the writeout is too late, we have to break up anyway... for some reason */
1984
1985                                                                 if (qins->unit == UNIT_VLUT)
1986                                                                         written_late = true;
1987                                                         }
1988
1989                                                         /* Advance instruction pointer */
1990                                                         qins = mir_next_op(qins);
1991                                                 }
1992
1993
1994                                                 /* ERRATA (?): In a bundle ending in a fragment writeout, the register dependencies of r0 cannot be written within this bundle (discovered in -bshading:shading=phong) */
1995                                                 if (register_dep_mask & written_mask) {
1996                                                         DBG("ERRATA WORKAROUND: Breakup for writeout dependency masks %X vs %X (common %X)\n", register_dep_mask, written_mask, register_dep_mask & written_mask);
1997                                                         break;
1998                                                 }
1999
2000                                                 if (written_late)
2001                                                         break;
2002
2003                                                 /* If even a single component is not written, break it up (conservative check). */
2004                                                 bool breakup = false;
2005
2006                                                 for (int c = 0; c < 4; ++c)
2007                                                         if (!components[c])
2008                                                                 breakup = true;
2009
2010                                                 if (breakup)
2011                                                         break;
2012
2013                                                 /* Otherwise, we're free to proceed */
2014                                         }
2015                                 }
2016
2017                                 if (ains->unit == ALU_ENAB_BRANCH) {
2018                                         bundle.body_size[bundle.body_words_count] = sizeof(midgard_branch_extended);
2019                                         memcpy(&bundle.body_words[bundle.body_words_count++], &ains->branch_extended, sizeof(midgard_branch_extended));
2020                                         bytes_emitted += sizeof(midgard_branch_extended);
2021                                 } else {
2022                                         bundle.body_size[bundle.body_words_count] = sizeof(ains->br_compact);
2023                                         memcpy(&bundle.body_words[bundle.body_words_count++], &ains->br_compact, sizeof(ains->br_compact));
2024                                         bytes_emitted += sizeof(ains->br_compact);
2025                                 }
2026                         } else {
2027                                 memcpy(&bundle.register_words[bundle.register_words_count++], &ains->registers, sizeof(ains->registers));
2028                                 bytes_emitted += sizeof(midgard_reg_info);
2029
2030                                 bundle.body_size[bundle.body_words_count] = sizeof(midgard_scalar_alu);
2031                                 bundle.body_words_count++;
2032                                 bytes_emitted += sizeof(midgard_scalar_alu);
2033                         }
2034
2035                         /* Defer marking until after writing to allow for break */
2036                         control |= ains->unit;
2037                         last_unit = ains->unit;
2038                         ++instructions_emitted;
2039                         ++index;
2040                 }
2041
2042                 /* Bubble up the number of instructions for skipping */
2043                 instructions_consumed = index - 1;
2044
2045                 int padding = 0;
2046
2047                 /* Pad ALU op to nearest word */
2048
2049                 if (bytes_emitted & 15) {
2050                         padding = 16 - (bytes_emitted & 15);
2051                         bytes_emitted += padding;
2052                 }
2053
2054                 /* Constants must always be quadwords */
2055                 if (bundle.has_embedded_constants)
2056                         bytes_emitted += 16;
2057
2058                 /* Size ALU instruction for tag */
2059                 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
2060                 bundle.padding = padding;
2061                 bundle.control = bundle.tag | control;
2062
2063                 break;
2064         }
2065
2066         case TAG_LOAD_STORE_4: {
2067                 /* Load store instructions have two words at once. If
2068                  * we only have one queued up, we need to NOP pad.
2069                  * Otherwise, we store both in succession to save space
2070                  * and cycles -- letting them go in parallel -- skip
2071                  * the next. The usefulness of this optimisation is
2072                  * greatly dependent on the quality of the instruction
2073                  * scheduler.
2074                  */
2075
2076                 midgard_instruction *next_op = mir_next_op(ins);
2077
2078                 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
2079                         /* As the two operate concurrently, make sure
2080                          * they are not dependent */
2081
2082                         if (can_run_concurrent_ssa(ins, next_op) || true) {
2083                                 /* Skip ahead, since it's redundant with the pair */
2084                                 instructions_consumed = 1 + (instructions_emitted++);
2085                         }
2086                 }
2087
2088                 break;
2089         }
2090
2091         default:
2092                 /* Texture ops default to single-op-per-bundle scheduling */
2093                 break;
2094         }
2095
2096         /* Copy the instructions into the bundle */
2097         bundle.instruction_count = instructions_emitted + 1;
2098
2099         int used_idx = 0;
2100
2101         midgard_instruction *uins = ins;
2102         for (int i = 0; used_idx < bundle.instruction_count; ++i) {
2103                 bundle.instructions[used_idx++] = *uins;
2104                 uins = mir_next_op(uins);
2105         }
2106
2107         *skip = (instructions_consumed == -1) ? instructions_emitted : instructions_consumed;
2108
2109         return bundle;
2110 }
2111
2112 static int
2113 quadword_size(int tag)
2114 {
2115         switch (tag) {
2116         case TAG_ALU_4:
2117                 return 1;
2118
2119         case TAG_ALU_8:
2120                 return 2;
2121
2122         case TAG_ALU_12:
2123                 return 3;
2124
2125         case TAG_ALU_16:
2126                 return 4;
2127
2128         case TAG_LOAD_STORE_4:
2129                 return 1;
2130
2131         case TAG_TEXTURE_4:
2132                 return 1;
2133
2134         default:
2135                 assert(0);
2136                 return 0;
2137         }
2138 }
2139
2140 /* Schedule a single block by iterating its instruction to create bundles.
2141  * While we go, tally about the bundle sizes to compute the block size. */
2142
2143 static void
2144 schedule_block(compiler_context *ctx, midgard_block *block)
2145 {
2146         util_dynarray_init(&block->bundles, NULL);
2147
2148         block->quadword_count = 0;
2149
2150         mir_foreach_instr_in_block(block, ins) {
2151                 int skip;
2152                 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
2153                 util_dynarray_append(&block->bundles, midgard_bundle, bundle);
2154
2155                 if (bundle.has_blend_constant) {
2156                         /* TODO: Multiblock? */
2157                         int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
2158                         ctx->blend_constant_offset = quadwords_within_block * 0x10;
2159                 }
2160
2161                 while(skip--)
2162                         ins = mir_next_op(ins);
2163
2164                 block->quadword_count += quadword_size(bundle.tag);
2165         }
2166
2167         block->is_scheduled = true;
2168 }
2169
2170 static void
2171 schedule_program(compiler_context *ctx)
2172 {
2173         /* We run RA prior to scheduling */
2174         struct ra_graph *g = allocate_registers(ctx);
2175         install_registers(ctx, g);
2176
2177         mir_foreach_block(ctx, block) {
2178                 schedule_block(ctx, block);
2179         }
2180 }
2181
2182 /* After everything is scheduled, emit whole bundles at a time */
2183
2184 static void
2185 emit_binary_bundle(compiler_context *ctx, midgard_bundle *bundle, struct util_dynarray *emission, int next_tag)
2186 {
2187         int lookahead = next_tag << 4;
2188
2189         switch (bundle->tag) {
2190         case TAG_ALU_4:
2191         case TAG_ALU_8:
2192         case TAG_ALU_12:
2193         case TAG_ALU_16: {
2194                 /* Actually emit each component */
2195                 util_dynarray_append(emission, uint32_t, bundle->control | lookahead);
2196
2197                 for (int i = 0; i < bundle->register_words_count; ++i)
2198                         util_dynarray_append(emission, uint16_t, bundle->register_words[i]);
2199
2200                 /* Emit body words based on the instructions bundled */
2201                 for (int i = 0; i < bundle->instruction_count; ++i) {
2202                         midgard_instruction *ins = &bundle->instructions[i];
2203
2204                         if (ins->unit & UNITS_ANY_VECTOR) {
2205                                 memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins->alu, sizeof(midgard_vector_alu));
2206                         } else if (ins->compact_branch) {
2207                                 /* Dummy move, XXX DRY */
2208                                 if ((i == 0) && ins->writeout) {
2209                                         midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
2210                                         memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins.alu, sizeof(midgard_vector_alu));
2211                                 }
2212
2213                                 if (ins->unit == ALU_ENAB_BR_COMPACT) {
2214                                         memcpy(util_dynarray_grow(emission, sizeof(ins->br_compact)), &ins->br_compact, sizeof(ins->br_compact));
2215                                 } else {
2216                                         memcpy(util_dynarray_grow(emission, sizeof(ins->branch_extended)), &ins->branch_extended, sizeof(ins->branch_extended));
2217                                 }
2218                         } else {
2219                                 /* Scalar */
2220                                 midgard_scalar_alu scalarised = vector_to_scalar_alu(ins->alu, ins);
2221                                 memcpy(util_dynarray_grow(emission, sizeof(scalarised)), &scalarised, sizeof(scalarised));
2222                         }
2223                 }
2224
2225                 /* Emit padding (all zero) */
2226                 memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding);
2227
2228                 /* Tack on constants */
2229
2230                 if (bundle->has_embedded_constants) {
2231                         util_dynarray_append(emission, float, bundle->constants[0]);
2232                         util_dynarray_append(emission, float, bundle->constants[1]);
2233                         util_dynarray_append(emission, float, bundle->constants[2]);
2234                         util_dynarray_append(emission, float, bundle->constants[3]);
2235                 }
2236
2237                 break;
2238         }
2239
2240         case TAG_LOAD_STORE_4: {
2241                 /* One or two composing instructions */
2242
2243                 uint64_t current64, next64 = LDST_NOP;
2244
2245                 memcpy(&current64, &bundle->instructions[0].load_store, sizeof(current64));
2246
2247                 if (bundle->instruction_count == 2)
2248                         memcpy(&next64, &bundle->instructions[1].load_store, sizeof(next64));
2249
2250                 midgard_load_store instruction = {
2251                         .type = bundle->tag,
2252                         .next_type = next_tag,
2253                         .word1 = current64,
2254                         .word2 = next64
2255                 };
2256
2257                 util_dynarray_append(emission, midgard_load_store, instruction);
2258
2259                 break;
2260         }
2261
2262         case TAG_TEXTURE_4: {
2263                 /* Texture instructions are easy, since there is no
2264                  * pipelining nor VLIW to worry about. We may need to set the .last flag */
2265
2266                 midgard_instruction *ins = &bundle->instructions[0];
2267
2268                 ins->texture.type = TAG_TEXTURE_4;
2269                 ins->texture.next_type = next_tag;
2270
2271                 ctx->texture_op_count--;
2272
2273                 if (!ctx->texture_op_count) {
2274                         ins->texture.cont = 0;
2275                         ins->texture.last = 1;
2276                 }
2277
2278                 util_dynarray_append(emission, midgard_texture_word, ins->texture);
2279                 break;
2280         }
2281
2282         default:
2283                 DBG("Unknown midgard instruction type\n");
2284                 assert(0);
2285                 break;
2286         }
2287 }
2288
2289
2290 /* ALU instructions can inline or embed constants, which decreases register
2291  * pressure and saves space. */
2292
2293 #define CONDITIONAL_ATTACH(src) { \
2294         void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \
2295 \
2296         if (entry) { \
2297                 attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \
2298                 alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
2299         } \
2300 }
2301
2302 static void
2303 inline_alu_constants(compiler_context *ctx)
2304 {
2305         mir_foreach_instr(ctx, alu) {
2306                 /* Other instructions cannot inline constants */
2307                 if (alu->type != TAG_ALU_4) continue;
2308
2309                 /* If there is already a constant here, we can do nothing */
2310                 if (alu->has_constants) continue;
2311
2312                 /* It makes no sense to inline constants on a branch */
2313                 if (alu->compact_branch || alu->prepacked_branch) continue;
2314
2315                 CONDITIONAL_ATTACH(src0);
2316
2317                 if (!alu->has_constants) {
2318                         CONDITIONAL_ATTACH(src1)
2319                 } else if (!alu->inline_constant) {
2320                         /* Corner case: _two_ vec4 constants, for instance with a
2321                          * csel. For this case, we can only use a constant
2322                          * register for one, we'll have to emit a move for the
2323                          * other. Note, if both arguments are constants, then
2324                          * necessarily neither argument depends on the value of
2325                          * any particular register. As the destination register
2326                          * will be wiped, that means we can spill the constant
2327                          * to the destination register.
2328                          */
2329
2330                         void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1);
2331                         unsigned scratch = alu->ssa_args.dest;
2332
2333                         if (entry) {
2334                                 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch);
2335                                 attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1);
2336
2337                                 /* Force a break XXX Defer r31 writes */
2338                                 ins.unit = UNIT_VLUT;
2339
2340                                 /* Set the source */
2341                                 alu->ssa_args.src1 = scratch;
2342
2343                                 /* Inject us -before- the last instruction which set r31 */
2344                                 mir_insert_instruction_before(mir_prev_op(alu), ins);
2345                         }
2346                 }
2347         }
2348 }
2349
2350 /* Midgard supports two types of constants, embedded constants (128-bit) and
2351  * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
2352  * constants can be demoted to inline constants, for space savings and
2353  * sometimes a performance boost */
2354
2355 static void
2356 embedded_to_inline_constant(compiler_context *ctx)
2357 {
2358         mir_foreach_instr(ctx, ins) {
2359                 if (!ins->has_constants) continue;
2360
2361                 if (ins->ssa_args.inline_constant) continue;
2362
2363                 /* Blend constants must not be inlined by definition */
2364                 if (ins->has_blend_constant) continue;
2365
2366                 /* src1 cannot be an inline constant due to encoding
2367                  * restrictions. So, if possible we try to flip the arguments
2368                  * in that case */
2369
2370                 int op = ins->alu.op;
2371
2372                 if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
2373                         switch (op) {
2374                         /* These ops require an operational change to flip
2375                          * their arguments TODO */
2376                         case midgard_alu_op_flt:
2377                         case midgard_alu_op_fle:
2378                         case midgard_alu_op_ilt:
2379                         case midgard_alu_op_ile:
2380                         case midgard_alu_op_fcsel:
2381                         case midgard_alu_op_icsel:
2382                                 DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name);
2383                         default:
2384                                 break;
2385                         }
2386
2387                         if (alu_opcode_props[op].props & OP_COMMUTES) {
2388                                 /* Flip the SSA numbers */
2389                                 ins->ssa_args.src0 = ins->ssa_args.src1;
2390                                 ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
2391
2392                                 /* And flip the modifiers */
2393
2394                                 unsigned src_temp;
2395
2396                                 src_temp = ins->alu.src2;
2397                                 ins->alu.src2 = ins->alu.src1;
2398                                 ins->alu.src1 = src_temp;
2399                         }
2400                 }
2401
2402                 if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
2403                         /* Extract the source information */
2404
2405                         midgard_vector_alu_src *src;
2406                         int q = ins->alu.src2;
2407                         midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
2408                         src = m;
2409
2410                         /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */
2411                         int component = src->swizzle & 3;
2412
2413                         /* Scale constant appropriately, if we can legally */
2414                         uint16_t scaled_constant = 0;
2415
2416                         if (midgard_is_integer_op(op)) {
2417                                 unsigned int *iconstants = (unsigned int *) ins->constants;
2418                                 scaled_constant = (uint16_t) iconstants[component];
2419
2420                                 /* Constant overflow after resize */
2421                                 if (scaled_constant != iconstants[component])
2422                                         continue;
2423                         } else {
2424                                 float original = (float) ins->constants[component];
2425                                 scaled_constant = _mesa_float_to_half(original);
2426
2427                                 /* Check for loss of precision. If this is
2428                                  * mediump, we don't care, but for a highp
2429                                  * shader, we need to pay attention. NIR
2430                                  * doesn't yet tell us which mode we're in!
2431                                  * Practically this prevents most constants
2432                                  * from being inlined, sadly. */
2433
2434                                 float fp32 = _mesa_half_to_float(scaled_constant);
2435
2436                                 if (fp32 != original)
2437                                         continue;
2438                         }
2439
2440                         /* We don't know how to handle these with a constant */
2441
2442                         if (src->mod || src->half || src->rep_low || src->rep_high) {
2443                                 DBG("Bailing inline constant...\n");
2444                                 continue;
2445                         }
2446
2447                         /* Make sure that the constant is not itself a
2448                          * vector by checking if all accessed values
2449                          * (by the swizzle) are the same. */
2450
2451                         uint32_t *cons = (uint32_t *) ins->constants;
2452                         uint32_t value = cons[component];
2453
2454                         bool is_vector = false;
2455                         unsigned mask = effective_writemask(&ins->alu);
2456
2457                         for (int c = 1; c < 4; ++c) {
2458                                 /* We only care if this component is actually used */
2459                                 if (!(mask & (1 << c)))
2460                                         continue;
2461
2462                                 uint32_t test = cons[(src->swizzle >> (2 * c)) & 3];
2463
2464                                 if (test != value) {
2465                                         is_vector = true;
2466                                         break;
2467                                 }
2468                         }
2469
2470                         if (is_vector)
2471                                 continue;
2472
2473                         /* Get rid of the embedded constant */
2474                         ins->has_constants = false;
2475                         ins->ssa_args.src1 = SSA_UNUSED_0;
2476                         ins->ssa_args.inline_constant = true;
2477                         ins->inline_constant = scaled_constant;
2478                 }
2479         }
2480 }
2481
2482 /* Map normal SSA sources to other SSA sources / fixed registers (like
2483  * uniforms) */
2484
2485 static void
2486 map_ssa_to_alias(compiler_context *ctx, int *ref)
2487 {
2488         unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1);
2489
2490         if (alias) {
2491                 /* Remove entry in leftovers to avoid a redunant fmov */
2492
2493                 struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1)));
2494
2495                 if (leftover)
2496                         _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover);
2497
2498                 /* Assign the alias map */
2499                 *ref = alias - 1;
2500                 return;
2501         }
2502 }
2503
2504 /* Basic dead code elimination on the MIR itself, which cleans up e.g. the
2505  * texture pipeline */
2506
2507 static bool
2508 midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block)
2509 {
2510         bool progress = false;
2511
2512         mir_foreach_instr_in_block_safe(block, ins) {
2513                 if (ins->type != TAG_ALU_4) continue;
2514                 if (ins->compact_branch) continue;
2515
2516                 if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
2517                 if (mir_is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue;
2518
2519                 mir_remove_instruction(ins);
2520                 progress = true;
2521         }
2522
2523         return progress;
2524 }
2525
2526 static bool
2527 mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
2528 {
2529         /* abs or neg */
2530         if (!is_int && src.mod) return true;
2531
2532         /* swizzle */
2533         for (unsigned c = 0; c < 4; ++c) {
2534                 if (!(mask & (1 << c))) continue;
2535                 if (((src.swizzle >> (2*c)) & 3) != c) return true;
2536         }
2537
2538         return false;
2539 }
2540
2541 static bool
2542 midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
2543 {
2544         bool progress = false;
2545
2546         mir_foreach_instr_in_block_safe(block, ins) {
2547                 if (ins->type != TAG_ALU_4) continue;
2548                 if (!OP_IS_MOVE(ins->alu.op)) continue;
2549
2550                 unsigned from = ins->ssa_args.src1;
2551                 unsigned to = ins->ssa_args.dest;
2552
2553                 /* We only work on pure SSA */
2554
2555                 if (to >= SSA_FIXED_MINIMUM) continue;
2556                 if (from >= SSA_FIXED_MINIMUM) continue;
2557                 if (to >= ctx->func->impl->ssa_alloc) continue;
2558                 if (from >= ctx->func->impl->ssa_alloc) continue;
2559
2560                 /* Constant propagation is not handled here, either */
2561                 if (ins->ssa_args.inline_constant) continue;
2562                 if (ins->has_constants) continue;
2563
2564                 /* Also, if the move has side effects, we're helpless */
2565
2566                 midgard_vector_alu_src src =
2567                         vector_alu_from_unsigned(ins->alu.src2);
2568                 unsigned mask = squeeze_writemask(ins->alu.mask);
2569                 bool is_int = midgard_is_integer_op(ins->alu.op);
2570
2571                 if (mir_nontrivial_mod(src, is_int, mask)) continue;
2572                 if (ins->alu.outmod != midgard_outmod_none) continue;
2573
2574                 mir_foreach_instr_in_block_from(block, v, mir_next_op(ins)) {
2575                         if (v->ssa_args.src0 == to) {
2576                                 v->ssa_args.src0 = from;
2577                                 progress = true;
2578                         }
2579
2580                         if (v->ssa_args.src1 == to && !v->ssa_args.inline_constant) {
2581                                 v->ssa_args.src1 = from;
2582                                 progress = true;
2583                         }
2584                 }
2585         }
2586
2587         return progress;
2588 }
2589
2590 static bool
2591 midgard_opt_copy_prop_tex(compiler_context *ctx, midgard_block *block)
2592 {
2593         bool progress = false;
2594
2595         mir_foreach_instr_in_block_safe(block, ins) {
2596                 if (ins->type != TAG_ALU_4) continue;
2597                 if (!OP_IS_MOVE(ins->alu.op)) continue;
2598
2599                 unsigned from = ins->ssa_args.src1;
2600                 unsigned to = ins->ssa_args.dest;
2601
2602                 /* Make sure it's simple enough for us to handle */
2603
2604                 if (from >= SSA_FIXED_MINIMUM) continue;
2605                 if (from >= ctx->func->impl->ssa_alloc) continue;
2606                 if (to < SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE)) continue;
2607                 if (to > SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + 1)) continue;
2608
2609                 bool eliminated = false;
2610
2611                 mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) {
2612                         /* The texture registers are not SSA so be careful.
2613                          * Conservatively, just stop if we hit a texture op
2614                          * (even if it may not write) to where we are */
2615
2616                         if (v->type != TAG_ALU_4)
2617                                 break;
2618
2619                         if (v->ssa_args.dest == from) {
2620                                 /* We don't want to track partial writes ... */
2621                                 if (v->alu.mask == 0xF) {
2622                                         v->ssa_args.dest = to;
2623                                         eliminated = true;
2624                                 }
2625
2626                                 break;
2627                         }
2628                 }
2629
2630                 if (eliminated)
2631                         mir_remove_instruction(ins);
2632
2633                 progress |= eliminated;
2634         }
2635
2636         return progress;
2637 }
2638
2639 /* The following passes reorder MIR instructions to enable better scheduling */
2640
2641 static void
2642 midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
2643 {
2644         mir_foreach_instr_in_block_safe(block, ins) {
2645                 if (ins->type != TAG_LOAD_STORE_4) continue;
2646
2647                 /* We've found a load/store op. Check if next is also load/store. */
2648                 midgard_instruction *next_op = mir_next_op(ins);
2649                 if (&next_op->link != &block->instructions) {
2650                         if (next_op->type == TAG_LOAD_STORE_4) {
2651                                 /* If so, we're done since we're a pair */
2652                                 ins = mir_next_op(ins);
2653                                 continue;
2654                         }
2655
2656                         /* Maximum search distance to pair, to avoid register pressure disasters */
2657                         int search_distance = 8;
2658
2659                         /* Otherwise, we have an orphaned load/store -- search for another load */
2660                         mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
2661                                 /* Terminate search if necessary */
2662                                 if (!(search_distance--)) break;
2663
2664                                 if (c->type != TAG_LOAD_STORE_4) continue;
2665
2666                                 /* Stores cannot be reordered, since they have
2667                                  * dependencies. For the same reason, indirect
2668                                  * loads cannot be reordered as their index is
2669                                  * loaded in r27.w */
2670
2671                                 if (OP_IS_STORE(c->load_store.op)) continue;
2672
2673                                 /* It appears the 0x800 bit is set whenever a
2674                                  * load is direct, unset when it is indirect.
2675                                  * Skip indirect loads. */
2676
2677                                 if (!(c->load_store.unknown & 0x800)) continue;
2678
2679                                 /* We found one! Move it up to pair and remove it from the old location */
2680
2681                                 mir_insert_instruction_before(ins, *c);
2682                                 mir_remove_instruction(c);
2683
2684                                 break;
2685                         }
2686                 }
2687         }
2688 }
2689
2690 /* Emit varying stores late */
2691
2692 static void
2693 midgard_emit_store(compiler_context *ctx, midgard_block *block) {
2694         /* Iterate in reverse to get the final write, rather than the first */
2695
2696         mir_foreach_instr_in_block_safe_rev(block, ins) {
2697                 /* Check if what we just wrote needs a store */
2698                 int idx = ins->ssa_args.dest;
2699                 uintptr_t varying = ((uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_varyings, idx + 1));
2700
2701                 if (!varying) continue;
2702
2703                 varying -= 1;
2704
2705                 /* We need to store to the appropriate varying, so emit the
2706                  * move/store */
2707
2708                 /* TODO: Integrate with special purpose RA (and scheduler?) */
2709                 bool high_varying_register = false;
2710
2711                 midgard_instruction mov = v_fmov(idx, blank_alu_src, SSA_FIXED_REGISTER(REGISTER_VARYING_BASE + high_varying_register));
2712
2713                 midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(high_varying_register), varying);
2714                 st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
2715
2716                 mir_insert_instruction_before(mir_next_op(ins), st);
2717                 mir_insert_instruction_before(mir_next_op(ins), mov);
2718
2719                 /* We no longer need to store this varying */
2720                 _mesa_hash_table_u64_remove(ctx->ssa_varyings, idx + 1);
2721         }
2722 }
2723
2724 /* If there are leftovers after the below pass, emit actual fmov
2725  * instructions for the slow-but-correct path */
2726
2727 static void
2728 emit_leftover_move(compiler_context *ctx)
2729 {
2730         set_foreach(ctx->leftover_ssa_to_alias, leftover) {
2731                 int base = ((uintptr_t) leftover->key) - 1;
2732                 int mapped = base;
2733
2734                 map_ssa_to_alias(ctx, &mapped);
2735                 EMIT(fmov, mapped, blank_alu_src, base);
2736         }
2737 }
2738
2739 static void
2740 actualise_ssa_to_alias(compiler_context *ctx)
2741 {
2742         mir_foreach_instr(ctx, ins) {
2743                 map_ssa_to_alias(ctx, &ins->ssa_args.src0);
2744                 map_ssa_to_alias(ctx, &ins->ssa_args.src1);
2745         }
2746
2747         emit_leftover_move(ctx);
2748 }
2749
2750 static void
2751 emit_fragment_epilogue(compiler_context *ctx)
2752 {
2753         /* Special case: writing out constants requires us to include the move
2754          * explicitly now, so shove it into r0 */
2755
2756         void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1);
2757
2758         if (constant_value) {
2759                 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0));
2760                 attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1);
2761                 emit_mir_instruction(ctx, ins);
2762         }
2763
2764         /* Perform the actual fragment writeout. We have two writeout/branch
2765          * instructions, forming a loop until writeout is successful as per the
2766          * docs. TODO: gl_FragDepth */
2767
2768         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
2769         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
2770 }
2771
2772 /* For the blend epilogue, we need to convert the blended fragment vec4 (stored
2773  * in r0) to a RGBA8888 value by scaling and type converting. We then output it
2774  * with the int8 analogue to the fragment epilogue */
2775
2776 static void
2777 emit_blend_epilogue(compiler_context *ctx)
2778 {
2779         /* vmul.fmul.none.fulllow hr48, r0, #255 */
2780
2781         midgard_instruction scale = {
2782                 .type = TAG_ALU_4,
2783                 .unit = UNIT_VMUL,
2784                 .inline_constant = _mesa_float_to_half(255.0),
2785                 .ssa_args = {
2786                         .src0 = SSA_FIXED_REGISTER(0),
2787                         .src1 = SSA_UNUSED_0,
2788                         .dest = SSA_FIXED_REGISTER(24),
2789                         .inline_constant = true
2790                 },
2791                 .alu = {
2792                         .op = midgard_alu_op_fmul,
2793                         .reg_mode = midgard_reg_mode_32,
2794                         .dest_override = midgard_dest_override_lower,
2795                         .mask = 0xFF,
2796                         .src1 = vector_alu_srco_unsigned(blank_alu_src),
2797                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
2798                 }
2799         };
2800
2801         emit_mir_instruction(ctx, scale);
2802
2803         /* vadd.f2u8.pos.low hr0, hr48, #0 */
2804
2805         midgard_vector_alu_src alu_src = blank_alu_src;
2806         alu_src.half = true;
2807
2808         midgard_instruction f2u8 = {
2809                 .type = TAG_ALU_4,
2810                 .ssa_args = {
2811                         .src0 = SSA_FIXED_REGISTER(24),
2812                         .src1 = SSA_UNUSED_0,
2813                         .dest = SSA_FIXED_REGISTER(0),
2814                         .inline_constant = true
2815                 },
2816                 .alu = {
2817                         .op = midgard_alu_op_f2u8,
2818                         .reg_mode = midgard_reg_mode_16,
2819                         .dest_override = midgard_dest_override_lower,
2820                         .outmod = midgard_outmod_pos,
2821                         .mask = 0xF,
2822                         .src1 = vector_alu_srco_unsigned(alu_src),
2823                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
2824                 }
2825         };
2826
2827         emit_mir_instruction(ctx, f2u8);
2828
2829         /* vmul.imov.quarter r0, r0, r0 */
2830
2831         midgard_instruction imov_8 = {
2832                 .type = TAG_ALU_4,
2833                 .ssa_args = {
2834                         .src0 = SSA_UNUSED_1,
2835                         .src1 = SSA_FIXED_REGISTER(0),
2836                         .dest = SSA_FIXED_REGISTER(0),
2837                 },
2838                 .alu = {
2839                         .op = midgard_alu_op_imov,
2840                         .reg_mode = midgard_reg_mode_8,
2841                         .dest_override = midgard_dest_override_none,
2842                         .mask = 0xFF,
2843                         .src1 = vector_alu_srco_unsigned(blank_alu_src),
2844                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
2845                 }
2846         };
2847
2848         /* Emit branch epilogue with the 8-bit move as the source */
2849
2850         emit_mir_instruction(ctx, imov_8);
2851         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
2852
2853         emit_mir_instruction(ctx, imov_8);
2854         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
2855 }
2856
2857 static midgard_block *
2858 emit_block(compiler_context *ctx, nir_block *block)
2859 {
2860         midgard_block *this_block = calloc(sizeof(midgard_block), 1);
2861         list_addtail(&this_block->link, &ctx->blocks);
2862
2863         this_block->is_scheduled = false;
2864         ++ctx->block_count;
2865
2866         ctx->texture_index[0] = -1;
2867         ctx->texture_index[1] = -1;
2868
2869         /* Add us as a successor to the block we are following */
2870         if (ctx->current_block)
2871                 midgard_block_add_successor(ctx->current_block, this_block);
2872
2873         /* Set up current block */
2874         list_inithead(&this_block->instructions);
2875         ctx->current_block = this_block;
2876
2877         nir_foreach_instr(instr, block) {
2878                 emit_instr(ctx, instr);
2879                 ++ctx->instruction_count;
2880         }
2881
2882         inline_alu_constants(ctx);
2883         embedded_to_inline_constant(ctx);
2884
2885         /* Perform heavylifting for aliasing */
2886         actualise_ssa_to_alias(ctx);
2887
2888         midgard_emit_store(ctx, this_block);
2889         midgard_pair_load_store(ctx, this_block);
2890
2891         /* Append fragment shader epilogue (value writeout) */
2892         if (ctx->stage == MESA_SHADER_FRAGMENT) {
2893                 if (block == nir_impl_last_block(ctx->func->impl)) {
2894                         if (ctx->is_blend)
2895                                 emit_blend_epilogue(ctx);
2896                         else
2897                                 emit_fragment_epilogue(ctx);
2898                 }
2899         }
2900
2901         if (block == nir_start_block(ctx->func->impl))
2902                 ctx->initial_block = this_block;
2903
2904         if (block == nir_impl_last_block(ctx->func->impl))
2905                 ctx->final_block = this_block;
2906
2907         /* Allow the next control flow to access us retroactively, for
2908          * branching etc */
2909         ctx->current_block = this_block;
2910
2911         /* Document the fallthrough chain */
2912         ctx->previous_source_block = this_block;
2913
2914         return this_block;
2915 }
2916
2917 static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
2918
2919 static void
2920 emit_if(struct compiler_context *ctx, nir_if *nif)
2921 {
2922         /* Conditional branches expect the condition in r31.w; emit a move for
2923          * that in the _previous_ block (which is the current block). */
2924         emit_condition(ctx, &nif->condition, true, COMPONENT_X);
2925
2926         /* Speculatively emit the branch, but we can't fill it in until later */
2927         EMIT(branch, true, true);
2928         midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
2929
2930         /* Emit the two subblocks */
2931         midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
2932
2933         /* Emit a jump from the end of the then block to the end of the else */
2934         EMIT(branch, false, false);
2935         midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);
2936
2937         /* Emit second block, and check if it's empty */
2938
2939         int else_idx = ctx->block_count;
2940         int count_in = ctx->instruction_count;
2941         midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
2942         int after_else_idx = ctx->block_count;
2943
2944         /* Now that we have the subblocks emitted, fix up the branches */
2945
2946         assert(then_block);
2947         assert(else_block);
2948
2949         if (ctx->instruction_count == count_in) {
2950                 /* The else block is empty, so don't emit an exit jump */
2951                 mir_remove_instruction(then_exit);
2952                 then_branch->branch.target_block = after_else_idx;
2953         } else {
2954                 then_branch->branch.target_block = else_idx;
2955                 then_exit->branch.target_block = after_else_idx;
2956         }
2957 }
2958
2959 static void
2960 emit_loop(struct compiler_context *ctx, nir_loop *nloop)
2961 {
2962         /* Remember where we are */
2963         midgard_block *start_block = ctx->current_block;
2964
2965         /* Allocate a loop number, growing the current inner loop depth */
2966         int loop_idx = ++ctx->current_loop_depth;
2967
2968         /* Get index from before the body so we can loop back later */
2969         int start_idx = ctx->block_count;
2970
2971         /* Emit the body itself */
2972         emit_cf_list(ctx, &nloop->body);
2973
2974         /* Branch back to loop back */
2975         struct midgard_instruction br_back = v_branch(false, false);
2976         br_back.branch.target_block = start_idx;
2977         emit_mir_instruction(ctx, br_back);
2978
2979         /* Mark down that branch in the graph. Note that we're really branching
2980          * to the block *after* we started in. TODO: Why doesn't the branch
2981          * itself have an off-by-one then...? */
2982         midgard_block_add_successor(ctx->current_block, start_block->successors[0]);
2983
2984         /* Find the index of the block about to follow us (note: we don't add
2985          * one; blocks are 0-indexed so we get a fencepost problem) */
2986         int break_block_idx = ctx->block_count;
2987
2988         /* Fix up the break statements we emitted to point to the right place,
2989          * now that we can allocate a block number for them */
2990
2991         list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) {
2992                 mir_foreach_instr_in_block(block, ins) {
2993                         if (ins->type != TAG_ALU_4) continue;
2994                         if (!ins->compact_branch) continue;
2995                         if (ins->prepacked_branch) continue;
2996
2997                         /* We found a branch -- check the type to see if we need to do anything */
2998                         if (ins->branch.target_type != TARGET_BREAK) continue;
2999
3000                         /* It's a break! Check if it's our break */
3001                         if (ins->branch.target_break != loop_idx) continue;
3002
3003                         /* Okay, cool, we're breaking out of this loop.
3004                          * Rewrite from a break to a goto */
3005
3006                         ins->branch.target_type = TARGET_GOTO;
3007                         ins->branch.target_block = break_block_idx;
3008                 }
3009         }
3010
3011         /* Now that we've finished emitting the loop, free up the depth again
3012          * so we play nice with recursion amid nested loops */
3013         --ctx->current_loop_depth;
3014 }
3015
3016 static midgard_block *
3017 emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
3018 {
3019         midgard_block *start_block = NULL;
3020
3021         foreach_list_typed(nir_cf_node, node, node, list) {
3022                 switch (node->type) {
3023                 case nir_cf_node_block: {
3024                         midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
3025
3026                         if (!start_block)
3027                                 start_block = block;
3028
3029                         break;
3030                 }
3031
3032                 case nir_cf_node_if:
3033                         emit_if(ctx, nir_cf_node_as_if(node));
3034                         break;
3035
3036                 case nir_cf_node_loop:
3037                         emit_loop(ctx, nir_cf_node_as_loop(node));
3038                         break;
3039
3040                 case nir_cf_node_function:
3041                         assert(0);
3042                         break;
3043                 }
3044         }
3045
3046         return start_block;
3047 }
3048
3049 /* Due to lookahead, we need to report the first tag executed in the command
3050  * stream and in branch targets. An initial block might be empty, so iterate
3051  * until we find one that 'works' */
3052
3053 static unsigned
3054 midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
3055 {
3056         midgard_block *initial_block = mir_get_block(ctx, block_idx);
3057
3058         unsigned first_tag = 0;
3059
3060         do {
3061                 midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0);
3062
3063                 if (initial_bundle) {
3064                         first_tag = initial_bundle->tag;
3065                         break;
3066                 }
3067
3068                 /* Initial block is empty, try the next block */
3069                 initial_block = list_first_entry(&(initial_block->link), midgard_block, link);
3070         } while(initial_block != NULL);
3071
3072         assert(first_tag);
3073         return first_tag;
3074 }
3075
3076 int
3077 midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend)
3078 {
3079         struct util_dynarray *compiled = &program->compiled;
3080
3081         midgard_debug = debug_get_option_midgard_debug();
3082
3083         compiler_context ictx = {
3084                 .nir = nir,
3085                 .stage = nir->info.stage,
3086
3087                 .is_blend = is_blend,
3088                 .blend_constant_offset = -1,
3089
3090                 .alpha_ref = program->alpha_ref
3091         };
3092
3093         compiler_context *ctx = &ictx;
3094
3095         /* TODO: Decide this at runtime */
3096         ctx->uniform_cutoff = 8;
3097
3098         /* Assign var locations early, so the epilogue can use them if necessary */
3099
3100         nir_assign_var_locations(&nir->outputs, &nir->num_outputs, glsl_type_size);
3101         nir_assign_var_locations(&nir->inputs, &nir->num_inputs, glsl_type_size);
3102         nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, glsl_type_size);
3103
3104         /* Initialize at a global (not block) level hash tables */
3105
3106         ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
3107         ctx->ssa_varyings = _mesa_hash_table_u64_create(NULL);
3108         ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL);
3109         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
3110         ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL);
3111         ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
3112
3113         /* Record the varying mapping for the command stream's bookkeeping */
3114
3115         struct exec_list *varyings =
3116                 ctx->stage == MESA_SHADER_VERTEX ? &nir->outputs : &nir->inputs;
3117
3118         nir_foreach_variable(var, varyings) {
3119                 unsigned loc = var->data.driver_location;
3120                 unsigned sz = glsl_type_size(var->type, FALSE);
3121
3122                 for (int c = 0; c < sz; ++c) {
3123                         program->varyings[loc + c] = var->data.location;
3124                 }
3125         }
3126
3127         /* Lower gl_Position pre-optimisation */
3128
3129         if (ctx->stage == MESA_SHADER_VERTEX)
3130                 NIR_PASS_V(nir, nir_lower_viewport_transform);
3131
3132         NIR_PASS_V(nir, nir_lower_var_copies);
3133         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3134         NIR_PASS_V(nir, nir_split_var_copies);
3135         NIR_PASS_V(nir, nir_lower_var_copies);
3136         NIR_PASS_V(nir, nir_lower_global_vars_to_local);
3137         NIR_PASS_V(nir, nir_lower_var_copies);
3138         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3139
3140         NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
3141
3142         /* Optimisation passes */
3143
3144         optimise_nir(nir);
3145
3146         if (midgard_debug & MIDGARD_DBG_SHADERS) {
3147                 nir_print_shader(nir, stdout);
3148         }
3149
3150         /* Assign sysvals and counts, now that we're sure
3151          * (post-optimisation) */
3152
3153         midgard_nir_assign_sysvals(ctx, nir);
3154
3155         program->uniform_count = nir->num_uniforms;
3156         program->sysval_count = ctx->sysval_count;
3157         memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count);
3158
3159         program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
3160         program->varying_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_outputs : ((ctx->stage == MESA_SHADER_FRAGMENT) ? nir->num_inputs : 0);
3161
3162         nir_foreach_function(func, nir) {
3163                 if (!func->impl)
3164                         continue;
3165
3166                 list_inithead(&ctx->blocks);
3167                 ctx->block_count = 0;
3168                 ctx->func = func;
3169
3170                 emit_cf_list(ctx, &func->impl->body);
3171                 emit_block(ctx, func->impl->end_block);
3172
3173                 break; /* TODO: Multi-function shaders */
3174         }
3175
3176         util_dynarray_init(compiled, NULL);
3177
3178         /* MIR-level optimizations */
3179
3180         bool progress = false;
3181
3182         do {
3183                 progress = false;
3184
3185                 mir_foreach_block(ctx, block) {
3186                         progress |= midgard_opt_copy_prop(ctx, block);
3187                         progress |= midgard_opt_copy_prop_tex(ctx, block);
3188                         progress |= midgard_opt_dead_code_eliminate(ctx, block);
3189                 }
3190         } while (progress);
3191
3192         /* Schedule! */
3193         schedule_program(ctx);
3194
3195         /* Now that all the bundles are scheduled and we can calculate block
3196          * sizes, emit actual branch instructions rather than placeholders */
3197
3198         int br_block_idx = 0;
3199
3200         mir_foreach_block(ctx, block) {
3201                 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
3202                         for (int c = 0; c < bundle->instruction_count; ++c) {
3203                                 midgard_instruction *ins = &bundle->instructions[c];
3204
3205                                 if (!midgard_is_branch_unit(ins->unit)) continue;
3206
3207                                 if (ins->prepacked_branch) continue;
3208
3209                                 /* Parse some basic branch info */
3210                                 bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT;
3211                                 bool is_conditional = ins->branch.conditional;
3212                                 bool is_inverted = ins->branch.invert_conditional;
3213                                 bool is_discard = ins->branch.target_type == TARGET_DISCARD;
3214
3215                                 /* Determine the block we're jumping to */
3216                                 int target_number = ins->branch.target_block;
3217
3218                                 /* Report the destination tag. Discards don't need this */
3219                                 int dest_tag = is_discard ? 0 : midgard_get_first_tag_from_block(ctx, target_number);
3220
3221                                 /* Count up the number of quadwords we're jumping over. That is, the number of quadwords in each of the blocks between (br_block_idx, target_number) */
3222                                 int quadword_offset = 0;
3223
3224                                 if (is_discard) {
3225                                         /* Jump to the end of the shader. We
3226                                          * need to include not only the
3227                                          * following blocks, but also the
3228                                          * contents of our current block (since
3229                                          * discard can come in the middle of
3230                                          * the block) */
3231
3232                                         midgard_block *blk = mir_get_block(ctx, br_block_idx + 1);
3233
3234                                         for (midgard_bundle *bun = bundle + 1; bun < (midgard_bundle *)((char*) block->bundles.data + block->bundles.size); ++bun) {
3235                                                 quadword_offset += quadword_size(bun->tag);
3236                                         }
3237
3238                                         mir_foreach_block_from(ctx, blk, b) {
3239                                                 quadword_offset += b->quadword_count;
3240                                         }
3241
3242                                 } else if (target_number > br_block_idx) {
3243                                         /* Jump forward */
3244
3245                                         for (int idx = br_block_idx + 1; idx < target_number; ++idx) {
3246                                                 midgard_block *blk = mir_get_block(ctx, idx);
3247                                                 assert(blk);
3248
3249                                                 quadword_offset += blk->quadword_count;
3250                                         }
3251                                 } else {
3252                                         /* Jump backwards */
3253
3254                                         for (int idx = br_block_idx; idx >= target_number; --idx) {
3255                                                 midgard_block *blk = mir_get_block(ctx, idx);
3256                                                 assert(blk);
3257
3258                                                 quadword_offset -= blk->quadword_count;
3259                                         }
3260                                 }
3261
3262                                 /* Unconditional extended branches (far jumps)
3263                                  * have issues, so we always use a conditional
3264                                  * branch, setting the condition to always for
3265                                  * unconditional. For compact unconditional
3266                                  * branches, cond isn't used so it doesn't
3267                                  * matter what we pick. */
3268
3269                                 midgard_condition cond =
3270                                         !is_conditional ? midgard_condition_always :
3271                                         is_inverted ? midgard_condition_false :
3272                                         midgard_condition_true;
3273
3274                                 midgard_jmp_writeout_op op =
3275                                         is_discard ? midgard_jmp_writeout_op_discard :
3276                                         (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond :
3277                                         midgard_jmp_writeout_op_branch_cond;
3278
3279                                 if (!is_compact) {
3280                                         midgard_branch_extended branch =
3281                                                 midgard_create_branch_extended(
3282                                                         cond, op,
3283                                                         dest_tag,
3284                                                         quadword_offset);
3285
3286                                         memcpy(&ins->branch_extended, &branch, sizeof(branch));
3287                                 } else if (is_conditional || is_discard) {
3288                                         midgard_branch_cond branch = {
3289                                                 .op = op,
3290                                                 .dest_tag = dest_tag,
3291                                                 .offset = quadword_offset,
3292                                                 .cond = cond
3293                                         };
3294
3295                                         assert(branch.offset == quadword_offset);
3296
3297                                         memcpy(&ins->br_compact, &branch, sizeof(branch));
3298                                 } else {
3299                                         assert(op == midgard_jmp_writeout_op_branch_uncond);
3300
3301                                         midgard_branch_uncond branch = {
3302                                                 .op = op,
3303                                                 .dest_tag = dest_tag,
3304                                                 .offset = quadword_offset,
3305                                                 .unknown = 1
3306                                         };
3307
3308                                         assert(branch.offset == quadword_offset);
3309
3310                                         memcpy(&ins->br_compact, &branch, sizeof(branch));
3311                                 }
3312                         }
3313                 }
3314
3315                 ++br_block_idx;
3316         }
3317
3318         /* Emit flat binary from the instruction arrays. Iterate each block in
3319          * sequence. Save instruction boundaries such that lookahead tags can
3320          * be assigned easily */
3321
3322         /* Cache _all_ bundles in source order for lookahead across failed branches */
3323
3324         int bundle_count = 0;
3325         mir_foreach_block(ctx, block) {
3326                 bundle_count += block->bundles.size / sizeof(midgard_bundle);
3327         }
3328         midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
3329         int bundle_idx = 0;
3330         mir_foreach_block(ctx, block) {
3331                 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
3332                         source_order_bundles[bundle_idx++] = bundle;
3333                 }
3334         }
3335
3336         int current_bundle = 0;
3337
3338         mir_foreach_block(ctx, block) {
3339                 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
3340                         int lookahead = 1;
3341
3342                         if (current_bundle + 1 < bundle_count) {
3343                                 uint8_t next = source_order_bundles[current_bundle + 1]->tag;
3344
3345                                 if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
3346                                         lookahead = 1;
3347                                 } else {
3348                                         lookahead = next;
3349                                 }
3350                         }
3351
3352                         emit_binary_bundle(ctx, bundle, compiled, lookahead);
3353                         ++current_bundle;
3354                 }
3355
3356                 /* TODO: Free deeper */
3357                 //util_dynarray_fini(&block->instructions);
3358         }
3359
3360         free(source_order_bundles);
3361
3362         /* Report the very first tag executed */
3363         program->first_tag = midgard_get_first_tag_from_block(ctx, 0);
3364
3365         /* Deal with off-by-one related to the fencepost problem */
3366         program->work_register_count = ctx->work_registers + 1;
3367
3368         program->can_discard = ctx->can_discard;
3369         program->uniform_cutoff = ctx->uniform_cutoff;
3370
3371         program->blend_patch_offset = ctx->blend_constant_offset;
3372
3373         if (midgard_debug & MIDGARD_DBG_SHADERS)
3374                 disassemble_midgard(program->compiled.data, program->compiled.size);
3375
3376         return 0;
3377 }