src/gallium/drivers/panfrost/midgard/midgard_compile.c

   1 /*
   2  * Copyright (C) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include <sys/types.h>
  25 #include <sys/stat.h>
  26 #include <sys/mman.h>
  27 #include <fcntl.h>
  28 #include <stdint.h>
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <err.h>
  32
  33 #include "compiler/glsl/glsl_to_nir.h"
  34 #include "compiler/nir_types.h"
  35 #include "main/imports.h"
  36 #include "compiler/nir/nir_builder.h"
  37 #include "util/half_float.h"
  38 #include "util/register_allocate.h"
  39 #include "util/u_dynarray.h"
  40 #include "util/list.h"
  41 #include "main/mtypes.h"
  42
  43 #include "midgard.h"
  44 #include "midgard_nir.h"
  45 #include "midgard_compile.h"
  46 #include "helpers.h"
  47
  48 #include "disassemble.h"
  49
/* Instruction arguments represented as block-local SSA indices, rather than
 * registers. Negative values mean unused. */

typedef struct {
        /* First source operand */
        int src0;

        /* Second source operand, unless inline_constant is set (see below) */
        int src1;

        /* Destination operand */
        int dest;

        /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged
         * in. Only valid for ALU ops. */
        bool inline_constant;
} ssa_args;
  62
  63 /* Forward declare so midgard_branch can reference */
  64 struct midgard_block;
  65
/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to
 * the hardware), hence why that must be zero: a midgard_branch whose
 * target_type is left out of its initializer ends up as a goto. */

#define TARGET_GOTO 0 /* Branch to the start of a block */
#define TARGET_BREAK 1 /* Branch to the end of a loop */
#define TARGET_CONTINUE 2 /* Branch to the start of a loop */
  72
/* High-level description of a branch, before it is packed into a hardware
 * encoding. */
typedef struct midgard_branch {
        /* If conditional, the condition is specified in r31.w */
        bool conditional;

        /* For conditionals, if this is true, we branch on FALSE. If false, we
         * branch on TRUE. */
        bool invert_conditional;

        /* Branch targets: the start of a block, the start of a loop
         * (continue), the end of a loop (break). Value is one of TARGET_ */
        unsigned target_type;

        /* The actual target. All three members share the same storage; the
         * names only document which interpretation goes with which
         * target_type. */
        union {
                int target_block;
                int target_break;
                int target_continue;
        };
} midgard_branch;
  90
/* Generic in-memory data type representing a single logical instruction,
 * rather than a single instruction group. This is the preferred form for code
 * gen. Multiple midgard_instructions will later be combined during
 * scheduling, though this is not represented in this structure.  Its format
 * bridges the low-level binary representation with the higher level semantic
 * meaning.
 *
 * Notably, it allows registers to be specified as block local SSA, for code
 * emitted before the register allocation pass.
 */

typedef struct midgard_instruction {
        /* Must be first for casting */
        struct list_head link;

        unsigned type; /* ALU, load/store, texture */

        /* If the register allocator has not run yet... */
        ssa_args ssa_args;

        /* Special fields for an ALU instruction */
        midgard_reg_info registers;

        /* I.e. (1 << alu_bit) */
        int unit;

        /* Embedded constants; filled in by attach_constants(), which also
         * sets has_blend_constant for the special blend constant */
        bool has_constants;
        float constants[4];

        /* Value used in place of src1 when ssa_args.inline_constant is set */
        uint16_t inline_constant;
        bool has_blend_constant;

        /* Branch bookkeeping. v_branch() sets compact_branch;
         * v_alu_br_compact_cond() sets compact_branch and prepacked_branch,
         * plus writeout when the op is a writeout. */
        bool compact_branch;
        bool writeout;
        bool prepacked_branch;

        /* Payload. Only one member is meaningful for a given instruction. */
        union {
                midgard_load_store_word load_store;
                midgard_vector_alu alu;
                midgard_texture_word texture;
                midgard_branch_extended branch_extended;
                uint16_t br_compact;

                /* General branch, rather than packed br_compact. Higher level
                 * than the other components */
                midgard_branch branch;
        };
} midgard_instruction;
 137
/* A basic block of Midgard IR: a list of instructions, plus the bundles
 * produced from them once the scheduler has run. */
typedef struct midgard_block {
        /* Link to next block. Must be first for mir_get_block */
        struct list_head link;

        /* List of midgard_instructions emitted for the current block */
        struct list_head instructions;

        /* NOTE(review): presumably set once the scheduler has processed this
         * block; the writer is not visible in this part of the file */
        bool is_scheduled;

        /* List of midgard_bundles emitted (after the scheduler has run) */
        struct util_dynarray bundles;

        /* Number of quadwords _actually_ emitted, as determined after scheduling */
        unsigned quadword_count;

        /* NOTE(review): by its name, the block control falls into when no
         * branch is taken; confirm against the code that sets it */
        struct midgard_block *next_fallthrough;
} midgard_block;
 155
/* Helpers to generate midgard_instruction's using macro magic, since every
 * driver seems to do it that way */

/* EMIT(op, args...) builds an instruction with v_<op>(args...) and appends it
 * to the current block via emit_mir_instruction(). It relies on a variable
 * named `ctx` being in scope at the call site. Note that the expansion already
 * ends in a semicolon, so `EMIT(...);` expands to a statement followed by an
 * empty statement; take care when using it as the unbraced body of an
 * if/else. */
#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
 160
/* M_LOAD_STORE(name, rname, uname) defines a static constructor m_<name>(ssa,
 * address) returning a TAG_LOAD_STORE_4 instruction for midgard_op_<name>.
 *
 * `rname` names the ssa_args field that receives `ssa`, and `uname` names the
 * field marked unused (-1); src1 is always unused. The load/store word is set
 * up with a full mask (0xF), an identity XYZW swizzle and the given address. */
#define M_LOAD_STORE(name, rname, uname) \
        static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
                midgard_instruction i = { \
                        .type = TAG_LOAD_STORE_4, \
                        .ssa_args = { \
                                .rname = ssa, \
                                .uname = -1, \
                                .src1 = -1 \
                        }, \
                        .load_store = { \
                                .op = midgard_op_##name, \
                                .mask = 0xF, \
                                .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), \
                                .address = address \
                        } \
                }; \
                \
                return i; \
        }

/* Loads put the SSA index in dest (src0 unused); stores put it in src0 (dest
 * unused). */
#define M_LOAD(name) M_LOAD_STORE(name, dest, src0)
#define M_STORE(name) M_LOAD_STORE(name, src0, dest)
 183
/* Default vector ALU source: identity XYZW swizzle, every other field zero.
 * Returned by vector_alu_modifiers() when it is given no NIR source. */
const midgard_vector_alu_src blank_alu_src = {
        .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
};

/* Default scalar ALU source: only the `full` flag is set */
const midgard_scalar_alu_src blank_scalar_alu_src = {
        .full = true
};

/* Used for encoding the unused source of 1-op instructions */
const midgard_vector_alu_src zero_alu_src = { 0 };
 194
 195 /* Coerce structs to integer */
 196
 197 static unsigned
 198 vector_alu_srco_unsigned(midgard_vector_alu_src src)
 199 {
 200         unsigned u;
 201         memcpy(&u, &src, sizeof(src));
 202         return u;
 203 }
 204
 205 /* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs
 206  * the corresponding Midgard source */
 207
 208 static midgard_vector_alu_src
 209 vector_alu_modifiers(nir_alu_src *src)
 210 {
 211         if (!src) return blank_alu_src;
 212
 213         midgard_vector_alu_src alu_src = {
 214                 .abs = src->abs,
 215                 .negate = src->negate,
 216                 .rep_low = 0,
 217                 .rep_high = 0,
 218                 .half = 0, /* TODO */
 219                 .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle)
 220         };
 221
 222         return alu_src;
 223 }
 224
 225 /* 'Intrinsic' move for misc aliasing uses independent of actual NIR ALU code */
 226
 227 static midgard_instruction
 228 v_fmov(unsigned src, midgard_vector_alu_src mod, unsigned dest)
 229 {
 230         midgard_instruction ins = {
 231                 .type = TAG_ALU_4,
 232                 .ssa_args = {
 233                         .src0 = SSA_UNUSED_1,
 234                         .src1 = src,
 235                         .dest = dest,
 236                 },
 237                 .alu = {
 238                         .op = midgard_alu_op_fmov,
 239                         .reg_mode = midgard_reg_mode_full,
 240                         .dest_override = midgard_dest_override_none,
 241                         .mask = 0xFF,
 242                         .src1 = vector_alu_srco_unsigned(zero_alu_src),
 243                         .src2 = vector_alu_srco_unsigned(mod)
 244                 },
 245         };
 246
 247         return ins;
 248 }
 249
/* load/store instructions have both 32-bit and 16-bit variants, depending on
 * whether we are using vectors composed of highp or mediump. At the moment, we
 * don't support half-floats -- this requires changes in other parts of the
 * compiler -- therefore the 16-bit versions are commented out.
 *
 * Each line below instantiates one m_<name>(ssa, address) constructor via
 * M_LOAD / M_STORE. */

//M_LOAD(load_attr_16);
M_LOAD(load_attr_32);
//M_LOAD(load_vary_16);
M_LOAD(load_vary_32);
//M_LOAD(load_uniform_16);
M_LOAD(load_uniform_32);
M_LOAD(load_color_buffer_8);
//M_STORE(store_vary_16);
M_STORE(store_vary_32);
 264
 265 static midgard_instruction
 266 v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond)
 267 {
 268         midgard_branch_cond branch = {
 269                 .op = op,
 270                 .dest_tag = tag,
 271                 .offset = offset,
 272                 .cond = cond
 273         };
 274
 275         uint16_t compact;
 276         memcpy(&compact, &branch, sizeof(branch));
 277
 278         midgard_instruction ins = {
 279                 .type = TAG_ALU_4,
 280                 .unit = ALU_ENAB_BR_COMPACT,
 281                 .prepacked_branch = true,
 282                 .compact_branch = true,
 283                 .br_compact = compact
 284         };
 285
 286         if (op == midgard_jmp_writeout_op_writeout)
 287                 ins.writeout = true;
 288
 289         return ins;
 290 }
 291
 292 static midgard_instruction
 293 v_branch(bool conditional, bool invert)
 294 {
 295         midgard_instruction ins = {
 296                 .type = TAG_ALU_4,
 297                 .unit = ALU_ENAB_BRANCH,
 298                 .compact_branch = true,
 299                 .branch = {
 300                         .conditional = conditional,
 301                         .invert_conditional = invert
 302                 }
 303         };
 304
 305         return ins;
 306 }
 307
 308 static midgard_branch_extended
 309 midgard_create_branch_extended( midgard_condition cond,
 310                                 midgard_jmp_writeout_op op,
 311                                 unsigned dest_tag,
 312                                 signed quadword_offset)
 313 {
 314         /* For unclear reasons, the condition code is repeated 8 times */
 315         uint16_t duplicated_cond =
 316                 (cond << 14) |
 317                 (cond << 12) |
 318                 (cond << 10) |
 319                 (cond << 8) |
 320                 (cond << 6) |
 321                 (cond << 4) |
 322                 (cond << 2) |
 323                 (cond << 0);
 324
 325         midgard_branch_extended branch = {
 326                 .op = midgard_jmp_writeout_op_branch_cond,
 327                 .dest_tag = dest_tag,
 328                 .offset = quadword_offset,
 329                 .cond = duplicated_cond
 330         };
 331
 332         return branch;
 333 }
 334
/* A group of instructions emitted together. Bundles are stored per block in
 * midgard_block.bundles once the scheduler has run. */
typedef struct midgard_bundle {
        /* Tag for the overall bundle */
        int tag;

        /* Instructions contained by the bundle; at most 5 fit */
        int instruction_count;
        midgard_instruction instructions[5];

        /* Bundle-wide ALU configuration */
        int padding;
        int control;
        bool has_embedded_constants;
        float constants[4];
        bool has_blend_constant;

        /* NOTE(review): presumably the packed register words of the bundle,
         * with the number of valid entries alongside; the code that fills
         * them is not visible here */
        uint16_t register_words[8];
        int register_words_count;

        /* NOTE(review): presumably the packed instruction bodies, each with
         * its own size, and the number of valid entries; the code that fills
         * them is not visible here */
        uint64_t body_words[8];
        size_t body_size[8];
        int body_words_count;
} midgard_bundle;
 357
/* All state carried through the compilation of a single shader */
typedef struct compiler_context {
        nir_shader *nir;
        gl_shader_stage stage;

        /* Is internally a blend shader? Depends on stage == FRAGMENT */
        bool is_blend;

        /* Tracking for blend constant patching */
        int blend_constant_number;
        int blend_constant_offset;

        /* Current NIR function */
        nir_function *func;

        /* Unordered list of midgard_blocks */
        int block_count;
        struct list_head blocks;

        midgard_block *initial_block;
        midgard_block *previous_source_block;
        midgard_block *final_block;

        /* Block currently being emitted into; emit_mir_instruction() appends
         * to its instruction list */
        midgard_block *current_block;

        /* The index corresponding to the current loop, e.g. for breaks/continues */
        int current_loop;

        /* Constants which have been loaded, for later inlining */
        struct hash_table_u64 *ssa_constants;

        /* SSA indices to be outputted to corresponding varying offset */
        struct hash_table_u64 *ssa_varyings;

        /* SSA values / registers which have been aliased. Naively, these
         * demand a fmov output; instead, we alias them in a later pass to
         * avoid the wasted op.
         *
         * A note on encoding: to avoid dynamic memory management here, rather
         * than mapping to a pointer, we map to the source index; the key
         * itself is just the destination index. */

        struct hash_table_u64 *ssa_to_alias;
        struct set *leftover_ssa_to_alias;

        /* Actual SSA-to-register for RA */
        struct hash_table_u64 *ssa_to_register;

        /* Mapping of hashes computed from NIR indices to the sequential temp
         * indices ultimately used in MIR */
        struct hash_table_u64 *hash_to_temp;
        int temp_count;
        int max_hash;

        /* Uniform IDs for mdg */
        struct hash_table_u64 *uniform_nir_to_mdg;
        int uniform_count;

        struct hash_table_u64 *varying_nir_to_mdg;
        int varying_count;

        /* Just the count of the max register used. Higher count => higher
         * register pressure */
        int work_registers;

        /* Used for cont/last hinting. Increase when a tex op is added.
         * Decrease when a tex op is removed. */
        int texture_op_count;

        /* Mapping of texture register -> SSA index for unaliasing */
        int texture_index[2];

        /* Count of special uniforms (viewport, etc) in vec4 units */
        int special_uniforms;

        /* If any path hits a discard instruction */
        bool can_discard;

        /* The number of uniforms allowable for the fast path */
        int uniform_cutoff;

        /* Count of instructions emitted from NIR overall, across all blocks */
        int instruction_count;

        /* Alpha ref value passed in */
        float alpha_ref;

        /* The index corresponding to the fragment output */
        unsigned fragment_output;
} compiler_context;
 447
 448 /* Append instruction to end of current block */
 449
 450 static midgard_instruction *
 451 mir_upload_ins(struct midgard_instruction ins)
 452 {
 453         midgard_instruction *heap = malloc(sizeof(ins));
 454         memcpy(heap, &ins, sizeof(ins));
 455         return heap;
 456 }
 457
 458 static void
 459 emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins)
 460 {
 461         list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions);
 462 }
 463
 464 static void
 465 mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins)
 466 {
 467         list_addtail(&(mir_upload_ins(ins))->link, &tag->link);
 468 }
 469
 470 static void
 471 mir_remove_instruction(struct midgard_instruction *ins)
 472 {
 473         list_del(&ins->link);
 474 }
 475
 476 static midgard_instruction*
 477 mir_prev_op(struct midgard_instruction *ins)
 478 {
 479         return list_last_entry(&(ins->link), midgard_instruction, link);
 480 }
 481
 482 static midgard_instruction*
 483 mir_next_op(struct midgard_instruction *ins)
 484 {
 485         return list_first_entry(&(ins->link), midgard_instruction, link);
 486 }
 487
 488 static midgard_block *
 489 mir_next_block(struct midgard_block *blk)
 490 {
 491         return list_first_entry(&(blk->link), midgard_block, link);
 492 }
 493
 494
 495 #define mir_foreach_block(ctx, v) list_for_each_entry(struct midgard_block, v, &ctx->blocks, link)
 496 #define mir_foreach_block_from(ctx, from, v) list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link)
 497
 498 #define mir_foreach_instr(ctx, v) list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link)
 499 #define mir_foreach_instr_safe(ctx, v) list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link)
 500 #define mir_foreach_instr_in_block(block, v) list_for_each_entry(struct midgard_instruction, v, &block->instructions, link)
 501 #define mir_foreach_instr_in_block_safe(block, v) list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link)
 502 #define mir_foreach_instr_in_block_safe_rev(block, v) list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link)
 503 #define mir_foreach_instr_in_block_from(block, v, from) list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link)
 504
 505
 506 static midgard_instruction *
 507 mir_last_in_block(struct midgard_block *block)
 508 {
 509         return list_last_entry(&block->instructions, struct midgard_instruction, link);
 510 }
 511
 512 static midgard_block *
 513 mir_get_block(compiler_context *ctx, int idx)
 514 {
 515         struct list_head *lst = &ctx->blocks;
 516
 517         while ((idx--) + 1)
 518                 lst = lst->next;
 519
 520         return (struct midgard_block *) lst;
 521 }
 522
 523 /* Pretty printer for internal Midgard IR */
 524
 525 static void
 526 print_mir_source(int source)
 527 {
 528         if (source >= SSA_FIXED_MINIMUM) {
 529                 /* Specific register */
 530                 int reg = SSA_REG_FROM_FIXED(source);
 531
 532                 /* TODO: Moving threshold */
 533                 if (reg > 16 && reg < 24)
 534                         printf("u%d", 23 - reg);
 535                 else
 536                         printf("r%d", reg);
 537         } else {
 538                 printf("%d", source);
 539         }
 540 }
 541
 542 static void
 543 print_mir_instruction(midgard_instruction *ins)
 544 {
 545         printf("\t");
 546
 547         switch (ins->type) {
 548         case TAG_ALU_4: {
 549                 midgard_alu_op op = ins->alu.op;
 550                 const char *name = alu_opcode_names[op];
 551
 552                 if (ins->unit)
 553                         printf("%d.", ins->unit);
 554
 555                 printf("%s", name ? name : "??");
 556                 break;
 557         }
 558
 559         case TAG_LOAD_STORE_4: {
 560                 midgard_load_store_op op = ins->load_store.op;
 561                 const char *name = load_store_opcode_names[op];
 562
 563                 assert(name);
 564                 printf("%s", name);
 565                 break;
 566         }
 567
 568         case TAG_TEXTURE_4: {
 569                 printf("texture");
 570                 break;
 571         }
 572
 573         default:
 574                 assert(0);
 575         }
 576
 577         ssa_args *args = &ins->ssa_args;
 578
 579         printf(" %d, ", args->dest);
 580
 581         print_mir_source(args->src0);
 582         printf(", ");
 583
 584         if (args->inline_constant)
 585                 printf("#%d", ins->inline_constant);
 586         else
 587                 print_mir_source(args->src1);
 588
 589         if (ins->has_constants)
 590                 printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]);
 591
 592         printf("\n");
 593 }
 594
 595 static void
 596 print_mir_block(midgard_block *block)
 597 {
 598         printf("{\n");
 599
 600         mir_foreach_instr_in_block(block, ins) {
 601                 print_mir_instruction(ins);
 602         }
 603
 604         printf("}\n");
 605 }
 606
 607
 608
 609 static void
 610 attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
 611 {
 612         ins->has_constants = true;
 613         memcpy(&ins->constants, constants, 16);
 614
 615         /* If this is the special blend constant, mark this instruction */
 616
 617         if (ctx->is_blend && ctx->blend_constant_number == name)
 618                 ins->has_blend_constant = true;
 619 }
 620
 621 static int
 622 glsl_type_size(const struct glsl_type *type)
 623 {
 624         return glsl_count_attribute_slots(type, false);
 625 }
 626
 627 /* Lower fdot2 to a vector multiplication followed by channel addition  */
 628 static void
 629 midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu)
 630 {
 631         if (alu->op != nir_op_fdot2)
 632                 return;
 633
 634         b->cursor = nir_before_instr(&alu->instr);
 635
 636         nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
 637         nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
 638
 639         nir_ssa_def *product = nir_fmul(b, src0, src1);
 640
 641         nir_ssa_def *sum = nir_fadd(b,
 642                         nir_channel(b, product, 0),
 643                         nir_channel(b, product, 1));
 644
 645         /* Replace the fdot2 with this sum */
 646         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));
 647 }
 648
 649 static bool
 650 midgard_nir_lower_fdot2(nir_shader *shader)
 651 {
 652         bool progress = false;
 653
 654         nir_foreach_function(function, shader) {
 655                 if (!function->impl) continue;
 656
 657                 nir_builder _b;
 658                 nir_builder *b = &_b;
 659                 nir_builder_init(b, function->impl);
 660
 661                 nir_foreach_block(block, function->impl) {
 662                         nir_foreach_instr_safe(instr, block) {
 663                                 if (instr->type != nir_instr_type_alu) continue;
 664
 665                                 nir_alu_instr *alu = nir_instr_as_alu(instr);
 666                                 midgard_nir_lower_fdot2_body(b, alu);
 667
 668                                 progress |= true;
 669                         }
 670                 }
 671
 672                 nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance);
 673
 674         }
 675
 676         return progress;
 677 }
 678
/* Run the NIR lowering and optimisation pipeline on a shader, in place.
 *
 * The stages, in order: one-off lowerings (registers to SSA, fdot2, rect
 * textures); a main loop repeated until no pass reports progress; trig
 * scaling followed by a short cleanup loop; late algebraic optimisation and
 * source-modifier lowering; and finally conversion out of SSA with vector
 * write combining. The order of the passes is significant, so do not reorder
 * them casually. */
static void
optimise_nir(nir_shader *nir)
{
        /* Only ever set to true by NIR_PASS; each loop resets it before use.
         * The value left over from passes outside the loops is not read. */
        bool progress;

        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
        NIR_PASS(progress, nir, midgard_nir_lower_fdot2);

        nir_lower_tex_options lower_tex_options = {
                .lower_rect = true
        };

        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);

        /* Main optimisation loop: iterate to a fixed point */
        do {
                progress = false;

                NIR_PASS(progress, nir, midgard_nir_lower_algebraic);
                NIR_PASS(progress, nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
                NIR_PASS(progress, nir, nir_lower_var_copies);
                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);

                NIR_PASS(progress, nir, nir_copy_prop);
                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_dead_cf);
                NIR_PASS(progress, nir, nir_opt_cse);
                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
                NIR_PASS(progress, nir, nir_opt_algebraic);
                NIR_PASS(progress, nir, nir_opt_constant_folding);
                NIR_PASS(progress, nir, nir_opt_undef);
                NIR_PASS(progress, nir, nir_opt_loop_unroll,
                         nir_var_shader_in |
                         nir_var_shader_out |
                         nir_var_function_temp);

                /* TODO: Enable vectorize when merged upstream */
                // NIR_PASS(progress, nir, nir_opt_vectorize);
        } while (progress);

        /* Must be run at the end to prevent creation of fsin/fcos ops */
        NIR_PASS(progress, nir, midgard_nir_scale_trig);

        /* Clean up after trig scaling, again to a fixed point */
        do {
                progress = false;

                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_algebraic);
                NIR_PASS(progress, nir, nir_opt_constant_folding);
                NIR_PASS(progress, nir, nir_copy_prop);
        } while (progress);

        NIR_PASS(progress, nir, nir_opt_algebraic_late);

        /* Lower mods */
        NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
        NIR_PASS(progress, nir, nir_copy_prop);
        NIR_PASS(progress, nir, nir_opt_dce);

        /* Take us out of SSA */
        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
        NIR_PASS(progress, nir, nir_convert_from_ssa, true);

        /* We are a vector architecture; write combine where possible */
        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
        NIR_PASS(progress, nir, nir_lower_vec_to_movs);

        NIR_PASS(progress, nir, nir_opt_dce);
}
 747
 748 /* Front-half of aliasing the SSA slots, merely by inserting the flag in the
 749  * appropriate hash table. Intentional off-by-one to avoid confusing NULL with
 750  * r0. See the comments in compiler_context */
 751
 752 static void
 753 alias_ssa(compiler_context *ctx, int dest, int src)
 754 {
 755         _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1));
 756         _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1));
 757 }
 758
 759 /* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */
 760
 761 static void
 762 unalias_ssa(compiler_context *ctx, int dest)
 763 {
 764         _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1);
 765         /* TODO: Remove from leftover or no? */
 766 }
 767
 768 static void
 769 midgard_pin_output(compiler_context *ctx, int index, int reg)
 770 {
 771         _mesa_hash_table_u64_insert(ctx->ssa_to_register, index + 1, (void *) ((uintptr_t) reg + 1));
 772 }
 773
 774 static bool
 775 midgard_is_pinned(compiler_context *ctx, int index)
 776 {
 777         return _mesa_hash_table_u64_search(ctx->ssa_to_register, index + 1) != NULL;
 778 }
 779
 780 /* Do not actually emit a load; instead, cache the constant for inlining */
 781
 782 static void
 783 emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
 784 {
 785         nir_ssa_def def = instr->def;
 786
 787         float *v = ralloc_array(NULL, float, 4);
 788         memcpy(v, &instr->value.f32, 4 * sizeof(float));
 789         _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
 790 }
 791
 792 /* Duplicate bits to convert sane 4-bit writemask to obscure 8-bit format (or
 793  * do the inverse) */
 794
 795 static unsigned
 796 expand_writemask(unsigned mask)
 797 {
 798         unsigned o = 0;
 799
 800         for (int i = 0; i < 4; ++i)
 801                 if (mask & (1 << i))
 802                         o |= (3 << (2 * i));
 803
 804         return o;
 805 }
 806
 807 static unsigned
 808 squeeze_writemask(unsigned mask)
 809 {
 810         unsigned o = 0;
 811
 812         for (int i = 0; i < 4; ++i)
 813                 if (mask & (3 << (2 * i)))
 814                         o |= (1 << i);
 815
 816         return o;
 817
 818 }
 819
 820 /* Determines effective writemask, taking quirks and expansion into account */
 821 static unsigned
 822 effective_writemask(midgard_vector_alu *alu)
 823 {
 824         /* Channel count is off-by-one to fit in two-bits (0 channel makes no
 825          * sense) */
 826
 827         unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op]);
 828
 829         /* If there is a fixed channel count, construct the appropriate mask */
 830
 831         if (channel_count)
 832                 return (1 << channel_count) - 1;
 833
 834         /* Otherwise, just squeeze the existing mask */
 835         return squeeze_writemask(alu->mask);
 836 }
 837
 838 static unsigned
 839 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
 840 {
 841         if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
 842                 return hash;
 843
 844         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->hash_to_temp, hash + 1);
 845
 846         if (temp)
 847                 return temp - 1;
 848
 849         /* If no temp is find, allocate one */
 850         temp = ctx->temp_count++;
 851         ctx->max_hash = MAX2(ctx->max_hash, hash);
 852
 853         _mesa_hash_table_u64_insert(ctx->hash_to_temp, hash + 1, (void *) ((uintptr_t) temp + 1));
 854
 855         return temp;
 856 }
 857
 858 static unsigned
 859 nir_src_index(compiler_context *ctx, nir_src *src)
 860 {
 861         if (src->is_ssa)
 862                 return src->ssa->index;
 863         else
 864                 return ctx->func->impl->ssa_alloc + src->reg.reg->index;
 865 }
 866
 867 static unsigned
 868 nir_dest_index(compiler_context *ctx, nir_dest *dst)
 869 {
 870         if (dst->is_ssa)
 871                 return dst->ssa.index;
 872         else
 873                 return ctx->func->impl->ssa_alloc + dst->reg.reg->index;
 874 }
 875
 876 static unsigned
 877 nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
 878 {
 879         return nir_src_index(ctx, &src->src);
 880 }
 881
 882 /* Midgard puts conditionals in r31.w; move an arbitrary source (the output of
 883  * a conditional test) into that register */
 884
 885 static void
 886 emit_condition(compiler_context *ctx, nir_src *src, bool for_branch)
 887 {
 888         /* XXX: Force component correct */
 889         int condition = nir_src_index(ctx, src);
 890
 891         const midgard_vector_alu_src alu_src = {
 892                 .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X),
 893         };
 894
 895         /* There is no boolean move instruction. Instead, we simulate a move by
 896          * ANDing the condition with itself to get it into r31.w */
 897
 898         midgard_instruction ins = {
 899                 .type = TAG_ALU_4,
 900                 .unit = for_branch ? UNIT_SMUL : UNIT_SADD, /* TODO: DEDUCE THIS */
 901                 .ssa_args = {
 902                         .src0 = condition,
 903                         .src1 = condition,
 904                         .dest = SSA_FIXED_REGISTER(31),
 905                 },
 906                 .alu = {
 907                         .op = midgard_alu_op_iand,
 908                         .reg_mode = midgard_reg_mode_full,
 909                         .dest_override = midgard_dest_override_none,
 910                         .mask = (0x3 << 6), /* w */
 911                         .src1 = vector_alu_srco_unsigned(alu_src),
 912                         .src2 = vector_alu_srco_unsigned(alu_src)
 913                 },
 914         };
 915
 916         emit_mir_instruction(ctx, ins);
 917 }
 918
 919 #define ALU_CASE(nir, _op) \
 920         case nir_op_##nir: \
 921                 op = midgard_alu_op_##_op; \
 922                 break;
 923
 924 static void
 925 emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 926 {
 927         bool is_ssa = instr->dest.dest.is_ssa;
 928
 929         unsigned dest = nir_dest_index(ctx, &instr->dest.dest);
 930         unsigned nr_components = is_ssa ? instr->dest.dest.ssa.num_components : instr->dest.dest.reg.reg->num_components;
 931         unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
 932
 933         /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are
 934          * supported. A few do not and are commented for now. Also, there are a
 935          * number of NIR ops which Midgard does not support and need to be
 936          * lowered, also TODO. This switch block emits the opcode and calling
 937          * convention of the Midgard instruction; actual packing is done in
 938          * emit_alu below */
 939
 940         unsigned op;
 941
 942         switch (instr->op) {
 943                 ALU_CASE(fadd, fadd);
 944                 ALU_CASE(fmul, fmul);
 945                 ALU_CASE(fmin, fmin);
 946                 ALU_CASE(fmax, fmax);
 947                 ALU_CASE(imin, imin);
 948                 ALU_CASE(imax, imax);
 949                 ALU_CASE(fmov, fmov);
 950                 ALU_CASE(ffloor, ffloor);
 951                 ALU_CASE(fceil, fceil);
 952                 ALU_CASE(fdot3, fdot3);
 953                 ALU_CASE(fdot4, fdot4);
 954                 ALU_CASE(iadd, iadd);
 955                 ALU_CASE(isub, isub);
 956                 ALU_CASE(imul, imul);
 957
 958                 /* XXX: Use fmov, not imov, since imov was causing major
 959                  * issues with texture precision? XXX research */
 960                 ALU_CASE(imov, fmov);
 961
 962                 ALU_CASE(feq, feq);
 963                 ALU_CASE(fne, fne);
 964                 ALU_CASE(flt, flt);
 965                 ALU_CASE(ieq, ieq);
 966                 ALU_CASE(ine, ine);
 967                 ALU_CASE(ilt, ilt);
 968
 969                 ALU_CASE(frcp, frcp);
 970                 ALU_CASE(frsq, frsqrt);
 971                 ALU_CASE(fsqrt, fsqrt);
 972                 ALU_CASE(fexp2, fexp2);
 973                 ALU_CASE(flog2, flog2);
 974
 975                 ALU_CASE(f2i32, f2i);
 976                 ALU_CASE(f2u32, f2u);
 977                 ALU_CASE(i2f32, i2f);
 978                 ALU_CASE(u2f32, u2f);
 979
 980                 ALU_CASE(fsin, fsin);
 981                 ALU_CASE(fcos, fcos);
 982
 983                 ALU_CASE(iand, iand);
 984                 ALU_CASE(ior, ior);
 985                 ALU_CASE(ixor, ixor);
 986                 ALU_CASE(inot, inot);
 987                 ALU_CASE(ishl, ishl);
 988                 ALU_CASE(ishr, iasr);
 989                 ALU_CASE(ushr, ilsr);
 990
 991                 ALU_CASE(ball_fequal4, fball_eq);
 992                 ALU_CASE(bany_fnequal4, fbany_neq);
 993                 ALU_CASE(ball_iequal4, iball_eq);
 994                 ALU_CASE(bany_inequal4, ibany_neq);
 995
 996         /* For greater-or-equal, we use less-or-equal and flip the
 997          * arguments */
 998
 999         case nir_op_ige: {
1000                 op = midgard_alu_op_ile;
1001
1002                 /* Swap via temporary */
1003                 nir_alu_src temp = instr->src[1];
1004                 instr->src[1] = instr->src[0];
1005                 instr->src[0] = temp;
1006
1007                 break;
1008         }
1009
1010         case nir_op_bcsel: {
1011                 op = midgard_alu_op_fcsel;
1012
1013                 /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
1014                 nr_inputs = 2;
1015
1016                 emit_condition(ctx, &instr->src[0].src, false);
1017
1018                 /* The condition is the first argument; move the other
1019                  * arguments up one to be a binary instruction for
1020                  * Midgard */
1021
1022                 memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
1023                 break;
1024         }
1025
1026         /* We don't have a native b2f32 instruction. Instead, like many GPUs,
1027          * we exploit booleans as 0/~0 for false/true, and correspondingly AND
1028          * by 1.0 to do the type conversion. For the moment, prime us to emit:
1029          *
1030          * iand [whatever], #0
1031          *
1032          * At the end of emit_alu (as MIR), we'll fix-up the constant */
1033
1034         case nir_op_b2f32: {
1035                 op = midgard_alu_op_iand;
1036                 break;
1037         }
1038
1039         default:
1040                 printf("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
1041                 assert(0);
1042                 return;
1043         }
1044
1045         /* Fetch unit, quirks, etc information */
1046         unsigned opcode_props = alu_opcode_props[op];
1047         bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
1048
1049         /* Initialise fields common between scalar/vector instructions */
1050         midgard_outmod outmod = instr->dest.saturate ? midgard_outmod_sat : midgard_outmod_none;
1051
1052         /* src0 will always exist afaik, but src1 will not for 1-argument
1053          * instructions. The latter can only be fetched if the instruction
1054          * needs it, or else we may segfault. */
1055
1056         unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]);
1057         unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0;
1058
1059         /* Rather than use the instruction generation helpers, we do it
1060          * ourselves here to avoid the mess */
1061
1062         midgard_instruction ins = {
1063                 .type = TAG_ALU_4,
1064                 .ssa_args = {
1065                         .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0,
1066                         .src1 = quirk_flipped_r24 ? src0         : src1,
1067                         .dest = dest,
1068                         .inline_constant = (nr_inputs == 1) && !quirk_flipped_r24
1069                 }
1070         };
1071
1072         nir_alu_src *nirmods[2] = { NULL };
1073
1074         if (nr_inputs == 2) {
1075                 nirmods[0] = &instr->src[0];
1076                 nirmods[1] = &instr->src[1];
1077         } else if (nr_inputs == 1) {
1078                 nirmods[quirk_flipped_r24] = &instr->src[0];
1079         } else {
1080                 assert(0);
1081         }
1082
1083         midgard_vector_alu alu = {
1084                 .op = op,
1085                 .reg_mode = midgard_reg_mode_full,
1086                 .dest_override = midgard_dest_override_none,
1087                 .outmod = outmod,
1088
1089                 /* Writemask only valid for non-SSA NIR */
1090                 .mask = expand_writemask((1 << nr_components) - 1),
1091
1092                 .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0])),
1093                 .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1])),
1094         };
1095
1096         /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */
1097
1098         if (!is_ssa)
1099                 alu.mask &= expand_writemask(instr->dest.write_mask);
1100
1101         ins.alu = alu;
1102
1103         /* Late fixup for emulated instructions */
1104
1105         if (instr->op == nir_op_b2f32) {
1106                 /* Presently, our second argument is an inline #0 constant.
1107                  * Switch over to an embedded 1.0 constant (that can't fit
1108                  * inline, since we're 32-bit, not 16-bit like the inline
1109                  * constants) */
1110
1111                 ins.ssa_args.inline_constant = false;
1112                 ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1113                 ins.has_constants = true;
1114                 ins.constants[0] = 1.0;
1115         }
1116
1117         if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
1118                 /* To avoid duplicating the lookup tables (probably), true LUT
1119                  * instructions can only operate as if they were scalars. Lower
1120                  * them here by changing the component. */
1121
1122                 uint8_t original_swizzle[4];
1123                 memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle));
1124
1125                 for (int i = 0; i < nr_components; ++i) {
1126                         ins.alu.mask = (0x3) << (2 * i); /* Mask the associated component */
1127
1128                         for (int j = 0; j < 4; ++j)
1129                                 nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */
1130
1131                         ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0]));
1132                         emit_mir_instruction(ctx, ins);
1133                 }
1134         } else {
1135                 emit_mir_instruction(ctx, ins);
1136         }
1137 }
1138
1139 #undef ALU_CASE
1140
1141 static void
1142 emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
1143 {
1144         nir_const_value *const_offset;
1145         unsigned offset, reg;
1146
1147         switch (instr->intrinsic) {
1148         case nir_intrinsic_discard_if:
1149                 emit_condition(ctx, &instr->src[0], true);
1150
1151         /* fallthrough */
1152
1153         case nir_intrinsic_discard: {
1154                 midgard_condition cond = instr->intrinsic == nir_intrinsic_discard_if ?
1155                                          midgard_condition_true : midgard_condition_always;
1156
1157                 EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_discard, 0, 2, cond);
1158                 ctx->can_discard = true;
1159                 break;
1160         }
1161
1162         case nir_intrinsic_load_uniform:
1163         case nir_intrinsic_load_input:
1164                 const_offset = nir_src_as_const_value(instr->src[0]);
1165                 assert (const_offset && "no indirect inputs");
1166
1167                 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1168
1169                 reg = nir_dest_index(ctx, &instr->dest);
1170
1171                 if (instr->intrinsic == nir_intrinsic_load_uniform && !ctx->is_blend) {
1172                         /* TODO: half-floats */
1173
1174                         int uniform_offset = 0;
1175
1176                         if (offset >= SPECIAL_UNIFORM_BASE) {
1177                                 /* XXX: Resolve which uniform */
1178                                 uniform_offset = 0;
1179                         } else {
1180                                 /* Offset away from the special
1181                                  * uniform block */
1182
1183                                 void *entry = _mesa_hash_table_u64_search(ctx->uniform_nir_to_mdg, offset + 1);
1184
1185                                 /* XXX */
1186                                 if (!entry) {
1187                                         printf("WARNING: Unknown uniform %d\n", offset);
1188                                         break;
1189                                 }
1190
1191                                 uniform_offset = (uintptr_t) (entry) - 1;
1192                                 uniform_offset += ctx->special_uniforms;
1193                         }
1194
1195                         if (uniform_offset < ctx->uniform_cutoff) {
1196                                 /* Fast path: For the first 16 uniform,
1197                                  * accesses are 0-cycle, since they're
1198                                  * just a register fetch in the usual
1199                                  * case.  So, we alias the registers
1200                                  * while we're still in SSA-space */
1201
1202                                 int reg_slot = 23 - uniform_offset;
1203                                 alias_ssa(ctx, reg, SSA_FIXED_REGISTER(reg_slot));
1204                         } else {
1205                                 /* Otherwise, read from the 'special'
1206                                  * UBO to access higher-indexed
1207                                  * uniforms, at a performance cost */
1208
1209                                 midgard_instruction ins = m_load_uniform_32(reg, uniform_offset);
1210
1211                                 /* TODO: Don't split */
1212                                 ins.load_store.varying_parameters = (uniform_offset & 7) << 7;
1213                                 ins.load_store.address = uniform_offset >> 3;
1214
1215                                 ins.load_store.unknown = 0x1E00; /* xxx: what is this? */
1216                                 emit_mir_instruction(ctx, ins);
1217                         }
1218                 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) {
1219                         /* XXX: Half-floats? */
1220                         /* TODO: swizzle, mask */
1221
1222                         midgard_instruction ins = m_load_vary_32(reg, offset);
1223
1224                         midgard_varying_parameter p = {
1225                                 .is_varying = 1,
1226                                 .interpolation = midgard_interp_default,
1227                                 .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0
1228                         };
1229
1230                         unsigned u;
1231                         memcpy(&u, &p, sizeof(p));
1232                         ins.load_store.varying_parameters = u;
1233
1234                         ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
1235                         emit_mir_instruction(ctx, ins);
1236                 } else if (ctx->is_blend && instr->intrinsic == nir_intrinsic_load_uniform) {
1237                         /* Constant encoded as a pinned constant */
1238
1239                         midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg);
1240                         ins.has_constants = true;
1241                         ins.has_blend_constant = true;
1242                         emit_mir_instruction(ctx, ins);
1243                 } else if (ctx->is_blend) {
1244                         /* For blend shaders, a load might be
1245                          * translated various ways depending on what
1246                          * we're loading. Figure out how this is used */
1247
1248                         nir_variable *out = NULL;
1249
1250                         nir_foreach_variable(var, &ctx->nir->inputs) {
1251                                 int drvloc = var->data.driver_location;
1252
1253                                 if (nir_intrinsic_base(instr) == drvloc) {
1254                                         out = var;
1255                                         break;
1256                                 }
1257                         }
1258
1259                         assert(out);
1260
1261                         if (out->data.location == VARYING_SLOT_COL0) {
1262                                 /* Source color preloaded to r0 */
1263
1264                                 midgard_pin_output(ctx, reg, 0);
1265                         } else if (out->data.location == VARYING_SLOT_COL1) {
1266                                 /* Destination color must be read from framebuffer */
1267
1268                                 midgard_instruction ins = m_load_color_buffer_8(reg, 0);
1269                                 ins.load_store.swizzle = 0; /* xxxx */
1270
1271                                 /* Read each component sequentially */
1272
1273                                 for (int c = 0; c < 4; ++c) {
1274                                         ins.load_store.mask = (1 << c);
1275                                         ins.load_store.unknown = c;
1276                                         emit_mir_instruction(ctx, ins);
1277                                 }
1278
1279                                 /* vadd.u2f hr2, abs(hr2), #0 */
1280
1281                                 midgard_vector_alu_src alu_src = blank_alu_src;
1282                                 alu_src.abs = true;
1283                                 alu_src.half = true;
1284
1285                                 midgard_instruction u2f = {
1286                                         .type = TAG_ALU_4,
1287                                         .ssa_args = {
1288                                                 .src0 = reg,
1289                                                 .src1 = SSA_UNUSED_0,
1290                                                 .dest = reg,
1291                                                 .inline_constant = true
1292                                         },
1293                                         .alu = {
1294                                                 .op = midgard_alu_op_u2f,
1295                                                 .reg_mode = midgard_reg_mode_half,
1296                                                 .dest_override = midgard_dest_override_none,
1297                                                 .mask = 0xF,
1298                                                 .src1 = vector_alu_srco_unsigned(alu_src),
1299                                                 .src2 = vector_alu_srco_unsigned(blank_alu_src),
1300                                         }
1301                                 };
1302
1303                                 emit_mir_instruction(ctx, u2f);
1304
1305                                 /* vmul.fmul.sat r1, hr2, #0.00392151 */
1306
1307                                 alu_src.abs = false;
1308
1309                                 midgard_instruction fmul = {
1310                                         .type = TAG_ALU_4,
1311                                         .inline_constant = _mesa_float_to_half(1.0 / 255.0),
1312                                         .ssa_args = {
1313                                                 .src0 = reg,
1314                                                 .dest = reg,
1315                                                 .src1 = SSA_UNUSED_0,
1316                                                 .inline_constant = true
1317                                         },
1318                                         .alu = {
1319                                                 .op = midgard_alu_op_fmul,
1320                                                 .reg_mode = midgard_reg_mode_full,
1321                                                 .dest_override = midgard_dest_override_none,
1322                                                 .outmod = midgard_outmod_sat,
1323                                                 .mask = 0xFF,
1324                                                 .src1 = vector_alu_srco_unsigned(alu_src),
1325                                                 .src2 = vector_alu_srco_unsigned(blank_alu_src),
1326                                         }
1327                                 };
1328
1329                                 emit_mir_instruction(ctx, fmul);
1330                         } else {
1331                                 printf("Unknown input in blend shader\n");
1332                                 assert(0);
1333                         }
1334                 } else if (ctx->stage == MESA_SHADER_VERTEX) {
1335                         midgard_instruction ins = m_load_attr_32(reg, offset);
1336                         ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */
1337                         ins.load_store.mask = (1 << instr->num_components) - 1;
1338                         emit_mir_instruction(ctx, ins);
1339                 } else {
1340                         printf("Unknown load\n");
1341                         assert(0);
1342                 }
1343
1344                 break;
1345
1346         case nir_intrinsic_store_output:
1347                 const_offset = nir_src_as_const_value(instr->src[1]);
1348                 assert(const_offset && "no indirect outputs");
1349
1350                 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1351
1352                 reg = nir_src_index(ctx, &instr->src[0]);
1353
1354                 if (ctx->stage == MESA_SHADER_FRAGMENT) {
1355                         /* gl_FragColor is not emitted with load/store
1356                          * instructions. Instead, it gets plonked into
1357                          * r0 at the end of the shader and we do the
1358                          * framebuffer writeout dance. TODO: Defer
1359                          * writes */
1360
1361                         midgard_pin_output(ctx, reg, 0);
1362
1363                         /* Save the index we're writing to for later reference
1364                          * in the epilogue */
1365
1366                         ctx->fragment_output = reg;
1367                 } else if (ctx->stage == MESA_SHADER_VERTEX) {
1368                         /* Varyings are written into one of two special
1369                          * varying register, r26 or r27. The register itself is selected as the register
1370                          * in the st_vary instruction, minus the base of 26. E.g. write into r27 and then call st_vary(1)
1371                          *
1372                          * Normally emitting fmov's is frowned upon,
1373                          * but due to unique constraints of
1374                          * REGISTER_VARYING, fmov emission + a
1375                          * dedicated cleanup pass is the only way to
1376                          * guarantee correctness when considering some
1377                          * (common) edge cases XXX: FIXME */
1378
1379                         /* Look up how it was actually laid out */
1380
1381                         void *entry = _mesa_hash_table_u64_search(ctx->varying_nir_to_mdg, offset + 1);
1382
1383                         if (!entry) {
1384                                 printf("WARNING: skipping varying\n");
1385                                 break;
1386                         }
1387
1388                         offset = (uintptr_t) (entry) - 1;
1389
1390                         /* If this varying corresponds to a constant (why?!),
1391                          * emit that now since it won't get picked up by
1392                          * hoisting (since there is no corresponding move
1393                          * emitted otherwise) */
1394
1395                         void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, reg + 1);
1396
1397                         if (constant_value) {
1398                                 /* Special case: emit the varying write
1399                                  * directly to r26 (looks funny in asm but it's
1400                                  * fine) and emit the store _now_. Possibly
1401                                  * slightly slower, but this is a really stupid
1402                                  * special case anyway (why on earth would you
1403                                  * have a constant varying? Your own fault for
1404                                  * slightly worse perf :P) */
1405
1406                                 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(26));
1407                                 attach_constants(ctx, &ins, constant_value, reg + 1);
1408                                 emit_mir_instruction(ctx, ins);
1409
1410                                 midgard_instruction st = m_store_vary_32(SSA_FIXED_REGISTER(0), offset);
1411                                 st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
1412                                 emit_mir_instruction(ctx, st);
1413                         } else {
1414                                 /* Do not emit the varying yet -- instead, just mark down that we need to later */
1415
1416                                 _mesa_hash_table_u64_insert(ctx->ssa_varyings, reg + 1, (void *) ((uintptr_t) (offset + 1)));
1417                         }
1418                 } else {
1419                         printf("Unknown store\n");
1420                         assert(0);
1421                 }
1422
1423                 break;
1424
1425         case nir_intrinsic_load_alpha_ref_float:
1426                 assert(instr->dest.is_ssa);
1427
1428                 float ref_value = ctx->alpha_ref;
1429
1430                 float *v = ralloc_array(NULL, float, 4);
1431                 memcpy(v, &ref_value, sizeof(float));
1432                 _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v);
1433                 break;
1434
1435
1436         default:
1437                 printf ("Unhandled intrinsic\n");
1438                 assert(0);
1439                 break;
1440         }
1441 }
1442
1443 static unsigned
1444 midgard_tex_format(enum glsl_sampler_dim dim)
1445 {
1446         switch (dim) {
1447         case GLSL_SAMPLER_DIM_2D:
1448         case GLSL_SAMPLER_DIM_EXTERNAL:
1449                 return TEXTURE_2D;
1450
1451         case GLSL_SAMPLER_DIM_3D:
1452                 return TEXTURE_3D;
1453
1454         case GLSL_SAMPLER_DIM_CUBE:
1455                 return TEXTURE_CUBE;
1456
1457         default:
1458                 printf("Unknown sampler dim type\n");
1459                 assert(0);
1460                 return 0;
1461         }
1462 }
1463
1464 static void
1465 emit_tex(compiler_context *ctx, nir_tex_instr *instr)
1466 {
1467         /* TODO */
1468         //assert (!instr->sampler);
1469         //assert (!instr->texture_array_size);
1470         assert (instr->op == nir_texop_tex);
1471
1472         /* Allocate registers via a round robin scheme to alternate between the two registers */
1473         int reg = ctx->texture_op_count & 1;
1474         int in_reg = reg, out_reg = reg;
1475
1476         /* Make room for the reg */
1477
1478         if (ctx->texture_index[reg] > -1)
1479                 unalias_ssa(ctx, ctx->texture_index[reg]);
1480
1481         int texture_index = instr->texture_index;
1482         int sampler_index = texture_index;
1483
1484         for (unsigned i = 0; i < instr->num_srcs; ++i) {
1485                 switch (instr->src[i].src_type) {
1486                 case nir_tex_src_coord: {
1487                         int index = nir_src_index(ctx, &instr->src[i].src);
1488
1489                         midgard_vector_alu_src alu_src = blank_alu_src;
1490                         alu_src.swizzle = (COMPONENT_Y << 2);
1491
1492                         midgard_instruction ins = v_fmov(index, alu_src, SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg));
1493                         emit_mir_instruction(ctx, ins);
1494
1495                         //midgard_pin_output(ctx, index, REGISTER_TEXTURE_BASE + in_reg);
1496
1497                         break;
1498                 }
1499
1500                 default: {
1501                         printf("Unknown source type\n");
1502                         //assert(0);
1503                         break;
1504                 }
1505                 }
1506         }
1507
1508         /* No helper to build texture words -- we do it all here */
1509         midgard_instruction ins = {
1510                 .type = TAG_TEXTURE_4,
1511                 .texture = {
1512                         .op = TEXTURE_OP_NORMAL,
1513                         .format = midgard_tex_format(instr->sampler_dim),
1514                         .texture_handle = texture_index,
1515                         .sampler_handle = sampler_index,
1516
1517                         /* TODO: Don't force xyzw */
1518                         .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
1519                         .mask = 0xF,
1520
1521                         /* TODO: half */
1522                         //.in_reg_full = 1,
1523                         .out_full = 1,
1524
1525                         .filter = 1,
1526
1527                         /* Always 1 */
1528                         .unknown7 = 1,
1529
1530                         /* Assume we can continue; hint it out later */
1531                         .cont = 1,
1532                 }
1533         };
1534
1535         /* Set registers to read and write from the same place */
1536         ins.texture.in_reg_select = in_reg;
1537         ins.texture.out_reg_select = out_reg;
1538
1539         /* TODO: Dynamic swizzle input selection, half-swizzles? */
1540         if (instr->sampler_dim == GLSL_SAMPLER_DIM_3D) {
1541                 ins.texture.in_reg_swizzle_right = COMPONENT_X;
1542                 ins.texture.in_reg_swizzle_left = COMPONENT_Y;
1543                 //ins.texture.in_reg_swizzle_third = COMPONENT_Z;
1544         } else {
1545                 ins.texture.in_reg_swizzle_left = COMPONENT_X;
1546                 ins.texture.in_reg_swizzle_right = COMPONENT_Y;
1547                 //ins.texture.in_reg_swizzle_third = COMPONENT_X;
1548         }
1549
1550         emit_mir_instruction(ctx, ins);
1551
1552         /* Simultaneously alias the destination and emit a move for it. The move will be eliminated if possible */
1553
1554         int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest);
1555         alias_ssa(ctx, o_index, SSA_FIXED_REGISTER(o_reg));
1556         ctx->texture_index[reg] = o_index;
1557
1558         midgard_instruction ins2 = v_fmov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index);
1559         emit_mir_instruction(ctx, ins2);
1560
1561         /* Used for .cont and .last hinting */
1562         ctx->texture_op_count++;
1563 }
1564
1565 static void
1566 emit_jump(compiler_context *ctx, nir_jump_instr *instr)
1567 {
1568         switch (instr->type) {
1569                 case nir_jump_break: {
1570                         /* Emit a branch out of the loop */
1571                         struct midgard_instruction br = v_branch(false, false);
1572                         br.branch.target_type = TARGET_BREAK;
1573                         br.branch.target_break = ctx->current_loop;
1574                         emit_mir_instruction(ctx, br);
1575
1576                         printf("break..\n");
1577                         break;
1578                 }
1579
1580                 default:
1581                         printf("Unknown jump type %d\n", instr->type);
1582                         break;
1583         }
1584 }
1585
1586 static void
1587 emit_instr(compiler_context *ctx, struct nir_instr *instr)
1588 {
1589         switch (instr->type) {
1590         case nir_instr_type_load_const:
1591                 emit_load_const(ctx, nir_instr_as_load_const(instr));
1592                 break;
1593
1594         case nir_instr_type_intrinsic:
1595                 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
1596                 break;
1597
1598         case nir_instr_type_alu:
1599                 emit_alu(ctx, nir_instr_as_alu(instr));
1600                 break;
1601
1602         case nir_instr_type_tex:
1603                 emit_tex(ctx, nir_instr_as_tex(instr));
1604                 break;
1605
1606         case nir_instr_type_jump:
1607                 emit_jump(ctx, nir_instr_as_jump(instr));
1608                 break;
1609
1610         case nir_instr_type_ssa_undef:
1611                 /* Spurious */
1612                 break;
1613
1614         default:
1615                 printf("Unhandled instruction type\n");
1616                 break;
1617         }
1618 }
1619
1620 /* Determine the actual hardware from the index based on the RA results or special values */
1621
1622 static int
1623 dealias_register(compiler_context *ctx, struct ra_graph *g, int reg, int maxreg)
1624 {
1625         if (reg >= SSA_FIXED_MINIMUM)
1626                 return SSA_REG_FROM_FIXED(reg);
1627
1628         if (reg >= 0) {
1629                 assert(reg < maxreg);
1630                 int r = ra_get_node_reg(g, reg);
1631                 ctx->work_registers = MAX2(ctx->work_registers, r);
1632                 return r;
1633         }
1634
1635         switch (reg) {
1636         /* fmov style unused */
1637         case SSA_UNUSED_0:
1638                 return REGISTER_UNUSED;
1639
1640         /* lut style unused */
1641         case SSA_UNUSED_1:
1642                 return REGISTER_UNUSED;
1643
1644         default:
1645                 printf("Unknown SSA register alias %d\n", reg);
1646                 assert(0);
1647                 return 31;
1648         }
1649 }
1650
1651 static unsigned int
1652 midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
1653 {
1654         /* Choose the first available register to minimise reported register pressure */
1655
1656         for (int i = 0; i < 16; ++i) {
1657                 if (BITSET_TEST(regs, i)) {
1658                         return i;
1659                 }
1660         }
1661
1662         assert(0);
1663         return 0;
1664 }
1665
1666 static bool
1667 midgard_is_live_in_instr(midgard_instruction *ins, int src)
1668 {
1669         if (ins->ssa_args.src0 == src) return true;
1670         if (ins->ssa_args.src1 == src) return true;
1671
1672         return false;
1673 }
1674
1675 static bool
1676 is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src)
1677 {
1678         /* Check the rest of the block for liveness */
1679         mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) {
1680                 if (midgard_is_live_in_instr(ins, src))
1681                         return true;
1682         }
1683
1684         /* Check the rest of the blocks for liveness */
1685         mir_foreach_block_from(ctx, mir_next_block(block), b) {
1686                 mir_foreach_instr_in_block(b, ins) {
1687                         if (midgard_is_live_in_instr(ins, src))
1688                                 return true;
1689                 }
1690         }
1691
1692         /* TODO: How does control flow interact in complex shaders? */
1693
1694         return false;
1695 }
1696
1697 static void
1698 allocate_registers(compiler_context *ctx)
1699 {
1700         /* First, initialize the RA */
1701         struct ra_regs *regs = ra_alloc_reg_set(NULL, 32, true);
1702
1703         /* Create a primary (general purpose) class, as well as special purpose
1704          * pipeline register classes */
1705
1706         int primary_class = ra_alloc_reg_class(regs);
1707         int varying_class  = ra_alloc_reg_class(regs);
1708
1709         /* Add the full set of work registers */
1710         int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
1711         for (int i = 0; i < work_count; ++i)
1712                 ra_class_add_reg(regs, primary_class, i);
1713
1714         /* Add special registers */
1715         ra_class_add_reg(regs, varying_class, REGISTER_VARYING_BASE);
1716         ra_class_add_reg(regs, varying_class, REGISTER_VARYING_BASE + 1);
1717
1718         /* We're done setting up */
1719         ra_set_finalize(regs, NULL);
1720
1721         /* Transform the MIR into squeezed index form */
1722         mir_foreach_block(ctx, block) {
1723                 mir_foreach_instr_in_block(block, ins) {
1724                         if (ins->compact_branch) continue;
1725
1726                         ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
1727                         ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
1728                         ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
1729                 }
1730
1731                 print_mir_block(block);
1732         }
1733
1734         /* Let's actually do register allocation */
1735         int nodes = ctx->temp_count;
1736         struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
1737
1738         /* Set everything to the work register class, unless it has somewhere
1739          * special to go */
1740
1741         mir_foreach_block(ctx, block) {
1742                 mir_foreach_instr_in_block(block, ins) {
1743                         if (ins->compact_branch) continue;
1744
1745                         if (ins->ssa_args.dest < 0) continue;
1746
1747                         if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
1748
1749                         int class = primary_class;
1750
1751                         ra_set_node_class(g, ins->ssa_args.dest, class);
1752                 }
1753         }
1754
1755         for (int index = 0; index <= ctx->max_hash; ++index) {
1756                 unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_register, index + 1);
1757
1758                 if (temp) {
1759                         unsigned reg = temp - 1;
1760                         int t = find_or_allocate_temp(ctx, index);
1761                         ra_set_node_reg(g, t, reg);
1762                 }
1763         }
1764
1765         /* Determine liveness */
1766
1767         int *live_start = malloc(nodes * sizeof(int));
1768         int *live_end = malloc(nodes * sizeof(int));
1769
1770         /* Initialize as non-existent */
1771
1772         for (int i = 0; i < nodes; ++i) {
1773                 live_start[i] = live_end[i] = -1;
1774         }
1775
1776         int d = 0;
1777
1778         mir_foreach_block(ctx, block) {
1779                 mir_foreach_instr_in_block(block, ins) {
1780                         if (ins->compact_branch) continue;
1781
1782                         if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
1783                                 /* If this destination is not yet live, it is now since we just wrote it */
1784
1785                                 int dest = ins->ssa_args.dest;
1786
1787                                 if (live_start[dest] == -1)
1788                                         live_start[dest] = d;
1789                         }
1790
1791                         /* Since we just used a source, the source might be
1792                          * dead now. Scan the rest of the block for
1793                          * invocations, and if there are none, the source dies
1794                          * */
1795
1796                         int sources[2] = { ins->ssa_args.src0, ins->ssa_args.src1 };
1797
1798                         for (int src = 0; src < 2; ++src) {
1799                                 int s = sources[src];
1800
1801                                 if (s < 0) continue;
1802
1803                                 if (s >= SSA_FIXED_MINIMUM) continue;
1804
1805                                 if (!is_live_after(ctx, block, ins, s)) {
1806                                         live_end[s] = d;
1807                                 }
1808                         }
1809
1810                         ++d;
1811                 }
1812         }
1813
1814         /* If a node still hasn't been killed, kill it now */
1815
1816         for (int i = 0; i < nodes; ++i) {
1817                 /* live_start == -1 most likely indicates a pinned output */
1818
1819                 if (live_end[i] == -1)
1820                         live_end[i] = d;
1821         }
1822
1823         /* Setup interference between nodes that are live at the same time */
1824
1825         for (int i = 0; i < nodes; ++i) {
1826                 for (int j = i + 1; j < nodes; ++j) {
1827                         if (!(live_start[i] >= live_end[j] || live_start[j] >= live_end[i]))
1828                                 ra_add_node_interference(g, i, j);
1829                 }
1830         }
1831
1832         ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL);
1833
1834         if (!ra_allocate(g)) {
1835                 printf("Error allocating registers\n");
1836                 assert(0);
1837         }
1838
1839         /* Cleanup */
1840         free(live_start);
1841         free(live_end);
1842
1843         mir_foreach_block(ctx, block) {
1844                 mir_foreach_instr_in_block(block, ins) {
1845                         if (ins->compact_branch) continue;
1846
1847                         ssa_args args = ins->ssa_args;
1848
1849                         switch (ins->type) {
1850                         case TAG_ALU_4:
1851                                 ins->registers.src1_reg = dealias_register(ctx, g, args.src0, nodes);
1852
1853                                 ins->registers.src2_imm = args.inline_constant;
1854
1855                                 if (args.inline_constant) {
1856                                         /* Encode inline 16-bit constant as a vector by default */
1857
1858                                         ins->registers.src2_reg = ins->inline_constant >> 11;
1859
1860                                         int lower_11 = ins->inline_constant & ((1 << 12) - 1);
1861
1862                                         uint16_t imm = ((lower_11 >> 8) & 0x7) | ((lower_11 & 0xFF) << 3);
1863                                         ins->alu.src2 = imm << 2;
1864                                 } else {
1865                                         ins->registers.src2_reg = dealias_register(ctx, g, args.src1, nodes);
1866                                 }
1867
1868                                 ins->registers.out_reg = dealias_register(ctx, g, args.dest, nodes);
1869
1870                                 break;
1871
1872                         case TAG_LOAD_STORE_4: {
1873                                 if (OP_IS_STORE(ins->load_store.op)) {
1874                                         /* TODO: use ssa_args for store_vary */
1875                                         ins->load_store.reg = 0;
1876                                 } else {
1877                                         bool has_dest = args.dest >= 0;
1878                                         int ssa_arg = has_dest ? args.dest : args.src0;
1879
1880                                         ins->load_store.reg = dealias_register(ctx, g, ssa_arg, nodes);
1881                                 }
1882
1883                                 break;
1884                         }
1885
1886                         default:
1887                                 break;
1888                         }
1889                 }
1890         }
1891 }
1892
1893 /* Midgard IR only knows vector ALU types, but we sometimes need to actually
1894  * use scalar ALU instructions, for functional or performance reasons. To do
1895  * this, we just demote vector ALU payloads to scalar. */
1896
/* Returns the index (0-3) of the lowest component enabled in `mask`, where
 * each component occupies a pair of adjacent bits starting at bit 0. Asserts
 * (and returns 0) when none of the four pairs has a bit set. */

static int
component_from_mask(unsigned mask)
{
        unsigned pair = 3;
        int component = 0;

        while (component < 4) {
                if (mask & pair)
                        return component;

                /* Move on to the next two-bit group */
                pair <<= 2;
                component++;
        }

        assert(0);
        return 0;
}
1908
/* Returns true when exactly one of the four two-bit component groups in the
 * low eight bits of `mask` has any bit set. */

static bool
is_single_component_mask(unsigned mask)
{
        int enabled = 0;
        unsigned remaining = mask;

        for (int group = 0; group < 4; ++group) {
                /* Inspect the lowest pair, then shift the next one down */
                if (remaining & 3)
                        enabled++;

                remaining >>= 2;
        }

        return enabled == 1;
}
1920
/* Build a bitmask of the components a swizzle reads, used to work out vector
 * dependencies. The swizzle holds four two-bit selectors (lowest lane first);
 * bit c of the result is set when any lane selects component c. */

static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned accessed = 0;
        unsigned selectors = swizzle;

        for (int lane = 0; lane < 4; ++lane) {
                accessed |= (1 << (selectors & 3));
                selectors >>= 2;
        }

        return accessed;
}
1936
1937 static unsigned
1938 vector_to_scalar_source(unsigned u)
1939 {
1940         midgard_vector_alu_src v;
1941         memcpy(&v, &u, sizeof(v));
1942
1943         midgard_scalar_alu_src s = {
1944                 .abs = v.abs,
1945                 .negate = v.negate,
1946                 .full = !v.half,
1947                 .component = (v.swizzle & 3) << 1
1948         };
1949
1950         unsigned o;
1951         memcpy(&o, &s, sizeof(s));
1952
1953         return o & ((1 << 6) - 1);
1954 }
1955
1956 static midgard_scalar_alu
1957 vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
1958 {
1959         /* The output component is from the mask */
1960         midgard_scalar_alu s = {
1961                 .op = v.op,
1962                 .src1 = vector_to_scalar_source(v.src1),
1963                 .src2 = vector_to_scalar_source(v.src2),
1964                 .unknown = 0,
1965                 .outmod = v.outmod,
1966                 .output_full = 1, /* TODO: Half */
1967                 .output_component = component_from_mask(v.mask) << 1,
1968         };
1969
1970         /* Inline constant is passed along rather than trying to extract it
1971          * from v */
1972
1973         if (ins->ssa_args.inline_constant) {
1974                 uint16_t imm = 0;
1975                 int lower_11 = ins->inline_constant & ((1 << 12) - 1);
1976                 imm |= (lower_11 >> 9) & 3;
1977                 imm |= (lower_11 >> 6) & 4;
1978                 imm |= (lower_11 >> 2) & 0x38;
1979                 imm |= (lower_11 & 63) << 6;
1980
1981                 s.src2 = imm;
1982         }
1983
1984         return s;
1985 }
1986
/* Midgard prefetches instruction types, so during emission we need to
 * look ahead too. Unless this is the last instruction, in which case we
 * return 1. Or if this is the second to last and the last is an ALU, then
 * it's also 1... */
1990
/* True when `tag` is one of the four ALU bundle tags. The argument is
 * parenthesised so that expression arguments keep their meaning; note that
 * it may still be evaluated more than once. */

#define IS_ALU(tag) ((tag) == TAG_ALU_4 || (tag) == TAG_ALU_8 ||  \
                     (tag) == TAG_ALU_12 || (tag) == TAG_ALU_16)

/* Append `val` (of type `type`) to the `emission` dynarray and add its size
 * to `bytes_emitted`; both names must be in scope at the point of use.
 * Wrapped in do/while so the two steps behave as a single statement, e.g.
 * under an unbraced `if`. */

#define EMIT_AND_COUNT(type, val) do { \
                util_dynarray_append(emission, type, val); \
                bytes_emitted += sizeof(type); \
        } while (0)
1996
1997 static void
1998 emit_binary_vector_instruction(midgard_instruction *ains,
1999                                uint16_t *register_words, int *register_words_count,
2000                                uint64_t *body_words, size_t *body_size, int *body_words_count,
2001                                size_t *bytes_emitted)
2002 {
2003         memcpy(&register_words[(*register_words_count)++], &ains->registers, sizeof(ains->registers));
2004         *bytes_emitted += sizeof(midgard_reg_info);
2005
2006         body_size[*body_words_count] = sizeof(midgard_vector_alu);
2007         memcpy(&body_words[(*body_words_count)++], &ains->alu, sizeof(ains->alu));
2008         *bytes_emitted += sizeof(midgard_vector_alu);
2009 }
2010
2011 /* Checks for an SSA data hazard between two adjacent instructions, keeping in
2012  * mind that we are a vector architecture and we can write to different
2013  * components simultaneously */
2014
2015 static bool
2016 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
2017 {
2018         /* Each instruction reads some registers and writes to a register. See
2019          * where the first writes */
2020
2021         /* Figure out where exactly we wrote to */
2022         int source = first->ssa_args.dest;
2023         int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;
2024
2025         /* As long as the second doesn't read from the first, we're okay */
2026         if (second->ssa_args.src0 == source) {
2027                 if (first->type == TAG_ALU_4) {
2028                         /* Figure out which components we just read from */
2029
2030                         int q = second->alu.src1;
2031                         midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
2032
2033                         /* Check if there are components in common, and fail if so */
2034                         if (swizzle_to_access_mask(m->swizzle) & source_mask)
2035                                 return false;
2036                 } else
2037                         return false;
2038
2039         }
2040
2041         if (second->ssa_args.src1 == source)
2042                 return false;
2043
2044         /* Otherwise, it's safe in that regard. Another data hazard is both
2045          * writing to the same place, of course */
2046
2047         if (second->ssa_args.dest == source) {
2048                 /* ...but only if the components overlap */
2049                 int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;
2050
2051                 if (dest_mask & source_mask)
2052                         return false;
2053         }
2054
2055         /* ...That's it */
2056         return true;
2057 }
2058
2059 /* Schedules, but does not emit, a single basic block. After scheduling, the
2060  * final tag and size of the block are known, which are necessary for branching
2061  * */
2062
2063 static midgard_bundle
2064 schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
2065 {
2066         int instructions_emitted = 0, instructions_consumed = -1;
2067         midgard_bundle bundle = { 0 };
2068
2069         uint8_t tag = ins->type;
2070
2071         /* Default to the instruction's tag */
2072         bundle.tag = tag;
2073
2074         switch (ins->type) {
2075         case TAG_ALU_4: {
2076                 uint32_t control = 0;
2077                 size_t bytes_emitted = sizeof(control);
2078
2079                 /* TODO: Constant combining */
2080                 int index = 0, last_unit = 0;
2081
2082                 /* Previous instructions, for the purpose of parallelism */
2083                 midgard_instruction *segment[4] = {0};
2084                 int segment_size = 0;
2085
2086                 instructions_emitted = -1;
2087                 midgard_instruction *pins = ins;
2088
2089                 for (;;) {
2090                         midgard_instruction *ains = pins;
2091
2092                         /* Advance instruction pointer */
2093                         if (index) {
2094                                 ains = mir_next_op(pins);
2095                                 pins = ains;
2096                         }
2097
2098                         /* Out-of-work condition */
2099                         if ((struct list_head *) ains == &block->instructions)
2100                                 break;
2101
2102                         /* Ensure that the chain can continue */
2103                         if (ains->type != TAG_ALU_4) break;
2104
2105                         /* According to the presentation "The ARM
2106                          * Mali-T880 Mobile GPU" from HotChips 27,
2107                          * there are two pipeline stages. Branching
2108                          * position determined experimentally. Lines
2109                          * are executed in parallel:
2110                          *
2111                          * [ VMUL ] [ SADD ]
2112                          * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
2113                          *
2114                          * Verify that there are no ordering dependencies here.
2115                          *
2116                          * TODO: Allow for parallelism!!!
2117                          */
2118
2119                         /* Pick a unit for it if it doesn't force a particular unit */
2120
2121                         int unit = ains->unit;
2122
2123                         if (!unit) {
2124                                 int op = ains->alu.op;
2125                                 int units = alu_opcode_props[op];
2126
2127                                 /* TODO: Promotion of scalars to vectors */
2128                                 int vector = ((!is_single_component_mask(ains->alu.mask)) || ((units & UNITS_SCALAR) == 0)) && (units & UNITS_ANY_VECTOR);
2129
2130                                 if (!vector)
2131                                         assert(units & UNITS_SCALAR);
2132
2133                                 if (vector) {
2134                                         if (last_unit >= UNIT_VADD) {
2135                                                 if (units & UNIT_VLUT)
2136                                                         unit = UNIT_VLUT;
2137                                                 else
2138                                                         break;
2139                                         } else {
2140                                                 if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
2141                                                         unit = UNIT_VMUL;
2142                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
2143                                                         unit = UNIT_VADD;
2144                                                 else if (units & UNIT_VLUT)
2145                                                         unit = UNIT_VLUT;
2146                                                 else
2147                                                         break;
2148                                         }
2149                                 } else {
2150                                         if (last_unit >= UNIT_VADD) {
2151                                                 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
2152                                                         unit = UNIT_SMUL;
2153                                                 else if (units & UNIT_VLUT)
2154                                                         unit = UNIT_VLUT;
2155                                                 else
2156                                                         break;
2157                                         } else {
2158                                                 if ((units & UNIT_SADD) && !(control & UNIT_SADD))
2159                                                         unit = UNIT_SADD;
2160                                                 else if (units & UNIT_SMUL)
2161                                                         unit = UNIT_SMUL;
2162                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
2163                                                         unit = UNIT_VADD;
2164                                                 else
2165                                                         break;
2166                                         }
2167                                 }
2168
2169                                 assert(unit & units);
2170                         }
2171
2172                         /* Late unit check, this time for encoding (not parallelism) */
2173                         if (unit <= last_unit) break;
2174
2175                         /* Clear the segment */
2176                         if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
2177                                 segment_size = 0;
2178
2179                         /* Check for data hazards */
2180                         int has_hazard = false;
2181
2182                         for (int s = 0; s < segment_size; ++s)
2183                                 if (!can_run_concurrent_ssa(segment[s], ains))
2184                                         has_hazard = true;
2185
2186                         if (has_hazard)
2187                                 break;
2188
2189                         /* We're good to go -- emit the instruction */
2190                         ains->unit = unit;
2191
2192                         segment[segment_size++] = ains;
2193
2194                         /* Only one set of embedded constants per
2195                          * bundle possible; if we have more, we must
2196                          * break the chain early, unfortunately */
2197
2198                         if (ains->has_constants) {
2199                                 if (bundle.has_embedded_constants) {
2200                                         /* ...but if there are already
2201                                          * constants but these are the
2202                                          * *same* constants, we let it
2203                                          * through */
2204
2205                                         if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants)))
2206                                                 break;
2207                                 } else {
2208                                         bundle.has_embedded_constants = true;
2209                                         memcpy(bundle.constants, ains->constants, sizeof(bundle.constants));
2210
2211                                         /* If this is a blend shader special constant, track it for patching */
2212                                         if (ains->has_blend_constant)
2213                                                 bundle.has_blend_constant = true;
2214                                 }
2215                         }
2216
2217                         if (ains->unit & UNITS_ANY_VECTOR) {
2218                                 emit_binary_vector_instruction(ains, bundle.register_words,
2219                                                                &bundle.register_words_count, bundle.body_words,
2220                                                                bundle.body_size, &bundle.body_words_count, &bytes_emitted);
2221                         } else if (ains->compact_branch) {
2222                                 /* All of r0 has to be written out
2223                                  * along with the branch writeout.
2224                                  * (slow!) */
2225
2226                                 if (ains->writeout) {
2227                                         if (index == 0) {
2228                                                 midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
2229                                                 ins.unit = UNIT_VMUL;
2230
2231                                                 control |= ins.unit;
2232
2233                                                 emit_binary_vector_instruction(&ins, bundle.register_words,
2234                                                                                &bundle.register_words_count, bundle.body_words,
2235                                                                                bundle.body_size, &bundle.body_words_count, &bytes_emitted);
2236                                         } else {
2237                                                 /* Analyse the group to see if r0 is written in full, on-time, without hanging dependencies*/
2238                                                 bool written_late = false;
2239                                                 bool components[4] = { 0 };
2240                                                 uint16_t register_dep_mask = 0;
2241                                                 uint16_t written_mask = 0;
2242
2243                                                 midgard_instruction *qins = ins;
2244                                                 for (int t = 0; t < index; ++t) {
2245                                                         if (qins->registers.out_reg != 0) {
2246                                                                 /* Mark down writes */
2247
2248                                                                 written_mask |= (1 << qins->registers.out_reg);
2249                                                         } else {
2250                                                                 /* Mark down the register dependencies for errata check */
2251
2252                                                                 if (qins->registers.src1_reg < 16)
2253                                                                         register_dep_mask |= (1 << qins->registers.src1_reg);
2254
2255                                                                 if (qins->registers.src2_reg < 16)
2256                                                                         register_dep_mask |= (1 << qins->registers.src2_reg);
2257
2258                                                                 int mask = qins->alu.mask;
2259
2260                                                                 for (int c = 0; c < 4; ++c)
2261                                                                         if (mask & (0x3 << (2 * c)))
2262                                                                                 components[c] = true;
2263
2264                                                                 /* ..but if the writeout is too late, we have to break up anyway... for some reason */
2265
2266                                                                 if (qins->unit == UNIT_VLUT)
2267                                                                         written_late = true;
2268                                                         }
2269
2270                                                         /* Advance instruction pointer */
2271                                                         qins = mir_next_op(qins);
2272                                                 }
2273
2274
2275                                                 /* ERRATA (?): In a bundle ending in a fragment writeout, the register dependencies of r0 cannot be written within this bundle (discovered in -bshading:shading=phong) */
2276                                                 if (register_dep_mask & written_mask) {
2277                                                         printf("ERRATA WORKAROUND: Breakup for writeout dependency masks %X vs %X (common %X)\n", register_dep_mask, written_mask, register_dep_mask & written_mask);
2278                                                         break;
2279                                                 }
2280
2281                                                 if (written_late)
2282                                                         break;
2283
2284                                                 /* If even a single component is not written, break it up (conservative check). */
2285                                                 bool breakup = false;
2286
2287                                                 for (int c = 0; c < 4; ++c)
2288                                                         if (!components[c])
2289                                                                 breakup = true;
2290
2291                                                 if (breakup)
2292                                                         break;
2293
2294                                                 /* Otherwise, we're free to proceed */
2295                                         }
2296                                 }
2297
2298                                 if (ains->unit == ALU_ENAB_BRANCH) {
2299                                         bundle.body_size[bundle.body_words_count] = sizeof(midgard_branch_extended);
2300                                         memcpy(&bundle.body_words[bundle.body_words_count++], &ains->branch_extended, sizeof(midgard_branch_extended));
2301                                         bytes_emitted += sizeof(midgard_branch_extended);
2302                                 } else {
2303                                         bundle.body_size[bundle.body_words_count] = sizeof(ains->br_compact);
2304                                         memcpy(&bundle.body_words[bundle.body_words_count++], &ains->br_compact, sizeof(ains->br_compact));
2305                                         bytes_emitted += sizeof(ains->br_compact);
2306                                 }
2307                         } else {
2308                                 memcpy(&bundle.register_words[bundle.register_words_count++], &ains->registers, sizeof(ains->registers));
2309                                 bytes_emitted += sizeof(midgard_reg_info);
2310
2311                                 bundle.body_size[bundle.body_words_count] = sizeof(midgard_scalar_alu);
2312                                 bundle.body_words_count++;
2313                                 bytes_emitted += sizeof(midgard_scalar_alu);
2314                         }
2315
2316                         /* Defer marking until after writing to allow for break */
2317                         control |= ains->unit;
2318                         last_unit = ains->unit;
2319                         ++instructions_emitted;
2320                         ++index;
2321                 }
2322
2323                 /* Bubble up the number of instructions for skipping */
2324                 instructions_consumed = index - 1;
2325
2326                 int padding = 0;
2327
2328                 /* Pad ALU op to nearest word */
2329
2330                 if (bytes_emitted & 15) {
2331                         padding = 16 - (bytes_emitted & 15);
2332                         bytes_emitted += padding;
2333                 }
2334
2335                 /* Constants must always be quadwords */
2336                 if (bundle.has_embedded_constants)
2337                         bytes_emitted += 16;
2338
2339                 /* Size ALU instruction for tag */
2340                 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
2341                 bundle.padding = padding;
2342                 bundle.control = bundle.tag | control;
2343
2344                 break;
2345         }
2346
2347         case TAG_LOAD_STORE_4: {
2348                 /* Load store instructions have two words at once. If
2349                  * we only have one queued up, we need to NOP pad.
2350                  * Otherwise, we store both in succession to save space
2351                  * and cycles -- letting them go in parallel -- skip
2352                  * the next. The usefulness of this optimisation is
2353                  * greatly dependent on the quality of the instruction
2354                  * scheduler.
2355                  */
2356
2357                 midgard_instruction *next_op = mir_next_op(ins);
2358
2359                 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
2360                         /* As the two operate concurrently, make sure
2361                          * they are not dependent */
2362
2363                         if (can_run_concurrent_ssa(ins, next_op) || true) {
2364                                 /* Skip ahead, since it's redundant with the pair */
2365                                 instructions_consumed = 1 + (instructions_emitted++);
2366                         }
2367                 }
2368
2369                 break;
2370         }
2371
2372         default:
2373                 /* Texture ops default to single-op-per-bundle scheduling */
2374                 break;
2375         }
2376
2377         /* Copy the instructions into the bundle */
2378         bundle.instruction_count = instructions_emitted + 1;
2379
2380         int used_idx = 0;
2381
2382         midgard_instruction *uins = ins;
2383         for (int i = 0; used_idx < bundle.instruction_count; ++i) {
2384                 bundle.instructions[used_idx++] = *uins;
2385                 uins = mir_next_op(uins);
2386         }
2387
2388         *skip = (instructions_consumed == -1) ? instructions_emitted : instructions_consumed;
2389
2390         return bundle;
2391 }
2392
2393 static int
2394 quadword_size(int tag)
2395 {
2396         switch (tag) {
2397         case TAG_ALU_4:
2398                 return 1;
2399
2400         case TAG_ALU_8:
2401                 return 2;
2402
2403         case TAG_ALU_12:
2404                 return 3;
2405
2406         case TAG_ALU_16:
2407                 return 4;
2408
2409         case TAG_LOAD_STORE_4:
2410                 return 1;
2411
2412         case TAG_TEXTURE_4:
2413                 return 1;
2414
2415         default:
2416                 assert(0);
2417                 return 0;
2418         }
2419 }
2420
2421 /* Schedule a single block by iterating its instruction to create bundles.
2422  * While we go, tally about the bundle sizes to compute the block size. */
2423
2424 static void
2425 schedule_block(compiler_context *ctx, midgard_block *block)
2426 {
2427         util_dynarray_init(&block->bundles, NULL);
2428
2429         block->quadword_count = 0;
2430
2431         mir_foreach_instr_in_block(block, ins) {
2432                 int skip;
2433                 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
2434                 util_dynarray_append(&block->bundles, midgard_bundle, bundle);
2435
2436                 if (bundle.has_blend_constant) {
2437                         /* TODO: Multiblock? */
2438                         int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
2439                         ctx->blend_constant_offset = quadwords_within_block * 0x10;
2440                 }
2441
2442                 while(skip--)
2443                         ins = mir_next_op(ins);
2444
2445                 block->quadword_count += quadword_size(bundle.tag);
2446         }
2447
2448         block->is_scheduled = true;
2449 }
2450
2451 static void
2452 schedule_program(compiler_context *ctx)
2453 {
2454         allocate_registers(ctx);
2455
2456         mir_foreach_block(ctx, block) {
2457                 schedule_block(ctx, block);
2458         }
2459 }
2460
2461 /* After everything is scheduled, emit whole bundles at a time */
2462
2463 static void
2464 emit_binary_bundle(compiler_context *ctx, midgard_bundle *bundle, struct util_dynarray *emission, int next_tag)
2465 {
2466         int lookahead = next_tag << 4;
2467
2468         switch (bundle->tag) {
2469         case TAG_ALU_4:
2470         case TAG_ALU_8:
2471         case TAG_ALU_12:
2472         case TAG_ALU_16: {
2473                 /* Actually emit each component */
2474                 util_dynarray_append(emission, uint32_t, bundle->control | lookahead);
2475
2476                 for (int i = 0; i < bundle->register_words_count; ++i)
2477                         util_dynarray_append(emission, uint16_t, bundle->register_words[i]);
2478
2479                 /* Emit body words based on the instructions bundled */
2480                 for (int i = 0; i < bundle->instruction_count; ++i) {
2481                         midgard_instruction *ins = &bundle->instructions[i];
2482
2483                         if (ins->unit & UNITS_ANY_VECTOR) {
2484                                 memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins->alu, sizeof(midgard_vector_alu));
2485                         } else if (ins->compact_branch) {
2486                                 /* Dummy move, XXX DRY */
2487                                 if ((i == 0) && ins->writeout) {
2488                                         midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
2489                                         memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins.alu, sizeof(midgard_vector_alu));
2490                                 }
2491
2492                                 if (ins->unit == ALU_ENAB_BR_COMPACT) {
2493                                         memcpy(util_dynarray_grow(emission, sizeof(ins->br_compact)), &ins->br_compact, sizeof(ins->br_compact));
2494                                 } else {
2495                                         memcpy(util_dynarray_grow(emission, sizeof(ins->branch_extended)), &ins->branch_extended, sizeof(ins->branch_extended));
2496                                 }
2497                         } else {
2498                                 /* Scalar */
2499                                 midgard_scalar_alu scalarised = vector_to_scalar_alu(ins->alu, ins);
2500                                 memcpy(util_dynarray_grow(emission, sizeof(scalarised)), &scalarised, sizeof(scalarised));
2501                         }
2502                 }
2503
2504                 /* Emit padding (all zero) */
2505                 memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding);
2506
2507                 /* Tack on constants */
2508
2509                 if (bundle->has_embedded_constants) {
2510                         util_dynarray_append(emission, float, bundle->constants[0]);
2511                         util_dynarray_append(emission, float, bundle->constants[1]);
2512                         util_dynarray_append(emission, float, bundle->constants[2]);
2513                         util_dynarray_append(emission, float, bundle->constants[3]);
2514                 }
2515
2516                 break;
2517         }
2518
2519         case TAG_LOAD_STORE_4: {
2520                 /* One or two composing instructions */
2521
2522                 uint64_t current64, next64 = LDST_NOP;
2523
2524                 memcpy(&current64, &bundle->instructions[0].load_store, sizeof(current64));
2525
2526                 if (bundle->instruction_count == 2)
2527                         memcpy(&next64, &bundle->instructions[1].load_store, sizeof(next64));
2528
2529                 midgard_load_store instruction = {
2530                         .type = bundle->tag,
2531                         .next_type = next_tag,
2532                         .word1 = current64,
2533                         .word2 = next64
2534                 };
2535
2536                 util_dynarray_append(emission, midgard_load_store, instruction);
2537
2538                 break;
2539         }
2540
2541         case TAG_TEXTURE_4: {
2542                 /* Texture instructions are easy, since there is no
2543                  * pipelining nor VLIW to worry about. We may need to set the .last flag */
2544
2545                 midgard_instruction *ins = &bundle->instructions[0];
2546
2547                 ins->texture.type = TAG_TEXTURE_4;
2548                 ins->texture.next_type = next_tag;
2549
2550                 ctx->texture_op_count--;
2551
2552                 if (!ctx->texture_op_count) {
2553                         ins->texture.cont = 0;
2554                         ins->texture.last = 1;
2555                 }
2556
2557                 util_dynarray_append(emission, midgard_texture_word, ins->texture);
2558                 break;
2559         }
2560
2561         default:
2562                 printf("Unknown midgard instruction type\n");
2563                 assert(0);
2564                 break;
2565         }
2566 }
2567
2568
/* ALU instructions can inline or embed constants, which decreases register
 * pressure and saves space. */

/* CONDITIONAL_ATTACH(src) expects `ctx` and `alu` to be in scope where it is
 * expanded. It looks up alu->ssa_args.<src> in ctx->ssa_constants (keys are
 * the SSA index plus one). On a hit, the entry is handed to attach_constants
 * and the source is redirected to the fixed constant register.
 *
 * The expansion is a bare brace block rather than do { } while (0), so a use
 * may be followed directly by `else` without a semicolon in between. */

#define CONDITIONAL_ATTACH(src) { \
        void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \
\
        if (entry) { \
                attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \
                alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
        } \
}
2580
2581 static void
2582 inline_alu_constants(compiler_context *ctx)
2583 {
2584         mir_foreach_instr(ctx, alu) {
2585                 /* Other instructions cannot inline constants */
2586                 if (alu->type != TAG_ALU_4) continue;
2587
2588                 /* If there is already a constant here, we can do nothing */
2589                 if (alu->has_constants) continue;
2590
2591                 CONDITIONAL_ATTACH(src0);
2592
2593                 if (!alu->has_constants) {
2594                         CONDITIONAL_ATTACH(src1)
2595                 } else if (!alu->inline_constant) {
2596                         /* Corner case: _two_ vec4 constants, for instance with a
2597                          * csel. For this case, we can only use a constant
2598                          * register for one, we'll have to emit a move for the
2599                          * other. Note, if both arguments are constants, then
2600                          * necessarily neither argument depends on the value of
2601                          * any particular register. As the destination register
2602                          * will be wiped, that means we can spill the constant
2603                          * to the destination register.
2604                          */
2605
2606                         void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1);
2607                         unsigned scratch = alu->ssa_args.dest;
2608
2609                         if (entry) {
2610                                 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch);
2611                                 attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1);
2612
2613                                 /* Force a break XXX Defer r31 writes */
2614                                 ins.unit = UNIT_VLUT;
2615
2616                                 /* Set the source */
2617                                 alu->ssa_args.src1 = scratch;
2618
2619                                 /* Inject us -before- the last instruction which set r31 */
2620                                 mir_insert_instruction_before(mir_prev_op(alu), ins);
2621                         }
2622                 }
2623         }
2624 }
2625
2626 /* Midgard supports two types of constants, embedded constants (128-bit) and
2627  * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
2628  * constants can be demoted to inline constants, for space savings and
2629  * sometimes a performance boost */
2630
2631 static void
2632 embedded_to_inline_constant(compiler_context *ctx)
2633 {
2634         mir_foreach_instr(ctx, ins) {
2635                 if (!ins->has_constants) continue;
2636
2637                 if (ins->ssa_args.inline_constant) continue;
2638
2639                 /* Blend constants must not be inlined by definition */
2640                 if (ins->has_blend_constant) continue;
2641
2642                 /* src1 cannot be an inline constant due to encoding
2643                  * restrictions. So, if possible we try to flip the arguments
2644                  * in that case */
2645
2646                 int op = ins->alu.op;
2647
2648                 if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
2649                         /* Flip based on op. Fallthrough intentional */
2650
2651                         switch (op) {
2652                         /* These ops require an operational change to flip their arguments TODO */
2653                         case midgard_alu_op_flt:
2654                         case midgard_alu_op_fle:
2655                         case midgard_alu_op_ilt:
2656                         case midgard_alu_op_ile:
2657                         case midgard_alu_op_fcsel:
2658                         case midgard_alu_op_icsel:
2659                         case midgard_alu_op_isub:
2660                                 printf("Missed non-commutative flip (%s)\n", alu_opcode_names[op]);
2661                                 break;
2662
2663                         /* These ops are commutative and Just Flip */
2664                         case midgard_alu_op_fne:
2665                         case midgard_alu_op_fadd:
2666                         case midgard_alu_op_fmul:
2667                         case midgard_alu_op_fmin:
2668                         case midgard_alu_op_fmax:
2669                         case midgard_alu_op_iadd:
2670                         case midgard_alu_op_imul:
2671                         case midgard_alu_op_feq:
2672                         case midgard_alu_op_ieq:
2673                         case midgard_alu_op_ine:
2674                         case midgard_alu_op_iand:
2675                         case midgard_alu_op_ior:
2676                         case midgard_alu_op_ixor:
2677                                 /* Flip the SSA numbers */
2678                                 ins->ssa_args.src0 = ins->ssa_args.src1;
2679                                 ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
2680
2681                                 /* And flip the modifiers */
2682
2683                                 unsigned src_temp;
2684
2685                                 src_temp = ins->alu.src2;
2686                                 ins->alu.src2 = ins->alu.src1;
2687                                 ins->alu.src1 = src_temp;
2688
2689                         default:
2690                                 break;
2691                         }
2692                 }
2693
2694                 if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
2695                         /* Extract the source information */
2696
2697                         midgard_vector_alu_src *src;
2698                         int q = ins->alu.src2;
2699                         midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
2700                         src = m;
2701
2702                         /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */
2703                         int component = src->swizzle & 3;
2704
2705                         /* Scale constant appropriately, if we can legally */
2706                         uint16_t scaled_constant = 0;
2707
2708                         /* XXX: Check legality */
2709                         if (midgard_is_integer_op(op)) {
2710                                 /* TODO: Inline integer */
2711                                 continue;
2712
2713                                 unsigned int *iconstants = (unsigned int *) ins->constants;
2714                                 scaled_constant = (uint16_t) iconstants[component];
2715
2716                                 /* Constant overflow after resize */
2717                                 if (scaled_constant != iconstants[component])
2718                                         continue;
2719                         } else {
2720                                 scaled_constant = _mesa_float_to_half((float) ins->constants[component]);
2721                         }
2722
2723                         /* We don't know how to handle these with a constant */
2724
2725                         if (src->abs || src->negate || src->half || src->rep_low || src->rep_high) {
2726                                 printf("Bailing inline constant...\n");
2727                                 continue;
2728                         }
2729
2730                         /* Make sure that the constant is not itself a
2731                          * vector by checking if all accessed values
2732                          * (by the swizzle) are the same. */
2733
2734                         uint32_t *cons = (uint32_t *) ins->constants;
2735                         uint32_t value = cons[component];
2736
2737                         bool is_vector = false;
2738                         unsigned mask = effective_writemask(&ins->alu);
2739
2740                         for (int c = 1; c < 4; ++c) {
2741                                 /* We only care if this component is actually used */
2742                                 if (!(mask & (1 << c)))
2743                                         continue;
2744
2745                                 uint32_t test = cons[(src->swizzle >> (2 * c)) & 3];
2746
2747                                 if (test != value) {
2748                                         is_vector = true;
2749                                         break;
2750                                 }
2751                         }
2752
2753                         if (is_vector)
2754                                 continue;
2755
2756                         /* Get rid of the embedded constant */
2757                         ins->has_constants = false;
2758                         ins->ssa_args.src1 = SSA_UNUSED_0;
2759                         ins->ssa_args.inline_constant = true;
2760                         ins->inline_constant = scaled_constant;
2761                 }
2762         }
2763 }
2764
2765 /* Map normal SSA sources to other SSA sources / fixed registers (like
2766  * uniforms) */
2767
2768 static void
2769 map_ssa_to_alias(compiler_context *ctx, int *ref)
2770 {
2771         unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1);
2772
2773         if (alias) {
2774                 /* Remove entry in leftovers to avoid a redunant fmov */
2775
2776                 struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1)));
2777
2778                 if (leftover)
2779                         _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover);
2780
2781                 /* Assign the alias map */
2782                 *ref = alias - 1;
2783                 return;
2784         }
2785 }
2786
/* AS_SRC(to, u) declares two locals at the point of use: an int named
 * q<to> holding ins->alu.src2, and a pointer named <to> that views that int
 * as a midgard_vector_alu_src. It requires `ins` to be in scope.
 * NOTE(review): the `u` argument never appears in the expansion and src2 is
 * always the field read -- confirm that is intended. */
#define AS_SRC(to, u) \
        int q##to = ins->alu.src2; \
        midgard_vector_alu_src *to = (midgard_vector_alu_src *) &q##to;
2790
2791 /* Removing unused moves is necessary to clean up the texture pipeline results.
2792  *
2793  * To do so, we find moves in the MIR. We check if their destination is live later. If it's not, the move is redundant. */
2794
2795 static void
2796 midgard_eliminate_orphan_moves(compiler_context *ctx, midgard_block *block)
2797 {
2798         mir_foreach_instr_in_block_safe(block, ins) {
2799                 if (ins->type != TAG_ALU_4) continue;
2800
2801                 if (ins->alu.op != midgard_alu_op_fmov) continue;
2802
2803                 if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
2804
2805                 if (midgard_is_pinned(ctx, ins->ssa_args.dest)) continue;
2806
2807                 if (is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue;
2808
2809                 mir_remove_instruction(ins);
2810         }
2811 }
2812
2813 /* The following passes reorder MIR instructions to enable better scheduling */
2814
2815 static void
2816 midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
2817 {
2818         mir_foreach_instr_in_block_safe(block, ins) {
2819                 if (ins->type != TAG_LOAD_STORE_4) continue;
2820
2821                 /* We've found a load/store op. Check if next is also load/store. */
2822                 midgard_instruction *next_op = mir_next_op(ins);
2823                 if (&next_op->link != &block->instructions) {
2824                         if (next_op->type == TAG_LOAD_STORE_4) {
2825                                 /* If so, we're done since we're a pair */
2826                                 ins = mir_next_op(ins);
2827                                 continue;
2828                         }
2829
2830                         /* Maximum search distance to pair, to avoid register pressure disasters */
2831                         int search_distance = 8;
2832
2833                         /* Otherwise, we have an orphaned load/store -- search for another load */
2834                         mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
2835                                 /* Terminate search if necessary */
2836                                 if (!(search_distance--)) break;
2837
2838                                 if (c->type != TAG_LOAD_STORE_4) continue;
2839
2840                                 if (OP_IS_STORE(c->load_store.op)) continue;
2841
2842                                 /* We found one! Move it up to pair and remove it from the old location */
2843
2844                                 mir_insert_instruction_before(ins, *c);
2845                                 mir_remove_instruction(c);
2846
2847                                 break;
2848                         }
2849                 }
2850         }
2851 }
2852
2853 /* Emit varying stores late */
2854
2855 static void
2856 midgard_emit_store(compiler_context *ctx, midgard_block *block) {
2857         /* Iterate in reverse to get the final write, rather than the first */
2858
2859         mir_foreach_instr_in_block_safe_rev(block, ins) {
2860                 /* Check if what we just wrote needs a store */
2861                 int idx = ins->ssa_args.dest;
2862                 uintptr_t varying = ((uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_varyings, idx + 1));
2863
2864                 if (!varying) continue;
2865
2866                 varying -= 1;
2867
2868                 /* We need to store to the appropriate varying, so emit the
2869                  * move/store */
2870
2871                 /* TODO: Integrate with special purpose RA (and scheduler?) */
2872                 bool high_varying_register = false;
2873
2874                 midgard_instruction mov = v_fmov(idx, blank_alu_src, SSA_FIXED_REGISTER(REGISTER_VARYING_BASE + high_varying_register));
2875
2876                 midgard_instruction st = m_store_vary_32(SSA_FIXED_REGISTER(high_varying_register), varying);
2877                 st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
2878
2879                 mir_insert_instruction_before(mir_next_op(ins), st);
2880                 mir_insert_instruction_before(mir_next_op(ins), mov);
2881
2882                 /* We no longer need to store this varying */
2883                 _mesa_hash_table_u64_remove(ctx->ssa_varyings, idx + 1);
2884         }
2885 }
2886
2887 /* If there are leftovers after the below pass, emit actual fmov
2888  * instructions for the slow-but-correct path */
2889
2890 static void
2891 emit_leftover_move(compiler_context *ctx)
2892 {
2893         set_foreach(ctx->leftover_ssa_to_alias, leftover) {
2894                 int base = ((uintptr_t) leftover->key) - 1;
2895                 int mapped = base;
2896
2897                 map_ssa_to_alias(ctx, &mapped);
2898                 EMIT(fmov, mapped, blank_alu_src, base);
2899         }
2900 }
2901
2902 static void
2903 actualise_ssa_to_alias(compiler_context *ctx)
2904 {
2905         mir_foreach_instr(ctx, ins) {
2906                 map_ssa_to_alias(ctx, &ins->ssa_args.src0);
2907                 map_ssa_to_alias(ctx, &ins->ssa_args.src1);
2908         }
2909
2910         emit_leftover_move(ctx);
2911 }
2912
2913 /* Vertex shaders do not write gl_Position as is; instead, they write a
2914  * transformed screen space position as a varying. See section 12.5 "Coordinate
2915  * Transformation" of the ES 3.2 full specification for details.
2916  *
2917  * This transformation occurs early on, as NIR and prior to optimisation, in
2918  * order to take advantage of NIR optimisation passes of the transform itself.
2919  * */
2920
2921 static void
2922 write_transformed_position(nir_builder *b, nir_src input_point_src, int uniform_no)
2923 {
2924         nir_ssa_def *input_point = nir_ssa_for_src(b, input_point_src, 4);
2925
2926         /* Get viewport from the uniforms */
2927         nir_intrinsic_instr *load;
2928         load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
2929         load->num_components = 4;
2930         load->src[0] = nir_src_for_ssa(nir_imm_int(b, uniform_no));
2931         nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
2932         nir_builder_instr_insert(b, &load->instr);
2933
2934         /* Formatted as <width, height, centerx, centery> */
2935         nir_ssa_def *viewport_vec4 = &load->dest.ssa;
2936         nir_ssa_def *viewport_width_2 = nir_channel(b, viewport_vec4, 0);
2937         nir_ssa_def *viewport_height_2 = nir_channel(b, viewport_vec4, 1);
2938         nir_ssa_def *viewport_offset = nir_channels(b, viewport_vec4, 0x8 | 0x4);
2939
2940         /* XXX: From uniforms? */
2941         nir_ssa_def *depth_near = nir_imm_float(b, 0.0);
2942         nir_ssa_def *depth_far = nir_imm_float(b, 1.0);
2943
2944         /* World space to normalised device coordinates */
2945
2946         nir_ssa_def *w_recip = nir_frcp(b, nir_channel(b, input_point, 3));
2947         nir_ssa_def *ndc_point = nir_fmul(b, nir_channels(b, input_point, 0x7), w_recip);
2948
2949         /* Normalised device coordinates to screen space */
2950
2951         nir_ssa_def *viewport_multiplier = nir_vec2(b, viewport_width_2, viewport_height_2);
2952         nir_ssa_def *viewport_xy = nir_fadd(b, nir_fmul(b, nir_channels(b, ndc_point, 0x3), viewport_multiplier), viewport_offset);
2953
2954         nir_ssa_def *depth_multiplier = nir_fmul(b, nir_fsub(b, depth_far, depth_near), nir_imm_float(b, 0.5f));
2955         nir_ssa_def *depth_offset     = nir_fmul(b, nir_fadd(b, depth_far, depth_near), nir_imm_float(b, 0.5f));
2956         nir_ssa_def *screen_depth     = nir_fadd(b, nir_fmul(b, nir_channel(b, ndc_point, 2), depth_multiplier), depth_offset);
2957
2958         /* gl_Position will be written out in screenspace xyz, with w set to
2959          * the reciprocal we computed earlier. The transformed w component is
2960          * then used for perspective-correct varying interpolation. The
2961          * transformed w component must preserve its original sign; this is
2962          * used in depth clipping computations */
2963
2964         nir_ssa_def *screen_space = nir_vec4(b,
2965                                              nir_channel(b, viewport_xy, 0),
2966                                              nir_channel(b, viewport_xy, 1),
2967                                              screen_depth,
2968                                              w_recip);
2969
2970         /* Finally, write out the transformed values to the varying */
2971
2972         nir_intrinsic_instr *store;
2973         store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
2974         store->num_components = 4;
2975         nir_intrinsic_set_base(store, 0);
2976         nir_intrinsic_set_write_mask(store, 0xf);
2977         store->src[0].ssa = screen_space;
2978         store->src[0].is_ssa = true;
2979         store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
2980         nir_builder_instr_insert(b, &store->instr);
2981 }
2982
2983 static void
2984 transform_position_writes(nir_shader *shader)
2985 {
2986         nir_foreach_function(func, shader) {
2987                 nir_foreach_block(block, func->impl) {
2988                         nir_foreach_instr_safe(instr, block) {
2989                                 if (instr->type != nir_instr_type_intrinsic) continue;
2990
2991                                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2992                                 nir_variable *out = NULL;
2993
2994                                 switch (intr->intrinsic) {
2995                                 case nir_intrinsic_store_output:
2996                                         /* already had i/o lowered.. lookup the matching output var: */
2997                                         nir_foreach_variable(var, &shader->outputs) {
2998                                                 int drvloc = var->data.driver_location;
2999
3000                                                 if (nir_intrinsic_base(intr) == drvloc) {
3001                                                         out = var;
3002                                                         break;
3003                                                 }
3004                                         }
3005
3006                                         break;
3007
3008                                 default:
3009                                         break;
3010                                 }
3011
3012                                 if (!out) continue;
3013
3014                                 if (out->data.mode != nir_var_shader_out)
3015                                         continue;
3016
3017                                 if (out->data.location != VARYING_SLOT_POS)
3018                                         continue;
3019
3020                                 nir_builder b;
3021                                 nir_builder_init(&b, func->impl);
3022                                 b.cursor = nir_before_instr(instr);
3023
3024                                 write_transformed_position(&b, intr->src[0], UNIFORM_VIEWPORT);
3025                                 nir_instr_remove(instr);
3026                         }
3027                 }
3028         }
3029 }
3030
3031 static void
3032 emit_fragment_epilogue(compiler_context *ctx)
3033 {
3034         /* Special case: writing out constants requires us to include the move
3035          * explicitly now, so shove it into r0 */
3036
3037         void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1);
3038
3039         if (constant_value) {
3040                 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0));
3041                 attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1);
3042                 emit_mir_instruction(ctx, ins);
3043         }
3044
3045         /* Perform the actual fragment writeout. We have two writeout/branch
3046          * instructions, forming a loop until writeout is successful as per the
3047          * docs. TODO: gl_FragDepth */
3048
3049         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
3050         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
3051 }
3052
3053 /* For the blend epilogue, we need to convert the blended fragment vec4 (stored
3054  * in r0) to a RGBA8888 value by scaling and type converting. We then output it
3055  * with the int8 analogue to the fragment epilogue */
3056
3057 static void
3058 emit_blend_epilogue(compiler_context *ctx)
3059 {
3060         /* vmul.fmul.none.fulllow hr48, r0, #255 */
3061
3062         midgard_instruction scale = {
3063                 .type = TAG_ALU_4,
3064                 .unit = UNIT_VMUL,
3065                 .inline_constant = _mesa_float_to_half(255.0),
3066                 .ssa_args = {
3067                         .src0 = SSA_FIXED_REGISTER(0),
3068                         .src1 = SSA_UNUSED_0,
3069                         .dest = SSA_FIXED_REGISTER(24),
3070                         .inline_constant = true
3071                 },
3072                 .alu = {
3073                         .op = midgard_alu_op_fmul,
3074                         .reg_mode = midgard_reg_mode_full,
3075                         .dest_override = midgard_dest_override_lower,
3076                         .mask = 0xFF,
3077                         .src1 = vector_alu_srco_unsigned(blank_alu_src),
3078                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
3079                 }
3080         };
3081
3082         emit_mir_instruction(ctx, scale);
3083
3084         /* vadd.f2u8.pos.low hr0, hr48, #0 */
3085
3086         midgard_vector_alu_src alu_src = blank_alu_src;
3087         alu_src.half = true;
3088
3089         midgard_instruction f2u8 = {
3090                 .type = TAG_ALU_4,
3091                 .ssa_args = {
3092                         .src0 = SSA_FIXED_REGISTER(24),
3093                         .src1 = SSA_UNUSED_0,
3094                         .dest = SSA_FIXED_REGISTER(0),
3095                         .inline_constant = true
3096                 },
3097                 .alu = {
3098                         .op = midgard_alu_op_f2u8,
3099                         .reg_mode = midgard_reg_mode_half,
3100                         .dest_override = midgard_dest_override_lower,
3101                         .outmod = midgard_outmod_pos,
3102                         .mask = 0xF,
3103                         .src1 = vector_alu_srco_unsigned(alu_src),
3104                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
3105                 }
3106         };
3107
3108         emit_mir_instruction(ctx, f2u8);
3109
3110         /* vmul.imov.quarter r0, r0, r0 */
3111
3112         midgard_instruction imov_8 = {
3113                 .type = TAG_ALU_4,
3114                 .ssa_args = {
3115                         .src0 = SSA_UNUSED_1,
3116                         .src1 = SSA_FIXED_REGISTER(0),
3117                         .dest = SSA_FIXED_REGISTER(0),
3118                 },
3119                 .alu = {
3120                         .op = midgard_alu_op_imov,
3121                         .reg_mode = midgard_reg_mode_quarter,
3122                         .dest_override = midgard_dest_override_none,
3123                         .mask = 0xFF,
3124                         .src1 = vector_alu_srco_unsigned(blank_alu_src),
3125                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
3126                 }
3127         };
3128
3129         /* Emit branch epilogue with the 8-bit move as the source */
3130
3131         emit_mir_instruction(ctx, imov_8);
3132         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
3133
3134         emit_mir_instruction(ctx, imov_8);
3135         EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
3136 }
3137
3138 static midgard_block *
3139 emit_block(compiler_context *ctx, nir_block *block)
3140 {
3141         midgard_block *this_block = malloc(sizeof(midgard_block));
3142         list_addtail(&this_block->link, &ctx->blocks);
3143
3144         this_block->is_scheduled = false;
3145         ++ctx->block_count;
3146
3147         ctx->texture_index[0] = -1;
3148         ctx->texture_index[1] = -1;
3149
3150         /* Set up current block */
3151         list_inithead(&this_block->instructions);
3152         ctx->current_block = this_block;
3153
3154         nir_foreach_instr(instr, block) {
3155                 emit_instr(ctx, instr);
3156                 ++ctx->instruction_count;
3157         }
3158
3159         inline_alu_constants(ctx);
3160         embedded_to_inline_constant(ctx);
3161
3162         /* Perform heavylifting for aliasing */
3163         actualise_ssa_to_alias(ctx);
3164
3165         midgard_emit_store(ctx, this_block);
3166         midgard_eliminate_orphan_moves(ctx, this_block);
3167         midgard_pair_load_store(ctx, this_block);
3168
3169         /* Append fragment shader epilogue (value writeout) */
3170         if (ctx->stage == MESA_SHADER_FRAGMENT) {
3171                 if (block == nir_impl_last_block(ctx->func->impl)) {
3172                         if (ctx->is_blend)
3173                                 emit_blend_epilogue(ctx);
3174                         else
3175                                 emit_fragment_epilogue(ctx);
3176                 }
3177         }
3178
3179         /* Fallthrough save */
3180         this_block->next_fallthrough = ctx->previous_source_block;
3181
3182         if (block == nir_start_block(ctx->func->impl))
3183                 ctx->initial_block = this_block;
3184
3185         if (block == nir_impl_last_block(ctx->func->impl))
3186                 ctx->final_block = this_block;
3187
3188         /* Allow the next control flow to access us retroactively, for
3189          * branching etc */
3190         ctx->current_block = this_block;
3191
3192         /* Document the fallthrough chain */
3193         ctx->previous_source_block = this_block;
3194
3195         return this_block;
3196 }
3197
3198 static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
3199
3200 static void
3201 emit_if(struct compiler_context *ctx, nir_if *nif)
3202 {
3203         /* Conditional branches expect the condition in r31.w; emit a move for
3204          * that in the _previous_ block (which is the current block). */
3205         emit_condition(ctx, &nif->condition, true);
3206
3207         /* Speculatively emit the branch, but we can't fill it in until later */
3208         EMIT(branch, true, true);
3209         midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
3210
3211         /* Emit the two subblocks */
3212         midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
3213
3214         /* Emit a jump from the end of the then block to the end of the else */
3215         EMIT(branch, false, false);
3216         midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);
3217
3218         /* Emit second block, and check if it's empty */
3219
3220         int else_idx = ctx->block_count;
3221         int count_in = ctx->instruction_count;
3222         midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
3223         int after_else_idx = ctx->block_count;
3224
3225         /* Now that we have the subblocks emitted, fix up the branches */
3226
3227         assert(then_block);
3228         assert(else_block);
3229
3230         if (ctx->instruction_count == count_in) {
3231                 /* The else block is empty, so don't emit an exit jump */
3232                 mir_remove_instruction(then_exit);
3233                 then_branch->branch.target_block = after_else_idx;
3234         } else {
3235                 then_branch->branch.target_block = else_idx;
3236                 then_exit->branch.target_block = after_else_idx;
3237         }
3238 }
3239
3240 static void
3241 emit_loop(struct compiler_context *ctx, nir_loop *nloop)
3242 {
3243         /* Remember where we are */
3244         midgard_block *start_block = ctx->current_block;
3245
3246         /* Allocate a loop number for this. TODO: Nested loops. Instead of a
3247          * single current_loop variable, maybe we need a stack */
3248
3249         int loop_idx = ++ctx->current_loop;
3250
3251         /* Get index from before the body so we can loop back later */
3252         int start_idx = ctx->block_count;
3253
3254         /* Emit the body itself */
3255         emit_cf_list(ctx, &nloop->body);
3256
3257         /* Branch back to loop back */
3258         struct midgard_instruction br_back = v_branch(false, false);
3259         br_back.branch.target_block = start_idx;
3260         emit_mir_instruction(ctx, br_back);
3261
3262         /* Find the index of the block about to follow us (note: we don't add
3263          * one; blocks are 0-indexed so we get a fencepost problem) */
3264         int break_block_idx = ctx->block_count;
3265
3266         /* Fix up the break statements we emitted to point to the right place,
3267          * now that we can allocate a block number for them */
3268
3269         list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) {
3270                 print_mir_block(block);
3271                 mir_foreach_instr_in_block(block, ins) {
3272                         if (ins->type != TAG_ALU_4) continue;
3273                         if (!ins->compact_branch) continue;
3274                         if (ins->prepacked_branch) continue;
3275
3276                         /* We found a branch -- check the type to see if we need to do anything */
3277                         if (ins->branch.target_type != TARGET_BREAK) continue;
3278
3279                         /* It's a break! Check if it's our break */
3280                         if (ins->branch.target_break != loop_idx) continue;
3281
3282                         /* Okay, cool, we're breaking out of this loop.
3283                          * Rewrite from a break to a goto */
3284
3285                         ins->branch.target_type = TARGET_GOTO;
3286                         ins->branch.target_block = break_block_idx;
3287                 }
3288         }
3289 }
3290
3291 static midgard_block *
3292 emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
3293 {
3294         midgard_block *start_block = NULL;
3295
3296         foreach_list_typed(nir_cf_node, node, node, list) {
3297                 switch (node->type) {
3298                 case nir_cf_node_block: {
3299                         midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
3300
3301                         if (!start_block)
3302                                 start_block = block;
3303
3304                         break;
3305                 }
3306
3307                 case nir_cf_node_if:
3308                         emit_if(ctx, nir_cf_node_as_if(node));
3309                         break;
3310
3311                 case nir_cf_node_loop:
3312                         emit_loop(ctx, nir_cf_node_as_loop(node));
3313                         break;
3314
3315                 case nir_cf_node_function:
3316                         assert(0);
3317                         break;
3318                 }
3319         }
3320
3321         return start_block;
3322 }
3323
3324 /* Due to lookahead, we need to report the first tag executed in the command
3325  * stream and in branch targets. An initial block might be empty, so iterate
3326  * until we find one that 'works' */
3327
3328 static unsigned
3329 midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
3330 {
3331         midgard_block *initial_block = mir_get_block(ctx, block_idx);
3332
3333         unsigned first_tag = 0;
3334
3335         do {
3336                 midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0);
3337
3338                 if (initial_bundle) {
3339                         first_tag = initial_bundle->tag;
3340                         break;
3341                 }
3342
3343                 /* Initial block is empty, try the next block */
3344                 initial_block = list_first_entry(&(initial_block->link), midgard_block, link);
3345         } while(initial_block != NULL);
3346
3347         assert(first_tag);
3348         return first_tag;
3349 }
3350
/* Compiles a NIR shader to a Midgard binary.
 *
 * nir:      the shader to compile. It is modified in place: a "viewport"
 *           uniform is created for vertex shaders, variable locations are
 *           assigned, and the lowering and optimisation passes run on it.
 * program:  output. `compiled` is (re)initialised and receives the flat
 *           binary; uniform/attribute/varying counts, writes_point_size,
 *           first_tag, work_register_count, can_discard, uniform_cutoff and
 *           blend_patch_offset are filled in. program->alpha_ref is read.
 * is_blend: stored in the context; emit_block uses it to choose the blend
 *           epilogue over the fragment epilogue.
 *
 * Always returns 0. Only the first function that has an implementation is
 * compiled. As side effects the NIR is printed to stdout and the resulting
 * binary is passed to disassemble_midgard unconditionally.
 *
 * NOTE(review): the hash tables and the set created below, and the blocks
 * linked into ctx->blocks, are not released anywhere in this function --
 * confirm whether cleanup is expected to happen elsewhere. */
3351 int
3352 midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend)
3353 {
3354         struct util_dynarray *compiled = &program->compiled;
3355
         /* The context lives on the stack for the duration of this call; all
          * members not named here start out zeroed */
3356         compiler_context ictx = {
3357                 .nir = nir,
3358                 .stage = nir->info.stage,
3359
3360                 .is_blend = is_blend,
3361                 .blend_constant_offset = -1,
3362
3363                 .alpha_ref = program->alpha_ref
3364         };
3365
3366         compiler_context *ctx = &ictx;
3367
3368         /* TODO: Decide this at runtime */
3369         ctx->uniform_cutoff = 8;
3370
         /* Vertex shaders get one special uniform -- presumably the viewport
          * uniform appended below; TODO confirm */
3371         switch (ctx->stage) {
3372         case MESA_SHADER_VERTEX:
3373                 ctx->special_uniforms = 1;
3374                 break;
3375
3376         default:
3377                 ctx->special_uniforms = 0;
3378                 break;
3379         }
3380
3381         /* Append epilogue uniforms if necessary. The cmdstream depends on
3382          * these being at the -end-; see assign_var_locations. */
3383
3384         if (ctx->stage == MESA_SHADER_VERTEX) {
3385                 nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "viewport");
3386         }
3387
3388         /* Assign var locations early, so the epilogue can use them if necessary */
3389
3390         nir_assign_var_locations(&nir->outputs, &nir->num_outputs, glsl_type_size);
3391         nir_assign_var_locations(&nir->inputs, &nir->num_inputs, glsl_type_size);
3392         nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, glsl_type_size);
3393
3394         /* Initialize at a global (not block) level hash tables */
3395
3396         ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
3397         ctx->ssa_varyings = _mesa_hash_table_u64_create(NULL);
3398         ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL);
3399         ctx->ssa_to_register = _mesa_hash_table_u64_create(NULL);
3400         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
3401         ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
3402
3403         /* Assign actual uniform location, skipping over samplers */
3404
3405         ctx->uniform_nir_to_mdg = _mesa_hash_table_u64_create(NULL);
3406
3407         nir_foreach_variable(var, &nir->uniforms) {
3408                 if (glsl_get_base_type(var->type) == GLSL_TYPE_SAMPLER) continue;
3409
                 /* Number of slots the variable occupies: try the
                  * array-of-arrays size, then the length, then the matrix
                  * column count, taking the first that is non-zero */
3410                 unsigned length = glsl_get_aoa_size(var->type);
3411
3412                 if (!length) {
3413                         length = glsl_get_length(var->type);
3414                 }
3415
3416                 if (!length) {
3417                         length = glsl_get_matrix_columns(var->type);
3418                 }
3419
                 /* Both the key and the stored id are biased by +1 --
                  * presumably so that 0/NULL can mean "no entry"; TODO
                  * confirm against the lookups */
3420                 for (int col = 0; col < length; ++col) {
3421                         int id = ctx->uniform_count++;
3422                         _mesa_hash_table_u64_insert(ctx->uniform_nir_to_mdg, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1)));
3423                 }
3424         }
3425
3426         if (ctx->stage == MESA_SHADER_VERTEX) {
3427                 ctx->varying_nir_to_mdg = _mesa_hash_table_u64_create(NULL);
3428
3429                 /* First, collect the special varyings */
3430                 nir_foreach_variable(var, &nir->outputs) {
3431                         if (var->data.location == VARYING_SLOT_POS) {
3432                                 /* Set position first, always. It takes up two
3433                                  * spots, the latter one is de facto unused (at
3434                                  * least from the shader's perspective), we
3435                                  * just need to skip over the spot*/
3436
3437                                 _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + 1, (void *) ((uintptr_t) (0 + 1)));
3438                                 ctx->varying_count = MAX2(ctx->varying_count, 2);
3439                         } else if (var->data.location == VARYING_SLOT_PSIZ) {
3440                                 /* Set point size second (third, see above) */
3441                                 _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + 1, (void *) ((uintptr_t) (2 + 1)));
3442                                 ctx->varying_count = MAX2(ctx->varying_count, 3);
3443
3444                                 program->writes_point_size = true;
3445                         }
3446                 }
3447
3448                 /* Now, collect normal varyings */
3449
3450                 nir_foreach_variable(var, &nir->outputs) {
3451                         if (var->data.location == VARYING_SLOT_POS || var->data.location == VARYING_SLOT_PSIZ) continue;
3452
                         /* One id per matrix column, continuing after the
                          * slots taken by the special varyings above */
3453                         for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) {
3454                                 int id = ctx->varying_count++;
3455                                 _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1)));
3456                         }
3457                 }
3458         }
3459
3460
3461
3462         /* Lower vars -- not I/O -- before epilogue */
3463
3464         NIR_PASS_V(nir, nir_lower_var_copies);
3465         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3466         NIR_PASS_V(nir, nir_split_var_copies);
3467         NIR_PASS_V(nir, nir_lower_var_copies);
3468         NIR_PASS_V(nir, nir_lower_global_vars_to_local);
3469         NIR_PASS_V(nir, nir_lower_var_copies);
3470         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3471         NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
3472
3473         /* Append vertex epilogue before optimisation, so the epilogue itself
3474          * is optimised */
3475
3476         if (ctx->stage == MESA_SHADER_VERTEX)
3477                 transform_position_writes(nir);
3478
3479         /* Optimisation passes */
3480
3481         optimise_nir(nir);
3482
         /* Unconditional dump of the optimised NIR to stdout */
3483         nir_print_shader(nir, stdout);
3484
3485         /* Assign counts, now that we're sure (post-optimisation) */
3486         program->uniform_count = nir->num_uniforms;
3487
         /* Attributes are the inputs of a vertex shader; varyings are the
          * outputs of a vertex shader or the inputs of a fragment shader.
          * Any other stage reports zero for both */
3488         program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
3489         program->varying_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_outputs : ((ctx->stage == MESA_SHADER_FRAGMENT) ? nir->num_inputs : 0);
3490
3491
         /* Emit MIR for the first function that has a body, then stop */
3492         nir_foreach_function(func, nir) {
3493                 if (!func->impl)
3494                         continue;
3495
3496                 list_inithead(&ctx->blocks);
3497                 ctx->block_count = 0;
3498                 ctx->func = func;
3499
3500                 emit_cf_list(ctx, &func->impl->body);
3501                 emit_block(ctx, func->impl->end_block);
3502
3503                 break; /* TODO: Multi-function shaders */
3504         }
3505
3506         util_dynarray_init(compiled, NULL);
3507
3508         /* Schedule! */
3509         schedule_program(ctx);
3510
3511         /* Now that all the bundles are scheduled and we can calculate block
3512          * sizes, emit actual branch instructions rather than placeholders */
3513
         /* Index of the block being walked; incremented at the end of each
          * mir_foreach_block iteration below */
3514         int br_block_idx = 0;
3515
3516         mir_foreach_block(ctx, block) {
3517                 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
3518                         for (int c = 0; c < bundle->instruction_count; ++c) {
3519                                 midgard_instruction *ins = &bundle->instructions[c];
3520
3521                                 if (!midgard_is_branch_unit(ins->unit)) continue;
3522
3523                                 if (ins->prepacked_branch) continue;
3524
3525                                 /* Determine the block we're jumping to */
3526                                 int target_number = ins->branch.target_block;
3527
3528                                 midgard_block *target = mir_get_block(ctx, target_number);
3529                                 assert(target);
3530
3531                                 /* Report the destination tag. */
3532                                 int dest_tag = midgard_get_first_tag_from_block(ctx, target_number);
3533
3534                                 /* Count up the number of quadwords we're jumping over. That is, the number of quadwords in each of the blocks between (br_block_idx, target_number) */
3535                                 int quadword_offset = 0;
3536
3537                                 if (target_number > br_block_idx) {
3538                                         /* Jump forward */
3539
                                         /* Sums the blocks strictly between
                                          * the current block and the target */
3540                                         for (int idx = br_block_idx + 1; idx < target_number; ++idx) {
3541                                                 midgard_block *blk = mir_get_block(ctx, idx);
3542                                                 assert(blk);
3543
3544                                                 quadword_offset += blk->quadword_count;
3545                                         }
3546                                 } else {
3547                                         /* Jump backwards */
3548
                                         /* Subtracts the current block and
                                          * every block back to and including
                                          * the target; a branch to the
                                          * current block itself also takes
                                          * this path */
3549                                         for (int idx = br_block_idx; idx >= target_number; --idx) {
3550                                                 midgard_block *blk = mir_get_block(ctx, idx);
3551                                                 assert(blk);
3552
3553                                                 quadword_offset -= blk->quadword_count;
3554                                         }
3555                                 }
3556
3557                                 bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT;
3558                                 bool is_conditional = ins->branch.conditional;
3559                                 bool is_inverted = ins->branch.invert_conditional;
3560
3561                                 /* Unconditional extended branches (far jumps)
3562                                  * have issues, so we always use a conditional
3563                                  * branch, setting the condition to always for
3564                                  * unconditional. For compact unconditional
3565                                  * branches, cond isn't used so it doesn't
3566                                  * matter what we pick. */
3567
3568                                 midgard_condition cond =
3569                                         !is_conditional ? midgard_condition_always :
3570                                         is_inverted ? midgard_condition_false :
3571                                         midgard_condition_true;
3572
                                 /* Three encodings: extended, compact
                                  * conditional and compact unconditional.
                                  * The packed form overwrites the
                                  * placeholder stored in the instruction */
3573                                 if (!is_compact) {
3574                                         midgard_branch_extended branch =
3575                                                 midgard_create_branch_extended(
3576                                                         cond,
3577                                                         midgard_jmp_writeout_op_branch_cond,
3578                                                         dest_tag,
3579                                                         quadword_offset);
3580
3581                                         memcpy(&ins->branch_extended, &branch, sizeof(branch));
3582                                 } else if (is_conditional) {
3583                                         midgard_branch_cond branch = {
3584                                                 .op = midgard_jmp_writeout_op_branch_cond,
3585                                                 .dest_tag = dest_tag,
3586                                                 .offset = quadword_offset,
3587                                                 .cond = cond
3588                                         };
3589
                                         /* Catches an offset that does not
                                          * survive being stored in the
                                          * branch's offset field */
3590                                         assert(branch.offset == quadword_offset);
3591
3592                                         memcpy(&ins->br_compact, &branch, sizeof(branch));
3593                                 } else {
3594                                         midgard_branch_uncond branch = {
3595                                                 .op = midgard_jmp_writeout_op_branch_uncond,
3596                                                 .dest_tag = dest_tag,
3597                                                 .offset = quadword_offset,
3598                                                 .unknown = 1
3599                                         };
3600
3601                                         assert(branch.offset == quadword_offset);
3602
3603                                         memcpy(&ins->br_compact, &branch, sizeof(branch));
3604                                 }
3605                         }
3606                 }
3607
3608                 ++br_block_idx;
3609         }
3610
3611         /* Emit flat binary from the instruction arrays. Iterate each block in
3612          * sequence. Save instruction boundaries such that lookahead tags can
3613          * be assigned easily */
3614
3615         /* Cache _all_ bundles in source order for lookahead across failed branches */
3616
3617         int bundle_count = 0;
3618         mir_foreach_block(ctx, block) {
3619                 bundle_count += block->bundles.size / sizeof(midgard_bundle);
3620         }
         /* NOTE(review): the allocation result is not checked before the
          * array is filled in */
3621         midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
3622         int bundle_idx = 0;
3623         mir_foreach_block(ctx, block) {
3624                 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
3625                         source_order_bundles[bundle_idx++] = bundle;
3626                 }
3627         }
3628
3629         int current_bundle = 0;
3630
3631         mir_foreach_block(ctx, block) {
3632                 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
                         /* Lookahead defaults to 1 when no bundle follows */
3633                         int lookahead = 1;
3634
3635                         if (current_bundle + 1 < bundle_count) {
3636                                 uint8_t next = source_order_bundles[current_bundle + 1]->tag;
3637
                                 /* If the following bundle is the very last
                                  * one and is an ALU bundle, 1 is used
                                  * instead of its tag; otherwise the
                                  * lookahead is the next bundle's tag */
3638                                 if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
3639                                         lookahead = 1;
3640                                 } else {
3641                                         lookahead = next;
3642                                 }
3643                         }
3644
3645                         emit_binary_bundle(ctx, bundle, compiled, lookahead);
3646                         ++current_bundle;
3647                 }
3648
3649                 /* TODO: Free deeper */
3650                 //util_dynarray_fini(&block->instructions);
3651         }
3652
3653         free(source_order_bundles);
3654
3655         /* Report the very first tag executed */
3656         program->first_tag = midgard_get_first_tag_from_block(ctx, 0);
3657
3658         /* Deal with off-by-one related to the fencepost problem */
3659         program->work_register_count = ctx->work_registers + 1;
3660
3661         program->can_discard = ctx->can_discard;
3662         program->uniform_cutoff = ctx->uniform_cutoff;
3663
3664         program->blend_patch_offset = ctx->blend_constant_offset;
3665
         /* Unconditional disassembly of the freshly emitted binary */
3666         disassemble_midgard(program->compiled.data, program->compiled.size);
3667
3668         return 0;
3669 }