src/gallium/drivers/etnaviv/etnaviv_compiler_nir_emit.h

   1 /*
   2  * Copyright (c) 2019 Zodiac Inflight Innovations
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sub license,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the
  12  * next paragraph) shall be included in all copies or substantial portions
  13  * of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Jonathan Marek <jonathan@marek.ca>
  25  */
  26
  27 #include "etnaviv_asm.h"
  28 #include "etnaviv_context.h"
  29
  30 #include "compiler/nir/nir.h"
  31 #include "compiler/nir/nir_builder.h"
  32 #include "compiler/nir/nir_worklist.h"
  33 #include "util/register_allocate.h"
  34
  35 #define ALU_SWIZ(s) INST_SWIZ((s)->swizzle[0], (s)->swizzle[1], (s)->swizzle[2], (s)->swizzle[3])
  36 #define SRC_DISABLE ((hw_src){})
  37 #define SRC_CONST(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_UNIFORM_0, .reg=idx, .swiz=s})
  38 #define SRC_REG(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_TEMP, .reg=idx, .swiz=s})
  39
  40 #define emit(type, args...) etna_emit_##type(state->c, args)
  41
  42 typedef struct etna_inst_dst hw_dst;
  43 typedef struct etna_inst_src hw_src;
  44
  45 enum {
  46    BYPASS_DST = 1,
  47    BYPASS_SRC = 2,
  48 };
  49
  50 struct state {
  51    struct etna_compile *c;
  52
  53    unsigned const_count;
  54
  55    nir_shader *shader;
  56    nir_function_impl *impl;
  57
  58    /* ra state */
  59    struct ra_graph *g;
  60    struct ra_regs *regs;
  61    unsigned *live_map;
  62    unsigned num_nodes;
  63 };
  64
  65 #define compile_error(ctx, args...) ({ \
  66    printf(args); \
  67    ctx->error = true; \
  68    assert(0); \
  69 })
  70
  71 static inline hw_src
  72 src_swizzle(hw_src src, unsigned swizzle)
  73 {
  74    if (src.rgroup != INST_RGROUP_IMMEDIATE)
  75       src.swiz = inst_swiz_compose(src.swiz, swizzle);
  76
  77    return src;
  78 }
  79
  80 static inline bool is_sysval(nir_instr *instr)
  81 {
  82    if (instr->type != nir_instr_type_intrinsic)
  83       return false;
  84
  85    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
  86    return intr->intrinsic == nir_intrinsic_load_front_face ||
  87           intr->intrinsic == nir_intrinsic_load_frag_coord;
  88 }
  89
  90 /* constants are represented as 64-bit ints
  91  * 32-bit for the value and 32-bit for the type (imm, uniform, etc)
  92  */
  93
  94 #define CONST_VAL(a, b) (nir_const_value) {.u64 = (uint64_t)(a) << 32 | (uint64_t)(b)}
  95 #define CONST(x) CONST_VAL(ETNA_IMMEDIATE_CONSTANT, x)
  96 #define UNIFORM(x) CONST_VAL(ETNA_IMMEDIATE_UNIFORM, x)
  97 #define TEXSCALE(x, i) CONST_VAL(ETNA_IMMEDIATE_TEXRECT_SCALE_X + (i), x)
  98
  99 static int
 100 const_add(uint64_t *c, uint64_t value)
 101 {
 102    for (unsigned i = 0; i < 4; i++) {
 103       if (c[i] == value || !c[i]) {
 104          c[i] = value;
 105          return i;
 106       }
 107    }
 108    return -1;
 109 }
 110
 111 static hw_src
 112 const_src(struct state *state, nir_const_value *value, unsigned num_components)
 113 {
 114    /* use inline immediates if possible */
 115    if (state->c->specs->halti >= 2 && num_components == 1 &&
 116        value[0].u64 >> 32 == ETNA_IMMEDIATE_CONSTANT) {
 117       uint32_t bits = value[0].u32;
 118
 119       /* "float" - shifted by 12 */
 120       if ((bits & 0xfff) == 0)
 121          return etna_immediate_src(0, bits >> 12);
 122
 123       /* "unsigned" - raw 20 bit value */
 124       if (bits < (1 << 20))
 125          return etna_immediate_src(2, bits);
 126
 127       /* "signed" - sign extended 20-bit (sign included) value */
 128       if (bits >= 0xfff80000)
 129          return etna_immediate_src(1, bits);
 130    }
 131
 132    unsigned i;
 133    int swiz = -1;
 134    for (i = 0; swiz < 0; i++) {
 135       uint64_t *a = &state->c->consts[i*4];
 136       uint64_t save[4];
 137       memcpy(save, a, sizeof(save));
 138       swiz = 0;
 139       for (unsigned j = 0; j < num_components; j++) {
 140          int c = const_add(a, value[j].u64);
 141          if (c < 0) {
 142             memcpy(a, save, sizeof(save));
 143             swiz = -1;
 144             break;
 145          }
 146          swiz |= c << j * 2;
 147       }
 148    }
 149
 150    assert(i <= ETNA_MAX_IMM / 4);
 151    state->const_count = MAX2(state->const_count, i);
 152
 153    return SRC_CONST(i - 1, swiz);
 154 }
 155
 156 struct ssa_reg {
 157    uint8_t idx;
 158    uint8_t src_swizzle;
 159    uint8_t dst_swizzle;
 160    uint8_t write_mask;
 161 };
 162
 163 /* Swizzles and write masks can be used to layer virtual non-interfering
 164  * registers on top of the real VEC4 registers. For example, the virtual
 165  * VEC3_XYZ register and the virtual SCALAR_W register that use the same
 166  * physical VEC4 base register do not interfere.
 167  */
 168 enum {
 169    REG_CLASS_VIRT_SCALAR,
 170    REG_CLASS_VIRT_VEC2,
 171    REG_CLASS_VIRT_VEC3,
 172    REG_CLASS_VEC4,
 173    /* special vec2 class for fast transcendentals, limited to XY or ZW */
 174    REG_CLASS_VIRT_VEC2T,
 175    /* special classes for LOAD - contiguous components */
 176    REG_CLASS_VIRT_VEC2C,
 177    REG_CLASS_VIRT_VEC3C,
 178    NUM_REG_CLASSES,
 179 } reg_class;
 180
 181 enum {
 182    REG_TYPE_VEC4,
 183    REG_TYPE_VIRT_VEC3_XYZ,
 184    REG_TYPE_VIRT_VEC3_XYW,
 185    REG_TYPE_VIRT_VEC3_XZW,
 186    REG_TYPE_VIRT_VEC3_YZW,
 187    REG_TYPE_VIRT_VEC2_XY,
 188    REG_TYPE_VIRT_VEC2_XZ,
 189    REG_TYPE_VIRT_VEC2_XW,
 190    REG_TYPE_VIRT_VEC2_YZ,
 191    REG_TYPE_VIRT_VEC2_YW,
 192    REG_TYPE_VIRT_VEC2_ZW,
 193    REG_TYPE_VIRT_SCALAR_X,
 194    REG_TYPE_VIRT_SCALAR_Y,
 195    REG_TYPE_VIRT_SCALAR_Z,
 196    REG_TYPE_VIRT_SCALAR_W,
 197    REG_TYPE_VIRT_VEC2T_XY,
 198    REG_TYPE_VIRT_VEC2T_ZW,
 199    REG_TYPE_VIRT_VEC2C_XY,
 200    REG_TYPE_VIRT_VEC2C_YZ,
 201    REG_TYPE_VIRT_VEC2C_ZW,
 202    REG_TYPE_VIRT_VEC3C_XYZ,
 203    REG_TYPE_VIRT_VEC3C_YZW,
 204    NUM_REG_TYPES,
 205 } reg_type;
 206
 207 /* writemask when used as dest */
 208 static const uint8_t
 209 reg_writemask[NUM_REG_TYPES] = {
 210    [REG_TYPE_VEC4] = 0xf,
 211    [REG_TYPE_VIRT_SCALAR_X] = 0x1,
 212    [REG_TYPE_VIRT_SCALAR_Y] = 0x2,
 213    [REG_TYPE_VIRT_VEC2_XY] = 0x3,
 214    [REG_TYPE_VIRT_VEC2T_XY] = 0x3,
 215    [REG_TYPE_VIRT_VEC2C_XY] = 0x3,
 216    [REG_TYPE_VIRT_SCALAR_Z] = 0x4,
 217    [REG_TYPE_VIRT_VEC2_XZ] = 0x5,
 218    [REG_TYPE_VIRT_VEC2_YZ] = 0x6,
 219    [REG_TYPE_VIRT_VEC2C_YZ] = 0x6,
 220    [REG_TYPE_VIRT_VEC3_XYZ] = 0x7,
 221    [REG_TYPE_VIRT_VEC3C_XYZ] = 0x7,
 222    [REG_TYPE_VIRT_SCALAR_W] = 0x8,
 223    [REG_TYPE_VIRT_VEC2_XW] = 0x9,
 224    [REG_TYPE_VIRT_VEC2_YW] = 0xa,
 225    [REG_TYPE_VIRT_VEC3_XYW] = 0xb,
 226    [REG_TYPE_VIRT_VEC2_ZW] = 0xc,
 227    [REG_TYPE_VIRT_VEC2T_ZW] = 0xc,
 228    [REG_TYPE_VIRT_VEC2C_ZW] = 0xc,
 229    [REG_TYPE_VIRT_VEC3_XZW] = 0xd,
 230    [REG_TYPE_VIRT_VEC3_YZW] = 0xe,
 231    [REG_TYPE_VIRT_VEC3C_YZW] = 0xe,
 232 };
 233
 234 /* how to swizzle when used as a src */
 235 static const uint8_t
 236 reg_swiz[NUM_REG_TYPES] = {
 237    [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
 238    [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
 239    [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(Y, Y, Y, Y),
 240    [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
 241    [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
 242    [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
 243    [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(Z, Z, Z, Z),
 244    [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, Z, X, Z),
 245    [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(Y, Z, Y, Z),
 246    [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(Y, Z, Y, Z),
 247    [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
 248    [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
 249    [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(W, W, W, W),
 250    [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, W, X, W),
 251    [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(Y, W, Y, W),
 252    [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, W, X),
 253    [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(Z, W, Z, W),
 254    [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(Z, W, Z, W),
 255    [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(Z, W, Z, W),
 256    [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Z, W, X),
 257    [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(Y, Z, W, X),
 258    [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(Y, Z, W, X),
 259 };
 260
 261 /* how to swizzle when used as a dest */
 262 static const uint8_t
 263 reg_dst_swiz[NUM_REG_TYPES] = {
 264    [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
 265    [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
 266    [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(X, X, X, X),
 267    [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
 268    [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
 269    [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
 270    [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(X, X, X, X),
 271    [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, X, Y, Y),
 272    [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(X, X, Y, Y),
 273    [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(X, X, Y, Y),
 274    [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
 275    [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
 276    [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(X, X, X, X),
 277    [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, X, Y, Y),
 278    [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(X, X, Y, Y),
 279    [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, Z, Z),
 280    [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(X, X, X, Y),
 281    [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(X, X, X, Y),
 282    [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(X, X, X, Y),
 283    [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Y, Y, Z),
 284    [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(X, X, Y, Z),
 285    [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(X, X, Y, Z),
 286 };
 287
 288 static inline int reg_get_type(int virt_reg)
 289 {
 290    return virt_reg % NUM_REG_TYPES;
 291 }
 292
 293 static inline int reg_get_base(struct state *state, int virt_reg)
 294 {
 295    /* offset by 1 to avoid reserved position register */
 296    if (state->shader->info.stage == MESA_SHADER_FRAGMENT)
 297       return (virt_reg / NUM_REG_TYPES + 1) % ETNA_MAX_TEMPS;
 298    return virt_reg / NUM_REG_TYPES;
 299 }
 300
 301 /* use "r63.z" for depth reg, it will wrap around to r0.z by reg_get_base
 302  * (fs registers are offset by 1 to avoid reserving r0)
 303  */
 304 #define REG_FRAG_DEPTH ((ETNA_MAX_TEMPS - 1) * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Z)
 305
 306 static inline int reg_get_class(int virt_reg)
 307 {
 308    switch (reg_get_type(virt_reg)) {
 309    case REG_TYPE_VEC4:
 310       return REG_CLASS_VEC4;
 311    case REG_TYPE_VIRT_VEC3_XYZ:
 312    case REG_TYPE_VIRT_VEC3_XYW:
 313    case REG_TYPE_VIRT_VEC3_XZW:
 314    case REG_TYPE_VIRT_VEC3_YZW:
 315       return REG_CLASS_VIRT_VEC3;
 316    case REG_TYPE_VIRT_VEC2_XY:
 317    case REG_TYPE_VIRT_VEC2_XZ:
 318    case REG_TYPE_VIRT_VEC2_XW:
 319    case REG_TYPE_VIRT_VEC2_YZ:
 320    case REG_TYPE_VIRT_VEC2_YW:
 321    case REG_TYPE_VIRT_VEC2_ZW:
 322       return REG_CLASS_VIRT_VEC2;
 323    case REG_TYPE_VIRT_SCALAR_X:
 324    case REG_TYPE_VIRT_SCALAR_Y:
 325    case REG_TYPE_VIRT_SCALAR_Z:
 326    case REG_TYPE_VIRT_SCALAR_W:
 327       return REG_CLASS_VIRT_SCALAR;
 328    case REG_TYPE_VIRT_VEC2T_XY:
 329    case REG_TYPE_VIRT_VEC2T_ZW:
 330       return REG_CLASS_VIRT_VEC2T;
 331    case REG_TYPE_VIRT_VEC2C_XY:
 332    case REG_TYPE_VIRT_VEC2C_YZ:
 333    case REG_TYPE_VIRT_VEC2C_ZW:
 334       return REG_CLASS_VIRT_VEC2C;
 335    case REG_TYPE_VIRT_VEC3C_XYZ:
 336    case REG_TYPE_VIRT_VEC3C_YZW:
 337       return REG_CLASS_VIRT_VEC3C;
 338    }
 339
 340    assert(false);
 341    return 0;
 342 }
 343
 344 /* get unique ssa/reg index for nir_src */
 345 static unsigned
 346 src_index(nir_function_impl *impl, nir_src *src)
 347 {
 348    return src->is_ssa ? src->ssa->index : (src->reg.reg->index + impl->ssa_alloc);
 349 }
 350
 351 /* get unique ssa/reg index for nir_dest */
 352 static unsigned
 353 dest_index(nir_function_impl *impl, nir_dest *dest)
 354 {
 355    return dest->is_ssa ? dest->ssa.index : (dest->reg.reg->index + impl->ssa_alloc);
 356 }
 357
 358 /* nir_src to allocated register */
 359 static hw_src
 360 ra_src(struct state *state, nir_src *src)
 361 {
 362    unsigned reg = ra_get_node_reg(state->g, state->live_map[src_index(state->impl, src)]);
 363    return SRC_REG(reg_get_base(state, reg), reg_swiz[reg_get_type(reg)]);
 364 }
 365
 366 static hw_src
 367 get_src(struct state *state, nir_src *src)
 368 {
 369    if (!src->is_ssa)
 370       return ra_src(state, src);
 371
 372    nir_instr *instr = src->ssa->parent_instr;
 373
 374    if (instr->pass_flags & BYPASS_SRC) {
 375       assert(instr->type == nir_instr_type_alu);
 376       nir_alu_instr *alu = nir_instr_as_alu(instr);
 377       assert(alu->op == nir_op_mov);
 378       return src_swizzle(get_src(state, &alu->src[0].src), ALU_SWIZ(&alu->src[0]));
 379    }
 380
 381    switch (instr->type) {
 382    case nir_instr_type_load_const:
 383       return const_src(state, nir_instr_as_load_const(instr)->value, src->ssa->num_components);
 384    case nir_instr_type_intrinsic: {
 385       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 386       switch (intr->intrinsic) {
 387       case nir_intrinsic_load_input:
 388       case nir_intrinsic_load_instance_id:
 389       case nir_intrinsic_load_uniform:
 390       case nir_intrinsic_load_ubo:
 391          return ra_src(state, src);
 392       case nir_intrinsic_load_front_face:
 393          return (hw_src) { .use = 1, .rgroup = INST_RGROUP_INTERNAL };
 394       case nir_intrinsic_load_frag_coord:
 395          return SRC_REG(0, INST_SWIZ_IDENTITY);
 396       default:
 397          compile_error(state->c, "Unhandled NIR intrinsic type: %s\n",
 398                        nir_intrinsic_infos[intr->intrinsic].name);
 399          break;
 400       }
 401    } break;
 402    case nir_instr_type_alu:
 403    case nir_instr_type_tex:
 404       return ra_src(state, src);
 405    case nir_instr_type_ssa_undef: {
 406       /* return zero to deal with broken Blur demo */
 407       nir_const_value value = CONST(0);
 408       return src_swizzle(const_src(state, &value, 1), SWIZZLE(X,X,X,X));
 409    }
 410    default:
 411       compile_error(state->c, "Unhandled NIR instruction type: %d\n", instr->type);
 412       break;
 413    }
 414
 415    return SRC_DISABLE;
 416 }
 417
 418 static void
 419 update_swiz_mask(nir_alu_instr *alu, nir_dest *dest, unsigned *swiz, unsigned *mask)
 420 {
 421    if (!swiz)
 422       return;
 423
 424    bool is_vec = dest != NULL;
 425    unsigned swizzle = 0, write_mask = 0;
 426    for (unsigned i = 0; i < 4; i++) {
 427       /* channel not written */
 428       if (!(alu->dest.write_mask & (1 << i)))
 429          continue;
 430       /* src is different (only check for vecN) */
 431       if (is_vec && alu->src[i].src.ssa != &dest->ssa)
 432          continue;
 433
 434       unsigned src_swiz = is_vec ? alu->src[i].swizzle[0] : alu->src[0].swizzle[i];
 435       swizzle |= (*swiz >> src_swiz * 2 & 3) << i * 2;
 436       /* this channel isn't written through this chain */
 437       if (*mask & (1 << src_swiz))
 438          write_mask |= 1 << i;
 439    }
 440    *swiz = swizzle;
 441    *mask = write_mask;
 442 }
 443
 444 static bool
 445 vec_dest_has_swizzle(nir_alu_instr *vec, nir_ssa_def *ssa)
 446 {
 447    for (unsigned i = 0; i < 4; i++) {
 448       if (!(vec->dest.write_mask & (1 << i)) || vec->src[i].src.ssa != ssa)
 449          continue;
 450
 451       if (vec->src[i].swizzle[0] != i)
 452          return true;
 453    }
 454
 455    /* don't deal with possible bypassed vec/mov chain */
 456    nir_foreach_use(use_src, ssa) {
 457       nir_instr *instr = use_src->parent_instr;
 458       if (instr->type != nir_instr_type_alu)
 459          continue;
 460
 461       nir_alu_instr *alu = nir_instr_as_alu(instr);
 462
 463       switch (alu->op) {
 464       case nir_op_mov:
 465       case nir_op_vec2:
 466       case nir_op_vec3:
 467       case nir_op_vec4:
 468          return true;
 469       default:
 470          break;
 471       }
 472    }
 473    return false;
 474 }
 475
 476 static nir_dest *
 477 real_dest(nir_dest *dest, unsigned *swiz, unsigned *mask)
 478 {
 479    if (!dest || !dest->is_ssa)
 480       return dest;
 481
 482    bool can_bypass_src = !list_length(&dest->ssa.if_uses);
 483    nir_instr *p_instr = dest->ssa.parent_instr;
 484
 485    /* if used by a vecN, the "real" destination becomes the vecN destination
 486     * lower_alu guarantees that values used by a vecN are only used by that vecN
 487     * we can apply the same logic to movs in a some cases too
 488     */
 489    nir_foreach_use(use_src, &dest->ssa) {
 490       nir_instr *instr = use_src->parent_instr;
 491
 492       /* src bypass check: for now only deal with tex src mov case
 493        * note: for alu don't bypass mov for multiple uniform sources
 494        */
 495       switch (instr->type) {
 496       case nir_instr_type_tex:
 497          if (p_instr->type == nir_instr_type_alu &&
 498              nir_instr_as_alu(p_instr)->op == nir_op_mov) {
 499             break;
 500          }
 501       default:
 502          can_bypass_src = false;
 503          break;
 504       }
 505
 506       if (instr->type != nir_instr_type_alu)
 507          continue;
 508
 509       nir_alu_instr *alu = nir_instr_as_alu(instr);
 510
 511       switch (alu->op) {
 512       case nir_op_vec2:
 513       case nir_op_vec3:
 514       case nir_op_vec4:
 515          assert(list_length(&dest->ssa.if_uses) == 0);
 516          nir_foreach_use(use_src, &dest->ssa)
 517             assert(use_src->parent_instr == instr);
 518
 519          update_swiz_mask(alu, dest, swiz, mask);
 520          break;
 521       case nir_op_mov: {
 522          switch (dest->ssa.parent_instr->type) {
 523          case nir_instr_type_alu:
 524          case nir_instr_type_tex:
 525             break;
 526          default:
 527             continue;
 528          }
 529          if (list_length(&dest->ssa.if_uses) || list_length(&dest->ssa.uses) > 1)
 530             continue;
 531
 532          update_swiz_mask(alu, NULL, swiz, mask);
 533          break;
 534       };
 535       default:
 536          continue;
 537       }
 538
 539       assert(!(instr->pass_flags & BYPASS_SRC));
 540       instr->pass_flags |= BYPASS_DST;
 541       return real_dest(&alu->dest.dest, swiz, mask);
 542    }
 543
 544    if (can_bypass_src && !(p_instr->pass_flags & BYPASS_DST)) {
 545       p_instr->pass_flags |= BYPASS_SRC;
 546       return NULL;
 547    }
 548
 549    return dest;
 550 }
 551
 552 /* get allocated dest register for nir_dest
 553  * *p_swiz tells how the components need to be placed into register
 554  */
 555 static hw_dst
 556 ra_dest(struct state *state, nir_dest *dest, unsigned *p_swiz)
 557 {
 558    unsigned swiz = INST_SWIZ_IDENTITY, mask = 0xf;
 559    dest = real_dest(dest, &swiz, &mask);
 560
 561    unsigned r = ra_get_node_reg(state->g, state->live_map[dest_index(state->impl, dest)]);
 562    unsigned t = reg_get_type(r);
 563
 564    *p_swiz = inst_swiz_compose(swiz, reg_dst_swiz[t]);
 565
 566    return (hw_dst) {
 567       .use = 1,
 568       .reg = reg_get_base(state, r),
 569       .write_mask = inst_write_mask_compose(mask, reg_writemask[t]),
 570    };
 571 }
 572
 573 /* if instruction dest needs a register, return nir_dest for it */
 574 static nir_dest *
 575 dest_for_instr(nir_instr *instr)
 576 {
 577    nir_dest *dest = NULL;
 578
 579    switch (instr->type) {
 580    case nir_instr_type_alu:
 581       dest = &nir_instr_as_alu(instr)->dest.dest;
 582       break;
 583    case nir_instr_type_tex:
 584       dest = &nir_instr_as_tex(instr)->dest;
 585       break;
 586    case nir_instr_type_intrinsic: {
 587       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 588       if (intr->intrinsic == nir_intrinsic_load_uniform ||
 589           intr->intrinsic == nir_intrinsic_load_ubo ||
 590           intr->intrinsic == nir_intrinsic_load_input ||
 591           intr->intrinsic == nir_intrinsic_load_instance_id)
 592          dest = &intr->dest;
 593    } break;
 594    case nir_instr_type_deref:
 595       return NULL;
 596    default:
 597       break;
 598    }
 599    return real_dest(dest, NULL, NULL);
 600 }
 601
 602 struct live_def {
 603    nir_instr *instr;
 604    nir_dest *dest; /* cached dest_for_instr */
 605    unsigned live_start, live_end; /* live range */
 606 };
 607
 608 static void
 609 range_include(struct live_def *def, unsigned index)
 610 {
 611    if (def->live_start > index)
 612       def->live_start = index;
 613    if (def->live_end < index)
 614       def->live_end = index;
 615 }
 616
 617 struct live_defs_state {
 618    unsigned num_defs;
 619    unsigned bitset_words;
 620
 621    nir_function_impl *impl;
 622    nir_block *block; /* current block pointer */
 623    unsigned index; /* current live index */
 624
 625    struct live_def *defs;
 626    unsigned *live_map; /* to map ssa/reg index into defs array */
 627
 628    nir_block_worklist worklist;
 629 };
 630
 631 static bool
 632 init_liveness_block(nir_block *block,
 633                     struct live_defs_state *state)
 634 {
 635    block->live_in = reralloc(block, block->live_in, BITSET_WORD,
 636                              state->bitset_words);
 637    memset(block->live_in, 0, state->bitset_words * sizeof(BITSET_WORD));
 638
 639    block->live_out = reralloc(block, block->live_out, BITSET_WORD,
 640                               state->bitset_words);
 641    memset(block->live_out, 0, state->bitset_words * sizeof(BITSET_WORD));
 642
 643    nir_block_worklist_push_head(&state->worklist, block);
 644
 645    return true;
 646 }
 647
 648 static bool
 649 set_src_live(nir_src *src, void *void_state)
 650 {
 651    struct live_defs_state *state = void_state;
 652
 653    if (src->is_ssa) {
 654       nir_instr *instr = src->ssa->parent_instr;
 655
 656       if (is_sysval(instr) || instr->type == nir_instr_type_deref)
 657          return true;
 658
 659       switch (instr->type) {
 660       case nir_instr_type_load_const:
 661       case nir_instr_type_ssa_undef:
 662          return true;
 663       case nir_instr_type_alu: {
 664          /* alu op bypass */
 665          nir_alu_instr *alu = nir_instr_as_alu(instr);
 666          if (instr->pass_flags & BYPASS_SRC) {
 667             for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
 668                set_src_live(&alu->src[i].src, state);
 669             return true;
 670          }
 671       } break;
 672       default:
 673          break;
 674       }
 675    }
 676
 677    unsigned i = state->live_map[src_index(state->impl, src)];
 678    assert(i != ~0u);
 679
 680    BITSET_SET(state->block->live_in, i);
 681    range_include(&state->defs[i], state->index);
 682
 683    return true;
 684 }
 685
 686 static bool
 687 propagate_across_edge(nir_block *pred, nir_block *succ,
 688                       struct live_defs_state *state)
 689 {
 690    BITSET_WORD progress = 0;
 691    for (unsigned i = 0; i < state->bitset_words; ++i) {
 692       progress |= succ->live_in[i] & ~pred->live_out[i];
 693       pred->live_out[i] |= succ->live_in[i];
 694    }
 695    return progress != 0;
 696 }
 697
 698 static unsigned
 699 live_defs(nir_function_impl *impl, struct live_def *defs, unsigned *live_map)
 700 {
 701    struct live_defs_state state;
 702    unsigned block_live_index[impl->num_blocks + 1];
 703
 704    state.impl = impl;
 705    state.defs = defs;
 706    state.live_map = live_map;
 707
 708    state.num_defs = 0;
 709    nir_foreach_block(block, impl) {
 710       block_live_index[block->index] = state.num_defs;
 711       nir_foreach_instr(instr, block) {
 712          nir_dest *dest = dest_for_instr(instr);
 713          if (!dest)
 714             continue;
 715
 716          unsigned idx = dest_index(impl, dest);
 717          /* register is already in defs */
 718          if (live_map[idx] != ~0u)
 719             continue;
 720
 721          defs[state.num_defs] = (struct live_def) {instr, dest, state.num_defs, 0};
 722
 723          /* input live from the start */
 724          if (instr->type == nir_instr_type_intrinsic) {
 725             nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 726             if (intr->intrinsic == nir_intrinsic_load_input ||
 727                 intr->intrinsic == nir_intrinsic_load_instance_id)
 728                defs[state.num_defs].live_start = 0;
 729          }
 730
 731          live_map[idx] = state.num_defs;
 732          state.num_defs++;
 733       }
 734    }
 735    block_live_index[impl->num_blocks] = state.num_defs;
 736
 737    nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
 738
 739    /* We now know how many unique ssa definitions we have and we can go
 740     * ahead and allocate live_in and live_out sets and add all of the
 741     * blocks to the worklist.
 742     */
 743    state.bitset_words = BITSET_WORDS(state.num_defs);
 744    nir_foreach_block(block, impl) {
 745       init_liveness_block(block, &state);
 746    }
 747
 748    /* We're now ready to work through the worklist and update the liveness
 749     * sets of each of the blocks.  By the time we get to this point, every
 750     * block in the function implementation has been pushed onto the
 751     * worklist in reverse order.  As long as we keep the worklist
 752     * up-to-date as we go, everything will get covered.
 753     */
 754    while (!nir_block_worklist_is_empty(&state.worklist)) {
 755       /* We pop them off in the reverse order we pushed them on.  This way
 756        * the first walk of the instructions is backwards so we only walk
 757        * once in the case of no control flow.
 758        */
 759       nir_block *block = nir_block_worklist_pop_head(&state.worklist);
 760       state.block = block;
 761
 762       memcpy(block->live_in, block->live_out,
 763              state.bitset_words * sizeof(BITSET_WORD));
 764
 765       state.index = block_live_index[block->index + 1];
 766
 767       nir_if *following_if = nir_block_get_following_if(block);
 768       if (following_if)
 769          set_src_live(&following_if->condition, &state);
 770
 771       nir_foreach_instr_reverse(instr, block) {
 772          /* when we come across the next "live" instruction, decrement index */
 773          if (state.index && instr == defs[state.index - 1].instr) {
 774             state.index--;
 775             /* the only source of writes to registers is phis:
 776              * we don't expect any partial write_mask alus
 777              * so clearing live_in here is OK
 778              */
 779             BITSET_CLEAR(block->live_in, state.index);
 780          }
 781
 782          /* don't set_src_live for not-emitted instructions */
 783          if (instr->pass_flags)
 784             continue;
 785
 786          unsigned index = state.index;
 787
 788          /* output live till the end */
 789          if (instr->type == nir_instr_type_intrinsic) {
 790             nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 791             if (intr->intrinsic == nir_intrinsic_store_deref)
 792                state.index = ~0u;
 793          }
 794
 795          nir_foreach_src(instr, set_src_live, &state);
 796
 797          state.index = index;
 798       }
 799       assert(state.index == block_live_index[block->index]);
 800
 801       /* Walk over all of the predecessors of the current block updating
 802        * their live in with the live out of this one.  If anything has
 803        * changed, add the predecessor to the work list so that we ensure
 804        * that the new information is used.
 805        */
 806       set_foreach(block->predecessors, entry) {
 807          nir_block *pred = (nir_block *)entry->key;
 808          if (propagate_across_edge(pred, block, &state))
 809             nir_block_worklist_push_tail(&state.worklist, pred);
 810       }
 811    }
 812
 813    nir_block_worklist_fini(&state.worklist);
 814
 815    /* apply live_in/live_out to ranges */
 816
 817    nir_foreach_block(block, impl) {
 818       int i;
 819
 820       BITSET_FOREACH_SET(i, block->live_in, state.num_defs)
 821          range_include(&state.defs[i], block_live_index[block->index]);
 822
 823       BITSET_FOREACH_SET(i, block->live_out, state.num_defs)
 824          range_include(&state.defs[i], block_live_index[block->index + 1]);
 825    }
 826
 827    return state.num_defs;
 828 }
 829
 830 /* precomputed by register_allocate  */
 831 static unsigned int *q_values[] = {
 832    (unsigned int[]) {1, 2, 3, 4, 2, 2, 3, },
 833    (unsigned int[]) {3, 5, 6, 6, 5, 5, 6, },
 834    (unsigned int[]) {3, 4, 4, 4, 4, 4, 4, },
 835    (unsigned int[]) {1, 1, 1, 1, 1, 1, 1, },
 836    (unsigned int[]) {1, 2, 2, 2, 1, 2, 2, },
 837    (unsigned int[]) {2, 3, 3, 3, 2, 3, 3, },
 838    (unsigned int[]) {2, 2, 2, 2, 2, 2, 2, },
 839 };
 840
 841 static void
 842 ra_assign(struct state *state, nir_shader *shader)
 843 {
 844    struct ra_regs *regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS *
 845                   NUM_REG_TYPES, false);
 846
 847    /* classes always be created from index 0, so equal to the class enum
 848     * which represents a register with (c+1) components
 849     */
 850    for (int c = 0; c < NUM_REG_CLASSES; c++)
 851       ra_alloc_reg_class(regs);
 852    /* add each register of each class */
 853    for (int r = 0; r < NUM_REG_TYPES * ETNA_MAX_TEMPS; r++)
 854       ra_class_add_reg(regs, reg_get_class(r), r);
 855    /* set conflicts */
 856    for (int r = 0; r < ETNA_MAX_TEMPS; r++) {
 857       for (int i = 0; i < NUM_REG_TYPES; i++) {
 858          for (int j = 0; j < i; j++) {
 859             if (reg_writemask[i] & reg_writemask[j]) {
 860                ra_add_reg_conflict(regs, NUM_REG_TYPES * r + i,
 861                                          NUM_REG_TYPES * r + j);
 862             }
 863          }
 864       }
 865    }
 866    ra_set_finalize(regs, q_values);
 867
 868    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
 869
 870    /* liveness and interference */
 871
 872    nir_index_blocks(impl);
 873    nir_index_ssa_defs(impl);
 874    nir_foreach_block(block, impl) {
 875       nir_foreach_instr(instr, block)
 876          instr->pass_flags = 0;
 877    }
 878
 879    /* this gives an approximation/upper limit on how many nodes are needed
 880     * (some ssa values do not represent an allocated register)
 881     */
 882    unsigned max_nodes = impl->ssa_alloc + impl->reg_alloc;
 883    unsigned *live_map = ralloc_array(NULL, unsigned, max_nodes);
 884    memset(live_map, 0xff, sizeof(unsigned) * max_nodes);
 885    struct live_def *defs = rzalloc_array(NULL, struct live_def, max_nodes);
 886
 887    unsigned num_nodes = live_defs(impl, defs, live_map);
 888    struct ra_graph *g = ra_alloc_interference_graph(regs, num_nodes);
 889
 890    /* set classes from num_components */
 891    for (unsigned i = 0; i < num_nodes; i++) {
 892       nir_instr *instr = defs[i].instr;
 893       nir_dest *dest = defs[i].dest;
 894       unsigned c = nir_dest_num_components(*dest) - 1;
 895
 896       if (instr->type == nir_instr_type_alu &&
 897           state->c->specs->has_new_transcendentals) {
 898          switch (nir_instr_as_alu(instr)->op) {
 899          case nir_op_fdiv:
 900          case nir_op_flog2:
 901          case nir_op_fsin:
 902          case nir_op_fcos:
 903             assert(dest->is_ssa);
 904             c = REG_CLASS_VIRT_VEC2T;
 905          default:
 906             break;
 907          }
 908       }
 909
 910       if (instr->type == nir_instr_type_intrinsic) {
 911          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 912          /* can't have dst swizzle or sparse writemask on UBO loads */
 913          if (intr->intrinsic == nir_intrinsic_load_ubo) {
 914             assert(dest == &intr->dest);
 915             if (dest->ssa.num_components == 2)
 916                c = REG_CLASS_VIRT_VEC2C;
 917             if (dest->ssa.num_components == 3)
 918                c = REG_CLASS_VIRT_VEC3C;
 919          }
 920       }
 921
 922       ra_set_node_class(g, i, c);
 923    }
 924
 925    nir_foreach_block(block, impl) {
 926       nir_foreach_instr(instr, block) {
 927          if (instr->type != nir_instr_type_intrinsic)
 928             continue;
 929
 930          nir_dest *dest = dest_for_instr(instr);
 931          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 932          unsigned reg;
 933
 934          switch (intr->intrinsic) {
 935          case nir_intrinsic_store_deref: {
 936             /* don't want outputs to be swizzled
 937              * TODO: better would be to set the type to X/XY/XYZ/XYZW
 938              * TODO: what if fragcoord.z is read after writing fragdepth?
 939              */
 940             nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
 941             unsigned index = live_map[src_index(impl, &intr->src[1])];
 942
 943             if (shader->info.stage == MESA_SHADER_FRAGMENT &&
 944                 deref->var->data.location == FRAG_RESULT_DEPTH) {
 945                ra_set_node_reg(g, index, REG_FRAG_DEPTH);
 946             } else {
 947                ra_set_node_class(g, index, REG_CLASS_VEC4);
 948             }
 949          } continue;
 950          case nir_intrinsic_load_input:
 951             reg = nir_intrinsic_base(intr) * NUM_REG_TYPES + (unsigned[]) {
 952                REG_TYPE_VIRT_SCALAR_X,
 953                REG_TYPE_VIRT_VEC2_XY,
 954                REG_TYPE_VIRT_VEC3_XYZ,
 955                REG_TYPE_VEC4,
 956             }[nir_dest_num_components(*dest) - 1];
 957             break;
 958          case nir_intrinsic_load_instance_id:
 959             reg = state->c->variant->infile.num_reg * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Y;
 960             break;
 961          default:
 962             continue;
 963          }
 964
 965          ra_set_node_reg(g, live_map[dest_index(impl, dest)], reg);
 966       }
 967    }
 968
 969    /* add interference for intersecting live ranges */
 970    for (unsigned i = 0; i < num_nodes; i++) {
 971       assert(defs[i].live_start < defs[i].live_end);
 972       for (unsigned j = 0; j < i; j++) {
 973          if (defs[i].live_start >= defs[j].live_end || defs[j].live_start >= defs[i].live_end)
 974             continue;
 975          ra_add_node_interference(g, i, j);
 976       }
 977    }
 978
 979    ralloc_free(defs);
 980
 981    /* Allocate registers */
 982    ASSERTED bool ok = ra_allocate(g);
 983    assert(ok);
 984
 985    state->g = g;
 986    state->regs = regs;
 987    state->live_map = live_map;
 988    state->num_nodes = num_nodes;
 989 }
 990
 991 static unsigned
 992 ra_finish(struct state *state)
 993 {
 994    /* TODO: better way to get number of registers used? */
 995    unsigned j = 0;
 996    for (unsigned i = 0; i < state->num_nodes; i++) {
 997       j = MAX2(j, reg_get_base(state, ra_get_node_reg(state->g, i)) + 1);
 998    }
 999
1000    ralloc_free(state->g);
1001    ralloc_free(state->regs);
1002    ralloc_free(state->live_map);
1003
1004    return j;
1005 }
1006
1007 static void
1008 emit_alu(struct state *state, nir_alu_instr * alu)
1009 {
1010    const nir_op_info *info = &nir_op_infos[alu->op];
1011
1012    /* marked as dead instruction (vecN and other bypassed instr) */
1013    if (alu->instr.pass_flags)
1014       return;
1015
1016    assert(!(alu->op >= nir_op_vec2 && alu->op <= nir_op_vec4));
1017
1018    unsigned dst_swiz;
1019    hw_dst dst = ra_dest(state, &alu->dest.dest, &dst_swiz);
1020
1021    /* compose alu write_mask with RA write mask */
1022    if (!alu->dest.dest.is_ssa)
1023       dst.write_mask = inst_write_mask_compose(alu->dest.write_mask, dst.write_mask);
1024
1025    switch (alu->op) {
1026    case nir_op_fdot2:
1027    case nir_op_fdot3:
1028    case nir_op_fdot4:
1029       /* not per-component - don't compose dst_swiz */
1030       dst_swiz = INST_SWIZ_IDENTITY;
1031       break;
1032    default:
1033       break;
1034    }
1035
1036    hw_src srcs[3];
1037
1038    for (int i = 0; i < info->num_inputs; i++) {
1039       nir_alu_src *asrc = &alu->src[i];
1040       hw_src src;
1041
1042       src = src_swizzle(get_src(state, &asrc->src), ALU_SWIZ(asrc));
1043       src = src_swizzle(src, dst_swiz);
1044
1045       if (src.rgroup != INST_RGROUP_IMMEDIATE) {
1046          src.neg = asrc->negate || (alu->op == nir_op_fneg);
1047          src.abs = asrc->abs || (alu->op == nir_op_fabs);
1048       } else {
1049          assert(!asrc->negate && alu->op != nir_op_fneg);
1050          assert(!asrc->abs && alu->op != nir_op_fabs);
1051       }
1052
1053       srcs[i] = src;
1054    }
1055
1056    emit(alu, alu->op, dst, srcs, alu->dest.saturate || (alu->op == nir_op_fsat));
1057 }
1058
1059 static void
1060 emit_tex(struct state *state, nir_tex_instr * tex)
1061 {
1062    unsigned dst_swiz;
1063    hw_dst dst = ra_dest(state, &tex->dest, &dst_swiz);
1064    nir_src *coord = NULL, *lod_bias = NULL, *compare = NULL;
1065
1066    for (unsigned i = 0; i < tex->num_srcs; i++) {
1067       switch (tex->src[i].src_type) {
1068       case nir_tex_src_coord:
1069          coord = &tex->src[i].src;
1070          break;
1071       case nir_tex_src_bias:
1072       case nir_tex_src_lod:
1073          assert(!lod_bias);
1074          lod_bias = &tex->src[i].src;
1075          break;
1076       case nir_tex_src_comparator:
1077          compare = &tex->src[i].src;
1078          break;
1079       default:
1080          compile_error(state->c, "Unhandled NIR tex src type: %d\n",
1081                        tex->src[i].src_type);
1082          break;
1083       }
1084    }
1085
1086    emit(tex, tex->op, tex->sampler_index, dst_swiz, dst, get_src(state, coord),
1087         lod_bias ? get_src(state, lod_bias) : SRC_DISABLE,
1088         compare ? get_src(state, compare) : SRC_DISABLE);
1089 }
1090
1091 static void
1092 emit_intrinsic(struct state *state, nir_intrinsic_instr * intr)
1093 {
1094    switch (intr->intrinsic) {
1095    case nir_intrinsic_store_deref:
1096       emit(output, nir_src_as_deref(intr->src[0])->var, get_src(state, &intr->src[1]));
1097       break;
1098    case nir_intrinsic_discard_if:
1099       emit(discard, get_src(state, &intr->src[0]));
1100       break;
1101    case nir_intrinsic_discard:
1102       emit(discard, SRC_DISABLE);
1103       break;
1104    case nir_intrinsic_load_uniform: {
1105       unsigned dst_swiz;
1106       struct etna_inst_dst dst = ra_dest(state, &intr->dest, &dst_swiz);
1107
1108       /* TODO: rework so extra MOV isn't required, load up to 4 addresses at once */
1109       emit_inst(state->c, &(struct etna_inst) {
1110          .opcode = INST_OPCODE_MOVAR,
1111          .dst.write_mask = 0x1,
1112          .src[2] = get_src(state, &intr->src[0]),
1113       });
1114       emit_inst(state->c, &(struct etna_inst) {
1115          .opcode = INST_OPCODE_MOV,
1116          .dst = dst,
1117          .src[2] = {
1118             .use = 1,
1119             .rgroup = INST_RGROUP_UNIFORM_0,
1120             .reg = nir_intrinsic_base(intr),
1121             .swiz = dst_swiz,
1122             .amode = INST_AMODE_ADD_A_X,
1123          },
1124       });
1125    } break;
1126    case nir_intrinsic_load_ubo: {
1127       /* TODO: if offset is of the form (x + C) then add C to the base instead */
1128       unsigned idx = nir_src_as_const_value(intr->src[0])[0].u32;
1129       unsigned dst_swiz;
1130       emit_inst(state->c, &(struct etna_inst) {
1131          .opcode = INST_OPCODE_LOAD,
1132          .type = INST_TYPE_U32,
1133          .dst = ra_dest(state, &intr->dest, &dst_swiz),
1134          .src[0] = get_src(state, &intr->src[1]),
1135          .src[1] = const_src(state, &CONST_VAL(ETNA_IMMEDIATE_UBO0_ADDR + idx, 0), 1),
1136       });
1137    } break;
1138    case nir_intrinsic_load_front_face:
1139    case nir_intrinsic_load_frag_coord:
1140       assert(intr->dest.is_ssa); /* TODO - lower phis could cause this */
1141       break;
1142    case nir_intrinsic_load_input:
1143    case nir_intrinsic_load_instance_id:
1144       break;
1145    default:
1146       compile_error(state->c, "Unhandled NIR intrinsic type: %s\n",
1147                     nir_intrinsic_infos[intr->intrinsic].name);
1148    }
1149 }
1150
1151 static void
1152 emit_instr(struct state *state, nir_instr * instr)
1153 {
1154    switch (instr->type) {
1155    case nir_instr_type_alu:
1156       emit_alu(state, nir_instr_as_alu(instr));
1157       break;
1158    case nir_instr_type_tex:
1159       emit_tex(state, nir_instr_as_tex(instr));
1160       break;
1161    case nir_instr_type_intrinsic:
1162       emit_intrinsic(state, nir_instr_as_intrinsic(instr));
1163       break;
1164    case nir_instr_type_jump:
1165       assert(nir_instr_is_last(instr));
1166    case nir_instr_type_load_const:
1167    case nir_instr_type_ssa_undef:
1168    case nir_instr_type_deref:
1169       break;
1170    default:
1171       compile_error(state->c, "Unhandled NIR instruction type: %d\n", instr->type);
1172       break;
1173    }
1174 }
1175
1176 static void
1177 emit_block(struct state *state, nir_block * block)
1178 {
1179    emit(block_start, block->index);
1180
1181    nir_foreach_instr(instr, block)
1182       emit_instr(state, instr);
1183
1184    /* succs->index < block->index is for the loop case  */
1185    nir_block *succs = block->successors[0];
1186    if (nir_block_ends_in_jump(block) || succs->index < block->index)
1187       emit(jump, succs->index, SRC_DISABLE);
1188 }
1189
1190 static void
1191 emit_cf_list(struct state *state, struct exec_list *list);
1192
1193 static void
1194 emit_if(struct state *state, nir_if * nif)
1195 {
1196    emit(jump, nir_if_first_else_block(nif)->index, get_src(state, &nif->condition));
1197    emit_cf_list(state, &nif->then_list);
1198
1199    /* jump at end of then_list to skip else_list
1200     * not needed if then_list already ends with a jump or else_list is empty
1201     */
1202    if (!nir_block_ends_in_jump(nir_if_last_then_block(nif)) &&
1203        !nir_cf_list_is_empty_block(&nif->else_list))
1204       emit(jump, nir_if_last_else_block(nif)->successors[0]->index, SRC_DISABLE);
1205
1206    emit_cf_list(state, &nif->else_list);
1207 }
1208
1209 static void
1210 emit_cf_list(struct state *state, struct exec_list *list)
1211 {
1212    foreach_list_typed(nir_cf_node, node, node, list) {
1213       switch (node->type) {
1214       case nir_cf_node_block:
1215          emit_block(state, nir_cf_node_as_block(node));
1216          break;
1217       case nir_cf_node_if:
1218          emit_if(state, nir_cf_node_as_if(node));
1219          break;
1220       case nir_cf_node_loop:
1221          emit_cf_list(state, &nir_cf_node_as_loop(node)->body);
1222          break;
1223       default:
1224          compile_error(state->c, "Unknown NIR node type\n");
1225          break;
1226       }
1227    }
1228 }
1229
1230 /* based on nir_lower_vec_to_movs */
1231 static unsigned
1232 insert_vec_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
1233 {
1234    assert(start_idx < nir_op_infos[vec->op].num_inputs);
1235    unsigned write_mask = (1u << start_idx);
1236
1237    nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov);
1238    nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
1239
1240    mov->src[0].swizzle[0] = vec->src[start_idx].swizzle[0];
1241    mov->src[0].negate = vec->src[start_idx].negate;
1242    mov->src[0].abs = vec->src[start_idx].abs;
1243
1244    unsigned num_components = 1;
1245
1246    for (unsigned i = start_idx + 1; i < 4; i++) {
1247       if (!(vec->dest.write_mask & (1 << i)))
1248          continue;
1249
1250       if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src) &&
1251           vec->src[i].negate == vec->src[start_idx].negate &&
1252           vec->src[i].abs == vec->src[start_idx].abs) {
1253          write_mask |= (1 << i);
1254          mov->src[0].swizzle[num_components] = vec->src[i].swizzle[0];
1255          num_components++;
1256       }
1257    }
1258
1259    mov->dest.write_mask = (1 << num_components) - 1;
1260    nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, 32, NULL);
1261
1262    /* replace vec srcs with inserted mov */
1263    for (unsigned i = 0, j = 0; i < 4; i++) {
1264       if (!(write_mask & (1 << i)))
1265          continue;
1266
1267       nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, nir_src_for_ssa(&mov->dest.dest.ssa));
1268       vec->src[i].swizzle[0] = j++;
1269    }
1270
1271    nir_instr_insert_before(&vec->instr, &mov->instr);
1272
1273    return write_mask;
1274 }
1275
1276 /*
1277  * for vecN instructions:
1278  * -merge constant sources into a single src
1279  * -insert movs (nir_lower_vec_to_movs equivalent)
1280  * for non-vecN instructions:
1281  * -try to merge constants as single constant
1282  * -insert movs for multiple constants (pre-HALTI5)
1283  */
1284 static void
1285 lower_alu(struct state *state, nir_alu_instr *alu)
1286 {
1287    const nir_op_info *info = &nir_op_infos[alu->op];
1288
1289    nir_builder b;
1290    nir_builder_init(&b, state->impl);
1291    b.cursor = nir_before_instr(&alu->instr);
1292
1293    switch (alu->op) {
1294    case nir_op_vec2:
1295    case nir_op_vec3:
1296    case nir_op_vec4:
1297       break;
1298    default:
1299       /* pre-GC7000L can only have 1 uniform src per instruction */
1300       if (state->c->specs->halti >= 5)
1301          return;
1302
1303       nir_const_value value[4] = {};
1304       uint8_t swizzle[4][4] = {};
1305       unsigned swiz_max = 0, num_const = 0;
1306
1307       for (unsigned i = 0; i < info->num_inputs; i++) {
1308          nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1309          if (!cv)
1310             continue;
1311
1312          unsigned num_components = info->input_sizes[i] ?: alu->dest.dest.ssa.num_components;
1313          for (unsigned j = 0; j < num_components; j++) {
1314             int idx = const_add(&value[0].u64, cv[alu->src[i].swizzle[j]].u64);
1315             swizzle[i][j] = idx;
1316             swiz_max = MAX2(swiz_max, (unsigned) idx);
1317          }
1318          num_const++;
1319       }
1320
1321       /* nothing to do */
1322       if (num_const <= 1)
1323          return;
1324
1325       /* resolve with single combined const src */
1326       if (swiz_max < 4) {
1327          nir_ssa_def *def = nir_build_imm(&b, swiz_max + 1, 32, value);
1328
1329          for (unsigned i = 0; i < info->num_inputs; i++) {
1330             nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1331             if (!cv)
1332                continue;
1333
1334             nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));
1335
1336             for (unsigned j = 0; j < 4; j++)
1337                alu->src[i].swizzle[j] = swizzle[i][j];
1338          }
1339          return;
1340       }
1341
1342       /* resolve with movs */
1343       num_const = 0;
1344       for (unsigned i = 0; i < info->num_inputs; i++) {
1345          nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1346          if (!cv)
1347             continue;
1348
1349          num_const++;
1350          if (num_const == 1)
1351             continue;
1352
1353          nir_ssa_def *mov = nir_mov(&b, alu->src[i].src.ssa);
1354          nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(mov));
1355       }
1356       return;
1357    }
1358
1359    nir_const_value value[4];
1360    unsigned num_components = 0;
1361
1362    for (unsigned i = 0; i < info->num_inputs; i++) {
1363       nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1364       if (cv)
1365          value[num_components++] = cv[alu->src[i].swizzle[0]];
1366    }
1367
1368    /* if there is more than one constant source to the vecN, combine them
1369     * into a single load_const (removing the vecN completely if all components
1370     * are constant)
1371     */
1372    if (num_components > 1) {
1373       nir_ssa_def *def = nir_build_imm(&b, num_components, 32, value);
1374
1375       if (num_components == info->num_inputs) {
1376          nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(def));
1377          nir_instr_remove(&alu->instr);
1378          return;
1379       }
1380
1381       for (unsigned i = 0, j = 0; i < info->num_inputs; i++) {
1382          nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1383          if (!cv)
1384             continue;
1385
1386          nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));
1387          alu->src[i].swizzle[0] = j++;
1388       }
1389    }
1390
1391    unsigned finished_write_mask = 0;
1392    for (unsigned i = 0; i < 4; i++) {
1393       if (!(alu->dest.write_mask & (1 << i)))
1394             continue;
1395
1396       nir_ssa_def *ssa = alu->src[i].src.ssa;
1397
1398       /* check that vecN instruction is only user of this */
1399       bool need_mov = list_length(&ssa->if_uses) != 0;
1400       nir_foreach_use(use_src, ssa) {
1401          if (use_src->parent_instr != &alu->instr)
1402             need_mov = true;
1403       }
1404
1405       nir_instr *instr = ssa->parent_instr;
1406       switch (instr->type) {
1407       case nir_instr_type_alu:
1408       case nir_instr_type_tex:
1409          break;
1410       case nir_instr_type_intrinsic:
1411          if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_input) {
1412             need_mov = vec_dest_has_swizzle(alu, &nir_instr_as_intrinsic(instr)->dest.ssa);
1413             break;
1414          }
1415       default:
1416          need_mov = true;
1417       }
1418
1419       if (need_mov && !(finished_write_mask & (1 << i)))
1420          finished_write_mask |= insert_vec_mov(alu, i, state->shader);
1421    }
1422 }
1423
1424 static bool
1425 emit_shader(struct etna_compile *c, unsigned *num_temps, unsigned *num_consts)
1426 {
1427    nir_shader *shader = c->nir;
1428
1429    struct state state = {
1430       .c = c,
1431       .shader = shader,
1432       .impl = nir_shader_get_entrypoint(shader),
1433    };
1434    bool have_indirect_uniform = false;
1435    unsigned indirect_max = 0;
1436
1437    nir_builder b;
1438    nir_builder_init(&b, state.impl);
1439
1440    /* convert non-dynamic uniform loads to constants, etc */
1441    nir_foreach_block(block, state.impl) {
1442       nir_foreach_instr_safe(instr, block) {
1443          switch(instr->type) {
1444          case nir_instr_type_alu:
1445             /* deals with vecN and const srcs */
1446             lower_alu(&state, nir_instr_as_alu(instr));
1447             break;
1448          case nir_instr_type_load_const: {
1449             nir_load_const_instr *load_const = nir_instr_as_load_const(instr);
1450             for (unsigned  i = 0; i < load_const->def.num_components; i++)
1451                load_const->value[i] = CONST(load_const->value[i].u32);
1452          } break;
1453          case nir_instr_type_intrinsic: {
1454             nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1455             /* TODO: load_ubo can also become a constant in some cases
1456              * (at the moment it can end up emitting a LOAD with two
1457              *  uniform sources, which could be a problem on HALTI2)
1458              */
1459             if (intr->intrinsic != nir_intrinsic_load_uniform)
1460                break;
1461             nir_const_value *off = nir_src_as_const_value(intr->src[0]);
1462             if (!off || off[0].u64 >> 32 != ETNA_IMMEDIATE_CONSTANT) {
1463                have_indirect_uniform = true;
1464                indirect_max = nir_intrinsic_base(intr) + nir_intrinsic_range(intr);
1465                break;
1466             }
1467
1468             unsigned base = nir_intrinsic_base(intr);
1469             /* pre halti2 uniform offset will be float */
1470             if (c->specs->halti < 2)
1471                base += (unsigned) off[0].f32;
1472             else
1473                base += off[0].u32;
1474             nir_const_value value[4];
1475
1476             for (unsigned i = 0; i < intr->dest.ssa.num_components; i++) {
1477                if (nir_intrinsic_base(intr) < 0)
1478                   value[i] = TEXSCALE(~nir_intrinsic_base(intr), i);
1479                else
1480                   value[i] = UNIFORM(base * 4 + i);
1481             }
1482
1483             b.cursor = nir_after_instr(instr);
1484             nir_ssa_def *def = nir_build_imm(&b, intr->dest.ssa.num_components, 32, value);
1485
1486             nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(def));
1487             nir_instr_remove(instr);
1488          } break;
1489          default:
1490             break;
1491          }
1492       }
1493    }
1494
1495    /* TODO: only emit required indirect uniform ranges */
1496    if (have_indirect_uniform) {
1497       for (unsigned i = 0; i < indirect_max * 4; i++)
1498          c->consts[i] = UNIFORM(i).u64;
1499       state.const_count = indirect_max;
1500    }
1501
1502    /* add mov for any store output using sysval/const  */
1503    nir_foreach_block(block, state.impl) {
1504       nir_foreach_instr_safe(instr, block) {
1505          if (instr->type != nir_instr_type_intrinsic)
1506             continue;
1507
1508          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1509
1510          switch (intr->intrinsic) {
1511          case nir_intrinsic_store_deref: {
1512             nir_src *src = &intr->src[1];
1513             if (nir_src_is_const(*src) || is_sysval(src->ssa->parent_instr)) {
1514                b.cursor = nir_before_instr(instr);
1515                nir_instr_rewrite_src(instr, src, nir_src_for_ssa(nir_mov(&b, src->ssa)));
1516             }
1517          } break;
1518          default:
1519             break;
1520          }
1521       }
1522    }
1523
1524    /* call directly to avoid validation (load_const don't pass validation at this point) */
1525    nir_convert_from_ssa(shader, true);
1526    nir_opt_dce(shader);
1527
1528    ra_assign(&state, shader);
1529
1530    emit_cf_list(&state, &nir_shader_get_entrypoint(shader)->body);
1531
1532    *num_temps = ra_finish(&state);
1533    *num_consts = state.const_count;
1534    return true;
1535 }