etnaviv: move nir compiler related stuff into .c file
[mesa.git] src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
1 /*
2 * Copyright (c) 2012-2019 Etnaviv Project
3 * Copyright (c) 2019 Zodiac Inflight Innovations
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sub license,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the
13 * next paragraph) shall be included in all copies or substantial portions
14 * of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Jonathan Marek <jonathan@marek.ca>
26 * Wladimir J. van der Laan <laanwj@gmail.com>
27 */
28
29 #include "etnaviv_compiler.h"
30 #include "etnaviv_compiler_nir.h"
31 #include "etnaviv_asm.h"
32 #include "etnaviv_context.h"
33 #include "etnaviv_debug.h"
34 #include "etnaviv_disasm.h"
35 #include "etnaviv_nir.h"
36 #include "etnaviv_uniforms.h"
37 #include "etnaviv_util.h"
38
39 #include <math.h>
40 #include "util/u_memory.h"
41 #include "util/register_allocate.h"
42 #include "compiler/nir/nir_builder.h"
43
44 #include "tgsi/tgsi_strings.h"
45 #include "util/u_half.h"
46
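/* callback for nir_lower_alu_to_scalar(): returns true for ALU ops that should be lowered to scalar operations on this hardware */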
47 static bool
48 etna_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
49 {
50 const struct etna_specs *specs = data;
51
52 if (instr->type != nir_instr_type_alu)
53 return false;
54
55 nir_alu_instr *alu = nir_instr_as_alu(instr);
56 switch (alu->op) {
57 case nir_op_frsq:
58 case nir_op_frcp:
59 case nir_op_flog2:
60 case nir_op_fexp2:
61 case nir_op_fsqrt:
62 case nir_op_fcos:
63 case nir_op_fsin:
64 case nir_op_fdiv:
65 case nir_op_imul:
66 return true;
67 /* TODO: can do better than alu_to_scalar for vector compares */
68 case nir_op_b32all_fequal2:
69 case nir_op_b32all_fequal3:
70 case nir_op_b32all_fequal4:
71 case nir_op_b32any_fnequal2:
72 case nir_op_b32any_fnequal3:
73 case nir_op_b32any_fnequal4:
74 case nir_op_b32all_iequal2:
75 case nir_op_b32all_iequal3:
76 case nir_op_b32all_iequal4:
77 case nir_op_b32any_inequal2:
78 case nir_op_b32any_inequal3:
79 case nir_op_b32any_inequal4:
80 return true;
81 case nir_op_fdot2:
82 if (!specs->has_halti2_instructions)
83 return true;
84 break;
85 default:
86 break;
87 }
88
89 return false;
90 }
91
92 static void
93 etna_emit_block_start(struct etna_compile *c, unsigned block)
94 {
95 c->block_ptr[block] = c->inst_ptr;
96 }
97
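/* record where an output variable ended up: color/depth output registers for fragment shaders, position/point size and generic varyings for vertex shaders */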
98 static void
99 etna_emit_output(struct etna_compile *c, nir_variable *var, struct etna_inst_src src)
100 {
101 struct etna_shader_io_file *sf = &c->variant->outfile;
102
103 if (is_fs(c)) {
104 switch (var->data.location) {
105 case FRAG_RESULT_COLOR:
106 case FRAG_RESULT_DATA0: /* DATA0 is used by gallium shaders for color */
107 c->variant->ps_color_out_reg = src.reg;
108 break;
109 case FRAG_RESULT_DEPTH:
110 c->variant->ps_depth_out_reg = src.reg;
111 break;
112 default:
113 unreachable("Unsupported fs output");
114 }
115 return;
116 }
117
118 switch (var->data.location) {
119 case VARYING_SLOT_POS:
120 c->variant->vs_pos_out_reg = src.reg;
121 break;
122 case VARYING_SLOT_PSIZ:
123 c->variant->vs_pointsize_out_reg = src.reg;
124 break;
125 default:
126 sf->reg[sf->num_reg].reg = src.reg;
127 sf->reg[sf->num_reg].slot = var->data.location;
128 sf->reg[sf->num_reg].num_components = glsl_get_components(var->type);
129 sf->num_reg++;
130 break;
131 }
132 }
133
134 #define OPT(nir, pass, ...) ({ \
135 bool this_progress = false; \
136 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
137 this_progress; \
138 })
139
140 static void
141 etna_optimize_loop(nir_shader *s)
142 {
143 bool progress;
144 do {
145 progress = false;
146
147 NIR_PASS_V(s, nir_lower_vars_to_ssa);
148 progress |= OPT(s, nir_opt_copy_prop_vars);
149 progress |= OPT(s, nir_copy_prop);
150 progress |= OPT(s, nir_opt_dce);
151 progress |= OPT(s, nir_opt_cse);
152 progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
153 progress |= OPT(s, nir_opt_intrinsics);
154 progress |= OPT(s, nir_opt_algebraic);
155 progress |= OPT(s, nir_opt_constant_folding);
156 progress |= OPT(s, nir_opt_dead_cf);
157 if (OPT(s, nir_opt_trivial_continues)) {
158 progress = true;
159 /* If nir_opt_trivial_continues makes progress, then we need to clean
160 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
161 * to make progress.
162 */
163 OPT(s, nir_copy_prop);
164 OPT(s, nir_opt_dce);
165 }
166 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
167 progress |= OPT(s, nir_opt_if, false);
168 progress |= OPT(s, nir_opt_remove_phis);
169 progress |= OPT(s, nir_opt_undef);
170 }
171 while (progress);
172 }
173
174 static int
175 etna_glsl_type_size(const struct glsl_type *type, bool bindless)
176 {
177 return glsl_count_attribute_slots(type, false);
178 }
179
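/* unpack the compiler's 64-bit constant records into the variant's uniform info: the value goes to imm_data, the content type (upper 32 bits) to imm_contents */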
180 static void
181 copy_uniform_state_to_shader(struct etna_shader_variant *sobj, uint64_t *consts, unsigned count)
182 {
183 struct etna_shader_uniform_info *uinfo = &sobj->uniforms;
184
185 uinfo->imm_count = count * 4;
186 uinfo->imm_data = MALLOC(uinfo->imm_count * sizeof(*uinfo->imm_data));
187 uinfo->imm_contents = MALLOC(uinfo->imm_count * sizeof(*uinfo->imm_contents));
188
189 for (unsigned i = 0; i < uinfo->imm_count; i++) {
190 uinfo->imm_data[i] = consts[i];
191 uinfo->imm_contents[i] = consts[i] >> 32;
192 }
193
194 etna_set_shader_uniforms_dirty_flags(sobj);
195 }
196
197 #define ALU_SWIZ(s) INST_SWIZ((s)->swizzle[0], (s)->swizzle[1], (s)->swizzle[2], (s)->swizzle[3])
198 #define SRC_DISABLE ((hw_src){})
199 #define SRC_CONST(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_UNIFORM_0, .reg=idx, .swiz=s})
200 #define SRC_REG(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_TEMP, .reg=idx, .swiz=s})
201
202 typedef struct etna_inst_dst hw_dst;
203 typedef struct etna_inst_src hw_src;
204
205 static inline hw_src
206 src_swizzle(hw_src src, unsigned swizzle)
207 {
208 if (src.rgroup != INST_RGROUP_IMMEDIATE)
209 src.swiz = inst_swiz_compose(src.swiz, swizzle);
210
211 return src;
212 }
213
214 /* constants are represented as 64-bit ints
215 * 32-bit for the value and 32-bit for the type (imm, uniform, etc)
216 */
217
218 #define CONST_VAL(a, b) (nir_const_value) {.u64 = (uint64_t)(a) << 32 | (uint64_t)(b)}
219 #define CONST(x) CONST_VAL(ETNA_IMMEDIATE_CONSTANT, x)
220 #define UNIFORM(x) CONST_VAL(ETNA_IMMEDIATE_UNIFORM, x)
221 #define TEXSCALE(x, i) CONST_VAL(ETNA_IMMEDIATE_TEXRECT_SCALE_X + (i), x)
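/* e.g. CONST(0x3f800000) encodes the bit pattern of 1.0f in the low 32 bits with ETNA_IMMEDIATE_CONSTANT as the content type in the high 32 bits */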
222
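/* add a value to a vec4 of constants, reusing an equal or empty component; returns the component index, or -1 if the vec4 is full */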
223 static int
224 const_add(uint64_t *c, uint64_t value)
225 {
226 for (unsigned i = 0; i < 4; i++) {
227 if (c[i] == value || !c[i]) {
228 c[i] = value;
229 return i;
230 }
231 }
232 return -1;
233 }
234
235 static hw_src
236 const_src(struct etna_compile *c, nir_const_value *value, unsigned num_components)
237 {
238 /* use inline immediates if possible */
239 if (c->specs->halti >= 2 && num_components == 1 &&
240 value[0].u64 >> 32 == ETNA_IMMEDIATE_CONSTANT) {
241 uint32_t bits = value[0].u32;
242
243 /* "float" - shifted by 12 */
244 if ((bits & 0xfff) == 0)
245 return etna_immediate_src(0, bits >> 12);
246
247 /* "unsigned" - raw 20 bit value */
248 if (bits < (1 << 20))
249 return etna_immediate_src(2, bits);
250
251 /* "signed" - sign extended 20-bit (sign included) value */
252 if (bits >= 0xfff80000)
253 return etna_immediate_src(1, bits);
254 }
255
256 unsigned i;
257 int swiz = -1;
258 for (i = 0; swiz < 0; i++) {
259 uint64_t *a = &c->consts[i*4];
260 uint64_t save[4];
261 memcpy(save, a, sizeof(save));
262 swiz = 0;
263 for (unsigned j = 0; j < num_components; j++) {
264 int c = const_add(a, value[j].u64);
265 if (c < 0) {
266 memcpy(a, save, sizeof(save));
267 swiz = -1;
268 break;
269 }
270 swiz |= c << j * 2;
271 }
272 }
273
274 assert(i <= ETNA_MAX_IMM / 4);
275 c->const_count = MAX2(c->const_count, i);
276
277 return SRC_CONST(i - 1, swiz);
278 }
279
280 /* Swizzles and write masks can be used to layer virtual non-interfering
281 * registers on top of the real VEC4 registers. For example, the virtual
282 * VEC3_XYZ register and the virtual SCALAR_W register that use the same
283 * physical VEC4 base register do not interfere.
284 */
285 enum reg_class {
286 REG_CLASS_VIRT_SCALAR,
287 REG_CLASS_VIRT_VEC2,
288 REG_CLASS_VIRT_VEC3,
289 REG_CLASS_VEC4,
290 /* special vec2 class for fast transcendentals, limited to XY or ZW */
291 REG_CLASS_VIRT_VEC2T,
292 /* special classes for LOAD - contiguous components */
293 REG_CLASS_VIRT_VEC2C,
294 REG_CLASS_VIRT_VEC3C,
295 NUM_REG_CLASSES,
296 };
297
298 enum reg_type {
299 REG_TYPE_VEC4,
300 REG_TYPE_VIRT_VEC3_XYZ,
301 REG_TYPE_VIRT_VEC3_XYW,
302 REG_TYPE_VIRT_VEC3_XZW,
303 REG_TYPE_VIRT_VEC3_YZW,
304 REG_TYPE_VIRT_VEC2_XY,
305 REG_TYPE_VIRT_VEC2_XZ,
306 REG_TYPE_VIRT_VEC2_XW,
307 REG_TYPE_VIRT_VEC2_YZ,
308 REG_TYPE_VIRT_VEC2_YW,
309 REG_TYPE_VIRT_VEC2_ZW,
310 REG_TYPE_VIRT_SCALAR_X,
311 REG_TYPE_VIRT_SCALAR_Y,
312 REG_TYPE_VIRT_SCALAR_Z,
313 REG_TYPE_VIRT_SCALAR_W,
314 REG_TYPE_VIRT_VEC2T_XY,
315 REG_TYPE_VIRT_VEC2T_ZW,
316 REG_TYPE_VIRT_VEC2C_XY,
317 REG_TYPE_VIRT_VEC2C_YZ,
318 REG_TYPE_VIRT_VEC2C_ZW,
319 REG_TYPE_VIRT_VEC3C_XYZ,
320 REG_TYPE_VIRT_VEC3C_YZW,
321 NUM_REG_TYPES,
322 };
323
324 /* writemask when used as dest */
325 static const uint8_t
326 reg_writemask[NUM_REG_TYPES] = {
327 [REG_TYPE_VEC4] = 0xf,
328 [REG_TYPE_VIRT_SCALAR_X] = 0x1,
329 [REG_TYPE_VIRT_SCALAR_Y] = 0x2,
330 [REG_TYPE_VIRT_VEC2_XY] = 0x3,
331 [REG_TYPE_VIRT_VEC2T_XY] = 0x3,
332 [REG_TYPE_VIRT_VEC2C_XY] = 0x3,
333 [REG_TYPE_VIRT_SCALAR_Z] = 0x4,
334 [REG_TYPE_VIRT_VEC2_XZ] = 0x5,
335 [REG_TYPE_VIRT_VEC2_YZ] = 0x6,
336 [REG_TYPE_VIRT_VEC2C_YZ] = 0x6,
337 [REG_TYPE_VIRT_VEC3_XYZ] = 0x7,
338 [REG_TYPE_VIRT_VEC3C_XYZ] = 0x7,
339 [REG_TYPE_VIRT_SCALAR_W] = 0x8,
340 [REG_TYPE_VIRT_VEC2_XW] = 0x9,
341 [REG_TYPE_VIRT_VEC2_YW] = 0xa,
342 [REG_TYPE_VIRT_VEC3_XYW] = 0xb,
343 [REG_TYPE_VIRT_VEC2_ZW] = 0xc,
344 [REG_TYPE_VIRT_VEC2T_ZW] = 0xc,
345 [REG_TYPE_VIRT_VEC2C_ZW] = 0xc,
346 [REG_TYPE_VIRT_VEC3_XZW] = 0xd,
347 [REG_TYPE_VIRT_VEC3_YZW] = 0xe,
348 [REG_TYPE_VIRT_VEC3C_YZW] = 0xe,
349 };
350
351 /* how to swizzle when used as a src */
352 static const uint8_t
353 reg_swiz[NUM_REG_TYPES] = {
354 [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
355 [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
356 [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(Y, Y, Y, Y),
357 [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
358 [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
359 [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
360 [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(Z, Z, Z, Z),
361 [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, Z, X, Z),
362 [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(Y, Z, Y, Z),
363 [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(Y, Z, Y, Z),
364 [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
365 [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
366 [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(W, W, W, W),
367 [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, W, X, W),
368 [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(Y, W, Y, W),
369 [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, W, X),
370 [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(Z, W, Z, W),
371 [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(Z, W, Z, W),
372 [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(Z, W, Z, W),
373 [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Z, W, X),
374 [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(Y, Z, W, X),
375 [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(Y, Z, W, X),
376 };
377
378 /* how to swizzle when used as a dest */
379 static const uint8_t
380 reg_dst_swiz[NUM_REG_TYPES] = {
381 [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
382 [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
383 [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(X, X, X, X),
384 [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
385 [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
386 [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
387 [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(X, X, X, X),
388 [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, X, Y, Y),
389 [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(X, X, Y, Y),
390 [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(X, X, Y, Y),
391 [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
392 [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
393 [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(X, X, X, X),
394 [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, X, Y, Y),
395 [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(X, X, Y, Y),
396 [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, Z, Z),
397 [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(X, X, X, Y),
398 [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(X, X, X, Y),
399 [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(X, X, X, Y),
400 [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Y, Y, Z),
401 [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(X, X, Y, Z),
402 [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(X, X, Y, Z),
403 };
404
405 static inline int reg_get_type(int virt_reg)
406 {
407 return virt_reg % NUM_REG_TYPES;
408 }
409
410 static inline int reg_get_base(struct etna_compile *c, int virt_reg)
411 {
412 /* offset by 1 to avoid reserved position register */
413 if (c->nir->info.stage == MESA_SHADER_FRAGMENT)
414 return (virt_reg / NUM_REG_TYPES + 1) % ETNA_MAX_TEMPS;
415 return virt_reg / NUM_REG_TYPES;
416 }
417
418 /* use "r63.z" for the depth reg; reg_get_base() will wrap it around to r0.z
419 * (fs registers are offset by 1 to avoid reserving r0)
420 */
421 #define REG_FRAG_DEPTH ((ETNA_MAX_TEMPS - 1) * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Z)
422
423 static inline int reg_get_class(int virt_reg)
424 {
425 switch (reg_get_type(virt_reg)) {
426 case REG_TYPE_VEC4:
427 return REG_CLASS_VEC4;
428 case REG_TYPE_VIRT_VEC3_XYZ:
429 case REG_TYPE_VIRT_VEC3_XYW:
430 case REG_TYPE_VIRT_VEC3_XZW:
431 case REG_TYPE_VIRT_VEC3_YZW:
432 return REG_CLASS_VIRT_VEC3;
433 case REG_TYPE_VIRT_VEC2_XY:
434 case REG_TYPE_VIRT_VEC2_XZ:
435 case REG_TYPE_VIRT_VEC2_XW:
436 case REG_TYPE_VIRT_VEC2_YZ:
437 case REG_TYPE_VIRT_VEC2_YW:
438 case REG_TYPE_VIRT_VEC2_ZW:
439 return REG_CLASS_VIRT_VEC2;
440 case REG_TYPE_VIRT_SCALAR_X:
441 case REG_TYPE_VIRT_SCALAR_Y:
442 case REG_TYPE_VIRT_SCALAR_Z:
443 case REG_TYPE_VIRT_SCALAR_W:
444 return REG_CLASS_VIRT_SCALAR;
445 case REG_TYPE_VIRT_VEC2T_XY:
446 case REG_TYPE_VIRT_VEC2T_ZW:
447 return REG_CLASS_VIRT_VEC2T;
448 case REG_TYPE_VIRT_VEC2C_XY:
449 case REG_TYPE_VIRT_VEC2C_YZ:
450 case REG_TYPE_VIRT_VEC2C_ZW:
451 return REG_CLASS_VIRT_VEC2C;
452 case REG_TYPE_VIRT_VEC3C_XYZ:
453 case REG_TYPE_VIRT_VEC3C_YZW:
454 return REG_CLASS_VIRT_VEC3C;
455 }
456
457 assert(false);
458 return 0;
459 }
460
461 /* nir_src to allocated register */
462 static hw_src
463 ra_src(struct etna_compile *c, nir_src *src)
464 {
465 unsigned reg = ra_get_node_reg(c->g, c->live_map[src_index(c->impl, src)]);
466 return SRC_REG(reg_get_base(c, reg), reg_swiz[reg_get_type(reg)]);
467 }
468
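/* get the hardware source operand for a nir_src: RA-allocated registers for ALU/tex results, the uniform file or inline immediates for constants, and fixed registers for a few intrinsics (front face, frag coord) */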
469 static hw_src
470 get_src(struct etna_compile *c, nir_src *src)
471 {
472 if (!src->is_ssa)
473 return ra_src(c, src);
474
475 nir_instr *instr = src->ssa->parent_instr;
476
477 if (instr->pass_flags & BYPASS_SRC) {
478 assert(instr->type == nir_instr_type_alu);
479 nir_alu_instr *alu = nir_instr_as_alu(instr);
480 assert(alu->op == nir_op_mov);
481 return src_swizzle(get_src(c, &alu->src[0].src), ALU_SWIZ(&alu->src[0]));
482 }
483
484 switch (instr->type) {
485 case nir_instr_type_load_const:
486 return const_src(c, nir_instr_as_load_const(instr)->value, src->ssa->num_components);
487 case nir_instr_type_intrinsic: {
488 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
489 switch (intr->intrinsic) {
490 case nir_intrinsic_load_input:
491 case nir_intrinsic_load_instance_id:
492 case nir_intrinsic_load_uniform:
493 case nir_intrinsic_load_ubo:
494 return ra_src(c, src);
495 case nir_intrinsic_load_front_face:
496 return (hw_src) { .use = 1, .rgroup = INST_RGROUP_INTERNAL };
497 case nir_intrinsic_load_frag_coord:
498 return SRC_REG(0, INST_SWIZ_IDENTITY);
499 default:
500 compile_error(c, "Unhandled NIR intrinsic type: %s\n",
501 nir_intrinsic_infos[intr->intrinsic].name);
502 break;
503 }
504 } break;
505 case nir_instr_type_alu:
506 case nir_instr_type_tex:
507 return ra_src(c, src);
508 case nir_instr_type_ssa_undef: {
509 /* return zero to deal with broken Blur demo */
510 nir_const_value value = CONST(0);
511 return src_swizzle(const_src(c, &value, 1), SWIZZLE(X,X,X,X));
512 }
513 default:
514 compile_error(c, "Unhandled NIR instruction type: %d\n", instr->type);
515 break;
516 }
517
518 return SRC_DISABLE;
519 }
520
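/* check whether the vecN places ssa's components at swizzled positions, or whether ssa also feeds another mov/vecN; either case means an extra mov is needed */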
521 static bool
522 vec_dest_has_swizzle(nir_alu_instr *vec, nir_ssa_def *ssa)
523 {
524 for (unsigned i = 0; i < 4; i++) {
525 if (!(vec->dest.write_mask & (1 << i)) || vec->src[i].src.ssa != ssa)
526 continue;
527
528 if (vec->src[i].swizzle[0] != i)
529 return true;
530 }
531
532 /* don't deal with possible bypassed vec/mov chain */
533 nir_foreach_use(use_src, ssa) {
534 nir_instr *instr = use_src->parent_instr;
535 if (instr->type != nir_instr_type_alu)
536 continue;
537
538 nir_alu_instr *alu = nir_instr_as_alu(instr);
539
540 switch (alu->op) {
541 case nir_op_mov:
542 case nir_op_vec2:
543 case nir_op_vec3:
544 case nir_op_vec4:
545 return true;
546 default:
547 break;
548 }
549 }
550 return false;
551 }
552
553 /* get allocated dest register for nir_dest
554 * *p_swiz tells how the components need to be placed into register
555 */
556 static hw_dst
557 ra_dest(struct etna_compile *c, nir_dest *dest, unsigned *p_swiz)
558 {
559 unsigned swiz = INST_SWIZ_IDENTITY, mask = 0xf;
560 dest = real_dest(dest, &swiz, &mask);
561
562 unsigned r = ra_get_node_reg(c->g, c->live_map[dest_index(c->impl, dest)]);
563 unsigned t = reg_get_type(r);
564
565 *p_swiz = inst_swiz_compose(swiz, reg_dst_swiz[t]);
566
567 return (hw_dst) {
568 .use = 1,
569 .reg = reg_get_base(c, r),
570 .write_mask = inst_write_mask_compose(mask, reg_writemask[t]),
571 };
572 }
573
574 /* "q" conflict values, precomputed so ra_set_finalize() doesn't have to compute them */
575 static unsigned int *q_values[] = {
576 (unsigned int[]) {1, 2, 3, 4, 2, 2, 3, },
577 (unsigned int[]) {3, 5, 6, 6, 5, 5, 6, },
578 (unsigned int[]) {3, 4, 4, 4, 4, 4, 4, },
579 (unsigned int[]) {1, 1, 1, 1, 1, 1, 1, },
580 (unsigned int[]) {1, 2, 2, 2, 1, 2, 2, },
581 (unsigned int[]) {2, 3, 3, 3, 2, 3, 3, },
582 (unsigned int[]) {2, 2, 2, 2, 2, 2, 2, },
583 };
584
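/* build the register set with the virtual classes, compute liveness with etna_live_defs(), constrain nodes that need fixed registers (inputs, instance id, gl_FragDepth), add interference edges and run the allocator */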
585 static void
586 ra_assign(struct etna_compile *c, nir_shader *shader)
587 {
588 struct ra_regs *regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS *
589 NUM_REG_TYPES, false);
590
591 /* classes are always created starting from index 0, so the class index equals
592 * the reg_class enum, where class c represents a register with (c+1) components
593 */
594 for (int c = 0; c < NUM_REG_CLASSES; c++)
595 ra_alloc_reg_class(regs);
596 /* add each register of each class */
597 for (int r = 0; r < NUM_REG_TYPES * ETNA_MAX_TEMPS; r++)
598 ra_class_add_reg(regs, reg_get_class(r), r);
599 /* set conflicts */
600 for (int r = 0; r < ETNA_MAX_TEMPS; r++) {
601 for (int i = 0; i < NUM_REG_TYPES; i++) {
602 for (int j = 0; j < i; j++) {
603 if (reg_writemask[i] & reg_writemask[j]) {
604 ra_add_reg_conflict(regs, NUM_REG_TYPES * r + i,
605 NUM_REG_TYPES * r + j);
606 }
607 }
608 }
609 }
610 ra_set_finalize(regs, q_values);
611
612 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
613
614 /* liveness and interference */
615
616 nir_index_blocks(impl);
617 nir_index_ssa_defs(impl);
618 nir_foreach_block(block, impl) {
619 nir_foreach_instr(instr, block)
620 instr->pass_flags = 0;
621 }
622
623 /* this gives an approximation/upper limit on how many nodes are needed
624 * (some ssa values do not represent an allocated register)
625 */
626 unsigned max_nodes = impl->ssa_alloc + impl->reg_alloc;
627 unsigned *live_map = ralloc_array(NULL, unsigned, max_nodes);
628 memset(live_map, 0xff, sizeof(unsigned) * max_nodes);
629 struct live_def *defs = rzalloc_array(NULL, struct live_def, max_nodes);
630
631 unsigned num_nodes = etna_live_defs(impl, defs, live_map);
632 struct ra_graph *g = ra_alloc_interference_graph(regs, num_nodes);
633
634 /* set classes from num_components */
635 for (unsigned i = 0; i < num_nodes; i++) {
636 nir_instr *instr = defs[i].instr;
637 nir_dest *dest = defs[i].dest;
638 unsigned comp = nir_dest_num_components(*dest) - 1;
639
640 if (instr->type == nir_instr_type_alu &&
641 c->specs->has_new_transcendentals) {
642 switch (nir_instr_as_alu(instr)->op) {
643 case nir_op_fdiv:
644 case nir_op_flog2:
645 case nir_op_fsin:
646 case nir_op_fcos:
647 assert(dest->is_ssa);
648 comp = REG_CLASS_VIRT_VEC2T;
649 default:
650 break;
651 }
652 }
653
654 if (instr->type == nir_instr_type_intrinsic) {
655 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
656 /* can't have dst swizzle or sparse writemask on UBO loads */
657 if (intr->intrinsic == nir_intrinsic_load_ubo) {
658 assert(dest == &intr->dest);
659 if (dest->ssa.num_components == 2)
660 comp = REG_CLASS_VIRT_VEC2C;
661 if (dest->ssa.num_components == 3)
662 comp = REG_CLASS_VIRT_VEC3C;
663 }
664 }
665
666 ra_set_node_class(g, i, comp);
667 }
668
669 nir_foreach_block(block, impl) {
670 nir_foreach_instr(instr, block) {
671 if (instr->type != nir_instr_type_intrinsic)
672 continue;
673
674 nir_dest *dest = dest_for_instr(instr);
675 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
676 unsigned reg;
677
678 switch (intr->intrinsic) {
679 case nir_intrinsic_store_deref: {
680 /* don't want outputs to be swizzled
681 * TODO: better would be to set the type to X/XY/XYZ/XYZW
682 * TODO: what if fragcoord.z is read after writing fragdepth?
683 */
684 nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
685 unsigned index = live_map[src_index(impl, &intr->src[1])];
686
687 if (shader->info.stage == MESA_SHADER_FRAGMENT &&
688 deref->var->data.location == FRAG_RESULT_DEPTH) {
689 ra_set_node_reg(g, index, REG_FRAG_DEPTH);
690 } else {
691 ra_set_node_class(g, index, REG_CLASS_VEC4);
692 }
693 } continue;
694 case nir_intrinsic_load_input:
695 reg = nir_intrinsic_base(intr) * NUM_REG_TYPES + (unsigned[]) {
696 REG_TYPE_VIRT_SCALAR_X,
697 REG_TYPE_VIRT_VEC2_XY,
698 REG_TYPE_VIRT_VEC3_XYZ,
699 REG_TYPE_VEC4,
700 }[nir_dest_num_components(*dest) - 1];
701 break;
702 case nir_intrinsic_load_instance_id:
703 reg = c->variant->infile.num_reg * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Y;
704 break;
705 default:
706 continue;
707 }
708
709 ra_set_node_reg(g, live_map[dest_index(impl, dest)], reg);
710 }
711 }
712
713 /* add interference for intersecting live ranges */
714 for (unsigned i = 0; i < num_nodes; i++) {
715 assert(defs[i].live_start < defs[i].live_end);
716 for (unsigned j = 0; j < i; j++) {
717 if (defs[i].live_start >= defs[j].live_end || defs[j].live_start >= defs[i].live_end)
718 continue;
719 ra_add_node_interference(g, i, j);
720 }
721 }
722
723 ralloc_free(defs);
724
725 /* Allocate registers */
726 ASSERTED bool ok = ra_allocate(g);
727 assert(ok);
728
729 c->g = g;
730 c->regs = regs;
731 c->live_map = live_map;
732 c->num_nodes = num_nodes;
733 }
734
735 static unsigned
736 ra_finish(struct etna_compile *c)
737 {
738 /* TODO: better way to get number of registers used? */
739 unsigned j = 0;
740 for (unsigned i = 0; i < c->num_nodes; i++) {
741 j = MAX2(j, reg_get_base(c, ra_get_node_reg(c->g, i)) + 1);
742 }
743
744 ralloc_free(c->g);
745 ralloc_free(c->regs);
746 ralloc_free(c->live_map);
747
748 return j;
749 }
750
751 static void
752 emit_alu(struct etna_compile *c, nir_alu_instr * alu)
753 {
754 const nir_op_info *info = &nir_op_infos[alu->op];
755
756 /* marked as dead instruction (vecN and other bypassed instr) */
757 if (alu->instr.pass_flags)
758 return;
759
760 assert(!(alu->op >= nir_op_vec2 && alu->op <= nir_op_vec4));
761
762 unsigned dst_swiz;
763 hw_dst dst = ra_dest(c, &alu->dest.dest, &dst_swiz);
764
765 /* compose alu write_mask with RA write mask */
766 if (!alu->dest.dest.is_ssa)
767 dst.write_mask = inst_write_mask_compose(alu->dest.write_mask, dst.write_mask);
768
769 switch (alu->op) {
770 case nir_op_fdot2:
771 case nir_op_fdot3:
772 case nir_op_fdot4:
773 /* not per-component - don't compose dst_swiz */
774 dst_swiz = INST_SWIZ_IDENTITY;
775 break;
776 default:
777 break;
778 }
779
780 hw_src srcs[3];
781
782 for (int i = 0; i < info->num_inputs; i++) {
783 nir_alu_src *asrc = &alu->src[i];
784 hw_src src;
785
786 src = src_swizzle(get_src(c, &asrc->src), ALU_SWIZ(asrc));
787 src = src_swizzle(src, dst_swiz);
788
789 if (src.rgroup != INST_RGROUP_IMMEDIATE) {
790 src.neg = asrc->negate || (alu->op == nir_op_fneg);
791 src.abs = asrc->abs || (alu->op == nir_op_fabs);
792 } else {
793 assert(!asrc->negate && alu->op != nir_op_fneg);
794 assert(!asrc->abs && alu->op != nir_op_fabs);
795 }
796
797 srcs[i] = src;
798 }
799
800 etna_emit_alu(c, alu->op, dst, srcs, alu->dest.saturate || (alu->op == nir_op_fsat));
801 }
802
803 static void
804 emit_tex(struct etna_compile *c, nir_tex_instr * tex)
805 {
806 unsigned dst_swiz;
807 hw_dst dst = ra_dest(c, &tex->dest, &dst_swiz);
808 nir_src *coord = NULL, *lod_bias = NULL, *compare = NULL;
809
810 for (unsigned i = 0; i < tex->num_srcs; i++) {
811 switch (tex->src[i].src_type) {
812 case nir_tex_src_coord:
813 coord = &tex->src[i].src;
814 break;
815 case nir_tex_src_bias:
816 case nir_tex_src_lod:
817 assert(!lod_bias);
818 lod_bias = &tex->src[i].src;
819 break;
820 case nir_tex_src_comparator:
821 compare = &tex->src[i].src;
822 break;
823 default:
824 compile_error(c, "Unhandled NIR tex src type: %d\n",
825 tex->src[i].src_type);
826 break;
827 }
828 }
829
830 etna_emit_tex(c, tex->op, tex->sampler_index, dst_swiz, dst, get_src(c, coord),
831 lod_bias ? get_src(c, lod_bias) : SRC_DISABLE,
832 compare ? get_src(c, compare) : SRC_DISABLE);
833 }
834
835 static void
836 emit_intrinsic(struct etna_compile *c, nir_intrinsic_instr * intr)
837 {
838 switch (intr->intrinsic) {
839 case nir_intrinsic_store_deref:
840 etna_emit_output(c, nir_src_as_deref(intr->src[0])->var, get_src(c, &intr->src[1]));
841 break;
842 case nir_intrinsic_discard_if:
843 etna_emit_discard(c, get_src(c, &intr->src[0]));
844 break;
845 case nir_intrinsic_discard:
846 etna_emit_discard(c, SRC_DISABLE);
847 break;
848 case nir_intrinsic_load_uniform: {
849 unsigned dst_swiz;
850 struct etna_inst_dst dst = ra_dest(c, &intr->dest, &dst_swiz);
851
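/* indirect uniform load: MOVAR puts the offset into the address register, then the MOV reads the uniform file using the ADD_A_X address mode */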
852 /* TODO: rework so extra MOV isn't required, load up to 4 addresses at once */
853 emit_inst(c, &(struct etna_inst) {
854 .opcode = INST_OPCODE_MOVAR,
855 .dst.write_mask = 0x1,
856 .src[2] = get_src(c, &intr->src[0]),
857 });
858 emit_inst(c, &(struct etna_inst) {
859 .opcode = INST_OPCODE_MOV,
860 .dst = dst,
861 .src[2] = {
862 .use = 1,
863 .rgroup = INST_RGROUP_UNIFORM_0,
864 .reg = nir_intrinsic_base(intr),
865 .swiz = dst_swiz,
866 .amode = INST_AMODE_ADD_A_X,
867 },
868 });
869 } break;
870 case nir_intrinsic_load_ubo: {
871 /* TODO: if offset is of the form (x + C) then add C to the base instead */
872 unsigned idx = nir_src_as_const_value(intr->src[0])[0].u32;
873 unsigned dst_swiz;
874 emit_inst(c, &(struct etna_inst) {
875 .opcode = INST_OPCODE_LOAD,
876 .type = INST_TYPE_U32,
877 .dst = ra_dest(c, &intr->dest, &dst_swiz),
878 .src[0] = get_src(c, &intr->src[1]),
879 .src[1] = const_src(c, &CONST_VAL(ETNA_IMMEDIATE_UBO0_ADDR + idx, 0), 1),
880 });
881 } break;
882 case nir_intrinsic_load_front_face:
883 case nir_intrinsic_load_frag_coord:
884 assert(intr->dest.is_ssa); /* TODO - lower phis could cause this */
885 break;
886 case nir_intrinsic_load_input:
887 case nir_intrinsic_load_instance_id:
888 break;
889 default:
890 compile_error(c, "Unhandled NIR intrinsic type: %s\n",
891 nir_intrinsic_infos[intr->intrinsic].name);
892 }
893 }
894
895 static void
896 emit_instr(struct etna_compile *c, nir_instr * instr)
897 {
898 switch (instr->type) {
899 case nir_instr_type_alu:
900 emit_alu(c, nir_instr_as_alu(instr));
901 break;
902 case nir_instr_type_tex:
903 emit_tex(c, nir_instr_as_tex(instr));
904 break;
905 case nir_instr_type_intrinsic:
906 emit_intrinsic(c, nir_instr_as_intrinsic(instr));
907 break;
908 case nir_instr_type_jump:
909 assert(nir_instr_is_last(instr));
910 case nir_instr_type_load_const:
911 case nir_instr_type_ssa_undef:
912 case nir_instr_type_deref:
913 break;
914 default:
915 compile_error(c, "Unhandled NIR instruction type: %d\n", instr->type);
916 break;
917 }
918 }
919
920 static void
921 emit_block(struct etna_compile *c, nir_block * block)
922 {
923 etna_emit_block_start(c, block->index);
924
925 nir_foreach_instr(instr, block)
926 emit_instr(c, instr);
927
928 /* succs->index < block->index is for the loop case */
929 nir_block *succs = block->successors[0];
930 if (nir_block_ends_in_jump(block) || succs->index < block->index)
931 etna_emit_jump(c, succs->index, SRC_DISABLE);
932 }
933
934 static void
935 emit_cf_list(struct etna_compile *c, struct exec_list *list);
936
937 static void
938 emit_if(struct etna_compile *c, nir_if * nif)
939 {
940 etna_emit_jump(c, nir_if_first_else_block(nif)->index, get_src(c, &nif->condition));
941 emit_cf_list(c, &nif->then_list);
942
943 /* jump at end of then_list to skip else_list
944 * not needed if then_list already ends with a jump or else_list is empty
945 */
946 if (!nir_block_ends_in_jump(nir_if_last_then_block(nif)) &&
947 !nir_cf_list_is_empty_block(&nif->else_list))
948 etna_emit_jump(c, nir_if_last_else_block(nif)->successors[0]->index, SRC_DISABLE);
949
950 emit_cf_list(c, &nif->else_list);
951 }
952
953 static void
954 emit_cf_list(struct etna_compile *c, struct exec_list *list)
955 {
956 foreach_list_typed(nir_cf_node, node, node, list) {
957 switch (node->type) {
958 case nir_cf_node_block:
959 emit_block(c, nir_cf_node_as_block(node));
960 break;
961 case nir_cf_node_if:
962 emit_if(c, nir_cf_node_as_if(node));
963 break;
964 case nir_cf_node_loop:
965 emit_cf_list(c, &nir_cf_node_as_loop(node)->body);
966 break;
967 default:
968 compile_error(c, "Unknown NIR node type\n");
969 break;
970 }
971 }
972 }
973
974 /* based on nir_lower_vec_to_movs */
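/* insert a mov for the vecN source at start_idx (folding in identical sources), rewrite those sources to use it, and return the write mask covered by the mov */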
975 static unsigned
976 insert_vec_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
977 {
978 assert(start_idx < nir_op_infos[vec->op].num_inputs);
979 unsigned write_mask = (1u << start_idx);
980
981 nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov);
982 nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
983
984 mov->src[0].swizzle[0] = vec->src[start_idx].swizzle[0];
985 mov->src[0].negate = vec->src[start_idx].negate;
986 mov->src[0].abs = vec->src[start_idx].abs;
987
988 unsigned num_components = 1;
989
990 for (unsigned i = start_idx + 1; i < 4; i++) {
991 if (!(vec->dest.write_mask & (1 << i)))
992 continue;
993
994 if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src) &&
995 vec->src[i].negate == vec->src[start_idx].negate &&
996 vec->src[i].abs == vec->src[start_idx].abs) {
997 write_mask |= (1 << i);
998 mov->src[0].swizzle[num_components] = vec->src[i].swizzle[0];
999 num_components++;
1000 }
1001 }
1002
1003 mov->dest.write_mask = (1 << num_components) - 1;
1004 nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, 32, NULL);
1005
1006 /* replace vec srcs with inserted mov */
1007 for (unsigned i = 0, j = 0; i < 4; i++) {
1008 if (!(write_mask & (1 << i)))
1009 continue;
1010
1011 nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, nir_src_for_ssa(&mov->dest.dest.ssa));
1012 vec->src[i].swizzle[0] = j++;
1013 }
1014
1015 nir_instr_insert_before(&vec->instr, &mov->instr);
1016
1017 return write_mask;
1018 }
1019
1020 /*
1021 * for vecN instructions:
1022 * -merge constant sources into a single src
1023 * -insert movs (nir_lower_vec_to_movs equivalent)
1024 * for non-vecN instructions:
1025 * -try to merge constants as single constant
1026 * -insert movs for multiple constants (pre-HALTI5)
1027 */
1028 static void
1029 lower_alu(struct etna_compile *c, nir_alu_instr *alu)
1030 {
1031 const nir_op_info *info = &nir_op_infos[alu->op];
1032
1033 nir_builder b;
1034 nir_builder_init(&b, c->impl);
1035 b.cursor = nir_before_instr(&alu->instr);
1036
1037 switch (alu->op) {
1038 case nir_op_vec2:
1039 case nir_op_vec3:
1040 case nir_op_vec4:
1041 break;
1042 default:
1043 /* pre-GC7000L can only have 1 uniform src per instruction */
1044 if (c->specs->halti >= 5)
1045 return;
1046
1047 nir_const_value value[4] = {};
1048 uint8_t swizzle[4][4] = {};
1049 unsigned swiz_max = 0, num_const = 0;
1050
1051 for (unsigned i = 0; i < info->num_inputs; i++) {
1052 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1053 if (!cv)
1054 continue;
1055
1056 unsigned num_components = info->input_sizes[i] ?: alu->dest.dest.ssa.num_components;
1057 for (unsigned j = 0; j < num_components; j++) {
1058 int idx = const_add(&value[0].u64, cv[alu->src[i].swizzle[j]].u64);
1059 swizzle[i][j] = idx;
1060 swiz_max = MAX2(swiz_max, (unsigned) idx);
1061 }
1062 num_const++;
1063 }
1064
1065 /* nothing to do */
1066 if (num_const <= 1)
1067 return;
1068
1069 /* resolve with single combined const src */
1070 if (swiz_max < 4) {
1071 nir_ssa_def *def = nir_build_imm(&b, swiz_max + 1, 32, value);
1072
1073 for (unsigned i = 0; i < info->num_inputs; i++) {
1074 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1075 if (!cv)
1076 continue;
1077
1078 nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));
1079
1080 for (unsigned j = 0; j < 4; j++)
1081 alu->src[i].swizzle[j] = swizzle[i][j];
1082 }
1083 return;
1084 }
1085
1086 /* resolve with movs */
1087 num_const = 0;
1088 for (unsigned i = 0; i < info->num_inputs; i++) {
1089 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1090 if (!cv)
1091 continue;
1092
1093 num_const++;
1094 if (num_const == 1)
1095 continue;
1096
1097 nir_ssa_def *mov = nir_mov(&b, alu->src[i].src.ssa);
1098 nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(mov));
1099 }
1100 return;
1101 }
1102
1103 nir_const_value value[4];
1104 unsigned num_components = 0;
1105
1106 for (unsigned i = 0; i < info->num_inputs; i++) {
1107 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1108 if (cv)
1109 value[num_components++] = cv[alu->src[i].swizzle[0]];
1110 }
1111
1112 /* if there is more than one constant source to the vecN, combine them
1113 * into a single load_const (removing the vecN completely if all components
1114 * are constant)
1115 */
1116 if (num_components > 1) {
1117 nir_ssa_def *def = nir_build_imm(&b, num_components, 32, value);
1118
1119 if (num_components == info->num_inputs) {
1120 nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(def));
1121 nir_instr_remove(&alu->instr);
1122 return;
1123 }
1124
1125 for (unsigned i = 0, j = 0; i < info->num_inputs; i++) {
1126 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1127 if (!cv)
1128 continue;
1129
1130 nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));
1131 alu->src[i].swizzle[0] = j++;
1132 }
1133 }
1134
1135 unsigned finished_write_mask = 0;
1136 for (unsigned i = 0; i < 4; i++) {
1137 if (!(alu->dest.write_mask & (1 << i)))
1138 continue;
1139
1140 nir_ssa_def *ssa = alu->src[i].src.ssa;
1141
1142 /* check that vecN instruction is only user of this */
1143 bool need_mov = list_length(&ssa->if_uses) != 0;
1144 nir_foreach_use(use_src, ssa) {
1145 if (use_src->parent_instr != &alu->instr)
1146 need_mov = true;
1147 }
1148
1149 nir_instr *instr = ssa->parent_instr;
1150 switch (instr->type) {
1151 case nir_instr_type_alu:
1152 case nir_instr_type_tex:
1153 break;
1154 case nir_instr_type_intrinsic:
1155 if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_input) {
1156 need_mov = vec_dest_has_swizzle(alu, &nir_instr_as_intrinsic(instr)->dest.ssa);
1157 break;
1158 }
1159 default:
1160 need_mov = true;
1161 }
1162
1163 if (need_mov && !(finished_write_mask & (1 << i)))
1164 finished_write_mask |= insert_vec_mov(alu, i, c->nir);
1165 }
1166 }
1167
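/* lower what is left to the HW level (vecN, constants, uniform loads, output movs), assign registers and emit the instruction stream; returns the temp and constant counts */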
1168 static bool
1169 emit_shader(struct etna_compile *c, unsigned *num_temps, unsigned *num_consts)
1170 {
1171 nir_shader *shader = c->nir;
1172 c->impl = nir_shader_get_entrypoint(shader);
1173
1174 bool have_indirect_uniform = false;
1175 unsigned indirect_max = 0;
1176
1177 nir_builder b;
1178 nir_builder_init(&b, c->impl);
1179
1180 /* convert non-dynamic uniform loads to constants, etc */
1181 nir_foreach_block(block, c->impl) {
1182 nir_foreach_instr_safe(instr, block) {
1183 switch(instr->type) {
1184 case nir_instr_type_alu:
1185 /* deals with vecN and const srcs */
1186 lower_alu(c, nir_instr_as_alu(instr));
1187 break;
1188 case nir_instr_type_load_const: {
1189 nir_load_const_instr *load_const = nir_instr_as_load_const(instr);
1190 for (unsigned i = 0; i < load_const->def.num_components; i++)
1191 load_const->value[i] = CONST(load_const->value[i].u32);
1192 } break;
1193 case nir_instr_type_intrinsic: {
1194 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1195 /* TODO: load_ubo can also become a constant in some cases
1196 * (at the moment it can end up emitting a LOAD with two
1197 * uniform sources, which could be a problem on HALTI2)
1198 */
1199 if (intr->intrinsic != nir_intrinsic_load_uniform)
1200 break;
1201 nir_const_value *off = nir_src_as_const_value(intr->src[0]);
1202 if (!off || off[0].u64 >> 32 != ETNA_IMMEDIATE_CONSTANT) {
1203 have_indirect_uniform = true;
1204 indirect_max = nir_intrinsic_base(intr) + nir_intrinsic_range(intr);
1205 break;
1206 }
1207
1208 unsigned base = nir_intrinsic_base(intr);
1209 /* pre-halti2 the uniform offset will be a float */
1210 if (c->specs->halti < 2)
1211 base += (unsigned) off[0].f32;
1212 else
1213 base += off[0].u32;
1214 nir_const_value value[4];
1215
1216 for (unsigned i = 0; i < intr->dest.ssa.num_components; i++) {
1217 if (nir_intrinsic_base(intr) < 0)
1218 value[i] = TEXSCALE(~nir_intrinsic_base(intr), i);
1219 else
1220 value[i] = UNIFORM(base * 4 + i);
1221 }
1222
1223 b.cursor = nir_after_instr(instr);
1224 nir_ssa_def *def = nir_build_imm(&b, intr->dest.ssa.num_components, 32, value);
1225
1226 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(def));
1227 nir_instr_remove(instr);
1228 } break;
1229 default:
1230 break;
1231 }
1232 }
1233 }
1234
1235 /* TODO: only emit required indirect uniform ranges */
1236 if (have_indirect_uniform) {
1237 for (unsigned i = 0; i < indirect_max * 4; i++)
1238 c->consts[i] = UNIFORM(i).u64;
1239 c->const_count = indirect_max;
1240 }
1241
1242 /* add mov for any store output using sysval/const */
1243 nir_foreach_block(block, c->impl) {
1244 nir_foreach_instr_safe(instr, block) {
1245 if (instr->type != nir_instr_type_intrinsic)
1246 continue;
1247
1248 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1249
1250 switch (intr->intrinsic) {
1251 case nir_intrinsic_store_deref: {
1252 nir_src *src = &intr->src[1];
1253 if (nir_src_is_const(*src) || is_sysval(src->ssa->parent_instr)) {
1254 b.cursor = nir_before_instr(instr);
1255 nir_instr_rewrite_src(instr, src, nir_src_for_ssa(nir_mov(&b, src->ssa)));
1256 }
1257 } break;
1258 default:
1259 break;
1260 }
1261 }
1262 }
1263
1264 /* call directly to avoid validation (load_const instructions don't pass validation at this point) */
1265 nir_convert_from_ssa(shader, true);
1266 nir_opt_dce(shader);
1267
1268 ra_assign(c, shader);
1269
1270 emit_cf_list(c, &nir_shader_get_entrypoint(shader)->body);
1271
1272 *num_temps = ra_finish(c);
1273 *num_consts = c->const_count;
1274 return true;
1275 }
1276
1277 static bool
1278 etna_compile_check_limits(struct etna_shader_variant *v)
1279 {
1280 const struct etna_specs *specs = v->shader->specs;
1281 int max_uniforms = (v->stage == MESA_SHADER_VERTEX)
1282 ? specs->max_vs_uniforms
1283 : specs->max_ps_uniforms;
1284
1285 if (!specs->has_icache && v->needs_icache) {
1286 DBG("Number of instructions (%d) exceeds maximum %d", v->code_size / 4,
1287 specs->max_instructions);
1288 return false;
1289 }
1290
1291 if (v->num_temps > specs->max_registers) {
1292 DBG("Number of registers (%d) exceeds maximum %d", v->num_temps,
1293 specs->max_registers);
1294 return false;
1295 }
1296
1297 if (v->uniforms.imm_count / 4 > max_uniforms) {
1298 DBG("Number of uniforms (%d) exceeds maximum %d",
1299 v->uniforms.imm_count / 4, max_uniforms);
1300 return false;
1301 }
1302
1303 return true;
1304 }
1305
1306 static void
1307 fill_vs_mystery(struct etna_shader_variant *v)
1308 {
1309 const struct etna_specs *specs = v->shader->specs;
1310
1311 v->input_count_unk8 = DIV_ROUND_UP(v->infile.num_reg + 4, 16); /* XXX what is this */
1312
1313 /* fill in "mystery meat" load balancing value. This value determines how
1314 * work is scheduled between VS and PS
1315 * in the unified shader architecture. More precisely, it is determined from
1316 * the number of VS outputs, as well as chip-specific
1317 * vertex output buffer size, vertex cache size, and the number of shader
1318 * cores.
1319 *
1320 * XXX this is a conservative estimate, the "optimal" value is only known for
1321 * sure at link time because some
1322 * outputs may be unused and thus unmapped. Then again, in the general use
1323 * case with GLSL the vertex and fragment
1324 * shaders are linked already before submitting to Gallium, thus all outputs
1325 * are used.
1326 *
1327 * note: TGSI compiler counts all outputs (including position and pointsize), here
1328 * v->outfile.num_reg only counts varyings, +1 to compensate for the position output
1329 * TODO: might have a problem that we don't count pointsize when it is used
1330 */
1331
1332 int half_out = v->outfile.num_reg / 2 + 1;
1333 assert(half_out);
1334
1335 uint32_t b = ((20480 / (specs->vertex_output_buffer_size -
1336 2 * half_out * specs->vertex_cache_size)) +
1337 9) /
1338 10;
1339 uint32_t a = (b + 256 / (specs->shader_core_count * half_out)) / 2;
1340 v->vs_load_balancing = VIVS_VS_LOAD_BALANCING_A(MIN2(a, 255)) |
1341 VIVS_VS_LOAD_BALANCING_B(MIN2(b, 255)) |
1342 VIVS_VS_LOAD_BALANCING_C(0x3f) |
1343 VIVS_VS_LOAD_BALANCING_D(0x0f);
1344 }
1345
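/* main compile entry point: run the NIR lowering and optimization pipeline on a clone of the shader, emit and assemble the machine code, and fill in the variant state */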
1346 bool
1347 etna_compile_shader_nir(struct etna_shader_variant *v)
1348 {
1349 if (unlikely(!v))
1350 return false;
1351
1352 struct etna_compile *c = CALLOC_STRUCT(etna_compile);
1353 if (!c)
1354 return false;
1355
1356 c->variant = v;
1357 c->specs = v->shader->specs;
1358 c->nir = nir_shader_clone(NULL, v->shader->nir);
1359
1360 nir_shader *s = c->nir;
1361 const struct etna_specs *specs = c->specs;
1362
1363 v->stage = s->info.stage;
1364 v->num_loops = 0; /* TODO */
1365 v->vs_id_in_reg = -1;
1366 v->vs_pos_out_reg = -1;
1367 v->vs_pointsize_out_reg = -1;
1368 v->ps_color_out_reg = 0; /* 0 for shader that doesn't write fragcolor.. */
1369 v->ps_depth_out_reg = -1;
1370
1371 /* setup input linking */
1372 struct etna_shader_io_file *sf = &v->infile;
1373 if (s->info.stage == MESA_SHADER_VERTEX) {
1374 nir_foreach_variable(var, &s->inputs) {
1375 unsigned idx = var->data.driver_location;
1376 sf->reg[idx].reg = idx;
1377 sf->reg[idx].slot = var->data.location;
1378 sf->reg[idx].num_components = glsl_get_components(var->type);
1379 sf->num_reg = MAX2(sf->num_reg, idx+1);
1380 }
1381 } else {
1382 unsigned count = 0;
1383 nir_foreach_variable(var, &s->inputs) {
1384 unsigned idx = var->data.driver_location;
1385 sf->reg[idx].reg = idx + 1;
1386 sf->reg[idx].slot = var->data.location;
1387 sf->reg[idx].num_components = glsl_get_components(var->type);
1388 sf->num_reg = MAX2(sf->num_reg, idx+1);
1389 count++;
1390 }
1391 assert(sf->num_reg == count);
1392 }
1393
1394 NIR_PASS_V(s, nir_lower_io, ~nir_var_shader_out, etna_glsl_type_size,
1395 (nir_lower_io_options)0);
1396
1397 NIR_PASS_V(s, nir_lower_regs_to_ssa);
1398 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1399 NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_all);
1400 NIR_PASS_V(s, nir_lower_tex, &(struct nir_lower_tex_options) { .lower_txp = ~0u });
1401 NIR_PASS_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
1402
1403 etna_optimize_loop(s);
1404
1405 NIR_PASS_V(s, etna_lower_io, v);
1406
1407 if (v->shader->specs->vs_need_z_div)
1408 NIR_PASS_V(s, nir_lower_clip_halfz);
1409
1410 /* lower pre-halti2 to float (halti0 has integers, but only scalar..) */
1411 if (c->specs->halti < 2) {
1412 /* use opt_algebraic between int_to_float and bool_to_float because
1413 * int_to_float emits ftrunc, and ftrunc lowering generates bool ops
1414 */
1415 NIR_PASS_V(s, nir_lower_int_to_float);
1416 NIR_PASS_V(s, nir_opt_algebraic);
1417 NIR_PASS_V(s, nir_lower_bool_to_float);
1418 } else {
1419 NIR_PASS_V(s, nir_lower_idiv, nir_lower_idiv_fast);
1420 NIR_PASS_V(s, nir_lower_bool_to_int32);
1421 }
1422
1423 etna_optimize_loop(s);
1424
1425 if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
1426 nir_print_shader(s, stdout);
1427
1428 while( OPT(s, nir_opt_vectorize) );
1429 NIR_PASS_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
1430
1431 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
1432 NIR_PASS_V(s, nir_opt_algebraic_late);
1433
1434 NIR_PASS_V(s, nir_move_vec_src_uses_to_dest);
1435 NIR_PASS_V(s, nir_copy_prop);
1436 /* the only integer source mod supported by the HW is ineg on the iadd instruction (?) */
1437 NIR_PASS_V(s, nir_lower_to_source_mods, ~nir_lower_int_source_mods);
1438 /* need copy prop after uses_to_dest, and before src mods: see
1439 * dEQP-GLES2.functional.shaders.random.all_features.fragment.95
1440 */
1441
1442 NIR_PASS_V(s, nir_opt_dce);
1443
1444 NIR_PASS_V(s, etna_lower_alu, c->specs->has_new_transcendentals);
1445
1446 if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
1447 nir_print_shader(s, stdout);
1448
1449 unsigned block_ptr[nir_shader_get_entrypoint(s)->num_blocks];
1450 c->block_ptr = block_ptr;
1451
1452 unsigned num_consts;
1453 ASSERTED bool ok = emit_shader(c, &v->num_temps, &num_consts);
1454 assert(ok);
1455
1456 /* empty shader, emit NOP */
1457 if (!c->inst_ptr)
1458 emit_inst(c, &(struct etna_inst) { .opcode = INST_OPCODE_NOP });
1459
1460 /* assemble instructions, fixing up labels */
1461 uint32_t *code = MALLOC(c->inst_ptr * 16);
1462 for (unsigned i = 0; i < c->inst_ptr; i++) {
1463 struct etna_inst *inst = &c->code[i];
1464 if (inst->opcode == INST_OPCODE_BRANCH)
1465 inst->imm = block_ptr[inst->imm];
1466
1467 inst->halti5 = specs->halti >= 5;
1468 etna_assemble(&code[i * 4], inst);
1469 }
1470
1471 v->code_size = c->inst_ptr * 4;
1472 v->code = code;
1473 v->needs_icache = c->inst_ptr > specs->max_instructions;
1474
1475 copy_uniform_state_to_shader(v, c->consts, num_consts);
1476
1477 if (s->info.stage == MESA_SHADER_FRAGMENT) {
1478 v->input_count_unk8 = 31; /* XXX what is this */
1479 assert(v->ps_depth_out_reg <= 0);
1480 } else {
1481 fill_vs_mystery(v);
1482 }
1483
1484 bool result = etna_compile_check_limits(v);
1485 ralloc_free(c->nir);
1486 FREE(c);
1487 return result;
1488 }
1489
1490 void
1491 etna_destroy_shader_nir(struct etna_shader_variant *shader)
1492 {
1493 assert(shader);
1494
1495 FREE(shader->code);
1496 FREE(shader->uniforms.imm_data);
1497 FREE(shader->uniforms.imm_contents);
1498 FREE(shader);
1499 }
1500
1501 extern const char *tgsi_swizzle_names[];
1502 void
1503 etna_dump_shader_nir(const struct etna_shader_variant *shader)
1504 {
1505 if (shader->stage == MESA_SHADER_VERTEX)
1506 printf("VERT\n");
1507 else
1508 printf("FRAG\n");
1509
1510 etna_disasm(shader->code, shader->code_size, PRINT_RAW);
1511
1512 printf("num loops: %i\n", shader->num_loops);
1513 printf("num temps: %i\n", shader->num_temps);
1514 printf("immediates:\n");
1515 for (int idx = 0; idx < shader->uniforms.imm_count; ++idx) {
1516 printf(" [%i].%s = %f (0x%08x) (%d)\n",
1517 idx / 4,
1518 tgsi_swizzle_names[idx % 4],
1519 *((float *)&shader->uniforms.imm_data[idx]),
1520 shader->uniforms.imm_data[idx],
1521 shader->uniforms.imm_contents[idx]);
1522 }
1523 printf("inputs:\n");
1524 for (int idx = 0; idx < shader->infile.num_reg; ++idx) {
1525 printf(" [%i] name=%s comps=%i\n", shader->infile.reg[idx].reg,
1526 (shader->stage == MESA_SHADER_VERTEX) ?
1527 gl_vert_attrib_name(shader->infile.reg[idx].slot) :
1528 gl_varying_slot_name(shader->infile.reg[idx].slot),
1529 shader->infile.reg[idx].num_components);
1530 }
1531 printf("outputs:\n");
1532 for (int idx = 0; idx < shader->outfile.num_reg; ++idx) {
1533 printf(" [%i] name=%s comps=%i\n", shader->outfile.reg[idx].reg,
1534 (shader->stage == MESA_SHADER_VERTEX) ?
1535 gl_varying_slot_name(shader->outfile.reg[idx].slot) :
1536 gl_frag_result_name(shader->outfile.reg[idx].slot),
1537 shader->outfile.reg[idx].num_components);
1538 }
1539 printf("special:\n");
1540 if (shader->stage == MESA_SHADER_VERTEX) {
1541 printf(" vs_pos_out_reg=%i\n", shader->vs_pos_out_reg);
1542 printf(" vs_pointsize_out_reg=%i\n", shader->vs_pointsize_out_reg);
1543 printf(" vs_load_balancing=0x%08x\n", shader->vs_load_balancing);
1544 } else {
1545 printf(" ps_color_out_reg=%i\n", shader->ps_color_out_reg);
1546 printf(" ps_depth_out_reg=%i\n", shader->ps_depth_out_reg);
1547 }
1548 printf(" input_count_unk8=0x%08x\n", shader->input_count_unk8);
1549 }
1550
1551 static const struct etna_shader_inout *
1552 etna_shader_vs_lookup(const struct etna_shader_variant *sobj,
1553 const struct etna_shader_inout *in)
1554 {
1555 for (int i = 0; i < sobj->outfile.num_reg; i++)
1556 if (sobj->outfile.reg[i].slot == in->slot)
1557 return &sobj->outfile.reg[i];
1558
1559 return NULL;
1560 }
1561
1562 bool
1563 etna_link_shader_nir(struct etna_shader_link_info *info,
1564 const struct etna_shader_variant *vs,
1565 const struct etna_shader_variant *fs)
1566 {
1567 int comp_ofs = 0;
1568 /* For each fragment input we need to find the associated vertex shader
1569 * output, which can be found by matching on semantic name and index. A
1570 * binary search could be used because the vs outputs are sorted by their
1571 * semantic index and grouped by semantic type by fill_in_vs_outputs.
1572 */
1573 assert(fs->infile.num_reg < ETNA_NUM_INPUTS);
1574 info->pcoord_varying_comp_ofs = -1;
1575
1576 for (int idx = 0; idx < fs->infile.num_reg; ++idx) {
1577 const struct etna_shader_inout *fsio = &fs->infile.reg[idx];
1578 const struct etna_shader_inout *vsio = etna_shader_vs_lookup(vs, fsio);
1579 struct etna_varying *varying;
1580 bool interpolate_always = true;
1581
1582 assert(fsio->reg > 0 && fsio->reg <= ARRAY_SIZE(info->varyings));
1583
1584 if (fsio->reg > info->num_varyings)
1585 info->num_varyings = fsio->reg;
1586
1587 varying = &info->varyings[fsio->reg - 1];
1588 varying->num_components = fsio->num_components;
1589
1590 if (!interpolate_always) /* colors affected by flat shading */
1591 varying->pa_attributes = 0x200;
1592 else /* texture coord or other bypasses flat shading */
1593 varying->pa_attributes = 0x2f1;
1594
1595 varying->use[0] = VARYING_COMPONENT_USE_UNUSED;
1596 varying->use[1] = VARYING_COMPONENT_USE_UNUSED;
1597 varying->use[2] = VARYING_COMPONENT_USE_UNUSED;
1598 varying->use[3] = VARYING_COMPONENT_USE_UNUSED;
1599
1600 /* point coord is an input to the PS without matching VS output,
1601 * so it gets a varying slot without being assigned a VS register.
1602 */
1603 if (fsio->slot == VARYING_SLOT_PNTC) {
1604 varying->use[0] = VARYING_COMPONENT_USE_POINTCOORD_X;
1605 varying->use[1] = VARYING_COMPONENT_USE_POINTCOORD_Y;
1606
1607 info->pcoord_varying_comp_ofs = comp_ofs;
1608 } else {
1609 if (vsio == NULL) { /* not found -- link error */
1610 BUG("Semantic value not found in vertex shader outputs\n");
1611 return true;
1612 }
1613 varying->reg = vsio->reg;
1614 }
1615
1616 comp_ofs += varying->num_components;
1617 }
1618
1619 assert(info->num_varyings == fs->infile.num_reg);
1620
1621 return false;
1622 }