src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c

   1 /*
   2  * Copyright (c) 2012-2019 Etnaviv Project
   3  * Copyright (c) 2019 Zodiac Inflight Innovations
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sub license,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the
  13  * next paragraph) shall be included in all copies or substantial portions
  14  * of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  22  * DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors:
  25  *    Jonathan Marek <jonathan@marek.ca>
  26  *    Wladimir J. van der Laan <laanwj@gmail.com>
  27  */
  28
  29 #include "etnaviv_compiler.h"
  30 #include "etnaviv_asm.h"
  31 #include "etnaviv_context.h"
  32 #include "etnaviv_debug.h"
  33 #include "etnaviv_disasm.h"
  34 #include "etnaviv_uniforms.h"
  35 #include "etnaviv_util.h"
  36
  37 #include <math.h>
  38 #include "util/u_memory.h"
  39 #include "util/register_allocate.h"
  40 #include "compiler/nir/nir_builder.h"
  41 #include "compiler/nir/nir_worklist.h"
  42
  43 #include "tgsi/tgsi_strings.h"
  44 #include "util/u_half.h"
  45
/* Per-compilation state shared by the lowering passes and the emit helpers
 * in this file. Allocated zeroed at the start of etna_compile_shader_nir()
 * and freed before it returns.
 */
struct etna_compile {
   /* working clone of the variant's NIR shader */
   nir_shader *nir;
/* true when the shader held by compile state c is a fragment shader */
#define is_fs(c) ((c)->nir->info.stage == MESA_SHADER_FRAGMENT)
   /* hardware specs, taken from the variant's shader */
   const struct etna_specs *specs;
   /* variant being compiled */
   struct etna_shader_variant *variant;

   /* register assigned to each output, indexed by driver_location */
   unsigned output_reg[ETNA_NUM_INPUTS];

   /* block # to instr index; filled by etna_emit_block_start() and used to
    * patch branch targets when the code is assembled
    */
   unsigned *block_ptr;

   /* Code generation */
   int inst_ptr; /* current instruction pointer (next free slot in code[]) */
   struct etna_inst code[ETNA_MAX_INSTRUCTIONS * ETNA_INST_SIZE];

   /* There was an error during compilation */
   bool error;
};
  65
/* Report a compilation error: print the printf-style message, set the error
 * flag on the compile state ctx, then trip assert(0). When assertions are
 * compiled out only the message and the flag remain.
 */
#define compile_error(ctx, args...) ({ \
   printf(args); \
   ctx->error = true; \
   assert(0); \
})
  71
  72 /* io related lowering
  73  * run after lower_int_to_float because it adds i2f/f2i ops
  74  */
  75 static void
  76 etna_lower_io(nir_shader *shader, struct etna_shader_variant *v)
  77 {
  78    bool rb_swap = shader->info.stage == MESA_SHADER_FRAGMENT && v->key.frag_rb_swap;
  79
  80    unsigned color_location = 0;
  81    nir_foreach_variable(var, &shader->outputs) {
  82       switch (var->data.location) {
  83       case FRAG_RESULT_COLOR:
  84       case FRAG_RESULT_DATA0:
  85          color_location = var->data.driver_location;
  86          break;
  87       }
  88    }
  89
  90    nir_foreach_function(function, shader) {
  91       nir_builder b;
  92       nir_builder_init(&b, function->impl);
  93
  94       nir_foreach_block(block, function->impl) {
  95          nir_foreach_instr_safe(instr, block) {
  96             if (instr->type == nir_instr_type_intrinsic) {
  97                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
  98
  99                switch (intr->intrinsic) {
 100                case nir_intrinsic_load_front_face: {
 101                   if (!v->key.front_ccw)
 102                      break;
 103
 104                   /* front face inverted (run after int_to_float, so invert as float) */
 105                   b.cursor = nir_after_instr(instr);
 106
 107                   nir_ssa_def *ssa = nir_seq(&b, &intr->dest.ssa, nir_imm_float(&b, 0.0));
 108                   nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
 109                                                  nir_src_for_ssa(ssa),
 110                                                  ssa->parent_instr);
 111                } break;
 112                case nir_intrinsic_store_output: {
 113                   if (!rb_swap || nir_intrinsic_base(intr) != color_location)
 114                      break;
 115                   b.cursor = nir_before_instr(instr);
 116
 117                   nir_ssa_def *ssa = nir_mov(&b, intr->src[0].ssa);
 118                   nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr);
 119                   alu->src[0].swizzle[0] = 2;
 120                   alu->src[0].swizzle[2] = 0;
 121                   nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(ssa));
 122                } break;
 123                case nir_intrinsic_load_instance_id: {
 124                   b.cursor = nir_after_instr(instr);
 125                   nir_ssa_def *ssa = nir_i2f32(&b, &intr->dest.ssa);
 126                   nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
 127                                                  nir_src_for_ssa(ssa),
 128                                                  ssa->parent_instr);
 129                } break;
 130                case nir_intrinsic_load_uniform: {
 131                   /* multiply by 16 and convert to int */
 132                   b.cursor = nir_before_instr(instr);
 133                   nir_ssa_def *ssa = nir_f2u32(&b, nir_fmul(&b, intr->src[0].ssa,
 134                                                                 nir_imm_float(&b, 16.0f)));
 135                   nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(ssa));
 136                } break;
 137                default:
 138                   break;
 139                }
 140             }
 141
 142             if (instr->type != nir_instr_type_tex)
 143                continue;
 144
 145             nir_tex_instr *tex = nir_instr_as_tex(instr);
 146             nir_src *coord = NULL;
 147             nir_src *lod_bias = NULL;
 148             unsigned lod_bias_idx;
 149
 150             assert(tex->sampler_index == tex->texture_index);
 151
 152             for (unsigned i = 0; i < tex->num_srcs; i++) {
 153                switch (tex->src[i].src_type) {
 154                case nir_tex_src_coord:
 155                   coord = &tex->src[i].src;
 156                   break;
 157                case nir_tex_src_bias:
 158                case nir_tex_src_lod:
 159                   assert(!lod_bias);
 160                   lod_bias = &tex->src[i].src;
 161                   lod_bias_idx = i;
 162                   break;
 163                default:
 164                   assert(0);
 165                   break;
 166                }
 167             }
 168
 169             if (tex->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
 170                /* use a dummy load_uniform here to represent texcoord scale */
 171                b.cursor = nir_before_instr(instr);
 172                nir_intrinsic_instr *load =
 173                   nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_uniform);
 174                nir_intrinsic_set_base(load, ~tex->sampler_index);
 175                load->num_components = 2;
 176                load->src[0] = nir_src_for_ssa(nir_imm_float(&b, 0.0f));
 177                nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL);
 178                nir_intrinsic_set_type(load, nir_type_float);
 179
 180                nir_builder_instr_insert(&b, &load->instr);
 181
 182                nir_ssa_def *new_coord = nir_fmul(&b, coord->ssa, &load->dest.ssa);
 183                nir_instr_rewrite_src(&tex->instr, coord, nir_src_for_ssa(new_coord));
 184             }
 185
 186             /* pre HALTI5 needs texture sources in a single source */
 187
 188             if (!lod_bias || v->shader->specs->halti >= 5)
 189                continue;
 190
 191             assert(coord && lod_bias && tex->coord_components < 4);
 192
 193             nir_alu_instr *vec = nir_alu_instr_create(shader, nir_op_vec4);
 194             for (unsigned i = 0; i < tex->coord_components; i++) {
 195                vec->src[i].src = nir_src_for_ssa(coord->ssa);
 196                vec->src[i].swizzle[0] = i;
 197             }
 198             for (unsigned i = tex->coord_components; i < 4; i++)
 199                vec->src[i].src = nir_src_for_ssa(lod_bias->ssa);
 200
 201             vec->dest.write_mask = 0xf;
 202             nir_ssa_dest_init(&vec->instr, &vec->dest.dest, 4, 32, NULL);
 203
 204             nir_tex_instr_remove_src(tex, lod_bias_idx);
 205             nir_instr_rewrite_src(&tex->instr, coord, nir_src_for_ssa(&vec->dest.dest.ssa));
 206             tex->coord_components = 4;
 207
 208             nir_instr_insert_before(&tex->instr, &vec->instr);
 209          }
 210       }
 211    }
 212 }
 213
 214 static bool
 215 etna_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
 216 {
 217    const struct etna_specs *specs = data;
 218
 219    if (instr->type != nir_instr_type_alu)
 220       return false;
 221
 222    nir_alu_instr *alu = nir_instr_as_alu(instr);
 223    switch (alu->op) {
 224    case nir_op_frsq:
 225    case nir_op_frcp:
 226    case nir_op_flog2:
 227    case nir_op_fexp2:
 228    case nir_op_fsqrt:
 229    case nir_op_fcos:
 230    case nir_op_fsin:
 231    case nir_op_fdiv:
 232       return true;
 233    case nir_op_fdot2:
 234       if (!specs->has_halti2_instructions)
 235          return true;
 236       break;
 237    default:
 238       break;
 239    }
 240
 241    return false;
 242 }
 243
 244 static void
 245 etna_lower_alu_impl(nir_function_impl *impl, struct etna_compile *c)
 246 {
 247    nir_shader *shader = impl->function->shader;
 248
 249    nir_builder b;
 250    nir_builder_init(&b, impl);
 251
 252    /* in a seperate loop so we can apply the multiple-uniform logic to the new fmul */
 253    nir_foreach_block(block, impl) {
 254       nir_foreach_instr_safe(instr, block) {
 255          if (instr->type != nir_instr_type_alu)
 256             continue;
 257
 258          nir_alu_instr *alu = nir_instr_as_alu(instr);
 259          /* multiply sin/cos src by constant
 260           * TODO: do this earlier (but it breaks const_prop opt)
 261           */
 262          if (alu->op == nir_op_fsin || alu->op == nir_op_fcos) {
 263             b.cursor = nir_before_instr(instr);
 264
 265             nir_ssa_def *imm = c->specs->has_new_transcendentals ?
 266                nir_imm_float(&b, 1.0 / M_PI) :
 267                nir_imm_float(&b, 2.0 / M_PI);
 268
 269             nir_instr_rewrite_src(instr, &alu->src[0].src,
 270                nir_src_for_ssa(nir_fmul(&b, alu->src[0].src.ssa, imm)));
 271          }
 272
 273          /* change transcendental ops to vec2 and insert vec1 mul for the result
 274           * TODO: do this earlier (but it breaks with optimizations)
 275           */
 276          if (c->specs->has_new_transcendentals && (
 277              alu->op == nir_op_fdiv || alu->op == nir_op_flog2 ||
 278              alu->op == nir_op_fsin || alu->op == nir_op_fcos)) {
 279             nir_ssa_def *ssa = &alu->dest.dest.ssa;
 280
 281             assert(ssa->num_components == 1);
 282
 283             nir_alu_instr *mul = nir_alu_instr_create(shader, nir_op_fmul);
 284             mul->src[0].src = mul->src[1].src = nir_src_for_ssa(ssa);
 285             mul->src[1].swizzle[0] = 1;
 286
 287             mul->dest.write_mask = 1;
 288             nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
 289
 290             ssa->num_components = 2;
 291
 292             mul->dest.saturate = alu->dest.saturate;
 293             alu->dest.saturate = 0;
 294
 295             nir_instr_insert_after(instr, &mul->instr);
 296
 297             nir_ssa_def_rewrite_uses_after(ssa, nir_src_for_ssa(&mul->dest.dest.ssa), &mul->instr);
 298          }
 299       }
 300    }
 301 }
 302
 303 static void etna_lower_alu(nir_shader *shader, struct etna_compile *c)
 304 {
 305    nir_foreach_function(function, shader) {
 306       if (function->impl)
 307          etna_lower_alu_impl(function->impl, c);
 308    }
 309 }
 310
 311 static void
 312 emit_inst(struct etna_compile *c, struct etna_inst *inst)
 313 {
 314    c->code[c->inst_ptr++] = *inst;
 315 }
 316
/* Maps nir srcs to etna_inst srcs.
 *
 * Each value packs three 2-bit fields, one per etna_inst source slot (slot j
 * at bits 2*j..2*j+1). A field holds the index of the nir source placed in
 * that slot; the value 3 (written X in the names) leaves the slot unused.
 * Decoded in etna_emit_alu().
 */
enum {
   SRC_0_1_2 = (0 << 0) | (1 << 2) | (2 << 4),
   SRC_0_1_X = (0 << 0) | (1 << 2) | (3 << 4),
   SRC_0_X_X = (0 << 0) | (3 << 2) | (3 << 4),
   SRC_0_X_1 = (0 << 0) | (3 << 2) | (1 << 4),
   SRC_0_1_0 = (0 << 0) | (1 << 2) | (0 << 4),
   SRC_X_X_0 = (3 << 0) | (3 << 2) | (0 << 4),
   SRC_0_X_0 = (0 << 0) | (3 << 2) | (0 << 4),
};
 327
/* info to translate a nir op to etna_inst
 *
 * One entry of the etna_ops table; an opcode of 0xff marks a nir op that has
 * no translation (etna_emit_alu() asserts on it).
 */
struct etna_op_info {
   uint8_t opcode; /* INST_OPCODE_ */
   uint8_t src; /* SRC_ enum  */
   uint8_t cond; /* INST_CONDITION_ */
   uint8_t type; /* INST_TYPE_ */
};
 335
/* Translation table indexed by nir_op. Entries are written with the helper
 * macros below:
 *   OPCT(nir, op, src, cond, type) - full entry
 *   OPC(nir, op, src, cond)        - entry with type F32
 *   OP(nir, op, src)               - entry with condition TRUE and type F32
 */
static const struct etna_op_info etna_ops[] = {
   /* start with every op marked untranslatable (opcode 0xff) */
   [0 ... nir_num_opcodes - 1] = {0xff},
/* presumably undefined so that TRUE/FALSE paste literally into
 * INST_CONDITION_TRUE/FALSE instead of expanding as macros first
 */
#undef TRUE
#undef FALSE
#define OPCT(nir, op, src, cond, type) [nir_op_##nir] = { \
   INST_OPCODE_##op, \
   SRC_##src, \
   INST_CONDITION_##cond, \
   INST_TYPE_##type \
}
#define OPC(nir, op, src, cond) OPCT(nir, op, src, cond, F32)
#define OP(nir, op, src) OPC(nir, op, src, TRUE)
   /* mov and the source/dest modifier ops all become a plain MOV */
   OP(mov, MOV, X_X_0), OP(fneg, MOV, X_X_0), OP(fabs, MOV, X_X_0), OP(fsat, MOV, X_X_0),
   OP(fmul, MUL, 0_1_X), OP(fadd, ADD, 0_X_1), OP(ffma, MAD, 0_1_2),
   OP(fdot2, DP2, 0_1_X), OP(fdot3, DP3, 0_1_X), OP(fdot4, DP4, 0_1_X),
   /* min/max are expressed as conditional selects */
   OPC(fmin, SELECT, 0_1_0, GT), OPC(fmax, SELECT, 0_1_0, LT),
   OP(ffract, FRC, X_X_0), OP(frcp, RCP, X_X_0), OP(frsq, RSQ, X_X_0),
   OP(fsqrt, SQRT, X_X_0), OP(fsin, SIN, X_X_0), OP(fcos, COS, X_X_0),
   OP(fsign, SIGN, X_X_0), OP(ffloor, FLOOR, X_X_0), OP(fceil, CEIL, X_X_0),
   OP(flog2, LOG, X_X_0), OP(fexp2, EXP, X_X_0),
   OPC(seq, SET, 0_1_X, EQ), OPC(sne, SET, 0_1_X, NE), OPC(sge, SET, 0_1_X, GE), OPC(slt, SET, 0_1_X, LT),
   OPC(fcsel, SELECT, 0_1_2, NZ),
   OP(fdiv, DIV, 0_1_X),
   OP(fddx, DSX, 0_X_0), OP(fddy, DSY, 0_X_0),

   /* integer opcodes */
   OPCT(i2f32, I2F, 0_X_X, TRUE, S32),
   OPCT(f2u32, F2I, 0_X_X, TRUE, U32),
};
 365
 366 static void
 367 etna_emit_block_start(struct etna_compile *c, unsigned block)
 368 {
 369    c->block_ptr[block] = c->inst_ptr;
 370 }
 371
 372 static void
 373 etna_emit_alu(struct etna_compile *c, nir_op op, struct etna_inst_dst dst,
 374               struct etna_inst_src src[3], bool saturate)
 375 {
 376    struct etna_op_info ei = etna_ops[op];
 377
 378    assert(ei.opcode != 0xff);
 379
 380    struct etna_inst inst = {
 381       .opcode = ei.opcode,
 382       .type = ei.type,
 383       .cond = ei.cond,
 384       .dst = dst,
 385       .sat = saturate,
 386    };
 387
 388    switch (op) {
 389    case nir_op_fdiv:
 390    case nir_op_flog2:
 391    case nir_op_fsin:
 392    case nir_op_fcos:
 393       if (c->specs->has_new_transcendentals)
 394          inst.tex.amode = 1;
 395       /* fall through */
 396    case nir_op_frsq:
 397    case nir_op_frcp:
 398    case nir_op_fexp2:
 399    case nir_op_fsqrt:
 400    case nir_op_i2f32:
 401    case nir_op_f2u32:
 402       /* for these instructions we want src to be in x component
 403        * note: on HALTI2+ i2f/f2u are not scalar but we only use them this way currently
 404        */
 405       src[0].swiz = inst_swiz_compose(src[0].swiz,
 406                                       INST_SWIZ_BROADCAST(ffs(inst.dst.write_mask)-1));
 407    default:
 408       break;
 409    }
 410
 411    for (unsigned j = 0; j < 3; j++) {
 412       unsigned i = ((ei.src >> j*2) & 3);
 413       if (i < 3)
 414          inst.src[j] = src[i];
 415    }
 416
 417    emit_inst(c, &inst);
 418 }
 419
 420 static void
 421 etna_emit_tex(struct etna_compile *c, nir_texop op, unsigned texid, unsigned dst_swiz,
 422               struct etna_inst_dst dst, struct etna_inst_src coord,
 423               struct etna_inst_src lod_bias)
 424 {
 425    struct etna_inst inst = {
 426       .dst = dst,
 427       .tex.id = texid + (is_fs(c) ? 0 : c->specs->vertex_sampler_offset),
 428       .tex.swiz = dst_swiz,
 429       .src[0] = coord,
 430    };
 431
 432    if (lod_bias.use)
 433       inst.src[1] = lod_bias;
 434
 435    switch (op) {
 436    case nir_texop_tex: inst.opcode = INST_OPCODE_TEXLD; break;
 437    case nir_texop_txb: inst.opcode = INST_OPCODE_TEXLDB; break;
 438    case nir_texop_txl: inst.opcode = INST_OPCODE_TEXLDL; break;
 439    default:
 440       assert(0);
 441    }
 442
 443    emit_inst(c, &inst);
 444 }
 445
 446 static void
 447 etna_emit_jump(struct etna_compile *c, unsigned block, struct etna_inst_src condition)
 448 {
 449    if (!condition.use) {
 450       emit_inst(c, &(struct etna_inst) {.opcode = INST_OPCODE_BRANCH, .imm = block });
 451       return;
 452    }
 453
 454    struct etna_inst inst = {
 455       .opcode = INST_OPCODE_BRANCH,
 456       .cond = INST_CONDITION_NOT,
 457       .type = INST_TYPE_U32,
 458       .src[0] = condition,
 459       .imm = block,
 460    };
 461    inst.src[0].swiz = INST_SWIZ_BROADCAST(inst.src[0].swiz & 3);
 462    emit_inst(c, &inst);
 463 }
 464
 465 static void
 466 etna_emit_discard(struct etna_compile *c, struct etna_inst_src condition)
 467 {
 468    if (!condition.use) {
 469       emit_inst(c, &(struct etna_inst) { .opcode = INST_OPCODE_TEXKILL });
 470       return;
 471    }
 472
 473    struct etna_inst inst = {
 474       .opcode = INST_OPCODE_TEXKILL,
 475       .cond = INST_CONDITION_GZ,
 476       .src[0] = condition,
 477    };
 478    inst.src[0].swiz = INST_SWIZ_BROADCAST(inst.src[0].swiz & 3);
 479    emit_inst(c, &inst);
 480 }
 481
 482 static void
 483 etna_emit_output(struct etna_compile *c, unsigned index, struct etna_inst_src src)
 484 {
 485    c->output_reg[index] = src.reg;
 486 }
 487
 488 static void
 489 etna_emit_load_ubo(struct etna_compile *c, struct etna_inst_dst dst,
 490                    struct etna_inst_src src, struct etna_inst_src base)
 491 {
 492    emit_inst(c, &(struct etna_inst) {
 493       .opcode = INST_OPCODE_LOAD,
 494       .type = INST_TYPE_U32,
 495       .dst = dst,
 496       .src[0] = src,
 497       .src[1] = base,
 498    });
 499 }
 500
/* Run a NIR pass through NIR_PASS; the expression evaluates to the progress
 * flag NIR_PASS stored for that pass.
 */
#define OPT(nir, pass, ...) ({                             \
   bool this_progress = false;                             \
   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
   this_progress;                                          \
})
/* Run a NIR pass through NIR_PASS_V, without tracking progress. */
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 507
 508 static void
 509 etna_optimize_loop(nir_shader *s)
 510 {
 511    bool progress;
 512    do {
 513       progress = false;
 514
 515       OPT_V(s, nir_lower_vars_to_ssa);
 516       progress |= OPT(s, nir_opt_copy_prop_vars);
 517       progress |= OPT(s, nir_copy_prop);
 518       progress |= OPT(s, nir_opt_dce);
 519       progress |= OPT(s, nir_opt_cse);
 520       progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
 521       progress |= OPT(s, nir_opt_intrinsics);
 522       progress |= OPT(s, nir_opt_algebraic);
 523       progress |= OPT(s, nir_opt_constant_folding);
 524       progress |= OPT(s, nir_opt_dead_cf);
 525       if (OPT(s, nir_opt_trivial_continues)) {
 526          progress = true;
 527          /* If nir_opt_trivial_continues makes progress, then we need to clean
 528           * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
 529           * to make progress.
 530           */
 531          OPT(s, nir_copy_prop);
 532          OPT(s, nir_opt_dce);
 533       }
 534       progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
 535       progress |= OPT(s, nir_opt_if, false);
 536       progress |= OPT(s, nir_opt_remove_phis);
 537       progress |= OPT(s, nir_opt_undef);
 538    }
 539    while (progress);
 540 }
 541
 542 static int
 543 etna_glsl_type_size(const struct glsl_type *type, bool bindless)
 544 {
 545    return glsl_count_attribute_slots(type, false);
 546 }
 547
 548 static void
 549 copy_uniform_state_to_shader(struct etna_shader_variant *sobj, uint64_t *consts, unsigned count)
 550 {
 551    struct etna_shader_uniform_info *uinfo = &sobj->uniforms;
 552
 553    uinfo->imm_count = count * 4;
 554    uinfo->imm_data = MALLOC(uinfo->imm_count * sizeof(*uinfo->imm_data));
 555    uinfo->imm_contents = MALLOC(uinfo->imm_count * sizeof(*uinfo->imm_contents));
 556
 557    for (unsigned i = 0; i < uinfo->imm_count; i++) {
 558       uinfo->imm_data[i] = consts[i];
 559       uinfo->imm_contents[i] = consts[i] >> 32;
 560    }
 561
 562    etna_set_shader_uniforms_dirty_flags(sobj);
 563 }
 564
 565 #include "etnaviv_compiler_nir_emit.h"
 566
 567 bool
 568 etna_compile_shader_nir(struct etna_shader_variant *v)
 569 {
 570    if (unlikely(!v))
 571       return false;
 572
 573    struct etna_compile *c = CALLOC_STRUCT(etna_compile);
 574    if (!c)
 575       return false;
 576
 577    c->variant = v;
 578    c->specs = v->shader->specs;
 579    c->nir = nir_shader_clone(NULL, v->shader->nir);
 580
 581    nir_shader *s = c->nir;
 582    const struct etna_specs *specs = c->specs;
 583
 584    v->stage = s->info.stage;
 585    v->num_loops = 0; /* TODO */
 586    v->vs_id_in_reg = -1;
 587    v->vs_pos_out_reg = -1;
 588    v->vs_pointsize_out_reg = -1;
 589    v->ps_color_out_reg = 0; /* 0 for shader that doesn't write fragcolor.. */
 590    v->ps_depth_out_reg = -1;
 591
 592    /* setup input linking */
 593    struct etna_shader_io_file *sf = &v->infile;
 594    if (s->info.stage == MESA_SHADER_VERTEX) {
 595       nir_foreach_variable(var, &s->inputs) {
 596          unsigned idx = var->data.driver_location;
 597          sf->reg[idx].reg = idx;
 598          sf->reg[idx].slot = var->data.location;
 599          sf->reg[idx].num_components = 4; /* TODO */
 600          sf->num_reg = MAX2(sf->num_reg, idx+1);
 601       }
 602    } else {
 603       unsigned count = 0;
 604       nir_foreach_variable(var, &s->inputs) {
 605          unsigned idx = var->data.driver_location;
 606          sf->reg[idx].reg = idx + 1;
 607          sf->reg[idx].slot = var->data.location;
 608          sf->reg[idx].num_components = 4; /* TODO */
 609          sf->num_reg = MAX2(sf->num_reg, idx+1);
 610          count++;
 611       }
 612       assert(sf->num_reg == count);
 613    }
 614
 615    NIR_PASS_V(s, nir_lower_io, nir_var_all, etna_glsl_type_size,
 616             (nir_lower_io_options)0);
 617
 618    OPT_V(s, nir_lower_regs_to_ssa);
 619    OPT_V(s, nir_lower_vars_to_ssa);
 620    OPT_V(s, nir_lower_indirect_derefs, nir_var_all);
 621    OPT_V(s, nir_lower_tex, &(struct nir_lower_tex_options) { .lower_txp = ~0u });
 622    OPT_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
 623
 624    etna_optimize_loop(s);
 625
 626    /* use opt_algebraic between int_to_float and boot_to_float because
 627     * int_to_float emits ftrunc, and ftrunc lowering generates bool ops
 628     */
 629    OPT_V(s, nir_lower_int_to_float);
 630    OPT_V(s, nir_opt_algebraic);
 631    OPT_V(s, nir_lower_bool_to_float);
 632
 633    /* after int to float because insert i2f for instance_id */
 634    OPT_V(s, etna_lower_io, v);
 635
 636    etna_optimize_loop(s);
 637
 638    if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
 639       nir_print_shader(s, stdout);
 640
 641    while( OPT(s, nir_opt_vectorize) );
 642    OPT_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
 643
 644    NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp);
 645    NIR_PASS_V(s, nir_opt_algebraic_late);
 646
 647    NIR_PASS_V(s, nir_move_vec_src_uses_to_dest);
 648    NIR_PASS_V(s, nir_copy_prop);
 649    NIR_PASS_V(s, nir_lower_to_source_mods, ~nir_lower_int_source_mods);
 650    /* need copy prop after uses_to_dest, and before src mods: see
 651     * dEQP-GLES2.functional.shaders.random.all_features.fragment.95
 652     */
 653
 654    NIR_PASS_V(s, nir_opt_dce);
 655
 656    NIR_PASS_V(s, etna_lower_alu, c);
 657
 658    if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
 659       nir_print_shader(s, stdout);
 660
 661    uint64_t consts[ETNA_MAX_IMM] = {};
 662
 663    unsigned block_ptr[nir_shader_get_entrypoint(s)->num_blocks];
 664    c->block_ptr = block_ptr;
 665    struct emit_options options = {
 666       .max_temps = ETNA_MAX_TEMPS,
 667       .max_consts = ETNA_MAX_IMM / 4,
 668       .id_reg = sf->num_reg,
 669       .single_const_src = c->specs->halti < 5,
 670       .etna_new_transcendentals = c->specs->has_new_transcendentals,
 671       .user = c,
 672       .consts = consts,
 673    };
 674
 675    unsigned num_consts;
 676    ASSERTED bool ok = emit_shader(c->nir, &options, &v->num_temps, &num_consts);
 677    assert(ok);
 678
 679    /* empty shader, emit NOP */
 680    if (!c->inst_ptr)
 681       emit_inst(c, &(struct etna_inst) { .opcode = INST_OPCODE_NOP });
 682
 683    /* assemble instructions, fixing up labels */
 684    uint32_t *code = MALLOC(c->inst_ptr * 16 + 1024);
 685    for (unsigned i = 0; i < c->inst_ptr; i++) {
 686       struct etna_inst *inst = &c->code[i];
 687       if (inst->opcode == INST_OPCODE_BRANCH)
 688          inst->imm = block_ptr[inst->imm];
 689
 690       inst->halti5 = specs->halti >= 5;
 691       etna_assemble(&code[i * 4], inst);
 692    }
 693
 694    v->code_size = c->inst_ptr * 4;
 695    v->code = code;
 696    v->needs_icache = c->inst_ptr > specs->max_instructions;
 697
 698    copy_uniform_state_to_shader(v, consts, num_consts);
 699
 700    if (s->info.stage == MESA_SHADER_FRAGMENT) {
 701       v->input_count_unk8 = 31; /* XXX what is this */
 702
 703       nir_foreach_variable(var, &s->outputs) {
 704          unsigned reg = c->output_reg[var->data.driver_location];
 705          switch (var->data.location) {
 706          case FRAG_RESULT_COLOR:
 707          case FRAG_RESULT_DATA0: /* DATA0 is used by gallium shaders for color */
 708             v->ps_color_out_reg = reg;
 709             break;
 710          case FRAG_RESULT_DEPTH:
 711             v->ps_depth_out_reg = reg;
 712             break;
 713          default:
 714             compile_error(c, "Unsupported fs output %s\n", gl_frag_result_name(var->data.location));
 715          }
 716       }
 717       assert(v->ps_depth_out_reg <= 0);
 718       v->outfile.num_reg = 0;
 719       ralloc_free(c->nir);
 720       FREE(c);
 721       return true;
 722    }
 723
 724    v->input_count_unk8 = DIV_ROUND_UP(v->infile.num_reg + 4, 16); /* XXX what is this */
 725
 726    sf = &v->outfile;
 727    sf->num_reg = 0;
 728    nir_foreach_variable(var, &s->outputs) {
 729       unsigned native = c->output_reg[var->data.driver_location];
 730
 731       if (var->data.location == VARYING_SLOT_POS) {
 732          v->vs_pos_out_reg = native;
 733          continue;
 734       }
 735
 736       if (var->data.location == VARYING_SLOT_PSIZ) {
 737          v->vs_pointsize_out_reg = native;
 738          continue;
 739       }
 740
 741       sf->reg[sf->num_reg].reg = native;
 742       sf->reg[sf->num_reg].slot = var->data.location;
 743       sf->reg[sf->num_reg].num_components = 4; /* TODO */
 744       sf->num_reg++;
 745    }
 746
 747    /* fill in "mystery meat" load balancing value. This value determines how
 748     * work is scheduled between VS and PS
 749     * in the unified shader architecture. More precisely, it is determined from
 750     * the number of VS outputs, as well as chip-specific
 751     * vertex output buffer size, vertex cache size, and the number of shader
 752     * cores.
 753     *
 754     * XXX this is a conservative estimate, the "optimal" value is only known for
 755     * sure at link time because some
 756     * outputs may be unused and thus unmapped. Then again, in the general use
 757     * case with GLSL the vertex and fragment
 758     * shaders are linked already before submitting to Gallium, thus all outputs
 759     * are used.
 760     *
 761     * note: TGSI compiler counts all outputs (including position and pointsize), here
 762     * v->outfile.num_reg only counts varyings, +1 to compensate for the position output
 763     * TODO: might have a problem that we don't count pointsize when it is used
 764     */
 765
 766    int half_out = v->outfile.num_reg / 2 + 1;
 767    assert(half_out);
 768
 769    uint32_t b = ((20480 / (specs->vertex_output_buffer_size -
 770                            2 * half_out * specs->vertex_cache_size)) +
 771                  9) /
 772                 10;
 773    uint32_t a = (b + 256 / (specs->shader_core_count * half_out)) / 2;
 774    v->vs_load_balancing = VIVS_VS_LOAD_BALANCING_A(MIN2(a, 255)) |
 775                              VIVS_VS_LOAD_BALANCING_B(MIN2(b, 255)) |
 776                              VIVS_VS_LOAD_BALANCING_C(0x3f) |
 777                              VIVS_VS_LOAD_BALANCING_D(0x0f);
 778
 779    ralloc_free(c->nir);
 780    FREE(c);
 781    return true;
 782 }
 783
 784 void
 785 etna_destroy_shader_nir(struct etna_shader_variant *shader)
 786 {
 787    assert(shader);
 788
 789    FREE(shader->code);
 790    FREE(shader->uniforms.imm_data);
 791    FREE(shader->uniforms.imm_contents);
 792    FREE(shader);
 793 }
 794
 795 extern const char *tgsi_swizzle_names[];
 796 void
 797 etna_dump_shader_nir(const struct etna_shader_variant *shader)
 798 {
 799    if (shader->stage == MESA_SHADER_VERTEX)
 800       printf("VERT\n");
 801    else
 802       printf("FRAG\n");
 803
 804    etna_disasm(shader->code, shader->code_size, PRINT_RAW);
 805
 806    printf("num loops: %i\n", shader->num_loops);
 807    printf("num temps: %i\n", shader->num_temps);
 808    printf("immediates:\n");
 809    for (int idx = 0; idx < shader->uniforms.imm_count; ++idx) {
 810       printf(" [%i].%s = %f (0x%08x) (%d)\n",
 811              idx / 4,
 812              tgsi_swizzle_names[idx % 4],
 813              *((float *)&shader->uniforms.imm_data[idx]),
 814              shader->uniforms.imm_data[idx],
 815              shader->uniforms.imm_contents[idx]);
 816    }
 817    printf("inputs:\n");
 818    for (int idx = 0; idx < shader->infile.num_reg; ++idx) {
 819       printf(" [%i] name=%s comps=%i\n", shader->infile.reg[idx].reg,
 820                (shader->stage == MESA_SHADER_VERTEX) ?
 821                gl_vert_attrib_name(shader->infile.reg[idx].slot) :
 822                gl_varying_slot_name(shader->infile.reg[idx].slot),
 823                shader->infile.reg[idx].num_components);
 824    }
 825    printf("outputs:\n");
 826    for (int idx = 0; idx < shader->outfile.num_reg; ++idx) {
 827       printf(" [%i] name=%s comps=%i\n", shader->outfile.reg[idx].reg,
 828                (shader->stage == MESA_SHADER_VERTEX) ?
 829                gl_varying_slot_name(shader->outfile.reg[idx].slot) :
 830                gl_frag_result_name(shader->outfile.reg[idx].slot),
 831                shader->outfile.reg[idx].num_components);
 832    }
 833    printf("special:\n");
 834    if (shader->stage == MESA_SHADER_VERTEX) {
 835       printf("  vs_pos_out_reg=%i\n", shader->vs_pos_out_reg);
 836       printf("  vs_pointsize_out_reg=%i\n", shader->vs_pointsize_out_reg);
 837       printf("  vs_load_balancing=0x%08x\n", shader->vs_load_balancing);
 838    } else {
 839       printf("  ps_color_out_reg=%i\n", shader->ps_color_out_reg);
 840       printf("  ps_depth_out_reg=%i\n", shader->ps_depth_out_reg);
 841    }
 842    printf("  input_count_unk8=0x%08x\n", shader->input_count_unk8);
 843 }
 844
 845 static const struct etna_shader_inout *
 846 etna_shader_vs_lookup(const struct etna_shader_variant *sobj,
 847                       const struct etna_shader_inout *in)
 848 {
 849    for (int i = 0; i < sobj->outfile.num_reg; i++)
 850       if (sobj->outfile.reg[i].slot == in->slot)
 851          return &sobj->outfile.reg[i];
 852
 853    return NULL;
 854 }
 855
 856 bool
 857 etna_link_shader_nir(struct etna_shader_link_info *info,
 858                      const struct etna_shader_variant *vs,
 859                      const struct etna_shader_variant *fs)
 860 {
 861    int comp_ofs = 0;
 862    /* For each fragment input we need to find the associated vertex shader
 863     * output, which can be found by matching on semantic name and index. A
 864     * binary search could be used because the vs outputs are sorted by their
 865     * semantic index and grouped by semantic type by fill_in_vs_outputs.
 866     */
 867    assert(fs->infile.num_reg < ETNA_NUM_INPUTS);
 868    info->pcoord_varying_comp_ofs = -1;
 869
 870    for (int idx = 0; idx < fs->infile.num_reg; ++idx) {
 871       const struct etna_shader_inout *fsio = &fs->infile.reg[idx];
 872       const struct etna_shader_inout *vsio = etna_shader_vs_lookup(vs, fsio);
 873       struct etna_varying *varying;
 874       bool interpolate_always = true;
 875
 876       assert(fsio->reg > 0 && fsio->reg <= ARRAY_SIZE(info->varyings));
 877
 878       if (fsio->reg > info->num_varyings)
 879          info->num_varyings = fsio->reg;
 880
 881       varying = &info->varyings[fsio->reg - 1];
 882       varying->num_components = fsio->num_components;
 883
 884       if (!interpolate_always) /* colors affected by flat shading */
 885          varying->pa_attributes = 0x200;
 886       else /* texture coord or other bypasses flat shading */
 887          varying->pa_attributes = 0x2f1;
 888
 889       varying->use[0] = VARYING_COMPONENT_USE_UNUSED;
 890       varying->use[1] = VARYING_COMPONENT_USE_UNUSED;
 891       varying->use[2] = VARYING_COMPONENT_USE_UNUSED;
 892       varying->use[3] = VARYING_COMPONENT_USE_UNUSED;
 893
 894       /* point coord is an input to the PS without matching VS output,
 895        * so it gets a varying slot without being assigned a VS register.
 896        */
 897       if (fsio->slot == VARYING_SLOT_PNTC) {
 898          varying->use[0] = VARYING_COMPONENT_USE_POINTCOORD_X;
 899          varying->use[1] = VARYING_COMPONENT_USE_POINTCOORD_Y;
 900
 901          info->pcoord_varying_comp_ofs = comp_ofs;
 902       } else {
 903          if (vsio == NULL) { /* not found -- link error */
 904             BUG("Semantic value not found in vertex shader outputs\n");
 905             return true;
 906          }
 907          varying->reg = vsio->reg;
 908       }
 909
 910       comp_ofs += varying->num_components;
 911    }
 912
 913    assert(info->num_varyings == fs->infile.num_reg);
 914
 915    return false;
 916 }