nir: Drop imov/fmov in favor of one mov instruction
[mesa.git] src/gallium/drivers/freedreno/a2xx/ir2_nir.c
1 /*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Jonathan Marek <jonathan@marek.ca>
25 */
26
27 #include "ir2_private.h"
28
29 #include "freedreno_util.h"
30 #include "fd2_program.h"
31
32 static const nir_shader_compiler_options options = {
33 .lower_fpow = true,
34 .lower_flrp32 = true,
35 .lower_fmod32 = true,
36 .lower_fdiv = true,
37 .lower_fceil = true,
38 .fuse_ffma = true,
39 /* .fdot_replicates = true: fdot really is replicated, but enabling this makes things worse */
40 .lower_all_io_to_temps = true,
41 .vertex_id_zero_based = true, /* it's not implemented anyway */
42 };
43
44 const nir_shader_compiler_options *
45 ir2_get_compiler_options(void)
46 {
47 return &options;
48 }
49
50 #define OPT(nir, pass, ...) ({ \
51 bool this_progress = false; \
52 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
53 this_progress; \
54 })
55 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
56
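/* run the standard NIR optimization passes repeatedly until none of them makes further progress */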
57 static void
58 ir2_optimize_loop(nir_shader *s)
59 {
60 bool progress;
61 do {
62 progress = false;
63
64 OPT_V(s, nir_lower_vars_to_ssa);
65 progress |= OPT(s, nir_opt_copy_prop_vars);
66 progress |= OPT(s, nir_copy_prop);
67 progress |= OPT(s, nir_opt_dce);
68 progress |= OPT(s, nir_opt_cse);
69 /* progress |= OPT(s, nir_opt_gcm, true); */
70 progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
71 progress |= OPT(s, nir_opt_intrinsics);
72 progress |= OPT(s, nir_opt_algebraic);
73 progress |= OPT(s, nir_opt_constant_folding);
74 progress |= OPT(s, nir_opt_dead_cf);
75 if (OPT(s, nir_opt_trivial_continues)) {
76 progress |= true;
77 /* If nir_opt_trivial_continues makes progress, then we need to clean
78 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
79 * to make progress.
80 */
81 OPT(s, nir_copy_prop);
82 OPT(s, nir_opt_dce);
83 }
84 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
85 progress |= OPT(s, nir_opt_if, false);
86 progress |= OPT(s, nir_opt_remove_phis);
87 progress |= OPT(s, nir_opt_undef);
88
89 }
90 while (progress);
91 }
92
93 /* the trig workarounds are the same as ir3's, but we don't want to include ir3 */
94 bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
95
96 int
97 ir2_optimize_nir(nir_shader *s, bool lower)
98 {
99 struct nir_lower_tex_options tex_options = {
100 .lower_txp = ~0u,
101 .lower_rect = 0,
102 };
103
104 if (fd_mesa_debug & FD_DBG_DISASM) {
105 debug_printf("----------------------\n");
106 nir_print_shader(s, stdout);
107 debug_printf("----------------------\n");
108 }
109
110 OPT_V(s, nir_lower_regs_to_ssa);
111 OPT_V(s, nir_lower_vars_to_ssa);
112 OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);
113
114 if (lower) {
115 OPT_V(s, ir3_nir_apply_trig_workarounds);
116 OPT_V(s, nir_lower_tex, &tex_options);
117 }
118
119 ir2_optimize_loop(s);
120
121 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
122 OPT_V(s, nir_move_load_const);
123
124 /* TODO: we don't want to get shaders writing to depth for depth textures */
125 if (s->info.stage == MESA_SHADER_FRAGMENT) {
126 nir_foreach_variable(var, &s->outputs) {
127 if (var->data.location == FRAG_RESULT_DEPTH)
128 return -1;
129 }
130 }
131
132 return 0;
133 }
134
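/* get a constant src: try to pack the value into an existing immediate
 * (extending it when there is room), otherwise allocate a new one, and
 * return a swizzled IR2_SRC_CONST reference to it
 */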
135 static struct ir2_src
136 load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
137 {
138 struct fd2_shader_stateobj *so = ctx->so;
139 unsigned imm_ncomp, swiz, idx, i, j;
140 uint32_t *value = (uint32_t*) value_f;
141
142 /* try to merge with existing immediate (TODO: try with neg) */
143 for (idx = 0; idx < so->num_immediates; idx++) {
144 swiz = 0;
145 imm_ncomp = so->immediates[idx].ncomp;
146 for (i = 0; i < ncomp; i++) {
147 for (j = 0; j < imm_ncomp; j++) {
148 if (value[i] == so->immediates[idx].val[j])
149 break;
150 }
151 if (j == imm_ncomp) {
152 if (j == 4)
153 break;
154 so->immediates[idx].val[imm_ncomp++] = value[i];
155 }
156 swiz |= swiz_set(j, i);
157 }
158 /* matched all components */
159 if (i == ncomp)
160 break;
161 }
162
163 /* need to allocate new immediate */
164 if (idx == so->num_immediates) {
165 swiz = 0;
166 imm_ncomp = 0;
167 for (i = 0; i < ncomp; i++) {
168 for (j = 0; j < imm_ncomp; j++) {
169 if (value[i] == so->immediates[idx].val[j])
170 break;
171 }
172 if (j == imm_ncomp) {
173 so->immediates[idx].val[imm_ncomp++] = value[i];
174 }
175 swiz |= swiz_set(j, i);
176 }
177 so->num_immediates++;
178 }
179 so->immediates[idx].ncomp = imm_ncomp;
180
181 if (ncomp == 1)
182 swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
183
184 return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
185 }
186
187 struct ir2_src
188 ir2_zero(struct ir2_context *ctx)
189 {
190 return load_const(ctx, (float[]) {0.0f}, 1);
191 }
192
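/* update a reg's live range: record the loop depth of the first write and
 * the block index after which the value can be freed
 */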
193 static void
194 update_range(struct ir2_context *ctx, struct ir2_reg *reg)
195 {
196 if (!reg->initialized) {
197 reg->initialized = true;
198 reg->loop_depth = ctx->loop_depth;
199 }
200
201 if (ctx->loop_depth > reg->loop_depth) {
202 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
203 } else {
204 reg->loop_depth = ctx->loop_depth;
205 reg->block_idx_free = -1;
206 }
207
208 /* for regs we want to free at the end of the loop in any case
209 * XXX don't do this for ssa
210 */
211 if (reg->loop_depth)
212 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
213 }
214
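/* convert a nir_src to an ir2_src: constants become immediates, SSA values
 * are looked up in ssa_map, and NIR registers map to ir2 registers
 */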
215 static struct ir2_src
216 make_src(struct ir2_context *ctx, nir_src src)
217 {
218 struct ir2_src res = {};
219 struct ir2_reg *reg;
220
221 nir_const_value *const_value = nir_src_as_const_value(src);
222
223 if (const_value) {
224 assert(src.is_ssa);
225 float c[src.ssa->num_components];
226 nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
227 return load_const(ctx, c, src.ssa->num_components);
228 }
229
230 if (!src.is_ssa) {
231 res.num = src.reg.reg->index;
232 res.type = IR2_SRC_REG;
233 reg = &ctx->reg[res.num];
234 } else {
235 assert(ctx->ssa_map[src.ssa->index] >= 0);
236 res.num = ctx->ssa_map[src.ssa->index];
237 res.type = IR2_SRC_SSA;
238 reg = &ctx->instr[res.num].ssa;
239 }
240
241 update_range(ctx, reg);
242 return res;
243 }
244
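/* associate an instruction with its NIR dest: SSA defs map to the emitting
 * instruction through ssa_map, register dests share the ir2_reg
 */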
245 static void
246 set_index(struct ir2_context *ctx, nir_dest * dst,
247 struct ir2_instr *instr)
248 {
249 struct ir2_reg *reg = &instr->ssa;
250
251 if (dst->is_ssa) {
252 ctx->ssa_map[dst->ssa.index] = instr->idx;
253 } else {
254 assert(instr->is_ssa);
255 reg = &ctx->reg[dst->reg.reg->index];
256
257 instr->is_ssa = false;
258 instr->reg = reg;
259 }
260 update_range(ctx, reg);
261 }
262
263 static struct ir2_instr *
264 ir2_instr_create(struct ir2_context *ctx, int type)
265 {
266 struct ir2_instr *instr;
267
268 instr = &ctx->instr[ctx->instr_count++];
269 instr->idx = ctx->instr_count - 1;
270 instr->type = type;
271 instr->block_idx = ctx->block_idx;
272 instr->pred = ctx->pred;
273 instr->is_ssa = true;
274 return instr;
275 }
276
277 static struct ir2_instr *
278 instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
279 {
280 /* emit_alu will fix up instrs that don't map directly */
281 static const struct ir2_opc {
282 int8_t scalar, vector;
283 } nir_ir2_opc[nir_num_opcodes+1] = {
284 [0 ... nir_num_opcodes - 1] = {-1, -1},
285
286 [nir_op_mov] = {MAXs, MAXv},
287 [nir_op_fsign] = {-1, CNDGTEv},
288 [nir_op_fnot] = {SETEs, SETEv},
289 [nir_op_for] = {MAXs, MAXv},
290 [nir_op_fand] = {MINs, MINv},
291 [nir_op_fxor] = {-1, SETNEv},
292 [nir_op_fadd] = {ADDs, ADDv},
293 [nir_op_fsub] = {ADDs, ADDv},
294 [nir_op_fmul] = {MULs, MULv},
295 [nir_op_ffma] = {-1, MULADDv},
296 [nir_op_fmax] = {MAXs, MAXv},
297 [nir_op_fmin] = {MINs, MINv},
298 [nir_op_ffloor] = {FLOORs, FLOORv},
299 [nir_op_ffract] = {FRACs, FRACv},
300 [nir_op_ftrunc] = {TRUNCs, TRUNCv},
301 [nir_op_fdot2] = {-1, DOT2ADDv},
302 [nir_op_fdot3] = {-1, DOT3v},
303 [nir_op_fdot4] = {-1, DOT4v},
304 [nir_op_sge] = {-1, SETGTEv},
305 [nir_op_slt] = {-1, SETGTv},
306 [nir_op_sne] = {-1, SETNEv},
307 [nir_op_seq] = {-1, SETEv},
308 [nir_op_fcsel] = {-1, CNDEv},
309 [nir_op_frsq] = {RECIPSQ_IEEE, -1},
310 [nir_op_frcp] = {RECIP_IEEE, -1},
311 [nir_op_flog2] = {LOG_IEEE, -1},
312 [nir_op_fexp2] = {EXP_IEEE, -1},
313 [nir_op_fsqrt] = {SQRT_IEEE, -1},
314 [nir_op_fcos] = {COS, -1},
315 [nir_op_fsin] = {SIN, -1},
316 /* no fsat, fneg, fabs since source mods deal with those */
317
318 /* so we can use this function with a non-NIR op */
319 #define ir2_op_cube nir_num_opcodes
320 [ir2_op_cube] = {-1, CUBEv},
321 };
322
323 struct ir2_opc op = nir_ir2_opc[opcode];
324 assert(op.vector >= 0 || op.scalar >= 0);
325
326 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
327 instr->alu.vector_opc = op.vector;
328 instr->alu.scalar_opc = op.scalar;
329 instr->alu.export = -1;
330 instr->alu.write_mask = (1 << ncomp) - 1;
331 instr->src_count = opcode == ir2_op_cube ? 2 :
332 nir_op_infos[opcode].num_inputs;
333 instr->ssa.ncomp = ncomp;
334 return instr;
335 }
336
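/* create an ALU instruction that writes to a register (a new one, or one
 * shared with share_reg) with an explicit write mask, instead of producing
 * an SSA value
 */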
337 static struct ir2_instr *
338 instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
339 uint8_t write_mask, struct ir2_instr *share_reg)
340 {
341 struct ir2_instr *instr;
342 struct ir2_reg *reg;
343
344 reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
345 reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
346
347 instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
348 instr->alu.write_mask = write_mask;
349 instr->reg = reg;
350 instr->is_ssa = false;
351 return instr;
352 }
353
354
355 static struct ir2_instr *
356 instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
357 {
358 struct ir2_instr *instr;
359 instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
360 set_index(ctx, dst, instr);
361 return instr;
362 }
363
364 static struct ir2_instr *
365 ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
366 instr_fetch_opc_t opc)
367 {
368 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
369 instr->fetch.opc = opc;
370 instr->src_count = 1;
371 instr->ssa.ncomp = nir_dest_num_components(*dst);
372 set_index(ctx, dst, instr);
373 return instr;
374 }
375
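/* like make_src, but emits a mov when the source is a constant, so the
 * result is never an IR2_SRC_CONST (used for texture fetch sources)
 */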
376 static struct ir2_src
377 make_src_noconst(struct ir2_context *ctx, nir_src src)
378 {
379 struct ir2_instr *instr;
380
381 if (nir_src_as_const_value(src)) {
382 assert(src.is_ssa);
383 instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
384 instr->src[0] = make_src(ctx, src);
385 return ir2_src(instr->idx, 0, IR2_SRC_SSA);
386 }
387
388 return make_src(ctx, src);
389 }
390
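/* translate a nir_alu_instr: instr_create_alu maps the opcode, each source
 * swizzle is compressed against the write mask, and the switch below fixes
 * up ops that don't map 1:1 to a2xx instructions
 */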
391 static void
392 emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
393 {
394 const nir_op_info *info = &nir_op_infos[alu->op];
395 nir_dest *dst = &alu->dest.dest;
396 struct ir2_instr *instr;
397 struct ir2_src tmp;
398 unsigned ncomp;
399
400 /* get the number of dst components */
401 if (dst->is_ssa) {
402 ncomp = dst->ssa.num_components;
403 } else {
404 ncomp = 0;
405 for (int i = 0; i < 4; i++)
406 ncomp += !!(alu->dest.write_mask & 1 << i);
407 }
408
409 instr = instr_create_alu(ctx, alu->op, ncomp);
410 set_index(ctx, dst, instr);
411 instr->alu.saturate = alu->dest.saturate;
412 instr->alu.write_mask = alu->dest.write_mask;
413
414 for (int i = 0; i < info->num_inputs; i++) {
415 nir_alu_src *src = &alu->src[i];
416
417 /* compress swizzle with writemask when applicable */
418 unsigned swiz = 0, j = 0;
419 for (int i = 0; i < 4; i++) {
420 if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
421 continue;
422 swiz |= swiz_set(src->swizzle[i], j++);
423 }
424
425 instr->src[i] = make_src(ctx, src->src);
426 instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
427 instr->src[i].negate = src->negate;
428 instr->src[i].abs = src->abs;
429 }
430
431 /* workarounds for NIR ops that don't map directly to a2xx ops */
432 switch (alu->op) {
433 case nir_op_slt:
434 tmp = instr->src[0];
435 instr->src[0] = instr->src[1];
436 instr->src[1] = tmp;
437 break;
438 case nir_op_fcsel:
439 tmp = instr->src[1];
440 instr->src[1] = instr->src[2];
441 instr->src[2] = tmp;
442 break;
443 case nir_op_fsub:
444 instr->src[1].negate = !instr->src[1].negate;
445 break;
446 case nir_op_fdot2:
447 instr->src_count = 3;
448 instr->src[2] = ir2_zero(ctx);
449 break;
450 case nir_op_fsign: {
451 /* we need an extra instruction to deal with the zero case */
452 struct ir2_instr *tmp;
453
454 /* tmp = x == 0 ? 0 : 1 */
455 tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
456 tmp->src[0] = instr->src[0];
457 tmp->src[1] = ir2_zero(ctx);
458 tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
459
460 /* result = x >= 0 ? tmp : -tmp */
461 instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
462 instr->src[2] = instr->src[1];
463 instr->src[2].negate = true;
464 instr->src_count = 3;
465 } break;
466 default:
467 break;
468 }
469 }
470
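/* load a shader input: vertex shaders use a vertex fetch instruction;
 * fragment shaders special-case point coord and fragcoord, and otherwise
 * just mov from the input register
 */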
471 static void
472 load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
473 {
474 struct ir2_instr *instr;
475 int slot = -1;
476
477 if (ctx->so->type == MESA_SHADER_VERTEX) {
478 instr = ir2_instr_create_fetch(ctx, dst, 0);
479 instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
480 instr->fetch.vtx.const_idx = 20 + (idx / 3);
481 instr->fetch.vtx.const_idx_sel = idx % 3;
482 return;
483 }
484
485 /* get slot from idx */
486 nir_foreach_variable(var, &ctx->nir->inputs) {
487 if (var->data.driver_location == idx) {
488 slot = var->data.location;
489 break;
490 }
491 }
492 assert(slot >= 0);
493
494 switch (slot) {
495 case VARYING_SLOT_PNTC:
496 /* need to extract with abs and invert y */
497 instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
498 instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
499 instr->src[0].abs = true;
500 instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
501 instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
502 break;
503 case VARYING_SLOT_POS:
504 /* need to extract xy with abs and add tile offset on a20x
505 * zw from fragcoord input (w inverted in fragment shader)
506 * TODO: only components that are required by fragment shader
507 */
508 instr = instr_create_alu_reg(ctx,
509 ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
510 instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
511 instr->src[0].abs = true;
512 /* on a20x, C64 contains the tile offset */
513 instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
514
515 instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
516 instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);
517
518 instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
519 instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
520
521 unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
522 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
523 instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
524 break;
525 default:
526 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
527 instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
528 break;
529 }
530 }
531
532 static unsigned
533 output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
534 {
535 int slot = -1;
536 unsigned idx = nir_intrinsic_base(intr);
537 nir_foreach_variable(var, &ctx->nir->outputs) {
538 if (var->data.driver_location == idx) {
539 slot = var->data.location;
540 break;
541 }
542 }
543 assert(slot != -1);
544 return slot;
545 }
546
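/* emit the export for an output: position/point size for vertex shaders
 * (plus varyings matched against the fragment shader inputs), color for
 * fragment shaders; other fragment outputs are ignored
 */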
547 static void
548 store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
549 {
550 struct ir2_instr *instr;
551 unsigned idx = 0;
552
553 if (ctx->so->type == MESA_SHADER_VERTEX) {
554 switch (slot) {
555 case VARYING_SLOT_POS:
556 ctx->position = make_src(ctx, src);
557 idx = 62;
558 break;
559 case VARYING_SLOT_PSIZ:
560 ctx->so->writes_psize = true;
561 idx = 63;
562 break;
563 default:
564 /* find matching slot from fragment shader input */
565 for (idx = 0; idx < ctx->f->inputs_count; idx++)
566 if (ctx->f->inputs[idx].slot == slot)
567 break;
568 if (idx == ctx->f->inputs_count)
569 return;
570 }
571 } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
572 /* only color output is implemented */
573 return;
574 }
575
576 instr = instr_create_alu(ctx, nir_op_mov, ncomp);
577 instr->src[0] = make_src(ctx, src);
578 instr->alu.export = idx;
579 }
580
581 static void
582 emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
583 {
584 struct ir2_instr *instr;
585 nir_const_value *const_offset;
586 unsigned idx;
587
588 switch (intr->intrinsic) {
589 case nir_intrinsic_load_input:
590 load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
591 break;
592 case nir_intrinsic_store_output:
593 store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
594 break;
595 case nir_intrinsic_load_uniform:
596 const_offset = nir_src_as_const_value(intr->src[0]);
597 assert(const_offset); /* TODO can be false in ES2? */
598 idx = nir_intrinsic_base(intr);
599 idx += (uint32_t) nir_src_as_const_value(intr->src[0])[0].f32;
600 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
601 instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
602 break;
603 case nir_intrinsic_discard:
604 case nir_intrinsic_discard_if:
605 instr = ir2_instr_create(ctx, IR2_ALU);
606 instr->alu.vector_opc = VECTOR_NONE;
607 if (intr->intrinsic == nir_intrinsic_discard_if) {
608 instr->alu.scalar_opc = KILLNEs;
609 instr->src[0] = make_src(ctx, intr->src[0]);
610 } else {
611 instr->alu.scalar_opc = KILLEs;
612 instr->src[0] = ir2_zero(ctx);
613 }
614 instr->alu.export = -1;
615 instr->src_count = 1;
616 ctx->so->has_kill = true;
617 break;
618 case nir_intrinsic_load_front_face:
619 /* gl_FrontFacing is in the sign of param.x
620 * rcp is required because otherwise we can't differentiate -0.0 and +0.0
621 */
622 ctx->so->need_param = true;
623
624 struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
625 tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
626
627 instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
628 instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
629 instr->src[1] = ir2_zero(ctx);
630 break;
631 default:
632 compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
633 break;
634 }
635 }
636
637 static void
638 emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
639 {
640 bool is_rect = false, is_cube = false;
641 struct ir2_instr *instr;
642 nir_src *coord, *lod_bias;
643
644 coord = lod_bias = NULL;
645
646 for (unsigned i = 0; i < tex->num_srcs; i++) {
647 switch (tex->src[i].src_type) {
648 case nir_tex_src_coord:
649 coord = &tex->src[i].src;
650 break;
651 case nir_tex_src_bias:
652 case nir_tex_src_lod:
653 assert(!lod_bias);
654 lod_bias = &tex->src[i].src;
655 break;
656 default:
657 compile_error(ctx, "Unhandled NIR tex src type: %d\n",
658 tex->src[i].src_type);
659 return;
660 }
661 }
662
663 switch (tex->op) {
664 case nir_texop_tex:
665 case nir_texop_txb:
666 case nir_texop_txl:
667 break;
668 default:
669 compile_error(ctx, "unimplemented texop %d\n", tex->op);
670 return;
671 }
672
673 switch (tex->sampler_dim) {
674 case GLSL_SAMPLER_DIM_2D:
675 break;
676 case GLSL_SAMPLER_DIM_RECT:
677 is_rect = true;
678 break;
679 case GLSL_SAMPLER_DIM_CUBE:
680 is_cube = true;
681 break;
682 default:
683 compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
684 return;
685 }
686
687 struct ir2_src src_coord = make_src_noconst(ctx, *coord);
688
689 /* for cube maps
690 * tmp = cube(coord)
691 * tmp.xy = tmp.xy / |tmp.z| + 1.5
692 * coord = tmp.xyw
693 */
694 if (is_cube) {
695 struct ir2_instr *rcp, *coord_xy;
696 unsigned reg_idx;
697
698 instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
699 instr->src[0] = src_coord;
700 instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
701 instr->src[1] = src_coord;
702 instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
703
704 reg_idx = instr->reg - ctx->reg; /* hacky */
705
706 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
707 rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
708 rcp->src[0].abs = true;
709
710 coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
711 coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
712 coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
713 coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);
714
715 src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
716 /* TODO: lod/bias transformed by src_coord.z ? */
717 }
718
719 instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
720 instr->src[0] = src_coord;
721 instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
722 instr->fetch.tex.is_cube = is_cube;
723 instr->fetch.tex.is_rect = is_rect;
724 instr->fetch.tex.samp_id = tex->sampler_index;
725
726 /* for lod/bias, we insert an extra src for the backend to deal with */
727 if (lod_bias) {
728 instr->src[1] = make_src_noconst(ctx, *lod_bias);
729 /* backend will use 2-3 components so apply swizzle */
730 swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
731 instr->src_count = 2;
732 }
733 }
734
735 static void
736 setup_input(struct ir2_context *ctx, nir_variable * in)
737 {
738 struct fd2_shader_stateobj *so = ctx->so;
739 unsigned array_len = MAX2(glsl_get_length(in->type), 1);
740 unsigned n = in->data.driver_location;
741 unsigned slot = in->data.location;
742
743 assert(array_len == 1);
744
745 /* vertex shader inputs are handled later, in load_input */
746 if (ctx->so->type == MESA_SHADER_VERTEX)
747 return;
748
749 if (ctx->so->type != MESA_SHADER_FRAGMENT)
750 compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
751
752 if (slot == VARYING_SLOT_PNTC) {
753 so->need_param = true;
754 return;
755 }
756
757 n = ctx->f->inputs_count++;
758
759 /* half of fragcoord from param reg, half from a varying */
760 if (slot == VARYING_SLOT_POS) {
761 ctx->f->fragcoord = n;
762 so->need_param = true;
763 }
764
765 ctx->f->inputs[n].slot = slot;
766 ctx->f->inputs[n].ncomp = glsl_get_components(in->type);
767
768 /* in->data.interpolation?
769 * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
770 */
771 }
772
773 static void
774 emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
775 {
776 /* TODO we don't want to emit anything for undefs */
777
778 struct ir2_instr *instr;
779
780 instr = instr_create_alu_dest(ctx, nir_op_mov,
781 &(nir_dest) {.ssa = undef->def,.is_ssa = true});
782 instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
783 }
784
785 static void
786 emit_instr(struct ir2_context *ctx, nir_instr * instr)
787 {
788 switch (instr->type) {
789 case nir_instr_type_alu:
790 emit_alu(ctx, nir_instr_as_alu(instr));
791 break;
792 case nir_instr_type_deref:
793 /* ignored, handled as part of the intrinsic they are src to */
794 break;
795 case nir_instr_type_intrinsic:
796 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
797 break;
798 case nir_instr_type_load_const:
799 /* dealt with when using nir_src */
800 break;
801 case nir_instr_type_tex:
802 emit_tex(ctx, nir_instr_as_tex(instr));
803 break;
804 case nir_instr_type_jump:
805 ctx->block_has_jump[ctx->block_idx] = true;
806 break;
807 case nir_instr_type_ssa_undef:
808 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
809 break;
810 default:
811 break;
812 }
813 }
814
815 /* fragcoord.zw and a20x hw binning outputs */
816 static void
817 extra_position_exports(struct ir2_context *ctx, bool binning)
818 {
819 struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
820
821 if (ctx->f->fragcoord < 0 && !binning)
822 return;
823
824 instr = instr_create_alu(ctx, nir_op_fmax, 1);
825 instr->src[0] = ctx->position;
826 instr->src[0].swizzle = IR2_SWIZZLE_W;
827 instr->src[1] = ir2_zero(ctx);
828
829 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
830 rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
831
832 sc = instr_create_alu(ctx, nir_op_fmul, 4);
833 sc->src[0] = ctx->position;
834 sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
835
836 wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
837 wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
838 wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
839 wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
840
841 /* fragcoord z/w */
842 if (ctx->f->fragcoord >= 0 && !binning) {
843 instr = instr_create_alu(ctx, nir_op_mov, 1);
844 instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
845 instr->alu.export = ctx->f->fragcoord;
846
847 instr = instr_create_alu(ctx, nir_op_mov, 1);
848 instr->src[0] = ctx->position;
849 instr->src[0].swizzle = IR2_SWIZZLE_W;
850 instr->alu.export = ctx->f->fragcoord;
851 instr->alu.write_mask = 2;
852 }
853
854 if (!binning)
855 return;
856
857 off = instr_create_alu(ctx, nir_op_fadd, 1);
858 off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
859 off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
860
861 /* 8 is the max set in freedreno_screen; unneeded instrs are patched out */
862 for (int i = 0; i < 8; i++) {
863 instr = instr_create_alu(ctx, nir_op_ffma, 4);
864 instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
865 instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
866 instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
867 instr->alu.export = 32;
868
869 instr = instr_create_alu(ctx, nir_op_ffma, 4);
870 instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
871 instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
872 instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
873 instr->alu.export = 33;
874 }
875 }
876
877 static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
878
879 static bool
880 emit_block(struct ir2_context *ctx, nir_block * block)
881 {
882 struct ir2_instr *instr;
883 nir_block *succs = block->successors[0];
884
885 ctx->block_idx = block->index;
886
887 nir_foreach_instr(instr, block)
888 emit_instr(ctx, instr);
889
890 if (!succs || !succs->index)
891 return false;
892
893 /* ideally we would always jump and let the backend clean things up,
894 * but we don't, so there are two cases where a jump is needed:
895 * loops (successor index is lower than ours)
896 * jumps (a jump instruction was seen in this block)
897 */
898 if (succs->index > block->index && !ctx->block_has_jump[block->index])
899 return false;
900
901 assert(block->successors[1] == NULL);
902
903 instr = ir2_instr_create(ctx, IR2_CF);
904 instr->cf.block_idx = succs->index;
905 /* XXX can't jump to a block with different predicate */
906 return true;
907 }
908
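/* emit an if statement using the a2xx predicate: evaluate the condition
 * into a predicate (PRED_SETNEs, or PRED_SETNE_PUSHv when nested), emit the
 * then-list predicated, invert the predicate for the else-list, and pop it
 * again for nested ifs
 */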
909 static void
910 emit_if(struct ir2_context *ctx, nir_if * nif)
911 {
912 unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
913 struct ir2_instr *instr;
914
915 /* XXX: the blob seems to always use the same register for the condition */
916
917 instr = ir2_instr_create(ctx, IR2_ALU);
918 instr->src[0] = make_src(ctx, nif->condition);
919 instr->src_count = 1;
920 instr->ssa.ncomp = 1;
921 instr->alu.vector_opc = VECTOR_NONE;
922 instr->alu.scalar_opc = SCALAR_NONE;
923 instr->alu.export = -1;
924 instr->alu.write_mask = 1;
925 instr->pred = 0;
926
927 /* if nested, use PRED_SETNE_PUSHv */
928 if (pred) {
929 instr->alu.vector_opc = PRED_SETNE_PUSHv;
930 instr->src[1] = instr->src[0];
931 instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
932 instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
933 instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
934 instr->src_count = 2;
935 } else {
936 instr->alu.scalar_opc = PRED_SETNEs;
937 }
938
939 ctx->pred_idx = instr->idx;
940 ctx->pred = 3;
941
942 emit_cf_list(ctx, &nif->then_list);
943
944 /* TODO: if there is no else branch we don't need this,
945 * and if the else branch is simple we can just flip ctx->pred instead
946 */
947 instr = ir2_instr_create(ctx, IR2_ALU);
948 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
949 instr->src_count = 1;
950 instr->ssa.ncomp = 1;
951 instr->alu.vector_opc = VECTOR_NONE;
952 instr->alu.scalar_opc = PRED_SET_INVs;
953 instr->alu.export = -1;
954 instr->alu.write_mask = 1;
955 instr->pred = 0;
956 ctx->pred_idx = instr->idx;
957
958 emit_cf_list(ctx, &nif->else_list);
959
960 /* restore predicate for nested predicates */
961 if (pred) {
962 instr = ir2_instr_create(ctx, IR2_ALU);
963 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
964 instr->src_count = 1;
965 instr->ssa.ncomp = 1;
966 instr->alu.vector_opc = VECTOR_NONE;
967 instr->alu.scalar_opc = PRED_SET_POPs;
968 instr->alu.export = -1;
969 instr->alu.write_mask = 1;
970 instr->pred = 0;
971 ctx->pred_idx = instr->idx;
972 }
973
974 /* restore ctx->pred */
975 ctx->pred = pred;
976 }
977
978 /* get the highest block idx in the loop, so we know when
979 * we can free registers that are allocated outside the loop
980 */
981 static unsigned
982 loop_last_block(struct exec_list *list)
983 {
984 nir_cf_node *node =
985 exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
986 switch (node->type) {
987 case nir_cf_node_block:
988 return nir_cf_node_as_block(node)->index;
989 case nir_cf_node_if:
990 assert(0); /* XXX could this ever happen? */
991 return 0;
992 case nir_cf_node_loop:
993 return loop_last_block(&nir_cf_node_as_loop(node)->body);
994 default:
995 compile_error(ctx, "Not supported\n");
996 return 0;
997 }
998 }
999
1000 static void
1001 emit_loop(struct ir2_context *ctx, nir_loop *nloop)
1002 {
1003 ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
1004 emit_cf_list(ctx, &nloop->body);
1005 ctx->loop_depth--;
1006 }
1007
1008 static bool
1009 emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
1010 {
1011 bool ret = false;
1012 foreach_list_typed(nir_cf_node, node, node, list) {
1013 ret = false;
1014 switch (node->type) {
1015 case nir_cf_node_block:
1016 ret = emit_block(ctx, nir_cf_node_as_block(node));
1017 break;
1018 case nir_cf_node_if:
1019 emit_if(ctx, nir_cf_node_as_if(node));
1020 break;
1021 case nir_cf_node_loop:
1022 emit_loop(ctx, nir_cf_node_as_loop(node));
1023 break;
1024 case nir_cf_node_function:
1025 compile_error(ctx, "Not supported\n");
1026 break;
1027 }
1028 }
1029 return ret;
1030 }
1031
1032 static void cleanup_binning(struct ir2_context *ctx)
1033 {
1034 assert(ctx->so->type == MESA_SHADER_VERTEX);
1035
1036 /* kill non-position outputs for binning variant */
1037 nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
1038 nir_foreach_instr_safe(instr, block) {
1039 if (instr->type != nir_instr_type_intrinsic)
1040 continue;
1041
1042 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1043 if (intr->intrinsic != nir_intrinsic_store_output)
1044 continue;
1045
1046 if (output_slot(ctx, intr) != VARYING_SLOT_POS)
1047 nir_instr_remove(instr);
1048 }
1049 }
1050
1051 ir2_optimize_nir(ctx->nir, false);
1052 }
1053
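/* compile the NIR shader into ir2: clone and lower the shader (dropping
 * non-position outputs for the binning variant), set up inputs, emit the
 * instruction list, and add the extra position exports for vertex shaders
 */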
1054 void
1055 ir2_nir_compile(struct ir2_context *ctx, bool binning)
1056 {
1057 struct fd2_shader_stateobj *so = ctx->so;
1058
1059 memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
1060
1061 ctx->nir = nir_shader_clone(NULL, so->nir);
1062
1063 if (binning)
1064 cleanup_binning(ctx);
1065
1066 /* postprocess */
1067 OPT_V(ctx->nir, nir_opt_algebraic_late);
1068
1069 OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
1070 OPT_V(ctx->nir, nir_copy_prop);
1071 OPT_V(ctx->nir, nir_opt_dce);
1072 OPT_V(ctx->nir, nir_opt_move_comparisons);
1073
1074 OPT_V(ctx->nir, nir_lower_bool_to_float);
1075
1076 /* lower instructions that can only be scalar on a2xx to scalar form */
1077 OPT_V(ctx->nir, ir2_nir_lower_scalar);
1078
1079 OPT_V(ctx->nir, nir_lower_locals_to_regs);
1080
1081 OPT_V(ctx->nir, nir_convert_from_ssa, true);
1082
1083 OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
1084 OPT_V(ctx->nir, nir_lower_vec_to_movs);
1085
1086 OPT_V(ctx->nir, nir_opt_dce);
1087
1088 nir_sweep(ctx->nir);
1089
1090 if (fd_mesa_debug & FD_DBG_DISASM) {
1091 debug_printf("----------------------\n");
1092 nir_print_shader(ctx->nir, stdout);
1093 debug_printf("----------------------\n");
1094 }
1095
1096 /* fd2_shader_stateobj init */
1097 if (so->type == MESA_SHADER_FRAGMENT) {
1098 ctx->f->fragcoord = -1;
1099 ctx->f->inputs_count = 0;
1100 memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
1101 }
1102
1103 /* Setup inputs: */
1104 nir_foreach_variable(in, &ctx->nir->inputs)
1105 setup_input(ctx, in);
1106
1107 if (so->type == MESA_SHADER_FRAGMENT) {
1108 unsigned idx;
1109 for (idx = 0; idx < ctx->f->inputs_count; idx++) {
1110 ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
1111 update_range(ctx, &ctx->input[idx]);
1112 }
1113 /* assume we have a param input and kill it later if it turns out unneeded */
1114 ctx->input[idx].ncomp = 4;
1115 update_range(ctx, &ctx->input[idx]);
1116 } else {
1117 ctx->input[0].ncomp = 1;
1118 ctx->input[2].ncomp = 1;
1119 update_range(ctx, &ctx->input[0]);
1120 update_range(ctx, &ctx->input[2]);
1121 }
1122
1123 /* And emit the body: */
1124 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
1125
1126 nir_foreach_register(reg, &fxn->registers) {
1127 ctx->reg[reg->index].ncomp = reg->num_components;
1128 ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
1129 }
1130
1131 nir_metadata_require(fxn, nir_metadata_block_index);
1132 emit_cf_list(ctx, &fxn->body);
1133 /* TODO emit_block(ctx, fxn->end_block); */
1134
1135 if (so->type == MESA_SHADER_VERTEX)
1136 extra_position_exports(ctx, binning);
1137
1138 ralloc_free(ctx->nir);
1139
1140 /* kill unused param input */
1141 if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
1142 ctx->input[ctx->f->inputs_count].initialized = false;
1143 }