src/mesa/state_tracker/st_glsl_to_tgsi.cpp

   1 /*
   2  * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
   3  * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
   4  * Copyright © 2010 Intel Corporation
   5  * Copyright © 2011 Bryan Cain
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the next
  15  * paragraph) shall be included in all copies or substantial portions of the
  16  * Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24  * DEALINGS IN THE SOFTWARE.
  25  */
  26
  27 /**
 * \file st_glsl_to_tgsi.cpp
  29  *
  30  * Translate GLSL IR to TGSI.
  31  */
  32
  33 #include "st_glsl_to_tgsi.h"
  34
  35 #include "compiler/glsl/glsl_parser_extras.h"
  36 #include "compiler/glsl/ir_optimization.h"
  37 #include "compiler/glsl/program.h"
  38
  39 #include "main/errors.h"
  40 #include "main/shaderobj.h"
  41 #include "main/uniforms.h"
  42 #include "main/shaderapi.h"
  43 #include "main/shaderimage.h"
  44 #include "program/prog_instruction.h"
  45
  46 #include "pipe/p_context.h"
  47 #include "pipe/p_screen.h"
  48 #include "tgsi/tgsi_ureg.h"
  49 #include "tgsi/tgsi_info.h"
  50 #include "util/u_math.h"
  51 #include "util/u_memory.h"
  52 #include "st_glsl_types.h"
  53 #include "st_program.h"
  54 #include "st_mesa_to_tgsi.h"
  55 #include "st_format.h"
  56 #include "st_nir.h"
  57 #include "st_shader_cache.h"
  58 #include "st_glsl_to_tgsi_temprename.h"
  59
  60 #include "util/hash_table.h"
  61 #include <algorithm>
  62
/* Bitmask combining the PROGRAM_STATE_VAR, PROGRAM_CONSTANT and
 * PROGRAM_UNIFORM register files, one bit per file.
 */
#define PROGRAM_ANY_CONST ((1 << PROGRAM_STATE_VAR) |    \
                           (1 << PROGRAM_CONSTANT) |     \
                           (1 << PROGRAM_UNIFORM))

/* NOTE(review): not referenced in this part of the file; presumably the
 * maximum number of texel offsets carried by one texture instruction --
 * confirm against its users.
 */
#define MAX_GLSL_TEXTURE_OFFSET 4
  68
  69 static unsigned is_precise(const ir_variable *ir)
  70 {
  71    if (!ir)
  72       return 0;
  73    return ir->data.precise || ir->data.invariant;
  74 }
  75
/**
 * Describes where an ir_variable is stored: register file, index within that
 * file, explicit component and (for array storage) the array id.
 */
class variable_storage {
   DECLARE_RZALLOC_CXX_OPERATORS(variable_storage)

public:
   /**
    * \param var       variable this storage belongs to, if any
    * \param file      register file holding the variable
    * \param index     index within \p file
    * \param array_id  array id; must be non-zero when \p file is
    *                  PROGRAM_ARRAY (asserted below)
    *
    * The component always starts out as 0.
    */
   variable_storage(ir_variable *var, gl_register_file file, int index,
                    unsigned array_id = 0)
      : file(file), index(index), component(0), var(var), array_id(array_id)
   {
      assert(file != PROGRAM_ARRAY || array_id != 0);
   }

   gl_register_file file;
   int index;

   /* Explicit component location. This is given in terms of the GLSL-style
    * swizzles where each double is a single component, i.e. for 64-bit types
    * it can only be 0 or 1.
    */
   int component;
   ir_variable *var; /* variable that maps to this, if any */
   unsigned array_id; /* 0 unless set by the caller; non-zero for PROGRAM_ARRAY */
};
  98
  99 class immediate_storage : public exec_node {
 100 public:
 101    immediate_storage(gl_constant_value *values, int size32, int type)
 102    {
 103       memcpy(this->values, values, size32 * sizeof(gl_constant_value));
 104       this->size32 = size32;
 105       this->type = type;
 106    }
 107
 108    /* doubles are stored across 2 gl_constant_values */
 109    gl_constant_value values[4];
 110    int size32; /**< Number of 32-bit components (1-4) */
 111    int type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
 112 };
 113
/* Placeholder operands in the PROGRAM_UNDEFINED file.  They serve as the
 * default arguments of emit_asm() for operands an instruction does not use.
 */
static const st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
static const st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
 116
/**
 * Declaration record for one shader input or output.
 *
 * Records with a non-zero array_id can be looked up with find_inout_array().
 */
struct inout_decl {
   unsigned mesa_index;
   unsigned array_id; /* TGSI ArrayID; 1-based: 0 means not an array */
   unsigned size;
   unsigned interp_loc;
   unsigned gs_out_streams;
   enum glsl_interp_mode interp;
   enum glsl_base_type base_type; /* returned by find_array_type() */
   ubyte usage_mask; /* GLSL-style usage-mask,  i.e. single bit per double */
};
 127
 128 static struct inout_decl *
 129 find_inout_array(struct inout_decl *decls, unsigned count, unsigned array_id)
 130 {
 131    assert(array_id != 0);
 132
 133    for (unsigned i = 0; i < count; i++) {
 134       struct inout_decl *decl = &decls[i];
 135
 136       if (array_id == decl->array_id) {
 137          return decl;
 138       }
 139    }
 140
 141    return NULL;
 142 }
 143
 144 static enum glsl_base_type
 145 find_array_type(struct inout_decl *decls, unsigned count, unsigned array_id)
 146 {
 147    if (!array_id)
 148       return GLSL_TYPE_ERROR;
 149    struct inout_decl *decl = find_inout_array(decls, count, array_id);
 150    if (decl)
 151       return decl->base_type;
 152    return GLSL_TYPE_ERROR;
 153 }
 154
/**
 * IR visitor that translates GLSL IR into a list of
 * glsl_to_tgsi_instruction objects (see \c instructions), together with the
 * bookkeeping gathered along the way: input/output declarations,
 * immediates, address-register usage and sampler/image information.
 */
struct glsl_to_tgsi_visitor : public ir_visitor {
public:
   glsl_to_tgsi_visitor();
   ~glsl_to_tgsi_visitor();

   struct gl_context *ctx;
   struct gl_program *prog;
   struct gl_shader_program *shader_program;
   struct gl_linked_shader *shader;
   struct gl_shader_compiler_options *options;

   int next_temp;

   unsigned *array_sizes;
   unsigned max_num_arrays;
   unsigned next_array;

   struct inout_decl inputs[4 * PIPE_MAX_SHADER_INPUTS];
   unsigned num_inputs;
   unsigned num_input_arrays;
   /* Searched by emit_asm() to find the base type of array outputs. */
   struct inout_decl outputs[4 * PIPE_MAX_SHADER_OUTPUTS];
   unsigned num_outputs;
   unsigned num_output_arrays;

   /* Raised by emit_arl() to cover the highest address register written. */
   int num_address_regs;
   uint32_t samplers_used;
   glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
   int sampler_targets[PIPE_MAX_SAMPLERS];   /**< One of TGSI_TEXTURE_* */
   int images_used;
   int image_targets[PIPE_MAX_SHADER_IMAGES];
   unsigned image_formats[PIPE_MAX_SHADER_IMAGES];
   /* Set by emit_asm() when a relatively addressed operand lives in the
    * state-var, constant or uniform file.
    */
   bool indirect_addr_consts;
   int wpos_transform_const;

   int glsl_version;
   bool native_integers;
   bool have_sqrt;
   bool have_fma;
   bool use_shared_memory;
   bool has_tex_txf_lz;
   /* Copied into every instruction created by emit_asm(). */
   bool precise;

   variable_storage *find_variable_storage(ir_variable *var);

   int add_constant(gl_register_file file, gl_constant_value values[8],
                    int size, int datatype, uint16_t *swizzle_out);

   st_src_reg get_temp(const glsl_type *type);
   void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);

   st_src_reg st_src_reg_for_double(double val);
   st_src_reg st_src_reg_for_float(float val);
   st_src_reg st_src_reg_for_int(int val);
   st_src_reg st_src_reg_for_type(enum glsl_base_type type, int val);

   /**
    * \name Visit methods
    *
    * As typical for the visitor pattern, there must be one \c visit method for
    * each concrete subclass of \c ir_instruction.  Virtual base classes within
    * the hierarchy should not have \c visit methods.
    */
   /*@{*/
   virtual void visit(ir_variable *);
   virtual void visit(ir_loop *);
   virtual void visit(ir_loop_jump *);
   virtual void visit(ir_function_signature *);
   virtual void visit(ir_function *);
   virtual void visit(ir_expression *);
   virtual void visit(ir_swizzle *);
   virtual void visit(ir_dereference_variable  *);
   virtual void visit(ir_dereference_array *);
   virtual void visit(ir_dereference_record *);
   virtual void visit(ir_assignment *);
   virtual void visit(ir_constant *);
   virtual void visit(ir_call *);
   virtual void visit(ir_return *);
   virtual void visit(ir_discard *);
   virtual void visit(ir_texture *);
   virtual void visit(ir_if *);
   virtual void visit(ir_emit_vertex *);
   virtual void visit(ir_end_primitive *);
   virtual void visit(ir_barrier *);
   /*@}*/

   void visit_expression(ir_expression *, st_src_reg *) ATTRIBUTE_NOINLINE;

   void visit_atomic_counter_intrinsic(ir_call *);
   void visit_ssbo_intrinsic(ir_call *);
   void visit_membar_intrinsic(ir_call *);
   void visit_shared_intrinsic(ir_call *);
   void visit_image_intrinsic(ir_call *);
   void visit_generic_intrinsic(ir_call *, unsigned op);

   st_src_reg result;

   /** Hash table of variable_storage entries (see find_variable_storage()) */
   struct hash_table *variables;

   /** List of immediate_storage */
   exec_list immediates;
   unsigned num_immediates;

   /** List of glsl_to_tgsi_instruction; emit_asm() appends to its tail */
   exec_list instructions;

   /**
    * Emit an instruction with a single destination.  Forwards to the
    * two-destination overload with an undefined second destination.
    */
   glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
                                      st_dst_reg dst = undef_dst,
                                      st_src_reg src0 = undef_src,
                                      st_src_reg src1 = undef_src,
                                      st_src_reg src2 = undef_src,
                                      st_src_reg src3 = undef_src);

   /** Emit an instruction with two destinations. */
   glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
                                      st_dst_reg dst, st_dst_reg dst1,
                                      st_src_reg src0 = undef_src,
                                      st_src_reg src1 = undef_src,
                                      st_src_reg src2 = undef_src,
                                      st_src_reg src3 = undef_src);

   /** Pick the type-specific variant of \c op from the operand types. */
   unsigned get_opcode(unsigned op,
                    st_dst_reg dst,
                    st_src_reg src0, st_src_reg src1);

   /**
    * Emit the correct dot-product instruction for the type of arguments
    */
   glsl_to_tgsi_instruction *emit_dp(ir_instruction *ir,
                                     st_dst_reg dst,
                                     st_src_reg src0,
                                     st_src_reg src1,
                                     unsigned elements);

   void emit_scalar(ir_instruction *ir, unsigned op,
                    st_dst_reg dst, st_src_reg src0);

   void emit_scalar(ir_instruction *ir, unsigned op,
                    st_dst_reg dst, st_src_reg src0, st_src_reg src1);

   void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);

   void get_deref_offsets(ir_dereference *ir,
                          unsigned *array_size,
                          unsigned *base,
                          uint16_t *index,
                          st_src_reg *reladdr,
                          bool opaque);
  void calc_deref_offsets(ir_dereference *tail,
                          unsigned *array_elements,
                          uint16_t *index,
                          st_src_reg *indirect,
                          unsigned *location);
   st_src_reg canonicalize_gather_offset(st_src_reg offset);

   bool try_emit_mad(ir_expression *ir,
              int mul_operand);
   bool try_emit_mad_for_and_not(ir_expression *ir,
              int mul_operand);

   void emit_swz(ir_expression *ir);

   bool process_move_condition(ir_rvalue *ir);

   void simplify_cmp(void);

   void rename_temp_registers(struct rename_reg_pair *renames);
   void get_first_temp_read(int *first_reads);
   void get_first_temp_write(int *first_writes);
   void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
   void get_last_temp_write(int *last_writes);

   void copy_propagate(void);
   int eliminate_dead_code(void);

   void merge_two_dsts(void);
   void merge_registers(void);
   void renumber_registers(void);

   void emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
                       st_dst_reg *l, st_src_reg *r,
                       st_src_reg *cond, bool cond_swap);

   /* Allocation context handed to placement new when emit_asm() creates
    * instructions.
    */
   void *mem_ctx;
};
 339
/* Destination registers in the PROGRAM_ADDRESS file, X channel only.
 * emit_asm() loads address_reg from an operand's reladdr and address_reg2
 * from its reladdr2.
 * NOTE(review): sampler_reladdr is not used in this part of the file;
 * presumably it is reserved for indirect sampler indexing -- confirm.
 */
static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 0);
static st_dst_reg address_reg2 = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 1);
static st_dst_reg sampler_reladdr = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 2);
 343
 344 static void
 345 fail_link(struct gl_shader_program *prog, const char *fmt, ...) PRINTFLIKE(2, 3);
 346
 347 static void
 348 fail_link(struct gl_shader_program *prog, const char *fmt, ...)
 349 {
 350    va_list args;
 351    va_start(args, fmt);
 352    ralloc_vasprintf_append(&prog->data->InfoLog, fmt, args);
 353    va_end(args);
 354
 355    prog->data->LinkStatus = linking_failure;
 356 }
 357
 358 int
 359 swizzle_for_size(int size)
 360 {
 361    static const int size_swizzles[4] = {
 362       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
 363       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
 364       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
 365       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
 366    };
 367
 368    assert((size >= 1) && (size <= 4));
 369    return size_swizzles[size - 1];
 370 }
 371
 372
/**
 * Build a glsl_to_tgsi_instruction for \p op and append it to
 * this->instructions.
 *
 * The opcode is first mapped through get_opcode().  Relative addressing on
 * the operands is then resolved with reladdr_to_temp() and emit_arl(), and
 * indirect_addr_consts is updated.  If a destination is 64-bit (directly, or
 * as a PROGRAM_OUTPUT array whose declaration has a 64-bit base type) or src0
 * has a 64-bit type, the instruction is split: one instruction is appended
 * per set bit of the selected writemask, with writemasks, indices and
 * swizzles rewritten to address channel pairs.
 *
 * \param ir    IR node stored in the instruction (may be NULL, see emit_arl())
 * \param op    requested opcode, before type selection
 * \param dst   first destination
 * \param dst1  second destination, PROGRAM_UNDEFINED file when unused
 * \param src0..src3  source operands
 * \return the instruction that was appended last
 *
 * NOTE(review): on the 64-bit path with an empty writemask the loop never
 * runs, so nothing is appended and NULL is returned -- confirm callers never
 * hit this.
 */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
                               st_dst_reg dst, st_dst_reg dst1,
                               st_src_reg src0, st_src_reg src1,
                               st_src_reg src2, st_src_reg src3)
{
   glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
   int num_reladdr = 0, i, j;
   bool dst_is_64bit[2];

   op = get_opcode(op, dst, src0, src1);

   /* If we have to do relative addressing, we want to load the ARL
    * reg directly for one of the regs, and preload the other reladdr
    * sources into temps.
    */
   num_reladdr += dst.reladdr != NULL || dst.reladdr2;
   num_reladdr += dst1.reladdr != NULL || dst1.reladdr2;
   num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
   num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
   num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
   num_reladdr += src3.reladdr != NULL || src3.reladdr2 != NULL;

   /* Sources are handled from last to first. */
   reladdr_to_temp(ir, &src3, &num_reladdr);
   reladdr_to_temp(ir, &src2, &num_reladdr);
   reladdr_to_temp(ir, &src1, &num_reladdr);
   reladdr_to_temp(ir, &src0, &num_reladdr);

   if (dst.reladdr || dst.reladdr2) {
      if (dst.reladdr)
         emit_arl(ir, address_reg, *dst.reladdr);
      if (dst.reladdr2)
         emit_arl(ir, address_reg2, *dst.reladdr2);
      num_reladdr--;
   }
   /* NOTE(review): dst1.reladdr2 is counted above but never loaded or
    * subtracted here, and dst1.reladdr is loaded into the same address_reg
    * as dst.reladdr -- confirm neither combination can occur.
    */
   if (dst1.reladdr) {
      emit_arl(ir, address_reg, *dst1.reladdr);
      num_reladdr--;
   }
   assert(num_reladdr == 0);

   /* inst->op has only 8 bits. */
   STATIC_ASSERT(TGSI_OPCODE_LAST <= 255);

   inst->op = op;
   inst->precise = this->precise;
   inst->info = tgsi_get_opcode_info(op);
   inst->dst[0] = dst;
   inst->dst[1] = dst1;
   inst->src[0] = src0;
   inst->src[1] = src1;
   inst->src[2] = src2;
   inst->src[3] = src3;
   inst->is_64bit_expanded = false;
   inst->ir = ir;
   inst->dead_mask = 0;
   inst->tex_offsets = NULL;
   inst->tex_offset_num_offset = 0;
   inst->saturate = 0;
   inst->tex_shadow = 0;
   /* default to float, for paths where this is not initialized
    * (since 0==UINT which is likely wrong):
    */
   inst->tex_type = GLSL_TYPE_FLOAT;

   /* Update indirect addressing status used by TGSI */
   if (dst.reladdr || dst.reladdr2) {
      switch(dst.file) {
      case PROGRAM_STATE_VAR:
      case PROGRAM_CONSTANT:
      case PROGRAM_UNIFORM:
         this->indirect_addr_consts = true;
         break;
      case PROGRAM_IMMEDIATE:
         assert(!"immediates should not have indirect addressing");
         break;
      default:
         break;
      }
   }
   else {
      /* Only src reladdr is inspected here, not reladdr2. */
      for (i = 0; i < 4; i++) {
         if(inst->src[i].reladdr) {
            switch(inst->src[i].file) {
            case PROGRAM_STATE_VAR:
            case PROGRAM_CONSTANT:
            case PROGRAM_UNIFORM:
               this->indirect_addr_consts = true;
               break;
            case PROGRAM_IMMEDIATE:
               assert(!"immediates should not have indirect addressing");
               break;
            default:
               break;
            }
         }
      }
   }

   /*
    * This section contains the double processing.
    * GLSL just represents doubles as single channel values,
    * however most HW and TGSI represent doubles as pairs of register channels.
    *
    * so we have to fixup destination writemask/index and src swizzle/indexes.
    * dest writemasks need to translate from single channel write mask
    * to a dual-channel writemask, but also need to modify the index,
    * if we are touching the Z,W fields in the pre-translated writemask.
    *
    * src channels have similar index modifications along with swizzle
    * changes so we pick the XY, ZW pairs from the correct index.
    *
    * GLSL [0].x -> TGSI [0].xy
    * GLSL [0].y -> TGSI [0].zw
    * GLSL [0].z -> TGSI [1].xy
    * GLSL [0].w -> TGSI [1].zw
    */
   for (j = 0; j < 2; j++) {
      dst_is_64bit[j] = glsl_base_type_is_64bit(inst->dst[j].type);
      /* Array outputs carry GLSL_TYPE_ARRAY; look up the element type in
       * the output declarations instead.
       */
      if (!dst_is_64bit[j] && inst->dst[j].file == PROGRAM_OUTPUT && inst->dst[j].type == GLSL_TYPE_ARRAY) {
         enum glsl_base_type type = find_array_type(this->outputs, this->num_outputs, inst->dst[j].array_id);
         if (glsl_base_type_is_64bit(type))
            dst_is_64bit[j] = true;
      }
   }

   if (dst_is_64bit[0] || dst_is_64bit[1] ||
       glsl_base_type_is_64bit(inst->src[0].type)) {
      glsl_to_tgsi_instruction *dinst = NULL;
      int initial_src_swz[4], initial_src_idx[4];
      int initial_dst_idx[2], initial_dst_writemask[2];
      /* select the writemask for dst0 or dst1 */
      unsigned writemask = inst->dst[1].file == PROGRAM_UNDEFINED ? inst->dst[0].writemask : inst->dst[1].writemask;

      /* copy out the writemask, index and swizzles for all src/dsts. */
      for (j = 0; j < 2; j++) {
         initial_dst_writemask[j] = inst->dst[j].writemask;
         initial_dst_idx[j] = inst->dst[j].index;
      }

      for (j = 0; j < 4; j++) {
         initial_src_swz[j] = inst->src[j].swizzle;
         initial_src_idx[j] = inst->src[j].index;
      }

      /*
       * scan all the components in the dst writemask
       * generate an instruction for each of them if required.
       */
      /* Advanced load/store address; computed at most once, on first need. */
      st_src_reg addr;
      while (writemask) {

         /* This i shadows the function-scope i declared above. */
         int i = u_bit_scan(&writemask);

         /* before emitting the instruction, see if we have to adjust load / store
          * address */
         if (i > 1 && (inst->op == TGSI_OPCODE_LOAD || inst->op == TGSI_OPCODE_STORE) &&
             addr.file == PROGRAM_UNDEFINED) {
            /* We have to advance the buffer address by 16 */
            addr = get_temp(glsl_type::uint_type);
            emit_asm(ir, TGSI_OPCODE_UADD, st_dst_reg(addr),
                     inst->src[0], st_src_reg_for_int(16));
         }

         /* first time use previous instruction */
         if (dinst == NULL) {
            dinst = inst;
         } else {
            /* create a new instruction for subsequent attempts */
            dinst = new(mem_ctx) glsl_to_tgsi_instruction();
            *dinst = *inst;
            dinst->next = NULL;
            dinst->prev = NULL;
         }
         this->instructions.push_tail(dinst);
         dinst->is_64bit_expanded = true;

         /* modify the destination if we are splitting */
         for (j = 0; j < 2; j++) {
            if (dst_is_64bit[j]) {
               /* Odd GLSL channels map to .zw, even ones to .xy. */
               dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
               dinst->dst[j].index = initial_dst_idx[j];
               if (i > 1) {
                  if (dinst->op == TGSI_OPCODE_LOAD || dinst->op == TGSI_OPCODE_STORE)
                     dinst->src[0] = addr;
                  if (dinst->op != TGSI_OPCODE_STORE)
                     dinst->dst[j].index++;
               }
            } else {
               /* if we aren't writing to a double, just get the bit of the initial writemask
                  for this channel */
               dinst->dst[j].writemask = initial_dst_writemask[j] & (1 << i);
            }
         }

         /* modify the src registers */
         for (j = 0; j < 4; j++) {
            int swz = GET_SWZ(initial_src_swz[j], i);

            if (glsl_base_type_is_64bit(dinst->src[j].type)) {
               dinst->src[j].index = initial_src_idx[j];
               /* GLSL channels z/w live in the following register. */
               if (swz > 1) {
                  dinst->src[j].double_reg2 = true;
                  dinst->src[j].index++;
               }

               if (swz & 1)
                  dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W);
               else
                  dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);

            } else {
               /* some opcodes are special case in what they use as sources
                  - [FUI]2D/[UI]2I64 is a float/[u]int src0, DLDEXP is integer src1 */
               if (op == TGSI_OPCODE_F2D || op == TGSI_OPCODE_U2D || op == TGSI_OPCODE_I2D ||
                   op == TGSI_OPCODE_I2I64 || op == TGSI_OPCODE_U2I64 ||
                   op == TGSI_OPCODE_DLDEXP ||
                   (op == TGSI_OPCODE_UCMP && dst_is_64bit[0])) {
                  dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
               }
            }
         }
      }
      /* Return the last instruction of the expansion. */
      inst = dinst;
   } else {
      this->instructions.push_tail(inst);
   }


   return inst;
}
 604
/**
 * Single-destination convenience overload: forwards to the two-destination
 * emit_asm() with undef_dst as the second destination.
 */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
                               st_dst_reg dst,
                               st_src_reg src0, st_src_reg src1,
                               st_src_reg src2, st_src_reg src3)
{
   return emit_asm(ir, op, dst, undef_dst, src0, src1, src2, src3);
}
 613
/**
 * Determines whether to use an integer, unsigned integer, float, double or
 * 64-bit integer opcode based on the operands and input opcode, and returns
 * the selected opcode.  Nothing is emitted here.
 *
 * TGSI_OPCODE_MOV is returned unchanged, as is any opcode that has no entry
 * in the table below.
 *
 * \param op    opcode as initially selected, without regard to type
 * \param dst   destination register (not consulted here)
 * \param src0  first source; its type drives the selection
 * \param src1  second source; its type is used for resource instructions and
 *              to detect 64-bit, double and float operands
 */
unsigned
glsl_to_tgsi_visitor::get_opcode(unsigned op,
                                 st_dst_reg dst,
                                 st_src_reg src0, st_src_reg src1)
{
   enum glsl_base_type type = GLSL_TYPE_FLOAT;

   if (op == TGSI_OPCODE_MOV)
       return op;

   assert(src0.type != GLSL_TYPE_ARRAY);
   assert(src0.type != GLSL_TYPE_STRUCT);
   assert(src1.type != GLSL_TYPE_ARRAY);
   assert(src1.type != GLSL_TYPE_STRUCT);

   /* Pick the operation type.  The first matching rule wins: resource
    * instructions use src1's type, then INT64, UINT64, DOUBLE and FLOAT on
    * either source, and finally src0's type (BOOL treated as INT) when native
    * integers are available.  Otherwise the type stays FLOAT.
    */
   if (is_resource_instruction(op))
      type = src1.type;
   else if (src0.type == GLSL_TYPE_INT64 || src1.type == GLSL_TYPE_INT64)
      type = GLSL_TYPE_INT64;
   else if (src0.type == GLSL_TYPE_UINT64 || src1.type == GLSL_TYPE_UINT64)
      type = GLSL_TYPE_UINT64;
   else if (src0.type == GLSL_TYPE_DOUBLE || src1.type == GLSL_TYPE_DOUBLE)
      type = GLSL_TYPE_DOUBLE;
   else if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
      type = GLSL_TYPE_FLOAT;
   else if (native_integers)
      type = src0.type == GLSL_TYPE_BOOL ? GLSL_TYPE_INT : src0.type;

/* case7: for initial opcode c, select the ui64/i64/d/i/u variant matching
 * the type chosen above; any other type falls back to the float variant f.
 */
#define case7(c, f, i, u, d, i64, ui64)             \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_UINT64)           \
         op = TGSI_OPCODE_##ui64; \
      else if (type == GLSL_TYPE_INT64)       \
         op = TGSI_OPCODE_##i64; \
      else if (type == GLSL_TYPE_DOUBLE)       \
         op = TGSI_OPCODE_##d; \
      else if (type == GLSL_TYPE_INT)       \
         op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) \
         op = TGSI_OPCODE_##u; \
      else \
         op = TGSI_OPCODE_##f; \
      break;

/* casecomp: like case7, for comparisons.  GLSL_TYPE_SUBROUTINE is treated
 * as INT, and the float variant f is used only with native integers;
 * without them the initial opcode c is kept.
 */
#define casecomp(c, f, i, u, d, i64, ui64)           \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_INT64)             \
         op = TGSI_OPCODE_##i64; \
      else if (type == GLSL_TYPE_UINT64)        \
         op = TGSI_OPCODE_##ui64; \
      else if (type == GLSL_TYPE_DOUBLE)       \
         op = TGSI_OPCODE_##d; \
      else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE)       \
         op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) \
         op = TGSI_OPCODE_##u; \
      else if (native_integers) \
         op = TGSI_OPCODE_##f; \
      else \
         op = TGSI_OPCODE_##c; \
      break;

   switch(op) {
      /* Some instructions are initially selected without considering the type.
       * This fixes the type:
       *
       * LAST marks an opcode/type combination with no valid variant; it is
       * caught by the assert after the switch.
       *
       *    INIT     FLOAT SINT     UINT     DOUBLE   SINT64   UINT64
       */
      case7(ADD,     ADD,  UADD,    UADD,    DADD,    U64ADD,  U64ADD);
      case7(CEIL,    CEIL, LAST,    LAST,    DCEIL,   LAST,    LAST);
      case7(DIV,     DIV,  IDIV,    UDIV,    DDIV,    I64DIV,  U64DIV);
      case7(FMA,     FMA,  UMAD,    UMAD,    DFMA,    LAST,    LAST);
      case7(FLR,     FLR,  LAST,    LAST,    DFLR,    LAST,    LAST);
      case7(FRC,     FRC,  LAST,    LAST,    DFRAC,   LAST,    LAST);
      case7(MUL,     MUL,  UMUL,    UMUL,    DMUL,    U64MUL,  U64MUL);
      case7(MAD,     MAD,  UMAD,    UMAD,    DMAD,    LAST,    LAST);
      case7(MAX,     MAX,  IMAX,    UMAX,    DMAX,    I64MAX,  U64MAX);
      case7(MIN,     MIN,  IMIN,    UMIN,    DMIN,    I64MIN,  U64MIN);
      case7(RCP,     RCP,  LAST,    LAST,    DRCP,    LAST,    LAST);
      case7(ROUND,   ROUND,LAST,    LAST,    DROUND,  LAST,    LAST);
      case7(RSQ,     RSQ,  LAST,    LAST,    DRSQ,    LAST,    LAST);
      case7(SQRT,    SQRT, LAST,    LAST,    DSQRT,   LAST,    LAST);
      case7(SSG,     SSG,  ISSG,    ISSG,    DSSG,    I64SSG,  I64SSG);
      case7(TRUNC,   TRUNC,LAST,    LAST,    DTRUNC,  LAST,    LAST);

      case7(MOD,     LAST, MOD,     UMOD,    LAST,    I64MOD,  U64MOD);
      case7(SHL,     LAST, SHL,     SHL,     LAST,    U64SHL,  U64SHL);
      case7(IBFE,    LAST, IBFE,    UBFE,    LAST,    LAST,    LAST);
      case7(IMSB,    LAST, IMSB,    UMSB,    LAST,    LAST,    LAST);
      case7(IMUL_HI, LAST, IMUL_HI, UMUL_HI, LAST,    LAST,    LAST);
      case7(ISHR,    LAST, ISHR,    USHR,    LAST,    I64SHR,  U64SHR);
      case7(ATOMIMAX,LAST, ATOMIMAX,ATOMUMAX,LAST,    LAST,    LAST);
      case7(ATOMIMIN,LAST, ATOMIMIN,ATOMUMIN,LAST,    LAST,    LAST);

      casecomp(SEQ, FSEQ, USEQ, USEQ, DSEQ, U64SEQ, U64SEQ);
      casecomp(SNE, FSNE, USNE, USNE, DSNE, U64SNE, U64SNE);
      casecomp(SGE, FSGE, ISGE, USGE, DSGE, I64SGE, U64SGE);
      casecomp(SLT, FSLT, ISLT, USLT, DSLT, I64SLT, U64SLT);

      default: break;
   }

   assert(op != TGSI_OPCODE_LAST);
   return op;
}
 723
 724 glsl_to_tgsi_instruction *
 725 glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
 726                               st_dst_reg dst, st_src_reg src0, st_src_reg src1,
 727                               unsigned elements)
 728 {
 729    static const unsigned dot_opcodes[] = {
 730       TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
 731    };
 732
 733    return emit_asm(ir, dot_opcodes[elements - 2], dst, src0, src1);
 734 }
 735
 736 /**
 737  * Emits TGSI scalar opcodes to produce unique answers across channels.
 738  *
 739  * Some TGSI opcodes are scalar-only, like ARB_fp/vp.  The src X
 740  * channel determines the result across all channels.  So to do a vec4
 741  * of this operation, we want to emit a scalar per source channel used
 742  * to produce dest channels.
 743  */
 744 void
 745 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
 746                                   st_dst_reg dst,
 747                                   st_src_reg orig_src0, st_src_reg orig_src1)
 748 {
 749    int i, j;
 750    int done_mask = ~dst.writemask;
 751
 752    /* TGSI RCP is a scalar operation splatting results to all channels,
 753     * like ARB_fp/vp.  So emit as many RCPs as necessary to cover our
 754     * dst channels.
 755     */
 756    for (i = 0; i < 4; i++) {
 757       GLuint this_mask = (1 << i);
 758       st_src_reg src0 = orig_src0;
 759       st_src_reg src1 = orig_src1;
 760
 761       if (done_mask & this_mask)
 762          continue;
 763
 764       GLuint src0_swiz = GET_SWZ(src0.swizzle, i);
 765       GLuint src1_swiz = GET_SWZ(src1.swizzle, i);
 766       for (j = i + 1; j < 4; j++) {
 767          /* If there is another enabled component in the destination that is
 768           * derived from the same inputs, generate its value on this pass as
 769           * well.
 770           */
 771          if (!(done_mask & (1 << j)) &&
 772              GET_SWZ(src0.swizzle, j) == src0_swiz &&
 773              GET_SWZ(src1.swizzle, j) == src1_swiz) {
 774             this_mask |= (1 << j);
 775          }
 776       }
 777       src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
 778                                    src0_swiz, src0_swiz);
 779       src1.swizzle = MAKE_SWIZZLE4(src1_swiz, src1_swiz,
 780                                    src1_swiz, src1_swiz);
 781
 782       dst.writemask = this_mask;
 783       emit_asm(ir, op, dst, src0, src1);
 784       done_mask |= this_mask;
 785    }
 786 }
 787
 788 void
 789 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
 790                                   st_dst_reg dst, st_src_reg src0)
 791 {
 792    st_src_reg undef = undef_src;
 793
 794    undef.swizzle = SWIZZLE_XXXX;
 795
 796    emit_scalar(ir, op, dst, src0, undef);
 797 }
 798
 799 void
 800 glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
 801                                st_dst_reg dst, st_src_reg src0)
 802 {
 803    int op = TGSI_OPCODE_ARL;
 804
 805    if (src0.type == GLSL_TYPE_INT || src0.type == GLSL_TYPE_UINT)
 806       op = TGSI_OPCODE_UARL;
 807
 808    assert(dst.file == PROGRAM_ADDRESS);
 809    if (dst.index >= this->num_address_regs)
 810       this->num_address_regs = dst.index + 1;
 811
 812    emit_asm(NULL, op, dst, src0);
 813 }
 814
 815 int
 816 glsl_to_tgsi_visitor::add_constant(gl_register_file file,
 817                                    gl_constant_value values[8], int size, int datatype,
 818                                    uint16_t *swizzle_out)
 819 {
 820    if (file == PROGRAM_CONSTANT) {
 821       GLuint swizzle = swizzle_out ? *swizzle_out : 0;
 822       int result = _mesa_add_typed_unnamed_constant(this->prog->Parameters, values,
 823                                                     size, datatype, &swizzle);
 824       if (swizzle_out)
 825          *swizzle_out = swizzle;
 826       return result;
 827    }
 828
 829    assert(file == PROGRAM_IMMEDIATE);
 830
 831    int index = 0;
 832    immediate_storage *entry;
 833    int size32 = size * ((datatype == GL_DOUBLE ||
 834                          datatype == GL_INT64_ARB ||
 835                          datatype == GL_UNSIGNED_INT64_ARB)? 2 : 1);
 836    int i;
 837
 838    /* Search immediate storage to see if we already have an identical
 839     * immediate that we can use instead of adding a duplicate entry.
 840     */
 841    foreach_in_list(immediate_storage, entry, &this->immediates) {
 842       immediate_storage *tmp = entry;
 843
 844       for (i = 0; i * 4 < size32; i++) {
 845          int slot_size = MIN2(size32 - (i * 4), 4);
 846          if (tmp->type != datatype || tmp->size32 != slot_size)
 847             break;
 848          if (memcmp(tmp->values, &values[i * 4],
 849                     slot_size * sizeof(gl_constant_value)))
 850             break;
 851
 852          /* Everything matches, keep going until the full size is matched */
 853          tmp = (immediate_storage *)tmp->next;
 854       }
 855
 856       /* The full value matched */
 857       if (i * 4 >= size32)
 858          return index;
 859
 860       index++;
 861    }
 862
 863    for (i = 0; i * 4 < size32; i++) {
 864       int slot_size = MIN2(size32 - (i * 4), 4);
 865       /* Add this immediate to the list. */
 866       entry = new(mem_ctx) immediate_storage(&values[i * 4], slot_size, datatype);
 867       this->immediates.push_tail(entry);
 868       this->num_immediates++;
 869    }
 870    return index;
 871 }
 872
 873 st_src_reg
 874 glsl_to_tgsi_visitor::st_src_reg_for_float(float val)
 875 {
 876    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_FLOAT);
 877    union gl_constant_value uval;
 878
 879    uval.f = val;
 880    src.index = add_constant(src.file, &uval, 1, GL_FLOAT, &src.swizzle);
 881
 882    return src;
 883 }
 884
 885 st_src_reg
 886 glsl_to_tgsi_visitor::st_src_reg_for_double(double val)
 887 {
 888    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_DOUBLE);
 889    union gl_constant_value uval[2];
 890
 891    memcpy(uval, &val, sizeof(uval));
 892    src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
 893    src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
 894    return src;
 895 }
 896
 897 st_src_reg
 898 glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
 899 {
 900    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT);
 901    union gl_constant_value uval;
 902
 903    assert(native_integers);
 904
 905    uval.i = val;
 906    src.index = add_constant(src.file, &uval, 1, GL_INT, &src.swizzle);
 907
 908    return src;
 909 }
 910
 911 st_src_reg
 912 glsl_to_tgsi_visitor::st_src_reg_for_type(enum glsl_base_type type, int val)
 913 {
 914    if (native_integers)
 915       return type == GLSL_TYPE_FLOAT ? st_src_reg_for_float(val) :
 916                                        st_src_reg_for_int(val);
 917    else
 918       return st_src_reg_for_float(val);
 919 }
 920
 921 static int
 922 attrib_type_size(const struct glsl_type *type, bool is_vs_input)
 923 {
 924    return type->count_attribute_slots(is_vs_input);
 925 }
 926
 927 static int
 928 type_size(const struct glsl_type *type)
 929 {
 930    return type->count_attribute_slots(false);
 931 }
 932
 933 /**
 934  * If the given GLSL type is an array or matrix or a structure containing
 935  * an array/matrix member, return true.  Else return false.
 936  *
 937  * This is used to determine which kind of temp storage (PROGRAM_TEMPORARY
 938  * or PROGRAM_ARRAY) should be used for variables of this type.  Anytime
 939  * we have an array that might be indexed with a variable, we need to use
 940  * the later storage type.
 941  */
 942 static bool
 943 type_has_array_or_matrix(const glsl_type *type)
 944 {
 945    if (type->is_array() || type->is_matrix())
 946       return true;
 947
 948    if (type->is_record()) {
 949       for (unsigned i = 0; i < type->length; i++) {
 950          if (type_has_array_or_matrix(type->fields.structure[i].type)) {
 951             return true;
 952          }
 953       }
 954    }
 955
 956    return false;
 957 }
 958
 959
 960 /**
 961  * In the initial pass of codegen, we assign temporary numbers to
 962  * intermediate results.  (not SSA -- variable assignments will reuse
 963  * storage).
 964  */
 965 st_src_reg
 966 glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
 967 {
 968    st_src_reg src;
 969
 970    src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
 971    src.reladdr = NULL;
 972    src.negate = 0;
 973    src.abs = 0;
 974
 975    if (!options->EmitNoIndirectTemp && type_has_array_or_matrix(type)) {
 976       if (next_array >= max_num_arrays) {
 977          max_num_arrays += 32;
 978          array_sizes = (unsigned*)
 979             realloc(array_sizes, sizeof(array_sizes[0]) * max_num_arrays);
 980       }
 981
 982       src.file = PROGRAM_ARRAY;
 983       src.index = 0;
 984       src.array_id = next_array + 1;
 985       array_sizes[next_array] = type_size(type);
 986       ++next_array;
 987
 988    } else {
 989       src.file = PROGRAM_TEMPORARY;
 990       src.index = next_temp;
 991       next_temp += type_size(type);
 992    }
 993
 994    if (type->is_array() || type->is_record()) {
 995       src.swizzle = SWIZZLE_NOOP;
 996    } else {
 997       src.swizzle = swizzle_for_size(type->vector_elements);
 998    }
 999
1000    return src;
1001 }
1002
1003 variable_storage *
1004 glsl_to_tgsi_visitor::find_variable_storage(ir_variable *var)
1005 {
1006    struct hash_entry *entry;
1007
1008    entry = _mesa_hash_table_search(this->variables, var);
1009    if (!entry)
1010       return NULL;
1011
1012    return (variable_storage *)entry->data;
1013 }
1014
1015 void
1016 glsl_to_tgsi_visitor::visit(ir_variable *ir)
1017 {
1018    if (strcmp(ir->name, "gl_FragCoord") == 0) {
1019       this->prog->OriginUpperLeft = ir->data.origin_upper_left;
1020       this->prog->PixelCenterInteger = ir->data.pixel_center_integer;
1021    }
1022
1023    if (ir->data.mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
1024       unsigned int i;
1025       const ir_state_slot *const slots = ir->get_state_slots();
1026       assert(slots != NULL);
1027
1028       /* Check if this statevar's setup in the STATE file exactly
1029        * matches how we'll want to reference it as a
1030        * struct/array/whatever.  If not, then we need to move it into
1031        * temporary storage and hope that it'll get copy-propagated
1032        * out.
1033        */
1034       for (i = 0; i < ir->get_num_state_slots(); i++) {
1035          if (slots[i].swizzle != SWIZZLE_XYZW) {
1036             break;
1037          }
1038       }
1039
1040       variable_storage *storage;
1041       st_dst_reg dst;
1042       if (i == ir->get_num_state_slots()) {
1043          /* We'll set the index later. */
1044          storage = new(mem_ctx) variable_storage(ir, PROGRAM_STATE_VAR, -1);
1045
1046          _mesa_hash_table_insert(this->variables, ir, storage);
1047
1048          dst = undef_dst;
1049       } else {
1050          /* The variable_storage constructor allocates slots based on the size
1051           * of the type.  However, this had better match the number of state
1052           * elements that we're going to copy into the new temporary.
1053           */
1054          assert((int) ir->get_num_state_slots() == type_size(ir->type));
1055
1056          dst = st_dst_reg(get_temp(ir->type));
1057
1058          storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index,
1059                                                  dst.array_id);
1060
1061          _mesa_hash_table_insert(this->variables, ir, storage);
1062       }
1063
1064
1065       for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1066          int index = _mesa_add_state_reference(this->prog->Parameters,
1067                                                (gl_state_index *)slots[i].tokens);
1068
1069          if (storage->file == PROGRAM_STATE_VAR) {
1070             if (storage->index == -1) {
1071                storage->index = index;
1072             } else {
1073                assert(index == storage->index + (int)i);
1074             }
1075          } else {
1076             /* We use GLSL_TYPE_FLOAT here regardless of the actual type of
1077              * the data being moved since MOV does not care about the type of
1078              * data it is moving, and we don't want to declare registers with
1079              * array or struct types.
1080              */
1081             st_src_reg src(PROGRAM_STATE_VAR, index, GLSL_TYPE_FLOAT);
1082             src.swizzle = slots[i].swizzle;
1083             emit_asm(ir, TGSI_OPCODE_MOV, dst, src);
1084             /* even a float takes up a whole vec4 reg in a struct/array. */
1085             dst.index++;
1086          }
1087       }
1088
1089       if (storage->file == PROGRAM_TEMPORARY &&
1090           dst.index != storage->index + (int) ir->get_num_state_slots()) {
1091          fail_link(this->shader_program,
1092                   "failed to load builtin uniform `%s'  (%d/%d regs loaded)\n",
1093                   ir->name, dst.index - storage->index,
1094                   type_size(ir->type));
1095       }
1096    }
1097 }
1098
1099 void
1100 glsl_to_tgsi_visitor::visit(ir_loop *ir)
1101 {
1102    emit_asm(NULL, TGSI_OPCODE_BGNLOOP);
1103
1104    visit_exec_list(&ir->body_instructions, this);
1105
1106    emit_asm(NULL, TGSI_OPCODE_ENDLOOP);
1107 }
1108
1109 void
1110 glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
1111 {
1112    switch (ir->mode) {
1113    case ir_loop_jump::jump_break:
1114       emit_asm(NULL, TGSI_OPCODE_BRK);
1115       break;
1116    case ir_loop_jump::jump_continue:
1117       emit_asm(NULL, TGSI_OPCODE_CONT);
1118       break;
1119    }
1120 }
1121
1122
1123 void
1124 glsl_to_tgsi_visitor::visit(ir_function_signature *ir)
1125 {
1126    assert(0);
1127    (void)ir;
1128 }
1129
1130 void
1131 glsl_to_tgsi_visitor::visit(ir_function *ir)
1132 {
1133    /* Ignore function bodies other than main() -- we shouldn't see calls to
1134     * them since they should all be inlined before we get to glsl_to_tgsi.
1135     */
1136    if (strcmp(ir->name, "main") == 0) {
1137       const ir_function_signature *sig;
1138       exec_list empty;
1139
1140       sig = ir->matching_signature(NULL, &empty, false);
1141
1142       assert(sig);
1143
1144       foreach_in_list(ir_instruction, ir, &sig->body) {
1145          ir->accept(this);
1146       }
1147    }
1148 }
1149
1150 bool
1151 glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
1152 {
1153    int nonmul_operand = 1 - mul_operand;
1154    st_src_reg a, b, c;
1155    st_dst_reg result_dst;
1156
1157    ir_expression *expr = ir->operands[mul_operand]->as_expression();
1158    if (!expr || expr->operation != ir_binop_mul)
1159       return false;
1160
1161    expr->operands[0]->accept(this);
1162    a = this->result;
1163    expr->operands[1]->accept(this);
1164    b = this->result;
1165    ir->operands[nonmul_operand]->accept(this);
1166    c = this->result;
1167
1168    this->result = get_temp(ir->type);
1169    result_dst = st_dst_reg(this->result);
1170    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1171    emit_asm(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
1172
1173    return true;
1174 }
1175
1176 /**
1177  * Emit MAD(a, -b, a) instead of AND(a, NOT(b))
1178  *
1179  * The logic values are 1.0 for true and 0.0 for false.  Logical-and is
1180  * implemented using multiplication, and logical-or is implemented using
1181  * addition.  Logical-not can be implemented as (true - x), or (1.0 - x).
1182  * As result, the logical expression (a & !b) can be rewritten as:
1183  *
1184  *     - a * !b
1185  *     - a * (1 - b)
1186  *     - (a * 1) - (a * b)
1187  *     - a + -(a * b)
1188  *     - a + (a * -b)
1189  *
1190  * This final expression can be implemented as a single MAD(a, -b, a)
1191  * instruction.
1192  */
1193 bool
1194 glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir, int try_operand)
1195 {
1196    const int other_operand = 1 - try_operand;
1197    st_src_reg a, b;
1198
1199    ir_expression *expr = ir->operands[try_operand]->as_expression();
1200    if (!expr || expr->operation != ir_unop_logic_not)
1201       return false;
1202
1203    ir->operands[other_operand]->accept(this);
1204    a = this->result;
1205    expr->operands[0]->accept(this);
1206    b = this->result;
1207
1208    b.negate = ~b.negate;
1209
1210    this->result = get_temp(ir->type);
1211    emit_asm(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
1212
1213    return true;
1214 }
1215
1216 void
1217 glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
1218                                       st_src_reg *reg, int *num_reladdr)
1219 {
1220    if (!reg->reladdr && !reg->reladdr2)
1221       return;
1222
1223    if (reg->reladdr) emit_arl(ir, address_reg, *reg->reladdr);
1224    if (reg->reladdr2) emit_arl(ir, address_reg2, *reg->reladdr2);
1225
1226    if (*num_reladdr != 1) {
1227       st_src_reg temp = get_temp(reg->type == GLSL_TYPE_DOUBLE ? glsl_type::dvec4_type : glsl_type::vec4_type);
1228
1229       emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
1230       *reg = temp;
1231    }
1232
1233    (*num_reladdr)--;
1234 }
1235
1236 void
1237 glsl_to_tgsi_visitor::visit(ir_expression *ir)
1238 {
1239    st_src_reg op[ARRAY_SIZE(ir->operands)];
1240
1241    /* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
1242     */
1243    if (!this->precise && ir->operation == ir_binop_add) {
1244       if (try_emit_mad(ir, 1))
1245          return;
1246       if (try_emit_mad(ir, 0))
1247          return;
1248    }
1249
1250    /* Quick peephole: Emit OPCODE_MAD(-a, -b, a) instead of AND(a, NOT(b))
1251     */
1252    if (!native_integers && ir->operation == ir_binop_logic_and) {
1253       if (try_emit_mad_for_and_not(ir, 1))
1254          return;
1255       if (try_emit_mad_for_and_not(ir, 0))
1256          return;
1257    }
1258
1259    if (ir->operation == ir_quadop_vector)
1260       assert(!"ir_quadop_vector should have been lowered");
1261
1262    for (unsigned int operand = 0; operand < ir->num_operands; operand++) {
1263       this->result.file = PROGRAM_UNDEFINED;
1264       ir->operands[operand]->accept(this);
1265       if (this->result.file == PROGRAM_UNDEFINED) {
1266          printf("Failed to get tree for expression operand:\n");
1267          ir->operands[operand]->print();
1268          printf("\n");
1269          exit(1);
1270       }
1271       op[operand] = this->result;
1272
1273       /* Matrix expression operands should have been broken down to vector
1274        * operations already.
1275        */
1276       assert(!ir->operands[operand]->type->is_matrix());
1277    }
1278
1279    visit_expression(ir, op);
1280 }
1281
1282 /* The non-recursive part of the expression visitor lives in a separate
1283  * function and should be prevented from being inlined, to avoid a stack
1284  * explosion when deeply nested expressions are visited.
1285  */
1286 void
1287 glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
1288 {
1289    st_src_reg result_src;
1290    st_dst_reg result_dst;
1291
1292    int vector_elements = ir->operands[0]->type->vector_elements;
1293    if (ir->operands[1]) {
1294       vector_elements = MAX2(vector_elements,
1295                              ir->operands[1]->type->vector_elements);
1296    }
1297
1298    this->result.file = PROGRAM_UNDEFINED;
1299
1300    /* Storage for our result.  Ideally for an assignment we'd be using
1301     * the actual storage for the result here, instead.
1302     */
1303    result_src = get_temp(ir->type);
1304    /* convenience for the emit functions below. */
1305    result_dst = st_dst_reg(result_src);
1306    /* Limit writes to the channels that will be used by result_src later.
1307     * This does limit this temp's use as a temporary for multi-instruction
1308     * sequences.
1309     */
1310    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1311
1312    switch (ir->operation) {
1313    case ir_unop_logic_not:
1314       if (result_dst.type != GLSL_TYPE_FLOAT)
1315          emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1316       else {
1317          /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
1318           * older GPUs implement SEQ using multiple instructions (i915 uses two
1319           * SGE instructions and a MUL instruction).  Since our logic values are
1320           * 0.0 and 1.0, 1-x also implements !x.
1321           */
1322          op[0].negate = ~op[0].negate;
1323          emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
1324       }
1325       break;
1326    case ir_unop_neg:
1327       if (result_dst.type == GLSL_TYPE_INT64 || result_dst.type == GLSL_TYPE_UINT64)
1328          emit_asm(ir, TGSI_OPCODE_I64NEG, result_dst, op[0]);
1329       else if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
1330          emit_asm(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
1331       else if (result_dst.type == GLSL_TYPE_DOUBLE)
1332          emit_asm(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
1333       else {
1334          op[0].negate = ~op[0].negate;
1335          result_src = op[0];
1336       }
1337       break;
1338    case ir_unop_subroutine_to_int:
1339       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1340       break;
1341    case ir_unop_abs:
1342       if (result_dst.type == GLSL_TYPE_FLOAT)
1343          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0].get_abs());
1344       else if (result_dst.type == GLSL_TYPE_DOUBLE)
1345          emit_asm(ir, TGSI_OPCODE_DABS, result_dst, op[0]);
1346       else if (result_dst.type == GLSL_TYPE_INT64 || result_dst.type == GLSL_TYPE_UINT64)
1347          emit_asm(ir, TGSI_OPCODE_I64ABS, result_dst, op[0]);
1348       else
1349          emit_asm(ir, TGSI_OPCODE_IABS, result_dst, op[0]);
1350       break;
1351    case ir_unop_sign:
1352       emit_asm(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
1353       break;
1354    case ir_unop_rcp:
1355       emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
1356       break;
1357
1358    case ir_unop_exp2:
1359       emit_scalar(ir, TGSI_OPCODE_EX2, result_dst, op[0]);
1360       break;
1361    case ir_unop_exp:
1362       assert(!"not reached: should be handled by exp_to_exp2");
1363       break;
1364    case ir_unop_log:
1365       assert(!"not reached: should be handled by log_to_log2");
1366       break;
1367    case ir_unop_log2:
1368       emit_scalar(ir, TGSI_OPCODE_LG2, result_dst, op[0]);
1369       break;
1370    case ir_unop_sin:
1371       emit_scalar(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1372       break;
1373    case ir_unop_cos:
1374       emit_scalar(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1375       break;
1376    case ir_unop_saturate: {
1377       glsl_to_tgsi_instruction *inst;
1378       inst = emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1379       inst->saturate = true;
1380       break;
1381    }
1382
1383    case ir_unop_dFdx:
1384    case ir_unop_dFdx_coarse:
1385       emit_asm(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
1386       break;
1387    case ir_unop_dFdx_fine:
1388       emit_asm(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
1389       break;
1390    case ir_unop_dFdy:
1391    case ir_unop_dFdy_coarse:
1392    case ir_unop_dFdy_fine:
1393    {
1394       /* The X component contains 1 or -1 depending on whether the framebuffer
1395        * is a FBO or the window system buffer, respectively.
1396        * It is then multiplied with the source operand of DDY.
1397        */
1398       static const gl_state_index transform_y_state[STATE_LENGTH]
1399          = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
1400
1401       unsigned transform_y_index =
1402          _mesa_add_state_reference(this->prog->Parameters,
1403                                    transform_y_state);
1404
1405       st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
1406                                           transform_y_index,
1407                                           glsl_type::vec4_type);
1408       transform_y.swizzle = SWIZZLE_XXXX;
1409
1410       st_src_reg temp = get_temp(glsl_type::vec4_type);
1411
1412       emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
1413       emit_asm(ir, ir->operation == ir_unop_dFdy_fine ?
1414            TGSI_OPCODE_DDY_FINE : TGSI_OPCODE_DDY, result_dst, temp);
1415       break;
1416    }
1417
1418    case ir_unop_frexp_sig:
1419       emit_asm(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
1420       break;
1421
1422    case ir_unop_frexp_exp:
1423       emit_asm(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
1424       break;
1425
1426    case ir_unop_noise: {
1427       /* At some point, a motivated person could add a better
1428        * implementation of noise.  Currently not even the nvidia
1429        * binary drivers do anything more than this.  In any case, the
1430        * place to do this is in the GL state tracker, not the poor
1431        * driver.
1432        */
1433       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
1434       break;
1435    }
1436
1437    case ir_binop_add:
1438       emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1439       break;
1440    case ir_binop_sub:
1441       op[1].negate = ~op[1].negate;
1442       emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1443       break;
1444
1445    case ir_binop_mul:
1446       emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1447       break;
1448    case ir_binop_div:
1449       emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
1450       break;
1451    case ir_binop_mod:
1452       if (result_dst.type == GLSL_TYPE_FLOAT)
1453          assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1454       else
1455          emit_asm(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
1456       break;
1457
1458    case ir_binop_less:
1459       emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
1460       break;
1461    case ir_binop_greater:
1462       emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
1463       break;
1464    case ir_binop_lequal:
1465       emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
1466       break;
1467    case ir_binop_gequal:
1468       emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
1469       break;
1470    case ir_binop_equal:
1471       emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1472       break;
1473    case ir_binop_nequal:
1474       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1475       break;
1476    case ir_binop_all_equal:
1477       /* "==" operator producing a scalar boolean. */
1478       if (ir->operands[0]->type->is_vector() ||
1479           ir->operands[1]->type->is_vector()) {
1480          st_src_reg temp = get_temp(native_integers ?
1481                                     glsl_type::uvec4_type :
1482                                     glsl_type::vec4_type);
1483
1484          if (native_integers) {
1485             st_dst_reg temp_dst = st_dst_reg(temp);
1486             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1487
1488             if (ir->operands[0]->type->is_boolean() &&
1489                 ir->operands[1]->as_constant() &&
1490                 ir->operands[1]->as_constant()->is_one()) {
1491                emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1492             } else {
1493                emit_asm(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
1494             }
1495
1496             /* Emit 1-3 AND operations to combine the SEQ results. */
1497             switch (ir->operands[0]->type->vector_elements) {
1498             case 2:
1499                break;
1500             case 3:
1501                temp_dst.writemask = WRITEMASK_Y;
1502                temp1.swizzle = SWIZZLE_YYYY;
1503                temp2.swizzle = SWIZZLE_ZZZZ;
1504                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1505                break;
1506             case 4:
1507                temp_dst.writemask = WRITEMASK_X;
1508                temp1.swizzle = SWIZZLE_XXXX;
1509                temp2.swizzle = SWIZZLE_YYYY;
1510                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1511                temp_dst.writemask = WRITEMASK_Y;
1512                temp1.swizzle = SWIZZLE_ZZZZ;
1513                temp2.swizzle = SWIZZLE_WWWW;
1514                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1515             }
1516
1517             temp1.swizzle = SWIZZLE_XXXX;
1518             temp2.swizzle = SWIZZLE_YYYY;
1519             emit_asm(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
1520          } else {
1521             emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1522
1523             /* After the dot-product, the value will be an integer on the
1524              * range [0,4].  Zero becomes 1.0, and positive values become zero.
1525              */
1526             emit_dp(ir, result_dst, temp, temp, vector_elements);
1527
1528             /* Negating the result of the dot-product gives values on the range
1529              * [-4, 0].  Zero becomes 1.0, and negative values become zero.
1530              * This is achieved using SGE.
1531              */
1532             st_src_reg sge_src = result_src;
1533             sge_src.negate = ~sge_src.negate;
1534             emit_asm(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
1535          }
1536       } else {
1537          emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1538       }
1539       break;
1540    case ir_binop_any_nequal:
1541       /* "!=" operator producing a scalar boolean. */
1542       if (ir->operands[0]->type->is_vector() ||
1543           ir->operands[1]->type->is_vector()) {
1544          st_src_reg temp = get_temp(native_integers ?
1545                                     glsl_type::uvec4_type :
1546                                     glsl_type::vec4_type);
1547          if (ir->operands[0]->type->is_boolean() &&
1548              ir->operands[1]->as_constant() &&
1549              ir->operands[1]->as_constant()->is_zero()) {
1550             emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1551          } else {
1552             emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1553          }
1554
1555          if (native_integers) {
1556             st_dst_reg temp_dst = st_dst_reg(temp);
1557             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1558
1559             /* Emit 1-3 OR operations to combine the SNE results. */
1560             switch (ir->operands[0]->type->vector_elements) {
1561             case 2:
1562                break;
1563             case 3:
1564                temp_dst.writemask = WRITEMASK_Y;
1565                temp1.swizzle = SWIZZLE_YYYY;
1566                temp2.swizzle = SWIZZLE_ZZZZ;
1567                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1568                break;
1569             case 4:
1570                temp_dst.writemask = WRITEMASK_X;
1571                temp1.swizzle = SWIZZLE_XXXX;
1572                temp2.swizzle = SWIZZLE_YYYY;
1573                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1574                temp_dst.writemask = WRITEMASK_Y;
1575                temp1.swizzle = SWIZZLE_ZZZZ;
1576                temp2.swizzle = SWIZZLE_WWWW;
1577                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1578             }
1579
1580             temp1.swizzle = SWIZZLE_XXXX;
1581             temp2.swizzle = SWIZZLE_YYYY;
1582             emit_asm(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
1583          } else {
1584             /* After the dot-product, the value will be an integer on the
1585              * range [0,4].  Zero stays zero, and positive values become 1.0.
1586              */
1587             glsl_to_tgsi_instruction *const dp =
1588                   emit_dp(ir, result_dst, temp, temp, vector_elements);
1589             if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1590                /* The clamping to [0,1] can be done for free in the fragment
1591                 * shader with a saturate.
1592                 */
1593                dp->saturate = true;
1594             } else {
1595                /* Negating the result of the dot-product gives values on the range
1596                 * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
1597                 * achieved using SLT.
1598                 */
1599                st_src_reg slt_src = result_src;
1600                slt_src.negate = ~slt_src.negate;
1601                emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
1602             }
1603          }
1604       } else {
1605          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1606       }
1607       break;
1608
1609    case ir_binop_logic_xor:
1610       if (native_integers)
1611          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1612       else
1613          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1614       break;
1615
1616    case ir_binop_logic_or: {
1617       if (native_integers) {
1618          /* If integers are used as booleans, we can use an actual "or"
1619           * instruction.
1620           */
1621          assert(native_integers);
1622          emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1623       } else {
1624          /* After the addition, the value will be an integer on the
1625           * range [0,2].  Zero stays zero, and positive values become 1.0.
1626           */
1627          glsl_to_tgsi_instruction *add =
1628             emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1629          if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1630             /* The clamping to [0,1] can be done for free in the fragment
1631              * shader with a saturate if floats are being used as boolean values.
1632              */
1633             add->saturate = true;
1634          } else {
1635             /* Negating the result of the addition gives values on the range
1636              * [-2, 0].  Zero stays zero, and negative values become 1.0.  This
1637              * is achieved using SLT.
1638              */
1639             st_src_reg slt_src = result_src;
1640             slt_src.negate = ~slt_src.negate;
1641             emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
1642          }
1643       }
1644       break;
1645    }
1646
1647    case ir_binop_logic_and:
1648       /* If native integers are disabled, the bool args are stored as float 0.0
1649        * or 1.0, so "mul" gives us "and".  If they're enabled, just use the
1650        * actual AND opcode.
1651        */
1652       if (native_integers)
1653          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1654       else
1655          emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1656       break;
1657
1658    case ir_binop_dot:
1659       assert(ir->operands[0]->type->is_vector());
1660       assert(ir->operands[0]->type == ir->operands[1]->type);
1661       emit_dp(ir, result_dst, op[0], op[1],
1662               ir->operands[0]->type->vector_elements);
1663       break;
1664
1665    case ir_unop_sqrt:
1666       if (have_sqrt) {
1667          emit_scalar(ir, TGSI_OPCODE_SQRT, result_dst, op[0]);
1668       } else {
1669          /* This is the only instruction sequence that makes the game "Risen"
1670           * render correctly. ABS is not required for the game, but since GLSL
1671           * declares negative values as "undefined", allowing us to do whatever
1672           * we want, I choose to use ABS to match DX9 and pre-GLSL RSQ
1673           * behavior.
1674           */
1675          emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0].get_abs());
1676          emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, result_src);
1677       }
1678       break;
1679    case ir_unop_rsq:
1680       emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
1681       break;
1682    case ir_unop_i2f:
1683       if (native_integers) {
1684          emit_asm(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
1685          break;
1686       }
1687       /* fallthrough to next case otherwise */
1688    case ir_unop_b2f:
1689       if (native_integers) {
1690          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
1691          break;
1692       }
1693       /* fallthrough to next case otherwise */
1694    case ir_unop_i2u:
1695    case ir_unop_u2i:
1696    case ir_unop_i642u64:
1697    case ir_unop_u642i64:
1698       /* Converting between signed and unsigned integers is a no-op. */
1699       result_src = op[0];
1700       result_src.type = result_dst.type;
1701       break;
1702    case ir_unop_b2i:
1703       if (native_integers) {
1704          /* Booleans are stored as integers using ~0 for true and 0 for false.
1705           * GLSL requires that int(bool) return 1 for true and 0 for false.
1706           * This conversion is done with AND, but it could be done with NEG.
1707           */
1708          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
1709       } else {
1710          /* Booleans and integers are both stored as floats when native
1711           * integers are disabled.
1712           */
1713          result_src = op[0];
1714       }
1715       break;
1716    case ir_unop_f2i:
1717       if (native_integers)
1718          emit_asm(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
1719       else
1720          emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1721       break;
1722    case ir_unop_f2u:
1723       if (native_integers)
1724          emit_asm(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
1725       else
1726          emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1727       break;
1728    case ir_unop_bitcast_f2i:
1729    case ir_unop_bitcast_f2u:
1730       /* Make sure we don't propagate the negate modifier to integer opcodes. */
1731       if (op[0].negate || op[0].abs)
1732          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1733       else
1734          result_src = op[0];
1735       result_src.type = ir->operation == ir_unop_bitcast_f2i ? GLSL_TYPE_INT :
1736                                                                GLSL_TYPE_UINT;
1737       break;
1738    case ir_unop_bitcast_i2f:
1739    case ir_unop_bitcast_u2f:
1740       result_src = op[0];
1741       result_src.type = GLSL_TYPE_FLOAT;
1742       break;
1743    case ir_unop_f2b:
1744       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
1745       break;
1746    case ir_unop_d2b:
1747       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_double(0.0));
1748       break;
1749    case ir_unop_i2b:
1750       if (native_integers)
1751          emit_asm(ir, TGSI_OPCODE_USNE, result_dst, op[0], st_src_reg_for_int(0));
1752       else
1753          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
1754       break;
1755    case ir_unop_bitcast_u642d:
1756    case ir_unop_bitcast_i642d:
1757       result_src = op[0];
1758       result_src.type = GLSL_TYPE_DOUBLE;
1759       break;
1760    case ir_unop_bitcast_d2i64:
1761       result_src = op[0];
1762       result_src.type = GLSL_TYPE_INT64;
1763       break;
1764    case ir_unop_bitcast_d2u64:
1765       result_src = op[0];
1766       result_src.type = GLSL_TYPE_UINT64;
1767       break;
1768    case ir_unop_trunc:
1769       emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1770       break;
1771    case ir_unop_ceil:
1772       emit_asm(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
1773       break;
1774    case ir_unop_floor:
1775       emit_asm(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
1776       break;
1777    case ir_unop_round_even:
1778       emit_asm(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
1779       break;
1780    case ir_unop_fract:
1781       emit_asm(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
1782       break;
1783
1784    case ir_binop_min:
1785       emit_asm(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
1786       break;
1787    case ir_binop_max:
1788       emit_asm(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
1789       break;
1790    case ir_binop_pow:
1791       emit_scalar(ir, TGSI_OPCODE_POW, result_dst, op[0], op[1]);
1792       break;
1793
1794    case ir_unop_bit_not:
1795       if (native_integers) {
1796          emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1797          break;
1798       }
1799    case ir_unop_u2f:
1800       if (native_integers) {
1801          emit_asm(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
1802          break;
1803       }
1804    case ir_binop_lshift:
1805    case ir_binop_rshift:
1806       if (native_integers) {
1807          unsigned opcode = ir->operation == ir_binop_lshift ? TGSI_OPCODE_SHL
1808                                                             : TGSI_OPCODE_ISHR;
1809          st_src_reg count;
1810
1811          if (glsl_base_type_is_64bit(op[0].type)) {
1812             /* GLSL shift operations have 32-bit shift counts, but TGSI uses
1813              * 64 bits.
1814              */
1815             count = get_temp(glsl_type::u64vec(ir->operands[1]->type->components()));
1816             emit_asm(ir, TGSI_OPCODE_U2I64, st_dst_reg(count), op[1]);
1817          } else {
1818             count = op[1];
1819          }
1820
1821          emit_asm(ir, opcode, result_dst, op[0], count);
1822          break;
1823       }
1824    case ir_binop_bit_and:
1825       if (native_integers) {
1826          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1827          break;
1828       }
1829    case ir_binop_bit_xor:
1830       if (native_integers) {
1831          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1832          break;
1833       }
1834    case ir_binop_bit_or:
1835       if (native_integers) {
1836          emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1837          break;
1838       }
1839
1840       assert(!"GLSL 1.30 features unsupported");
1841       break;
1842
1843    case ir_binop_ubo_load: {
1844       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1845       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1846       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1847       unsigned const_block = const_uniform_block ? const_uniform_block->value.u[0] + 1 : 1;
1848       st_src_reg index_reg = get_temp(glsl_type::uint_type);
1849       st_src_reg cbuf;
1850
1851       cbuf.type = ir->type->base_type;
1852       cbuf.file = PROGRAM_CONSTANT;
1853       cbuf.index = 0;
1854       cbuf.reladdr = NULL;
1855       cbuf.negate = 0;
1856       cbuf.abs = 0;
1857       cbuf.index2D = const_block;
1858
1859       assert(ir->type->is_vector() || ir->type->is_scalar());
1860
1861       if (const_offset_ir) {
1862          /* Constant index into constant buffer */
1863          cbuf.reladdr = NULL;
1864          cbuf.index = const_offset / 16;
1865       }
1866       else {
1867          ir_expression *offset_expr = ir->operands[1]->as_expression();
1868          st_src_reg offset = op[1];
1869
1870          /* The OpenGL spec is written in such a way that accesses with
1871           * non-constant offset are almost always vec4-aligned. The only
1872           * exception to this are members of structs in arrays of structs:
1873           * each struct in an array of structs is at least vec4-aligned,
1874           * but single-element and [ui]vec2 members of the struct may be at
1875           * an offset that is not a multiple of 16 bytes.
1876           *
1877           * Here, we extract that offset, relying on previous passes to always
1878           * generate offset expressions of the form (+ expr constant_offset).
1879           *
1880           * Note that the std430 layout, which allows more cases of alignment
1881           * less than vec4 in arrays, is not supported for uniform blocks, so
1882           * we do not have to deal with it here.
1883           */
1884          if (offset_expr && offset_expr->operation == ir_binop_add) {
1885             const_offset_ir = offset_expr->operands[1]->as_constant();
1886             if (const_offset_ir) {
1887                const_offset = const_offset_ir->value.u[0];
1888                cbuf.index = const_offset / 16;
1889                offset_expr->operands[0]->accept(this);
1890                offset = this->result;
1891             }
1892          }
1893
1894          /* Relative/variable index into constant buffer */
1895          emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), offset,
1896               st_src_reg_for_int(4));
1897          cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
1898          memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
1899       }
1900
1901       if (const_uniform_block) {
1902          /* Constant constant buffer */
1903          cbuf.reladdr2 = NULL;
1904       }
1905       else {
1906          /* Relative/variable constant buffer */
1907          cbuf.reladdr2 = ralloc(mem_ctx, st_src_reg);
1908          memcpy(cbuf.reladdr2, &op[0], sizeof(st_src_reg));
1909       }
1910       cbuf.has_index2 = true;
1911
1912       cbuf.swizzle = swizzle_for_size(ir->type->vector_elements);
1913       if (glsl_base_type_is_64bit(cbuf.type))
1914          cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 8,
1915                                        const_offset % 16 / 8,
1916                                        const_offset % 16 / 8,
1917                                        const_offset % 16 / 8);
1918       else
1919          cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 4,
1920                                        const_offset % 16 / 4,
1921                                        const_offset % 16 / 4,
1922                                        const_offset % 16 / 4);
1923
1924       if (ir->type->is_boolean()) {
1925          emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
1926       } else {
1927          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
1928       }
1929       break;
1930    }
1931    case ir_triop_lrp:
1932       /* note: we have to reorder the three args here */
1933       emit_asm(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
1934       break;
1935    case ir_triop_csel:
1936       if (this->ctx->Const.NativeIntegers)
1937          emit_asm(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
1938       else {
1939          op[0].negate = ~op[0].negate;
1940          emit_asm(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
1941       }
1942       break;
1943    case ir_triop_bitfield_extract:
1944       emit_asm(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
1945       break;
1946    case ir_quadop_bitfield_insert:
1947       emit_asm(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
1948       break;
1949    case ir_unop_bitfield_reverse:
1950       emit_asm(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
1951       break;
1952    case ir_unop_bit_count:
1953       emit_asm(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
1954       break;
1955    case ir_unop_find_msb:
1956       emit_asm(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
1957       break;
1958    case ir_unop_find_lsb:
1959       emit_asm(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
1960       break;
1961    case ir_binop_imul_high:
1962       emit_asm(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
1963       break;
1964    case ir_triop_fma:
1965       /* In theory, MAD is incorrect here. */
1966       if (have_fma)
1967          emit_asm(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
1968       else
1969          emit_asm(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
1970       break;
1971    case ir_unop_interpolate_at_centroid:
1972       emit_asm(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
1973       break;
1974    case ir_binop_interpolate_at_offset: {
1975       /* The y coordinate needs to be flipped for the default fb */
1976       static const gl_state_index transform_y_state[STATE_LENGTH]
1977          = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
1978
1979       unsigned transform_y_index =
1980          _mesa_add_state_reference(this->prog->Parameters,
1981                                    transform_y_state);
1982
1983       st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
1984                                           transform_y_index,
1985                                           glsl_type::vec4_type);
1986       transform_y.swizzle = SWIZZLE_XXXX;
1987
1988       st_src_reg temp = get_temp(glsl_type::vec2_type);
1989       st_dst_reg temp_dst = st_dst_reg(temp);
1990
1991       emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[1]);
1992       temp_dst.writemask = WRITEMASK_Y;
1993       emit_asm(ir, TGSI_OPCODE_MUL, temp_dst, transform_y, op[1]);
1994       emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], temp);
1995       break;
1996    }
1997    case ir_binop_interpolate_at_sample:
1998       emit_asm(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
1999       break;
2000
2001    case ir_unop_d2f:
2002       emit_asm(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
2003       break;
2004    case ir_unop_f2d:
2005       emit_asm(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
2006       break;
2007    case ir_unop_d2i:
2008       emit_asm(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
2009       break;
2010    case ir_unop_i2d:
2011       emit_asm(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
2012       break;
2013    case ir_unop_d2u:
2014       emit_asm(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
2015       break;
2016    case ir_unop_u2d:
2017       emit_asm(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
2018       break;
2019    case ir_unop_unpack_double_2x32:
2020    case ir_unop_pack_double_2x32:
2021    case ir_unop_unpack_int_2x32:
2022    case ir_unop_pack_int_2x32:
2023    case ir_unop_unpack_uint_2x32:
2024    case ir_unop_pack_uint_2x32:
2025    case ir_unop_unpack_sampler_2x32:
2026    case ir_unop_pack_sampler_2x32:
2027    case ir_unop_unpack_image_2x32:
2028    case ir_unop_pack_image_2x32:
2029       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
2030       break;
2031
2032    case ir_binop_ldexp:
2033       if (ir->operands[0]->type->is_double()) {
2034          emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
2035       } else {
2036          assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
2037       }
2038       break;
2039
2040    case ir_unop_pack_half_2x16:
2041       emit_asm(ir, TGSI_OPCODE_PK2H, result_dst, op[0]);
2042       break;
2043    case ir_unop_unpack_half_2x16:
2044       emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
2045       break;
2046
2047    case ir_unop_get_buffer_size: {
2048       ir_constant *const_offset = ir->operands[0]->as_constant();
2049       st_src_reg buffer(
2050             PROGRAM_BUFFER,
2051             ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
2052             (const_offset ? const_offset->value.u[0] : 0),
2053             GLSL_TYPE_UINT);
2054       if (!const_offset) {
2055          buffer.reladdr = ralloc(mem_ctx, st_src_reg);
2056          *buffer.reladdr = op[0];
2057          emit_arl(ir, sampler_reladdr, op[0]);
2058       }
2059       emit_asm(ir, TGSI_OPCODE_RESQ, result_dst)->resource = buffer;
2060       break;
2061    }
2062
2063    case ir_unop_u2i64:
2064    case ir_unop_u2u64:
2065    case ir_unop_b2i64: {
2066       st_src_reg temp = get_temp(glsl_type::uvec4_type);
2067       st_dst_reg temp_dst = st_dst_reg(temp);
2068       unsigned orig_swz = op[0].swizzle;
2069       /*
2070        * To convert unsigned to 64-bit:
2071        * zero Y channel, copy X channel.
2072        */
2073       temp_dst.writemask = WRITEMASK_Y;
2074       if (vector_elements > 1)
2075          temp_dst.writemask |= WRITEMASK_W;
2076       emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
2077       temp_dst.writemask = WRITEMASK_X;
2078       if (vector_elements > 1)
2079           temp_dst.writemask |= WRITEMASK_Z;
2080       op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 0), GET_SWZ(orig_swz, 0),
2081                                     GET_SWZ(orig_swz, 1), GET_SWZ(orig_swz, 1));
2082       if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
2083          emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2084       else
2085          emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0], st_src_reg_for_int(1));
2086       result_src = temp;
2087       result_src.type = GLSL_TYPE_UINT64;
2088       if (vector_elements > 2) {
2089          /* Subtle: We rely on the fact that get_temp here returns the next
2090           * TGSI temporary register directly after the temp register used for
2091           * the first two components, so that the result gets picked up
2092           * automatically.
2093           */
2094          st_src_reg temp = get_temp(glsl_type::uvec4_type);
2095          st_dst_reg temp_dst = st_dst_reg(temp);
2096          temp_dst.writemask = WRITEMASK_Y;
2097          if (vector_elements > 3)
2098             temp_dst.writemask |= WRITEMASK_W;
2099          emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
2100
2101          temp_dst.writemask = WRITEMASK_X;
2102          if (vector_elements > 3)
2103             temp_dst.writemask |= WRITEMASK_Z;
2104          op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 2), GET_SWZ(orig_swz, 2),
2105                                        GET_SWZ(orig_swz, 3), GET_SWZ(orig_swz, 3));
2106          if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
2107             emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2108          else
2109             emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0], st_src_reg_for_int(1));
2110       }
2111       break;
2112    }
2113    case ir_unop_i642i:
2114    case ir_unop_u642i:
2115    case ir_unop_u642u:
2116    case ir_unop_i642u: {
2117       st_src_reg temp = get_temp(glsl_type::uvec4_type);
2118       st_dst_reg temp_dst = st_dst_reg(temp);
2119       unsigned orig_swz = op[0].swizzle;
2120       unsigned orig_idx = op[0].index;
2121       int el;
2122       temp_dst.writemask = WRITEMASK_X;
2123
2124       for (el = 0; el < vector_elements; el++) {
2125          unsigned swz = GET_SWZ(orig_swz, el);
2126          if (swz & 1)
2127             op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z);
2128          else
2129             op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
2130          if (swz > 2)
2131             op[0].index = orig_idx + 1;
2132          op[0].type = GLSL_TYPE_UINT;
2133          temp_dst.writemask = WRITEMASK_X << el;
2134          emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2135       }
2136       result_src = temp;
2137       if (ir->operation == ir_unop_u642u || ir->operation == ir_unop_i642u)
2138          result_src.type = GLSL_TYPE_UINT;
2139       else
2140          result_src.type = GLSL_TYPE_INT;
2141       break;
2142    }
2143    case ir_unop_i642b:
2144       emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0], st_src_reg_for_int(0));
2145       break;
2146    case ir_unop_i642f:
2147       emit_asm(ir, TGSI_OPCODE_I642F, result_dst, op[0]);
2148       break;
2149    case ir_unop_u642f:
2150       emit_asm(ir, TGSI_OPCODE_U642F, result_dst, op[0]);
2151       break;
2152    case ir_unop_i642d:
2153       emit_asm(ir, TGSI_OPCODE_I642D, result_dst, op[0]);
2154       break;
2155    case ir_unop_u642d:
2156       emit_asm(ir, TGSI_OPCODE_U642D, result_dst, op[0]);
2157       break;
2158    case ir_unop_i2i64:
2159       emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
2160       break;
2161    case ir_unop_f2i64:
2162       emit_asm(ir, TGSI_OPCODE_F2I64, result_dst, op[0]);
2163       break;
2164    case ir_unop_d2i64:
2165       emit_asm(ir, TGSI_OPCODE_D2I64, result_dst, op[0]);
2166       break;
2167    case ir_unop_i2u64:
2168       emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
2169       break;
2170    case ir_unop_f2u64:
2171       emit_asm(ir, TGSI_OPCODE_F2U64, result_dst, op[0]);
2172       break;
2173    case ir_unop_d2u64:
2174       emit_asm(ir, TGSI_OPCODE_D2U64, result_dst, op[0]);
2175       break;
2176       /* these might be needed */
2177    case ir_unop_pack_snorm_2x16:
2178    case ir_unop_pack_unorm_2x16:
2179    case ir_unop_pack_snorm_4x8:
2180    case ir_unop_pack_unorm_4x8:
2181
2182    case ir_unop_unpack_snorm_2x16:
2183    case ir_unop_unpack_unorm_2x16:
2184    case ir_unop_unpack_snorm_4x8:
2185    case ir_unop_unpack_unorm_4x8:
2186
2187    case ir_quadop_vector:
2188    case ir_binop_vector_extract:
2189    case ir_triop_vector_insert:
2190    case ir_binop_carry:
2191    case ir_binop_borrow:
2192    case ir_unop_ssbo_unsized_array_length:
2193       /* This operation is not supported, or should have already been handled.
2194        */
2195       assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
2196       break;
2197    }
2198
2199    this->result = result_src;
2200 }
2201
2202
2203 void
2204 glsl_to_tgsi_visitor::visit(ir_swizzle *ir)
2205 {
2206    st_src_reg src;
2207    int i;
2208    int swizzle[4];
2209
2210    /* Note that this is only swizzles in expressions, not those on the left
2211     * hand side of an assignment, which do write masking.  See ir_assignment
2212     * for that.
2213     */
2214
2215    ir->val->accept(this);
2216    src = this->result;
2217    assert(src.file != PROGRAM_UNDEFINED);
2218    assert(ir->type->vector_elements > 0);
2219
2220    for (i = 0; i < 4; i++) {
2221       if (i < ir->type->vector_elements) {
2222          switch (i) {
2223          case 0:
2224             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.x);
2225             break;
2226          case 1:
2227             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.y);
2228             break;
2229          case 2:
2230             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.z);
2231             break;
2232          case 3:
2233             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.w);
2234             break;
2235          }
2236       } else {
2237          /* If the type is smaller than a vec4, replicate the last
2238           * channel out.
2239           */
2240          swizzle[i] = swizzle[ir->type->vector_elements - 1];
2241       }
2242    }
2243
2244    src.swizzle = MAKE_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2245
2246    this->result = src;
2247 }
2248
2249 /* Test if the variable is an array. Note that geometry and
2250  * tessellation shader inputs are outputs are always arrays (except
2251  * for patch inputs), so only the array element type is considered.
2252  */
2253 static bool
2254 is_inout_array(unsigned stage, ir_variable *var, bool *remove_array)
2255 {
2256    const glsl_type *type = var->type;
2257
2258    *remove_array = false;
2259
2260    if ((stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
2261        (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out))
2262       return false;
2263
2264    if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) ||
2265         (stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) ||
2266         stage == MESA_SHADER_TESS_CTRL) &&
2267        !var->data.patch) {
2268       if (!var->type->is_array())
2269          return false; /* a system value probably */
2270
2271       type = var->type->fields.array;
2272       *remove_array = true;
2273    }
2274
2275    return type->is_array() || type->is_matrix();
2276 }
2277
2278 static unsigned
2279 st_translate_interp_loc(ir_variable *var)
2280 {
2281    if (var->data.centroid)
2282       return TGSI_INTERPOLATE_LOC_CENTROID;
2283    else if (var->data.sample)
2284       return TGSI_INTERPOLATE_LOC_SAMPLE;
2285    else
2286       return TGSI_INTERPOLATE_LOC_CENTER;
2287 }
2288
2289 void
2290 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
2291 {
2292    variable_storage *entry = find_variable_storage(ir->var);
2293    ir_variable *var = ir->var;
2294    bool remove_array;
2295
2296    if (!entry) {
2297       switch (var->data.mode) {
2298       case ir_var_uniform:
2299          entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
2300                                                var->data.param_index);
2301          _mesa_hash_table_insert(this->variables, var, entry);
2302          break;
2303       case ir_var_shader_in: {
2304          /* The linker assigns locations for varyings and attributes,
2305           * including deprecated builtins (like gl_Color), user-assign
2306           * generic attributes (glBindVertexLocation), and
2307           * user-defined varyings.
2308           */
2309          assert(var->data.location != -1);
2310
2311          const glsl_type *type_without_array = var->type->without_array();
2312          struct inout_decl *decl = &inputs[num_inputs];
2313          unsigned component = var->data.location_frac;
2314          unsigned num_components;
2315          num_inputs++;
2316
2317          if (type_without_array->is_64bit())
2318             component = component / 2;
2319          if (type_without_array->vector_elements)
2320             num_components = type_without_array->vector_elements;
2321          else
2322             num_components = 4;
2323
2324          decl->mesa_index = var->data.location;
2325          decl->interp = (glsl_interp_mode) var->data.interpolation;
2326          decl->interp_loc = st_translate_interp_loc(var);
2327          decl->base_type = type_without_array->base_type;
2328          decl->usage_mask = u_bit_consecutive(component, num_components);
2329
2330          if (is_inout_array(shader->Stage, var, &remove_array)) {
2331             decl->array_id = num_input_arrays + 1;
2332             num_input_arrays++;
2333          } else {
2334             decl->array_id = 0;
2335          }
2336
2337          if (remove_array)
2338             decl->size = type_size(var->type->fields.array);
2339          else
2340             decl->size = type_size(var->type);
2341
2342          entry = new(mem_ctx) variable_storage(var,
2343                                                PROGRAM_INPUT,
2344                                                decl->mesa_index,
2345                                                decl->array_id);
2346          entry->component = component;
2347
2348          _mesa_hash_table_insert(this->variables, var, entry);
2349
2350          break;
2351       }
2352       case ir_var_shader_out: {
2353          assert(var->data.location != -1);
2354
2355          const glsl_type *type_without_array = var->type->without_array();
2356          struct inout_decl *decl = &outputs[num_outputs];
2357          unsigned component = var->data.location_frac;
2358          unsigned num_components;
2359          num_outputs++;
2360
2361          if (type_without_array->is_64bit())
2362             component = component / 2;
2363          if (type_without_array->vector_elements)
2364             num_components = type_without_array->vector_elements;
2365          else
2366             num_components = 4;
2367
2368          decl->mesa_index = var->data.location + FRAG_RESULT_MAX * var->data.index;
2369          decl->base_type = type_without_array->base_type;
2370          decl->usage_mask = u_bit_consecutive(component, num_components);
2371          if (var->data.stream & (1u << 31)) {
2372             decl->gs_out_streams = var->data.stream & ~(1u << 31);
2373          } else {
2374             assert(var->data.stream < 4);
2375             decl->gs_out_streams = 0;
2376             for (unsigned i = 0; i < num_components; ++i)
2377                decl->gs_out_streams |= var->data.stream << (2 * (component + i));
2378          }
2379
2380          if (is_inout_array(shader->Stage, var, &remove_array)) {
2381             decl->array_id = num_output_arrays + 1;
2382             num_output_arrays++;
2383          } else {
2384             decl->array_id = 0;
2385          }
2386
2387          if (remove_array)
2388             decl->size = type_size(var->type->fields.array);
2389          else
2390             decl->size = type_size(var->type);
2391
2392          if (var->data.fb_fetch_output) {
2393             st_dst_reg dst = st_dst_reg(get_temp(var->type));
2394             st_src_reg src = st_src_reg(PROGRAM_OUTPUT, decl->mesa_index,
2395                                         var->type, component, decl->array_id);
2396             emit_asm(NULL, TGSI_OPCODE_FBFETCH, dst, src);
2397             entry = new(mem_ctx) variable_storage(var, dst.file, dst.index,
2398                                                   dst.array_id);
2399          } else {
2400             entry = new(mem_ctx) variable_storage(var,
2401                                                   PROGRAM_OUTPUT,
2402                                                   decl->mesa_index,
2403                                                   decl->array_id);
2404          }
2405          entry->component = component;
2406
2407          _mesa_hash_table_insert(this->variables, var, entry);
2408
2409          break;
2410       }
2411       case ir_var_system_value:
2412          entry = new(mem_ctx) variable_storage(var,
2413                                                PROGRAM_SYSTEM_VALUE,
2414                                                var->data.location);
2415          break;
2416       case ir_var_auto:
2417       case ir_var_temporary:
2418          st_src_reg src = get_temp(var->type);
2419
2420          entry = new(mem_ctx) variable_storage(var, src.file, src.index,
2421                                                src.array_id);
2422          _mesa_hash_table_insert(this->variables, var, entry);
2423
2424          break;
2425       }
2426
2427       if (!entry) {
2428          printf("Failed to make storage for %s\n", var->name);
2429          exit(1);
2430       }
2431    }
2432
2433    this->result = st_src_reg(entry->file, entry->index, var->type,
2434                              entry->component, entry->array_id);
2435    if (this->shader->Stage == MESA_SHADER_VERTEX &&
2436        var->data.mode == ir_var_shader_in &&
2437        var->type->without_array()->is_double())
2438       this->result.is_double_vertex_input = true;
2439    if (!native_integers)
2440       this->result.type = GLSL_TYPE_FLOAT;
2441 }
2442
2443 static void
2444 shrink_array_declarations(struct inout_decl *decls, unsigned count,
2445                           GLbitfield64* usage_mask,
2446                           GLbitfield64 double_usage_mask,
2447                           GLbitfield* patch_usage_mask)
2448 {
2449    unsigned i;
2450    int j;
2451
2452    /* Fix array declarations by removing unused array elements at both ends
2453     * of the arrays. For example, mat4[3] where only mat[1] is used.
2454     */
2455    for (i = 0; i < count; i++) {
2456       struct inout_decl *decl = &decls[i];
2457       if (!decl->array_id)
2458          continue;
2459
2460       /* Shrink the beginning. */
2461       for (j = 0; j < (int)decl->size; j++) {
2462          if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2463             if (*patch_usage_mask &
2464                 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2465                break;
2466          }
2467          else {
2468             if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2469                break;
2470             if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2471                break;
2472          }
2473
2474          decl->mesa_index++;
2475          decl->size--;
2476          j--;
2477       }
2478
2479       /* Shrink the end. */
2480       for (j = decl->size-1; j >= 0; j--) {
2481          if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2482             if (*patch_usage_mask &
2483                 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2484                break;
2485          }
2486          else {
2487             if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2488                break;
2489             if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2490                break;
2491          }
2492
2493          decl->size--;
2494       }
2495
2496       /* When not all entries of an array are accessed, we mark them as used
2497        * here anyway, to ensure that the input/output mapping logic doesn't get
2498        * confused.
2499        *
2500        * TODO This happens when an array isn't used via indirect access, which
2501        * some game ports do (at least eON-based). There is an optimization
2502        * opportunity here by replacing the array declaration with non-array
2503        * declarations of those slots that are actually used.
2504        */
2505       for (j = 1; j < (int)decl->size; ++j) {
2506          if (decl->mesa_index >= VARYING_SLOT_PATCH0)
2507             *patch_usage_mask |= BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j);
2508          else
2509             *usage_mask |= BITFIELD64_BIT(decl->mesa_index + j);
2510       }
2511    }
2512 }
2513
2514 void
2515 glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
2516 {
2517    ir_constant *index;
2518    st_src_reg src;
2519    bool is_2D = false;
2520    ir_variable *var = ir->variable_referenced();
2521
2522    /* We only need the logic provided by st_glsl_storage_type_size()
2523     * for arrays of structs. Indirect sampler and image indexing is handled
2524     * elsewhere.
2525     */
2526    int element_size = ir->type->without_array()->is_record() ?
2527       st_glsl_storage_type_size(ir->type, var->data.bindless) :
2528       type_size(ir->type);
2529
2530    index = ir->array_index->constant_expression_value(ralloc_parent(ir));
2531
2532    ir->array->accept(this);
2533    src = this->result;
2534
2535    if (!src.has_index2) {
2536       switch (this->prog->Target) {
2537       case GL_TESS_CONTROL_PROGRAM_NV:
2538          is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
2539                  !ir->variable_referenced()->data.patch;
2540          break;
2541       case GL_TESS_EVALUATION_PROGRAM_NV:
2542          is_2D = src.file == PROGRAM_INPUT &&
2543                  !ir->variable_referenced()->data.patch;
2544          break;
2545       case GL_GEOMETRY_PROGRAM_NV:
2546          is_2D = src.file == PROGRAM_INPUT;
2547          break;
2548       }
2549    }
2550
2551    if (is_2D)
2552       element_size = 1;
2553
2554    if (index) {
2555
2556       if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
2557           src.file == PROGRAM_INPUT)
2558          element_size = attrib_type_size(ir->type, true);
2559       if (is_2D) {
2560          src.index2D = index->value.i[0];
2561          src.has_index2 = true;
2562       } else
2563          src.index += index->value.i[0] * element_size;
2564    } else {
2565       /* Variable index array dereference.  It eats the "vec4" of the
2566        * base of the array and an index that offsets the TGSI register
2567        * index.
2568        */
2569       ir->array_index->accept(this);
2570
2571       st_src_reg index_reg;
2572
2573       if (element_size == 1) {
2574          index_reg = this->result;
2575       } else {
2576          index_reg = get_temp(native_integers ?
2577                               glsl_type::int_type : glsl_type::float_type);
2578
2579          emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
2580               this->result, st_src_reg_for_type(index_reg.type, element_size));
2581       }
2582
2583       /* If there was already a relative address register involved, add the
2584        * new and the old together to get the new offset.
2585        */
2586       if (!is_2D && src.reladdr != NULL) {
2587          st_src_reg accum_reg = get_temp(native_integers ?
2588                                 glsl_type::int_type : glsl_type::float_type);
2589
2590          emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
2591               index_reg, *src.reladdr);
2592
2593          index_reg = accum_reg;
2594       }
2595
2596       if (is_2D) {
2597          src.reladdr2 = ralloc(mem_ctx, st_src_reg);
2598          memcpy(src.reladdr2, &index_reg, sizeof(index_reg));
2599          src.index2D = 0;
2600          src.has_index2 = true;
2601       } else {
2602          src.reladdr = ralloc(mem_ctx, st_src_reg);
2603          memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2604       }
2605    }
2606
2607    /* Change the register type to the element type of the array. */
2608    src.type = ir->type->base_type;
2609
2610    this->result = src;
2611 }
2612
2613 void
2614 glsl_to_tgsi_visitor::visit(ir_dereference_record *ir)
2615 {
2616    unsigned int i;
2617    const glsl_type *struct_type = ir->record->type;
2618    ir_variable *var = ir->record->variable_referenced();
2619    int offset = 0;
2620
2621    ir->record->accept(this);
2622
2623    assert(ir->field_idx >= 0);
2624    assert(var);
2625    for (i = 0; i < struct_type->length; i++) {
2626       if (i == (unsigned) ir->field_idx)
2627          break;
2628       const glsl_type *member_type = struct_type->fields.structure[i].type;
2629       offset += st_glsl_storage_type_size(member_type, var->data.bindless);
2630    }
2631
2632    /* If the type is smaller than a vec4, replicate the last channel out. */
2633    if (ir->type->is_scalar() || ir->type->is_vector())
2634       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2635    else
2636       this->result.swizzle = SWIZZLE_NOOP;
2637
2638    this->result.index += offset;
2639    this->result.type = ir->type->base_type;
2640 }
2641
2642 /**
2643  * We want to be careful in assignment setup to hit the actual storage
2644  * instead of potentially using a temporary like we might with the
2645  * ir_dereference handler.
2646  */
2647 static st_dst_reg
2648 get_assignment_lhs(ir_dereference *ir, glsl_to_tgsi_visitor *v, int *component)
2649 {
2650    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2651     * access of a vector, it must be separated into a series conditional moves
2652     * before reaching this point (see ir_vec_index_to_cond_assign).
2653     */
2654    assert(ir->as_dereference());
2655    ir_dereference_array *deref_array = ir->as_dereference_array();
2656    if (deref_array) {
2657       assert(!deref_array->array->type->is_vector());
2658    }
2659
2660    /* Use the rvalue deref handler for the most part.  We write swizzles using
2661     * the writemask, but we do extract the base component for enhanced layouts
2662     * from the source swizzle.
2663     */
2664    ir->accept(v);
2665    *component = GET_SWZ(v->result.swizzle, 0);
2666    return st_dst_reg(v->result);
2667 }
2668
2669 /**
2670  * Process the condition of a conditional assignment
2671  *
2672  * Examines the condition of a conditional assignment to generate the optimal
2673  * first operand of a \c CMP instruction.  If the condition is a relational
2674  * operator with 0 (e.g., \c ir_binop_less), the value being compared will be
2675  * used as the source for the \c CMP instruction.  Otherwise the comparison
2676  * is processed to a boolean result, and the boolean result is used as the
2677  * operand to the CMP instruction.
2678  */
2679 bool
2680 glsl_to_tgsi_visitor::process_move_condition(ir_rvalue *ir)
2681 {
2682    ir_rvalue *src_ir = ir;
2683    bool negate = true;
2684    bool switch_order = false;
2685
2686    ir_expression *const expr = ir->as_expression();
2687
2688    if (native_integers) {
2689       if ((expr != NULL) && (expr->num_operands == 2)) {
2690          enum glsl_base_type type = expr->operands[0]->type->base_type;
2691          if (type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT ||
2692              type == GLSL_TYPE_BOOL) {
2693             if (expr->operation == ir_binop_equal) {
2694                if (expr->operands[0]->is_zero()) {
2695                   src_ir = expr->operands[1];
2696                   switch_order = true;
2697                }
2698                else if (expr->operands[1]->is_zero()) {
2699                   src_ir = expr->operands[0];
2700                   switch_order = true;
2701                }
2702             }
2703             else if (expr->operation == ir_binop_nequal) {
2704                if (expr->operands[0]->is_zero()) {
2705                   src_ir = expr->operands[1];
2706                }
2707                else if (expr->operands[1]->is_zero()) {
2708                   src_ir = expr->operands[0];
2709                }
2710             }
2711          }
2712       }
2713
2714       src_ir->accept(this);
2715       return switch_order;
2716    }
2717
2718    if ((expr != NULL) && (expr->num_operands == 2)) {
2719       bool zero_on_left = false;
2720
2721       if (expr->operands[0]->is_zero()) {
2722          src_ir = expr->operands[1];
2723          zero_on_left = true;
2724       } else if (expr->operands[1]->is_zero()) {
2725          src_ir = expr->operands[0];
2726          zero_on_left = false;
2727       }
2728
2729       /*      a is -  0  +            -  0  +
2730        * (a <  0)  T  F  F  ( a < 0)  T  F  F
2731        * (0 <  a)  F  F  T  (-a < 0)  F  F  T
2732        * (a <= 0)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
2733        * (0 <= a)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
2734        * (a >  0)  F  F  T  (-a < 0)  F  F  T
2735        * (0 >  a)  T  F  F  ( a < 0)  T  F  F
2736        * (a >= 0)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
2737        * (0 >= a)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
2738        *
2739        * Note that exchanging the order of 0 and 'a' in the comparison simply
2740        * means that the value of 'a' should be negated.
2741        */
2742       if (src_ir != ir) {
2743          switch (expr->operation) {
2744          case ir_binop_less:
2745             switch_order = false;
2746             negate = zero_on_left;
2747             break;
2748
2749          case ir_binop_greater:
2750             switch_order = false;
2751             negate = !zero_on_left;
2752             break;
2753
2754          case ir_binop_lequal:
2755             switch_order = true;
2756             negate = !zero_on_left;
2757             break;
2758
2759          case ir_binop_gequal:
2760             switch_order = true;
2761             negate = zero_on_left;
2762             break;
2763
2764          default:
2765             /* This isn't the right kind of comparison afterall, so make sure
2766              * the whole condition is visited.
2767              */
2768             src_ir = ir;
2769             break;
2770          }
2771       }
2772    }
2773
2774    src_ir->accept(this);
2775
2776    /* We use the TGSI_OPCODE_CMP (a < 0 ? b : c) for conditional moves, and the
2777     * condition we produced is 0.0 or 1.0.  By flipping the sign, we can
2778     * choose which value TGSI_OPCODE_CMP produces without an extra instruction
2779     * computing the condition.
2780     */
2781    if (negate)
2782       this->result.negate = ~this->result.negate;
2783
2784    return switch_order;
2785 }
2786
2787 void
2788 glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
2789                                      st_dst_reg *l, st_src_reg *r,
2790                                      st_src_reg *cond, bool cond_swap)
2791 {
2792    if (type->is_record()) {
2793       for (unsigned int i = 0; i < type->length; i++) {
2794          emit_block_mov(ir, type->fields.structure[i].type, l, r,
2795                         cond, cond_swap);
2796       }
2797       return;
2798    }
2799
2800    if (type->is_array()) {
2801       for (unsigned int i = 0; i < type->length; i++) {
2802          emit_block_mov(ir, type->fields.array, l, r, cond, cond_swap);
2803       }
2804       return;
2805    }
2806
2807    if (type->is_matrix()) {
2808       const struct glsl_type *vec_type;
2809
2810       vec_type = glsl_type::get_instance(type->is_double() ? GLSL_TYPE_DOUBLE : GLSL_TYPE_FLOAT,
2811                                          type->vector_elements, 1);
2812
2813       for (int i = 0; i < type->matrix_columns; i++) {
2814          emit_block_mov(ir, vec_type, l, r, cond, cond_swap);
2815       }
2816       return;
2817    }
2818
2819    assert(type->is_scalar() || type->is_vector());
2820
2821    l->type = type->base_type;
2822    r->type = type->base_type;
2823    if (cond) {
2824       st_src_reg l_src = st_src_reg(*l);
2825       l_src.swizzle = swizzle_for_size(type->vector_elements);
2826
2827       if (native_integers) {
2828          emit_asm(ir, TGSI_OPCODE_UCMP, *l, *cond,
2829               cond_swap ? l_src : *r,
2830               cond_swap ? *r : l_src);
2831       } else {
2832          emit_asm(ir, TGSI_OPCODE_CMP, *l, *cond,
2833               cond_swap ? l_src : *r,
2834               cond_swap ? *r : l_src);
2835       }
2836    } else {
2837       emit_asm(ir, TGSI_OPCODE_MOV, *l, *r);
2838    }
2839    l->index++;
2840    r->index++;
2841    if (type->is_dual_slot()) {
2842       l->index++;
2843       if (r->is_double_vertex_input == false)
2844          r->index++;
2845    }
2846 }
2847
2848 void
2849 glsl_to_tgsi_visitor::visit(ir_assignment *ir)
2850 {
2851    int dst_component;
2852    st_dst_reg l;
2853    st_src_reg r;
2854
2855    /* all generated instructions need to be flaged as precise */
2856    this->precise = is_precise(ir->lhs->variable_referenced());
2857    ir->rhs->accept(this);
2858    r = this->result;
2859
2860    l = get_assignment_lhs(ir->lhs, this, &dst_component);
2861
2862    {
2863       int swizzles[4];
2864       int first_enabled_chan = 0;
2865       int rhs_chan = 0;
2866       ir_variable *variable = ir->lhs->variable_referenced();
2867
2868       if (shader->Stage == MESA_SHADER_FRAGMENT &&
2869           variable->data.mode == ir_var_shader_out &&
2870           (variable->data.location == FRAG_RESULT_DEPTH ||
2871            variable->data.location == FRAG_RESULT_STENCIL)) {
2872          assert(ir->lhs->type->is_scalar());
2873          assert(ir->write_mask == WRITEMASK_X);
2874
2875          if (variable->data.location == FRAG_RESULT_DEPTH)
2876             l.writemask = WRITEMASK_Z;
2877          else {
2878             assert(variable->data.location == FRAG_RESULT_STENCIL);
2879             l.writemask = WRITEMASK_Y;
2880          }
2881       } else if (ir->write_mask == 0) {
2882          assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
2883
2884          unsigned num_elements = ir->lhs->type->without_array()->vector_elements;
2885
2886          if (num_elements) {
2887             l.writemask = u_bit_consecutive(0, num_elements);
2888          } else {
2889             /* The type is a struct or an array of (array of) structs. */
2890             l.writemask = WRITEMASK_XYZW;
2891          }
2892       } else {
2893          l.writemask = ir->write_mask;
2894       }
2895
2896       for (int i = 0; i < 4; i++) {
2897          if (l.writemask & (1 << i)) {
2898             first_enabled_chan = GET_SWZ(r.swizzle, i);
2899             break;
2900          }
2901       }
2902
2903       l.writemask = l.writemask << dst_component;
2904
2905       /* Swizzle a small RHS vector into the channels being written.
2906        *
2907        * glsl ir treats write_mask as dictating how many channels are
2908        * present on the RHS while TGSI treats write_mask as just
2909        * showing which channels of the vec4 RHS get written.
2910        */
2911       for (int i = 0; i < 4; i++) {
2912          if (l.writemask & (1 << i))
2913             swizzles[i] = GET_SWZ(r.swizzle, rhs_chan++);
2914          else
2915             swizzles[i] = first_enabled_chan;
2916       }
2917       r.swizzle = MAKE_SWIZZLE4(swizzles[0], swizzles[1],
2918                                 swizzles[2], swizzles[3]);
2919    }
2920
2921    assert(l.file != PROGRAM_UNDEFINED);
2922    assert(r.file != PROGRAM_UNDEFINED);
2923
2924    if (ir->condition) {
2925       const bool switch_order = this->process_move_condition(ir->condition);
2926       st_src_reg condition = this->result;
2927
2928       emit_block_mov(ir, ir->lhs->type, &l, &r, &condition, switch_order);
2929    } else if (ir->rhs->as_expression() &&
2930               this->instructions.get_tail() &&
2931               ir->rhs == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->ir &&
2932               !((glsl_to_tgsi_instruction *)this->instructions.get_tail())->is_64bit_expanded &&
2933               type_size(ir->lhs->type) == 1 &&
2934               l.writemask == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->dst[0].writemask) {
2935       /* To avoid emitting an extra MOV when assigning an expression to a
2936        * variable, emit the last instruction of the expression again, but
2937        * replace the destination register with the target of the assignment.
2938        * Dead code elimination will remove the original instruction.
2939        */
2940       glsl_to_tgsi_instruction *inst, *new_inst;
2941       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
2942       new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2], inst->src[3]);
2943       new_inst->saturate = inst->saturate;
2944       inst->dead_mask = inst->dst[0].writemask;
2945    } else {
2946       emit_block_mov(ir, ir->rhs->type, &l, &r, NULL, false);
2947    }
2948    this->precise = 0;
2949 }
2950
2951
2952 void
2953 glsl_to_tgsi_visitor::visit(ir_constant *ir)
2954 {
2955    st_src_reg src;
2956    GLdouble stack_vals[4] = { 0 };
2957    gl_constant_value *values = (gl_constant_value *) stack_vals;
2958    GLenum gl_type = GL_NONE;
2959    unsigned int i;
2960    static int in_array = 0;
2961    gl_register_file file = in_array ? PROGRAM_CONSTANT : PROGRAM_IMMEDIATE;
2962
2963    /* Unfortunately, 4 floats is all we can get into
2964     * _mesa_add_typed_unnamed_constant.  So, make a temp to store an
2965     * aggregate constant and move each constant value into it.  If we
2966     * get lucky, copy propagation will eliminate the extra moves.
2967     */
2968    if (ir->type->is_record()) {
2969       st_src_reg temp_base = get_temp(ir->type);
2970       st_dst_reg temp = st_dst_reg(temp_base);
2971
2972       foreach_in_list(ir_constant, field_value, &ir->components) {
2973          int size = type_size(field_value->type);
2974
2975          assert(size > 0);
2976
2977          field_value->accept(this);
2978          src = this->result;
2979
2980          for (i = 0; i < (unsigned int)size; i++) {
2981             emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
2982
2983             src.index++;
2984             temp.index++;
2985          }
2986       }
2987       this->result = temp_base;
2988       return;
2989    }
2990
2991    if (ir->type->is_array()) {
2992       st_src_reg temp_base = get_temp(ir->type);
2993       st_dst_reg temp = st_dst_reg(temp_base);
2994       int size = type_size(ir->type->fields.array);
2995
2996       assert(size > 0);
2997       in_array++;
2998
2999       for (i = 0; i < ir->type->length; i++) {
3000          ir->array_elements[i]->accept(this);
3001          src = this->result;
3002          for (int j = 0; j < size; j++) {
3003             emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
3004
3005             src.index++;
3006             temp.index++;
3007          }
3008       }
3009       this->result = temp_base;
3010       in_array--;
3011       return;
3012    }
3013
3014    if (ir->type->is_matrix()) {
3015       st_src_reg mat = get_temp(ir->type);
3016       st_dst_reg mat_column = st_dst_reg(mat);
3017
3018       for (i = 0; i < ir->type->matrix_columns; i++) {
3019          switch (ir->type->base_type) {
3020          case GLSL_TYPE_FLOAT:
3021             values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
3022
3023             src = st_src_reg(file, -1, ir->type->base_type);
3024             src.index = add_constant(file,
3025                                      values,
3026                                      ir->type->vector_elements,
3027                                      GL_FLOAT,
3028                                      &src.swizzle);
3029             emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3030             break;
3031          case GLSL_TYPE_DOUBLE:
3032             values = (gl_constant_value *) &ir->value.d[i * ir->type->vector_elements];
3033             src = st_src_reg(file, -1, ir->type->base_type);
3034             src.index = add_constant(file,
3035                                      values,
3036                                      ir->type->vector_elements,
3037                                      GL_DOUBLE,
3038                                      &src.swizzle);
3039             if (ir->type->vector_elements >= 2) {
3040                mat_column.writemask = WRITEMASK_XY;
3041                src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
3042                emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3043             } else {
3044                mat_column.writemask = WRITEMASK_X;
3045                src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
3046                emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3047             }
3048             src.index++;
3049             if (ir->type->vector_elements > 2) {
3050                if (ir->type->vector_elements == 4) {
3051                   mat_column.writemask = WRITEMASK_ZW;
3052                   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
3053                   emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3054                } else {
3055                   mat_column.writemask = WRITEMASK_Z;
3056                   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y);
3057                   emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3058                   mat_column.writemask = WRITEMASK_XYZW;
3059                   src.swizzle = SWIZZLE_XYZW;
3060                }
3061                mat_column.index++;
3062             }
3063             break;
3064          default:
3065             unreachable("Illegal matrix constant type.\n");
3066             break;
3067          }
3068          mat_column.index++;
3069       }
3070       this->result = mat;
3071       return;
3072    }
3073
3074    switch (ir->type->base_type) {
3075    case GLSL_TYPE_FLOAT:
3076       gl_type = GL_FLOAT;
3077       for (i = 0; i < ir->type->vector_elements; i++) {
3078          values[i].f = ir->value.f[i];
3079       }
3080       break;
3081    case GLSL_TYPE_DOUBLE:
3082       gl_type = GL_DOUBLE;
3083       for (i = 0; i < ir->type->vector_elements; i++) {
3084          memcpy(&values[i * 2], &ir->value.d[i], sizeof(double));
3085       }
3086       break;
3087    case GLSL_TYPE_INT64:
3088       gl_type = GL_INT64_ARB;
3089       for (i = 0; i < ir->type->vector_elements; i++) {
3090          memcpy(&values[i * 2], &ir->value.d[i], sizeof(int64_t));
3091       }
3092       break;
3093    case GLSL_TYPE_UINT64:
3094       gl_type = GL_UNSIGNED_INT64_ARB;
3095       for (i = 0; i < ir->type->vector_elements; i++) {
3096          memcpy(&values[i * 2], &ir->value.d[i], sizeof(uint64_t));
3097       }
3098       break;
3099    case GLSL_TYPE_UINT:
3100       gl_type = native_integers ? GL_UNSIGNED_INT : GL_FLOAT;
3101       for (i = 0; i < ir->type->vector_elements; i++) {
3102          if (native_integers)
3103             values[i].u = ir->value.u[i];
3104          else
3105             values[i].f = ir->value.u[i];
3106       }
3107       break;
3108    case GLSL_TYPE_INT:
3109       gl_type = native_integers ? GL_INT : GL_FLOAT;
3110       for (i = 0; i < ir->type->vector_elements; i++) {
3111          if (native_integers)
3112             values[i].i = ir->value.i[i];
3113          else
3114             values[i].f = ir->value.i[i];
3115       }
3116       break;
3117    case GLSL_TYPE_BOOL:
3118       gl_type = native_integers ? GL_BOOL : GL_FLOAT;
3119       for (i = 0; i < ir->type->vector_elements; i++) {
3120          values[i].u = ir->value.b[i] ? ctx->Const.UniformBooleanTrue : 0;
3121       }
3122       break;
3123    default:
3124       assert(!"Non-float/uint/int/bool constant");
3125    }
3126
3127    this->result = st_src_reg(file, -1, ir->type);
3128    this->result.index = add_constant(file,
3129                                      values,
3130                                      ir->type->vector_elements,
3131                                      gl_type,
3132                                      &this->result.swizzle);
3133 }
3134
3135 void
3136 glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
3137 {
3138    exec_node *param = ir->actual_parameters.get_head();
3139    ir_dereference *deref = static_cast<ir_dereference *>(param);
3140    ir_variable *location = deref->variable_referenced();
3141
3142    st_src_reg buffer(
3143          PROGRAM_BUFFER, location->data.binding, GLSL_TYPE_ATOMIC_UINT);
3144
3145    /* Calculate the surface offset */
3146    st_src_reg offset;
3147    unsigned array_size = 0, base = 0;
3148    uint16_t index = 0;
3149
3150    get_deref_offsets(deref, &array_size, &base, &index, &offset, false);
3151
3152    if (offset.file != PROGRAM_UNDEFINED) {
3153       emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
3154                offset, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
3155       emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(offset),
3156                offset, st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE));
3157    } else {
3158       offset = st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE);
3159    }
3160
3161    ir->return_deref->accept(this);
3162    st_dst_reg dst(this->result);
3163    dst.writemask = WRITEMASK_X;
3164
3165    glsl_to_tgsi_instruction *inst;
3166
3167    if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_read) {
3168       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, offset);
3169    } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_increment) {
3170       inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
3171                       st_src_reg_for_int(1));
3172    } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_predecrement) {
3173       inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
3174                       st_src_reg_for_int(-1));
3175       emit_asm(ir, TGSI_OPCODE_ADD, dst, this->result, st_src_reg_for_int(-1));
3176    } else {
3177       param = param->get_next();
3178       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3179       val->accept(this);
3180
3181       st_src_reg data = this->result, data2 = undef_src;
3182       unsigned opcode;
3183       switch (ir->callee->intrinsic_id) {
3184       case ir_intrinsic_atomic_counter_add:
3185          opcode = TGSI_OPCODE_ATOMUADD;
3186          break;
3187       case ir_intrinsic_atomic_counter_min:
3188          opcode = TGSI_OPCODE_ATOMIMIN;
3189          break;
3190       case ir_intrinsic_atomic_counter_max:
3191          opcode = TGSI_OPCODE_ATOMIMAX;
3192          break;
3193       case ir_intrinsic_atomic_counter_and:
3194          opcode = TGSI_OPCODE_ATOMAND;
3195          break;
3196       case ir_intrinsic_atomic_counter_or:
3197          opcode = TGSI_OPCODE_ATOMOR;
3198          break;
3199       case ir_intrinsic_atomic_counter_xor:
3200          opcode = TGSI_OPCODE_ATOMXOR;
3201          break;
3202       case ir_intrinsic_atomic_counter_exchange:
3203          opcode = TGSI_OPCODE_ATOMXCHG;
3204          break;
3205       case ir_intrinsic_atomic_counter_comp_swap: {
3206          opcode = TGSI_OPCODE_ATOMCAS;
3207          param = param->get_next();
3208          val = ((ir_instruction *)param)->as_rvalue();
3209          val->accept(this);
3210          data2 = this->result;
3211          break;
3212       }
3213       default:
3214          assert(!"Unexpected intrinsic");
3215          return;
3216       }
3217
3218       inst = emit_asm(ir, opcode, dst, offset, data, data2);
3219    }
3220
3221    inst->resource = buffer;
3222 }
3223
/**
 * Translate a call to one of the SSBO intrinsics (load, store or an atomic
 * operation) into TGSI.
 *
 * The actual parameters are consumed positionally: the block index, the
 * offset, then (for stores) the value and a constant write mask, or (for
 * atomics) the data operand plus a second operand for compare-and-swap.
 * If one more parameter follows, it must be a constant; its value is copied
 * into the buffer_access field of the emitted instruction(s).
 *
 * \param ir  the intrinsic call being translated
 */
3224 void
3225 glsl_to_tgsi_visitor::visit_ssbo_intrinsic(ir_call *ir)
3226 {
3227    exec_node *param = ir->actual_parameters.get_head();
3228
        /* First parameter: the buffer block index. */
3229    ir_rvalue *block = ((ir_instruction *)param)->as_rvalue();
3230
        /* Second parameter: the offset within the block. */
3231    param = param->get_next();
3232    ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
3233
3234    ir_constant *const_block = block->as_constant();
3235
        /* The resource register lives in PROGRAM_BUFFER.  Its index is biased
         * by this stage's MaxAtomicBuffers, and additionally by the block index
         * when that index is a constant.
         */
3236    st_src_reg buffer(
3237          PROGRAM_BUFFER,
3238          ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
3239          (const_block ? const_block->value.u[0] : 0),
3240          GLSL_TYPE_UINT);
3241
        /* Non-constant block index: evaluate it and attach the result to the
         * buffer register as a relative address.
         */
3242    if (!const_block) {
3243       block->accept(this);
3244       buffer.reladdr = ralloc(mem_ctx, st_src_reg);
3245       *buffer.reladdr = this->result;
3246       emit_arl(ir, sampler_reladdr, this->result);
3247    }
3248
3249    /* Calculate the surface offset */
3250    offset->accept(this);
3251    st_src_reg off = this->result;
3252
        /* The destination stays undefined unless the call returns a value, in
         * which case one channel is enabled per component of the return type.
         */
3253    st_dst_reg dst = undef_dst;
3254    if (ir->return_deref) {
3255       ir->return_deref->accept(this);
3256       dst = st_dst_reg(this->result);
3257       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3258    }
3259
3260    glsl_to_tgsi_instruction *inst;
3261
3262    if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_load) {
3263       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
           /* For a boolean destination, compare the loaded value against 0
            * with USNE and write the comparison result back into dst.
            */
3264       if (dst.type == GLSL_TYPE_BOOL)
3265          emit_asm(ir, TGSI_OPCODE_USNE, dst, st_src_reg(dst), st_src_reg_for_int(0));
3266    } else if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_store) {
           /* Third parameter: the value to store. */
3267       param = param->get_next();
3268       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3269       val->accept(this);
3270
           /* Fourth parameter: the write mask, which must be a constant. */
3271       param = param->get_next();
3272       ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
3273       assert(write_mask);
3274       dst.writemask = write_mask->value.u[0];
3275
           /* The store destination takes its type from the stored value. */
3276       dst.type = this->result.type;
3277       inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
3278    } else {
           /* Atomics: the third parameter is the data operand. */
3279       param = param->get_next();
3280       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3281       val->accept(this);
3282
3283       st_src_reg data = this->result, data2 = undef_src;
3284       unsigned opcode;
3285       switch (ir->callee->intrinsic_id) {
3286       case ir_intrinsic_ssbo_atomic_add:
3287          opcode = TGSI_OPCODE_ATOMUADD;
3288          break;
3289       case ir_intrinsic_ssbo_atomic_min:
3290          opcode = TGSI_OPCODE_ATOMIMIN;
3291          break;
3292       case ir_intrinsic_ssbo_atomic_max:
3293          opcode = TGSI_OPCODE_ATOMIMAX;
3294          break;
3295       case ir_intrinsic_ssbo_atomic_and:
3296          opcode = TGSI_OPCODE_ATOMAND;
3297          break;
3298       case ir_intrinsic_ssbo_atomic_or:
3299          opcode = TGSI_OPCODE_ATOMOR;
3300          break;
3301       case ir_intrinsic_ssbo_atomic_xor:
3302          opcode = TGSI_OPCODE_ATOMXOR;
3303          break;
3304       case ir_intrinsic_ssbo_atomic_exchange:
3305          opcode = TGSI_OPCODE_ATOMXCHG;
3306          break;
3307       case ir_intrinsic_ssbo_atomic_comp_swap:
              /* Compare-and-swap is the only atomic with a second operand. */
3308          opcode = TGSI_OPCODE_ATOMCAS;
3309          param = param->get_next();
3310          val = ((ir_instruction *)param)->as_rvalue();
3311          val->accept(this);
3312          data2 = this->result;
3313          break;
3314       default:
3315          assert(!"Unexpected intrinsic");
3316          return;
3317       }
3318
3319       inst = emit_asm(ir, opcode, dst, off, data, data2);
3320    }
3321
        /* If a parameter remains after the operands, it must be a constant;
         * its value becomes the buffer_access field below.
         */
3322    param = param->get_next();
3323    ir_constant *access = NULL;
3324    if (!param->is_tail_sentinel()) {
3325       access = ((ir_instruction *)param)->as_constant();
3326       assert(access);
3327    }
3328
3329    /* The emit_asm() might have actually split the op into pieces, e.g. for
3330     * double stores. We have to go back and fix up all the generated ops.
3331     */
3332    unsigned op = inst->op;
3333    do {
3334       inst->resource = buffer;
3335       if (access)
3336          inst->buffer_access = access->value.u[0];
3337
           /* Nothing precedes the first instruction in the list. */
3338       if (inst == this->instructions.get_head_raw())
3339          break;
3340       inst = (glsl_to_tgsi_instruction *)inst->get_prev();
3341
           /* Step over a single UADD sitting between two pieces.
            * NOTE(review): presumably the offset adjustment emitted when the
            * operation was split -- confirm against emit_asm().
            */
3342       if (inst->op == TGSI_OPCODE_UADD) {
3343          if (inst == this->instructions.get_head_raw())
3344             break;
3345          inst = (glsl_to_tgsi_instruction *)inst->get_prev();
3346       }
        /* Keep going only while the previous instruction has the same opcode
         * and has not been given a resource yet.
         */
3347    } while (inst->op == op && inst->resource.file == PROGRAM_UNDEFINED);
3348 }
3349
3350 void
3351 glsl_to_tgsi_visitor::visit_membar_intrinsic(ir_call *ir)
3352 {
3353    switch (ir->callee->intrinsic_id) {
3354    case ir_intrinsic_memory_barrier:
3355       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3356                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
3357                                   TGSI_MEMBAR_ATOMIC_BUFFER |
3358                                   TGSI_MEMBAR_SHADER_IMAGE |
3359                                   TGSI_MEMBAR_SHARED));
3360       break;
3361    case ir_intrinsic_memory_barrier_atomic_counter:
3362       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3363                st_src_reg_for_int(TGSI_MEMBAR_ATOMIC_BUFFER));
3364       break;
3365    case ir_intrinsic_memory_barrier_buffer:
3366       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3367                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER));
3368       break;
3369    case ir_intrinsic_memory_barrier_image:
3370       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3371                st_src_reg_for_int(TGSI_MEMBAR_SHADER_IMAGE));
3372       break;
3373    case ir_intrinsic_memory_barrier_shared:
3374       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3375                st_src_reg_for_int(TGSI_MEMBAR_SHARED));
3376       break;
3377    case ir_intrinsic_group_memory_barrier:
3378       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3379                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
3380                                   TGSI_MEMBAR_ATOMIC_BUFFER |
3381                                   TGSI_MEMBAR_SHADER_IMAGE |
3382                                   TGSI_MEMBAR_SHARED |
3383                                   TGSI_MEMBAR_THREAD_GROUP));
3384       break;
3385    default:
3386       assert(!"Unexpected memory barrier intrinsic");
3387    }
3388 }
3389
3390 void
3391 glsl_to_tgsi_visitor::visit_shared_intrinsic(ir_call *ir)
3392 {
3393    exec_node *param = ir->actual_parameters.get_head();
3394
3395    ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
3396
3397    st_src_reg buffer(PROGRAM_MEMORY, 0, GLSL_TYPE_UINT);
3398
3399    /* Calculate the surface offset */
3400    offset->accept(this);
3401    st_src_reg off = this->result;
3402
3403    st_dst_reg dst = undef_dst;
3404    if (ir->return_deref) {
3405       ir->return_deref->accept(this);
3406       dst = st_dst_reg(this->result);
3407       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3408    }
3409
3410    glsl_to_tgsi_instruction *inst;
3411
3412    if (ir->callee->intrinsic_id == ir_intrinsic_shared_load) {
3413       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
3414       inst->resource = buffer;
3415    } else if (ir->callee->intrinsic_id == ir_intrinsic_shared_store) {
3416       param = param->get_next();
3417       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3418       val->accept(this);
3419
3420       param = param->get_next();
3421       ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
3422       assert(write_mask);
3423       dst.writemask = write_mask->value.u[0];
3424
3425       dst.type = this->result.type;
3426       inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
3427       inst->resource = buffer;
3428    } else {
3429       param = param->get_next();
3430       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3431       val->accept(this);
3432
3433       st_src_reg data = this->result, data2 = undef_src;
3434       unsigned opcode;
3435       switch (ir->callee->intrinsic_id) {
3436       case ir_intrinsic_shared_atomic_add:
3437          opcode = TGSI_OPCODE_ATOMUADD;
3438          break;
3439       case ir_intrinsic_shared_atomic_min:
3440          opcode = TGSI_OPCODE_ATOMIMIN;
3441          break;
3442       case ir_intrinsic_shared_atomic_max:
3443          opcode = TGSI_OPCODE_ATOMIMAX;
3444          break;
3445       case ir_intrinsic_shared_atomic_and:
3446          opcode = TGSI_OPCODE_ATOMAND;
3447          break;
3448       case ir_intrinsic_shared_atomic_or:
3449          opcode = TGSI_OPCODE_ATOMOR;
3450          break;
3451       case ir_intrinsic_shared_atomic_xor:
3452          opcode = TGSI_OPCODE_ATOMXOR;
3453          break;
3454       case ir_intrinsic_shared_atomic_exchange:
3455          opcode = TGSI_OPCODE_ATOMXCHG;
3456          break;
3457       case ir_intrinsic_shared_atomic_comp_swap:
3458          opcode = TGSI_OPCODE_ATOMCAS;
3459          param = param->get_next();
3460          val = ((ir_instruction *)param)->as_rvalue();
3461          val->accept(this);
3462          data2 = this->result;
3463          break;
3464       default:
3465          assert(!"Unexpected intrinsic");
3466          return;
3467       }
3468
3469       inst = emit_asm(ir, opcode, dst, off, data, data2);
3470       inst->resource = buffer;
3471    }
3472 }
3473
3474 static void
3475 get_image_qualifiers(ir_dereference *ir, const glsl_type **type,
3476                      bool *memory_coherent, bool *memory_volatile,
3477                      bool *memory_restrict, unsigned *image_format)
3478 {
3479
3480    switch (ir->ir_type) {
3481    case ir_type_dereference_record: {
3482       ir_dereference_record *deref_record = ir->as_dereference_record();
3483       const glsl_type *struct_type = deref_record->record->type;
3484       int fild_idx = deref_record->field_idx;
3485
3486       *type = struct_type->fields.structure[fild_idx].type->without_array();
3487       *memory_coherent =
3488          struct_type->fields.structure[fild_idx].memory_coherent;
3489       *memory_volatile =
3490          struct_type->fields.structure[fild_idx].memory_volatile;
3491       *memory_restrict =
3492          struct_type->fields.structure[fild_idx].memory_restrict;
3493       *image_format =
3494          struct_type->fields.structure[fild_idx].image_format;
3495       break;
3496    }
3497
3498    case ir_type_dereference_array: {
3499       ir_dereference_array *deref_arr = ir->as_dereference_array();
3500       get_image_qualifiers((ir_dereference *)deref_arr->array, type,
3501                            memory_coherent, memory_volatile, memory_restrict,
3502                            image_format);
3503       break;
3504    }
3505
3506    case ir_type_dereference_variable: {
3507       ir_variable *var = ir->variable_referenced();
3508
3509       *type = var->type->without_array();
3510       *memory_coherent = var->data.memory_coherent;
3511       *memory_volatile = var->data.memory_volatile;
3512       *memory_restrict = var->data.memory_restrict;
3513       *image_format = var->data.image_format;
3514       break;
3515    }
3516
3517    default:
3518       break;
3519    }
3520 }
3521
/**
 * Translate an image intrinsic call (size/samples query, load, store or
 * atomic) into TGSI.
 *
 * The first actual parameter is the image dereference.  For everything but
 * the two queries it is followed by the coordinate, a sample index when the
 * image is multisampled, and up to two further operands.  The emitted
 * instruction is then tagged with the image resource, texture target, image
 * format and memory access flags.
 *
 * \param ir  the intrinsic call being translated
 */
3522 void
3523 glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
3524 {
3525    exec_node *param = ir->actual_parameters.get_head();
3526
3527    ir_dereference *img = (ir_dereference *)param;
3528    const ir_variable *imgvar = img->variable_referenced();
3529    unsigned sampler_array_size = 1, sampler_base = 0;
3530    bool memory_coherent = false, memory_volatile = false, memory_restrict = false;
3531    unsigned image_format = 0;
3532    const glsl_type *type = NULL;
3533
        /* Fetch the image type, memory qualifiers and format from the
         * dereference chain.
         */
3534    get_image_qualifiers(img, &type, &memory_coherent, &memory_volatile,
3535                         &memory_restrict, &image_format);
3536
3537    st_src_reg reladdr;
3538    st_src_reg image(PROGRAM_IMAGE, 0, GLSL_TYPE_UINT);
3539    uint16_t index = 0;
        /* Resolve the image index and any indirect address.  The "opaque"
         * argument is false when the variable contains bindless images.
         */
3540    get_deref_offsets(img, &sampler_array_size, &sampler_base,
3541                      &index, &reladdr, !imgvar->contains_bindless());
3542
3543    image.index = index;
        /* An indirect index becomes a relative address on the image register. */
3544    if (reladdr.file != PROGRAM_UNDEFINED) {
3545       image.reladdr = ralloc(mem_ctx, st_src_reg);
3546       *image.reladdr = reladdr;
3547       emit_arl(ir, sampler_reladdr, reladdr);
3548    }
3549
        /* The destination stays undefined unless the call returns a value, in
         * which case one channel is enabled per component of the return type.
         */
3550    st_dst_reg dst = undef_dst;
3551    if (ir->return_deref) {
3552       ir->return_deref->accept(this);
3553       dst = st_dst_reg(this->result);
3554       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3555    }
3556
3557    glsl_to_tgsi_instruction *inst;
3558
3559    if (ir->callee->intrinsic_id == ir_intrinsic_image_size) {
           /* Size query: RESQ writing the x, y and z channels. */
3560       dst.writemask = WRITEMASK_XYZ;
3561       inst = emit_asm(ir, TGSI_OPCODE_RESQ, dst);
3562    } else if (ir->callee->intrinsic_id == ir_intrinsic_image_samples) {
           /* Sample-count query: RESQ into the w channel of a temporary, then
            * copy that w channel into the destination.
            */
3563       st_src_reg res = get_temp(glsl_type::ivec4_type);
3564       st_dst_reg dstres = st_dst_reg(res);
3565       dstres.writemask = WRITEMASK_W;
3566       inst = emit_asm(ir, TGSI_OPCODE_RESQ, dstres);
3567       res.swizzle = SWIZZLE_WWWW;
3568       emit_asm(ir, TGSI_OPCODE_MOV, dst, res);
3569    } else {
3570       st_src_reg arg1 = undef_src, arg2 = undef_src;
3571       st_src_reg coord;
3572       st_dst_reg coord_dst;
           /* Copy the coordinate argument into an ivec4 temporary, writing only
            * as many channels as the image type has coordinate components.
            */
3573       coord = get_temp(glsl_type::ivec4_type);
3574       coord_dst = st_dst_reg(coord);
3575       coord_dst.writemask = (1 << type->coordinate_components()) - 1;
3576       param = param->get_next();
3577       ((ir_dereference *)param)->accept(this);
3578       emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
           /* Build the source swizzle starting from XXXX.  The cases fall
            * through, so a three-component coordinate selects both Z and Y.
            * NOTE(review): the ORs assume SWIZZLE_X is 0 -- confirm.
            */
3579       coord.swizzle = SWIZZLE_XXXX;
3580       switch (type->coordinate_components()) {
3581       case 4: assert(!"unexpected coord count");
3582       /* fallthrough */
3583       case 3: coord.swizzle |= SWIZZLE_Z << 6;
3584       /* fallthrough */
3585       case 2: coord.swizzle |= SWIZZLE_Y << 3;
3586       }
3587
           /* Multisampled images take a sample index next; it is written to
            * the w channel of the coordinate temporary.
            */
3588       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS) {
3589          param = param->get_next();
3590          ((ir_dereference *)param)->accept(this);
3591          st_src_reg sample = this->result;
3592          sample.swizzle = SWIZZLE_XXXX;
3593          coord_dst.writemask = WRITEMASK_W;
3594          emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample);
3595          coord.swizzle |= SWIZZLE_W << 9;
3596       }
3597
           /* Up to two optional trailing operands are evaluated and passed
            * through as the instruction's extra sources.
            */
3598       param = param->get_next();
3599       if (!param->is_tail_sentinel()) {
3600          ((ir_dereference *)param)->accept(this);
3601          arg1 = this->result;
3602          param = param->get_next();
3603       }
3604
3605       if (!param->is_tail_sentinel()) {
3606          ((ir_dereference *)param)->accept(this);
3607          arg2 = this->result;
3608          param = param->get_next();
3609       }
3610
           /* No intrinsic handled here takes more parameters than that. */
3611       assert(param->is_tail_sentinel());
3612
3613       unsigned opcode;
3614       switch (ir->callee->intrinsic_id) {
3615       case ir_intrinsic_image_load:
3616          opcode = TGSI_OPCODE_LOAD;
3617          break;
3618       case ir_intrinsic_image_store:
3619          opcode = TGSI_OPCODE_STORE;
3620          break;
3621       case ir_intrinsic_image_atomic_add:
3622          opcode = TGSI_OPCODE_ATOMUADD;
3623          break;
3624       case ir_intrinsic_image_atomic_min:
3625          opcode = TGSI_OPCODE_ATOMIMIN;
3626          break;
3627       case ir_intrinsic_image_atomic_max:
3628          opcode = TGSI_OPCODE_ATOMIMAX;
3629          break;
3630       case ir_intrinsic_image_atomic_and:
3631          opcode = TGSI_OPCODE_ATOMAND;
3632          break;
3633       case ir_intrinsic_image_atomic_or:
3634          opcode = TGSI_OPCODE_ATOMOR;
3635          break;
3636       case ir_intrinsic_image_atomic_xor:
3637          opcode = TGSI_OPCODE_ATOMXOR;
3638          break;
3639       case ir_intrinsic_image_atomic_exchange:
3640          opcode = TGSI_OPCODE_ATOMXCHG;
3641          break;
3642       case ir_intrinsic_image_atomic_comp_swap:
3643          opcode = TGSI_OPCODE_ATOMCAS;
3644          break;
3645       default:
3646          assert(!"Unexpected intrinsic");
3647          return;
3648       }
3649
3650       inst = emit_asm(ir, opcode, dst, coord, arg1, arg2);
           /* Stores always get a full XYZW writemask on their destination. */
3651       if (opcode == TGSI_OPCODE_STORE)
3652          inst->dst[0].writemask = WRITEMASK_XYZW;
3653    }
3654
3655    if (imgvar->contains_bindless()) {
           /* Bindless: the evaluated image dereference itself is the resource,
            * read with an XYXY swizzle.
            */
3656       img->accept(this);
3657       inst->resource = this->result;
3658       inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
3659                                              SWIZZLE_X, SWIZZLE_Y);
3660    } else {
           /* Bound image: use the PROGRAM_IMAGE register computed above. */
3661       inst->resource = image;
3662       inst->sampler_array_size = sampler_array_size;
3663       inst->sampler_base = sampler_base;
3664    }
3665
3666    inst->tex_target = type->sampler_index();
3667    inst->image_format = st_mesa_format_to_pipe_format(st_context(ctx),
3668          _mesa_get_shader_image_format(image_format));
3669
        /* Translate the GLSL memory qualifiers into TGSI access flags. */
3670    if (memory_coherent)
3671       inst->buffer_access |= TGSI_MEMORY_COHERENT;
3672    if (memory_restrict)
3673       inst->buffer_access |= TGSI_MEMORY_RESTRICT;
3674    if (memory_volatile)
3675       inst->buffer_access |= TGSI_MEMORY_VOLATILE;
3676 }
3677
3678 void
3679 glsl_to_tgsi_visitor::visit_generic_intrinsic(ir_call *ir, unsigned op)
3680 {
3681    ir->return_deref->accept(this);
3682    st_dst_reg dst = st_dst_reg(this->result);
3683
3684    dst.writemask = u_bit_consecutive(0, ir->return_deref->var->type->vector_elements);
3685
3686    st_src_reg src[4] = { undef_src, undef_src, undef_src, undef_src };
3687    unsigned num_src = 0;
3688    foreach_in_list(ir_rvalue, param, &ir->actual_parameters) {
3689       assert(num_src < ARRAY_SIZE(src));
3690
3691       this->result.file = PROGRAM_UNDEFINED;
3692       param->accept(this);
3693       assert(this->result.file != PROGRAM_UNDEFINED);
3694
3695       src[num_src] = this->result;
3696       num_src++;
3697    }
3698
3699    emit_asm(ir, op, dst, src[0], src[1], src[2], src[3]);
3700 }
3701
3702 void
3703 glsl_to_tgsi_visitor::visit(ir_call *ir)
3704 {
3705    ir_function_signature *sig = ir->callee;
3706
3707    /* Filter out intrinsics */
3708    switch (sig->intrinsic_id) {
3709    case ir_intrinsic_atomic_counter_read:
3710    case ir_intrinsic_atomic_counter_increment:
3711    case ir_intrinsic_atomic_counter_predecrement:
3712    case ir_intrinsic_atomic_counter_add:
3713    case ir_intrinsic_atomic_counter_min:
3714    case ir_intrinsic_atomic_counter_max:
3715    case ir_intrinsic_atomic_counter_and:
3716    case ir_intrinsic_atomic_counter_or:
3717    case ir_intrinsic_atomic_counter_xor:
3718    case ir_intrinsic_atomic_counter_exchange:
3719    case ir_intrinsic_atomic_counter_comp_swap:
3720       visit_atomic_counter_intrinsic(ir);
3721       return;
3722
3723    case ir_intrinsic_ssbo_load:
3724    case ir_intrinsic_ssbo_store:
3725    case ir_intrinsic_ssbo_atomic_add:
3726    case ir_intrinsic_ssbo_atomic_min:
3727    case ir_intrinsic_ssbo_atomic_max:
3728    case ir_intrinsic_ssbo_atomic_and:
3729    case ir_intrinsic_ssbo_atomic_or:
3730    case ir_intrinsic_ssbo_atomic_xor:
3731    case ir_intrinsic_ssbo_atomic_exchange:
3732    case ir_intrinsic_ssbo_atomic_comp_swap:
3733       visit_ssbo_intrinsic(ir);
3734       return;
3735
3736    case ir_intrinsic_memory_barrier:
3737    case ir_intrinsic_memory_barrier_atomic_counter:
3738    case ir_intrinsic_memory_barrier_buffer:
3739    case ir_intrinsic_memory_barrier_image:
3740    case ir_intrinsic_memory_barrier_shared:
3741    case ir_intrinsic_group_memory_barrier:
3742       visit_membar_intrinsic(ir);
3743       return;
3744
3745    case ir_intrinsic_shared_load:
3746    case ir_intrinsic_shared_store:
3747    case ir_intrinsic_shared_atomic_add:
3748    case ir_intrinsic_shared_atomic_min:
3749    case ir_intrinsic_shared_atomic_max:
3750    case ir_intrinsic_shared_atomic_and:
3751    case ir_intrinsic_shared_atomic_or:
3752    case ir_intrinsic_shared_atomic_xor:
3753    case ir_intrinsic_shared_atomic_exchange:
3754    case ir_intrinsic_shared_atomic_comp_swap:
3755       visit_shared_intrinsic(ir);
3756       return;
3757
3758    case ir_intrinsic_image_load:
3759    case ir_intrinsic_image_store:
3760    case ir_intrinsic_image_atomic_add:
3761    case ir_intrinsic_image_atomic_min:
3762    case ir_intrinsic_image_atomic_max:
3763    case ir_intrinsic_image_atomic_and:
3764    case ir_intrinsic_image_atomic_or:
3765    case ir_intrinsic_image_atomic_xor:
3766    case ir_intrinsic_image_atomic_exchange:
3767    case ir_intrinsic_image_atomic_comp_swap:
3768    case ir_intrinsic_image_size:
3769    case ir_intrinsic_image_samples:
3770       visit_image_intrinsic(ir);
3771       return;
3772
3773    case ir_intrinsic_shader_clock:
3774       visit_generic_intrinsic(ir, TGSI_OPCODE_CLOCK);
3775       return;
3776
3777    case ir_intrinsic_vote_all:
3778       visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ALL);
3779       return;
3780    case ir_intrinsic_vote_any:
3781       visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ANY);
3782       return;
3783    case ir_intrinsic_vote_eq:
3784       visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_EQ);
3785       return;
3786    case ir_intrinsic_ballot:
3787       visit_generic_intrinsic(ir, TGSI_OPCODE_BALLOT);
3788       return;
3789    case ir_intrinsic_read_first_invocation:
3790       visit_generic_intrinsic(ir, TGSI_OPCODE_READ_FIRST);
3791       return;
3792    case ir_intrinsic_read_invocation:
3793       visit_generic_intrinsic(ir, TGSI_OPCODE_READ_INVOC);
3794       return;
3795
3796    case ir_intrinsic_invalid:
3797    case ir_intrinsic_generic_load:
3798    case ir_intrinsic_generic_store:
3799    case ir_intrinsic_generic_atomic_add:
3800    case ir_intrinsic_generic_atomic_and:
3801    case ir_intrinsic_generic_atomic_or:
3802    case ir_intrinsic_generic_atomic_xor:
3803    case ir_intrinsic_generic_atomic_min:
3804    case ir_intrinsic_generic_atomic_max:
3805    case ir_intrinsic_generic_atomic_exchange:
3806    case ir_intrinsic_generic_atomic_comp_swap:
3807       unreachable("Invalid intrinsic");
3808    }
3809 }
3810
3811 void
3812 glsl_to_tgsi_visitor::calc_deref_offsets(ir_dereference *tail,
3813                                          unsigned *array_elements,
3814                                          uint16_t *index,
3815                                          st_src_reg *indirect,
3816                                          unsigned *location)
3817 {
3818    switch (tail->ir_type) {
3819    case ir_type_dereference_record: {
3820       ir_dereference_record *deref_record = tail->as_dereference_record();
3821       const glsl_type *struct_type = deref_record->record->type;
3822       int field_index = deref_record->field_idx;
3823
3824       calc_deref_offsets(deref_record->record->as_dereference(), array_elements, index, indirect, location);
3825
3826       assert(field_index >= 0);
3827       *location += struct_type->record_location_offset(field_index);
3828       break;
3829    }
3830
3831    case ir_type_dereference_array: {
3832       ir_dereference_array *deref_arr = tail->as_dereference_array();
3833
3834       void *mem_ctx = ralloc_parent(deref_arr);
3835       ir_constant *array_index =
3836          deref_arr->array_index->constant_expression_value(mem_ctx);
3837
3838       if (!array_index) {
3839          st_src_reg temp_reg;
3840          st_dst_reg temp_dst;
3841
3842          temp_reg = get_temp(glsl_type::uint_type);
3843          temp_dst = st_dst_reg(temp_reg);
3844          temp_dst.writemask = 1;
3845
3846          deref_arr->array_index->accept(this);
3847          if (*array_elements != 1)
3848             emit_asm(NULL, TGSI_OPCODE_MUL, temp_dst, this->result, st_src_reg_for_int(*array_elements));
3849          else
3850             emit_asm(NULL, TGSI_OPCODE_MOV, temp_dst, this->result);
3851
3852          if (indirect->file == PROGRAM_UNDEFINED)
3853             *indirect = temp_reg;
3854          else {
3855             temp_dst = st_dst_reg(*indirect);
3856             temp_dst.writemask = 1;
3857             emit_asm(NULL, TGSI_OPCODE_ADD, temp_dst, *indirect, temp_reg);
3858          }
3859       } else
3860          *index += array_index->value.u[0] * *array_elements;
3861
3862       *array_elements *= deref_arr->array->type->length;
3863
3864       calc_deref_offsets(deref_arr->array->as_dereference(), array_elements, index, indirect, location);
3865       break;
3866    }
3867    default:
3868       break;
3869    }
3870 }
3871
3872 void
3873 glsl_to_tgsi_visitor::get_deref_offsets(ir_dereference *ir,
3874                                         unsigned *array_size,
3875                                         unsigned *base,
3876                                         uint16_t *index,
3877                                         st_src_reg *reladdr,
3878                                         bool opaque)
3879 {
3880    GLuint shader = _mesa_program_enum_to_shader_stage(this->prog->Target);
3881    unsigned location = 0;
3882    ir_variable *var = ir->variable_referenced();
3883
3884    memset(reladdr, 0, sizeof(*reladdr));
3885    reladdr->file = PROGRAM_UNDEFINED;
3886
3887    *base = 0;
3888    *array_size = 1;
3889
3890    assert(var);
3891    location = var->data.location;
3892    calc_deref_offsets(ir, array_size, index, reladdr, &location);
3893
3894    /*
3895     * If we end up with no indirect then adjust the base to the index,
3896     * and set the array size to 1.
3897     */
3898    if (reladdr->file == PROGRAM_UNDEFINED) {
3899       *base = *index;
3900       *array_size = 1;
3901    }
3902
3903    if (opaque) {
3904       assert(location != 0xffffffff);
3905       *base += this->shader_program->data->UniformStorage[location].opaque[shader].index;
3906       *index += this->shader_program->data->UniformStorage[location].opaque[shader].index;
3907    }
3908 }
3909
3910 st_src_reg
3911 glsl_to_tgsi_visitor::canonicalize_gather_offset(st_src_reg offset)
3912 {
3913    if (offset.reladdr || offset.reladdr2) {
3914       st_src_reg tmp = get_temp(glsl_type::ivec2_type);
3915       st_dst_reg tmp_dst = st_dst_reg(tmp);
3916       tmp_dst.writemask = WRITEMASK_XY;
3917       emit_asm(NULL, TGSI_OPCODE_MOV, tmp_dst, offset);
3918       return tmp;
3919    }
3920
3921    return offset;
3922 }
3923
3924 void
3925 glsl_to_tgsi_visitor::visit(ir_texture *ir)
3926 {
3927    st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy;
3928    st_src_reg offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
3929    st_src_reg levels_src, reladdr;
3930    st_dst_reg result_dst, coord_dst, cube_sc_dst;
3931    glsl_to_tgsi_instruction *inst = NULL;
3932    unsigned opcode = TGSI_OPCODE_NOP;
3933    const glsl_type *sampler_type = ir->sampler->type;
3934    unsigned sampler_array_size = 1, sampler_base = 0;
3935    bool is_cube_array = false, is_cube_shadow = false;
3936    ir_variable *var = ir->sampler->variable_referenced();
3937    unsigned i;
3938
3939    /* if we are a cube array sampler or a cube shadow */
3940    if (sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
3941       is_cube_array = sampler_type->sampler_array;
3942       is_cube_shadow = sampler_type->sampler_shadow;
3943    }
3944
3945    if (ir->coordinate) {
3946       ir->coordinate->accept(this);
3947
3948       /* Put our coords in a temp.  We'll need to modify them for shadow,
3949        * projection, or LOD, so the only case we'd use it as-is is if
3950        * we're doing plain old texturing.  The optimization passes on
3951        * glsl_to_tgsi_visitor should handle cleaning up our mess in that case.
3952        */
3953       coord = get_temp(glsl_type::vec4_type);
3954       coord_dst = st_dst_reg(coord);
3955       coord_dst.writemask = (1 << ir->coordinate->type->vector_elements) - 1;
3956       emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
3957    }
3958
3959    if (ir->projector) {
3960       ir->projector->accept(this);
3961       projector = this->result;
3962    }
3963
3964    /* Storage for our result.  Ideally for an assignment we'd be using
3965     * the actual storage for the result here, instead.
3966     */
3967    result_src = get_temp(ir->type);
3968    result_dst = st_dst_reg(result_src);
3969    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
3970
3971    switch (ir->op) {
3972    case ir_tex:
3973       opcode = (is_cube_array && ir->shadow_comparator) ? TGSI_OPCODE_TEX2 : TGSI_OPCODE_TEX;
3974       if (ir->offset) {
3975          ir->offset->accept(this);
3976          offset[0] = this->result;
3977       }
3978       break;
3979    case ir_txb:
3980       if (is_cube_array || is_cube_shadow) {
3981          opcode = TGSI_OPCODE_TXB2;
3982       }
3983       else {
3984          opcode = TGSI_OPCODE_TXB;
3985       }
3986       ir->lod_info.bias->accept(this);
3987       lod_info = this->result;
3988       if (ir->offset) {
3989          ir->offset->accept(this);
3990          offset[0] = this->result;
3991       }
3992       break;
3993    case ir_txl:
3994       if (this->has_tex_txf_lz && ir->lod_info.lod->is_zero()) {
3995          opcode = TGSI_OPCODE_TEX_LZ;
3996       } else {
3997          opcode = is_cube_array ? TGSI_OPCODE_TXL2 : TGSI_OPCODE_TXL;
3998          ir->lod_info.lod->accept(this);
3999          lod_info = this->result;
4000       }
4001       if (ir->offset) {
4002          ir->offset->accept(this);
4003          offset[0] = this->result;
4004       }
4005       break;
4006    case ir_txd:
4007       opcode = TGSI_OPCODE_TXD;
4008       ir->lod_info.grad.dPdx->accept(this);
4009       dx = this->result;
4010       ir->lod_info.grad.dPdy->accept(this);
4011       dy = this->result;
4012       if (ir->offset) {
4013          ir->offset->accept(this);
4014          offset[0] = this->result;
4015       }
4016       break;
4017    case ir_txs:
4018       opcode = TGSI_OPCODE_TXQ;
4019       ir->lod_info.lod->accept(this);
4020       lod_info = this->result;
4021       break;
4022    case ir_query_levels:
4023       opcode = TGSI_OPCODE_TXQ;
4024       lod_info = undef_src;
4025       levels_src = get_temp(ir->type);
4026       break;
4027    case ir_txf:
4028       if (this->has_tex_txf_lz && ir->lod_info.lod->is_zero()) {
4029          opcode = TGSI_OPCODE_TXF_LZ;
4030       } else {
4031          opcode = TGSI_OPCODE_TXF;
4032          ir->lod_info.lod->accept(this);
4033          lod_info = this->result;
4034       }
4035       if (ir->offset) {
4036          ir->offset->accept(this);
4037          offset[0] = this->result;
4038       }
4039       break;
4040    case ir_txf_ms:
4041       opcode = TGSI_OPCODE_TXF;
4042       ir->lod_info.sample_index->accept(this);
4043       sample_index = this->result;
4044       break;
4045    case ir_tg4:
4046       opcode = TGSI_OPCODE_TG4;
4047       ir->lod_info.component->accept(this);
4048       component = this->result;
4049       if (ir->offset) {
4050          ir->offset->accept(this);
4051          if (ir->offset->type->is_array()) {
4052             const glsl_type *elt_type = ir->offset->type->fields.array;
4053             for (i = 0; i < ir->offset->type->length; i++) {
4054                offset[i] = this->result;
4055                offset[i].index += i * type_size(elt_type);
4056                offset[i].type = elt_type->base_type;
4057                offset[i].swizzle = swizzle_for_size(elt_type->vector_elements);
4058                offset[i] = canonicalize_gather_offset(offset[i]);
4059             }
4060          } else {
4061             offset[0] = canonicalize_gather_offset(this->result);
4062          }
4063       }
4064       break;
4065    case ir_lod:
4066       opcode = TGSI_OPCODE_LODQ;
4067       break;
4068    case ir_texture_samples:
4069       opcode = TGSI_OPCODE_TXQS;
4070       break;
4071    case ir_samples_identical:
4072       unreachable("Unexpected ir_samples_identical opcode");
4073    }
4074
4075    if (ir->projector) {
4076       if (opcode == TGSI_OPCODE_TEX) {
4077          /* Slot the projector in as the last component of the coord. */
4078          coord_dst.writemask = WRITEMASK_W;
4079          emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, projector);
4080          coord_dst.writemask = WRITEMASK_XYZW;
4081          opcode = TGSI_OPCODE_TXP;
4082       } else {
4083          st_src_reg coord_w = coord;
4084          coord_w.swizzle = SWIZZLE_WWWW;
4085
4086          /* For the other TEX opcodes there's no projective version
4087           * since the last slot is taken up by LOD info.  Do the
4088           * projective divide now.
4089           */
4090          coord_dst.writemask = WRITEMASK_W;
4091          emit_asm(ir, TGSI_OPCODE_RCP, coord_dst, projector);
4092
4093          /* In the case where we have to project the coordinates "by hand,"
4094           * the shadow comparator value must also be projected.
4095           */
4096          st_src_reg tmp_src = coord;
4097          if (ir->shadow_comparator) {
4098             /* Slot the shadow value in as the second to last component of the
4099              * coord.
4100              */
4101             ir->shadow_comparator->accept(this);
4102
4103             tmp_src = get_temp(glsl_type::vec4_type);
4104             st_dst_reg tmp_dst = st_dst_reg(tmp_src);
4105
4106             /* Projective division not allowed for array samplers. */
4107             assert(!sampler_type->sampler_array);
4108
4109             tmp_dst.writemask = WRITEMASK_Z;
4110             emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);
4111
4112             tmp_dst.writemask = WRITEMASK_XY;
4113             emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
4114          }
4115
4116          coord_dst.writemask = WRITEMASK_XYZ;
4117          emit_asm(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);
4118
4119          coord_dst.writemask = WRITEMASK_XYZW;
4120          coord.swizzle = SWIZZLE_XYZW;
4121       }
4122    }
4123
4124    /* If projection is done and the opcode is not TGSI_OPCODE_TXP, then the shadow
4125     * comparator was put in the correct place (and projected) by the code,
4126     * above, that handles by-hand projection.
4127     */
4128    if (ir->shadow_comparator && (!ir->projector || opcode == TGSI_OPCODE_TXP)) {
4129       /* Slot the shadow value in as the second to last component of the
4130        * coord.
4131        */
4132       ir->shadow_comparator->accept(this);
4133
4134       if (is_cube_array) {
4135          cube_sc = get_temp(glsl_type::float_type);
4136          cube_sc_dst = st_dst_reg(cube_sc);
4137          cube_sc_dst.writemask = WRITEMASK_X;
4138          emit_asm(ir, TGSI_OPCODE_MOV, cube_sc_dst, this->result);
4139          cube_sc_dst.writemask = WRITEMASK_X;
4140       }
4141       else {
4142          if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_2D &&
4143               sampler_type->sampler_array) ||
4144              sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
4145             coord_dst.writemask = WRITEMASK_W;
4146          } else {
4147             coord_dst.writemask = WRITEMASK_Z;
4148          }
4149          emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
4150          coord_dst.writemask = WRITEMASK_XYZW;
4151       }
4152    }
4153
4154    if (ir->op == ir_txf_ms) {
4155       coord_dst.writemask = WRITEMASK_W;
4156       emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample_index);
4157       coord_dst.writemask = WRITEMASK_XYZW;
4158    } else if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB ||
4159        opcode == TGSI_OPCODE_TXF) {
4160       /* TGSI stores LOD or LOD bias in the last channel of the coords. */
4161       coord_dst.writemask = WRITEMASK_W;
4162       emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
4163       coord_dst.writemask = WRITEMASK_XYZW;
4164    }
4165
4166    st_src_reg sampler(PROGRAM_SAMPLER, 0, GLSL_TYPE_UINT);
4167
4168    uint16_t index = 0;
4169    get_deref_offsets(ir->sampler, &sampler_array_size, &sampler_base,
4170                      &index, &reladdr, !var->contains_bindless());
4171
4172    sampler.index = index;
4173    if (reladdr.file != PROGRAM_UNDEFINED) {
4174       sampler.reladdr = ralloc(mem_ctx, st_src_reg);
4175       *sampler.reladdr = reladdr;
4176       emit_arl(ir, sampler_reladdr, reladdr);
4177    }
4178
4179    if (opcode == TGSI_OPCODE_TXD)
4180       inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
4181    else if (opcode == TGSI_OPCODE_TXQ) {
4182       if (ir->op == ir_query_levels) {
4183          /* the level is stored in W */
4184          inst = emit_asm(ir, opcode, st_dst_reg(levels_src), lod_info);
4185          result_dst.writemask = WRITEMASK_X;
4186          levels_src.swizzle = SWIZZLE_WWWW;
4187          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, levels_src);
4188       } else
4189          inst = emit_asm(ir, opcode, result_dst, lod_info);
4190    } else if (opcode == TGSI_OPCODE_TXQS) {
4191       inst = emit_asm(ir, opcode, result_dst);
4192    } else if (opcode == TGSI_OPCODE_TXL2 || opcode == TGSI_OPCODE_TXB2) {
4193       inst = emit_asm(ir, opcode, result_dst, coord, lod_info);
4194    } else if (opcode == TGSI_OPCODE_TEX2) {
4195       inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
4196    } else if (opcode == TGSI_OPCODE_TG4) {
4197       if (is_cube_array && ir->shadow_comparator) {
4198          inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
4199       } else {
4200          inst = emit_asm(ir, opcode, result_dst, coord, component);
4201       }
4202    } else
4203       inst = emit_asm(ir, opcode, result_dst, coord);
4204
4205    if (ir->shadow_comparator)
4206       inst->tex_shadow = GL_TRUE;
4207
4208    if (var->contains_bindless()) {
4209       ir->sampler->accept(this);
4210       inst->resource = this->result;
4211       inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
4212                                              SWIZZLE_X, SWIZZLE_Y);
4213    } else {
4214       inst->resource = sampler;
4215       inst->sampler_array_size = sampler_array_size;
4216       inst->sampler_base = sampler_base;
4217    }
4218
4219    if (ir->offset) {
4220       if (!inst->tex_offsets)
4221          inst->tex_offsets = rzalloc_array(inst, st_src_reg, MAX_GLSL_TEXTURE_OFFSET);
4222
4223       for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET && offset[i].file != PROGRAM_UNDEFINED; i++)
4224          inst->tex_offsets[i] = offset[i];
4225       inst->tex_offset_num_offset = i;
4226    }
4227
4228    inst->tex_target = sampler_type->sampler_index();
4229    inst->tex_type = ir->type->base_type;
4230
4231    this->result = result_src;
4232 }
4233
4234 void
4235 glsl_to_tgsi_visitor::visit(ir_return *ir)
4236 {
4237    assert(!ir->get_value());
4238
4239    emit_asm(ir, TGSI_OPCODE_RET);
4240 }
4241
4242 void
4243 glsl_to_tgsi_visitor::visit(ir_discard *ir)
4244 {
4245    if (ir->condition) {
4246       ir->condition->accept(this);
4247       st_src_reg condition = this->result;
4248
4249       /* Convert the bool condition to a float so we can negate. */
4250       if (native_integers) {
4251          st_src_reg temp = get_temp(ir->condition->type);
4252          emit_asm(ir, TGSI_OPCODE_AND, st_dst_reg(temp),
4253               condition, st_src_reg_for_float(1.0));
4254          condition = temp;
4255       }
4256
4257       condition.negate = ~condition.negate;
4258       emit_asm(ir, TGSI_OPCODE_KILL_IF, undef_dst, condition);
4259    } else {
4260       /* unconditional kil */
4261       emit_asm(ir, TGSI_OPCODE_KILL);
4262    }
4263 }
4264
4265 void
4266 glsl_to_tgsi_visitor::visit(ir_if *ir)
4267 {
4268    unsigned if_opcode;
4269    glsl_to_tgsi_instruction *if_inst;
4270
4271    ir->condition->accept(this);
4272    assert(this->result.file != PROGRAM_UNDEFINED);
4273
4274    if_opcode = native_integers ? TGSI_OPCODE_UIF : TGSI_OPCODE_IF;
4275
4276    if_inst = emit_asm(ir->condition, if_opcode, undef_dst, this->result);
4277
4278    this->instructions.push_tail(if_inst);
4279
4280    visit_exec_list(&ir->then_instructions, this);
4281
4282    if (!ir->else_instructions.is_empty()) {
4283       emit_asm(ir->condition, TGSI_OPCODE_ELSE);
4284       visit_exec_list(&ir->else_instructions, this);
4285    }
4286
4287    if_inst = emit_asm(ir->condition, TGSI_OPCODE_ENDIF);
4288 }
4289
4290
4291 void
4292 glsl_to_tgsi_visitor::visit(ir_emit_vertex *ir)
4293 {
4294    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
4295
4296    ir->stream->accept(this);
4297    emit_asm(ir, TGSI_OPCODE_EMIT, undef_dst, this->result);
4298 }
4299
4300 void
4301 glsl_to_tgsi_visitor::visit(ir_end_primitive *ir)
4302 {
4303    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
4304
4305    ir->stream->accept(this);
4306    emit_asm(ir, TGSI_OPCODE_ENDPRIM, undef_dst, this->result);
4307 }
4308
4309 void
4310 glsl_to_tgsi_visitor::visit(ir_barrier *ir)
4311 {
4312    assert(this->prog->Target == GL_TESS_CONTROL_PROGRAM_NV ||
4313           this->prog->Target == GL_COMPUTE_PROGRAM_NV);
4314
4315    emit_asm(ir, TGSI_OPCODE_BARRIER);
4316 }
4317
4318 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
4319 {
4320    STATIC_ASSERT(sizeof(samplers_used) * 8 >= PIPE_MAX_SAMPLERS);
4321
4322    result.file = PROGRAM_UNDEFINED;
4323    next_temp = 1;
4324    array_sizes = NULL;
4325    max_num_arrays = 0;
4326    next_array = 0;
4327    num_inputs = 0;
4328    num_outputs = 0;
4329    num_input_arrays = 0;
4330    num_output_arrays = 0;
4331    num_immediates = 0;
4332    num_address_regs = 0;
4333    samplers_used = 0;
4334    images_used = 0;
4335    indirect_addr_consts = false;
4336    wpos_transform_const = -1;
4337    glsl_version = 0;
4338    native_integers = false;
4339    mem_ctx = ralloc_context(NULL);
4340    ctx = NULL;
4341    prog = NULL;
4342    precise = 0;
4343    shader_program = NULL;
4344    shader = NULL;
4345    options = NULL;
4346    have_sqrt = false;
4347    have_fma = false;
4348    use_shared_memory = false;
4349    has_tex_txf_lz = false;
4350    variables = NULL;
4351 }
4352
4353 static void var_destroy(struct hash_entry *entry)
4354 {
4355    variable_storage *storage = (variable_storage *)entry->data;
4356
4357    delete storage;
4358 }
4359
4360 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
4361 {
4362    _mesa_hash_table_destroy(variables, var_destroy);
4363    free(array_sizes);
4364    ralloc_free(mem_ctx);
4365 }
4366
4367 extern "C" void free_glsl_to_tgsi_visitor(glsl_to_tgsi_visitor *v)
4368 {
4369    delete v;
4370 }
4371
4372
4373 /**
4374  * Count resources used by the given gpu program (number of texture
4375  * samplers, etc).
4376  */
4377 static void
4378 count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
4379 {
4380    v->samplers_used = 0;
4381    v->images_used = 0;
4382
4383    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
4384       if (inst->info->is_tex) {
4385          for (int i = 0; i < inst->sampler_array_size; i++) {
4386             unsigned idx = inst->sampler_base + i;
4387             v->samplers_used |= 1u << idx;
4388
4389             debug_assert(idx < (int)ARRAY_SIZE(v->sampler_types));
4390             v->sampler_types[idx] = inst->tex_type;
4391             v->sampler_targets[idx] =
4392                st_translate_texture_target(inst->tex_target, inst->tex_shadow);
4393
4394             if (inst->tex_shadow) {
4395                prog->ShadowSamplers |= 1 << (inst->resource.index + i);
4396             }
4397          }
4398       }
4399
4400       if (inst->tex_target == TEXTURE_EXTERNAL_INDEX)
4401          prog->ExternalSamplersUsed |= 1 << inst->resource.index;
4402
4403       if (inst->resource.file != PROGRAM_UNDEFINED && (
4404                 is_resource_instruction(inst->op) ||
4405                 inst->op == TGSI_OPCODE_STORE)) {
4406          if (inst->resource.file == PROGRAM_MEMORY) {
4407             v->use_shared_memory = true;
4408          } else if (inst->resource.file == PROGRAM_IMAGE) {
4409             for (int i = 0; i < inst->sampler_array_size; i++) {
4410                unsigned idx = inst->sampler_base + i;
4411                v->images_used |= 1 << idx;
4412                v->image_targets[idx] =
4413                   st_translate_texture_target(inst->tex_target, false);
4414                v->image_formats[idx] = inst->image_format;
4415             }
4416          }
4417       }
4418    }
4419    prog->SamplersUsed = v->samplers_used;
4420
4421    if (v->shader_program != NULL)
4422       _mesa_update_shader_textures_used(v->shader_program, prog);
4423 }
4424
4425 /**
4426  * Returns the mask of channels (bitmask of WRITEMASK_X,Y,Z,W) which
4427  * are read from the given src in this instruction
4428  */
4429 static int
4430 get_src_arg_mask(st_dst_reg dst, st_src_reg src)
4431 {
4432    int read_mask = 0, comp;
4433
4434    /* Now, given the src swizzle and the written channels, find which
4435     * components are actually read
4436     */
4437    for (comp = 0; comp < 4; ++comp) {
4438       const unsigned coord = GET_SWZ(src.swizzle, comp);
4439       assert(coord < 4);
4440       if (dst.writemask & (1 << comp) && coord <= SWIZZLE_W)
4441          read_mask |= 1 << coord;
4442    }
4443
4444    return read_mask;
4445 }
4446
4447 /**
4448  * This pass replaces CMP T0, T1 T2 T0 with MOV T0, T2 when the CMP
4449  * instruction is the first instruction to write to register T0.  There are
4450  * several lowering passes done in GLSL IR (e.g. branches and
4451  * relative addressing) that create a large number of conditional assignments
4452  * that ir_to_mesa converts to CMP instructions like the one mentioned above.
4453  *
4454  * Here is why this conversion is safe:
4455  * CMP T0, T1 T2 T0 can be expanded to:
4456  * if (T1 < 0.0)
4457  *   MOV T0, T2;
4458  * else
4459  *   MOV T0, T0;
4460  *
4461  * If (T1 < 0.0) evaluates to true then our replacement MOV T0, T2 is the same
4462  * as the original program.  If (T1 < 0.0) evaluates to false, executing
4463  * MOV T0, T0 will store a garbage value in T0 since T0 is uninitialized.
4464  * Therefore, it doesn't matter that we are replacing MOV T0, T0 with MOV T0, T2
4465  * because any instruction that was going to read from T0 after this was going
4466  * to read a garbage value anyway.
4467  */
4468 void
4469 glsl_to_tgsi_visitor::simplify_cmp(void)
4470 {
4471    int tempWritesSize = 0;
4472    unsigned *tempWrites = NULL;
4473    unsigned outputWrites[VARYING_SLOT_TESS_MAX];
4474
4475    memset(outputWrites, 0, sizeof(outputWrites));
4476
4477    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4478       unsigned prevWriteMask = 0;
4479
4480       /* Give up if we encounter relative addressing or flow control. */
4481       if (inst->dst[0].reladdr || inst->dst[0].reladdr2 ||
4482           inst->dst[1].reladdr || inst->dst[1].reladdr2 ||
4483           inst->info->is_branch ||
4484           inst->op == TGSI_OPCODE_CONT ||
4485           inst->op == TGSI_OPCODE_END ||
4486           inst->op == TGSI_OPCODE_RET) {
4487          break;
4488       }
4489
4490       if (inst->dst[0].file == PROGRAM_OUTPUT) {
4491          assert(inst->dst[0].index < (signed)ARRAY_SIZE(outputWrites));
4492          prevWriteMask = outputWrites[inst->dst[0].index];
4493          outputWrites[inst->dst[0].index] |= inst->dst[0].writemask;
4494       } else if (inst->dst[0].file == PROGRAM_TEMPORARY) {
4495          if (inst->dst[0].index >= tempWritesSize) {
4496             const int inc = 4096;
4497
4498             tempWrites = (unsigned*)
4499                          realloc(tempWrites,
4500                                  (tempWritesSize + inc) * sizeof(unsigned));
4501             if (!tempWrites)
4502                return;
4503
4504             memset(tempWrites + tempWritesSize, 0, inc * sizeof(unsigned));
4505             tempWritesSize += inc;
4506          }
4507
4508          prevWriteMask = tempWrites[inst->dst[0].index];
4509          tempWrites[inst->dst[0].index] |= inst->dst[0].writemask;
4510       } else
4511          continue;
4512
4513       /* For a CMP to be considered a conditional write, the destination
4514        * register and source register two must be the same. */
4515       if (inst->op == TGSI_OPCODE_CMP
4516           && !(inst->dst[0].writemask & prevWriteMask)
4517           && inst->src[2].file == inst->dst[0].file
4518           && inst->src[2].index == inst->dst[0].index
4519           && inst->dst[0].writemask == get_src_arg_mask(inst->dst[0], inst->src[2])) {
4520
4521          inst->op = TGSI_OPCODE_MOV;
4522          inst->info = tgsi_get_opcode_info(inst->op);
4523          inst->src[0] = inst->src[1];
4524       }
4525    }
4526
4527    free(tempWrites);
4528 }
4529
4530 /* Replaces all references to a temporary register index with another index. */
4531 void
4532 glsl_to_tgsi_visitor::rename_temp_registers(struct rename_reg_pair *renames)
4533 {
4534    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4535       unsigned j;
4536       for (j = 0; j < num_inst_src_regs(inst); j++) {
4537          if (inst->src[j].file == PROGRAM_TEMPORARY) {
4538             int old_idx = inst->src[j].index;
4539             if (renames[old_idx].valid)
4540                inst->src[j].index = renames[old_idx].new_reg;
4541          }
4542       }
4543
4544       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4545          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY) {
4546             int old_idx = inst->tex_offsets[j].index;
4547             if (renames[old_idx].valid)
4548                inst->tex_offsets[j].index = renames[old_idx].new_reg;
4549          }
4550       }
4551
4552       if (inst->resource.file == PROGRAM_TEMPORARY) {
4553          int old_idx = inst->resource.index;
4554          if (renames[old_idx].valid)
4555             inst->resource.index = renames[old_idx].new_reg;
4556       }
4557
4558       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4559          if (inst->dst[j].file == PROGRAM_TEMPORARY) {
4560             int old_idx = inst->dst[j].index;
4561             if (renames[old_idx].valid)
4562                inst->dst[j].index = renames[old_idx].new_reg;}
4563       }
4564    }
4565 }
4566
4567 void
4568 glsl_to_tgsi_visitor::get_first_temp_write(int *first_writes)
4569 {
4570    int depth = 0; /* loop depth */
4571    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4572    unsigned i = 0, j;
4573
4574    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4575       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4576          if (inst->dst[j].file == PROGRAM_TEMPORARY) {
4577             if (first_writes[inst->dst[j].index] == -1)
4578                 first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
4579          }
4580       }
4581
4582       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4583          if(depth++ == 0)
4584             loop_start = i;
4585       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4586          if (--depth == 0)
4587             loop_start = -1;
4588       }
4589       assert(depth >= 0);
4590       i++;
4591    }
4592 }
4593
4594 void
4595 glsl_to_tgsi_visitor::get_first_temp_read(int *first_reads)
4596 {
4597    int depth = 0; /* loop depth */
4598    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4599    unsigned i = 0, j;
4600
4601    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4602       for (j = 0; j < num_inst_src_regs(inst); j++) {
4603          if (inst->src[j].file == PROGRAM_TEMPORARY) {
4604             if (first_reads[inst->src[j].index] == -1)
4605                 first_reads[inst->src[j].index] = (depth == 0) ? i : loop_start;
4606          }
4607       }
4608       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4609          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY) {
4610             if (first_reads[inst->tex_offsets[j].index] == -1)
4611                first_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : loop_start;
4612          }
4613       }
4614       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4615          if(depth++ == 0)
4616             loop_start = i;
4617       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4618          if (--depth == 0)
4619             loop_start = -1;
4620       }
4621       assert(depth >= 0);
4622       i++;
4623    }
4624 }
4625
4626 void
4627 glsl_to_tgsi_visitor::get_last_temp_read_first_temp_write(int *last_reads, int *first_writes)
4628 {
4629    int depth = 0; /* loop depth */
4630    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4631    unsigned i = 0, j;
4632    int k;
4633    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4634       for (j = 0; j < num_inst_src_regs(inst); j++) {
4635          if (inst->src[j].file == PROGRAM_TEMPORARY)
4636             last_reads[inst->src[j].index] = (depth == 0) ? i : -2;
4637       }
4638       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4639          if (inst->dst[j].file == PROGRAM_TEMPORARY) {
4640             if (first_writes[inst->dst[j].index] == -1)
4641                first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
4642             last_reads[inst->dst[j].index] = (depth == 0) ? i : -2;
4643          }
4644       }
4645       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4646          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
4647             last_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : -2;
4648       }
4649       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4650          if(depth++ == 0)
4651             loop_start = i;
4652       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4653          if (--depth == 0) {
4654             loop_start = -1;
4655             for (k = 0; k < this->next_temp; k++) {
4656                if (last_reads[k] == -2) {
4657                   last_reads[k] = i;
4658                }
4659             }
4660          }
4661       }
4662       assert(depth >= 0);
4663       i++;
4664    }
4665 }
4666
4667 void
4668 glsl_to_tgsi_visitor::get_last_temp_write(int *last_writes)
4669 {
4670    int depth = 0; /* loop depth */
4671    int i = 0, k;
4672    unsigned j;
4673
4674    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4675       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4676          if (inst->dst[j].file == PROGRAM_TEMPORARY)
4677             last_writes[inst->dst[j].index] = (depth == 0) ? i : -2;
4678       }
4679
4680       if (inst->op == TGSI_OPCODE_BGNLOOP)
4681          depth++;
4682       else if (inst->op == TGSI_OPCODE_ENDLOOP)
4683          if (--depth == 0) {
4684             for (k = 0; k < this->next_temp; k++) {
4685                if (last_writes[k] == -2) {
4686                   last_writes[k] = i;
4687                }
4688             }
4689          }
4690       assert(depth >= 0);
4691       i++;
4692    }
4693 }
4694
4695 /*
4696  * On a basic block basis, tracks available PROGRAM_TEMPORARY register
4697  * channels for copy propagation and updates following instructions to
4698  * use the original versions.
4699  *
4700  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
4701  * will occur.  As an example, a TXP production before this pass:
4702  *
4703  * 0: MOV TEMP[1], INPUT[4].xyyy;
4704  * 1: MOV TEMP[1].w, INPUT[4].wwww;
4705  * 2: TXP TEMP[2], TEMP[1], texture[0], 2D;
4706  *
4707  * and after:
4708  *
4709  * 0: MOV TEMP[1], INPUT[4].xyyy;
4710  * 1: MOV TEMP[1].w, INPUT[4].wwww;
4711  * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
4712  *
4713  * which allows for dead code elimination on TEMP[1]'s writes.
4714  */
4715 void
4716 glsl_to_tgsi_visitor::copy_propagate(void)
4717 {
4718    glsl_to_tgsi_instruction **acp = rzalloc_array(mem_ctx,
4719                                                   glsl_to_tgsi_instruction *,
4720                                                   this->next_temp * 4);
4721    int *acp_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
4722    int level = 0;
4723
4724    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4725       assert(inst->dst[0].file != PROGRAM_TEMPORARY
4726              || inst->dst[0].index < this->next_temp);
4727
4728       /* First, do any copy propagation possible into the src regs. */
4729       for (int r = 0; r < 3; r++) {
4730          glsl_to_tgsi_instruction *first = NULL;
4731          bool good = true;
4732          int acp_base = inst->src[r].index * 4;
4733
4734          if (inst->src[r].file != PROGRAM_TEMPORARY ||
4735              inst->src[r].reladdr ||
4736              inst->src[r].reladdr2)
4737             continue;
4738
4739          /* See if we can find entries in the ACP consisting of MOVs
4740           * from the same src register for all the swizzled channels
4741           * of this src register reference.
4742           */
4743          for (int i = 0; i < 4; i++) {
4744             int src_chan = GET_SWZ(inst->src[r].swizzle, i);
4745             glsl_to_tgsi_instruction *copy_chan = acp[acp_base + src_chan];
4746
4747             if (!copy_chan) {
4748                good = false;
4749                break;
4750             }
4751
4752             assert(acp_level[acp_base + src_chan] <= level);
4753
4754             if (!first) {
4755                first = copy_chan;
4756             } else {
4757                if (first->src[0].file != copy_chan->src[0].file ||
4758                    first->src[0].index != copy_chan->src[0].index ||
4759                    first->src[0].double_reg2 != copy_chan->src[0].double_reg2 ||
4760                    first->src[0].index2D != copy_chan->src[0].index2D) {
4761                   good = false;
4762                   break;
4763                }
4764             }
4765          }
4766
4767          if (good) {
4768             /* We've now validated that we can copy-propagate to
4769              * replace this src register reference.  Do it.
4770              */
4771             inst->src[r].file = first->src[0].file;
4772             inst->src[r].index = first->src[0].index;
4773             inst->src[r].index2D = first->src[0].index2D;
4774             inst->src[r].has_index2 = first->src[0].has_index2;
4775             inst->src[r].double_reg2 = first->src[0].double_reg2;
4776             inst->src[r].array_id = first->src[0].array_id;
4777
4778             int swizzle = 0;
4779             for (int i = 0; i < 4; i++) {
4780                int src_chan = GET_SWZ(inst->src[r].swizzle, i);
4781                glsl_to_tgsi_instruction *copy_inst = acp[acp_base + src_chan];
4782                swizzle |= (GET_SWZ(copy_inst->src[0].swizzle, src_chan) << (3 * i));
4783             }
4784             inst->src[r].swizzle = swizzle;
4785          }
4786       }
4787
4788       switch (inst->op) {
4789       case TGSI_OPCODE_BGNLOOP:
4790       case TGSI_OPCODE_ENDLOOP:
4791          /* End of a basic block, clear the ACP entirely. */
4792          memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
4793          break;
4794
4795       case TGSI_OPCODE_IF:
4796       case TGSI_OPCODE_UIF:
4797          ++level;
4798          break;
4799
4800       case TGSI_OPCODE_ENDIF:
4801       case TGSI_OPCODE_ELSE:
4802          /* Clear all channels written inside the block from the ACP, but
4803           * leaving those that were not touched.
4804           */
4805          for (int r = 0; r < this->next_temp; r++) {
4806             for (int c = 0; c < 4; c++) {
4807                if (!acp[4 * r + c])
4808                   continue;
4809
4810                if (acp_level[4 * r + c] >= level)
4811                   acp[4 * r + c] = NULL;
4812             }
4813          }
4814          if (inst->op == TGSI_OPCODE_ENDIF)
4815             --level;
4816          break;
4817
4818       default:
4819          /* Continuing the block, clear any written channels from
4820           * the ACP.
4821           */
4822          for (int d = 0; d < 2; d++) {
4823             if (inst->dst[d].file == PROGRAM_TEMPORARY && inst->dst[d].reladdr) {
4824                /* Any temporary might be written, so no copy propagation
4825                 * across this instruction.
4826                 */
4827                memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
4828             } else if (inst->dst[d].file == PROGRAM_OUTPUT &&
4829                        inst->dst[d].reladdr) {
4830                /* Any output might be written, so no copy propagation
4831                 * from outputs across this instruction.
4832                 */
4833                for (int r = 0; r < this->next_temp; r++) {
4834                   for (int c = 0; c < 4; c++) {
4835                      if (!acp[4 * r + c])
4836                         continue;
4837
4838                      if (acp[4 * r + c]->src[0].file == PROGRAM_OUTPUT)
4839                         acp[4 * r + c] = NULL;
4840                   }
4841                }
4842             } else if (inst->dst[d].file == PROGRAM_TEMPORARY ||
4843                        inst->dst[d].file == PROGRAM_OUTPUT) {
4844                /* Clear where it's used as dst. */
4845                if (inst->dst[d].file == PROGRAM_TEMPORARY) {
4846                   for (int c = 0; c < 4; c++) {
4847                      if (inst->dst[d].writemask & (1 << c))
4848                         acp[4 * inst->dst[d].index + c] = NULL;
4849                   }
4850                }
4851
4852                /* Clear where it's used as src. */
4853                for (int r = 0; r < this->next_temp; r++) {
4854                   for (int c = 0; c < 4; c++) {
4855                      if (!acp[4 * r + c])
4856                         continue;
4857
4858                      int src_chan = GET_SWZ(acp[4 * r + c]->src[0].swizzle, c);
4859
4860                      if (acp[4 * r + c]->src[0].file == inst->dst[d].file &&
4861                          acp[4 * r + c]->src[0].index == inst->dst[d].index &&
4862                          inst->dst[d].writemask & (1 << src_chan)) {
4863                         acp[4 * r + c] = NULL;
4864                      }
4865                   }
4866                }
4867             }
4868          }
4869          break;
4870       }
4871
4872       /* If this is a copy, add it to the ACP. */
4873       if (inst->op == TGSI_OPCODE_MOV &&
4874           inst->dst[0].file == PROGRAM_TEMPORARY &&
4875           !(inst->dst[0].file == inst->src[0].file &&
4876              inst->dst[0].index == inst->src[0].index) &&
4877           !inst->dst[0].reladdr &&
4878           !inst->dst[0].reladdr2 &&
4879           !inst->saturate &&
4880           inst->src[0].file != PROGRAM_ARRAY &&
4881           !inst->src[0].reladdr &&
4882           !inst->src[0].reladdr2 &&
4883           !inst->src[0].negate &&
4884           !inst->src[0].abs) {
4885          for (int i = 0; i < 4; i++) {
4886             if (inst->dst[0].writemask & (1 << i)) {
4887                acp[4 * inst->dst[0].index + i] = inst;
4888                acp_level[4 * inst->dst[0].index + i] = level;
4889             }
4890          }
4891       }
4892    }
4893
4894    ralloc_free(acp_level);
4895    ralloc_free(acp);
4896 }
4897
4898 /*
4899  * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for dead
4900  * code elimination.
4901  *
4902  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
4903  * will occur.  As an example, a TXP production after copy propagation but
4904  * before this pass:
4905  *
4906  * 0: MOV TEMP[1], INPUT[4].xyyy;
4907  * 1: MOV TEMP[1].w, INPUT[4].wwww;
4908  * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
4909  *
4910  * and after this pass:
4911  *
4912  * 0: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
4913  */
4914 int
4915 glsl_to_tgsi_visitor::eliminate_dead_code(void)
4916 {
4917    glsl_to_tgsi_instruction **writes = rzalloc_array(mem_ctx,
4918                                                      glsl_to_tgsi_instruction *,
4919                                                      this->next_temp * 4);
4920    int *write_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
4921    int level = 0;
4922    int removed = 0;
4923
4924    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4925       assert(inst->dst[0].file != PROGRAM_TEMPORARY
4926              || inst->dst[0].index < this->next_temp);
4927
4928       switch (inst->op) {
4929       case TGSI_OPCODE_BGNLOOP:
4930       case TGSI_OPCODE_ENDLOOP:
4931       case TGSI_OPCODE_CONT:
4932       case TGSI_OPCODE_BRK:
4933          /* End of a basic block, clear the write array entirely.
4934           *
4935           * This keeps us from killing dead code when the writes are
4936           * on either side of a loop, even when the register isn't touched
4937           * inside the loop.  However, glsl_to_tgsi_visitor doesn't seem to emit
4938           * dead code of this type, so it shouldn't make a difference as long as
4939           * the dead code elimination pass in the GLSL compiler does its job.
4940           */
4941          memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
4942          break;
4943
4944       case TGSI_OPCODE_ENDIF:
4945       case TGSI_OPCODE_ELSE:
4946          /* Promote the recorded level of all channels written inside the
4947           * preceding if or else block to the level above the if/else block.
4948           */
4949          for (int r = 0; r < this->next_temp; r++) {
4950             for (int c = 0; c < 4; c++) {
4951                if (!writes[4 * r + c])
4952                   continue;
4953
4954                if (write_level[4 * r + c] == level)
4955                   write_level[4 * r + c] = level-1;
4956             }
4957          }
4958          if(inst->op == TGSI_OPCODE_ENDIF)
4959             --level;
4960          break;
4961
4962       case TGSI_OPCODE_IF:
4963       case TGSI_OPCODE_UIF:
4964          ++level;
4965          /* fallthrough to default case to mark the condition as read */
4966       default:
4967          /* Continuing the block, clear any channels from the write array that
4968           * are read by this instruction.
4969           */
4970          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
4971             if (inst->src[i].file == PROGRAM_TEMPORARY && inst->src[i].reladdr){
4972                /* Any temporary might be read, so no dead code elimination
4973                 * across this instruction.
4974                 */
4975                memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
4976             } else if (inst->src[i].file == PROGRAM_TEMPORARY) {
4977                /* Clear where it's used as src. */
4978                int src_chans = 1 << GET_SWZ(inst->src[i].swizzle, 0);
4979                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 1);
4980                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 2);
4981                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 3);
4982
4983                for (int c = 0; c < 4; c++) {
4984                   if (src_chans & (1 << c))
4985                      writes[4 * inst->src[i].index + c] = NULL;
4986                }
4987             }
4988          }
4989          for (unsigned i = 0; i < inst->tex_offset_num_offset; i++) {
4990             if (inst->tex_offsets[i].file == PROGRAM_TEMPORARY && inst->tex_offsets[i].reladdr){
4991                /* Any temporary might be read, so no dead code elimination
4992                 * across this instruction.
4993                 */
4994                memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
4995             } else if (inst->tex_offsets[i].file == PROGRAM_TEMPORARY) {
4996                /* Clear where it's used as src. */
4997                int src_chans = 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 0);
4998                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 1);
4999                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 2);
5000                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 3);
5001
5002                for (int c = 0; c < 4; c++) {
5003                   if (src_chans & (1 << c))
5004                      writes[4 * inst->tex_offsets[i].index + c] = NULL;
5005                }
5006             }
5007          }
5008
5009          if (inst->resource.file == PROGRAM_TEMPORARY) {
5010             int src_chans;
5011
5012             src_chans  = 1 << GET_SWZ(inst->resource.swizzle, 0);
5013             src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 1);
5014             src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 2);
5015             src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 3);
5016
5017             for (int c = 0; c < 4; c++) {
5018                if (src_chans & (1 << c))
5019                   writes[4 * inst->resource.index + c] = NULL;
5020             }
5021          }
5022
5023          break;
5024       }
5025
5026       /* If this instruction writes to a temporary, add it to the write array.
5027        * If there is already an instruction in the write array for one or more
5028        * of the channels, flag that channel write as dead.
5029        */
5030       for (unsigned i = 0; i < ARRAY_SIZE(inst->dst); i++) {
5031          if (inst->dst[i].file == PROGRAM_TEMPORARY &&
5032              !inst->dst[i].reladdr) {
5033             for (int c = 0; c < 4; c++) {
5034                if (inst->dst[i].writemask & (1 << c)) {
5035                   if (writes[4 * inst->dst[i].index + c]) {
5036                      if (write_level[4 * inst->dst[i].index + c] < level)
5037                         continue;
5038                      else
5039                         writes[4 * inst->dst[i].index + c]->dead_mask |= (1 << c);
5040                   }
5041                   writes[4 * inst->dst[i].index + c] = inst;
5042                   write_level[4 * inst->dst[i].index + c] = level;
5043                }
5044             }
5045          }
5046       }
5047    }
5048
5049    /* Anything still in the write array at this point is dead code. */
5050    for (int r = 0; r < this->next_temp; r++) {
5051       for (int c = 0; c < 4; c++) {
5052          glsl_to_tgsi_instruction *inst = writes[4 * r + c];
5053          if (inst)
5054             inst->dead_mask |= (1 << c);
5055       }
5056    }
5057
5058    /* Now actually remove the instructions that are completely dead and update
5059     * the writemask of other instructions with dead channels.
5060     */
5061    foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
5062       if (!inst->dead_mask || !inst->dst[0].writemask)
5063          continue;
5064       /* No amount of dead masks should remove memory stores */
5065       if (inst->info->is_store)
5066          continue;
5067
5068       if ((inst->dst[0].writemask & ~inst->dead_mask) == 0) {
5069          inst->remove();
5070          delete inst;
5071          removed++;
5072       } else {
5073          if (glsl_base_type_is_64bit(inst->dst[0].type)) {
5074             if (inst->dead_mask == WRITEMASK_XY ||
5075                 inst->dead_mask == WRITEMASK_ZW)
5076                inst->dst[0].writemask &= ~(inst->dead_mask);
5077          } else
5078             inst->dst[0].writemask &= ~(inst->dead_mask);
5079       }
5080    }
5081
5082    ralloc_free(write_level);
5083    ralloc_free(writes);
5084
5085    return removed;
5086 }
5087
5088 /* merge DFRACEXP instructions into one. */
5089 void
5090 glsl_to_tgsi_visitor::merge_two_dsts(void)
5091 {
5092    foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
5093       glsl_to_tgsi_instruction *inst2;
5094       bool merged;
5095       if (num_inst_dst_regs(inst) != 2)
5096          continue;
5097
5098       if (inst->dst[0].file != PROGRAM_UNDEFINED &&
5099           inst->dst[1].file != PROGRAM_UNDEFINED)
5100          continue;
5101
5102       inst2 = (glsl_to_tgsi_instruction *) inst->next;
5103       do {
5104
5105          if (inst->src[0].file == inst2->src[0].file &&
5106              inst->src[0].index == inst2->src[0].index &&
5107              inst->src[0].type == inst2->src[0].type &&
5108              inst->src[0].swizzle == inst2->src[0].swizzle)
5109             break;
5110          inst2 = (glsl_to_tgsi_instruction *) inst2->next;
5111       } while (inst2);
5112
5113       if (!inst2)
5114          continue;
5115       merged = false;
5116       if (inst->dst[0].file == PROGRAM_UNDEFINED) {
5117          merged = true;
5118          inst->dst[0] = inst2->dst[0];
5119       } else if (inst->dst[1].file == PROGRAM_UNDEFINED) {
5120          inst->dst[1] = inst2->dst[1];
5121          merged = true;
5122       }
5123
5124       if (merged) {
5125          inst2->remove();
5126          delete inst2;
5127       }
5128    }
5129 }
5130
5131 /* Merges temporary registers together where possible to reduce the number of
5132  * registers needed to run a program.
5133  *
5134  * Produces optimal code only after copy propagation and dead code elimination
5135  * have been run. */
5136 void
5137 glsl_to_tgsi_visitor::merge_registers(void)
5138 {
5139
5140    struct lifetime *lifetimes =
5141          rzalloc_array(mem_ctx, struct lifetime, this->next_temp);
5142
5143    if (get_temp_registers_required_lifetimes(mem_ctx, &this->instructions,
5144                                              this->next_temp, lifetimes)) {
5145       struct rename_reg_pair *renames =
5146             rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
5147       get_temp_registers_remapping(mem_ctx, this->next_temp, lifetimes, renames);
5148       rename_temp_registers(renames);
5149       ralloc_free(renames);
5150    }
5151
5152    ralloc_free(lifetimes);
5153 }
5154
5155 /* Reassign indices to temporary registers by reusing unused indices created
5156  * by optimization passes. */
5157 void
5158 glsl_to_tgsi_visitor::renumber_registers(void)
5159 {
5160    int i = 0;
5161    int new_index = 0;
5162    int *first_writes = ralloc_array(mem_ctx, int, this->next_temp);
5163    struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
5164
5165    for (i = 0; i < this->next_temp; i++) {
5166       first_writes[i] = -1;
5167    }
5168    get_first_temp_write(first_writes);
5169
5170    for (i = 0; i < this->next_temp; i++) {
5171       if (first_writes[i] < 0) continue;
5172       if (i != new_index) {
5173          renames[i].new_reg = new_index;
5174          renames[i].valid = true;
5175       }
5176       new_index++;
5177    }
5178
5179    rename_temp_registers(renames);
5180    this->next_temp = new_index;
5181    ralloc_free(renames);
5182    ralloc_free(first_writes);
5183 }
5184
5185 /* ------------------------- TGSI conversion stuff -------------------------- */
5186
5187 /**
5188  * Intermediate state used during shader translation.
5189  */
5190 struct st_translate {
5191    struct ureg_program *ureg;
5192
5193    unsigned temps_size;
5194    struct ureg_dst *temps;
5195
5196    struct ureg_dst *arrays;
5197    unsigned num_temp_arrays;
5198    struct ureg_src *constants;
5199    int num_constants;
5200    struct ureg_src *immediates;
5201    int num_immediates;
5202    struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS];
5203    struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
5204    struct ureg_dst address[3];
5205    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
5206    struct ureg_src buffers[PIPE_MAX_SHADER_BUFFERS];
5207    struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
5208    struct ureg_src systemValues[SYSTEM_VALUE_MAX];
5209    struct ureg_src shared_memory;
5210    unsigned *array_sizes;
5211    struct inout_decl *input_decls;
5212    unsigned num_input_decls;
5213    struct inout_decl *output_decls;
5214    unsigned num_output_decls;
5215
5216    const ubyte *inputMapping;
5217    const ubyte *outputMapping;
5218
5219    unsigned procType;  /**< PIPE_SHADER_VERTEX/FRAGMENT */
5220 };
5221
5222 /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
5223 unsigned
5224 _mesa_sysval_to_semantic(unsigned sysval)
5225 {
5226    switch (sysval) {
5227    /* Vertex shader */
5228    case SYSTEM_VALUE_VERTEX_ID:
5229       return TGSI_SEMANTIC_VERTEXID;
5230    case SYSTEM_VALUE_INSTANCE_ID:
5231       return TGSI_SEMANTIC_INSTANCEID;
5232    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
5233       return TGSI_SEMANTIC_VERTEXID_NOBASE;
5234    case SYSTEM_VALUE_BASE_VERTEX:
5235       return TGSI_SEMANTIC_BASEVERTEX;
5236    case SYSTEM_VALUE_BASE_INSTANCE:
5237       return TGSI_SEMANTIC_BASEINSTANCE;
5238    case SYSTEM_VALUE_DRAW_ID:
5239       return TGSI_SEMANTIC_DRAWID;
5240
5241    /* Geometry shader */
5242    case SYSTEM_VALUE_INVOCATION_ID:
5243       return TGSI_SEMANTIC_INVOCATIONID;
5244
5245    /* Fragment shader */
5246    case SYSTEM_VALUE_FRAG_COORD:
5247       return TGSI_SEMANTIC_POSITION;
5248    case SYSTEM_VALUE_FRONT_FACE:
5249       return TGSI_SEMANTIC_FACE;
5250    case SYSTEM_VALUE_SAMPLE_ID:
5251       return TGSI_SEMANTIC_SAMPLEID;
5252    case SYSTEM_VALUE_SAMPLE_POS:
5253       return TGSI_SEMANTIC_SAMPLEPOS;
5254    case SYSTEM_VALUE_SAMPLE_MASK_IN:
5255       return TGSI_SEMANTIC_SAMPLEMASK;
5256    case SYSTEM_VALUE_HELPER_INVOCATION:
5257       return TGSI_SEMANTIC_HELPER_INVOCATION;
5258
5259    /* Tessellation shader */
5260    case SYSTEM_VALUE_TESS_COORD:
5261       return TGSI_SEMANTIC_TESSCOORD;
5262    case SYSTEM_VALUE_VERTICES_IN:
5263       return TGSI_SEMANTIC_VERTICESIN;
5264    case SYSTEM_VALUE_PRIMITIVE_ID:
5265       return TGSI_SEMANTIC_PRIMID;
5266    case SYSTEM_VALUE_TESS_LEVEL_OUTER:
5267       return TGSI_SEMANTIC_TESSOUTER;
5268    case SYSTEM_VALUE_TESS_LEVEL_INNER:
5269       return TGSI_SEMANTIC_TESSINNER;
5270
5271    /* Compute shader */
5272    case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
5273       return TGSI_SEMANTIC_THREAD_ID;
5274    case SYSTEM_VALUE_WORK_GROUP_ID:
5275       return TGSI_SEMANTIC_BLOCK_ID;
5276    case SYSTEM_VALUE_NUM_WORK_GROUPS:
5277       return TGSI_SEMANTIC_GRID_SIZE;
5278    case SYSTEM_VALUE_LOCAL_GROUP_SIZE:
5279       return TGSI_SEMANTIC_BLOCK_SIZE;
5280
5281    /* ARB_shader_ballot */
5282    case SYSTEM_VALUE_SUBGROUP_SIZE:
5283       return TGSI_SEMANTIC_SUBGROUP_SIZE;
5284    case SYSTEM_VALUE_SUBGROUP_INVOCATION:
5285       return TGSI_SEMANTIC_SUBGROUP_INVOCATION;
5286    case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
5287       return TGSI_SEMANTIC_SUBGROUP_EQ_MASK;
5288    case SYSTEM_VALUE_SUBGROUP_GE_MASK:
5289       return TGSI_SEMANTIC_SUBGROUP_GE_MASK;
5290    case SYSTEM_VALUE_SUBGROUP_GT_MASK:
5291       return TGSI_SEMANTIC_SUBGROUP_GT_MASK;
5292    case SYSTEM_VALUE_SUBGROUP_LE_MASK:
5293       return TGSI_SEMANTIC_SUBGROUP_LE_MASK;
5294    case SYSTEM_VALUE_SUBGROUP_LT_MASK:
5295       return TGSI_SEMANTIC_SUBGROUP_LT_MASK;
5296
5297    /* Unhandled */
5298    case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX:
5299    case SYSTEM_VALUE_GLOBAL_INVOCATION_ID:
5300    case SYSTEM_VALUE_VERTEX_CNT:
5301    default:
5302       assert(!"Unexpected SYSTEM_VALUE_ enum");
5303       return TGSI_SEMANTIC_COUNT;
5304    }
5305 }
5306
5307 /**
5308  * Map a glsl_to_tgsi constant/immediate to a TGSI immediate.
5309  */
5310 static struct ureg_src
5311 emit_immediate(struct st_translate *t,
5312                gl_constant_value values[4],
5313                int type, int size)
5314 {
5315    struct ureg_program *ureg = t->ureg;
5316
5317    switch(type)
5318    {
5319    case GL_FLOAT:
5320       return ureg_DECL_immediate(ureg, &values[0].f, size);
5321    case GL_DOUBLE:
5322       return ureg_DECL_immediate_f64(ureg, (double *)&values[0].f, size);
5323    case GL_INT64_ARB:
5324       return ureg_DECL_immediate_int64(ureg, (int64_t *)&values[0].f, size);
5325    case GL_UNSIGNED_INT64_ARB:
5326       return ureg_DECL_immediate_uint64(ureg, (uint64_t *)&values[0].f, size);
5327    case GL_INT:
5328       return ureg_DECL_immediate_int(ureg, &values[0].i, size);
5329    case GL_UNSIGNED_INT:
5330    case GL_BOOL:
5331       return ureg_DECL_immediate_uint(ureg, &values[0].u, size);
5332    default:
5333       assert(!"should not get here - type must be float, int, uint, or bool");
5334       return ureg_src_undef();
5335    }
5336 }
5337
5338 /**
5339  * Map a glsl_to_tgsi dst register to a TGSI ureg_dst register.
5340  */
5341 static struct ureg_dst
5342 dst_register(struct st_translate *t, gl_register_file file, unsigned index,
5343              unsigned array_id)
5344 {
5345    unsigned array;
5346
5347    switch(file) {
5348    case PROGRAM_UNDEFINED:
5349       return ureg_dst_undef();
5350
5351    case PROGRAM_TEMPORARY:
5352       /* Allocate space for temporaries on demand. */
5353       if (index >= t->temps_size) {
5354          const int inc = align(index - t->temps_size + 1, 4096);
5355
5356          t->temps = (struct ureg_dst*)
5357                     realloc(t->temps,
5358                             (t->temps_size + inc) * sizeof(struct ureg_dst));
5359          if (!t->temps)
5360             return ureg_dst_undef();
5361
5362          memset(t->temps + t->temps_size, 0, inc * sizeof(struct ureg_dst));
5363          t->temps_size += inc;
5364       }
5365
5366       if (ureg_dst_is_undef(t->temps[index]))
5367          t->temps[index] = ureg_DECL_local_temporary(t->ureg);
5368
5369       return t->temps[index];
5370
5371    case PROGRAM_ARRAY:
5372       assert(array_id && array_id <= t->num_temp_arrays);
5373       array = array_id - 1;
5374
5375       if (ureg_dst_is_undef(t->arrays[array]))
5376          t->arrays[array] = ureg_DECL_array_temporary(
5377             t->ureg, t->array_sizes[array], TRUE);
5378
5379       return ureg_dst_array_offset(t->arrays[array], index);
5380
5381    case PROGRAM_OUTPUT:
5382       if (!array_id) {
5383          if (t->procType == PIPE_SHADER_FRAGMENT)
5384             assert(index < 2 * FRAG_RESULT_MAX);
5385          else if (t->procType == PIPE_SHADER_TESS_CTRL ||
5386                   t->procType == PIPE_SHADER_TESS_EVAL)
5387             assert(index < VARYING_SLOT_TESS_MAX);
5388          else
5389             assert(index < VARYING_SLOT_MAX);
5390
5391          assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs));
5392          assert(t->outputs[t->outputMapping[index]].File != TGSI_FILE_NULL);
5393          return t->outputs[t->outputMapping[index]];
5394       }
5395       else {
5396          struct inout_decl *decl = find_inout_array(t->output_decls, t->num_output_decls, array_id);
5397          unsigned mesa_index = decl->mesa_index;
5398          int slot = t->outputMapping[mesa_index];
5399
5400          assert(slot != -1 && t->outputs[slot].File == TGSI_FILE_OUTPUT);
5401
5402          struct ureg_dst dst = t->outputs[slot];
5403          dst.ArrayID = array_id;
5404          return ureg_dst_array_offset(dst, index - mesa_index);
5405       }
5406
5407    case PROGRAM_ADDRESS:
5408       return t->address[index];
5409
5410    default:
5411       assert(!"unknown dst register file");
5412       return ureg_dst_undef();
5413    }
5414 }
5415
5416 /**
5417  * Create a TGSI ureg_dst register from an st_dst_reg.
5418  */
5419 static struct ureg_dst
5420 translate_dst(struct st_translate *t,
5421               const st_dst_reg *dst_reg,
5422               bool saturate)
5423 {
5424    struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index,
5425                                       dst_reg->array_id);
5426
5427    if (dst.File == TGSI_FILE_NULL)
5428       return dst;
5429
5430    dst = ureg_writemask(dst, dst_reg->writemask);
5431
5432    if (saturate)
5433       dst = ureg_saturate(dst);
5434
5435    if (dst_reg->reladdr != NULL) {
5436       assert(dst_reg->file != PROGRAM_TEMPORARY);
5437       dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
5438    }
5439
5440    if (dst_reg->has_index2) {
5441       if (dst_reg->reladdr2)
5442          dst = ureg_dst_dimension_indirect(dst, ureg_src(t->address[1]),
5443                                            dst_reg->index2D);
5444       else
5445          dst = ureg_dst_dimension(dst, dst_reg->index2D);
5446    }
5447
5448    return dst;
5449 }
5450
5451 /**
5452  * Create a TGSI ureg_src register from an st_src_reg.
5453  */
5454 static struct ureg_src
5455 translate_src(struct st_translate *t, const st_src_reg *src_reg)
5456 {
5457    struct ureg_src src;
5458    int index = src_reg->index;
5459    int double_reg2 = src_reg->double_reg2 ? 1 : 0;
5460
5461    switch(src_reg->file) {
5462    case PROGRAM_UNDEFINED:
5463       src = ureg_imm4f(t->ureg, 0, 0, 0, 0);
5464       break;
5465
5466    case PROGRAM_TEMPORARY:
5467    case PROGRAM_ARRAY:
5468       src = ureg_src(dst_register(t, src_reg->file, src_reg->index, src_reg->array_id));
5469       break;
5470
5471    case PROGRAM_OUTPUT: {
5472       struct ureg_dst dst = dst_register(t, src_reg->file, src_reg->index, src_reg->array_id);
5473       assert(dst.WriteMask != 0);
5474       unsigned shift = ffs(dst.WriteMask) - 1;
5475       src = ureg_swizzle(ureg_src(dst),
5476                          shift,
5477                          MIN2(shift + 1, 3),
5478                          MIN2(shift + 2, 3),
5479                          MIN2(shift + 3, 3));
5480       break;
5481    }
5482
5483    case PROGRAM_UNIFORM:
5484       assert(src_reg->index >= 0);
5485       src = src_reg->index < t->num_constants ?
5486                t->constants[src_reg->index] : ureg_imm4f(t->ureg, 0, 0, 0, 0);
5487       break;
5488    case PROGRAM_STATE_VAR:
5489    case PROGRAM_CONSTANT:       /* ie, immediate */
5490       if (src_reg->has_index2)
5491          src = ureg_src_register(TGSI_FILE_CONSTANT, src_reg->index);
5492       else
5493          src = src_reg->index >= 0 && src_reg->index < t->num_constants ?
5494                   t->constants[src_reg->index] : ureg_imm4f(t->ureg, 0, 0, 0, 0);
5495       break;
5496
5497    case PROGRAM_IMMEDIATE:
5498       assert(src_reg->index >= 0 && src_reg->index < t->num_immediates);
5499       src = t->immediates[src_reg->index];
5500       break;
5501
5502    case PROGRAM_INPUT:
5503       /* GLSL inputs are 64-bit containers, so we have to
5504        * map back to the original index and add the offset after
5505        * mapping. */
5506       index -= double_reg2;
5507       if (!src_reg->array_id) {
5508          assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
5509          assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
5510          src = t->inputs[t->inputMapping[index] + double_reg2];
5511       }
5512       else {
5513          struct inout_decl *decl = find_inout_array(t->input_decls, t->num_input_decls,
5514                                                     src_reg->array_id);
5515          unsigned mesa_index = decl->mesa_index;
5516          int slot = t->inputMapping[mesa_index];
5517
5518          assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
5519
5520          src = t->inputs[slot];
5521          src.ArrayID = src_reg->array_id;
5522          src = ureg_src_array_offset(src, index + double_reg2 - mesa_index);
5523       }
5524       break;
5525
5526    case PROGRAM_ADDRESS:
5527       src = ureg_src(t->address[src_reg->index]);
5528       break;
5529
5530    case PROGRAM_SYSTEM_VALUE:
5531       assert(src_reg->index < (int) ARRAY_SIZE(t->systemValues));
5532       src = t->systemValues[src_reg->index];
5533       break;
5534
5535    default:
5536       assert(!"unknown src register file");
5537       return ureg_src_undef();
5538    }
5539
5540    if (src_reg->has_index2) {
5541       /* 2D indexes occur with geometry shader inputs (attrib, vertex)
5542        * and UBO constant buffers (buffer, position).
5543        */
5544       if (src_reg->reladdr2)
5545          src = ureg_src_dimension_indirect(src, ureg_src(t->address[1]),
5546                                            src_reg->index2D);
5547       else
5548          src = ureg_src_dimension(src, src_reg->index2D);
5549    }
5550
5551    src = ureg_swizzle(src,
5552                       GET_SWZ(src_reg->swizzle, 0) & 0x3,
5553                       GET_SWZ(src_reg->swizzle, 1) & 0x3,
5554                       GET_SWZ(src_reg->swizzle, 2) & 0x3,
5555                       GET_SWZ(src_reg->swizzle, 3) & 0x3);
5556
5557    if (src_reg->abs)
5558       src = ureg_abs(src);
5559
5560    if ((src_reg->negate & 0xf) == NEGATE_XYZW)
5561       src = ureg_negate(src);
5562
5563    if (src_reg->reladdr != NULL) {
5564       assert(src_reg->file != PROGRAM_TEMPORARY);
5565       src = ureg_src_indirect(src, ureg_src(t->address[0]));
5566    }
5567
5568    return src;
5569 }
5570
5571 static struct tgsi_texture_offset
5572 translate_tex_offset(struct st_translate *t,
5573                      const st_src_reg *in_offset)
5574 {
5575    struct tgsi_texture_offset offset;
5576    struct ureg_src src = translate_src(t, in_offset);
5577
5578    offset.File = src.File;
5579    offset.Index = src.Index;
5580    offset.SwizzleX = src.SwizzleX;
5581    offset.SwizzleY = src.SwizzleY;
5582    offset.SwizzleZ = src.SwizzleZ;
5583    offset.Padding = 0;
5584
5585    assert(!src.Indirect);
5586    assert(!src.DimIndirect);
5587    assert(!src.Dimension);
5588    assert(!src.Absolute); /* those shouldn't be used with integers anyway */
5589    assert(!src.Negate);
5590
5591    return offset;
5592 }
5593
5594 static void
5595 compile_tgsi_instruction(struct st_translate *t,
5596                          const glsl_to_tgsi_instruction *inst)
5597 {
5598    struct ureg_program *ureg = t->ureg;
5599    int i;
5600    struct ureg_dst dst[2];
5601    struct ureg_src src[4];
5602    struct tgsi_texture_offset texoffsets[MAX_GLSL_TEXTURE_OFFSET];
5603
5604    int num_dst;
5605    int num_src;
5606    unsigned tex_target = 0;
5607
5608    num_dst = num_inst_dst_regs(inst);
5609    num_src = num_inst_src_regs(inst);
5610
5611    for (i = 0; i < num_dst; i++)
5612       dst[i] = translate_dst(t,
5613                              &inst->dst[i],
5614                              inst->saturate);
5615
5616    for (i = 0; i < num_src; i++)
5617       src[i] = translate_src(t, &inst->src[i]);
5618
5619    switch(inst->op) {
5620    case TGSI_OPCODE_BGNLOOP:
5621    case TGSI_OPCODE_ELSE:
5622    case TGSI_OPCODE_ENDLOOP:
5623    case TGSI_OPCODE_IF:
5624    case TGSI_OPCODE_UIF:
5625       assert(num_dst == 0);
5626       ureg_insn(ureg, inst->op, NULL, 0, src, num_src, inst->precise);
5627       return;
5628
5629    case TGSI_OPCODE_TEX:
5630    case TGSI_OPCODE_TEX_LZ:
5631    case TGSI_OPCODE_TXB:
5632    case TGSI_OPCODE_TXD:
5633    case TGSI_OPCODE_TXL:
5634    case TGSI_OPCODE_TXP:
5635    case TGSI_OPCODE_TXQ:
5636    case TGSI_OPCODE_TXQS:
5637    case TGSI_OPCODE_TXF:
5638    case TGSI_OPCODE_TXF_LZ:
5639    case TGSI_OPCODE_TEX2:
5640    case TGSI_OPCODE_TXB2:
5641    case TGSI_OPCODE_TXL2:
5642    case TGSI_OPCODE_TG4:
5643    case TGSI_OPCODE_LODQ:
5644       if (inst->resource.file == PROGRAM_SAMPLER) {
5645          src[num_src] = t->samplers[inst->resource.index];
5646       } else {
5647          /* Bindless samplers. */
5648          src[num_src] = translate_src(t, &inst->resource);
5649       }
5650       assert(src[num_src].File != TGSI_FILE_NULL);
5651       if (inst->resource.reladdr)
5652          src[num_src] =
5653             ureg_src_indirect(src[num_src], ureg_src(t->address[2]));
5654       num_src++;
5655       for (i = 0; i < (int)inst->tex_offset_num_offset; i++) {
5656          texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
5657       }
5658       tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
5659
5660       ureg_tex_insn(ureg,
5661                     inst->op,
5662                     dst, num_dst,
5663                     tex_target,
5664                     st_translate_texture_type(inst->tex_type),
5665                     texoffsets, inst->tex_offset_num_offset,
5666                     src, num_src);
5667       return;
5668
5669    case TGSI_OPCODE_RESQ:
5670    case TGSI_OPCODE_LOAD:
5671    case TGSI_OPCODE_ATOMUADD:
5672    case TGSI_OPCODE_ATOMXCHG:
5673    case TGSI_OPCODE_ATOMCAS:
5674    case TGSI_OPCODE_ATOMAND:
5675    case TGSI_OPCODE_ATOMOR:
5676    case TGSI_OPCODE_ATOMXOR:
5677    case TGSI_OPCODE_ATOMUMIN:
5678    case TGSI_OPCODE_ATOMUMAX:
5679    case TGSI_OPCODE_ATOMIMIN:
5680    case TGSI_OPCODE_ATOMIMAX:
5681       for (i = num_src - 1; i >= 0; i--)
5682          src[i + 1] = src[i];
5683       num_src++;
5684       if (inst->resource.file == PROGRAM_MEMORY) {
5685          src[0] = t->shared_memory;
5686       } else if (inst->resource.file == PROGRAM_BUFFER) {
5687          src[0] = t->buffers[inst->resource.index];
5688       } else {
5689          if (inst->resource.file == PROGRAM_IMAGE) {
5690             src[0] = t->images[inst->resource.index];
5691          } else {
5692             /* Bindless images. */
5693             src[0] = translate_src(t, &inst->resource);
5694          }
5695          tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
5696       }
5697       if (inst->resource.reladdr)
5698          src[0] = ureg_src_indirect(src[0], ureg_src(t->address[2]));
5699       assert(src[0].File != TGSI_FILE_NULL);
5700       ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
5701                        inst->buffer_access,
5702                        tex_target, inst->image_format);
5703       break;
5704
5705    case TGSI_OPCODE_STORE:
5706       if (inst->resource.file == PROGRAM_MEMORY) {
5707          dst[0] = ureg_dst(t->shared_memory);
5708       } else if (inst->resource.file == PROGRAM_BUFFER) {
5709          dst[0] = ureg_dst(t->buffers[inst->resource.index]);
5710       } else {
5711          if (inst->resource.file == PROGRAM_IMAGE) {
5712             dst[0] = ureg_dst(t->images[inst->resource.index]);
5713          } else {
5714             /* Bindless images. */
5715             dst[0] = ureg_dst(translate_src(t, &inst->resource));
5716          }
5717          tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
5718       }
5719       dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask);
5720       if (inst->resource.reladdr)
5721          dst[0] = ureg_dst_indirect(dst[0], ureg_src(t->address[2]));
5722       assert(dst[0].File != TGSI_FILE_NULL);
5723       ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
5724                        inst->buffer_access,
5725                        tex_target, inst->image_format);
5726       break;
5727
5728    default:
5729       ureg_insn(ureg,
5730                 inst->op,
5731                 dst, num_dst,
5732                 src, num_src, inst->precise);
5733       break;
5734    }
5735 }
5736
5737 /**
5738  * Emit the TGSI instructions for inverting and adjusting WPOS.
5739  * This code is unavoidable because it also depends on whether
5740  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
5741  */
5742 static void
5743 emit_wpos_adjustment(struct gl_context *ctx,
5744                      struct st_translate *t,
5745                      int wpos_transform_const,
5746                      boolean invert,
5747                      GLfloat adjX, GLfloat adjY[2])
5748 {
5749    struct ureg_program *ureg = t->ureg;
5750
5751    assert(wpos_transform_const >= 0);
5752
5753    /* Fragment program uses fragment position input.
5754     * Need to replace instances of INPUT[WPOS] with temp T
5755     * where T = INPUT[WPOS] is inverted by Y.
5756     */
5757    struct ureg_src wpostrans = ureg_DECL_constant(ureg, wpos_transform_const);
5758    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
5759    struct ureg_src *wpos =
5760       ctx->Const.GLSLFragCoordIsSysVal ?
5761          &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
5762          &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
5763    struct ureg_src wpos_input = *wpos;
5764
5765    /* First, apply the coordinate shift: */
5766    if (adjX || adjY[0] || adjY[1]) {
5767       if (adjY[0] != adjY[1]) {
5768          /* Adjust the y coordinate by adjY[1] or adjY[0] respectively
5769           * depending on whether inversion is actually going to be applied
5770           * or not, which is determined by testing against the inversion
5771           * state variable used below, which will be either +1 or -1.
5772           */
5773          struct ureg_dst adj_temp = ureg_DECL_local_temporary(ureg);
5774
5775          ureg_CMP(ureg, adj_temp,
5776                   ureg_scalar(wpostrans, invert ? 2 : 0),
5777                   ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f),
5778                   ureg_imm4f(ureg, adjX, adjY[1], 0.0f, 0.0f));
5779          ureg_ADD(ureg, wpos_temp, wpos_input, ureg_src(adj_temp));
5780       } else {
5781          ureg_ADD(ureg, wpos_temp, wpos_input,
5782                   ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f));
5783       }
5784       wpos_input = ureg_src(wpos_temp);
5785    } else {
5786       /* MOV wpos_temp, input[wpos]
5787        */
5788       ureg_MOV( ureg, wpos_temp, wpos_input );
5789    }
5790
5791    /* Now the conditional y flip: STATE_FB_WPOS_Y_TRANSFORM.xy/zw will be
5792     * inversion/identity, or the other way around if we're drawing to an FBO.
5793     */
5794    if (invert) {
5795       /* MAD wpos_temp.y, wpos_input, wpostrans.xxxx, wpostrans.yyyy
5796        */
5797       ureg_MAD( ureg,
5798                 ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
5799                 wpos_input,
5800                 ureg_scalar(wpostrans, 0),
5801                 ureg_scalar(wpostrans, 1));
5802    } else {
5803       /* MAD wpos_temp.y, wpos_input, wpostrans.zzzz, wpostrans.wwww
5804        */
5805       ureg_MAD( ureg,
5806                 ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
5807                 wpos_input,
5808                 ureg_scalar(wpostrans, 2),
5809                 ureg_scalar(wpostrans, 3));
5810    }
5811
5812    /* Use wpos_temp as position input from here on:
5813     */
5814    *wpos = ureg_src(wpos_temp);
5815 }
5816
5817
5818 /**
5819  * Emit fragment position/ooordinate code.
5820  */
5821 static void
5822 emit_wpos(struct st_context *st,
5823           struct st_translate *t,
5824           const struct gl_program *program,
5825           struct ureg_program *ureg,
5826           int wpos_transform_const)
5827 {
5828    struct pipe_screen *pscreen = st->pipe->screen;
5829    GLfloat adjX = 0.0f;
5830    GLfloat adjY[2] = { 0.0f, 0.0f };
5831    boolean invert = FALSE;
5832
5833    /* Query the pixel center conventions supported by the pipe driver and set
5834     * adjX, adjY to help out if it cannot handle the requested one internally.
5835     *
5836     * The bias of the y-coordinate depends on whether y-inversion takes place
5837     * (adjY[1]) or not (adjY[0]), which is in turn dependent on whether we are
5838     * drawing to an FBO (causes additional inversion), and whether the pipe
5839     * driver origin and the requested origin differ (the latter condition is
5840     * stored in the 'invert' variable).
5841     *
5842     * For height = 100 (i = integer, h = half-integer, l = lower, u = upper):
5843     *
5844     * center shift only:
5845     * i -> h: +0.5
5846     * h -> i: -0.5
5847     *
5848     * inversion only:
5849     * l,i -> u,i: ( 0.0 + 1.0) * -1 + 100 = 99
5850     * l,h -> u,h: ( 0.5 + 0.0) * -1 + 100 = 99.5
5851     * u,i -> l,i: (99.0 + 1.0) * -1 + 100 = 0
5852     * u,h -> l,h: (99.5 + 0.0) * -1 + 100 = 0.5
5853     *
5854     * inversion and center shift:
5855     * l,i -> u,h: ( 0.0 + 0.5) * -1 + 100 = 99.5
5856     * l,h -> u,i: ( 0.5 + 0.5) * -1 + 100 = 99
5857     * u,i -> l,h: (99.0 + 0.5) * -1 + 100 = 0.5
5858     * u,h -> l,i: (99.5 + 0.5) * -1 + 100 = 0
5859     */
5860    if (program->OriginUpperLeft) {
5861       /* Fragment shader wants origin in upper-left */
5862       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT)) {
5863          /* the driver supports upper-left origin */
5864       }
5865       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT)) {
5866          /* the driver supports lower-left origin, need to invert Y */
5867          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
5868                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
5869          invert = TRUE;
5870       }
5871       else
5872          assert(0);
5873    }
5874    else {
5875       /* Fragment shader wants origin in lower-left */
5876       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT))
5877          /* the driver supports lower-left origin */
5878          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
5879                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
5880       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT))
5881          /* the driver supports upper-left origin, need to invert Y */
5882          invert = TRUE;
5883       else
5884          assert(0);
5885    }
5886
5887    if (program->PixelCenterInteger) {
5888       /* Fragment shader wants pixel center integer */
5889       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
5890          /* the driver supports pixel center integer */
5891          adjY[1] = 1.0f;
5892          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
5893                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
5894       }
5895       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
5896          /* the driver supports pixel center half integer, need to bias X,Y */
5897          adjX = -0.5f;
5898          adjY[0] = -0.5f;
5899          adjY[1] = 0.5f;
5900       }
5901       else
5902          assert(0);
5903    }
5904    else {
5905       /* Fragment shader wants pixel center half integer */
5906       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
5907          /* the driver supports pixel center half integer */
5908       }
5909       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
5910          /* the driver supports pixel center integer, need to bias X,Y */
5911          adjX = adjY[0] = adjY[1] = 0.5f;
5912          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
5913                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
5914       }
5915       else
5916          assert(0);
5917    }
5918
5919    /* we invert after adjustment so that we avoid the MOV to temporary,
5920     * and reuse the adjustment ADD instead */
5921    emit_wpos_adjustment(st->ctx, t, wpos_transform_const, invert, adjX, adjY);
5922 }
5923
5924 /**
5925  * OpenGL's fragment gl_FrontFace input is 1 for front-facing, 0 for back.
5926  * TGSI uses +1 for front, -1 for back.
5927  * This function converts the TGSI value to the GL value.  Simply clamping/
5928  * saturating the value to [0,1] does the job.
5929  */
5930 static void
5931 emit_face_var(struct gl_context *ctx, struct st_translate *t)
5932 {
5933    struct ureg_program *ureg = t->ureg;
5934    struct ureg_dst face_temp = ureg_DECL_temporary(ureg);
5935    struct ureg_src face_input = t->inputs[t->inputMapping[VARYING_SLOT_FACE]];
5936
5937    if (ctx->Const.NativeIntegers) {
5938       ureg_FSGE(ureg, face_temp, face_input, ureg_imm1f(ureg, 0));
5939    }
5940    else {
5941       /* MOV_SAT face_temp, input[face] */
5942       ureg_MOV(ureg, ureg_saturate(face_temp), face_input);
5943    }
5944
5945    /* Use face_temp as face input from here on: */
5946    t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
5947 }
5948
5949 static void
5950 emit_compute_block_size(const struct gl_program *prog,
5951                         struct ureg_program *ureg) {
5952    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH,
5953                  prog->info.cs.local_size[0]);
5954    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT,
5955                  prog->info.cs.local_size[1]);
5956    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH,
5957                  prog->info.cs.local_size[2]);
5958 }
5959
5960 struct sort_inout_decls {
5961    bool operator()(const struct inout_decl &a, const struct inout_decl &b) const {
5962       return mapping[a.mesa_index] < mapping[b.mesa_index];
5963    }
5964
5965    const ubyte *mapping;
5966 };
5967
5968 /* Sort the given array of decls by the corresponding slot (TGSI file index).
5969  *
5970  * This is for the benefit of older drivers which are broken when the
5971  * declarations aren't sorted in this way.
5972  */
5973 static void
5974 sort_inout_decls_by_slot(struct inout_decl *decls,
5975                          unsigned count,
5976                          const ubyte mapping[])
5977 {
5978    sort_inout_decls sorter;
5979    sorter.mapping = mapping;
5980    std::sort(decls, decls + count, sorter);
5981 }
5982
5983 static unsigned
5984 st_translate_interp(enum glsl_interp_mode glsl_qual, GLuint varying)
5985 {
5986    switch (glsl_qual) {
5987    case INTERP_MODE_NONE:
5988       if (varying == VARYING_SLOT_COL0 || varying == VARYING_SLOT_COL1)
5989          return TGSI_INTERPOLATE_COLOR;
5990       return TGSI_INTERPOLATE_PERSPECTIVE;
5991    case INTERP_MODE_SMOOTH:
5992       return TGSI_INTERPOLATE_PERSPECTIVE;
5993    case INTERP_MODE_FLAT:
5994       return TGSI_INTERPOLATE_CONSTANT;
5995    case INTERP_MODE_NOPERSPECTIVE:
5996       return TGSI_INTERPOLATE_LINEAR;
5997    default:
5998       assert(0 && "unexpected interp mode in st_translate_interp()");
5999       return TGSI_INTERPOLATE_PERSPECTIVE;
6000    }
6001 }
6002
6003 /**
6004  * Translate intermediate IR (glsl_to_tgsi_instruction) to TGSI format.
6005  * \param program  the program to translate
6006  * \param numInputs  number of input registers used
6007  * \param inputMapping  maps Mesa fragment program inputs to TGSI generic
6008  *                      input indexes
6009  * \param inputSemanticName  the TGSI_SEMANTIC flag for each input
6010  * \param inputSemanticIndex  the semantic index (ex: which texcoord) for
6011  *                            each input
6012  * \param interpMode  the TGSI_INTERPOLATE_LINEAR/PERSP mode for each input
6013  * \param numOutputs  number of output registers used
6014  * \param outputMapping  maps Mesa fragment program outputs to TGSI
6015  *                       generic outputs
6016  * \param outputSemanticName  the TGSI_SEMANTIC flag for each output
6017  * \param outputSemanticIndex  the semantic index (ex: which texcoord) for
6018  *                             each output
6019  *
6020  * \return  PIPE_OK or PIPE_ERROR_OUT_OF_MEMORY
6021  */
6022 extern "C" enum pipe_error
6023 st_translate_program(
6024    struct gl_context *ctx,
6025    uint procType,
6026    struct ureg_program *ureg,
6027    glsl_to_tgsi_visitor *program,
6028    const struct gl_program *proginfo,
6029    GLuint numInputs,
6030    const ubyte inputMapping[],
6031    const ubyte inputSlotToAttr[],
6032    const ubyte inputSemanticName[],
6033    const ubyte inputSemanticIndex[],
6034    const ubyte interpMode[],
6035    GLuint numOutputs,
6036    const ubyte outputMapping[],
6037    const ubyte outputSemanticName[],
6038    const ubyte outputSemanticIndex[])
6039 {
6040    struct st_translate *t;
6041    unsigned i;
6042    struct gl_program_constants *frag_const =
6043       &ctx->Const.Program[MESA_SHADER_FRAGMENT];
6044    enum pipe_error ret = PIPE_OK;
6045
6046    assert(numInputs <= ARRAY_SIZE(t->inputs));
6047    assert(numOutputs <= ARRAY_SIZE(t->outputs));
6048
6049    t = CALLOC_STRUCT(st_translate);
6050    if (!t) {
6051       ret = PIPE_ERROR_OUT_OF_MEMORY;
6052       goto out;
6053    }
6054
6055    t->procType = procType;
6056    t->inputMapping = inputMapping;
6057    t->outputMapping = outputMapping;
6058    t->ureg = ureg;
6059    t->num_temp_arrays = program->next_array;
6060    if (t->num_temp_arrays)
6061       t->arrays = (struct ureg_dst*)
6062                   calloc(t->num_temp_arrays, sizeof(t->arrays[0]));
6063
6064    /*
6065     * Declare input attributes.
6066     */
6067    switch (procType) {
6068    case PIPE_SHADER_FRAGMENT:
6069    case PIPE_SHADER_GEOMETRY:
6070    case PIPE_SHADER_TESS_EVAL:
6071    case PIPE_SHADER_TESS_CTRL:
6072       sort_inout_decls_by_slot(program->inputs, program->num_inputs, inputMapping);
6073
6074       for (i = 0; i < program->num_inputs; ++i) {
6075          struct inout_decl *decl = &program->inputs[i];
6076          unsigned slot = inputMapping[decl->mesa_index];
6077          struct ureg_src src;
6078          ubyte tgsi_usage_mask = decl->usage_mask;
6079
6080          if (glsl_base_type_is_64bit(decl->base_type)) {
6081             if (tgsi_usage_mask == 1)
6082                tgsi_usage_mask = TGSI_WRITEMASK_XY;
6083             else if (tgsi_usage_mask == 2)
6084                tgsi_usage_mask = TGSI_WRITEMASK_ZW;
6085             else
6086                tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
6087          }
6088
6089          unsigned interp_mode = 0;
6090          unsigned interp_location = 0;
6091          if (procType == PIPE_SHADER_FRAGMENT) {
6092             assert(interpMode);
6093             interp_mode = interpMode[slot] != TGSI_INTERPOLATE_COUNT ?
6094                interpMode[slot] :
6095                st_translate_interp(decl->interp, inputSlotToAttr[slot]);
6096
6097             interp_location = decl->interp_loc;
6098          }
6099
6100          src = ureg_DECL_fs_input_cyl_centroid_layout(ureg,
6101                   inputSemanticName[slot], inputSemanticIndex[slot],
6102                   interp_mode, 0, interp_location, slot, tgsi_usage_mask,
6103                   decl->array_id, decl->size);
6104
6105          for (unsigned j = 0; j < decl->size; ++j) {
6106             if (t->inputs[slot + j].File != TGSI_FILE_INPUT) {
6107                /* The ArrayID is set up in dst_register */
6108                t->inputs[slot + j] = src;
6109                t->inputs[slot + j].ArrayID = 0;
6110                t->inputs[slot + j].Index += j;
6111             }
6112          }
6113       }
6114       break;
6115    case PIPE_SHADER_VERTEX:
6116       for (i = 0; i < numInputs; i++) {
6117          t->inputs[i] = ureg_DECL_vs_input(ureg, i);
6118       }
6119       break;
6120    case PIPE_SHADER_COMPUTE:
6121       break;
6122    default:
6123       assert(0);
6124    }
6125
6126    /*
6127     * Declare output attributes.
6128     */
6129    switch (procType) {
6130    case PIPE_SHADER_FRAGMENT:
6131    case PIPE_SHADER_COMPUTE:
6132       break;
6133    case PIPE_SHADER_GEOMETRY:
6134    case PIPE_SHADER_TESS_EVAL:
6135    case PIPE_SHADER_TESS_CTRL:
6136    case PIPE_SHADER_VERTEX:
6137       sort_inout_decls_by_slot(program->outputs, program->num_outputs, outputMapping);
6138
6139       for (i = 0; i < program->num_outputs; ++i) {
6140          struct inout_decl *decl = &program->outputs[i];
6141          unsigned slot = outputMapping[decl->mesa_index];
6142          struct ureg_dst dst;
6143          ubyte tgsi_usage_mask = decl->usage_mask;
6144
6145          if (glsl_base_type_is_64bit(decl->base_type)) {
6146             if (tgsi_usage_mask == 1)
6147                tgsi_usage_mask = TGSI_WRITEMASK_XY;
6148             else if (tgsi_usage_mask == 2)
6149                tgsi_usage_mask = TGSI_WRITEMASK_ZW;
6150             else
6151                tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
6152          }
6153
6154          dst = ureg_DECL_output_layout(ureg,
6155                      outputSemanticName[slot], outputSemanticIndex[slot],
6156                      decl->gs_out_streams,
6157                      slot, tgsi_usage_mask, decl->array_id, decl->size);
6158
6159          for (unsigned j = 0; j < decl->size; ++j) {
6160             if (t->outputs[slot + j].File != TGSI_FILE_OUTPUT) {
6161                /* The ArrayID is set up in dst_register */
6162                t->outputs[slot + j] = dst;
6163                t->outputs[slot + j].ArrayID = 0;
6164                t->outputs[slot + j].Index += j;
6165             }
6166          }
6167       }
6168       break;
6169    default:
6170       assert(0);
6171    }
6172
6173    if (procType == PIPE_SHADER_FRAGMENT) {
6174       if (program->shader->Program->info.fs.early_fragment_tests ||
6175           program->shader->Program->info.fs.post_depth_coverage) {
6176          ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1);
6177
6178          if (program->shader->Program->info.fs.post_depth_coverage)
6179             ureg_property(ureg, TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE, 1);
6180       }
6181
6182       if (proginfo->info.inputs_read & VARYING_BIT_POS) {
6183           /* Must do this after setting up t->inputs. */
6184           emit_wpos(st_context(ctx), t, proginfo, ureg,
6185                     program->wpos_transform_const);
6186       }
6187
6188       if (proginfo->info.inputs_read & VARYING_BIT_FACE)
6189          emit_face_var(ctx, t);
6190
6191       for (i = 0; i < numOutputs; i++) {
6192          switch (outputSemanticName[i]) {
6193          case TGSI_SEMANTIC_POSITION:
6194             t->outputs[i] = ureg_DECL_output(ureg,
6195                                              TGSI_SEMANTIC_POSITION, /* Z/Depth */
6196                                              outputSemanticIndex[i]);
6197             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Z);
6198             break;
6199          case TGSI_SEMANTIC_STENCIL:
6200             t->outputs[i] = ureg_DECL_output(ureg,
6201                                              TGSI_SEMANTIC_STENCIL, /* Stencil */
6202                                              outputSemanticIndex[i]);
6203             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Y);
6204             break;
6205          case TGSI_SEMANTIC_COLOR:
6206             t->outputs[i] = ureg_DECL_output(ureg,
6207                                              TGSI_SEMANTIC_COLOR,
6208                                              outputSemanticIndex[i]);
6209             break;
6210          case TGSI_SEMANTIC_SAMPLEMASK:
6211             t->outputs[i] = ureg_DECL_output(ureg,
6212                                              TGSI_SEMANTIC_SAMPLEMASK,
6213                                              outputSemanticIndex[i]);
6214             /* TODO: If we ever support more than 32 samples, this will have
6215              * to become an array.
6216              */
6217             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
6218             break;
6219          default:
6220             assert(!"fragment shader outputs must be POSITION/STENCIL/COLOR");
6221             ret = PIPE_ERROR_BAD_INPUT;
6222             goto out;
6223          }
6224       }
6225    }
6226    else if (procType == PIPE_SHADER_VERTEX) {
6227       for (i = 0; i < numOutputs; i++) {
6228          if (outputSemanticName[i] == TGSI_SEMANTIC_FOG) {
6229             /* force register to contain a fog coordinate in the form (F, 0, 0, 1). */
6230             ureg_MOV(ureg,
6231                      ureg_writemask(t->outputs[i], TGSI_WRITEMASK_YZW),
6232                      ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
6233             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
6234          }
6235       }
6236    }
6237
6238    if (procType == PIPE_SHADER_COMPUTE) {
6239       emit_compute_block_size(proginfo, ureg);
6240    }
6241
6242    /* Declare address register.
6243     */
6244    if (program->num_address_regs > 0) {
6245       assert(program->num_address_regs <= 3);
6246       for (int i = 0; i < program->num_address_regs; i++)
6247          t->address[i] = ureg_DECL_address(ureg);
6248    }
6249
6250    /* Declare misc input registers
6251     */
6252    {
6253       GLbitfield sysInputs = proginfo->info.system_values_read;
6254
6255       for (i = 0; sysInputs; i++) {
6256          if (sysInputs & (1 << i)) {
6257             unsigned semName = _mesa_sysval_to_semantic(i);
6258
6259             t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
6260
6261             if (semName == TGSI_SEMANTIC_INSTANCEID ||
6262                 semName == TGSI_SEMANTIC_VERTEXID) {
6263                /* From Gallium perspective, these system values are always
6264                 * integer, and require native integer support.  However, if
6265                 * native integer is supported on the vertex stage but not the
6266                 * pixel stage (e.g, i915g + draw), Mesa will generate IR that
6267                 * assumes these system values are floats. To resolve the
6268                 * inconsistency, we insert a U2F.
6269                 */
6270                struct st_context *st = st_context(ctx);
6271                struct pipe_screen *pscreen = st->pipe->screen;
6272                assert(procType == PIPE_SHADER_VERTEX);
6273                assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
6274                (void) pscreen;
6275                if (!ctx->Const.NativeIntegers) {
6276                   struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
6277                   ureg_U2F( t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X), t->systemValues[i]);
6278                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
6279                }
6280             }
6281
6282             if (procType == PIPE_SHADER_FRAGMENT &&
6283                 semName == TGSI_SEMANTIC_POSITION)
6284                emit_wpos(st_context(ctx), t, proginfo, ureg,
6285                          program->wpos_transform_const);
6286
6287             sysInputs &= ~(1 << i);
6288          }
6289       }
6290    }
6291
6292    t->array_sizes = program->array_sizes;
6293    t->input_decls = program->inputs;
6294    t->num_input_decls = program->num_inputs;
6295    t->output_decls = program->outputs;
6296    t->num_output_decls = program->num_outputs;
6297
6298    /* Emit constants and uniforms.  TGSI uses a single index space for these,
6299     * so we put all the translated regs in t->constants.
6300     */
6301    if (proginfo->Parameters) {
6302       t->constants = (struct ureg_src *)
6303          calloc(proginfo->Parameters->NumParameters, sizeof(t->constants[0]));
6304       if (t->constants == NULL) {
6305          ret = PIPE_ERROR_OUT_OF_MEMORY;
6306          goto out;
6307       }
6308       t->num_constants = proginfo->Parameters->NumParameters;
6309
6310       for (i = 0; i < proginfo->Parameters->NumParameters; i++) {
6311          switch (proginfo->Parameters->Parameters[i].Type) {
6312          case PROGRAM_STATE_VAR:
6313          case PROGRAM_UNIFORM:
6314             t->constants[i] = ureg_DECL_constant(ureg, i);
6315             break;
6316
6317          /* Emit immediates for PROGRAM_CONSTANT only when there's no indirect
6318           * addressing of the const buffer.
6319           * FIXME: Be smarter and recognize param arrays:
6320           * indirect addressing is only valid within the referenced
6321           * array.
6322           */
6323          case PROGRAM_CONSTANT:
6324             if (program->indirect_addr_consts)
6325                t->constants[i] = ureg_DECL_constant(ureg, i);
6326             else
6327                t->constants[i] = emit_immediate(t,
6328                                                 proginfo->Parameters->ParameterValues[i],
6329                                                 proginfo->Parameters->Parameters[i].DataType,
6330                                                 4);
6331             break;
6332          default:
6333             break;
6334          }
6335       }
6336    }
6337
6338    for (i = 0; i < proginfo->info.num_ubos; i++) {
6339       unsigned size = proginfo->sh.UniformBlocks[i]->UniformBufferSize;
6340       unsigned num_const_vecs = (size + 15) / 16;
6341       unsigned first, last;
6342       assert(num_const_vecs > 0);
6343       first = 0;
6344       last = num_const_vecs > 0 ? num_const_vecs - 1 : 0;
6345       ureg_DECL_constant2D(t->ureg, first, last, i + 1);
6346    }
6347
6348    /* Emit immediate values.
6349     */
6350    t->immediates = (struct ureg_src *)
6351       calloc(program->num_immediates, sizeof(struct ureg_src));
6352    if (t->immediates == NULL) {
6353       ret = PIPE_ERROR_OUT_OF_MEMORY;
6354       goto out;
6355    }
6356    t->num_immediates = program->num_immediates;
6357
6358    i = 0;
6359    foreach_in_list(immediate_storage, imm, &program->immediates) {
6360       assert(i < program->num_immediates);
6361       t->immediates[i++] = emit_immediate(t, imm->values, imm->type, imm->size32);
6362    }
6363    assert(i == program->num_immediates);
6364
6365    /* texture samplers */
6366    for (i = 0; i < frag_const->MaxTextureImageUnits; i++) {
6367       if (program->samplers_used & (1u << i)) {
6368          unsigned type = st_translate_texture_type(program->sampler_types[i]);
6369
6370          t->samplers[i] = ureg_DECL_sampler(ureg, i);
6371
6372          ureg_DECL_sampler_view( ureg, i, program->sampler_targets[i],
6373                                  type, type, type, type );
6374       }
6375    }
6376
6377    /* Declare atomic and shader storage buffers. */
6378    {
6379       struct gl_program *prog = program->prog;
6380
6381       for (i = 0; i < prog->info.num_abos; i++) {
6382          unsigned index = prog->sh.AtomicBuffers[i]->Binding;
6383          assert(index < frag_const->MaxAtomicBuffers);
6384          t->buffers[index] = ureg_DECL_buffer(ureg, index, true);
6385       }
6386
6387       assert(prog->info.num_ssbos <= frag_const->MaxShaderStorageBlocks);
6388       for (i = 0; i < prog->info.num_ssbos; i++) {
6389          unsigned index = frag_const->MaxAtomicBuffers + i;
6390          t->buffers[index] = ureg_DECL_buffer(ureg, index, false);
6391       }
6392    }
6393
6394    if (program->use_shared_memory)
6395       t->shared_memory = ureg_DECL_memory(ureg, TGSI_MEMORY_TYPE_SHARED);
6396
6397    for (i = 0; i < program->shader->Program->info.num_images; i++) {
6398       if (program->images_used & (1 << i)) {
6399          t->images[i] = ureg_DECL_image(ureg, i,
6400                                         program->image_targets[i],
6401                                         program->image_formats[i],
6402                                         true, false);
6403       }
6404    }
6405
6406    /* Emit each instruction in turn:
6407     */
6408    foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions)
6409       compile_tgsi_instruction(t, inst);
6410
6411    /* Set the next shader stage hint for VS and TES. */
6412    switch (procType) {
6413    case PIPE_SHADER_VERTEX:
6414    case PIPE_SHADER_TESS_EVAL:
6415       if (program->shader_program->SeparateShader)
6416          break;
6417
6418       for (i = program->shader->Stage+1; i <= MESA_SHADER_FRAGMENT; i++) {
6419          if (program->shader_program->_LinkedShaders[i]) {
6420             ureg_set_next_shader_processor(
6421                   ureg, pipe_shader_type_from_mesa((gl_shader_stage)i));
6422             break;
6423          }
6424       }
6425       break;
6426    }
6427
6428 out:
6429    if (t) {
6430       free(t->arrays);
6431       free(t->temps);
6432       free(t->constants);
6433       t->num_constants = 0;
6434       free(t->immediates);
6435       t->num_immediates = 0;
6436       FREE(t);
6437    }
6438
6439    return ret;
6440 }
6441 /* ----------------------------- End TGSI code ------------------------------ */
6442
6443
6444 /**
6445  * Convert a shader's GLSL IR into a Mesa gl_program, although without
6446  * generating Mesa IR.
6447  */
6448 static struct gl_program *
6449 get_mesa_program_tgsi(struct gl_context *ctx,
6450                       struct gl_shader_program *shader_program,
6451                       struct gl_linked_shader *shader)
6452 {
6453    glsl_to_tgsi_visitor* v;
6454    struct gl_program *prog;
6455    struct gl_shader_compiler_options *options =
6456          &ctx->Const.ShaderCompilerOptions[shader->Stage];
6457    struct pipe_screen *pscreen = ctx->st->pipe->screen;
6458    enum pipe_shader_type ptarget = pipe_shader_type_from_mesa(shader->Stage);
6459    unsigned skip_merge_registers;
6460
6461    validate_ir_tree(shader->ir);
6462
6463    prog = shader->Program;
6464
6465    prog->Parameters = _mesa_new_parameter_list();
6466    v = new glsl_to_tgsi_visitor();
6467    v->ctx = ctx;
6468    v->prog = prog;
6469    v->shader_program = shader_program;
6470    v->shader = shader;
6471    v->options = options;
6472    v->glsl_version = ctx->Const.GLSLVersion;
6473    v->native_integers = ctx->Const.NativeIntegers;
6474
6475    v->have_sqrt = pscreen->get_shader_param(pscreen, ptarget,
6476                                             PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
6477    v->have_fma = pscreen->get_shader_param(pscreen, ptarget,
6478                                            PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED);
6479    v->has_tex_txf_lz = pscreen->get_param(pscreen,
6480                                           PIPE_CAP_TGSI_TEX_TXF_LZ);
6481
6482    v->variables = _mesa_hash_table_create(v->mem_ctx, _mesa_hash_pointer,
6483                                           _mesa_key_pointer_equal);
6484    skip_merge_registers =
6485       pscreen->get_shader_param(pscreen, ptarget,
6486                                 PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS);
6487
6488    _mesa_generate_parameters_list_for_uniforms(ctx, shader_program, shader,
6489                                                prog->Parameters);
6490
6491    /* Remove reads from output registers. */
6492    if (!pscreen->get_param(pscreen, PIPE_CAP_TGSI_CAN_READ_OUTPUTS))
6493       lower_output_reads(shader->Stage, shader->ir);
6494
6495    /* Emit intermediate IR for main(). */
6496    visit_exec_list(shader->ir, v);
6497
6498 #if 0
6499    /* Print out some information (for debugging purposes) used by the
6500     * optimization passes. */
6501    {
6502       int i;
6503       int *first_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
6504       int *first_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
6505       int *last_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
6506       int *last_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
6507
6508       for (i = 0; i < v->next_temp; i++) {
6509          first_writes[i] = -1;
6510          first_reads[i] = -1;
6511          last_writes[i] = -1;
6512          last_reads[i] = -1;
6513       }
6514       v->get_first_temp_read(first_reads);
6515       v->get_last_temp_read_first_temp_write(last_reads, first_writes);
6516       v->get_last_temp_write(last_writes);
6517       for (i = 0; i < v->next_temp; i++)
6518          printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, first_reads[i],
6519                 first_writes[i],
6520                 last_reads[i],
6521                 last_writes[i]);
6522       ralloc_free(first_writes);
6523       ralloc_free(first_reads);
6524       ralloc_free(last_writes);
6525       ralloc_free(last_reads);
6526    }
6527 #endif
6528
6529    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
6530    v->simplify_cmp();
6531
6532    if (shader->Stage != MESA_SHADER_TESS_CTRL &&
6533        shader->Stage != MESA_SHADER_TESS_EVAL)
6534       v->copy_propagate();
6535
6536    while (v->eliminate_dead_code());
6537
6538    v->merge_two_dsts();
6539    if (!skip_merge_registers)
6540       v->merge_registers();
6541    v->renumber_registers();
6542
6543    /* Write the END instruction. */
6544    v->emit_asm(NULL, TGSI_OPCODE_END);
6545
6546    if (ctx->_Shader->Flags & GLSL_DUMP) {
6547       _mesa_log("\n");
6548       _mesa_log("GLSL IR for linked %s program %d:\n",
6549              _mesa_shader_stage_to_string(shader->Stage),
6550              shader_program->Name);
6551       _mesa_print_ir(_mesa_get_log_file(), shader->ir, NULL);
6552       _mesa_log("\n\n");
6553    }
6554
6555    do_set_program_inouts(shader->ir, prog, shader->Stage);
6556    _mesa_copy_linked_program_data(shader_program, shader);
6557    shrink_array_declarations(v->inputs, v->num_inputs,
6558                              &prog->info.inputs_read,
6559                              prog->info.double_inputs_read,
6560                              &prog->info.patch_inputs_read);
6561    shrink_array_declarations(v->outputs, v->num_outputs,
6562                              &prog->info.outputs_written, 0ULL,
6563                              &prog->info.patch_outputs_written);
6564    count_resources(v, prog);
6565
6566    /* The GLSL IR won't be needed anymore. */
6567    ralloc_free(shader->ir);
6568    shader->ir = NULL;
6569
6570    /* This must be done before the uniform storage is associated. */
6571    if (shader->Stage == MESA_SHADER_FRAGMENT &&
6572        (prog->info.inputs_read & VARYING_BIT_POS ||
6573         prog->info.system_values_read & (1 << SYSTEM_VALUE_FRAG_COORD))) {
6574       static const gl_state_index wposTransformState[STATE_LENGTH] = {
6575          STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
6576       };
6577
6578       v->wpos_transform_const = _mesa_add_state_reference(prog->Parameters,
6579                                                           wposTransformState);
6580    }
6581
6582    /* Avoid reallocation of the program parameter list, because the uniform
6583     * storage is only associated with the original parameter list.
6584     * This should be enough for Bitmap and DrawPixels constants.
6585     */
6586    _mesa_reserve_parameter_storage(prog->Parameters, 8);
6587
6588    /* This has to be done last.  Any operation the can cause
6589     * prog->ParameterValues to get reallocated (e.g., anything that adds a
6590     * program constant) has to happen before creating this linkage.
6591     */
6592    _mesa_associate_uniform_storage(ctx, shader_program, prog, true);
6593    if (!shader_program->data->LinkStatus) {
6594       free_glsl_to_tgsi_visitor(v);
6595       _mesa_reference_program(ctx, &shader->Program, NULL);
6596       return NULL;
6597    }
6598
6599    struct st_vertex_program *stvp;
6600    struct st_fragment_program *stfp;
6601    struct st_common_program *stp;
6602    struct st_compute_program *stcp;
6603
6604    switch (shader->Stage) {
6605    case MESA_SHADER_VERTEX:
6606       stvp = (struct st_vertex_program *)prog;
6607       stvp->glsl_to_tgsi = v;
6608       break;
6609    case MESA_SHADER_FRAGMENT:
6610       stfp = (struct st_fragment_program *)prog;
6611       stfp->glsl_to_tgsi = v;
6612       break;
6613    case MESA_SHADER_TESS_CTRL:
6614    case MESA_SHADER_TESS_EVAL:
6615    case MESA_SHADER_GEOMETRY:
6616       stp = st_common_program(prog);
6617       stp->glsl_to_tgsi = v;
6618       break;
6619    case MESA_SHADER_COMPUTE:
6620       stcp = (struct st_compute_program *)prog;
6621       stcp->glsl_to_tgsi = v;
6622       break;
6623    default:
6624       assert(!"should not be reached");
6625       return NULL;
6626    }
6627
6628    return prog;
6629 }
6630
6631 /* See if there are unsupported control flow statements. */
6632 class ir_control_flow_info_visitor : public ir_hierarchical_visitor {
6633 private:
6634    const struct gl_shader_compiler_options *options;
6635 public:
6636    ir_control_flow_info_visitor(const struct gl_shader_compiler_options *options)
6637       : options(options),
6638         unsupported(false)
6639    {
6640    }
6641
6642    virtual ir_visitor_status visit_enter(ir_function *ir)
6643    {
6644       /* Other functions are skipped (same as glsl_to_tgsi). */
6645       if (strcmp(ir->name, "main") == 0)
6646          return visit_continue;
6647
6648       return visit_continue_with_parent;
6649    }
6650
6651    virtual ir_visitor_status visit_enter(ir_call *ir)
6652    {
6653       if (!ir->callee->is_intrinsic()) {
6654          unsupported = true; /* it's a function call */
6655          return visit_stop;
6656       }
6657       return visit_continue;
6658    }
6659
6660    virtual ir_visitor_status visit_enter(ir_return *ir)
6661    {
6662       if (options->EmitNoMainReturn) {
6663          unsupported = true;
6664          return visit_stop;
6665       }
6666       return visit_continue;
6667    }
6668
6669    bool unsupported;
6670 };
6671
6672 static bool
6673 has_unsupported_control_flow(exec_list *ir,
6674                              const struct gl_shader_compiler_options *options)
6675 {
6676    ir_control_flow_info_visitor visitor(options);
6677    visit_list_elements(&visitor, ir);
6678    return visitor.unsupported;
6679 }
6680
6681 extern "C" {
6682
6683 /**
6684  * Link a shader.
6685  * Called via ctx->Driver.LinkShader()
6686  * This actually involves converting GLSL IR into an intermediate TGSI-like IR
6687  * with code lowering and other optimizations.
6688  */
6689 GLboolean
6690 st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
6691 {
6692    /* Return early if we are loading the shader from on-disk cache */
6693    if (st_load_tgsi_from_disk_cache(ctx, prog)) {
6694       return GL_TRUE;
6695    }
6696
6697    struct pipe_screen *pscreen = ctx->st->pipe->screen;
6698    assert(prog->data->LinkStatus);
6699
6700    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
6701       if (prog->_LinkedShaders[i] == NULL)
6702          continue;
6703
6704       struct gl_linked_shader *shader = prog->_LinkedShaders[i];
6705       exec_list *ir = shader->ir;
6706       gl_shader_stage stage = shader->Stage;
6707       const struct gl_shader_compiler_options *options =
6708             &ctx->Const.ShaderCompilerOptions[stage];
6709       enum pipe_shader_type ptarget = pipe_shader_type_from_mesa(stage);
6710       bool have_dround = pscreen->get_shader_param(pscreen, ptarget,
6711                                                    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED);
6712       bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget,
6713                                                    PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED);
6714       unsigned if_threshold = pscreen->get_shader_param(pscreen, ptarget,
6715                                                         PIPE_SHADER_CAP_LOWER_IF_THRESHOLD);
6716
6717       /* If there are forms of indirect addressing that the driver
6718        * cannot handle, perform the lowering pass.
6719        */
6720       if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput ||
6721           options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) {
6722          lower_variable_index_to_cond_assign(stage, ir,
6723                                              options->EmitNoIndirectInput,
6724                                              options->EmitNoIndirectOutput,
6725                                              options->EmitNoIndirectTemp,
6726                                              options->EmitNoIndirectUniform);
6727       }
6728
6729       if (!pscreen->get_param(pscreen, PIPE_CAP_INT64_DIVMOD))
6730          lower_64bit_integer_instructions(ir, DIV64 | MOD64);
6731
6732       if (ctx->Extensions.ARB_shading_language_packing) {
6733          unsigned lower_inst = LOWER_PACK_SNORM_2x16 |
6734                                LOWER_UNPACK_SNORM_2x16 |
6735                                LOWER_PACK_UNORM_2x16 |
6736                                LOWER_UNPACK_UNORM_2x16 |
6737                                LOWER_PACK_SNORM_4x8 |
6738                                LOWER_UNPACK_SNORM_4x8 |
6739                                LOWER_UNPACK_UNORM_4x8 |
6740                                LOWER_PACK_UNORM_4x8;
6741
6742          if (ctx->Extensions.ARB_gpu_shader5)
6743             lower_inst |= LOWER_PACK_USE_BFI |
6744                           LOWER_PACK_USE_BFE;
6745          if (!ctx->st->has_half_float_packing)
6746             lower_inst |= LOWER_PACK_HALF_2x16 |
6747                           LOWER_UNPACK_HALF_2x16;
6748
6749          lower_packing_builtins(ir, lower_inst);
6750       }
6751
6752       if (!pscreen->get_param(pscreen, PIPE_CAP_TEXTURE_GATHER_OFFSETS))
6753          lower_offset_arrays(ir);
6754       do_mat_op_to_vec(ir);
6755
6756       if (stage == MESA_SHADER_FRAGMENT)
6757          lower_blend_equation_advanced(shader);
6758
6759       lower_instructions(ir,
6760                          MOD_TO_FLOOR |
6761                          FDIV_TO_MUL_RCP |
6762                          EXP_TO_EXP2 |
6763                          LOG_TO_LOG2 |
6764                          LDEXP_TO_ARITH |
6765                          (have_dfrexp ? 0 : DFREXP_DLDEXP_TO_ARITH) |
6766                          CARRY_TO_ARITH |
6767                          BORROW_TO_ARITH |
6768                          (have_dround ? 0 : DOPS_TO_DFRAC) |
6769                          (options->EmitNoPow ? POW_TO_EXP2 : 0) |
6770                          (!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0) |
6771                          (options->EmitNoSat ? SAT_TO_CLAMP : 0) |
6772                          (ctx->Const.ForceGLSLAbsSqrt ? SQRT_TO_ABS_SQRT : 0) |
6773                          /* Assume that if ARB_gpu_shader5 is not supported
6774                           * then all of the extended integer functions need
6775                           * lowering.  It may be necessary to add some caps
6776                           * for individual instructions.
6777                           */
6778                          (!ctx->Extensions.ARB_gpu_shader5
6779                           ? BIT_COUNT_TO_MATH |
6780                             EXTRACT_TO_SHIFTS |
6781                             INSERT_TO_SHIFTS |
6782                             REVERSE_TO_SHIFTS |
6783                             FIND_LSB_TO_FLOAT_CAST |
6784                             FIND_MSB_TO_FLOAT_CAST |
6785                             IMUL_HIGH_TO_MUL
6786                           : 0));
6787
6788       do_vec_index_to_cond_assign(ir);
6789       lower_vector_insert(ir, true);
6790       lower_quadop_vector(ir, false);
6791       lower_noise(ir);
6792       if (options->MaxIfDepth == 0) {
6793          lower_discard(ir);
6794       }
6795
6796       if (ctx->Const.GLSLOptimizeConservatively) {
6797          /* Do it once and repeat only if there's unsupported control flow. */
6798          do {
6799             do_common_optimization(ir, true, true, options,
6800                                    ctx->Const.NativeIntegers);
6801             lower_if_to_cond_assign((gl_shader_stage)i, ir,
6802                                     options->MaxIfDepth, if_threshold);
6803          } while (has_unsupported_control_flow(ir, options));
6804       } else {
6805          /* Repeat it until it stops making changes. */
6806          bool progress;
6807          do {
6808             progress = do_common_optimization(ir, true, true, options,
6809                                               ctx->Const.NativeIntegers);
6810             progress |= lower_if_to_cond_assign((gl_shader_stage)i, ir,
6811                                                 options->MaxIfDepth, if_threshold);
6812          } while (progress);
6813       }
6814
6815       validate_ir_tree(ir);
6816    }
6817
6818    build_program_resource_list(ctx, prog);
6819
6820    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
6821       struct gl_linked_shader *shader = prog->_LinkedShaders[i];
6822       if (shader == NULL)
6823          continue;
6824
6825       enum pipe_shader_type ptarget =
6826          pipe_shader_type_from_mesa(shader->Stage);
6827       enum pipe_shader_ir preferred_ir = (enum pipe_shader_ir)
6828          pscreen->get_shader_param(pscreen, ptarget,
6829                                    PIPE_SHADER_CAP_PREFERRED_IR);
6830
6831       struct gl_program *linked_prog = NULL;
6832       if (preferred_ir == PIPE_SHADER_IR_NIR) {
6833          /* TODO only for GLSL VS/FS/CS for now: */
6834          switch (shader->Stage) {
6835          case MESA_SHADER_VERTEX:
6836          case MESA_SHADER_FRAGMENT:
6837          case MESA_SHADER_COMPUTE:
6838             linked_prog = st_nir_get_mesa_program(ctx, prog, shader);
6839          default:
6840             break;
6841          }
6842       } else {
6843          linked_prog = get_mesa_program_tgsi(ctx, prog, shader);
6844       }
6845
6846       if (linked_prog) {
6847          st_set_prog_affected_state_flags(linked_prog);
6848          if (!ctx->Driver.ProgramStringNotify(ctx,
6849                                               _mesa_shader_stage_to_program(i),
6850                                               linked_prog)) {
6851             _mesa_reference_program(ctx, &shader->Program, NULL);
6852             return GL_FALSE;
6853          }
6854       }
6855    }
6856
6857    return GL_TRUE;
6858 }
6859
6860 void
6861 st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
6862                                 const ubyte outputMapping[],
6863                                 struct pipe_stream_output_info *so)
6864 {
6865    if (!glsl_to_tgsi->shader_program->last_vert_prog)
6866       return;
6867
6868    struct gl_transform_feedback_info *info =
6869       glsl_to_tgsi->shader_program->last_vert_prog->sh.LinkedTransformFeedback;
6870    st_translate_stream_output_info2(info, outputMapping, so);
6871 }
6872
6873 void
6874 st_translate_stream_output_info2(struct gl_transform_feedback_info *info,
6875                                 const ubyte outputMapping[],
6876                                 struct pipe_stream_output_info *so)
6877 {
6878    unsigned i;
6879
6880    for (i = 0; i < info->NumOutputs; i++) {
6881       so->output[i].register_index =
6882          outputMapping[info->Outputs[i].OutputRegister];
6883       so->output[i].start_component = info->Outputs[i].ComponentOffset;
6884       so->output[i].num_components = info->Outputs[i].NumComponents;
6885       so->output[i].output_buffer = info->Outputs[i].OutputBuffer;
6886       so->output[i].dst_offset = info->Outputs[i].DstOffset;
6887       so->output[i].stream = info->Outputs[i].StreamId;
6888    }
6889
6890    for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
6891       so->stride[i] = info->Buffers[i].Stride;
6892    }
6893    so->num_outputs = info->NumOutputs;
6894 }
6895
6896 } /* extern "C" */