src/mesa/state_tracker/st_glsl_to_tgsi.cpp

   1 /*
   2  * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
   3  * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
   4  * Copyright © 2010 Intel Corporation
   5  * Copyright © 2011 Bryan Cain
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the next
  15  * paragraph) shall be included in all copies or substantial portions of the
  16  * Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24  * DEALINGS IN THE SOFTWARE.
  25  */
  26
  27 /**
  28  * \file glsl_to_tgsi.cpp
  29  *
  30  * Translate GLSL IR to TGSI.
  31  */
  32
  33 #include "st_glsl_to_tgsi.h"
  34
  35 #include "compiler/glsl/glsl_parser_extras.h"
  36 #include "compiler/glsl/ir_optimization.h"
  37 #include "compiler/glsl/program.h"
  38
  39 #include "main/errors.h"
  40 #include "main/shaderobj.h"
  41 #include "main/uniforms.h"
  42 #include "main/shaderapi.h"
  43 #include "main/shaderimage.h"
  44 #include "program/prog_instruction.h"
  45
  46 #include "pipe/p_context.h"
  47 #include "pipe/p_screen.h"
  48 #include "tgsi/tgsi_ureg.h"
  49 #include "tgsi/tgsi_info.h"
  50 #include "util/u_math.h"
  51 #include "util/u_memory.h"
  52 #include "st_program.h"
  53 #include "st_mesa_to_tgsi.h"
  54 #include "st_format.h"
  55 #include "st_glsl_types.h"
  56 #include "st_nir.h"
  57
  58 #include <algorithm>
  59
/**
 * Bitmask with one bit set for each of the constant-like register files
 * (state variables, constants and uniforms); bit N corresponds to the
 * gl_register_file value N.
 */
#define PROGRAM_ANY_CONST ((1 << PROGRAM_STATE_VAR) |    \
                           (1 << PROGRAM_CONSTANT) |     \
                           (1 << PROGRAM_UNIFORM))

/* NOTE(review): presumably the maximum number of texel offsets attached to
 * one texture instruction -- confirm against the users of this macro.
 */
#define MAX_GLSL_TEXTURE_OFFSET 4
  65
  66 class st_src_reg;
  67 class st_dst_reg;
  68
  69 static int swizzle_for_size(int size);
  70
  71 static int swizzle_for_type(const glsl_type *type, int component = 0)
  72 {
  73    unsigned num_elements = 4;
  74
  75    if (type) {
  76       type = type->without_array();
  77       if (type->is_scalar() || type->is_vector() || type->is_matrix())
  78          num_elements = type->vector_elements;
  79    }
  80
  81    int swizzle = swizzle_for_size(num_elements);
  82    assert(num_elements + component <= 4);
  83
  84    swizzle += component * MAKE_SWIZZLE4(1, 1, 1, 1);
  85    return swizzle;
  86 }
  87
  88 /**
  89  * This struct is a corresponding struct to TGSI ureg_src.
  90  */
  91 class st_src_reg {
  92 public:
  93    st_src_reg(gl_register_file file, int index, const glsl_type *type,
  94               int component = 0, unsigned array_id = 0)
  95    {
  96       assert(file != PROGRAM_ARRAY || array_id != 0);
  97       this->file = file;
  98       this->index = index;
  99       this->swizzle = swizzle_for_type(type, component);
 100       this->negate = 0;
 101       this->abs = 0;
 102       this->index2D = 0;
 103       this->type = type ? type->base_type : GLSL_TYPE_ERROR;
 104       this->reladdr = NULL;
 105       this->reladdr2 = NULL;
 106       this->has_index2 = false;
 107       this->double_reg2 = false;
 108       this->array_id = array_id;
 109       this->is_double_vertex_input = false;
 110    }
 111
 112    st_src_reg(gl_register_file file, int index, enum glsl_base_type type)
 113    {
 114       assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
 115       this->type = type;
 116       this->file = file;
 117       this->index = index;
 118       this->index2D = 0;
 119       this->swizzle = SWIZZLE_XYZW;
 120       this->negate = 0;
 121       this->abs = 0;
 122       this->reladdr = NULL;
 123       this->reladdr2 = NULL;
 124       this->has_index2 = false;
 125       this->double_reg2 = false;
 126       this->array_id = 0;
 127       this->is_double_vertex_input = false;
 128    }
 129
 130    st_src_reg(gl_register_file file, int index, enum glsl_base_type type, int index2D)
 131    {
 132       assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
 133       this->type = type;
 134       this->file = file;
 135       this->index = index;
 136       this->index2D = index2D;
 137       this->swizzle = SWIZZLE_XYZW;
 138       this->negate = 0;
 139       this->abs = 0;
 140       this->reladdr = NULL;
 141       this->reladdr2 = NULL;
 142       this->has_index2 = false;
 143       this->double_reg2 = false;
 144       this->array_id = 0;
 145       this->is_double_vertex_input = false;
 146    }
 147
 148    st_src_reg()
 149    {
 150       this->type = GLSL_TYPE_ERROR;
 151       this->file = PROGRAM_UNDEFINED;
 152       this->index = 0;
 153       this->index2D = 0;
 154       this->swizzle = 0;
 155       this->negate = 0;
 156       this->abs = 0;
 157       this->reladdr = NULL;
 158       this->reladdr2 = NULL;
 159       this->has_index2 = false;
 160       this->double_reg2 = false;
 161       this->array_id = 0;
 162       this->is_double_vertex_input = false;
 163    }
 164
 165    explicit st_src_reg(st_dst_reg reg);
 166
 167    int16_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
 168    int16_t index2D;
 169    uint16_t swizzle; /**< SWIZZLE_XYZWONEZERO swizzles from Mesa. */
 170    int negate:4; /**< NEGATE_XYZW mask from mesa */
 171    unsigned abs:1;
 172    enum glsl_base_type type:4; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
 173    unsigned has_index2:1;
 174    gl_register_file file:5; /**< PROGRAM_* from Mesa */
 175    /*
 176     * Is this the second half of a double register pair?
 177     * currently used for input mapping only.
 178     */
 179    unsigned double_reg2:1;
 180    unsigned is_double_vertex_input:1;
 181    unsigned array_id:10;
 182
 183    /** Register index should be offset by the integer in this reg. */
 184    st_src_reg *reladdr;
 185    st_src_reg *reladdr2;
 186
 187    st_src_reg get_abs()
 188    {
 189       st_src_reg reg = *this;
 190       reg.negate = 0;
 191       reg.abs = 1;
 192       return reg;
 193    }
 194 };
 195
 196 class st_dst_reg {
 197 public:
 198    st_dst_reg(gl_register_file file, int writemask, enum glsl_base_type type, int index)
 199    {
 200       assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
 201       this->file = file;
 202       this->index = index;
 203       this->index2D = 0;
 204       this->writemask = writemask;
 205       this->reladdr = NULL;
 206       this->reladdr2 = NULL;
 207       this->has_index2 = false;
 208       this->type = type;
 209       this->array_id = 0;
 210    }
 211
 212    st_dst_reg(gl_register_file file, int writemask, enum glsl_base_type type)
 213    {
 214       assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
 215       this->file = file;
 216       this->index = 0;
 217       this->index2D = 0;
 218       this->writemask = writemask;
 219       this->reladdr = NULL;
 220       this->reladdr2 = NULL;
 221       this->has_index2 = false;
 222       this->type = type;
 223       this->array_id = 0;
 224    }
 225
 226    st_dst_reg()
 227    {
 228       this->type = GLSL_TYPE_ERROR;
 229       this->file = PROGRAM_UNDEFINED;
 230       this->index = 0;
 231       this->index2D = 0;
 232       this->writemask = 0;
 233       this->reladdr = NULL;
 234       this->reladdr2 = NULL;
 235       this->has_index2 = false;
 236       this->array_id = 0;
 237    }
 238
 239    explicit st_dst_reg(st_src_reg reg);
 240
 241    int16_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
 242    int16_t index2D;
 243    gl_register_file file:5; /**< PROGRAM_* from Mesa */
 244    unsigned writemask:4; /**< Bitfield of WRITEMASK_[XYZW] */
 245    enum glsl_base_type type:4; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
 246    unsigned has_index2:1;
 247    unsigned array_id:10;
 248
 249    /** Register index should be offset by the integer in this reg. */
 250    st_src_reg *reladdr;
 251    st_src_reg *reladdr2;
 252 };
 253
 254 st_src_reg::st_src_reg(st_dst_reg reg)
 255 {
 256    this->type = reg.type;
 257    this->file = reg.file;
 258    this->index = reg.index;
 259    this->swizzle = SWIZZLE_XYZW;
 260    this->negate = 0;
 261    this->abs = 0;
 262    this->reladdr = reg.reladdr;
 263    this->index2D = reg.index2D;
 264    this->reladdr2 = reg.reladdr2;
 265    this->has_index2 = reg.has_index2;
 266    this->double_reg2 = false;
 267    this->array_id = reg.array_id;
 268    this->is_double_vertex_input = false;
 269 }
 270
 271 st_dst_reg::st_dst_reg(st_src_reg reg)
 272 {
 273    this->type = reg.type;
 274    this->file = reg.file;
 275    this->index = reg.index;
 276    this->writemask = WRITEMASK_XYZW;
 277    this->reladdr = reg.reladdr;
 278    this->index2D = reg.index2D;
 279    this->reladdr2 = reg.reladdr2;
 280    this->has_index2 = reg.has_index2;
 281    this->array_id = reg.array_id;
 282 }
 283
/**
 * One entry of the visitor's instruction list, as created by
 * glsl_to_tgsi_visitor::emit_asm().
 */
class glsl_to_tgsi_instruction : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(glsl_to_tgsi_instruction)

   st_dst_reg dst[2]; /**< destinations; unused slots hold an undefined register */
   st_src_reg src[4]; /**< sources; unused slots hold an undefined register */
   st_src_reg resource; /**< sampler or buffer register */
   st_src_reg *tex_offsets; /**< NULL unless set after emit_asm() */

   /** Pointer to the ir source this tree came from for debugging */
   ir_instruction *ir;

   unsigned op:8; /**< TGSI opcode */
   unsigned saturate:1;
   unsigned is_64bit_expanded:1; /**< set by emit_asm() on instructions produced by its 64-bit splitting */
   unsigned sampler_base:5;
   unsigned sampler_array_size:6; /**< 1-based size of sampler array, 1 if not array */
   unsigned tex_target:4; /**< One of TEXTURE_*_INDEX */
   glsl_base_type tex_type:4; /**< emit_asm() defaults this to GLSL_TYPE_FLOAT */
   unsigned tex_shadow:1;
   unsigned image_format:9;
   unsigned tex_offset_num_offset:3; /**< presumably the number of entries in tex_offsets -- TODO confirm */
   unsigned dead_mask:4; /**< Used in dead code elimination */
   unsigned buffer_access:3; /**< buffer access type */

   /** Opcode description returned by tgsi_get_opcode_info() for \c op */
   const struct tgsi_opcode_info *info;
};
 311
 312 class variable_storage : public exec_node {
 313 public:
 314    variable_storage(ir_variable *var, gl_register_file file, int index,
 315                     unsigned array_id = 0)
 316       : file(file), index(index), component(0), var(var), array_id(array_id)
 317    {
 318       assert(file != PROGRAM_ARRAY || array_id != 0);
 319    }
 320
 321    gl_register_file file;
 322    int index;
 323
 324    /* Explicit component location. This is given in terms of the GLSL-style
 325     * swizzles where each double is a single component, i.e. for 64-bit types
 326     * it can only be 0 or 1.
 327     */
 328    int component;
 329    ir_variable *var; /* variable that maps to this, if any */
 330    unsigned array_id;
 331 };
 332
 333 class immediate_storage : public exec_node {
 334 public:
 335    immediate_storage(gl_constant_value *values, int size32, int type)
 336    {
 337       memcpy(this->values, values, size32 * sizeof(gl_constant_value));
 338       this->size32 = size32;
 339       this->type = type;
 340    }
 341
 342    /* doubles are stored across 2 gl_constant_values */
 343    gl_constant_value values[4];
 344    int size32; /**< Number of 32-bit components (1-4) */
 345    int type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
 346 };
 347
/* Placeholder operands in the PROGRAM_UNDEFINED file; they are the default
 * values of the optional emit_asm() operands.
 *
 * NOTE(review): undef_dst passes SWIZZLE_NOOP where the constructor expects
 * a writemask, and the value is then stored in a 4-bit field.  Looks
 * unintended -- confirm before relying on undef_dst.writemask.
 */
static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
 350
/**
 * Description of one shader input or output declaration, as stored in the
 * visitor's inputs[] and outputs[] tables.
 */
struct inout_decl {
   unsigned mesa_index;
   unsigned array_id; /* TGSI ArrayID; 1-based: 0 means not an array */
   unsigned size;
   unsigned interp_loc;
   unsigned gs_out_streams;
   enum glsl_interp_mode interp;
   enum glsl_base_type base_type; /* value returned by find_array_type() */
   ubyte usage_mask; /* GLSL-style usage-mask,  i.e. single bit per double */
};
 361
 362 static struct inout_decl *
 363 find_inout_array(struct inout_decl *decls, unsigned count, unsigned array_id)
 364 {
 365    assert(array_id != 0);
 366
 367    for (unsigned i = 0; i < count; i++) {
 368       struct inout_decl *decl = &decls[i];
 369
 370       if (array_id == decl->array_id) {
 371          return decl;
 372       }
 373    }
 374
 375    return NULL;
 376 }
 377
 378 static enum glsl_base_type
 379 find_array_type(struct inout_decl *decls, unsigned count, unsigned array_id)
 380 {
 381    if (!array_id)
 382       return GLSL_TYPE_ERROR;
 383    struct inout_decl *decl = find_inout_array(decls, count, array_id);
 384    if (decl)
 385       return decl->base_type;
 386    return GLSL_TYPE_ERROR;
 387 }
 388
/**
 * One (old, new) register number pair; an array of these is the argument
 * of glsl_to_tgsi_visitor::rename_temp_registers().
 */
struct rename_reg_pair {
   int old_reg; /* register number before renaming */
   int new_reg; /* register number after renaming */
};
 393
 394 struct glsl_to_tgsi_visitor : public ir_visitor {
 395 public:
 396    glsl_to_tgsi_visitor();
 397    ~glsl_to_tgsi_visitor();
 398
 399    struct gl_context *ctx;
 400    struct gl_program *prog;
 401    struct gl_shader_program *shader_program;
 402    struct gl_linked_shader *shader;
 403    struct gl_shader_compiler_options *options;
 404
 405    int next_temp;
 406
 407    unsigned *array_sizes;
 408    unsigned max_num_arrays;
 409    unsigned next_array;
 410
 411    struct inout_decl inputs[4 * PIPE_MAX_SHADER_INPUTS];
 412    unsigned num_inputs;
 413    unsigned num_input_arrays;
 414    struct inout_decl outputs[4 * PIPE_MAX_SHADER_OUTPUTS];
 415    unsigned num_outputs;
 416    unsigned num_output_arrays;
 417
 418    int num_address_regs;
 419    uint32_t samplers_used;
 420    glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
 421    int sampler_targets[PIPE_MAX_SAMPLERS];   /**< One of TGSI_TEXTURE_* */
 422    int buffers_used;
 423    int images_used;
 424    int image_targets[PIPE_MAX_SHADER_IMAGES];
 425    unsigned image_formats[PIPE_MAX_SHADER_IMAGES];
 426    bool indirect_addr_consts;
 427    int wpos_transform_const;
 428
 429    int glsl_version;
 430    bool native_integers;
 431    bool have_sqrt;
 432    bool have_fma;
 433    bool use_shared_memory;
 434
 435    variable_storage *find_variable_storage(ir_variable *var);
 436
 437    int add_constant(gl_register_file file, gl_constant_value values[8],
 438                     int size, int datatype, uint16_t *swizzle_out);
 439
 440    st_src_reg get_temp(const glsl_type *type);
 441    void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);
 442
 443    st_src_reg st_src_reg_for_double(double val);
 444    st_src_reg st_src_reg_for_float(float val);
 445    st_src_reg st_src_reg_for_int(int val);
 446    st_src_reg st_src_reg_for_type(enum glsl_base_type type, int val);
 447
 448    /**
 449     * \name Visit methods
 450     *
 451     * As typical for the visitor pattern, there must be one \c visit method for
 452     * each concrete subclass of \c ir_instruction.  Virtual base classes within
 453     * the hierarchy should not have \c visit methods.
 454     */
 455    /*@{*/
 456    virtual void visit(ir_variable *);
 457    virtual void visit(ir_loop *);
 458    virtual void visit(ir_loop_jump *);
 459    virtual void visit(ir_function_signature *);
 460    virtual void visit(ir_function *);
 461    virtual void visit(ir_expression *);
 462    virtual void visit(ir_swizzle *);
 463    virtual void visit(ir_dereference_variable  *);
 464    virtual void visit(ir_dereference_array *);
 465    virtual void visit(ir_dereference_record *);
 466    virtual void visit(ir_assignment *);
 467    virtual void visit(ir_constant *);
 468    virtual void visit(ir_call *);
 469    virtual void visit(ir_return *);
 470    virtual void visit(ir_discard *);
 471    virtual void visit(ir_texture *);
 472    virtual void visit(ir_if *);
 473    virtual void visit(ir_emit_vertex *);
 474    virtual void visit(ir_end_primitive *);
 475    virtual void visit(ir_barrier *);
 476    /*@}*/
 477
 478    void visit_expression(ir_expression *, st_src_reg *) ATTRIBUTE_NOINLINE;
 479
 480    void visit_atomic_counter_intrinsic(ir_call *);
 481    void visit_ssbo_intrinsic(ir_call *);
 482    void visit_membar_intrinsic(ir_call *);
 483    void visit_shared_intrinsic(ir_call *);
 484    void visit_image_intrinsic(ir_call *);
 485
 486    st_src_reg result;
 487
 488    /** List of variable_storage */
 489    exec_list variables;
 490
 491    /** List of immediate_storage */
 492    exec_list immediates;
 493    unsigned num_immediates;
 494
 495    /** List of glsl_to_tgsi_instruction */
 496    exec_list instructions;
 497
 498    glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
 499                                       st_dst_reg dst = undef_dst,
 500                                       st_src_reg src0 = undef_src,
 501                                       st_src_reg src1 = undef_src,
 502                                       st_src_reg src2 = undef_src,
 503                                       st_src_reg src3 = undef_src);
 504
 505    glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
 506                                       st_dst_reg dst, st_dst_reg dst1,
 507                                       st_src_reg src0 = undef_src,
 508                                       st_src_reg src1 = undef_src,
 509                                       st_src_reg src2 = undef_src,
 510                                       st_src_reg src3 = undef_src);
 511
 512    unsigned get_opcode(unsigned op,
 513                     st_dst_reg dst,
 514                     st_src_reg src0, st_src_reg src1);
 515
 516    /**
 517     * Emit the correct dot-product instruction for the type of arguments
 518     */
 519    glsl_to_tgsi_instruction *emit_dp(ir_instruction *ir,
 520                                      st_dst_reg dst,
 521                                      st_src_reg src0,
 522                                      st_src_reg src1,
 523                                      unsigned elements);
 524
 525    void emit_scalar(ir_instruction *ir, unsigned op,
 526                     st_dst_reg dst, st_src_reg src0);
 527
 528    void emit_scalar(ir_instruction *ir, unsigned op,
 529                     st_dst_reg dst, st_src_reg src0, st_src_reg src1);
 530
 531    void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);
 532
 533    void get_deref_offsets(ir_dereference *ir,
 534                           unsigned *array_size,
 535                           unsigned *base,
 536                           uint16_t *index,
 537                           st_src_reg *reladdr,
 538                           bool opaque);
 539   void calc_deref_offsets(ir_dereference *tail,
 540                           unsigned *array_elements,
 541                           uint16_t *index,
 542                           st_src_reg *indirect,
 543                           unsigned *location);
 544    st_src_reg canonicalize_gather_offset(st_src_reg offset);
 545
 546    bool try_emit_mad(ir_expression *ir,
 547               int mul_operand);
 548    bool try_emit_mad_for_and_not(ir_expression *ir,
 549               int mul_operand);
 550
 551    void emit_swz(ir_expression *ir);
 552
 553    bool process_move_condition(ir_rvalue *ir);
 554
 555    void simplify_cmp(void);
 556
 557    void rename_temp_registers(int num_renames, struct rename_reg_pair *renames);
 558    void get_first_temp_read(int *first_reads);
 559    void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
 560    void get_last_temp_write(int *last_writes);
 561
 562    void copy_propagate(void);
 563    int eliminate_dead_code(void);
 564
 565    void merge_two_dsts(void);
 566    void merge_registers(void);
 567    void renumber_registers(void);
 568
 569    void emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
 570                        st_dst_reg *l, st_src_reg *r,
 571                        st_src_reg *cond, bool cond_swap);
 572
 573    void *mem_ctx;
 574 };
 575
/* Destinations in the PROGRAM_ADDRESS file, indices 0 to 2, each writing
 * the X channel only.  emit_asm() loads address_reg from a destination's
 * reladdr and address_reg2 from its reladdr2.
 * NOTE(review): sampler_reladdr is presumably the equivalent for indirectly
 * indexed samplers -- confirm against its users.
 */
static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 0);
static st_dst_reg address_reg2 = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 1);
static st_dst_reg sampler_reladdr = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 2);
 579
 580 static void
 581 fail_link(struct gl_shader_program *prog, const char *fmt, ...) PRINTFLIKE(2, 3);
 582
 583 static void
 584 fail_link(struct gl_shader_program *prog, const char *fmt, ...)
 585 {
 586    va_list args;
 587    va_start(args, fmt);
 588    ralloc_vasprintf_append(&prog->data->InfoLog, fmt, args);
 589    va_end(args);
 590
 591    prog->data->LinkStatus = GL_FALSE;
 592 }
 593
 594 static int
 595 swizzle_for_size(int size)
 596 {
 597    static const int size_swizzles[4] = {
 598       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
 599       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
 600       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
 601       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
 602    };
 603
 604    assert((size >= 1) && (size <= 4));
 605    return size_swizzles[size - 1];
 606 }
 607
 608 static bool
 609 is_resource_instruction(unsigned opcode)
 610 {
 611    switch (opcode) {
 612    case TGSI_OPCODE_RESQ:
 613    case TGSI_OPCODE_LOAD:
 614    case TGSI_OPCODE_ATOMUADD:
 615    case TGSI_OPCODE_ATOMXCHG:
 616    case TGSI_OPCODE_ATOMCAS:
 617    case TGSI_OPCODE_ATOMAND:
 618    case TGSI_OPCODE_ATOMOR:
 619    case TGSI_OPCODE_ATOMXOR:
 620    case TGSI_OPCODE_ATOMUMIN:
 621    case TGSI_OPCODE_ATOMUMAX:
 622    case TGSI_OPCODE_ATOMIMIN:
 623    case TGSI_OPCODE_ATOMIMAX:
 624       return true;
 625    default:
 626       return false;
 627    }
 628 }
 629
 630 static unsigned
 631 num_inst_dst_regs(const glsl_to_tgsi_instruction *op)
 632 {
 633    return op->info->num_dst;
 634 }
 635
 636 static unsigned
 637 num_inst_src_regs(const glsl_to_tgsi_instruction *op)
 638 {
 639    return op->info->is_tex || is_resource_instruction(op->op) ?
 640       op->info->num_src - 1 : op->info->num_src;
 641 }
 642
 643 glsl_to_tgsi_instruction *
 644 glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 645                                st_dst_reg dst, st_dst_reg dst1,
 646                                st_src_reg src0, st_src_reg src1,
 647                                st_src_reg src2, st_src_reg src3)
 648 {
 649    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
 650    int num_reladdr = 0, i, j;
 651    bool dst_is_64bit[2];
 652
 653    op = get_opcode(op, dst, src0, src1);
 654
 655    /* If we have to do relative addressing, we want to load the ARL
 656     * reg directly for one of the regs, and preload the other reladdr
 657     * sources into temps.
 658     */
 659    num_reladdr += dst.reladdr != NULL || dst.reladdr2;
 660    num_reladdr += dst1.reladdr != NULL || dst1.reladdr2;
 661    num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
 662    num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
 663    num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
 664    num_reladdr += src3.reladdr != NULL || src3.reladdr2 != NULL;
 665
 666    reladdr_to_temp(ir, &src3, &num_reladdr);
 667    reladdr_to_temp(ir, &src2, &num_reladdr);
 668    reladdr_to_temp(ir, &src1, &num_reladdr);
 669    reladdr_to_temp(ir, &src0, &num_reladdr);
 670
 671    if (dst.reladdr || dst.reladdr2) {
 672       if (dst.reladdr)
 673          emit_arl(ir, address_reg, *dst.reladdr);
 674       if (dst.reladdr2)
 675          emit_arl(ir, address_reg2, *dst.reladdr2);
 676       num_reladdr--;
 677    }
 678    if (dst1.reladdr) {
 679       emit_arl(ir, address_reg, *dst1.reladdr);
 680       num_reladdr--;
 681    }
 682    assert(num_reladdr == 0);
 683
 684    /* inst->op has only 8 bits. */
 685    STATIC_ASSERT(TGSI_OPCODE_LAST <= 255);
 686
 687    inst->op = op;
 688    inst->info = tgsi_get_opcode_info(op);
 689    inst->dst[0] = dst;
 690    inst->dst[1] = dst1;
 691    inst->src[0] = src0;
 692    inst->src[1] = src1;
 693    inst->src[2] = src2;
 694    inst->src[3] = src3;
 695    inst->is_64bit_expanded = false;
 696    inst->ir = ir;
 697    inst->dead_mask = 0;
 698    inst->tex_offsets = NULL;
 699    inst->tex_offset_num_offset = 0;
 700    inst->saturate = 0;
 701    inst->tex_shadow = 0;
 702    /* default to float, for paths where this is not initialized
 703     * (since 0==UINT which is likely wrong):
 704     */
 705    inst->tex_type = GLSL_TYPE_FLOAT;
 706
 707    /* Update indirect addressing status used by TGSI */
 708    if (dst.reladdr || dst.reladdr2) {
 709       switch(dst.file) {
 710       case PROGRAM_STATE_VAR:
 711       case PROGRAM_CONSTANT:
 712       case PROGRAM_UNIFORM:
 713          this->indirect_addr_consts = true;
 714          break;
 715       case PROGRAM_IMMEDIATE:
 716          assert(!"immediates should not have indirect addressing");
 717          break;
 718       default:
 719          break;
 720       }
 721    }
 722    else {
 723       for (i = 0; i < 4; i++) {
 724          if(inst->src[i].reladdr) {
 725             switch(inst->src[i].file) {
 726             case PROGRAM_STATE_VAR:
 727             case PROGRAM_CONSTANT:
 728             case PROGRAM_UNIFORM:
 729                this->indirect_addr_consts = true;
 730                break;
 731             case PROGRAM_IMMEDIATE:
 732                assert(!"immediates should not have indirect addressing");
 733                break;
 734             default:
 735                break;
 736             }
 737          }
 738       }
 739    }
 740
 741    /*
 742     * This section contains the double processing.
 743     * GLSL just represents doubles as single channel values,
 744     * however most HW and TGSI represent doubles as pairs of register channels.
 745     *
 746     * so we have to fixup destination writemask/index and src swizzle/indexes.
 747     * dest writemasks need to translate from single channel write mask
 748     * to a dual-channel writemask, but also need to modify the index,
 749     * if we are touching the Z,W fields in the pre-translated writemask.
 750     *
 751     * src channels have similiar index modifications along with swizzle
 752     * changes to we pick the XY, ZW pairs from the correct index.
 753     *
 754     * GLSL [0].x -> TGSI [0].xy
 755     * GLSL [0].y -> TGSI [0].zw
 756     * GLSL [0].z -> TGSI [1].xy
 757     * GLSL [0].w -> TGSI [1].zw
 758     */
 759    for (j = 0; j < 2; j++) {
 760       dst_is_64bit[j] = glsl_base_type_is_64bit(inst->dst[j].type);
 761       if (!dst_is_64bit[j] && inst->dst[j].file == PROGRAM_OUTPUT && inst->dst[j].type == GLSL_TYPE_ARRAY) {
 762          enum glsl_base_type type = find_array_type(this->outputs, this->num_outputs, inst->dst[j].array_id);
 763          if (glsl_base_type_is_64bit(type))
 764             dst_is_64bit[j] = true;
 765       }
 766    }
 767
 768    if (dst_is_64bit[0] || dst_is_64bit[1] ||
 769        glsl_base_type_is_64bit(inst->src[0].type)) {
 770       glsl_to_tgsi_instruction *dinst = NULL;
 771       int initial_src_swz[4], initial_src_idx[4];
 772       int initial_dst_idx[2], initial_dst_writemask[2];
 773       /* select the writemask for dst0 or dst1 */
 774       unsigned writemask = inst->dst[1].file == PROGRAM_UNDEFINED ? inst->dst[0].writemask : inst->dst[1].writemask;
 775
 776       /* copy out the writemask, index and swizzles for all src/dsts. */
 777       for (j = 0; j < 2; j++) {
 778          initial_dst_writemask[j] = inst->dst[j].writemask;
 779          initial_dst_idx[j] = inst->dst[j].index;
 780       }
 781
 782       for (j = 0; j < 4; j++) {
 783          initial_src_swz[j] = inst->src[j].swizzle;
 784          initial_src_idx[j] = inst->src[j].index;
 785       }
 786
 787       /*
 788        * scan all the components in the dst writemask
 789        * generate an instruction for each of them if required.
 790        */
 791       st_src_reg addr;
 792       while (writemask) {
 793
 794          int i = u_bit_scan(&writemask);
 795
 796          /* before emitting the instruction, see if we have to adjust load / store
 797           * address */
 798          if (i > 1 && (inst->op == TGSI_OPCODE_LOAD || inst->op == TGSI_OPCODE_STORE) &&
 799              addr.file == PROGRAM_UNDEFINED) {
 800             /* We have to advance the buffer address by 16 */
 801             addr = get_temp(glsl_type::uint_type);
 802             emit_asm(ir, TGSI_OPCODE_UADD, st_dst_reg(addr),
 803                      inst->src[0], st_src_reg_for_int(16));
 804          }
 805
 806          /* first time use previous instruction */
 807          if (dinst == NULL) {
 808             dinst = inst;
 809          } else {
 810             /* create a new instructions for subsequent attempts */
 811             dinst = new(mem_ctx) glsl_to_tgsi_instruction();
 812             *dinst = *inst;
 813             dinst->next = NULL;
 814             dinst->prev = NULL;
 815          }
 816          this->instructions.push_tail(dinst);
 817          dinst->is_64bit_expanded = true;
 818
 819          /* modify the destination if we are splitting */
 820          for (j = 0; j < 2; j++) {
 821             if (dst_is_64bit[j]) {
 822                dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
 823                dinst->dst[j].index = initial_dst_idx[j];
 824                if (i > 1) {
 825                   if (dinst->op == TGSI_OPCODE_LOAD || dinst->op == TGSI_OPCODE_STORE)
 826                      dinst->src[0] = addr;
 827                   if (dinst->op != TGSI_OPCODE_STORE)
 828                      dinst->dst[j].index++;
 829                }
 830             } else {
 831                /* if we aren't writing to a double, just get the bit of the initial writemask
 832                   for this channel */
 833                dinst->dst[j].writemask = initial_dst_writemask[j] & (1 << i);
 834             }
 835          }
 836
 837          /* modify the src registers */
 838          for (j = 0; j < 4; j++) {
 839             int swz = GET_SWZ(initial_src_swz[j], i);
 840
 841             if (glsl_base_type_is_64bit(dinst->src[j].type)) {
 842                dinst->src[j].index = initial_src_idx[j];
 843                if (swz > 1) {
 844                   dinst->src[j].double_reg2 = true;
 845                   dinst->src[j].index++;
 846                }
 847
 848                if (swz & 1)
 849                   dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W);
 850                else
 851                   dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
 852
 853             } else {
 854                /* some opcodes are special case in what they use as sources
 855                   - [FUI]2D/[UI]2I64 is a float/[u]int src0, DLDEXP is integer src1 */
 856                if (op == TGSI_OPCODE_F2D || op == TGSI_OPCODE_U2D || op == TGSI_OPCODE_I2D ||
 857                    op == TGSI_OPCODE_I2I64 || op == TGSI_OPCODE_U2I64 ||
 858                    op == TGSI_OPCODE_DLDEXP ||
 859                    (op == TGSI_OPCODE_UCMP && dst_is_64bit[0])) {
 860                   dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
 861                }
 862             }
 863          }
 864       }
 865       inst = dinst;
 866    } else {
 867       this->instructions.push_tail(inst);
 868    }
 869
 870
 871    return inst;
 872 }
 873
 874 glsl_to_tgsi_instruction *
 875 glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 876                                st_dst_reg dst,
 877                                st_src_reg src0, st_src_reg src1,
 878                                st_src_reg src2, st_src_reg src3)
 879 {
 880    return emit_asm(ir, op, dst, undef_dst, src0, src1, src2, src3);
 881 }
 882
/**
 * Determines whether to use an integer, unsigned integer, float or double
 * opcode based on the operands and input opcode, and returns the selected
 * opcode.  Nothing is emitted here; the caller is responsible for that.
 *
 * \param op    generic TGSI opcode requested by the caller
 * \param dst   destination register (not consulted in this function)
 * \param src0  first source operand; its type drives the selection
 * \param src1  second source operand; its type drives the selection
 * \return the type-specific opcode, or \c op unchanged when no switch case
 *         below applies to it
 */
unsigned
glsl_to_tgsi_visitor::get_opcode(unsigned op,
                                 st_dst_reg dst,
                                 st_src_reg src0, st_src_reg src1)
{
   enum glsl_base_type type = GLSL_TYPE_FLOAT;

   /* MOV is returned unchanged; no operand type is inspected for it. */
   if (op == TGSI_OPCODE_MOV)
       return op;

   assert(src0.type != GLSL_TYPE_ARRAY);
   assert(src0.type != GLSL_TYPE_STRUCT);
   assert(src1.type != GLSL_TYPE_ARRAY);
   assert(src1.type != GLSL_TYPE_STRUCT);

   /* Pick the type used for opcode selection: resource instructions take it
    * from src1; otherwise double wins over float, float wins over integers,
    * and integer types are only considered with native_integers (with bool
    * treated as int).  Without any match the type stays GLSL_TYPE_FLOAT.
    */
   if (is_resource_instruction(op))
      type = src1.type;
   else if (src0.type == GLSL_TYPE_DOUBLE || src1.type == GLSL_TYPE_DOUBLE)
      type = GLSL_TYPE_DOUBLE;
   else if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
      type = GLSL_TYPE_FLOAT;
   else if (native_integers)
      type = src0.type == GLSL_TYPE_BOOL ? GLSL_TYPE_INT : src0.type;

/* case5: map generic opcode c to its float (f), int (i), uint (u) or
 * double (d) variant according to the selected type.
 */
#define case5(c, f, i, u, d)                    \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_DOUBLE)           \
         op = TGSI_OPCODE_##d; \
      else if (type == GLSL_TYPE_INT)       \
         op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) \
         op = TGSI_OPCODE_##u; \
      else \
         op = TGSI_OPCODE_##f; \
      break;

/* case4: like case5 but without a double variant. */
#define case4(c, f, i, u)                    \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_INT) \
         op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) \
         op = TGSI_OPCODE_##u; \
      else \
         op = TGSI_OPCODE_##f; \
      break;

/* Shorthands built on case4/case5.  case2iu passes LAST as the float
 * variant, so a non-integer type on such an opcode yields TGSI_OPCODE_LAST,
 * which the assert at the end of the function catches.
 */
#define case3(f, i, u)  case4(f, f, i, u)
#define case4d(f, i, u, d)  case5(f, f, i, u, d)
#define case3fid(f, i, d) case5(f, f, i, i, d)
#define case2fi(f, i)   case4(f, f, i, i)
#define case2iu(i, u)   case4(i, LAST, i, u)

/* casecomp: comparison opcodes.  Subroutine-typed operands use the int
 * variant.  For the remaining (float) case the f variant is used with
 * native_integers and the original opcode c is kept otherwise.
 */
#define casecomp(c, f, i, u, d)                   \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_DOUBLE) \
         op = TGSI_OPCODE_##d; \
      else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE)       \
         op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) \
         op = TGSI_OPCODE_##u; \
      else if (native_integers) \
         op = TGSI_OPCODE_##f; \
      else \
         op = TGSI_OPCODE_##c; \
      break;

   switch(op) {
      case3fid(ADD, UADD, DADD);
      case3fid(MUL, UMUL, DMUL);
      case3fid(MAD, UMAD, DMAD);
      case3fid(FMA, UMAD, DFMA);
      case3(DIV, IDIV, UDIV);
      case4d(MAX, IMAX, UMAX, DMAX);
      case4d(MIN, IMIN, UMIN, DMIN);
      case2iu(MOD, UMOD);

      casecomp(SEQ, FSEQ, USEQ, USEQ, DSEQ);
      casecomp(SNE, FSNE, USNE, USNE, DSNE);
      casecomp(SGE, FSGE, ISGE, USGE, DSGE);
      casecomp(SLT, FSLT, ISLT, USLT, DSLT);

      case2iu(ISHR, USHR);

      case3fid(SSG, ISSG, DSSG);

      case2iu(IBFE, UBFE);
      case2iu(IMSB, UMSB);
      case2iu(IMUL_HI, UMUL_HI);

      case3fid(SQRT, SQRT, DSQRT);

      case3fid(RCP, RCP, DRCP);
      case3fid(RSQ, RSQ, DRSQ);

      case3fid(FRC, FRC, DFRAC);
      case3fid(TRUNC, TRUNC, DTRUNC);
      case3fid(CEIL, CEIL, DCEIL);
      case3fid(FLR, FLR, DFLR);
      case3fid(ROUND, ROUND, DROUND);

      case2iu(ATOMIMAX, ATOMUMAX);
      case2iu(ATOMIMIN, ATOMUMIN);

      /* Any other opcode is passed through unchanged. */
      default: break;
   }

   /* TGSI_OPCODE_LAST is only produced by case2iu for a non-integer type. */
   assert(op != TGSI_OPCODE_LAST);
   return op;
}
 996
 997 glsl_to_tgsi_instruction *
 998 glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
 999                               st_dst_reg dst, st_src_reg src0, st_src_reg src1,
1000                               unsigned elements)
1001 {
1002    static const unsigned dot_opcodes[] = {
1003       TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
1004    };
1005
1006    return emit_asm(ir, dot_opcodes[elements - 2], dst, src0, src1);
1007 }
1008
1009 /**
1010  * Emits TGSI scalar opcodes to produce unique answers across channels.
1011  *
1012  * Some TGSI opcodes are scalar-only, like ARB_fp/vp.  The src X
1013  * channel determines the result across all channels.  So to do a vec4
1014  * of this operation, we want to emit a scalar per source channel used
1015  * to produce dest channels.
1016  */
1017 void
1018 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
1019                                   st_dst_reg dst,
1020                                   st_src_reg orig_src0, st_src_reg orig_src1)
1021 {
1022    int i, j;
1023    int done_mask = ~dst.writemask;
1024
1025    /* TGSI RCP is a scalar operation splatting results to all channels,
1026     * like ARB_fp/vp.  So emit as many RCPs as necessary to cover our
1027     * dst channels.
1028     */
1029    for (i = 0; i < 4; i++) {
1030       GLuint this_mask = (1 << i);
1031       st_src_reg src0 = orig_src0;
1032       st_src_reg src1 = orig_src1;
1033
1034       if (done_mask & this_mask)
1035          continue;
1036
1037       GLuint src0_swiz = GET_SWZ(src0.swizzle, i);
1038       GLuint src1_swiz = GET_SWZ(src1.swizzle, i);
1039       for (j = i + 1; j < 4; j++) {
1040          /* If there is another enabled component in the destination that is
1041           * derived from the same inputs, generate its value on this pass as
1042           * well.
1043           */
1044          if (!(done_mask & (1 << j)) &&
1045              GET_SWZ(src0.swizzle, j) == src0_swiz &&
1046              GET_SWZ(src1.swizzle, j) == src1_swiz) {
1047             this_mask |= (1 << j);
1048          }
1049       }
1050       src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
1051                                    src0_swiz, src0_swiz);
1052       src1.swizzle = MAKE_SWIZZLE4(src1_swiz, src1_swiz,
1053                                    src1_swiz, src1_swiz);
1054
1055       dst.writemask = this_mask;
1056       emit_asm(ir, op, dst, src0, src1);
1057       done_mask |= this_mask;
1058    }
1059 }
1060
1061 void
1062 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
1063                                   st_dst_reg dst, st_src_reg src0)
1064 {
1065    st_src_reg undef = undef_src;
1066
1067    undef.swizzle = SWIZZLE_XXXX;
1068
1069    emit_scalar(ir, op, dst, src0, undef);
1070 }
1071
1072 void
1073 glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
1074                                st_dst_reg dst, st_src_reg src0)
1075 {
1076    int op = TGSI_OPCODE_ARL;
1077
1078    if (src0.type == GLSL_TYPE_INT || src0.type == GLSL_TYPE_UINT)
1079       op = TGSI_OPCODE_UARL;
1080
1081    assert(dst.file == PROGRAM_ADDRESS);
1082    if (dst.index >= this->num_address_regs)
1083       this->num_address_regs = dst.index + 1;
1084
1085    emit_asm(NULL, op, dst, src0);
1086 }
1087
1088 int
1089 glsl_to_tgsi_visitor::add_constant(gl_register_file file,
1090                                    gl_constant_value values[8], int size, int datatype,
1091                                    uint16_t *swizzle_out)
1092 {
1093    if (file == PROGRAM_CONSTANT) {
1094       GLuint swizzle = swizzle_out ? *swizzle_out : 0;
1095       int result = _mesa_add_typed_unnamed_constant(this->prog->Parameters, values,
1096                                                     size, datatype, &swizzle);
1097       if (swizzle_out)
1098          *swizzle_out = swizzle;
1099       return result;
1100    }
1101
1102    assert(file == PROGRAM_IMMEDIATE);
1103
1104    int index = 0;
1105    immediate_storage *entry;
1106    int size32 = size * (datatype == GL_DOUBLE ? 2 : 1);
1107    int i;
1108
1109    /* Search immediate storage to see if we already have an identical
1110     * immediate that we can use instead of adding a duplicate entry.
1111     */
1112    foreach_in_list(immediate_storage, entry, &this->immediates) {
1113       immediate_storage *tmp = entry;
1114
1115       for (i = 0; i * 4 < size32; i++) {
1116          int slot_size = MIN2(size32 - (i * 4), 4);
1117          if (tmp->type != datatype || tmp->size32 != slot_size)
1118             break;
1119          if (memcmp(tmp->values, &values[i * 4],
1120                     slot_size * sizeof(gl_constant_value)))
1121             break;
1122
1123          /* Everything matches, keep going until the full size is matched */
1124          tmp = (immediate_storage *)tmp->next;
1125       }
1126
1127       /* The full value matched */
1128       if (i * 4 >= size32)
1129          return index;
1130
1131       index++;
1132    }
1133
1134    for (i = 0; i * 4 < size32; i++) {
1135       int slot_size = MIN2(size32 - (i * 4), 4);
1136       /* Add this immediate to the list. */
1137       entry = new(mem_ctx) immediate_storage(&values[i * 4], slot_size, datatype);
1138       this->immediates.push_tail(entry);
1139       this->num_immediates++;
1140    }
1141    return index;
1142 }
1143
1144 st_src_reg
1145 glsl_to_tgsi_visitor::st_src_reg_for_float(float val)
1146 {
1147    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_FLOAT);
1148    union gl_constant_value uval;
1149
1150    uval.f = val;
1151    src.index = add_constant(src.file, &uval, 1, GL_FLOAT, &src.swizzle);
1152
1153    return src;
1154 }
1155
1156 st_src_reg
1157 glsl_to_tgsi_visitor::st_src_reg_for_double(double val)
1158 {
1159    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_DOUBLE);
1160    union gl_constant_value uval[2];
1161
1162    memcpy(uval, &val, sizeof(uval));
1163    src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
1164    src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
1165    return src;
1166 }
1167
1168 st_src_reg
1169 glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
1170 {
1171    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT);
1172    union gl_constant_value uval;
1173
1174    assert(native_integers);
1175
1176    uval.i = val;
1177    src.index = add_constant(src.file, &uval, 1, GL_INT, &src.swizzle);
1178
1179    return src;
1180 }
1181
1182 st_src_reg
1183 glsl_to_tgsi_visitor::st_src_reg_for_type(enum glsl_base_type type, int val)
1184 {
1185    if (native_integers)
1186       return type == GLSL_TYPE_FLOAT ? st_src_reg_for_float(val) :
1187                                        st_src_reg_for_int(val);
1188    else
1189       return st_src_reg_for_float(val);
1190 }
1191
1192 static int
1193 attrib_type_size(const struct glsl_type *type, bool is_vs_input)
1194 {
1195    return st_glsl_attrib_type_size(type, is_vs_input);
1196 }
1197
1198 static int
1199 type_size(const struct glsl_type *type)
1200 {
1201    return st_glsl_type_size(type);
1202 }
1203
1204 /**
1205  * If the given GLSL type is an array or matrix or a structure containing
1206  * an array/matrix member, return true.  Else return false.
1207  *
1208  * This is used to determine which kind of temp storage (PROGRAM_TEMPORARY
1209  * or PROGRAM_ARRAY) should be used for variables of this type.  Anytime
1210  * we have an array that might be indexed with a variable, we need to use
1211  * the later storage type.
1212  */
1213 static bool
1214 type_has_array_or_matrix(const glsl_type *type)
1215 {
1216    if (type->is_array() || type->is_matrix())
1217       return true;
1218
1219    if (type->is_record()) {
1220       for (unsigned i = 0; i < type->length; i++) {
1221          if (type_has_array_or_matrix(type->fields.structure[i].type)) {
1222             return true;
1223          }
1224       }
1225    }
1226
1227    return false;
1228 }
1229
1230
1231 /**
1232  * In the initial pass of codegen, we assign temporary numbers to
1233  * intermediate results.  (not SSA -- variable assignments will reuse
1234  * storage).
1235  */
1236 st_src_reg
1237 glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
1238 {
1239    st_src_reg src;
1240
1241    src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
1242    src.reladdr = NULL;
1243    src.negate = 0;
1244    src.abs = 0;
1245
1246    if (!options->EmitNoIndirectTemp && type_has_array_or_matrix(type)) {
1247       if (next_array >= max_num_arrays) {
1248          max_num_arrays += 32;
1249          array_sizes = (unsigned*)
1250             realloc(array_sizes, sizeof(array_sizes[0]) * max_num_arrays);
1251       }
1252
1253       src.file = PROGRAM_ARRAY;
1254       src.index = 0;
1255       src.array_id = next_array + 1;
1256       array_sizes[next_array] = type_size(type);
1257       ++next_array;
1258
1259    } else {
1260       src.file = PROGRAM_TEMPORARY;
1261       src.index = next_temp;
1262       next_temp += type_size(type);
1263    }
1264
1265    if (type->is_array() || type->is_record()) {
1266       src.swizzle = SWIZZLE_NOOP;
1267    } else {
1268       src.swizzle = swizzle_for_size(type->vector_elements);
1269    }
1270
1271    return src;
1272 }
1273
1274 variable_storage *
1275 glsl_to_tgsi_visitor::find_variable_storage(ir_variable *var)
1276 {
1277
1278    foreach_in_list(variable_storage, entry, &this->variables) {
1279       if (entry->var == var)
1280          return entry;
1281    }
1282
1283    return NULL;
1284 }
1285
1286 void
1287 glsl_to_tgsi_visitor::visit(ir_variable *ir)
1288 {
1289    if (strcmp(ir->name, "gl_FragCoord") == 0) {
1290       this->prog->OriginUpperLeft = ir->data.origin_upper_left;
1291       this->prog->PixelCenterInteger = ir->data.pixel_center_integer;
1292    }
1293
1294    if (ir->data.mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
1295       unsigned int i;
1296       const ir_state_slot *const slots = ir->get_state_slots();
1297       assert(slots != NULL);
1298
1299       /* Check if this statevar's setup in the STATE file exactly
1300        * matches how we'll want to reference it as a
1301        * struct/array/whatever.  If not, then we need to move it into
1302        * temporary storage and hope that it'll get copy-propagated
1303        * out.
1304        */
1305       for (i = 0; i < ir->get_num_state_slots(); i++) {
1306          if (slots[i].swizzle != SWIZZLE_XYZW) {
1307             break;
1308          }
1309       }
1310
1311       variable_storage *storage;
1312       st_dst_reg dst;
1313       if (i == ir->get_num_state_slots()) {
1314          /* We'll set the index later. */
1315          storage = new(mem_ctx) variable_storage(ir, PROGRAM_STATE_VAR, -1);
1316          this->variables.push_tail(storage);
1317
1318          dst = undef_dst;
1319       } else {
1320          /* The variable_storage constructor allocates slots based on the size
1321           * of the type.  However, this had better match the number of state
1322           * elements that we're going to copy into the new temporary.
1323           */
1324          assert((int) ir->get_num_state_slots() == type_size(ir->type));
1325
1326          dst = st_dst_reg(get_temp(ir->type));
1327
1328          storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index,
1329                                                  dst.array_id);
1330
1331          this->variables.push_tail(storage);
1332       }
1333
1334
1335       for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1336          int index = _mesa_add_state_reference(this->prog->Parameters,
1337                                                (gl_state_index *)slots[i].tokens);
1338
1339          if (storage->file == PROGRAM_STATE_VAR) {
1340             if (storage->index == -1) {
1341                storage->index = index;
1342             } else {
1343                assert(index == storage->index + (int)i);
1344             }
1345          } else {
1346             /* We use GLSL_TYPE_FLOAT here regardless of the actual type of
1347              * the data being moved since MOV does not care about the type of
1348              * data it is moving, and we don't want to declare registers with
1349              * array or struct types.
1350              */
1351             st_src_reg src(PROGRAM_STATE_VAR, index, GLSL_TYPE_FLOAT);
1352             src.swizzle = slots[i].swizzle;
1353             emit_asm(ir, TGSI_OPCODE_MOV, dst, src);
1354             /* even a float takes up a whole vec4 reg in a struct/array. */
1355             dst.index++;
1356          }
1357       }
1358
1359       if (storage->file == PROGRAM_TEMPORARY &&
1360           dst.index != storage->index + (int) ir->get_num_state_slots()) {
1361          fail_link(this->shader_program,
1362                   "failed to load builtin uniform `%s'  (%d/%d regs loaded)\n",
1363                   ir->name, dst.index - storage->index,
1364                   type_size(ir->type));
1365       }
1366    }
1367 }
1368
1369 void
1370 glsl_to_tgsi_visitor::visit(ir_loop *ir)
1371 {
1372    emit_asm(NULL, TGSI_OPCODE_BGNLOOP);
1373
1374    visit_exec_list(&ir->body_instructions, this);
1375
1376    emit_asm(NULL, TGSI_OPCODE_ENDLOOP);
1377 }
1378
1379 void
1380 glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
1381 {
1382    switch (ir->mode) {
1383    case ir_loop_jump::jump_break:
1384       emit_asm(NULL, TGSI_OPCODE_BRK);
1385       break;
1386    case ir_loop_jump::jump_continue:
1387       emit_asm(NULL, TGSI_OPCODE_CONT);
1388       break;
1389    }
1390 }
1391
1392
1393 void
1394 glsl_to_tgsi_visitor::visit(ir_function_signature *ir)
1395 {
1396    assert(0);
1397    (void)ir;
1398 }
1399
1400 void
1401 glsl_to_tgsi_visitor::visit(ir_function *ir)
1402 {
1403    /* Ignore function bodies other than main() -- we shouldn't see calls to
1404     * them since they should all be inlined before we get to glsl_to_tgsi.
1405     */
1406    if (strcmp(ir->name, "main") == 0) {
1407       const ir_function_signature *sig;
1408       exec_list empty;
1409
1410       sig = ir->matching_signature(NULL, &empty, false);
1411
1412       assert(sig);
1413
1414       foreach_in_list(ir_instruction, ir, &sig->body) {
1415          ir->accept(this);
1416       }
1417    }
1418 }
1419
1420 bool
1421 glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
1422 {
1423    int nonmul_operand = 1 - mul_operand;
1424    st_src_reg a, b, c;
1425    st_dst_reg result_dst;
1426
1427    ir_expression *expr = ir->operands[mul_operand]->as_expression();
1428    if (!expr || expr->operation != ir_binop_mul)
1429       return false;
1430
1431    expr->operands[0]->accept(this);
1432    a = this->result;
1433    expr->operands[1]->accept(this);
1434    b = this->result;
1435    ir->operands[nonmul_operand]->accept(this);
1436    c = this->result;
1437
1438    this->result = get_temp(ir->type);
1439    result_dst = st_dst_reg(this->result);
1440    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1441    emit_asm(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
1442
1443    return true;
1444 }
1445
1446 /**
1447  * Emit MAD(a, -b, a) instead of AND(a, NOT(b))
1448  *
1449  * The logic values are 1.0 for true and 0.0 for false.  Logical-and is
1450  * implemented using multiplication, and logical-or is implemented using
1451  * addition.  Logical-not can be implemented as (true - x), or (1.0 - x).
1452  * As result, the logical expression (a & !b) can be rewritten as:
1453  *
1454  *     - a * !b
1455  *     - a * (1 - b)
1456  *     - (a * 1) - (a * b)
1457  *     - a + -(a * b)
1458  *     - a + (a * -b)
1459  *
1460  * This final expression can be implemented as a single MAD(a, -b, a)
1461  * instruction.
1462  */
1463 bool
1464 glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir, int try_operand)
1465 {
1466    const int other_operand = 1 - try_operand;
1467    st_src_reg a, b;
1468
1469    ir_expression *expr = ir->operands[try_operand]->as_expression();
1470    if (!expr || expr->operation != ir_unop_logic_not)
1471       return false;
1472
1473    ir->operands[other_operand]->accept(this);
1474    a = this->result;
1475    expr->operands[0]->accept(this);
1476    b = this->result;
1477
1478    b.negate = ~b.negate;
1479
1480    this->result = get_temp(ir->type);
1481    emit_asm(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
1482
1483    return true;
1484 }
1485
1486 void
1487 glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
1488                                       st_src_reg *reg, int *num_reladdr)
1489 {
1490    if (!reg->reladdr && !reg->reladdr2)
1491       return;
1492
1493    if (reg->reladdr) emit_arl(ir, address_reg, *reg->reladdr);
1494    if (reg->reladdr2) emit_arl(ir, address_reg2, *reg->reladdr2);
1495
1496    if (*num_reladdr != 1) {
1497       st_src_reg temp = get_temp(reg->type == GLSL_TYPE_DOUBLE ? glsl_type::dvec4_type : glsl_type::vec4_type);
1498
1499       emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
1500       *reg = temp;
1501    }
1502
1503    (*num_reladdr)--;
1504 }
1505
1506 void
1507 glsl_to_tgsi_visitor::visit(ir_expression *ir)
1508 {
1509    st_src_reg op[ARRAY_SIZE(ir->operands)];
1510
1511    /* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
1512     */
1513    if (ir->operation == ir_binop_add) {
1514       if (try_emit_mad(ir, 1))
1515          return;
1516       if (try_emit_mad(ir, 0))
1517          return;
1518    }
1519
1520    /* Quick peephole: Emit OPCODE_MAD(-a, -b, a) instead of AND(a, NOT(b))
1521     */
1522    if (!native_integers && ir->operation == ir_binop_logic_and) {
1523       if (try_emit_mad_for_and_not(ir, 1))
1524          return;
1525       if (try_emit_mad_for_and_not(ir, 0))
1526          return;
1527    }
1528
1529    if (ir->operation == ir_quadop_vector)
1530       assert(!"ir_quadop_vector should have been lowered");
1531
1532    for (unsigned int operand = 0; operand < ir->get_num_operands(); operand++) {
1533       this->result.file = PROGRAM_UNDEFINED;
1534       ir->operands[operand]->accept(this);
1535       if (this->result.file == PROGRAM_UNDEFINED) {
1536          printf("Failed to get tree for expression operand:\n");
1537          ir->operands[operand]->print();
1538          printf("\n");
1539          exit(1);
1540       }
1541       op[operand] = this->result;
1542
1543       /* Matrix expression operands should have been broken down to vector
1544        * operations already.
1545        */
1546       assert(!ir->operands[operand]->type->is_matrix());
1547    }
1548
1549    visit_expression(ir, op);
1550 }
1551
1552 /* The non-recursive part of the expression visitor lives in a separate
1553  * function and should be prevented from being inlined, to avoid a stack
1554  * explosion when deeply nested expressions are visited.
1555  */
1556 void
1557 glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
1558 {
1559    st_src_reg result_src;
1560    st_dst_reg result_dst;
1561
1562    int vector_elements = ir->operands[0]->type->vector_elements;
1563    if (ir->operands[1]) {
1564       vector_elements = MAX2(vector_elements,
1565                              ir->operands[1]->type->vector_elements);
1566    }
1567
1568    this->result.file = PROGRAM_UNDEFINED;
1569
1570    /* Storage for our result.  Ideally for an assignment we'd be using
1571     * the actual storage for the result here, instead.
1572     */
1573    result_src = get_temp(ir->type);
1574    /* convenience for the emit functions below. */
1575    result_dst = st_dst_reg(result_src);
1576    /* Limit writes to the channels that will be used by result_src later.
1577     * This does limit this temp's use as a temporary for multi-instruction
1578     * sequences.
1579     */
1580    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1581
1582    switch (ir->operation) {
1583    case ir_unop_logic_not:
1584       if (result_dst.type != GLSL_TYPE_FLOAT)
1585          emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1586       else {
1587          /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
1588           * older GPUs implement SEQ using multiple instructions (i915 uses two
1589           * SGE instructions and a MUL instruction).  Since our logic values are
1590           * 0.0 and 1.0, 1-x also implements !x.
1591           */
1592          op[0].negate = ~op[0].negate;
1593          emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
1594       }
1595       break;
1596    case ir_unop_neg:
1597       if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
1598          emit_asm(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
1599       else if (result_dst.type == GLSL_TYPE_DOUBLE)
1600          emit_asm(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
1601       else {
1602          op[0].negate = ~op[0].negate;
1603          result_src = op[0];
1604       }
1605       break;
1606    case ir_unop_subroutine_to_int:
1607       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1608       break;
1609    case ir_unop_abs:
1610       if (result_dst.type == GLSL_TYPE_FLOAT)
1611          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0].get_abs());
1612       else if (result_dst.type == GLSL_TYPE_DOUBLE)
1613          emit_asm(ir, TGSI_OPCODE_DABS, result_dst, op[0]);
1614       else
1615          emit_asm(ir, TGSI_OPCODE_IABS, result_dst, op[0]);
1616       break;
1617    case ir_unop_sign:
1618       emit_asm(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
1619       break;
1620    case ir_unop_rcp:
1621       emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
1622       break;
1623
1624    case ir_unop_exp2:
1625       emit_scalar(ir, TGSI_OPCODE_EX2, result_dst, op[0]);
1626       break;
1627    case ir_unop_exp:
1628    case ir_unop_log:
1629       assert(!"not reached: should be handled by ir_explog_to_explog2");
1630       break;
1631    case ir_unop_log2:
1632       emit_scalar(ir, TGSI_OPCODE_LG2, result_dst, op[0]);
1633       break;
1634    case ir_unop_sin:
1635       emit_scalar(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1636       break;
1637    case ir_unop_cos:
1638       emit_scalar(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1639       break;
1640    case ir_unop_saturate: {
1641       glsl_to_tgsi_instruction *inst;
1642       inst = emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1643       inst->saturate = true;
1644       break;
1645    }
1646
1647    case ir_unop_dFdx:
1648    case ir_unop_dFdx_coarse:
1649       emit_asm(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
1650       break;
1651    case ir_unop_dFdx_fine:
1652       emit_asm(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
1653       break;
1654    case ir_unop_dFdy:
1655    case ir_unop_dFdy_coarse:
1656    case ir_unop_dFdy_fine:
1657    {
1658       /* The X component contains 1 or -1 depending on whether the framebuffer
1659        * is a FBO or the window system buffer, respectively.
1660        * It is then multiplied with the source operand of DDY.
1661        */
1662       static const gl_state_index transform_y_state[STATE_LENGTH]
1663          = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
1664
1665       unsigned transform_y_index =
1666          _mesa_add_state_reference(this->prog->Parameters,
1667                                    transform_y_state);
1668
1669       st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
1670                                           transform_y_index,
1671                                           glsl_type::vec4_type);
1672       transform_y.swizzle = SWIZZLE_XXXX;
1673
1674       st_src_reg temp = get_temp(glsl_type::vec4_type);
1675
1676       emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
1677       emit_asm(ir, ir->operation == ir_unop_dFdy_fine ?
1678            TGSI_OPCODE_DDY_FINE : TGSI_OPCODE_DDY, result_dst, temp);
1679       break;
1680    }
1681
1682    case ir_unop_frexp_sig:
1683       emit_asm(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
1684       break;
1685
1686    case ir_unop_frexp_exp:
1687       emit_asm(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
1688       break;
1689
1690    case ir_unop_noise: {
1691       /* At some point, a motivated person could add a better
1692        * implementation of noise.  Currently not even the nvidia
1693        * binary drivers do anything more than this.  In any case, the
1694        * place to do this is in the GL state tracker, not the poor
1695        * driver.
1696        */
1697       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
1698       break;
1699    }
1700
1701    case ir_binop_add:
1702       emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1703       break;
1704    case ir_binop_sub:
1705       emit_asm(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
1706       break;
1707
1708    case ir_binop_mul:
1709       emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1710       break;
1711    case ir_binop_div:
1712       if (result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_DOUBLE)
1713          assert(!"not reached: should be handled by ir_div_to_mul_rcp");
1714       else
1715          emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
1716       break;
1717    case ir_binop_mod:
1718       if (result_dst.type == GLSL_TYPE_FLOAT)
1719          assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1720       else
1721          emit_asm(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
1722       break;
1723
1724    case ir_binop_less:
1725       emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
1726       break;
1727    case ir_binop_greater:
1728       emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
1729       break;
1730    case ir_binop_lequal:
1731       emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
1732       break;
1733    case ir_binop_gequal:
1734       emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
1735       break;
1736    case ir_binop_equal:
1737       emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1738       break;
1739    case ir_binop_nequal:
1740       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1741       break;
1742    case ir_binop_all_equal:
1743       /* "==" operator producing a scalar boolean. */
1744       if (ir->operands[0]->type->is_vector() ||
1745           ir->operands[1]->type->is_vector()) {
1746          st_src_reg temp = get_temp(native_integers ?
1747                                     glsl_type::uvec4_type :
1748                                     glsl_type::vec4_type);
1749
1750          if (native_integers) {
1751             st_dst_reg temp_dst = st_dst_reg(temp);
1752             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1753
1754             if (ir->operands[0]->type->is_boolean() &&
1755                 ir->operands[1]->as_constant() &&
1756                 ir->operands[1]->as_constant()->is_one()) {
1757                emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1758             } else {
1759                emit_asm(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
1760             }
1761
1762             /* Emit 1-3 AND operations to combine the SEQ results. */
1763             switch (ir->operands[0]->type->vector_elements) {
1764             case 2:
1765                break;
1766             case 3:
1767                temp_dst.writemask = WRITEMASK_Y;
1768                temp1.swizzle = SWIZZLE_YYYY;
1769                temp2.swizzle = SWIZZLE_ZZZZ;
1770                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1771                break;
1772             case 4:
1773                temp_dst.writemask = WRITEMASK_X;
1774                temp1.swizzle = SWIZZLE_XXXX;
1775                temp2.swizzle = SWIZZLE_YYYY;
1776                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1777                temp_dst.writemask = WRITEMASK_Y;
1778                temp1.swizzle = SWIZZLE_ZZZZ;
1779                temp2.swizzle = SWIZZLE_WWWW;
1780                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1781             }
1782
1783             temp1.swizzle = SWIZZLE_XXXX;
1784             temp2.swizzle = SWIZZLE_YYYY;
1785             emit_asm(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
1786          } else {
1787             emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1788
1789             /* After the dot-product, the value will be an integer on the
1790              * range [0,4].  Zero becomes 1.0, and positive values become zero.
1791              */
1792             emit_dp(ir, result_dst, temp, temp, vector_elements);
1793
1794             /* Negating the result of the dot-product gives values on the range
1795              * [-4, 0].  Zero becomes 1.0, and negative values become zero.
1796              * This is achieved using SGE.
1797              */
1798             st_src_reg sge_src = result_src;
1799             sge_src.negate = ~sge_src.negate;
1800             emit_asm(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
1801          }
1802       } else {
1803          emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1804       }
1805       break;
1806    case ir_binop_any_nequal:
1807       /* "!=" operator producing a scalar boolean. */
1808       if (ir->operands[0]->type->is_vector() ||
1809           ir->operands[1]->type->is_vector()) {
1810          st_src_reg temp = get_temp(native_integers ?
1811                                     glsl_type::uvec4_type :
1812                                     glsl_type::vec4_type);
1813          if (ir->operands[0]->type->is_boolean() &&
1814              ir->operands[1]->as_constant() &&
1815              ir->operands[1]->as_constant()->is_zero()) {
1816             emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1817          } else {
1818             emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1819          }
1820
1821          if (native_integers) {
1822             st_dst_reg temp_dst = st_dst_reg(temp);
1823             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1824
1825             /* Emit 1-3 OR operations to combine the SNE results. */
1826             switch (ir->operands[0]->type->vector_elements) {
1827             case 2:
1828                break;
1829             case 3:
1830                temp_dst.writemask = WRITEMASK_Y;
1831                temp1.swizzle = SWIZZLE_YYYY;
1832                temp2.swizzle = SWIZZLE_ZZZZ;
1833                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1834                break;
1835             case 4:
1836                temp_dst.writemask = WRITEMASK_X;
1837                temp1.swizzle = SWIZZLE_XXXX;
1838                temp2.swizzle = SWIZZLE_YYYY;
1839                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1840                temp_dst.writemask = WRITEMASK_Y;
1841                temp1.swizzle = SWIZZLE_ZZZZ;
1842                temp2.swizzle = SWIZZLE_WWWW;
1843                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1844             }
1845
1846             temp1.swizzle = SWIZZLE_XXXX;
1847             temp2.swizzle = SWIZZLE_YYYY;
1848             emit_asm(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
1849          } else {
1850             /* After the dot-product, the value will be an integer on the
1851              * range [0,4].  Zero stays zero, and positive values become 1.0.
1852              */
1853             glsl_to_tgsi_instruction *const dp =
1854                   emit_dp(ir, result_dst, temp, temp, vector_elements);
1855             if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1856                /* The clamping to [0,1] can be done for free in the fragment
1857                 * shader with a saturate.
1858                 */
1859                dp->saturate = true;
1860             } else {
1861                /* Negating the result of the dot-product gives values on the range
1862                 * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
1863                 * achieved using SLT.
1864                 */
1865                st_src_reg slt_src = result_src;
1866                slt_src.negate = ~slt_src.negate;
1867                emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
1868             }
1869          }
1870       } else {
1871          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1872       }
1873       break;
1874
1875    case ir_binop_logic_xor:
1876       if (native_integers)
1877          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1878       else
1879          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1880       break;
1881
1882    case ir_binop_logic_or: {
1883       if (native_integers) {
1884          /* If integers are used as booleans, we can use an actual "or"
1885           * instruction.
1886           */
1887          assert(native_integers);
1888          emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1889       } else {
1890          /* After the addition, the value will be an integer on the
1891           * range [0,2].  Zero stays zero, and positive values become 1.0.
1892           */
1893          glsl_to_tgsi_instruction *add =
1894             emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1895          if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1896             /* The clamping to [0,1] can be done for free in the fragment
1897              * shader with a saturate if floats are being used as boolean values.
1898              */
1899             add->saturate = true;
1900          } else {
1901             /* Negating the result of the addition gives values on the range
1902              * [-2, 0].  Zero stays zero, and negative values become 1.0.  This
1903              * is achieved using SLT.
1904              */
1905             st_src_reg slt_src = result_src;
1906             slt_src.negate = ~slt_src.negate;
1907             emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
1908          }
1909       }
1910       break;
1911    }
1912
1913    case ir_binop_logic_and:
1914       /* If native integers are disabled, the bool args are stored as float 0.0
1915        * or 1.0, so "mul" gives us "and".  If they're enabled, just use the
1916        * actual AND opcode.
1917        */
1918       if (native_integers)
1919          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1920       else
1921          emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1922       break;
1923
1924    case ir_binop_dot:
1925       assert(ir->operands[0]->type->is_vector());
1926       assert(ir->operands[0]->type == ir->operands[1]->type);
1927       emit_dp(ir, result_dst, op[0], op[1],
1928               ir->operands[0]->type->vector_elements);
1929       break;
1930
1931    case ir_unop_sqrt:
1932       if (have_sqrt) {
1933          emit_scalar(ir, TGSI_OPCODE_SQRT, result_dst, op[0]);
1934       } else {
1935          /* This is the only instruction sequence that makes the game "Risen"
1936           * render correctly. ABS is not required for the game, but since GLSL
1937           * declares negative values as "undefined", allowing us to do whatever
1938           * we want, I choose to use ABS to match DX9 and pre-GLSL RSQ
1939           * behavior.
1940           */
1941          emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0].get_abs());
1942          emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, result_src);
1943       }
1944       break;
1945    case ir_unop_rsq:
1946       emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
1947       break;
1948    case ir_unop_i2f:
1949       if (native_integers) {
1950          emit_asm(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
1951          break;
1952       }
1953       /* fallthrough to next case otherwise */
1954    case ir_unop_b2f:
1955       if (native_integers) {
1956          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
1957          break;
1958       }
1959       /* fallthrough to next case otherwise */
1960    case ir_unop_i2u:
1961    case ir_unop_u2i:
1962       /* Converting between signed and unsigned integers is a no-op. */
1963       result_src = op[0];
1964       result_src.type = result_dst.type;
1965       break;
1966    case ir_unop_b2i:
1967       if (native_integers) {
1968          /* Booleans are stored as integers using ~0 for true and 0 for false.
1969           * GLSL requires that int(bool) return 1 for true and 0 for false.
1970           * This conversion is done with AND, but it could be done with NEG.
1971           */
1972          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
1973       } else {
1974          /* Booleans and integers are both stored as floats when native
1975           * integers are disabled.
1976           */
1977          result_src = op[0];
1978       }
1979       break;
1980    case ir_unop_f2i:
1981       if (native_integers)
1982          emit_asm(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
1983       else
1984          emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1985       break;
1986    case ir_unop_f2u:
1987       if (native_integers)
1988          emit_asm(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
1989       else
1990          emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1991       break;
1992    case ir_unop_bitcast_f2i:
1993    case ir_unop_bitcast_f2u:
1994       /* Make sure we don't propagate the negate modifier to integer opcodes. */
1995       if (op[0].negate || op[0].abs)
1996          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1997       else
1998          result_src = op[0];
1999       result_src.type = ir->operation == ir_unop_bitcast_f2i ? GLSL_TYPE_INT :
2000                                                                GLSL_TYPE_UINT;
2001       break;
2002    case ir_unop_bitcast_i2f:
2003    case ir_unop_bitcast_u2f:
2004       result_src = op[0];
2005       result_src.type = GLSL_TYPE_FLOAT;
2006       break;
2007    case ir_unop_f2b:
2008       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
2009       break;
2010    case ir_unop_d2b:
2011       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_double(0.0));
2012       break;
2013    case ir_unop_i2b:
2014       if (native_integers)
2015          emit_asm(ir, TGSI_OPCODE_USNE, result_dst, op[0], st_src_reg_for_int(0));
2016       else
2017          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
2018       break;
2019    case ir_unop_trunc:
2020       emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
2021       break;
2022    case ir_unop_ceil:
2023       emit_asm(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
2024       break;
2025    case ir_unop_floor:
2026       emit_asm(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
2027       break;
2028    case ir_unop_round_even:
2029       emit_asm(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
2030       break;
2031    case ir_unop_fract:
2032       emit_asm(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
2033       break;
2034
2035    case ir_binop_min:
2036       emit_asm(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
2037       break;
2038    case ir_binop_max:
2039       emit_asm(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
2040       break;
2041    case ir_binop_pow:
2042       emit_scalar(ir, TGSI_OPCODE_POW, result_dst, op[0], op[1]);
2043       break;
2044
2045    case ir_unop_bit_not:
2046       if (native_integers) {
2047          emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
2048          break;
2049       }
2050    case ir_unop_u2f:
2051       if (native_integers) {
2052          emit_asm(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
2053          break;
2054       }
2055    case ir_binop_lshift:
2056       if (native_integers) {
2057          emit_asm(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
2058          break;
2059       }
2060    case ir_binop_rshift:
2061       if (native_integers) {
2062          emit_asm(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
2063          break;
2064       }
2065    case ir_binop_bit_and:
2066       if (native_integers) {
2067          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
2068          break;
2069       }
2070    case ir_binop_bit_xor:
2071       if (native_integers) {
2072          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
2073          break;
2074       }
2075    case ir_binop_bit_or:
2076       if (native_integers) {
2077          emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
2078          break;
2079       }
2080
2081       assert(!"GLSL 1.30 features unsupported");
2082       break;
2083
2084    case ir_binop_ubo_load: {
2085       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
2086       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
2087       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
2088       unsigned const_block = const_uniform_block ? const_uniform_block->value.u[0] + 1 : 0;
2089       st_src_reg index_reg = get_temp(glsl_type::uint_type);
2090       st_src_reg cbuf;
2091
2092       cbuf.type = ir->type->base_type;
2093       cbuf.file = PROGRAM_CONSTANT;
2094       cbuf.index = 0;
2095       cbuf.reladdr = NULL;
2096       cbuf.negate = 0;
2097       cbuf.abs = 0;
2098
2099       assert(ir->type->is_vector() || ir->type->is_scalar());
2100
2101       if (const_offset_ir) {
2102          /* Constant index into constant buffer */
2103          cbuf.reladdr = NULL;
2104          cbuf.index = const_offset / 16;
2105       }
2106       else {
2107          ir_expression *offset_expr = ir->operands[1]->as_expression();
2108          st_src_reg offset = op[1];
2109
2110          /* The OpenGL spec is written in such a way that accesses with
2111           * non-constant offset are almost always vec4-aligned. The only
2112           * exception to this are members of structs in arrays of structs:
2113           * each struct in an array of structs is at least vec4-aligned,
2114           * but single-element and [ui]vec2 members of the struct may be at
2115           * an offset that is not a multiple of 16 bytes.
2116           *
2117           * Here, we extract that offset, relying on previous passes to always
2118           * generate offset expressions of the form (+ expr constant_offset).
2119           *
2120           * Note that the std430 layout, which allows more cases of alignment
2121           * less than vec4 in arrays, is not supported for uniform blocks, so
2122           * we do not have to deal with it here.
2123           */
2124          if (offset_expr && offset_expr->operation == ir_binop_add) {
2125             const_offset_ir = offset_expr->operands[1]->as_constant();
2126             if (const_offset_ir) {
2127                const_offset = const_offset_ir->value.u[0];
2128                cbuf.index = const_offset / 16;
2129                offset_expr->operands[0]->accept(this);
2130                offset = this->result;
2131             }
2132          }
2133
2134          /* Relative/variable index into constant buffer */
2135          emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), offset,
2136               st_src_reg_for_int(4));
2137          cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
2138          memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
2139       }
2140
2141       if (const_uniform_block) {
2142          /* Constant constant buffer */
2143          cbuf.reladdr2 = NULL;
2144          cbuf.index2D = const_block;
2145          cbuf.has_index2 = true;
2146       }
2147       else {
2148          /* Relative/variable constant buffer */
2149          cbuf.reladdr2 = ralloc(mem_ctx, st_src_reg);
2150          cbuf.index2D = 1;
2151          memcpy(cbuf.reladdr2, &op[0], sizeof(st_src_reg));
2152          cbuf.has_index2 = true;
2153       }
2154
2155       cbuf.swizzle = swizzle_for_size(ir->type->vector_elements);
2156       if (glsl_base_type_is_64bit(cbuf.type))
2157          cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 8,
2158                                        const_offset % 16 / 8,
2159                                        const_offset % 16 / 8,
2160                                        const_offset % 16 / 8);
2161       else
2162          cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 4,
2163                                        const_offset % 16 / 4,
2164                                        const_offset % 16 / 4,
2165                                        const_offset % 16 / 4);
2166
2167       if (ir->type->base_type == GLSL_TYPE_BOOL) {
2168          emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
2169       } else {
2170          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
2171       }
2172       break;
2173    }
2174    case ir_triop_lrp:
2175       /* note: we have to reorder the three args here */
2176       emit_asm(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
2177       break;
2178    case ir_triop_csel:
2179       if (this->ctx->Const.NativeIntegers)
2180          emit_asm(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
2181       else {
2182          op[0].negate = ~op[0].negate;
2183          emit_asm(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
2184       }
2185       break;
2186    case ir_triop_bitfield_extract:
2187       emit_asm(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
2188       break;
2189    case ir_quadop_bitfield_insert:
2190       emit_asm(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
2191       break;
2192    case ir_unop_bitfield_reverse:
2193       emit_asm(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
2194       break;
2195    case ir_unop_bit_count:
2196       emit_asm(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
2197       break;
2198    case ir_unop_find_msb:
2199       emit_asm(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
2200       break;
2201    case ir_unop_find_lsb:
2202       emit_asm(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
2203       break;
2204    case ir_binop_imul_high:
2205       emit_asm(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
2206       break;
2207    case ir_triop_fma:
2208       /* In theory, MAD is incorrect here. */
2209       if (have_fma)
2210          emit_asm(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
2211       else
2212          emit_asm(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
2213       break;
2214    case ir_unop_interpolate_at_centroid:
2215       emit_asm(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
2216       break;
2217    case ir_binop_interpolate_at_offset: {
2218       /* The y coordinate needs to be flipped for the default fb */
2219       static const gl_state_index transform_y_state[STATE_LENGTH]
2220          = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
2221
2222       unsigned transform_y_index =
2223          _mesa_add_state_reference(this->prog->Parameters,
2224                                    transform_y_state);
2225
2226       st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
2227                                           transform_y_index,
2228                                           glsl_type::vec4_type);
2229       transform_y.swizzle = SWIZZLE_XXXX;
2230
2231       st_src_reg temp = get_temp(glsl_type::vec2_type);
2232       st_dst_reg temp_dst = st_dst_reg(temp);
2233
2234       emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[1]);
2235       temp_dst.writemask = WRITEMASK_Y;
2236       emit_asm(ir, TGSI_OPCODE_MUL, temp_dst, transform_y, op[1]);
2237       emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], temp);
2238       break;
2239    }
2240    case ir_binop_interpolate_at_sample:
2241       emit_asm(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
2242       break;
2243
2244    case ir_unop_d2f:
2245       emit_asm(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
2246       break;
2247    case ir_unop_f2d:
2248       emit_asm(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
2249       break;
2250    case ir_unop_d2i:
2251       emit_asm(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
2252       break;
2253    case ir_unop_i2d:
2254       emit_asm(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
2255       break;
2256    case ir_unop_d2u:
2257       emit_asm(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
2258       break;
2259    case ir_unop_u2d:
2260       emit_asm(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
2261       break;
2262    case ir_unop_unpack_double_2x32:
2263    case ir_unop_pack_double_2x32:
2264       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
2265       break;
2266
2267    case ir_binop_ldexp:
2268       if (ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE) {
2269          emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
2270       } else {
2271          assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
2272       }
2273       break;
2274
2275    case ir_unop_pack_half_2x16:
2276       emit_asm(ir, TGSI_OPCODE_PK2H, result_dst, op[0]);
2277       break;
2278    case ir_unop_unpack_half_2x16:
2279       emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
2280       break;
2281
2282    case ir_unop_get_buffer_size: {
2283       ir_constant *const_offset = ir->operands[0]->as_constant();
2284       st_src_reg buffer(
2285             PROGRAM_BUFFER,
2286             ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
2287             (const_offset ? const_offset->value.u[0] : 0),
2288             GLSL_TYPE_UINT);
2289       if (!const_offset) {
2290          buffer.reladdr = ralloc(mem_ctx, st_src_reg);
2291          *buffer.reladdr = op[0];
2292          emit_arl(ir, sampler_reladdr, op[0]);
2293       }
2294       emit_asm(ir, TGSI_OPCODE_RESQ, result_dst)->resource = buffer;
2295       break;
2296    }
2297
2298    case ir_unop_vote_any:
2299       emit_asm(ir, TGSI_OPCODE_VOTE_ANY, result_dst, op[0]);
2300       break;
2301    case ir_unop_vote_all:
2302       emit_asm(ir, TGSI_OPCODE_VOTE_ALL, result_dst, op[0]);
2303       break;
2304    case ir_unop_vote_eq:
2305       emit_asm(ir, TGSI_OPCODE_VOTE_EQ, result_dst, op[0]);
2306       break;
2307
2308    case ir_unop_pack_snorm_2x16:
2309    case ir_unop_pack_unorm_2x16:
2310    case ir_unop_pack_snorm_4x8:
2311    case ir_unop_pack_unorm_4x8:
2312
2313    case ir_unop_unpack_snorm_2x16:
2314    case ir_unop_unpack_unorm_2x16:
2315    case ir_unop_unpack_snorm_4x8:
2316    case ir_unop_unpack_unorm_4x8:
2317
2318    case ir_quadop_vector:
2319    case ir_binop_vector_extract:
2320    case ir_triop_vector_insert:
2321    case ir_binop_carry:
2322    case ir_binop_borrow:
2323    case ir_unop_ssbo_unsized_array_length:
2324       /* This operation is not supported, or should have already been handled.
2325        */
2326       assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
2327       break;
2328    }
2329
2330    this->result = result_src;
2331 }
2332
2333
2334 void
2335 glsl_to_tgsi_visitor::visit(ir_swizzle *ir)
2336 {
2337    st_src_reg src;
2338    int i;
2339    int swizzle[4];
2340
2341    /* Note that this is only swizzles in expressions, not those on the left
2342     * hand side of an assignment, which do write masking.  See ir_assignment
2343     * for that.
2344     */
2345
2346    ir->val->accept(this);
2347    src = this->result;
2348    assert(src.file != PROGRAM_UNDEFINED);
2349    assert(ir->type->vector_elements > 0);
2350
2351    for (i = 0; i < 4; i++) {
2352       if (i < ir->type->vector_elements) {
2353          switch (i) {
2354          case 0:
2355             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.x);
2356             break;
2357          case 1:
2358             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.y);
2359             break;
2360          case 2:
2361             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.z);
2362             break;
2363          case 3:
2364             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.w);
2365             break;
2366          }
2367       } else {
2368          /* If the type is smaller than a vec4, replicate the last
2369           * channel out.
2370           */
2371          swizzle[i] = swizzle[ir->type->vector_elements - 1];
2372       }
2373    }
2374
2375    src.swizzle = MAKE_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2376
2377    this->result = src;
2378 }
2379
2380 /* Test if the variable is an array. Note that geometry and
2381  * tessellation shader inputs are outputs are always arrays (except
2382  * for patch inputs), so only the array element type is considered.
2383  */
2384 static bool
2385 is_inout_array(unsigned stage, ir_variable *var, bool *remove_array)
2386 {
2387    const glsl_type *type = var->type;
2388
2389    *remove_array = false;
2390
2391    if ((stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
2392        (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out))
2393       return false;
2394
2395    if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) ||
2396         (stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) ||
2397         stage == MESA_SHADER_TESS_CTRL) &&
2398        !var->data.patch) {
2399       if (!var->type->is_array())
2400          return false; /* a system value probably */
2401
2402       type = var->type->fields.array;
2403       *remove_array = true;
2404    }
2405
2406    return type->is_array() || type->is_matrix();
2407 }
2408
2409 static unsigned
2410 st_translate_interp_loc(ir_variable *var)
2411 {
2412    if (var->data.centroid)
2413       return TGSI_INTERPOLATE_LOC_CENTROID;
2414    else if (var->data.sample)
2415       return TGSI_INTERPOLATE_LOC_SAMPLE;
2416    else
2417       return TGSI_INTERPOLATE_LOC_CENTER;
2418 }
2419
2420 void
2421 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
2422 {
2423    variable_storage *entry = find_variable_storage(ir->var);
2424    ir_variable *var = ir->var;
2425    bool remove_array;
2426
2427    if (!entry) {
2428       switch (var->data.mode) {
2429       case ir_var_uniform:
2430          entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
2431                                                var->data.param_index);
2432          this->variables.push_tail(entry);
2433          break;
2434       case ir_var_shader_in: {
2435          /* The linker assigns locations for varyings and attributes,
2436           * including deprecated builtins (like gl_Color), user-assign
2437           * generic attributes (glBindVertexLocation), and
2438           * user-defined varyings.
2439           */
2440          assert(var->data.location != -1);
2441
2442          const glsl_type *type_without_array = var->type->without_array();
2443          struct inout_decl *decl = &inputs[num_inputs];
2444          unsigned component = var->data.location_frac;
2445          unsigned num_components;
2446          num_inputs++;
2447
2448          if (type_without_array->is_64bit())
2449             component = component / 2;
2450          if (type_without_array->vector_elements)
2451             num_components = type_without_array->vector_elements;
2452          else
2453             num_components = 4;
2454
2455          decl->mesa_index = var->data.location;
2456          decl->interp = (glsl_interp_mode) var->data.interpolation;
2457          decl->interp_loc = st_translate_interp_loc(var);
2458          decl->base_type = type_without_array->base_type;
2459          decl->usage_mask = u_bit_consecutive(component, num_components);
2460
2461          if (is_inout_array(shader->Stage, var, &remove_array)) {
2462             decl->array_id = num_input_arrays + 1;
2463             num_input_arrays++;
2464          } else {
2465             decl->array_id = 0;
2466          }
2467
2468          if (remove_array)
2469             decl->size = type_size(var->type->fields.array);
2470          else
2471             decl->size = type_size(var->type);
2472
2473          entry = new(mem_ctx) variable_storage(var,
2474                                                PROGRAM_INPUT,
2475                                                decl->mesa_index,
2476                                                decl->array_id);
2477          entry->component = component;
2478
2479          this->variables.push_tail(entry);
2480          break;
2481       }
2482       case ir_var_shader_out: {
2483          assert(var->data.location != -1);
2484
2485          const glsl_type *type_without_array = var->type->without_array();
2486          struct inout_decl *decl = &outputs[num_outputs];
2487          unsigned component = var->data.location_frac;
2488          unsigned num_components;
2489          num_outputs++;
2490
2491          if (type_without_array->is_64bit())
2492             component = component / 2;
2493          if (type_without_array->vector_elements)
2494             num_components = type_without_array->vector_elements;
2495          else
2496             num_components = 4;
2497
2498          decl->mesa_index = var->data.location + FRAG_RESULT_MAX * var->data.index;
2499          decl->base_type = type_without_array->base_type;
2500          decl->usage_mask = u_bit_consecutive(component, num_components);
2501          if (var->data.stream & (1u << 31)) {
2502             decl->gs_out_streams = var->data.stream & ~(1u << 31);
2503          } else {
2504             assert(var->data.stream < 4);
2505             decl->gs_out_streams = 0;
2506             for (unsigned i = 0; i < num_components; ++i)
2507                decl->gs_out_streams |= var->data.stream << (2 * (component + i));
2508          }
2509
2510          if (is_inout_array(shader->Stage, var, &remove_array)) {
2511             decl->array_id = num_output_arrays + 1;
2512             num_output_arrays++;
2513          } else {
2514             decl->array_id = 0;
2515          }
2516
2517          if (remove_array)
2518             decl->size = type_size(var->type->fields.array);
2519          else
2520             decl->size = type_size(var->type);
2521
2522          entry = new(mem_ctx) variable_storage(var,
2523                                                PROGRAM_OUTPUT,
2524                                                decl->mesa_index,
2525                                                decl->array_id);
2526          entry->component = component;
2527
2528          this->variables.push_tail(entry);
2529          break;
2530       }
2531       case ir_var_system_value:
2532          entry = new(mem_ctx) variable_storage(var,
2533                                                PROGRAM_SYSTEM_VALUE,
2534                                                var->data.location);
2535          break;
2536       case ir_var_auto:
2537       case ir_var_temporary:
2538          st_src_reg src = get_temp(var->type);
2539
2540          entry = new(mem_ctx) variable_storage(var, src.file, src.index,
2541                                                src.array_id);
2542          this->variables.push_tail(entry);
2543
2544          break;
2545       }
2546
2547       if (!entry) {
2548          printf("Failed to make storage for %s\n", var->name);
2549          exit(1);
2550       }
2551    }
2552
2553    this->result = st_src_reg(entry->file, entry->index, var->type,
2554                              entry->component, entry->array_id);
2555    if (this->shader->Stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in && var->type->is_double())
2556       this->result.is_double_vertex_input = true;
2557    if (!native_integers)
2558       this->result.type = GLSL_TYPE_FLOAT;
2559 }
2560
2561 static void
2562 shrink_array_declarations(struct inout_decl *decls, unsigned count,
2563                           GLbitfield64* usage_mask,
2564                           GLbitfield64 double_usage_mask,
2565                           GLbitfield* patch_usage_mask)
2566 {
2567    unsigned i;
2568    int j;
2569
2570    /* Fix array declarations by removing unused array elements at both ends
2571     * of the arrays. For example, mat4[3] where only mat[1] is used.
2572     */
2573    for (i = 0; i < count; i++) {
2574       struct inout_decl *decl = &decls[i];
2575       if (!decl->array_id)
2576          continue;
2577
2578       /* Shrink the beginning. */
2579       for (j = 0; j < (int)decl->size; j++) {
2580          if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2581             if (*patch_usage_mask &
2582                 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2583                break;
2584          }
2585          else {
2586             if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2587                break;
2588             if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2589                break;
2590          }
2591
2592          decl->mesa_index++;
2593          decl->size--;
2594          j--;
2595       }
2596
2597       /* Shrink the end. */
2598       for (j = decl->size-1; j >= 0; j--) {
2599          if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2600             if (*patch_usage_mask &
2601                 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2602                break;
2603          }
2604          else {
2605             if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2606                break;
2607             if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2608                break;
2609          }
2610
2611          decl->size--;
2612       }
2613
2614       /* When not all entries of an array are accessed, we mark them as used
2615        * here anyway, to ensure that the input/output mapping logic doesn't get
2616        * confused.
2617        *
2618        * TODO This happens when an array isn't used via indirect access, which
2619        * some game ports do (at least eON-based). There is an optimization
2620        * opportunity here by replacing the array declaration with non-array
2621        * declarations of those slots that are actually used.
2622        */
2623       for (j = 1; j < (int)decl->size; ++j) {
2624          if (decl->mesa_index >= VARYING_SLOT_PATCH0)
2625             *patch_usage_mask |= BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j);
2626          else
2627             *usage_mask |= BITFIELD64_BIT(decl->mesa_index + j);
2628       }
2629    }
2630 }
2631
2632 void
2633 glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
2634 {
2635    ir_constant *index;
2636    st_src_reg src;
2637    int element_size = type_size(ir->type);
2638    bool is_2D = false;
2639
2640    index = ir->array_index->constant_expression_value();
2641
2642    ir->array->accept(this);
2643    src = this->result;
2644
2645    if (ir->array->ir_type != ir_type_dereference_array) {
2646       switch (this->prog->Target) {
2647       case GL_TESS_CONTROL_PROGRAM_NV:
2648          is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
2649                  !ir->variable_referenced()->data.patch;
2650          break;
2651       case GL_TESS_EVALUATION_PROGRAM_NV:
2652          is_2D = src.file == PROGRAM_INPUT &&
2653                  !ir->variable_referenced()->data.patch;
2654          break;
2655       case GL_GEOMETRY_PROGRAM_NV:
2656          is_2D = src.file == PROGRAM_INPUT;
2657          break;
2658       }
2659    }
2660
2661    if (is_2D)
2662       element_size = 1;
2663
2664    if (index) {
2665
2666       if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
2667           src.file == PROGRAM_INPUT)
2668          element_size = attrib_type_size(ir->type, true);
2669       if (is_2D) {
2670          src.index2D = index->value.i[0];
2671          src.has_index2 = true;
2672       } else
2673          src.index += index->value.i[0] * element_size;
2674    } else {
2675       /* Variable index array dereference.  It eats the "vec4" of the
2676        * base of the array and an index that offsets the TGSI register
2677        * index.
2678        */
2679       ir->array_index->accept(this);
2680
2681       st_src_reg index_reg;
2682
2683       if (element_size == 1) {
2684          index_reg = this->result;
2685       } else {
2686          index_reg = get_temp(native_integers ?
2687                               glsl_type::int_type : glsl_type::float_type);
2688
2689          emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
2690               this->result, st_src_reg_for_type(index_reg.type, element_size));
2691       }
2692
2693       /* If there was already a relative address register involved, add the
2694        * new and the old together to get the new offset.
2695        */
2696       if (!is_2D && src.reladdr != NULL) {
2697          st_src_reg accum_reg = get_temp(native_integers ?
2698                                 glsl_type::int_type : glsl_type::float_type);
2699
2700          emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
2701               index_reg, *src.reladdr);
2702
2703          index_reg = accum_reg;
2704       }
2705
2706       if (is_2D) {
2707          src.reladdr2 = ralloc(mem_ctx, st_src_reg);
2708          memcpy(src.reladdr2, &index_reg, sizeof(index_reg));
2709          src.index2D = 0;
2710          src.has_index2 = true;
2711       } else {
2712          src.reladdr = ralloc(mem_ctx, st_src_reg);
2713          memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2714       }
2715    }
2716
2717    /* Change the register type to the element type of the array. */
2718    src.type = ir->type->base_type;
2719
2720    this->result = src;
2721 }
2722
2723 void
2724 glsl_to_tgsi_visitor::visit(ir_dereference_record *ir)
2725 {
2726    unsigned int i;
2727    const glsl_type *struct_type = ir->record->type;
2728    int offset = 0;
2729
2730    ir->record->accept(this);
2731
2732    for (i = 0; i < struct_type->length; i++) {
2733       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2734          break;
2735       offset += type_size(struct_type->fields.structure[i].type);
2736    }
2737
2738    /* If the type is smaller than a vec4, replicate the last channel out. */
2739    if (ir->type->is_scalar() || ir->type->is_vector())
2740       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2741    else
2742       this->result.swizzle = SWIZZLE_NOOP;
2743
2744    this->result.index += offset;
2745    this->result.type = ir->type->base_type;
2746 }
2747
2748 /**
2749  * We want to be careful in assignment setup to hit the actual storage
2750  * instead of potentially using a temporary like we might with the
2751  * ir_dereference handler.
2752  */
2753 static st_dst_reg
2754 get_assignment_lhs(ir_dereference *ir, glsl_to_tgsi_visitor *v, int *component)
2755 {
2756    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2757     * access of a vector, it must be separated into a series conditional moves
2758     * before reaching this point (see ir_vec_index_to_cond_assign).
2759     */
2760    assert(ir->as_dereference());
2761    ir_dereference_array *deref_array = ir->as_dereference_array();
2762    if (deref_array) {
2763       assert(!deref_array->array->type->is_vector());
2764    }
2765
2766    /* Use the rvalue deref handler for the most part.  We write swizzles using
2767     * the writemask, but we do extract the base component for enhanced layouts
2768     * from the source swizzle.
2769     */
2770    ir->accept(v);
2771    *component = GET_SWZ(v->result.swizzle, 0);
2772    return st_dst_reg(v->result);
2773 }
2774
2775 /**
2776  * Process the condition of a conditional assignment
2777  *
2778  * Examines the condition of a conditional assignment to generate the optimal
2779  * first operand of a \c CMP instruction.  If the condition is a relational
2780  * operator with 0 (e.g., \c ir_binop_less), the value being compared will be
2781  * used as the source for the \c CMP instruction.  Otherwise the comparison
2782  * is processed to a boolean result, and the boolean result is used as the
2783  * operand to the CMP instruction.
2784  */
2785 bool
2786 glsl_to_tgsi_visitor::process_move_condition(ir_rvalue *ir)
2787 {
2788    ir_rvalue *src_ir = ir;
2789    bool negate = true;
2790    bool switch_order = false;
2791
2792    ir_expression *const expr = ir->as_expression();
2793
2794    if (native_integers) {
2795       if ((expr != NULL) && (expr->get_num_operands() == 2)) {
2796          enum glsl_base_type type = expr->operands[0]->type->base_type;
2797          if (type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT ||
2798              type == GLSL_TYPE_BOOL) {
2799             if (expr->operation == ir_binop_equal) {
2800                if (expr->operands[0]->is_zero()) {
2801                   src_ir = expr->operands[1];
2802                   switch_order = true;
2803                }
2804                else if (expr->operands[1]->is_zero()) {
2805                   src_ir = expr->operands[0];
2806                   switch_order = true;
2807                }
2808             }
2809             else if (expr->operation == ir_binop_nequal) {
2810                if (expr->operands[0]->is_zero()) {
2811                   src_ir = expr->operands[1];
2812                }
2813                else if (expr->operands[1]->is_zero()) {
2814                   src_ir = expr->operands[0];
2815                }
2816             }
2817          }
2818       }
2819
2820       src_ir->accept(this);
2821       return switch_order;
2822    }
2823
2824    if ((expr != NULL) && (expr->get_num_operands() == 2)) {
2825       bool zero_on_left = false;
2826
2827       if (expr->operands[0]->is_zero()) {
2828          src_ir = expr->operands[1];
2829          zero_on_left = true;
2830       } else if (expr->operands[1]->is_zero()) {
2831          src_ir = expr->operands[0];
2832          zero_on_left = false;
2833       }
2834
2835       /*      a is -  0  +            -  0  +
2836        * (a <  0)  T  F  F  ( a < 0)  T  F  F
2837        * (0 <  a)  F  F  T  (-a < 0)  F  F  T
2838        * (a <= 0)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
2839        * (0 <= a)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
2840        * (a >  0)  F  F  T  (-a < 0)  F  F  T
2841        * (0 >  a)  T  F  F  ( a < 0)  T  F  F
2842        * (a >= 0)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
2843        * (0 >= a)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
2844        *
2845        * Note that exchanging the order of 0 and 'a' in the comparison simply
2846        * means that the value of 'a' should be negated.
2847        */
2848       if (src_ir != ir) {
2849          switch (expr->operation) {
2850          case ir_binop_less:
2851             switch_order = false;
2852             negate = zero_on_left;
2853             break;
2854
2855          case ir_binop_greater:
2856             switch_order = false;
2857             negate = !zero_on_left;
2858             break;
2859
2860          case ir_binop_lequal:
2861             switch_order = true;
2862             negate = !zero_on_left;
2863             break;
2864
2865          case ir_binop_gequal:
2866             switch_order = true;
2867             negate = zero_on_left;
2868             break;
2869
2870          default:
2871             /* This isn't the right kind of comparison afterall, so make sure
2872              * the whole condition is visited.
2873              */
2874             src_ir = ir;
2875             break;
2876          }
2877       }
2878    }
2879
2880    src_ir->accept(this);
2881
2882    /* We use the TGSI_OPCODE_CMP (a < 0 ? b : c) for conditional moves, and the
2883     * condition we produced is 0.0 or 1.0.  By flipping the sign, we can
2884     * choose which value TGSI_OPCODE_CMP produces without an extra instruction
2885     * computing the condition.
2886     */
2887    if (negate)
2888       this->result.negate = ~this->result.negate;
2889
2890    return switch_order;
2891 }
2892
2893 void
2894 glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
2895                                      st_dst_reg *l, st_src_reg *r,
2896                                      st_src_reg *cond, bool cond_swap)
2897 {
2898    if (type->base_type == GLSL_TYPE_STRUCT) {
2899       for (unsigned int i = 0; i < type->length; i++) {
2900          emit_block_mov(ir, type->fields.structure[i].type, l, r,
2901                         cond, cond_swap);
2902       }
2903       return;
2904    }
2905
2906    if (type->is_array()) {
2907       for (unsigned int i = 0; i < type->length; i++) {
2908          emit_block_mov(ir, type->fields.array, l, r, cond, cond_swap);
2909       }
2910       return;
2911    }
2912
2913    if (type->is_matrix()) {
2914       const struct glsl_type *vec_type;
2915
2916       vec_type = glsl_type::get_instance(type->is_double() ? GLSL_TYPE_DOUBLE : GLSL_TYPE_FLOAT,
2917                                          type->vector_elements, 1);
2918
2919       for (int i = 0; i < type->matrix_columns; i++) {
2920          emit_block_mov(ir, vec_type, l, r, cond, cond_swap);
2921       }
2922       return;
2923    }
2924
2925    assert(type->is_scalar() || type->is_vector());
2926
2927    l->type = type->base_type;
2928    r->type = type->base_type;
2929    if (cond) {
2930       st_src_reg l_src = st_src_reg(*l);
2931       l_src.swizzle = swizzle_for_size(type->vector_elements);
2932
2933       if (native_integers) {
2934          emit_asm(ir, TGSI_OPCODE_UCMP, *l, *cond,
2935               cond_swap ? l_src : *r,
2936               cond_swap ? *r : l_src);
2937       } else {
2938          emit_asm(ir, TGSI_OPCODE_CMP, *l, *cond,
2939               cond_swap ? l_src : *r,
2940               cond_swap ? *r : l_src);
2941       }
2942    } else {
2943       emit_asm(ir, TGSI_OPCODE_MOV, *l, *r);
2944    }
2945    l->index++;
2946    r->index++;
2947    if (type->is_dual_slot()) {
2948       l->index++;
2949       if (r->is_double_vertex_input == false)
2950          r->index++;
2951    }
2952 }
2953
2954 void
2955 glsl_to_tgsi_visitor::visit(ir_assignment *ir)
2956 {
2957    int dst_component;
2958    st_dst_reg l;
2959    st_src_reg r;
2960
2961    ir->rhs->accept(this);
2962    r = this->result;
2963
2964    l = get_assignment_lhs(ir->lhs, this, &dst_component);
2965
2966    {
2967       int swizzles[4];
2968       int first_enabled_chan = 0;
2969       int rhs_chan = 0;
2970       ir_variable *variable = ir->lhs->variable_referenced();
2971
2972       if (shader->Stage == MESA_SHADER_FRAGMENT &&
2973           variable->data.mode == ir_var_shader_out &&
2974           (variable->data.location == FRAG_RESULT_DEPTH ||
2975            variable->data.location == FRAG_RESULT_STENCIL)) {
2976          assert(ir->lhs->type->is_scalar());
2977          assert(ir->write_mask == WRITEMASK_X);
2978
2979          if (variable->data.location == FRAG_RESULT_DEPTH)
2980             l.writemask = WRITEMASK_Z;
2981          else {
2982             assert(variable->data.location == FRAG_RESULT_STENCIL);
2983             l.writemask = WRITEMASK_Y;
2984          }
2985       } else if (ir->write_mask == 0) {
2986          assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
2987
2988          unsigned num_elements = ir->lhs->type->without_array()->vector_elements;
2989
2990          if (num_elements) {
2991             l.writemask = u_bit_consecutive(0, num_elements);
2992          } else {
2993             /* The type is a struct or an array of (array of) structs. */
2994             l.writemask = WRITEMASK_XYZW;
2995          }
2996       } else {
2997          l.writemask = ir->write_mask;
2998       }
2999
3000       for (int i = 0; i < 4; i++) {
3001          if (l.writemask & (1 << i)) {
3002             first_enabled_chan = GET_SWZ(r.swizzle, i);
3003             break;
3004          }
3005       }
3006
3007       l.writemask = l.writemask << dst_component;
3008
3009       /* Swizzle a small RHS vector into the channels being written.
3010        *
3011        * glsl ir treats write_mask as dictating how many channels are
3012        * present on the RHS while TGSI treats write_mask as just
3013        * showing which channels of the vec4 RHS get written.
3014        */
3015       for (int i = 0; i < 4; i++) {
3016          if (l.writemask & (1 << i))
3017             swizzles[i] = GET_SWZ(r.swizzle, rhs_chan++);
3018          else
3019             swizzles[i] = first_enabled_chan;
3020       }
3021       r.swizzle = MAKE_SWIZZLE4(swizzles[0], swizzles[1],
3022                                 swizzles[2], swizzles[3]);
3023    }
3024
3025    assert(l.file != PROGRAM_UNDEFINED);
3026    assert(r.file != PROGRAM_UNDEFINED);
3027
3028    if (ir->condition) {
3029       const bool switch_order = this->process_move_condition(ir->condition);
3030       st_src_reg condition = this->result;
3031
3032       emit_block_mov(ir, ir->lhs->type, &l, &r, &condition, switch_order);
3033    } else if (ir->rhs->as_expression() &&
3034               this->instructions.get_tail() &&
3035               ir->rhs == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->ir &&
3036               !((glsl_to_tgsi_instruction *)this->instructions.get_tail())->is_64bit_expanded &&
3037               type_size(ir->lhs->type) == 1 &&
3038               l.writemask == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->dst[0].writemask) {
3039       /* To avoid emitting an extra MOV when assigning an expression to a
3040        * variable, emit the last instruction of the expression again, but
3041        * replace the destination register with the target of the assignment.
3042        * Dead code elimination will remove the original instruction.
3043        */
3044       glsl_to_tgsi_instruction *inst, *new_inst;
3045       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
3046       new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2], inst->src[3]);
3047       new_inst->saturate = inst->saturate;
3048       inst->dead_mask = inst->dst[0].writemask;
3049    } else {
3050       emit_block_mov(ir, ir->rhs->type, &l, &r, NULL, false);
3051    }
3052 }
3053
3054
3055 void
3056 glsl_to_tgsi_visitor::visit(ir_constant *ir)
3057 {
3058    st_src_reg src;
3059    GLdouble stack_vals[4] = { 0 };
3060    gl_constant_value *values = (gl_constant_value *) stack_vals;
3061    GLenum gl_type = GL_NONE;
3062    unsigned int i;
3063    static int in_array = 0;
3064    gl_register_file file = in_array ? PROGRAM_CONSTANT : PROGRAM_IMMEDIATE;
3065
3066    /* Unfortunately, 4 floats is all we can get into
3067     * _mesa_add_typed_unnamed_constant.  So, make a temp to store an
3068     * aggregate constant and move each constant value into it.  If we
3069     * get lucky, copy propagation will eliminate the extra moves.
3070     */
3071    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
3072       st_src_reg temp_base = get_temp(ir->type);
3073       st_dst_reg temp = st_dst_reg(temp_base);
3074
3075       foreach_in_list(ir_constant, field_value, &ir->components) {
3076          int size = type_size(field_value->type);
3077
3078          assert(size > 0);
3079
3080          field_value->accept(this);
3081          src = this->result;
3082
3083          for (i = 0; i < (unsigned int)size; i++) {
3084             emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
3085
3086             src.index++;
3087             temp.index++;
3088          }
3089       }
3090       this->result = temp_base;
3091       return;
3092    }
3093
3094    if (ir->type->is_array()) {
3095       st_src_reg temp_base = get_temp(ir->type);
3096       st_dst_reg temp = st_dst_reg(temp_base);
3097       int size = type_size(ir->type->fields.array);
3098
3099       assert(size > 0);
3100       in_array++;
3101
3102       for (i = 0; i < ir->type->length; i++) {
3103          ir->array_elements[i]->accept(this);
3104          src = this->result;
3105          for (int j = 0; j < size; j++) {
3106             emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
3107
3108             src.index++;
3109             temp.index++;
3110          }
3111       }
3112       this->result = temp_base;
3113       in_array--;
3114       return;
3115    }
3116
3117    if (ir->type->is_matrix()) {
3118       st_src_reg mat = get_temp(ir->type);
3119       st_dst_reg mat_column = st_dst_reg(mat);
3120
3121       for (i = 0; i < ir->type->matrix_columns; i++) {
3122          switch (ir->type->base_type) {
3123          case GLSL_TYPE_FLOAT:
3124             values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
3125
3126             src = st_src_reg(file, -1, ir->type->base_type);
3127             src.index = add_constant(file,
3128                                      values,
3129                                      ir->type->vector_elements,
3130                                      GL_FLOAT,
3131                                      &src.swizzle);
3132             emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3133             break;
3134          case GLSL_TYPE_DOUBLE:
3135             values = (gl_constant_value *) &ir->value.d[i * ir->type->vector_elements];
3136             src = st_src_reg(file, -1, ir->type->base_type);
3137             src.index = add_constant(file,
3138                                      values,
3139                                      ir->type->vector_elements,
3140                                      GL_DOUBLE,
3141                                      &src.swizzle);
3142             if (ir->type->vector_elements >= 2) {
3143                mat_column.writemask = WRITEMASK_XY;
3144                src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
3145                emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3146             } else {
3147                mat_column.writemask = WRITEMASK_X;
3148                src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
3149                emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3150             }
3151             src.index++;
3152             if (ir->type->vector_elements > 2) {
3153                if (ir->type->vector_elements == 4) {
3154                   mat_column.writemask = WRITEMASK_ZW;
3155                   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
3156                   emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3157                } else {
3158                   mat_column.writemask = WRITEMASK_Z;
3159                   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y);
3160                   emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3161                   mat_column.writemask = WRITEMASK_XYZW;
3162                   src.swizzle = SWIZZLE_XYZW;
3163                }
3164                mat_column.index++;
3165             }
3166             break;
3167          default:
3168             unreachable("Illegal matrix constant type.\n");
3169             break;
3170          }
3171          mat_column.index++;
3172       }
3173       this->result = mat;
3174       return;
3175    }
3176
3177    switch (ir->type->base_type) {
3178    case GLSL_TYPE_FLOAT:
3179       gl_type = GL_FLOAT;
3180       for (i = 0; i < ir->type->vector_elements; i++) {
3181          values[i].f = ir->value.f[i];
3182       }
3183       break;
3184    case GLSL_TYPE_DOUBLE:
3185       gl_type = GL_DOUBLE;
3186       for (i = 0; i < ir->type->vector_elements; i++) {
3187          memcpy(&values[i * 2], &ir->value.d[i], sizeof(double));
3188       }
3189       break;
3190    case GLSL_TYPE_UINT:
3191       gl_type = native_integers ? GL_UNSIGNED_INT : GL_FLOAT;
3192       for (i = 0; i < ir->type->vector_elements; i++) {
3193          if (native_integers)
3194             values[i].u = ir->value.u[i];
3195          else
3196             values[i].f = ir->value.u[i];
3197       }
3198       break;
3199    case GLSL_TYPE_INT:
3200       gl_type = native_integers ? GL_INT : GL_FLOAT;
3201       for (i = 0; i < ir->type->vector_elements; i++) {
3202          if (native_integers)
3203             values[i].i = ir->value.i[i];
3204          else
3205             values[i].f = ir->value.i[i];
3206       }
3207       break;
3208    case GLSL_TYPE_BOOL:
3209       gl_type = native_integers ? GL_BOOL : GL_FLOAT;
3210       for (i = 0; i < ir->type->vector_elements; i++) {
3211          values[i].u = ir->value.b[i] ? ctx->Const.UniformBooleanTrue : 0;
3212       }
3213       break;
3214    default:
3215       assert(!"Non-float/uint/int/bool constant");
3216    }
3217
3218    this->result = st_src_reg(file, -1, ir->type);
3219    this->result.index = add_constant(file,
3220                                      values,
3221                                      ir->type->vector_elements,
3222                                      gl_type,
3223                                      &this->result.swizzle);
3224 }
3225
3226 void
3227 glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
3228 {
3229    exec_node *param = ir->actual_parameters.get_head();
3230    ir_dereference *deref = static_cast<ir_dereference *>(param);
3231    ir_variable *location = deref->variable_referenced();
3232
3233    st_src_reg buffer(
3234          PROGRAM_BUFFER, location->data.binding, GLSL_TYPE_ATOMIC_UINT);
3235
3236    /* Calculate the surface offset */
3237    st_src_reg offset;
3238    unsigned array_size = 0, base = 0;
3239    uint16_t index = 0;
3240
3241    get_deref_offsets(deref, &array_size, &base, &index, &offset, false);
3242
3243    if (offset.file != PROGRAM_UNDEFINED) {
3244       emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
3245                offset, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
3246       emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(offset),
3247                offset, st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE));
3248    } else {
3249       offset = st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE);
3250    }
3251
3252    ir->return_deref->accept(this);
3253    st_dst_reg dst(this->result);
3254    dst.writemask = WRITEMASK_X;
3255
3256    glsl_to_tgsi_instruction *inst;
3257
3258    if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_read) {
3259       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, offset);
3260    } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_increment) {
3261       inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
3262                       st_src_reg_for_int(1));
3263    } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_predecrement) {
3264       inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
3265                       st_src_reg_for_int(-1));
3266       emit_asm(ir, TGSI_OPCODE_ADD, dst, this->result, st_src_reg_for_int(-1));
3267    } else {
3268       param = param->get_next();
3269       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3270       val->accept(this);
3271
3272       st_src_reg data = this->result, data2 = undef_src;
3273       unsigned opcode;
3274       switch (ir->callee->intrinsic_id) {
3275       case ir_intrinsic_atomic_counter_add:
3276          opcode = TGSI_OPCODE_ATOMUADD;
3277          break;
3278       case ir_intrinsic_atomic_counter_min:
3279          opcode = TGSI_OPCODE_ATOMIMIN;
3280          break;
3281       case ir_intrinsic_atomic_counter_max:
3282          opcode = TGSI_OPCODE_ATOMIMAX;
3283          break;
3284       case ir_intrinsic_atomic_counter_and:
3285          opcode = TGSI_OPCODE_ATOMAND;
3286          break;
3287       case ir_intrinsic_atomic_counter_or:
3288          opcode = TGSI_OPCODE_ATOMOR;
3289          break;
3290       case ir_intrinsic_atomic_counter_xor:
3291          opcode = TGSI_OPCODE_ATOMXOR;
3292          break;
3293       case ir_intrinsic_atomic_counter_exchange:
3294          opcode = TGSI_OPCODE_ATOMXCHG;
3295          break;
3296       case ir_intrinsic_atomic_counter_comp_swap: {
3297          opcode = TGSI_OPCODE_ATOMCAS;
3298          param = param->get_next();
3299          val = ((ir_instruction *)param)->as_rvalue();
3300          val->accept(this);
3301          data2 = this->result;
3302          break;
3303       }
3304       default:
3305          assert(!"Unexpected intrinsic");
3306          return;
3307       }
3308
3309       inst = emit_asm(ir, opcode, dst, offset, data, data2);
3310    }
3311
3312    inst->resource = buffer;
3313 }
3314
3315 void
3316 glsl_to_tgsi_visitor::visit_ssbo_intrinsic(ir_call *ir)
3317 {
3318    exec_node *param = ir->actual_parameters.get_head();
3319
3320    ir_rvalue *block = ((ir_instruction *)param)->as_rvalue();
3321
3322    param = param->get_next();
3323    ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
3324
3325    ir_constant *const_block = block->as_constant();
3326
3327    st_src_reg buffer(
3328          PROGRAM_BUFFER,
3329          ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
3330          (const_block ? const_block->value.u[0] : 0),
3331          GLSL_TYPE_UINT);
3332
3333    if (!const_block) {
3334       block->accept(this);
3335       buffer.reladdr = ralloc(mem_ctx, st_src_reg);
3336       *buffer.reladdr = this->result;
3337       emit_arl(ir, sampler_reladdr, this->result);
3338    }
3339
3340    /* Calculate the surface offset */
3341    offset->accept(this);
3342    st_src_reg off = this->result;
3343
3344    st_dst_reg dst = undef_dst;
3345    if (ir->return_deref) {
3346       ir->return_deref->accept(this);
3347       dst = st_dst_reg(this->result);
3348       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3349    }
3350
3351    glsl_to_tgsi_instruction *inst;
3352
3353    if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_load) {
3354       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
3355       if (dst.type == GLSL_TYPE_BOOL)
3356          emit_asm(ir, TGSI_OPCODE_USNE, dst, st_src_reg(dst), st_src_reg_for_int(0));
3357    } else if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_store) {
3358       param = param->get_next();
3359       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3360       val->accept(this);
3361
3362       param = param->get_next();
3363       ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
3364       assert(write_mask);
3365       dst.writemask = write_mask->value.u[0];
3366
3367       dst.type = this->result.type;
3368       inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
3369    } else {
3370       param = param->get_next();
3371       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3372       val->accept(this);
3373
3374       st_src_reg data = this->result, data2 = undef_src;
3375       unsigned opcode;
3376       switch (ir->callee->intrinsic_id) {
3377       case ir_intrinsic_ssbo_atomic_add:
3378          opcode = TGSI_OPCODE_ATOMUADD;
3379          break;
3380       case ir_intrinsic_ssbo_atomic_min:
3381          opcode = TGSI_OPCODE_ATOMIMIN;
3382          break;
3383       case ir_intrinsic_ssbo_atomic_max:
3384          opcode = TGSI_OPCODE_ATOMIMAX;
3385          break;
3386       case ir_intrinsic_ssbo_atomic_and:
3387          opcode = TGSI_OPCODE_ATOMAND;
3388          break;
3389       case ir_intrinsic_ssbo_atomic_or:
3390          opcode = TGSI_OPCODE_ATOMOR;
3391          break;
3392       case ir_intrinsic_ssbo_atomic_xor:
3393          opcode = TGSI_OPCODE_ATOMXOR;
3394          break;
3395       case ir_intrinsic_ssbo_atomic_exchange:
3396          opcode = TGSI_OPCODE_ATOMXCHG;
3397          break;
3398       case ir_intrinsic_ssbo_atomic_comp_swap:
3399          opcode = TGSI_OPCODE_ATOMCAS;
3400          param = param->get_next();
3401          val = ((ir_instruction *)param)->as_rvalue();
3402          val->accept(this);
3403          data2 = this->result;
3404          break;
3405       default:
3406          assert(!"Unexpected intrinsic");
3407          return;
3408       }
3409
3410       inst = emit_asm(ir, opcode, dst, off, data, data2);
3411    }
3412
3413    param = param->get_next();
3414    ir_constant *access = NULL;
3415    if (!param->is_tail_sentinel()) {
3416       access = ((ir_instruction *)param)->as_constant();
3417       assert(access);
3418    }
3419
3420    /* The emit_asm() might have actually split the op into pieces, e.g. for
3421     * double stores. We have to go back and fix up all the generated ops.
3422     */
3423    unsigned op = inst->op;
3424    do {
3425       inst->resource = buffer;
3426       if (access)
3427          inst->buffer_access = access->value.u[0];
3428       inst = (glsl_to_tgsi_instruction *)inst->get_prev();
3429       if (inst->op == TGSI_OPCODE_UADD)
3430          inst = (glsl_to_tgsi_instruction *)inst->get_prev();
3431    } while (inst && inst->op == op && inst->resource.file == PROGRAM_UNDEFINED);
3432 }
3433
3434 void
3435 glsl_to_tgsi_visitor::visit_membar_intrinsic(ir_call *ir)
3436 {
3437    switch (ir->callee->intrinsic_id) {
3438    case ir_intrinsic_memory_barrier:
3439       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3440                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
3441                                   TGSI_MEMBAR_ATOMIC_BUFFER |
3442                                   TGSI_MEMBAR_SHADER_IMAGE |
3443                                   TGSI_MEMBAR_SHARED));
3444       break;
3445    case ir_intrinsic_memory_barrier_atomic_counter:
3446       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3447                st_src_reg_for_int(TGSI_MEMBAR_ATOMIC_BUFFER));
3448       break;
3449    case ir_intrinsic_memory_barrier_buffer:
3450       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3451                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER));
3452       break;
3453    case ir_intrinsic_memory_barrier_image:
3454       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3455                st_src_reg_for_int(TGSI_MEMBAR_SHADER_IMAGE));
3456       break;
3457    case ir_intrinsic_memory_barrier_shared:
3458       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3459                st_src_reg_for_int(TGSI_MEMBAR_SHARED));
3460       break;
3461    case ir_intrinsic_group_memory_barrier:
3462       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3463                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
3464                                   TGSI_MEMBAR_ATOMIC_BUFFER |
3465                                   TGSI_MEMBAR_SHADER_IMAGE |
3466                                   TGSI_MEMBAR_SHARED |
3467                                   TGSI_MEMBAR_THREAD_GROUP));
3468       break;
3469    default:
3470       assert(!"Unexpected memory barrier intrinsic");
3471    }
3472 }
3473
3474 void
3475 glsl_to_tgsi_visitor::visit_shared_intrinsic(ir_call *ir)
3476 {
3477    exec_node *param = ir->actual_parameters.get_head();
3478
3479    ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
3480
3481    st_src_reg buffer(PROGRAM_MEMORY, 0, GLSL_TYPE_UINT);
3482
3483    /* Calculate the surface offset */
3484    offset->accept(this);
3485    st_src_reg off = this->result;
3486
3487    st_dst_reg dst = undef_dst;
3488    if (ir->return_deref) {
3489       ir->return_deref->accept(this);
3490       dst = st_dst_reg(this->result);
3491       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3492    }
3493
3494    glsl_to_tgsi_instruction *inst;
3495
3496    if (ir->callee->intrinsic_id == ir_intrinsic_shared_load) {
3497       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
3498       inst->resource = buffer;
3499    } else if (ir->callee->intrinsic_id == ir_intrinsic_shared_store) {
3500       param = param->get_next();
3501       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3502       val->accept(this);
3503
3504       param = param->get_next();
3505       ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
3506       assert(write_mask);
3507       dst.writemask = write_mask->value.u[0];
3508
3509       dst.type = this->result.type;
3510       inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
3511       inst->resource = buffer;
3512    } else {
3513       param = param->get_next();
3514       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3515       val->accept(this);
3516
3517       st_src_reg data = this->result, data2 = undef_src;
3518       unsigned opcode;
3519       switch (ir->callee->intrinsic_id) {
3520       case ir_intrinsic_shared_atomic_add:
3521          opcode = TGSI_OPCODE_ATOMUADD;
3522          break;
3523       case ir_intrinsic_shared_atomic_min:
3524          opcode = TGSI_OPCODE_ATOMIMIN;
3525          break;
3526       case ir_intrinsic_shared_atomic_max:
3527          opcode = TGSI_OPCODE_ATOMIMAX;
3528          break;
3529       case ir_intrinsic_shared_atomic_and:
3530          opcode = TGSI_OPCODE_ATOMAND;
3531          break;
3532       case ir_intrinsic_shared_atomic_or:
3533          opcode = TGSI_OPCODE_ATOMOR;
3534          break;
3535       case ir_intrinsic_shared_atomic_xor:
3536          opcode = TGSI_OPCODE_ATOMXOR;
3537          break;
3538       case ir_intrinsic_shared_atomic_exchange:
3539          opcode = TGSI_OPCODE_ATOMXCHG;
3540          break;
3541       case ir_intrinsic_shared_atomic_comp_swap:
3542          opcode = TGSI_OPCODE_ATOMCAS;
3543          param = param->get_next();
3544          val = ((ir_instruction *)param)->as_rvalue();
3545          val->accept(this);
3546          data2 = this->result;
3547          break;
3548       default:
3549          assert(!"Unexpected intrinsic");
3550          return;
3551       }
3552
3553       inst = emit_asm(ir, opcode, dst, off, data, data2);
3554       inst->resource = buffer;
3555    }
3556 }
3557
3558 void
3559 glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
3560 {
3561    exec_node *param = ir->actual_parameters.get_head();
3562
3563    ir_dereference *img = (ir_dereference *)param;
3564    const ir_variable *imgvar = img->variable_referenced();
3565    const glsl_type *type = imgvar->type->without_array();
3566    unsigned sampler_array_size = 1, sampler_base = 0;
3567
3568    st_src_reg reladdr;
3569    st_src_reg image(PROGRAM_IMAGE, 0, GLSL_TYPE_UINT);
3570
3571    get_deref_offsets(img, &sampler_array_size, &sampler_base,
3572                      (uint16_t*)&image.index, &reladdr, true);
3573
3574    if (reladdr.file != PROGRAM_UNDEFINED) {
3575       image.reladdr = ralloc(mem_ctx, st_src_reg);
3576       *image.reladdr = reladdr;
3577       emit_arl(ir, sampler_reladdr, reladdr);
3578    }
3579
3580    st_dst_reg dst = undef_dst;
3581    if (ir->return_deref) {
3582       ir->return_deref->accept(this);
3583       dst = st_dst_reg(this->result);
3584       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3585    }
3586
3587    glsl_to_tgsi_instruction *inst;
3588
3589    if (ir->callee->intrinsic_id == ir_intrinsic_image_size) {
3590       dst.writemask = WRITEMASK_XYZ;
3591       inst = emit_asm(ir, TGSI_OPCODE_RESQ, dst);
3592    } else if (ir->callee->intrinsic_id == ir_intrinsic_image_samples) {
3593       st_src_reg res = get_temp(glsl_type::ivec4_type);
3594       st_dst_reg dstres = st_dst_reg(res);
3595       dstres.writemask = WRITEMASK_W;
3596       inst = emit_asm(ir, TGSI_OPCODE_RESQ, dstres);
3597       res.swizzle = SWIZZLE_WWWW;
3598       emit_asm(ir, TGSI_OPCODE_MOV, dst, res);
3599    } else {
3600       st_src_reg arg1 = undef_src, arg2 = undef_src;
3601       st_src_reg coord;
3602       st_dst_reg coord_dst;
3603       coord = get_temp(glsl_type::ivec4_type);
3604       coord_dst = st_dst_reg(coord);
3605       coord_dst.writemask = (1 << type->coordinate_components()) - 1;
3606       param = param->get_next();
3607       ((ir_dereference *)param)->accept(this);
3608       emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
3609       coord.swizzle = SWIZZLE_XXXX;
3610       switch (type->coordinate_components()) {
3611       case 4: assert(!"unexpected coord count");
3612       /* fallthrough */
3613       case 3: coord.swizzle |= SWIZZLE_Z << 6;
3614       /* fallthrough */
3615       case 2: coord.swizzle |= SWIZZLE_Y << 3;
3616       }
3617
3618       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS) {
3619          param = param->get_next();
3620          ((ir_dereference *)param)->accept(this);
3621          st_src_reg sample = this->result;
3622          sample.swizzle = SWIZZLE_XXXX;
3623          coord_dst.writemask = WRITEMASK_W;
3624          emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample);
3625          coord.swizzle |= SWIZZLE_W << 9;
3626       }
3627
3628       param = param->get_next();
3629       if (!param->is_tail_sentinel()) {
3630          ((ir_dereference *)param)->accept(this);
3631          arg1 = this->result;
3632          param = param->get_next();
3633       }
3634
3635       if (!param->is_tail_sentinel()) {
3636          ((ir_dereference *)param)->accept(this);
3637          arg2 = this->result;
3638          param = param->get_next();
3639       }
3640
3641       assert(param->is_tail_sentinel());
3642
3643       unsigned opcode;
3644       switch (ir->callee->intrinsic_id) {
3645       case ir_intrinsic_image_load:
3646          opcode = TGSI_OPCODE_LOAD;
3647          break;
3648       case ir_intrinsic_image_store:
3649          opcode = TGSI_OPCODE_STORE;
3650          break;
3651       case ir_intrinsic_image_atomic_add:
3652          opcode = TGSI_OPCODE_ATOMUADD;
3653          break;
3654       case ir_intrinsic_image_atomic_min:
3655          opcode = TGSI_OPCODE_ATOMIMIN;
3656          break;
3657       case ir_intrinsic_image_atomic_max:
3658          opcode = TGSI_OPCODE_ATOMIMAX;
3659          break;
3660       case ir_intrinsic_image_atomic_and:
3661          opcode = TGSI_OPCODE_ATOMAND;
3662          break;
3663       case ir_intrinsic_image_atomic_or:
3664          opcode = TGSI_OPCODE_ATOMOR;
3665          break;
3666       case ir_intrinsic_image_atomic_xor:
3667          opcode = TGSI_OPCODE_ATOMXOR;
3668          break;
3669       case ir_intrinsic_image_atomic_exchange:
3670          opcode = TGSI_OPCODE_ATOMXCHG;
3671          break;
3672       case ir_intrinsic_image_atomic_comp_swap:
3673          opcode = TGSI_OPCODE_ATOMCAS;
3674          break;
3675       default:
3676          assert(!"Unexpected intrinsic");
3677          return;
3678       }
3679
3680       inst = emit_asm(ir, opcode, dst, coord, arg1, arg2);
3681       if (opcode == TGSI_OPCODE_STORE)
3682          inst->dst[0].writemask = WRITEMASK_XYZW;
3683    }
3684
3685    inst->resource = image;
3686    inst->sampler_array_size = sampler_array_size;
3687    inst->sampler_base = sampler_base;
3688
3689    switch (type->sampler_dimensionality) {
3690    case GLSL_SAMPLER_DIM_1D:
3691       inst->tex_target = (type->sampler_array)
3692          ? TEXTURE_1D_ARRAY_INDEX : TEXTURE_1D_INDEX;
3693       break;
3694    case GLSL_SAMPLER_DIM_2D:
3695       inst->tex_target = (type->sampler_array)
3696          ? TEXTURE_2D_ARRAY_INDEX : TEXTURE_2D_INDEX;
3697       break;
3698    case GLSL_SAMPLER_DIM_3D:
3699       inst->tex_target = TEXTURE_3D_INDEX;
3700       break;
3701    case GLSL_SAMPLER_DIM_CUBE:
3702       inst->tex_target = (type->sampler_array)
3703          ? TEXTURE_CUBE_ARRAY_INDEX : TEXTURE_CUBE_INDEX;
3704       break;
3705    case GLSL_SAMPLER_DIM_RECT:
3706       inst->tex_target = TEXTURE_RECT_INDEX;
3707       break;
3708    case GLSL_SAMPLER_DIM_BUF:
3709       inst->tex_target = TEXTURE_BUFFER_INDEX;
3710       break;
3711    case GLSL_SAMPLER_DIM_EXTERNAL:
3712       inst->tex_target = TEXTURE_EXTERNAL_INDEX;
3713       break;
3714    case GLSL_SAMPLER_DIM_MS:
3715       inst->tex_target = (type->sampler_array)
3716          ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX : TEXTURE_2D_MULTISAMPLE_INDEX;
3717       break;
3718    default:
3719       assert(!"Should not get here.");
3720    }
3721
3722    inst->image_format = st_mesa_format_to_pipe_format(st_context(ctx),
3723          _mesa_get_shader_image_format(imgvar->data.image_format));
3724
3725    if (imgvar->data.image_coherent)
3726       inst->buffer_access |= TGSI_MEMORY_COHERENT;
3727    if (imgvar->data.image_restrict)
3728       inst->buffer_access |= TGSI_MEMORY_RESTRICT;
3729    if (imgvar->data.image_volatile)
3730       inst->buffer_access |= TGSI_MEMORY_VOLATILE;
3731 }
3732
3733 void
3734 glsl_to_tgsi_visitor::visit(ir_call *ir)
3735 {
3736    ir_function_signature *sig = ir->callee;
3737
3738    /* Filter out intrinsics */
3739    switch (sig->intrinsic_id) {
3740    case ir_intrinsic_atomic_counter_read:
3741    case ir_intrinsic_atomic_counter_increment:
3742    case ir_intrinsic_atomic_counter_predecrement:
3743    case ir_intrinsic_atomic_counter_add:
3744    case ir_intrinsic_atomic_counter_min:
3745    case ir_intrinsic_atomic_counter_max:
3746    case ir_intrinsic_atomic_counter_and:
3747    case ir_intrinsic_atomic_counter_or:
3748    case ir_intrinsic_atomic_counter_xor:
3749    case ir_intrinsic_atomic_counter_exchange:
3750    case ir_intrinsic_atomic_counter_comp_swap:
3751       visit_atomic_counter_intrinsic(ir);
3752       return;
3753
3754    case ir_intrinsic_ssbo_load:
3755    case ir_intrinsic_ssbo_store:
3756    case ir_intrinsic_ssbo_atomic_add:
3757    case ir_intrinsic_ssbo_atomic_min:
3758    case ir_intrinsic_ssbo_atomic_max:
3759    case ir_intrinsic_ssbo_atomic_and:
3760    case ir_intrinsic_ssbo_atomic_or:
3761    case ir_intrinsic_ssbo_atomic_xor:
3762    case ir_intrinsic_ssbo_atomic_exchange:
3763    case ir_intrinsic_ssbo_atomic_comp_swap:
3764       visit_ssbo_intrinsic(ir);
3765       return;
3766
3767    case ir_intrinsic_memory_barrier:
3768    case ir_intrinsic_memory_barrier_atomic_counter:
3769    case ir_intrinsic_memory_barrier_buffer:
3770    case ir_intrinsic_memory_barrier_image:
3771    case ir_intrinsic_memory_barrier_shared:
3772    case ir_intrinsic_group_memory_barrier:
3773       visit_membar_intrinsic(ir);
3774       return;
3775
3776    case ir_intrinsic_shared_load:
3777    case ir_intrinsic_shared_store:
3778    case ir_intrinsic_shared_atomic_add:
3779    case ir_intrinsic_shared_atomic_min:
3780    case ir_intrinsic_shared_atomic_max:
3781    case ir_intrinsic_shared_atomic_and:
3782    case ir_intrinsic_shared_atomic_or:
3783    case ir_intrinsic_shared_atomic_xor:
3784    case ir_intrinsic_shared_atomic_exchange:
3785    case ir_intrinsic_shared_atomic_comp_swap:
3786       visit_shared_intrinsic(ir);
3787       return;
3788
3789    case ir_intrinsic_image_load:
3790    case ir_intrinsic_image_store:
3791    case ir_intrinsic_image_atomic_add:
3792    case ir_intrinsic_image_atomic_min:
3793    case ir_intrinsic_image_atomic_max:
3794    case ir_intrinsic_image_atomic_and:
3795    case ir_intrinsic_image_atomic_or:
3796    case ir_intrinsic_image_atomic_xor:
3797    case ir_intrinsic_image_atomic_exchange:
3798    case ir_intrinsic_image_atomic_comp_swap:
3799    case ir_intrinsic_image_size:
3800    case ir_intrinsic_image_samples:
3801       visit_image_intrinsic(ir);
3802       return;
3803
3804    case ir_intrinsic_invalid:
3805    case ir_intrinsic_generic_load:
3806    case ir_intrinsic_generic_store:
3807    case ir_intrinsic_generic_atomic_add:
3808    case ir_intrinsic_generic_atomic_and:
3809    case ir_intrinsic_generic_atomic_or:
3810    case ir_intrinsic_generic_atomic_xor:
3811    case ir_intrinsic_generic_atomic_min:
3812    case ir_intrinsic_generic_atomic_max:
3813    case ir_intrinsic_generic_atomic_exchange:
3814    case ir_intrinsic_generic_atomic_comp_swap:
3815    case ir_intrinsic_shader_clock:
3816       unreachable("Invalid intrinsic");
3817    }
3818 }
3819
3820 void
3821 glsl_to_tgsi_visitor::calc_deref_offsets(ir_dereference *tail,
3822                                          unsigned *array_elements,
3823                                          uint16_t *index,
3824                                          st_src_reg *indirect,
3825                                          unsigned *location)
3826 {
3827    switch (tail->ir_type) {
3828    case ir_type_dereference_record: {
3829       ir_dereference_record *deref_record = tail->as_dereference_record();
3830       const glsl_type *struct_type = deref_record->record->type;
3831       int field_index = deref_record->record->type->field_index(deref_record->field);
3832
3833       calc_deref_offsets(deref_record->record->as_dereference(), array_elements, index, indirect, location);
3834
3835       assert(field_index >= 0);
3836       *location += struct_type->record_location_offset(field_index);
3837       break;
3838    }
3839
3840    case ir_type_dereference_array: {
3841       ir_dereference_array *deref_arr = tail->as_dereference_array();
3842       ir_constant *array_index = deref_arr->array_index->constant_expression_value();
3843
3844       if (!array_index) {
3845          st_src_reg temp_reg;
3846          st_dst_reg temp_dst;
3847
3848          temp_reg = get_temp(glsl_type::uint_type);
3849          temp_dst = st_dst_reg(temp_reg);
3850          temp_dst.writemask = 1;
3851
3852          deref_arr->array_index->accept(this);
3853          if (*array_elements != 1)
3854             emit_asm(NULL, TGSI_OPCODE_MUL, temp_dst, this->result, st_src_reg_for_int(*array_elements));
3855          else
3856             emit_asm(NULL, TGSI_OPCODE_MOV, temp_dst, this->result);
3857
3858          if (indirect->file == PROGRAM_UNDEFINED)
3859             *indirect = temp_reg;
3860          else {
3861             temp_dst = st_dst_reg(*indirect);
3862             temp_dst.writemask = 1;
3863             emit_asm(NULL, TGSI_OPCODE_ADD, temp_dst, *indirect, temp_reg);
3864          }
3865       } else
3866          *index += array_index->value.u[0] * *array_elements;
3867
3868       *array_elements *= deref_arr->array->type->length;
3869
3870       calc_deref_offsets(deref_arr->array->as_dereference(), array_elements, index, indirect, location);
3871       break;
3872    }
3873    default:
3874       break;
3875    }
3876 }
3877
3878 void
3879 glsl_to_tgsi_visitor::get_deref_offsets(ir_dereference *ir,
3880                                         unsigned *array_size,
3881                                         unsigned *base,
3882                                         uint16_t *index,
3883                                         st_src_reg *reladdr,
3884                                         bool opaque)
3885 {
3886    GLuint shader = _mesa_program_enum_to_shader_stage(this->prog->Target);
3887    unsigned location = 0;
3888    ir_variable *var = ir->variable_referenced();
3889
3890    memset(reladdr, 0, sizeof(*reladdr));
3891    reladdr->file = PROGRAM_UNDEFINED;
3892
3893    *base = 0;
3894    *array_size = 1;
3895
3896    assert(var);
3897    location = var->data.location;
3898    calc_deref_offsets(ir, array_size, index, reladdr, &location);
3899
3900    /*
3901     * If we end up with no indirect then adjust the base to the index,
3902     * and set the array size to 1.
3903     */
3904    if (reladdr->file == PROGRAM_UNDEFINED) {
3905       *base = *index;
3906       *array_size = 1;
3907    }
3908
3909    if (opaque) {
3910       assert(location != 0xffffffff);
3911       *base += this->shader_program->data->UniformStorage[location].opaque[shader].index;
3912       *index += this->shader_program->data->UniformStorage[location].opaque[shader].index;
3913    }
3914 }
3915
3916 st_src_reg
3917 glsl_to_tgsi_visitor::canonicalize_gather_offset(st_src_reg offset)
3918 {
3919    if (offset.reladdr || offset.reladdr2) {
3920       st_src_reg tmp = get_temp(glsl_type::ivec2_type);
3921       st_dst_reg tmp_dst = st_dst_reg(tmp);
3922       tmp_dst.writemask = WRITEMASK_XY;
3923       emit_asm(NULL, TGSI_OPCODE_MOV, tmp_dst, offset);
3924       return tmp;
3925    }
3926
3927    return offset;
3928 }
3929
/**
 * Translate a GLSL IR texture operation into a TGSI texture instruction.
 *
 * The steps, in emission order, are:
 *  1. copy the coordinate into a vec4 temporary;
 *  2. evaluate the projector and the per-opcode extra operands (bias, LOD,
 *     gradients, offsets, sample index, gather component);
 *  3. apply projection, either by switching to TXP or by dividing by hand;
 *  4. slot the shadow comparator, sample index or LOD/bias into the
 *     coordinate (or into a separate register for cube arrays);
 *  5. resolve the sampler dereference and emit the texture instruction;
 *  6. tag the instruction with sampler index/array info, offsets, texture
 *     target and result type.
 *
 * The sampled value ends up in a fresh temporary, left in this->result.
 */
void
glsl_to_tgsi_visitor::visit(ir_texture *ir)
{
   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy;
   st_src_reg offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
   st_src_reg levels_src, reladdr;
   st_dst_reg result_dst, coord_dst, cube_sc_dst;
   glsl_to_tgsi_instruction *inst = NULL;
   unsigned opcode = TGSI_OPCODE_NOP;
   const glsl_type *sampler_type = ir->sampler->type;
   unsigned sampler_array_size = 1, sampler_base = 0;
   /* Must start at zero: get_deref_offsets() only adds to it. */
   uint16_t sampler_index = 0;
   bool is_cube_array = false;
   unsigned i;

   /* if we are a cube array sampler */
   if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
        sampler_type->sampler_array)) {
      is_cube_array = true;
   }

   if (ir->coordinate) {
      ir->coordinate->accept(this);

      /* Put our coords in a temp.  We'll need to modify them for shadow,
       * projection, or LOD, so the only case we'd use it as-is is if
       * we're doing plain old texturing.  The optimization passes on
       * glsl_to_tgsi_visitor should handle cleaning up our mess in that case.
       */
      coord = get_temp(glsl_type::vec4_type);
      coord_dst = st_dst_reg(coord);
      coord_dst.writemask = (1 << ir->coordinate->type->vector_elements) - 1;
      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
   }

   if (ir->projector) {
      ir->projector->accept(this);
      projector = this->result;
   }

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = get_temp(ir->type);
   result_dst = st_dst_reg(result_src);

   /* Pick the TGSI opcode and evaluate the operands specific to each
    * GLSL texture operation.
    */
   switch (ir->op) {
   case ir_tex:
      opcode = (is_cube_array && ir->shadow_comparator) ? TGSI_OPCODE_TEX2 : TGSI_OPCODE_TEX;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txb:
      if (is_cube_array ||
          sampler_type == glsl_type::samplerCubeShadow_type) {
         opcode = TGSI_OPCODE_TXB2;
      }
      else {
         opcode = TGSI_OPCODE_TXB;
      }
      ir->lod_info.bias->accept(this);
      lod_info = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txl:
      opcode = is_cube_array ? TGSI_OPCODE_TXL2 : TGSI_OPCODE_TXL;
      ir->lod_info.lod->accept(this);
      lod_info = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txd:
      opcode = TGSI_OPCODE_TXD;
      ir->lod_info.grad.dPdx->accept(this);
      dx = this->result;
      ir->lod_info.grad.dPdy->accept(this);
      dy = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txs:
      opcode = TGSI_OPCODE_TXQ;
      ir->lod_info.lod->accept(this);
      lod_info = this->result;
      break;
   case ir_query_levels:
      /* Same opcode as ir_txs, but the query goes to a separate temporary
       * from which the W channel is copied out further below.
       */
      opcode = TGSI_OPCODE_TXQ;
      lod_info = undef_src;
      levels_src = get_temp(ir->type);
      break;
   case ir_txf:
      opcode = TGSI_OPCODE_TXF;
      ir->lod_info.lod->accept(this);
      lod_info = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txf_ms:
      opcode = TGSI_OPCODE_TXF;
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      break;
   case ir_tg4:
      opcode = TGSI_OPCODE_TG4;
      ir->lod_info.component->accept(this);
      component = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         if (ir->offset->type->base_type == GLSL_TYPE_ARRAY) {
            /* An array of offsets: derive one source register per element
             * from the evaluated array, stepping the index by the element
             * size.
             */
            const glsl_type *elt_type = ir->offset->type->fields.array;
            for (i = 0; i < ir->offset->type->length; i++) {
               offset[i] = this->result;
               offset[i].index += i * type_size(elt_type);
               offset[i].type = elt_type->base_type;
               offset[i].swizzle = swizzle_for_size(elt_type->vector_elements);
               offset[i] = canonicalize_gather_offset(offset[i]);
            }
         } else {
            offset[0] = canonicalize_gather_offset(this->result);
         }
      }
      break;
   case ir_lod:
      opcode = TGSI_OPCODE_LODQ;
      break;
   case ir_texture_samples:
      opcode = TGSI_OPCODE_TXQS;
      break;
   case ir_samples_identical:
      unreachable("Unexpected ir_samples_identical opcode");
   }

   if (ir->projector) {
      if (opcode == TGSI_OPCODE_TEX) {
         /* Slot the projector in as the last component of the coord. */
         coord_dst.writemask = WRITEMASK_W;
         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, projector);
         coord_dst.writemask = WRITEMASK_XYZW;
         opcode = TGSI_OPCODE_TXP;
      } else {
         st_src_reg coord_w = coord;
         coord_w.swizzle = SWIZZLE_WWWW;

         /* For the other TEX opcodes there's no projective version
          * since the last slot is taken up by LOD info.  Do the
          * projective divide now.
          */
         coord_dst.writemask = WRITEMASK_W;
         emit_asm(ir, TGSI_OPCODE_RCP, coord_dst, projector);

         /* In the case where we have to project the coordinates "by hand,"
          * the shadow comparator value must also be projected.
          */
         st_src_reg tmp_src = coord;
         if (ir->shadow_comparator) {
            /* Slot the shadow value in as the second to last component of the
             * coord.
             */
            ir->shadow_comparator->accept(this);

            tmp_src = get_temp(glsl_type::vec4_type);
            st_dst_reg tmp_dst = st_dst_reg(tmp_src);

            /* Projective division not allowed for array samplers. */
            assert(!sampler_type->sampler_array);

            tmp_dst.writemask = WRITEMASK_Z;
            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);

            tmp_dst.writemask = WRITEMASK_XY;
            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
         }

         /* coord.xyz = tmp_src.xyz * (1 / projector), using the reciprocal
          * stored in coord.w above.
          */
         coord_dst.writemask = WRITEMASK_XYZ;
         emit_asm(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);

         coord_dst.writemask = WRITEMASK_XYZW;
         coord.swizzle = SWIZZLE_XYZW;
      }
   }

   /* If projection is done and the opcode is not TGSI_OPCODE_TXP, then the shadow
    * comparator was put in the correct place (and projected) by the code,
    * above, that handles by-hand projection.
    */
   if (ir->shadow_comparator && (!ir->projector || opcode == TGSI_OPCODE_TXP)) {
      /* Slot the shadow value in as the second to last component of the
       * coord.
       */
      ir->shadow_comparator->accept(this);

      if (is_cube_array) {
         /* Cube arrays pass the comparator in its own register (cube_sc),
          * which becomes the second source of the instruction below.
          */
         cube_sc = get_temp(glsl_type::float_type);
         cube_sc_dst = st_dst_reg(cube_sc);
         cube_sc_dst.writemask = WRITEMASK_X;
         emit_asm(ir, TGSI_OPCODE_MOV, cube_sc_dst, this->result);
         /* NOTE(review): repeats the writemask already set before the MOV. */
         cube_sc_dst.writemask = WRITEMASK_X;
      }
      else {
         /* 2D-array and cube shadow samplers take the comparator in W,
          * every other sampler type in Z.
          */
         if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_2D &&
              sampler_type->sampler_array) ||
             sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
            coord_dst.writemask = WRITEMASK_W;
         } else {
            coord_dst.writemask = WRITEMASK_Z;
         }
         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
         coord_dst.writemask = WRITEMASK_XYZW;
      }
   }

   if (ir->op == ir_txf_ms) {
      /* Multisample fetch: the sample index goes in the W channel. */
      coord_dst.writemask = WRITEMASK_W;
      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample_index);
      coord_dst.writemask = WRITEMASK_XYZW;
   } else if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB ||
       opcode == TGSI_OPCODE_TXF) {
      /* TGSI stores LOD or LOD bias in the last channel of the coords. */
      coord_dst.writemask = WRITEMASK_W;
      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
      coord_dst.writemask = WRITEMASK_XYZW;
   }

   /* Resolve which sampler (or range of samplers, when indirectly indexed)
    * is referenced, and load the indirect part into the sampler address
    * register.
    */
   get_deref_offsets(ir->sampler, &sampler_array_size, &sampler_base,
                     &sampler_index, &reladdr, true);
   if (reladdr.file != PROGRAM_UNDEFINED)
      emit_arl(ir, sampler_reladdr, reladdr);

   /* Emit the texture instruction with the operand layout its opcode
    * expects.
    */
   if (opcode == TGSI_OPCODE_TXD)
      inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
   else if (opcode == TGSI_OPCODE_TXQ) {
      if (ir->op == ir_query_levels) {
         /* the level is stored in W */
         inst = emit_asm(ir, opcode, st_dst_reg(levels_src), lod_info);
         result_dst.writemask = WRITEMASK_X;
         levels_src.swizzle = SWIZZLE_WWWW;
         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, levels_src);
      } else
         inst = emit_asm(ir, opcode, result_dst, lod_info);
   } else if (opcode == TGSI_OPCODE_TXQS) {
      inst = emit_asm(ir, opcode, result_dst);
   } else if (opcode == TGSI_OPCODE_TXF) {
      inst = emit_asm(ir, opcode, result_dst, coord);
   } else if (opcode == TGSI_OPCODE_TXL2 || opcode == TGSI_OPCODE_TXB2) {
      inst = emit_asm(ir, opcode, result_dst, coord, lod_info);
   } else if (opcode == TGSI_OPCODE_TEX2) {
      inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
   } else if (opcode == TGSI_OPCODE_TG4) {
      /* For shadow cube arrays the comparator takes the place of the
       * gather component operand.
       */
      if (is_cube_array && ir->shadow_comparator) {
         inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
      } else {
         inst = emit_asm(ir, opcode, result_dst, coord, component);
      }
   } else
      inst = emit_asm(ir, opcode, result_dst, coord);

   if (ir->shadow_comparator)
      inst->tex_shadow = GL_TRUE;

   inst->resource.index = sampler_index;
   inst->sampler_array_size = sampler_array_size;
   inst->sampler_base = sampler_base;

   /* Keep a copy of the indirect sampler offset on the instruction. */
   if (reladdr.file != PROGRAM_UNDEFINED) {
      inst->resource.reladdr = ralloc(mem_ctx, st_src_reg);
      memcpy(inst->resource.reladdr, &reladdr, sizeof(reladdr));
   }

   if (ir->offset) {
      if (!inst->tex_offsets)
         inst->tex_offsets = rzalloc_array(inst, st_src_reg, MAX_GLSL_TEXTURE_OFFSET);

      /* Copy offsets up to the first entry that was never filled in; that
       * count becomes the number of offsets on the instruction.
       */
      for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET && offset[i].file != PROGRAM_UNDEFINED; i++)
         inst->tex_offsets[i] = offset[i];
      inst->tex_offset_num_offset = i;
   }

   /* Map the GLSL sampler dimensionality onto a Mesa texture target index. */
   switch (sampler_type->sampler_dimensionality) {
   case GLSL_SAMPLER_DIM_1D:
      inst->tex_target = (sampler_type->sampler_array)
         ? TEXTURE_1D_ARRAY_INDEX : TEXTURE_1D_INDEX;
      break;
   case GLSL_SAMPLER_DIM_2D:
      inst->tex_target = (sampler_type->sampler_array)
         ? TEXTURE_2D_ARRAY_INDEX : TEXTURE_2D_INDEX;
      break;
   case GLSL_SAMPLER_DIM_3D:
      inst->tex_target = TEXTURE_3D_INDEX;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      inst->tex_target = (sampler_type->sampler_array)
         ? TEXTURE_CUBE_ARRAY_INDEX : TEXTURE_CUBE_INDEX;
      break;
   case GLSL_SAMPLER_DIM_RECT:
      inst->tex_target = TEXTURE_RECT_INDEX;
      break;
   case GLSL_SAMPLER_DIM_BUF:
      inst->tex_target = TEXTURE_BUFFER_INDEX;
      break;
   case GLSL_SAMPLER_DIM_EXTERNAL:
      inst->tex_target = TEXTURE_EXTERNAL_INDEX;
      break;
   case GLSL_SAMPLER_DIM_MS:
      inst->tex_target = (sampler_type->sampler_array)
         ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX : TEXTURE_2D_MULTISAMPLE_INDEX;
      break;
   default:
      assert(!"Should not get here.");
   }

   inst->tex_type = ir->type->base_type;

   this->result = result_src;
}
4255
4256 void
4257 glsl_to_tgsi_visitor::visit(ir_return *ir)
4258 {
4259    assert(!ir->get_value());
4260
4261    emit_asm(ir, TGSI_OPCODE_RET);
4262 }
4263
4264 void
4265 glsl_to_tgsi_visitor::visit(ir_discard *ir)
4266 {
4267    if (ir->condition) {
4268       ir->condition->accept(this);
4269       st_src_reg condition = this->result;
4270
4271       /* Convert the bool condition to a float so we can negate. */
4272       if (native_integers) {
4273          st_src_reg temp = get_temp(ir->condition->type);
4274          emit_asm(ir, TGSI_OPCODE_AND, st_dst_reg(temp),
4275               condition, st_src_reg_for_float(1.0));
4276          condition = temp;
4277       }
4278
4279       condition.negate = ~condition.negate;
4280       emit_asm(ir, TGSI_OPCODE_KILL_IF, undef_dst, condition);
4281    } else {
4282       /* unconditional kil */
4283       emit_asm(ir, TGSI_OPCODE_KILL);
4284    }
4285 }
4286
4287 void
4288 glsl_to_tgsi_visitor::visit(ir_if *ir)
4289 {
4290    unsigned if_opcode;
4291    glsl_to_tgsi_instruction *if_inst;
4292
4293    ir->condition->accept(this);
4294    assert(this->result.file != PROGRAM_UNDEFINED);
4295
4296    if_opcode = native_integers ? TGSI_OPCODE_UIF : TGSI_OPCODE_IF;
4297
4298    if_inst = emit_asm(ir->condition, if_opcode, undef_dst, this->result);
4299
4300    this->instructions.push_tail(if_inst);
4301
4302    visit_exec_list(&ir->then_instructions, this);
4303
4304    if (!ir->else_instructions.is_empty()) {
4305       emit_asm(ir->condition, TGSI_OPCODE_ELSE);
4306       visit_exec_list(&ir->else_instructions, this);
4307    }
4308
4309    if_inst = emit_asm(ir->condition, TGSI_OPCODE_ENDIF);
4310 }
4311
4312
4313 void
4314 glsl_to_tgsi_visitor::visit(ir_emit_vertex *ir)
4315 {
4316    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
4317
4318    ir->stream->accept(this);
4319    emit_asm(ir, TGSI_OPCODE_EMIT, undef_dst, this->result);
4320 }
4321
4322 void
4323 glsl_to_tgsi_visitor::visit(ir_end_primitive *ir)
4324 {
4325    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
4326
4327    ir->stream->accept(this);
4328    emit_asm(ir, TGSI_OPCODE_ENDPRIM, undef_dst, this->result);
4329 }
4330
4331 void
4332 glsl_to_tgsi_visitor::visit(ir_barrier *ir)
4333 {
4334    assert(this->prog->Target == GL_TESS_CONTROL_PROGRAM_NV ||
4335           this->prog->Target == GL_COMPUTE_PROGRAM_NV);
4336
4337    emit_asm(ir, TGSI_OPCODE_BARRIER);
4338 }
4339
4340 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
4341 {
4342    STATIC_ASSERT(sizeof(samplers_used) * 8 >= PIPE_MAX_SAMPLERS);
4343
4344    result.file = PROGRAM_UNDEFINED;
4345    next_temp = 1;
4346    array_sizes = NULL;
4347    max_num_arrays = 0;
4348    next_array = 0;
4349    num_inputs = 0;
4350    num_outputs = 0;
4351    num_input_arrays = 0;
4352    num_output_arrays = 0;
4353    num_immediates = 0;
4354    num_address_regs = 0;
4355    samplers_used = 0;
4356    buffers_used = 0;
4357    images_used = 0;
4358    indirect_addr_consts = false;
4359    wpos_transform_const = -1;
4360    glsl_version = 0;
4361    native_integers = false;
4362    mem_ctx = ralloc_context(NULL);
4363    ctx = NULL;
4364    prog = NULL;
4365    shader_program = NULL;
4366    shader = NULL;
4367    options = NULL;
4368    have_sqrt = false;
4369    have_fma = false;
4370    use_shared_memory = false;
4371 }
4372
4373 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
4374 {
4375    free(array_sizes);
4376    ralloc_free(mem_ctx);
4377 }
4378
4379 extern "C" void free_glsl_to_tgsi_visitor(glsl_to_tgsi_visitor *v)
4380 {
4381    delete v;
4382 }
4383
4384
4385 /**
4386  * Count resources used by the given gpu program (number of texture
4387  * samplers, etc).
4388  */
4389 static void
4390 count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
4391 {
4392    v->samplers_used = 0;
4393    v->buffers_used = 0;
4394    v->images_used = 0;
4395
4396    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
4397       if (inst->info->is_tex) {
4398          for (int i = 0; i < inst->sampler_array_size; i++) {
4399             unsigned idx = inst->sampler_base + i;
4400             v->samplers_used |= 1u << idx;
4401
4402             debug_assert(idx < (int)ARRAY_SIZE(v->sampler_types));
4403             v->sampler_types[idx] = inst->tex_type;
4404             v->sampler_targets[idx] =
4405                st_translate_texture_target(inst->tex_target, inst->tex_shadow);
4406
4407             if (inst->tex_shadow) {
4408                prog->ShadowSamplers |= 1 << (inst->resource.index + i);
4409             }
4410          }
4411       }
4412
4413       if (inst->tex_target == TEXTURE_EXTERNAL_INDEX)
4414          prog->ExternalSamplersUsed |= 1 << inst->resource.index;
4415
4416       if (inst->resource.file != PROGRAM_UNDEFINED && (
4417                 is_resource_instruction(inst->op) ||
4418                 inst->op == TGSI_OPCODE_STORE)) {
4419          if (inst->resource.file == PROGRAM_BUFFER) {
4420             v->buffers_used |= 1 << inst->resource.index;
4421          } else if (inst->resource.file == PROGRAM_MEMORY) {
4422             v->use_shared_memory = true;
4423          } else {
4424             assert(inst->resource.file == PROGRAM_IMAGE);
4425             for (int i = 0; i < inst->sampler_array_size; i++) {
4426                unsigned idx = inst->sampler_base + i;
4427                v->images_used |= 1 << idx;
4428                v->image_targets[idx] =
4429                   st_translate_texture_target(inst->tex_target, false);
4430                v->image_formats[idx] = inst->image_format;
4431             }
4432          }
4433       }
4434    }
4435    prog->SamplersUsed = v->samplers_used;
4436
4437    if (v->shader_program != NULL)
4438       _mesa_update_shader_textures_used(v->shader_program, prog);
4439 }
4440
4441 /**
4442  * Returns the mask of channels (bitmask of WRITEMASK_X,Y,Z,W) which
4443  * are read from the given src in this instruction
4444  */
4445 static int
4446 get_src_arg_mask(st_dst_reg dst, st_src_reg src)
4447 {
4448    int read_mask = 0, comp;
4449
4450    /* Now, given the src swizzle and the written channels, find which
4451     * components are actually read
4452     */
4453    for (comp = 0; comp < 4; ++comp) {
4454       const unsigned coord = GET_SWZ(src.swizzle, comp);
4455       assert(coord < 4);
4456       if (dst.writemask & (1 << comp) && coord <= SWIZZLE_W)
4457          read_mask |= 1 << coord;
4458    }
4459
4460    return read_mask;
4461 }
4462
4463 /**
4464  * This pass replaces CMP T0, T1 T2 T0 with MOV T0, T2 when the CMP
4465  * instruction is the first instruction to write to register T0.  There are
4466  * several lowering passes done in GLSL IR (e.g. branches and
4467  * relative addressing) that create a large number of conditional assignments
4468  * that ir_to_mesa converts to CMP instructions like the one mentioned above.
4469  *
4470  * Here is why this conversion is safe:
4471  * CMP T0, T1 T2 T0 can be expanded to:
4472  * if (T1 < 0.0)
4473  *   MOV T0, T2;
4474  * else
4475  *   MOV T0, T0;
4476  *
4477  * If (T1 < 0.0) evaluates to true then our replacement MOV T0, T2 is the same
4478  * as the original program.  If (T1 < 0.0) evaluates to false, executing
4479  * MOV T0, T0 will store a garbage value in T0 since T0 is uninitialized.
4480  * Therefore, it doesn't matter that we are replacing MOV T0, T0 with MOV T0, T2
4481  * because any instruction that was going to read from T0 after this was going
4482  * to read a garbage value anyway.
4483  */
4484 void
4485 glsl_to_tgsi_visitor::simplify_cmp(void)
4486 {
4487    int tempWritesSize = 0;
4488    unsigned *tempWrites = NULL;
4489    unsigned outputWrites[VARYING_SLOT_TESS_MAX];
4490
4491    memset(outputWrites, 0, sizeof(outputWrites));
4492
4493    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4494       unsigned prevWriteMask = 0;
4495
4496       /* Give up if we encounter relative addressing or flow control. */
4497       if (inst->dst[0].reladdr || inst->dst[0].reladdr2 ||
4498           inst->dst[1].reladdr || inst->dst[1].reladdr2 ||
4499           tgsi_get_opcode_info(inst->op)->is_branch ||
4500           inst->op == TGSI_OPCODE_CONT ||
4501           inst->op == TGSI_OPCODE_END ||
4502           inst->op == TGSI_OPCODE_RET) {
4503          break;
4504       }
4505
4506       if (inst->dst[0].file == PROGRAM_OUTPUT) {
4507          assert(inst->dst[0].index < (signed)ARRAY_SIZE(outputWrites));
4508          prevWriteMask = outputWrites[inst->dst[0].index];
4509          outputWrites[inst->dst[0].index] |= inst->dst[0].writemask;
4510       } else if (inst->dst[0].file == PROGRAM_TEMPORARY) {
4511          if (inst->dst[0].index >= tempWritesSize) {
4512             const int inc = 4096;
4513
4514             tempWrites = (unsigned*)
4515                          realloc(tempWrites,
4516                                  (tempWritesSize + inc) * sizeof(unsigned));
4517             if (!tempWrites)
4518                return;
4519
4520             memset(tempWrites + tempWritesSize, 0, inc * sizeof(unsigned));
4521             tempWritesSize += inc;
4522          }
4523
4524          prevWriteMask = tempWrites[inst->dst[0].index];
4525          tempWrites[inst->dst[0].index] |= inst->dst[0].writemask;
4526       } else
4527          continue;
4528
4529       /* For a CMP to be considered a conditional write, the destination
4530        * register and source register two must be the same. */
4531       if (inst->op == TGSI_OPCODE_CMP
4532           && !(inst->dst[0].writemask & prevWriteMask)
4533           && inst->src[2].file == inst->dst[0].file
4534           && inst->src[2].index == inst->dst[0].index
4535           && inst->dst[0].writemask == get_src_arg_mask(inst->dst[0], inst->src[2])) {
4536
4537          inst->op = TGSI_OPCODE_MOV;
4538          inst->info = tgsi_get_opcode_info(inst->op);
4539          inst->src[0] = inst->src[1];
4540       }
4541    }
4542
4543    free(tempWrites);
4544 }
4545
4546 /* Replaces all references to a temporary register index with another index. */
4547 void
4548 glsl_to_tgsi_visitor::rename_temp_registers(int num_renames, struct rename_reg_pair *renames)
4549 {
4550    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4551       unsigned j;
4552       int k;
4553       for (j = 0; j < num_inst_src_regs(inst); j++) {
4554          if (inst->src[j].file == PROGRAM_TEMPORARY)
4555             for (k = 0; k < num_renames; k++)
4556                if (inst->src[j].index == renames[k].old_reg)
4557                   inst->src[j].index = renames[k].new_reg;
4558       }
4559
4560       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4561          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
4562             for (k = 0; k < num_renames; k++)
4563                if (inst->tex_offsets[j].index == renames[k].old_reg)
4564                   inst->tex_offsets[j].index = renames[k].new_reg;
4565       }
4566
4567       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4568          if (inst->dst[j].file == PROGRAM_TEMPORARY)
4569              for (k = 0; k < num_renames; k++)
4570                 if (inst->dst[j].index == renames[k].old_reg)
4571                    inst->dst[j].index = renames[k].new_reg;
4572       }
4573    }
4574 }
4575
4576 void
4577 glsl_to_tgsi_visitor::get_first_temp_read(int *first_reads)
4578 {
4579    int depth = 0; /* loop depth */
4580    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4581    unsigned i = 0, j;
4582
4583    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4584       for (j = 0; j < num_inst_src_regs(inst); j++) {
4585          if (inst->src[j].file == PROGRAM_TEMPORARY) {
4586             if (first_reads[inst->src[j].index] == -1)
4587                 first_reads[inst->src[j].index] = (depth == 0) ? i : loop_start;
4588          }
4589       }
4590       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4591          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY) {
4592             if (first_reads[inst->tex_offsets[j].index] == -1)
4593                first_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : loop_start;
4594          }
4595       }
4596       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4597          if(depth++ == 0)
4598             loop_start = i;
4599       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4600          if (--depth == 0)
4601             loop_start = -1;
4602       }
4603       assert(depth >= 0);
4604       i++;
4605    }
4606 }
4607
4608 void
4609 glsl_to_tgsi_visitor::get_last_temp_read_first_temp_write(int *last_reads, int *first_writes)
4610 {
4611    int depth = 0; /* loop depth */
4612    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4613    unsigned i = 0, j;
4614    int k;
4615    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4616       for (j = 0; j < num_inst_src_regs(inst); j++) {
4617          if (inst->src[j].file == PROGRAM_TEMPORARY)
4618             last_reads[inst->src[j].index] = (depth == 0) ? i : -2;
4619       }
4620       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4621          if (inst->dst[j].file == PROGRAM_TEMPORARY) {
4622             if (first_writes[inst->dst[j].index] == -1)
4623                first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
4624             last_reads[inst->dst[j].index] = (depth == 0) ? i : -2;
4625          }
4626       }
4627       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4628          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
4629             last_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : -2;
4630       }
4631       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4632          if(depth++ == 0)
4633             loop_start = i;
4634       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4635          if (--depth == 0) {
4636             loop_start = -1;
4637             for (k = 0; k < this->next_temp; k++) {
4638                if (last_reads[k] == -2) {
4639                   last_reads[k] = i;
4640                }
4641             }
4642          }
4643       }
4644       assert(depth >= 0);
4645       i++;
4646    }
4647 }
4648
4649 void
4650 glsl_to_tgsi_visitor::get_last_temp_write(int *last_writes)
4651 {
4652    int depth = 0; /* loop depth */
4653    int i = 0, k;
4654    unsigned j;
4655
4656    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4657       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4658          if (inst->dst[j].file == PROGRAM_TEMPORARY)
4659             last_writes[inst->dst[j].index] = (depth == 0) ? i : -2;
4660       }
4661
4662       if (inst->op == TGSI_OPCODE_BGNLOOP)
4663          depth++;
4664       else if (inst->op == TGSI_OPCODE_ENDLOOP)
4665          if (--depth == 0) {
4666             for (k = 0; k < this->next_temp; k++) {
4667                if (last_writes[k] == -2) {
4668                   last_writes[k] = i;
4669                }
4670             }
4671          }
4672       assert(depth >= 0);
4673       i++;
4674    }
4675 }
4676
4677 /*
4678  * On a basic block basis, tracks available PROGRAM_TEMPORARY register
4679  * channels for copy propagation and updates following instructions to
4680  * use the original versions.
4681  *
4682  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
4683  * will occur.  As an example, a TXP production before this pass:
4684  *
4685  * 0: MOV TEMP[1], INPUT[4].xyyy;
4686  * 1: MOV TEMP[1].w, INPUT[4].wwww;
4687  * 2: TXP TEMP[2], TEMP[1], texture[0], 2D;
4688  *
4689  * and after:
4690  *
4691  * 0: MOV TEMP[1], INPUT[4].xyyy;
4692  * 1: MOV TEMP[1].w, INPUT[4].wwww;
4693  * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
4694  *
4695  * which allows for dead code elimination on TEMP[1]'s writes.
4696  */
4697 void
4698 glsl_to_tgsi_visitor::copy_propagate(void)
4699 {
4700    glsl_to_tgsi_instruction **acp = rzalloc_array(mem_ctx,
4701                                                   glsl_to_tgsi_instruction *,
4702                                                   this->next_temp * 4);
4703    int *acp_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
4704    int level = 0;
4705
4706    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4707       assert(inst->dst[0].file != PROGRAM_TEMPORARY
4708              || inst->dst[0].index < this->next_temp);
4709
4710       /* First, do any copy propagation possible into the src regs. */
4711       for (int r = 0; r < 3; r++) {
4712          glsl_to_tgsi_instruction *first = NULL;
4713          bool good = true;
4714          int acp_base = inst->src[r].index * 4;
4715
4716          if (inst->src[r].file != PROGRAM_TEMPORARY ||
4717              inst->src[r].reladdr ||
4718              inst->src[r].reladdr2)
4719             continue;
4720
4721          /* See if we can find entries in the ACP consisting of MOVs
4722           * from the same src register for all the swizzled channels
4723           * of this src register reference.
4724           */
4725          for (int i = 0; i < 4; i++) {
4726             int src_chan = GET_SWZ(inst->src[r].swizzle, i);
4727             glsl_to_tgsi_instruction *copy_chan = acp[acp_base + src_chan];
4728
4729             if (!copy_chan) {
4730                good = false;
4731                break;
4732             }
4733
4734             assert(acp_level[acp_base + src_chan] <= level);
4735
4736             if (!first) {
4737                first = copy_chan;
4738             } else {
4739                if (first->src[0].file != copy_chan->src[0].file ||
4740                    first->src[0].index != copy_chan->src[0].index ||
4741                    first->src[0].double_reg2 != copy_chan->src[0].double_reg2 ||
4742                    first->src[0].index2D != copy_chan->src[0].index2D) {
4743                   good = false;
4744                   break;
4745                }
4746             }
4747          }
4748
4749          if (good) {
4750             /* We've now validated that we can copy-propagate to
4751              * replace this src register reference.  Do it.
4752              */
4753             inst->src[r].file = first->src[0].file;
4754             inst->src[r].index = first->src[0].index;
4755             inst->src[r].index2D = first->src[0].index2D;
4756             inst->src[r].has_index2 = first->src[0].has_index2;
4757             inst->src[r].double_reg2 = first->src[0].double_reg2;
4758             inst->src[r].array_id = first->src[0].array_id;
4759
4760             int swizzle = 0;
4761             for (int i = 0; i < 4; i++) {
4762                int src_chan = GET_SWZ(inst->src[r].swizzle, i);
4763                glsl_to_tgsi_instruction *copy_inst = acp[acp_base + src_chan];
4764                swizzle |= (GET_SWZ(copy_inst->src[0].swizzle, src_chan) << (3 * i));
4765             }
4766             inst->src[r].swizzle = swizzle;
4767          }
4768       }
4769
4770       switch (inst->op) {
4771       case TGSI_OPCODE_BGNLOOP:
4772       case TGSI_OPCODE_ENDLOOP:
4773          /* End of a basic block, clear the ACP entirely. */
4774          memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
4775          break;
4776
4777       case TGSI_OPCODE_IF:
4778       case TGSI_OPCODE_UIF:
4779          ++level;
4780          break;
4781
4782       case TGSI_OPCODE_ENDIF:
4783       case TGSI_OPCODE_ELSE:
4784          /* Clear all channels written inside the block from the ACP, but
4785           * leaving those that were not touched.
4786           */
4787          for (int r = 0; r < this->next_temp; r++) {
4788             for (int c = 0; c < 4; c++) {
4789                if (!acp[4 * r + c])
4790                   continue;
4791
4792                if (acp_level[4 * r + c] >= level)
4793                   acp[4 * r + c] = NULL;
4794             }
4795          }
4796          if (inst->op == TGSI_OPCODE_ENDIF)
4797             --level;
4798          break;
4799
4800       default:
4801          /* Continuing the block, clear any written channels from
4802           * the ACP.
4803           */
4804          for (int d = 0; d < 2; d++) {
4805             if (inst->dst[d].file == PROGRAM_TEMPORARY && inst->dst[d].reladdr) {
4806                /* Any temporary might be written, so no copy propagation
4807                 * across this instruction.
4808                 */
4809                memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
4810             } else if (inst->dst[d].file == PROGRAM_OUTPUT &&
4811                        inst->dst[d].reladdr) {
4812                /* Any output might be written, so no copy propagation
4813                 * from outputs across this instruction.
4814                 */
4815                for (int r = 0; r < this->next_temp; r++) {
4816                   for (int c = 0; c < 4; c++) {
4817                      if (!acp[4 * r + c])
4818                         continue;
4819
4820                      if (acp[4 * r + c]->src[0].file == PROGRAM_OUTPUT)
4821                         acp[4 * r + c] = NULL;
4822                   }
4823                }
4824             } else if (inst->dst[d].file == PROGRAM_TEMPORARY ||
4825                        inst->dst[d].file == PROGRAM_OUTPUT) {
4826                /* Clear where it's used as dst. */
4827                if (inst->dst[d].file == PROGRAM_TEMPORARY) {
4828                   for (int c = 0; c < 4; c++) {
4829                      if (inst->dst[d].writemask & (1 << c))
4830                         acp[4 * inst->dst[d].index + c] = NULL;
4831                   }
4832                }
4833
4834                /* Clear where it's used as src. */
4835                for (int r = 0; r < this->next_temp; r++) {
4836                   for (int c = 0; c < 4; c++) {
4837                      if (!acp[4 * r + c])
4838                         continue;
4839
4840                      int src_chan = GET_SWZ(acp[4 * r + c]->src[0].swizzle, c);
4841
4842                      if (acp[4 * r + c]->src[0].file == inst->dst[d].file &&
4843                          acp[4 * r + c]->src[0].index == inst->dst[d].index &&
4844                          inst->dst[d].writemask & (1 << src_chan)) {
4845                         acp[4 * r + c] = NULL;
4846                      }
4847                   }
4848                }
4849             }
4850          }
4851          break;
4852       }
4853
4854       /* If this is a copy, add it to the ACP. */
4855       if (inst->op == TGSI_OPCODE_MOV &&
4856           inst->dst[0].file == PROGRAM_TEMPORARY &&
4857           !(inst->dst[0].file == inst->src[0].file &&
4858              inst->dst[0].index == inst->src[0].index) &&
4859           !inst->dst[0].reladdr &&
4860           !inst->dst[0].reladdr2 &&
4861           !inst->saturate &&
4862           inst->src[0].file != PROGRAM_ARRAY &&
4863           !inst->src[0].reladdr &&
4864           !inst->src[0].reladdr2 &&
4865           !inst->src[0].negate &&
4866           !inst->src[0].abs) {
4867          for (int i = 0; i < 4; i++) {
4868             if (inst->dst[0].writemask & (1 << i)) {
4869                acp[4 * inst->dst[0].index + i] = inst;
4870                acp_level[4 * inst->dst[0].index + i] = level;
4871             }
4872          }
4873       }
4874    }
4875
4876    ralloc_free(acp_level);
4877    ralloc_free(acp);
4878 }
4879
4880 /*
4881  * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for dead
4882  * code elimination.
4883  *
4884  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
4885  * will occur.  As an example, a TXP production after copy propagation but
4886  * before this pass:
4887  *
4888  * 0: MOV TEMP[1], INPUT[4].xyyy;
4889  * 1: MOV TEMP[1].w, INPUT[4].wwww;
4890  * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
4891  *
4892  * and after this pass:
4893  *
4894  * 0: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
4895  */
4896 int
4897 glsl_to_tgsi_visitor::eliminate_dead_code(void)
4898 {
4899    glsl_to_tgsi_instruction **writes = rzalloc_array(mem_ctx,
4900                                                      glsl_to_tgsi_instruction *,
4901                                                      this->next_temp * 4);
4902    int *write_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
4903    int level = 0;
4904    int removed = 0;
4905
4906    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4907       assert(inst->dst[0].file != PROGRAM_TEMPORARY
4908              || inst->dst[0].index < this->next_temp);
4909
4910       switch (inst->op) {
4911       case TGSI_OPCODE_BGNLOOP:
4912       case TGSI_OPCODE_ENDLOOP:
4913       case TGSI_OPCODE_CONT:
4914       case TGSI_OPCODE_BRK:
4915          /* End of a basic block, clear the write array entirely.
4916           *
4917           * This keeps us from killing dead code when the writes are
4918           * on either side of a loop, even when the register isn't touched
4919           * inside the loop.  However, glsl_to_tgsi_visitor doesn't seem to emit
4920           * dead code of this type, so it shouldn't make a difference as long as
4921           * the dead code elimination pass in the GLSL compiler does its job.
4922           */
4923          memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
4924          break;
4925
4926       case TGSI_OPCODE_ENDIF:
4927       case TGSI_OPCODE_ELSE:
4928          /* Promote the recorded level of all channels written inside the
4929           * preceding if or else block to the level above the if/else block.
4930           */
4931          for (int r = 0; r < this->next_temp; r++) {
4932             for (int c = 0; c < 4; c++) {
4933                if (!writes[4 * r + c])
4934                   continue;
4935
4936                if (write_level[4 * r + c] == level)
4937                   write_level[4 * r + c] = level-1;
4938             }
4939          }
4940          if(inst->op == TGSI_OPCODE_ENDIF)
4941             --level;
4942          break;
4943
4944       case TGSI_OPCODE_IF:
4945       case TGSI_OPCODE_UIF:
4946          ++level;
4947          /* fallthrough to default case to mark the condition as read */
4948       default:
4949          /* Continuing the block, clear any channels from the write array that
4950           * are read by this instruction.
4951           */
4952          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
4953             if (inst->src[i].file == PROGRAM_TEMPORARY && inst->src[i].reladdr){
4954                /* Any temporary might be read, so no dead code elimination
4955                 * across this instruction.
4956                 */
4957                memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
4958             } else if (inst->src[i].file == PROGRAM_TEMPORARY) {
4959                /* Clear where it's used as src. */
4960                int src_chans = 1 << GET_SWZ(inst->src[i].swizzle, 0);
4961                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 1);
4962                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 2);
4963                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 3);
4964
4965                for (int c = 0; c < 4; c++) {
4966                   if (src_chans & (1 << c))
4967                      writes[4 * inst->src[i].index + c] = NULL;
4968                }
4969             }
4970          }
4971          for (unsigned i = 0; i < inst->tex_offset_num_offset; i++) {
4972             if (inst->tex_offsets[i].file == PROGRAM_TEMPORARY && inst->tex_offsets[i].reladdr){
4973                /* Any temporary might be read, so no dead code elimination
4974                 * across this instruction.
4975                 */
4976                memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
4977             } else if (inst->tex_offsets[i].file == PROGRAM_TEMPORARY) {
4978                /* Clear where it's used as src. */
4979                int src_chans = 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 0);
4980                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 1);
4981                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 2);
4982                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 3);
4983
4984                for (int c = 0; c < 4; c++) {
4985                   if (src_chans & (1 << c))
4986                      writes[4 * inst->tex_offsets[i].index + c] = NULL;
4987                }
4988             }
4989          }
4990          break;
4991       }
4992
4993       /* If this instruction writes to a temporary, add it to the write array.
4994        * If there is already an instruction in the write array for one or more
4995        * of the channels, flag that channel write as dead.
4996        */
4997       for (unsigned i = 0; i < ARRAY_SIZE(inst->dst); i++) {
4998          if (inst->dst[i].file == PROGRAM_TEMPORARY &&
4999              !inst->dst[i].reladdr) {
5000             for (int c = 0; c < 4; c++) {
5001                if (inst->dst[i].writemask & (1 << c)) {
5002                   if (writes[4 * inst->dst[i].index + c]) {
5003                      if (write_level[4 * inst->dst[i].index + c] < level)
5004                         continue;
5005                      else
5006                         writes[4 * inst->dst[i].index + c]->dead_mask |= (1 << c);
5007                   }
5008                   writes[4 * inst->dst[i].index + c] = inst;
5009                   write_level[4 * inst->dst[i].index + c] = level;
5010                }
5011             }
5012          }
5013       }
5014    }
5015
5016    /* Anything still in the write array at this point is dead code. */
5017    for (int r = 0; r < this->next_temp; r++) {
5018       for (int c = 0; c < 4; c++) {
5019          glsl_to_tgsi_instruction *inst = writes[4 * r + c];
5020          if (inst)
5021             inst->dead_mask |= (1 << c);
5022       }
5023    }
5024
5025    /* Now actually remove the instructions that are completely dead and update
5026     * the writemask of other instructions with dead channels.
5027     */
5028    foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
5029       if (!inst->dead_mask || !inst->dst[0].writemask)
5030          continue;
5031       /* No amount of dead masks should remove memory stores */
5032       if (inst->info->is_store)
5033          continue;
5034
5035       if ((inst->dst[0].writemask & ~inst->dead_mask) == 0) {
5036          inst->remove();
5037          delete inst;
5038          removed++;
5039       } else {
5040          if (glsl_base_type_is_64bit(inst->dst[0].type)) {
5041             if (inst->dead_mask == WRITEMASK_XY ||
5042                 inst->dead_mask == WRITEMASK_ZW)
5043                inst->dst[0].writemask &= ~(inst->dead_mask);
5044          } else
5045             inst->dst[0].writemask &= ~(inst->dead_mask);
5046       }
5047    }
5048
5049    ralloc_free(write_level);
5050    ralloc_free(writes);
5051
5052    return removed;
5053 }
5054
5055 /* merge DFRACEXP instructions into one. */
5056 void
5057 glsl_to_tgsi_visitor::merge_two_dsts(void)
5058 {
5059    foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
5060       glsl_to_tgsi_instruction *inst2;
5061       bool merged;
5062       if (num_inst_dst_regs(inst) != 2)
5063          continue;
5064
5065       if (inst->dst[0].file != PROGRAM_UNDEFINED &&
5066           inst->dst[1].file != PROGRAM_UNDEFINED)
5067          continue;
5068
5069       inst2 = (glsl_to_tgsi_instruction *) inst->next;
5070       do {
5071
5072          if (inst->src[0].file == inst2->src[0].file &&
5073              inst->src[0].index == inst2->src[0].index &&
5074              inst->src[0].type == inst2->src[0].type &&
5075              inst->src[0].swizzle == inst2->src[0].swizzle)
5076             break;
5077          inst2 = (glsl_to_tgsi_instruction *) inst2->next;
5078       } while (inst2);
5079
5080       if (!inst2)
5081          continue;
5082       merged = false;
5083       if (inst->dst[0].file == PROGRAM_UNDEFINED) {
5084          merged = true;
5085          inst->dst[0] = inst2->dst[0];
5086       } else if (inst->dst[1].file == PROGRAM_UNDEFINED) {
5087          inst->dst[1] = inst2->dst[1];
5088          merged = true;
5089       }
5090
5091       if (merged) {
5092          inst2->remove();
5093          delete inst2;
5094       }
5095    }
5096 }
5097
5098 /* Merges temporary registers together where possible to reduce the number of
5099  * registers needed to run a program.
5100  *
5101  * Produces optimal code only after copy propagation and dead code elimination
5102  * have been run. */
5103 void
5104 glsl_to_tgsi_visitor::merge_registers(void)
5105 {
5106    int *last_reads = rzalloc_array(mem_ctx, int, this->next_temp);
5107    int *first_writes = rzalloc_array(mem_ctx, int, this->next_temp);
5108    struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
5109    int i, j;
5110    int num_renames = 0;
5111
5112    /* Read the indices of the last read and first write to each temp register
5113     * into an array so that we don't have to traverse the instruction list as
5114     * much. */
5115    for (i = 0; i < this->next_temp; i++) {
5116       last_reads[i] = -1;
5117       first_writes[i] = -1;
5118    }
5119    get_last_temp_read_first_temp_write(last_reads, first_writes);
5120
5121    /* Start looking for registers with non-overlapping usages that can be
5122     * merged together. */
5123    for (i = 0; i < this->next_temp; i++) {
5124       /* Don't touch unused registers. */
5125       if (last_reads[i] < 0 || first_writes[i] < 0) continue;
5126
5127       for (j = 0; j < this->next_temp; j++) {
5128          /* Don't touch unused registers. */
5129          if (last_reads[j] < 0 || first_writes[j] < 0) continue;
5130
5131          /* We can merge the two registers if the first write to j is after or
5132           * in the same instruction as the last read from i.  Note that the
5133           * register at index i will always be used earlier or at the same time
5134           * as the register at index j. */
5135          if (first_writes[i] <= first_writes[j] &&
5136              last_reads[i] <= first_writes[j]) {
5137             renames[num_renames].old_reg = j;
5138             renames[num_renames].new_reg = i;
5139             num_renames++;
5140
5141             /* Update the first_writes and last_reads arrays with the new
5142              * values for the merged register index, and mark the newly unused
5143              * register index as such. */
5144             assert(last_reads[j] >= last_reads[i]);
5145             last_reads[i] = last_reads[j];
5146             first_writes[j] = -1;
5147             last_reads[j] = -1;
5148          }
5149       }
5150    }
5151
5152    rename_temp_registers(num_renames, renames);
5153    ralloc_free(renames);
5154    ralloc_free(last_reads);
5155    ralloc_free(first_writes);
5156 }
5157
5158 /* Reassign indices to temporary registers by reusing unused indices created
5159  * by optimization passes. */
5160 void
5161 glsl_to_tgsi_visitor::renumber_registers(void)
5162 {
5163    int i = 0;
5164    int new_index = 0;
5165    int *first_reads = rzalloc_array(mem_ctx, int, this->next_temp);
5166    struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
5167    int num_renames = 0;
5168    for (i = 0; i < this->next_temp; i++) {
5169       first_reads[i] = -1;
5170    }
5171    get_first_temp_read(first_reads);
5172
5173    for (i = 0; i < this->next_temp; i++) {
5174       if (first_reads[i] < 0) continue;
5175       if (i != new_index) {
5176          renames[num_renames].old_reg = i;
5177          renames[num_renames].new_reg = new_index;
5178          num_renames++;
5179       }
5180       new_index++;
5181    }
5182
5183    rename_temp_registers(num_renames, renames);
5184    this->next_temp = new_index;
5185    ralloc_free(renames);
5186    ralloc_free(first_reads);
5187 }
5188
5189 /* ------------------------- TGSI conversion stuff -------------------------- */
5190
/**
 * Intermediate state used during shader translation.
 *
 * Bundles the ureg program being built with the lookup tables that map
 * glsl_to_tgsi register files and indices onto TGSI ureg registers.
 */
struct st_translate {
   struct ureg_program *ureg;    /**< ureg program that receives all declarations and instructions */

   /* PROGRAM_TEMPORARY table, indexed by temporary index.  dst_register()
    * grows it on demand, zero-fills the new part and declares each entry
    * the first time it is looked up.
    */
   unsigned temps_size;
   struct ureg_dst *temps;

   /* PROGRAM_ARRAY table, indexed by (array_id - 1); entries are declared on
    * first use with the matching size from array_sizes.
    */
   struct ureg_dst *arrays;
   unsigned num_temp_arrays;
   /* Looked up by index for PROGRAM_UNIFORM / STATE_VAR / CONSTANT. */
   struct ureg_src *constants;
   int num_constants;
   /* Looked up by index for PROGRAM_IMMEDIATE. */
   struct ureg_src *immediates;
   int num_immediates;
   /* Indexed by the slot obtained from outputMapping / inputMapping. */
   struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS];
   struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
   /* [0]: register reladdr, [1]: reladdr2 (2D dimension index),
    * [2]: resource (sampler/buffer/image) reladdr.
    */
   struct ureg_dst address[3];
   /* Resource tables, indexed by inst->resource.index. */
   struct ureg_src samplers[PIPE_MAX_SAMPLERS];
   struct ureg_src buffers[PIPE_MAX_SHADER_BUFFERS];
   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
   struct ureg_src systemValues[SYSTEM_VALUE_MAX];  /**< indexed by SYSTEM_VALUE_x */
   struct ureg_src shared_memory;   /**< resource operand used for PROGRAM_MEMORY accesses */
   unsigned *array_sizes;           /**< size of each temp array, parallel to arrays */
   /* Input/output declarations, searched by array_id via find_inout_array(). */
   struct inout_decl *input_decls;
   unsigned num_input_decls;
   struct inout_decl *output_decls;
   unsigned num_output_decls;

   /* Map a Mesa input/output index to a slot in inputs[] / outputs[]. */
   const GLuint *inputMapping;
   const GLuint *outputMapping;

   unsigned procType;  /**< PIPE_SHADER_x (VERTEX, FRAGMENT, TESS_CTRL, ...) */
};
5225
5226 /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
5227 unsigned
5228 _mesa_sysval_to_semantic(unsigned sysval)
5229 {
5230    switch (sysval) {
5231    /* Vertex shader */
5232    case SYSTEM_VALUE_VERTEX_ID:
5233       return TGSI_SEMANTIC_VERTEXID;
5234    case SYSTEM_VALUE_INSTANCE_ID:
5235       return TGSI_SEMANTIC_INSTANCEID;
5236    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
5237       return TGSI_SEMANTIC_VERTEXID_NOBASE;
5238    case SYSTEM_VALUE_BASE_VERTEX:
5239       return TGSI_SEMANTIC_BASEVERTEX;
5240    case SYSTEM_VALUE_BASE_INSTANCE:
5241       return TGSI_SEMANTIC_BASEINSTANCE;
5242    case SYSTEM_VALUE_DRAW_ID:
5243       return TGSI_SEMANTIC_DRAWID;
5244
5245    /* Geometry shader */
5246    case SYSTEM_VALUE_INVOCATION_ID:
5247       return TGSI_SEMANTIC_INVOCATIONID;
5248
5249    /* Fragment shader */
5250    case SYSTEM_VALUE_FRAG_COORD:
5251       return TGSI_SEMANTIC_POSITION;
5252    case SYSTEM_VALUE_FRONT_FACE:
5253       return TGSI_SEMANTIC_FACE;
5254    case SYSTEM_VALUE_SAMPLE_ID:
5255       return TGSI_SEMANTIC_SAMPLEID;
5256    case SYSTEM_VALUE_SAMPLE_POS:
5257       return TGSI_SEMANTIC_SAMPLEPOS;
5258    case SYSTEM_VALUE_SAMPLE_MASK_IN:
5259       return TGSI_SEMANTIC_SAMPLEMASK;
5260    case SYSTEM_VALUE_HELPER_INVOCATION:
5261       return TGSI_SEMANTIC_HELPER_INVOCATION;
5262
5263    /* Tessellation shader */
5264    case SYSTEM_VALUE_TESS_COORD:
5265       return TGSI_SEMANTIC_TESSCOORD;
5266    case SYSTEM_VALUE_VERTICES_IN:
5267       return TGSI_SEMANTIC_VERTICESIN;
5268    case SYSTEM_VALUE_PRIMITIVE_ID:
5269       return TGSI_SEMANTIC_PRIMID;
5270    case SYSTEM_VALUE_TESS_LEVEL_OUTER:
5271       return TGSI_SEMANTIC_TESSOUTER;
5272    case SYSTEM_VALUE_TESS_LEVEL_INNER:
5273       return TGSI_SEMANTIC_TESSINNER;
5274
5275    /* Compute shader */
5276    case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
5277       return TGSI_SEMANTIC_THREAD_ID;
5278    case SYSTEM_VALUE_WORK_GROUP_ID:
5279       return TGSI_SEMANTIC_BLOCK_ID;
5280    case SYSTEM_VALUE_NUM_WORK_GROUPS:
5281       return TGSI_SEMANTIC_GRID_SIZE;
5282    case SYSTEM_VALUE_LOCAL_GROUP_SIZE:
5283       return TGSI_SEMANTIC_BLOCK_SIZE;
5284
5285    /* Unhandled */
5286    case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX:
5287    case SYSTEM_VALUE_GLOBAL_INVOCATION_ID:
5288    case SYSTEM_VALUE_VERTEX_CNT:
5289    default:
5290       assert(!"Unexpected SYSTEM_VALUE_ enum");
5291       return TGSI_SEMANTIC_COUNT;
5292    }
5293 }
5294
5295 /**
5296  * Map a glsl_to_tgsi constant/immediate to a TGSI immediate.
5297  */
5298 static struct ureg_src
5299 emit_immediate(struct st_translate *t,
5300                gl_constant_value values[4],
5301                int type, int size)
5302 {
5303    struct ureg_program *ureg = t->ureg;
5304
5305    switch(type)
5306    {
5307    case GL_FLOAT:
5308       return ureg_DECL_immediate(ureg, &values[0].f, size);
5309    case GL_DOUBLE:
5310       return ureg_DECL_immediate_f64(ureg, (double *)&values[0].f, size);
5311    case GL_INT:
5312       return ureg_DECL_immediate_int(ureg, &values[0].i, size);
5313    case GL_UNSIGNED_INT:
5314    case GL_BOOL:
5315       return ureg_DECL_immediate_uint(ureg, &values[0].u, size);
5316    default:
5317       assert(!"should not get here - type must be float, int, uint, or bool");
5318       return ureg_src_undef();
5319    }
5320 }
5321
5322 /**
5323  * Map a glsl_to_tgsi dst register to a TGSI ureg_dst register.
5324  */
5325 static struct ureg_dst
5326 dst_register(struct st_translate *t, gl_register_file file, unsigned index,
5327              unsigned array_id)
5328 {
5329    unsigned array;
5330
5331    switch(file) {
5332    case PROGRAM_UNDEFINED:
5333       return ureg_dst_undef();
5334
5335    case PROGRAM_TEMPORARY:
5336       /* Allocate space for temporaries on demand. */
5337       if (index >= t->temps_size) {
5338          const int inc = align(index - t->temps_size + 1, 4096);
5339
5340          t->temps = (struct ureg_dst*)
5341                     realloc(t->temps,
5342                             (t->temps_size + inc) * sizeof(struct ureg_dst));
5343          if (!t->temps)
5344             return ureg_dst_undef();
5345
5346          memset(t->temps + t->temps_size, 0, inc * sizeof(struct ureg_dst));
5347          t->temps_size += inc;
5348       }
5349
5350       if (ureg_dst_is_undef(t->temps[index]))
5351          t->temps[index] = ureg_DECL_local_temporary(t->ureg);
5352
5353       return t->temps[index];
5354
5355    case PROGRAM_ARRAY:
5356       assert(array_id && array_id <= t->num_temp_arrays);
5357       array = array_id - 1;
5358
5359       if (ureg_dst_is_undef(t->arrays[array]))
5360          t->arrays[array] = ureg_DECL_array_temporary(
5361             t->ureg, t->array_sizes[array], TRUE);
5362
5363       return ureg_dst_array_offset(t->arrays[array], index);
5364
5365    case PROGRAM_OUTPUT:
5366       if (!array_id) {
5367          if (t->procType == PIPE_SHADER_FRAGMENT)
5368             assert(index < 2 * FRAG_RESULT_MAX);
5369          else if (t->procType == PIPE_SHADER_TESS_CTRL ||
5370                   t->procType == PIPE_SHADER_TESS_EVAL)
5371             assert(index < VARYING_SLOT_TESS_MAX);
5372          else
5373             assert(index < VARYING_SLOT_MAX);
5374
5375          assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs));
5376          assert(t->outputs[t->outputMapping[index]].File != TGSI_FILE_NULL);
5377          return t->outputs[t->outputMapping[index]];
5378       }
5379       else {
5380          struct inout_decl *decl = find_inout_array(t->output_decls, t->num_output_decls, array_id);
5381          unsigned mesa_index = decl->mesa_index;
5382          int slot = t->outputMapping[mesa_index];
5383
5384          assert(slot != -1 && t->outputs[slot].File == TGSI_FILE_OUTPUT);
5385
5386          struct ureg_dst dst = t->outputs[slot];
5387          dst.ArrayID = array_id;
5388          return ureg_dst_array_offset(dst, index - mesa_index);
5389       }
5390
5391    case PROGRAM_ADDRESS:
5392       return t->address[index];
5393
5394    default:
5395       assert(!"unknown dst register file");
5396       return ureg_dst_undef();
5397    }
5398 }
5399
5400 /**
5401  * Map a glsl_to_tgsi src register to a TGSI ureg_src register.
5402  */
5403 static struct ureg_src
5404 src_register(struct st_translate *t, const st_src_reg *reg)
5405 {
5406    int index = reg->index;
5407    int double_reg2 = reg->double_reg2 ? 1 : 0;
5408
5409    switch(reg->file) {
5410    case PROGRAM_UNDEFINED:
5411       return ureg_imm4f(t->ureg, 0, 0, 0, 0);
5412
5413    case PROGRAM_TEMPORARY:
5414    case PROGRAM_ARRAY:
5415       return ureg_src(dst_register(t, reg->file, reg->index, reg->array_id));
5416
5417    case PROGRAM_OUTPUT: {
5418       struct ureg_dst dst = dst_register(t, reg->file, reg->index, reg->array_id);
5419       assert(dst.WriteMask != 0);
5420       unsigned shift = ffs(dst.WriteMask) - 1;
5421       return ureg_swizzle(ureg_src(dst),
5422                           shift,
5423                           MIN2(shift + 1, 3),
5424                           MIN2(shift + 2, 3),
5425                           MIN2(shift + 3, 3));
5426    }
5427
5428    case PROGRAM_UNIFORM:
5429       assert(reg->index >= 0);
5430       return reg->index < t->num_constants ?
5431                t->constants[reg->index] : ureg_imm4f(t->ureg, 0, 0, 0, 0);
5432    case PROGRAM_STATE_VAR:
5433    case PROGRAM_CONSTANT:       /* ie, immediate */
5434       if (reg->has_index2)
5435          return ureg_src_register(TGSI_FILE_CONSTANT, reg->index);
5436       else
5437          return reg->index >= 0 && reg->index < t->num_constants ?
5438                   t->constants[reg->index] : ureg_imm4f(t->ureg, 0, 0, 0, 0);
5439
5440    case PROGRAM_IMMEDIATE:
5441       assert(reg->index >= 0 && reg->index < t->num_immediates);
5442       return t->immediates[reg->index];
5443
5444    case PROGRAM_INPUT:
5445       /* GLSL inputs are 64-bit containers, so we have to
5446        * map back to the original index and add the offset after
5447        * mapping. */
5448       index -= double_reg2;
5449       if (!reg->array_id) {
5450          assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
5451          assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
5452          return t->inputs[t->inputMapping[index] + double_reg2];
5453       }
5454       else {
5455          struct inout_decl *decl = find_inout_array(t->input_decls, t->num_input_decls, reg->array_id);
5456          unsigned mesa_index = decl->mesa_index;
5457          int slot = t->inputMapping[mesa_index];
5458
5459          assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
5460
5461          struct ureg_src src = t->inputs[slot];
5462          src.ArrayID = reg->array_id;
5463          return ureg_src_array_offset(src, index + double_reg2 - mesa_index);
5464       }
5465
5466    case PROGRAM_ADDRESS:
5467       return ureg_src(t->address[reg->index]);
5468
5469    case PROGRAM_SYSTEM_VALUE:
5470       assert(reg->index < (int) ARRAY_SIZE(t->systemValues));
5471       return t->systemValues[reg->index];
5472
5473    default:
5474       assert(!"unknown src register file");
5475       return ureg_src_undef();
5476    }
5477 }
5478
5479 /**
5480  * Create a TGSI ureg_dst register from an st_dst_reg.
5481  */
5482 static struct ureg_dst
5483 translate_dst(struct st_translate *t,
5484               const st_dst_reg *dst_reg,
5485               bool saturate)
5486 {
5487    struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index,
5488                                       dst_reg->array_id);
5489
5490    if (dst.File == TGSI_FILE_NULL)
5491       return dst;
5492
5493    dst = ureg_writemask(dst, dst_reg->writemask);
5494
5495    if (saturate)
5496       dst = ureg_saturate(dst);
5497
5498    if (dst_reg->reladdr != NULL) {
5499       assert(dst_reg->file != PROGRAM_TEMPORARY);
5500       dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
5501    }
5502
5503    if (dst_reg->has_index2) {
5504       if (dst_reg->reladdr2)
5505          dst = ureg_dst_dimension_indirect(dst, ureg_src(t->address[1]),
5506                                            dst_reg->index2D);
5507       else
5508          dst = ureg_dst_dimension(dst, dst_reg->index2D);
5509    }
5510
5511    return dst;
5512 }
5513
5514 /**
5515  * Create a TGSI ureg_src register from an st_src_reg.
5516  */
5517 static struct ureg_src
5518 translate_src(struct st_translate *t, const st_src_reg *src_reg)
5519 {
5520    struct ureg_src src = src_register(t, src_reg);
5521
5522    if (src_reg->has_index2) {
5523       /* 2D indexes occur with geometry shader inputs (attrib, vertex)
5524        * and UBO constant buffers (buffer, position).
5525        */
5526       if (src_reg->reladdr2)
5527          src = ureg_src_dimension_indirect(src, ureg_src(t->address[1]),
5528                                            src_reg->index2D);
5529       else
5530          src = ureg_src_dimension(src, src_reg->index2D);
5531    }
5532
5533    src = ureg_swizzle(src,
5534                       GET_SWZ(src_reg->swizzle, 0) & 0x3,
5535                       GET_SWZ(src_reg->swizzle, 1) & 0x3,
5536                       GET_SWZ(src_reg->swizzle, 2) & 0x3,
5537                       GET_SWZ(src_reg->swizzle, 3) & 0x3);
5538
5539    if (src_reg->abs)
5540       src = ureg_abs(src);
5541
5542    if ((src_reg->negate & 0xf) == NEGATE_XYZW)
5543       src = ureg_negate(src);
5544
5545    if (src_reg->reladdr != NULL) {
5546       assert(src_reg->file != PROGRAM_TEMPORARY);
5547       src = ureg_src_indirect(src, ureg_src(t->address[0]));
5548    }
5549
5550    return src;
5551 }
5552
5553 static struct tgsi_texture_offset
5554 translate_tex_offset(struct st_translate *t,
5555                      const st_src_reg *in_offset)
5556 {
5557    struct tgsi_texture_offset offset;
5558    struct ureg_src src = translate_src(t, in_offset);
5559
5560    offset.File = src.File;
5561    offset.Index = src.Index;
5562    offset.SwizzleX = src.SwizzleX;
5563    offset.SwizzleY = src.SwizzleY;
5564    offset.SwizzleZ = src.SwizzleZ;
5565    offset.Padding = 0;
5566
5567    assert(!src.Indirect);
5568    assert(!src.DimIndirect);
5569    assert(!src.Dimension);
5570    assert(!src.Absolute); /* those shouldn't be used with integers anyway */
5571    assert(!src.Negate);
5572
5573    return offset;
5574 }
5575
/**
 * Emit one glsl_to_tgsi instruction into the ureg program.
 *
 * All destination and source operands are translated first.  The opcode
 * then selects how the instruction is emitted: plain instructions go
 * through ureg_insn(), texture instructions get their sampler appended and
 * go through ureg_tex_insn(), and memory instructions (loads, stores,
 * atomics, RESQ) get their resource operand and go through
 * ureg_memory_insn().
 *
 * \param t     translation state
 * \param inst  instruction to emit
 */
static void
compile_tgsi_instruction(struct st_translate *t,
                         const glsl_to_tgsi_instruction *inst)
{
   struct ureg_program *ureg = t->ureg;
   int i;
   struct ureg_dst dst[2];
   struct ureg_src src[4];
   struct tgsi_texture_offset texoffsets[MAX_GLSL_TEXTURE_OFFSET];

   int num_dst;
   int num_src;
   unsigned tex_target = 0;

   num_dst = num_inst_dst_regs(inst);
   num_src = num_inst_src_regs(inst);

   for (i = 0; i < num_dst; i++)
      dst[i] = translate_dst(t,
                             &inst->dst[i],
                             inst->saturate);

   for (i = 0; i < num_src; i++)
      src[i] = translate_src(t, &inst->src[i]);

   switch(inst->op) {
   /* Flow control: no destinations, sources only. */
   case TGSI_OPCODE_BGNLOOP:
   case TGSI_OPCODE_ELSE:
   case TGSI_OPCODE_ENDLOOP:
   case TGSI_OPCODE_IF:
   case TGSI_OPCODE_UIF:
      assert(num_dst == 0);
      ureg_insn(ureg, inst->op, NULL, 0, src, num_src);
      return;

   /* Texture instructions: the sampler is appended as the last source. */
   case TGSI_OPCODE_TEX:
   case TGSI_OPCODE_TXB:
   case TGSI_OPCODE_TXD:
   case TGSI_OPCODE_TXL:
   case TGSI_OPCODE_TXP:
   case TGSI_OPCODE_TXQ:
   case TGSI_OPCODE_TXQS:
   case TGSI_OPCODE_TXF:
   case TGSI_OPCODE_TEX2:
   case TGSI_OPCODE_TXB2:
   case TGSI_OPCODE_TXL2:
   case TGSI_OPCODE_TG4:
   case TGSI_OPCODE_LODQ:
      src[num_src] = t->samplers[inst->resource.index];
      assert(src[num_src].File != TGSI_FILE_NULL);
      /* An indirectly indexed sampler is addressed through address[2]. */
      if (inst->resource.reladdr)
         src[num_src] =
            ureg_src_indirect(src[num_src], ureg_src(t->address[2]));
      num_src++;
      for (i = 0; i < (int)inst->tex_offset_num_offset; i++) {
         texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
      }
      tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);

      ureg_tex_insn(ureg,
                    inst->op,
                    dst, num_dst,
                    tex_target,
                    texoffsets, inst->tex_offset_num_offset,
                    src, num_src);
      return;

   /* Resource reads and atomics: the resource becomes source 0. */
   case TGSI_OPCODE_RESQ:
   case TGSI_OPCODE_LOAD:
   case TGSI_OPCODE_ATOMUADD:
   case TGSI_OPCODE_ATOMXCHG:
   case TGSI_OPCODE_ATOMCAS:
   case TGSI_OPCODE_ATOMAND:
   case TGSI_OPCODE_ATOMOR:
   case TGSI_OPCODE_ATOMXOR:
   case TGSI_OPCODE_ATOMUMIN:
   case TGSI_OPCODE_ATOMUMAX:
   case TGSI_OPCODE_ATOMIMIN:
   case TGSI_OPCODE_ATOMIMAX:
      /* Shift the translated sources up by one to make room at index 0. */
      for (i = num_src - 1; i >= 0; i--)
         src[i + 1] = src[i];
      num_src++;
      if (inst->resource.file == PROGRAM_MEMORY) {
         src[0] = t->shared_memory;
      } else if (inst->resource.file == PROGRAM_BUFFER) {
         src[0] = t->buffers[inst->resource.index];
      } else {
         /* Anything else is treated as an image, which also needs a
          * texture target.
          */
         src[0] = t->images[inst->resource.index];
         tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
      }
      if (inst->resource.reladdr)
         src[0] = ureg_src_indirect(src[0], ureg_src(t->address[2]));
      assert(src[0].File != TGSI_FILE_NULL);
      ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
                       inst->buffer_access,
                       tex_target, inst->image_format);
      break;

   /* Stores: the resource replaces destination 0, keeping the instruction's
    * write mask.
    */
   case TGSI_OPCODE_STORE:
      if (inst->resource.file == PROGRAM_MEMORY) {
         dst[0] = ureg_dst(t->shared_memory);
      } else if (inst->resource.file == PROGRAM_BUFFER) {
         dst[0] = ureg_dst(t->buffers[inst->resource.index]);
      } else {
         dst[0] = ureg_dst(t->images[inst->resource.index]);
         tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
      }
      dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask);
      if (inst->resource.reladdr)
         dst[0] = ureg_dst_indirect(dst[0], ureg_src(t->address[2]));
      assert(dst[0].File != TGSI_FILE_NULL);
      ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
                       inst->buffer_access,
                       tex_target, inst->image_format);
      break;

   /* SCS is emitted with its write mask forced to XY. */
   case TGSI_OPCODE_SCS:
      dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XY);
      ureg_insn(ureg, inst->op, dst, num_dst, src, num_src);
      break;

   /* Everything else maps one-to-one onto a plain ureg instruction. */
   default:
      ureg_insn(ureg,
                inst->op,
                dst, num_dst,
                src, num_src);
      break;
   }
}
5705
5706 /**
5707  * Emit the TGSI instructions for inverting and adjusting WPOS.
5708  * This code is unavoidable because it also depends on whether
5709  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
5710  */
5711 static void
5712 emit_wpos_adjustment(struct gl_context *ctx,
5713                      struct st_translate *t,
5714                      int wpos_transform_const,
5715                      boolean invert,
5716                      GLfloat adjX, GLfloat adjY[2])
5717 {
5718    struct ureg_program *ureg = t->ureg;
5719
5720    assert(wpos_transform_const >= 0);
5721
5722    /* Fragment program uses fragment position input.
5723     * Need to replace instances of INPUT[WPOS] with temp T
5724     * where T = INPUT[WPOS] is inverted by Y.
5725     */
5726    struct ureg_src wpostrans = ureg_DECL_constant(ureg, wpos_transform_const);
5727    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
5728    struct ureg_src *wpos =
5729       ctx->Const.GLSLFragCoordIsSysVal ?
5730          &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
5731          &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
5732    struct ureg_src wpos_input = *wpos;
5733
5734    /* First, apply the coordinate shift: */
5735    if (adjX || adjY[0] || adjY[1]) {
5736       if (adjY[0] != adjY[1]) {
5737          /* Adjust the y coordinate by adjY[1] or adjY[0] respectively
5738           * depending on whether inversion is actually going to be applied
5739           * or not, which is determined by testing against the inversion
5740           * state variable used below, which will be either +1 or -1.
5741           */
5742          struct ureg_dst adj_temp = ureg_DECL_local_temporary(ureg);
5743
5744          ureg_CMP(ureg, adj_temp,
5745                   ureg_scalar(wpostrans, invert ? 2 : 0),
5746                   ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f),
5747                   ureg_imm4f(ureg, adjX, adjY[1], 0.0f, 0.0f));
5748          ureg_ADD(ureg, wpos_temp, wpos_input, ureg_src(adj_temp));
5749       } else {
5750          ureg_ADD(ureg, wpos_temp, wpos_input,
5751                   ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f));
5752       }
5753       wpos_input = ureg_src(wpos_temp);
5754    } else {
5755       /* MOV wpos_temp, input[wpos]
5756        */
5757       ureg_MOV( ureg, wpos_temp, wpos_input );
5758    }
5759
5760    /* Now the conditional y flip: STATE_FB_WPOS_Y_TRANSFORM.xy/zw will be
5761     * inversion/identity, or the other way around if we're drawing to an FBO.
5762     */
5763    if (invert) {
5764       /* MAD wpos_temp.y, wpos_input, wpostrans.xxxx, wpostrans.yyyy
5765        */
5766       ureg_MAD( ureg,
5767                 ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
5768                 wpos_input,
5769                 ureg_scalar(wpostrans, 0),
5770                 ureg_scalar(wpostrans, 1));
5771    } else {
5772       /* MAD wpos_temp.y, wpos_input, wpostrans.zzzz, wpostrans.wwww
5773        */
5774       ureg_MAD( ureg,
5775                 ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
5776                 wpos_input,
5777                 ureg_scalar(wpostrans, 2),
5778                 ureg_scalar(wpostrans, 3));
5779    }
5780
5781    /* Use wpos_temp as position input from here on:
5782     */
5783    *wpos = ureg_src(wpos_temp);
5784 }
5785
5786
5787 /**
5788  * Emit fragment position/ooordinate code.
5789  */
5790 static void
5791 emit_wpos(struct st_context *st,
5792           struct st_translate *t,
5793           const struct gl_program *program,
5794           struct ureg_program *ureg,
5795           int wpos_transform_const)
5796 {
5797    struct pipe_screen *pscreen = st->pipe->screen;
5798    GLfloat adjX = 0.0f;
5799    GLfloat adjY[2] = { 0.0f, 0.0f };
5800    boolean invert = FALSE;
5801
5802    /* Query the pixel center conventions supported by the pipe driver and set
5803     * adjX, adjY to help out if it cannot handle the requested one internally.
5804     *
5805     * The bias of the y-coordinate depends on whether y-inversion takes place
5806     * (adjY[1]) or not (adjY[0]), which is in turn dependent on whether we are
5807     * drawing to an FBO (causes additional inversion), and whether the pipe
5808     * driver origin and the requested origin differ (the latter condition is
5809     * stored in the 'invert' variable).
5810     *
5811     * For height = 100 (i = integer, h = half-integer, l = lower, u = upper):
5812     *
5813     * center shift only:
5814     * i -> h: +0.5
5815     * h -> i: -0.5
5816     *
5817     * inversion only:
5818     * l,i -> u,i: ( 0.0 + 1.0) * -1 + 100 = 99
5819     * l,h -> u,h: ( 0.5 + 0.0) * -1 + 100 = 99.5
5820     * u,i -> l,i: (99.0 + 1.0) * -1 + 100 = 0
5821     * u,h -> l,h: (99.5 + 0.0) * -1 + 100 = 0.5
5822     *
5823     * inversion and center shift:
5824     * l,i -> u,h: ( 0.0 + 0.5) * -1 + 100 = 99.5
5825     * l,h -> u,i: ( 0.5 + 0.5) * -1 + 100 = 99
5826     * u,i -> l,h: (99.0 + 0.5) * -1 + 100 = 0.5
5827     * u,h -> l,i: (99.5 + 0.5) * -1 + 100 = 0
5828     */
5829    if (program->OriginUpperLeft) {
5830       /* Fragment shader wants origin in upper-left */
5831       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT)) {
5832          /* the driver supports upper-left origin */
5833       }
5834       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT)) {
5835          /* the driver supports lower-left origin, need to invert Y */
5836          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
5837                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
5838          invert = TRUE;
5839       }
5840       else
5841          assert(0);
5842    }
5843    else {
5844       /* Fragment shader wants origin in lower-left */
5845       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT))
5846          /* the driver supports lower-left origin */
5847          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
5848                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
5849       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT))
5850          /* the driver supports upper-left origin, need to invert Y */
5851          invert = TRUE;
5852       else
5853          assert(0);
5854    }
5855
5856    if (program->PixelCenterInteger) {
5857       /* Fragment shader wants pixel center integer */
5858       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
5859          /* the driver supports pixel center integer */
5860          adjY[1] = 1.0f;
5861          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
5862                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
5863       }
5864       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
5865          /* the driver supports pixel center half integer, need to bias X,Y */
5866          adjX = -0.5f;
5867          adjY[0] = -0.5f;
5868          adjY[1] = 0.5f;
5869       }
5870       else
5871          assert(0);
5872    }
5873    else {
5874       /* Fragment shader wants pixel center half integer */
5875       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
5876          /* the driver supports pixel center half integer */
5877       }
5878       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
5879          /* the driver supports pixel center integer, need to bias X,Y */
5880          adjX = adjY[0] = adjY[1] = 0.5f;
5881          ureg_property(ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
5882                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
5883       }
5884       else
5885          assert(0);
5886    }
5887
5888    /* we invert after adjustment so that we avoid the MOV to temporary,
5889     * and reuse the adjustment ADD instead */
5890    emit_wpos_adjustment(st->ctx, t, wpos_transform_const, invert, adjX, adjY);
5891 }
5892
5893 /**
5894  * OpenGL's fragment gl_FrontFace input is 1 for front-facing, 0 for back.
5895  * TGSI uses +1 for front, -1 for back.
5896  * This function converts the TGSI value to the GL value.  Simply clamping/
5897  * saturating the value to [0,1] does the job.
5898  */
5899 static void
5900 emit_face_var(struct gl_context *ctx, struct st_translate *t)
5901 {
5902    struct ureg_program *ureg = t->ureg;
5903    struct ureg_dst face_temp = ureg_DECL_temporary(ureg);
5904    struct ureg_src face_input = t->inputs[t->inputMapping[VARYING_SLOT_FACE]];
5905
5906    if (ctx->Const.NativeIntegers) {
5907       ureg_FSGE(ureg, face_temp, face_input, ureg_imm1f(ureg, 0));
5908    }
5909    else {
5910       /* MOV_SAT face_temp, input[face] */
5911       ureg_MOV(ureg, ureg_saturate(face_temp), face_input);
5912    }
5913
5914    /* Use face_temp as face input from here on: */
5915    t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
5916 }
5917
5918 static void
5919 emit_compute_block_size(const struct gl_program *prog,
5920                         struct ureg_program *ureg) {
5921    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH,
5922                  prog->info.cs.local_size[0]);
5923    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT,
5924                  prog->info.cs.local_size[1]);
5925    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH,
5926                  prog->info.cs.local_size[2]);
5927 }
5928
5929 struct sort_inout_decls {
5930    bool operator()(const struct inout_decl &a, const struct inout_decl &b) const {
5931       return mapping[a.mesa_index] < mapping[b.mesa_index];
5932    }
5933
5934    const GLuint *mapping;
5935 };
5936
5937 /* Sort the given array of decls by the corresponding slot (TGSI file index).
5938  *
5939  * This is for the benefit of older drivers which are broken when the
5940  * declarations aren't sorted in this way.
5941  */
5942 static void
5943 sort_inout_decls_by_slot(struct inout_decl *decls,
5944                          unsigned count,
5945                          const GLuint mapping[])
5946 {
5947    sort_inout_decls sorter;
5948    sorter.mapping = mapping;
5949    std::sort(decls, decls + count, sorter);
5950 }
5951
5952 static unsigned
5953 st_translate_interp(enum glsl_interp_mode glsl_qual, GLuint varying)
5954 {
5955    switch (glsl_qual) {
5956    case INTERP_MODE_NONE:
5957       if (varying == VARYING_SLOT_COL0 || varying == VARYING_SLOT_COL1)
5958          return TGSI_INTERPOLATE_COLOR;
5959       return TGSI_INTERPOLATE_PERSPECTIVE;
5960    case INTERP_MODE_SMOOTH:
5961       return TGSI_INTERPOLATE_PERSPECTIVE;
5962    case INTERP_MODE_FLAT:
5963       return TGSI_INTERPOLATE_CONSTANT;
5964    case INTERP_MODE_NOPERSPECTIVE:
5965       return TGSI_INTERPOLATE_LINEAR;
5966    default:
5967       assert(0 && "unexpected interp mode in st_translate_interp()");
5968       return TGSI_INTERPOLATE_PERSPECTIVE;
5969    }
5970 }
5971
5972 /**
5973  * Translate intermediate IR (glsl_to_tgsi_instruction) to TGSI format.
5974  * \param program  the program to translate
5975  * \param numInputs  number of input registers used
5976  * \param inputMapping  maps Mesa fragment program inputs to TGSI generic
5977  *                      input indexes
5978  * \param inputSemanticName  the TGSI_SEMANTIC flag for each input
5979  * \param inputSemanticIndex  the semantic index (ex: which texcoord) for
5980  *                            each input
5981  * \param interpMode  the TGSI_INTERPOLATE_LINEAR/PERSP mode for each input
5982  * \param numOutputs  number of output registers used
5983  * \param outputMapping  maps Mesa fragment program outputs to TGSI
5984  *                       generic outputs
5985  * \param outputSemanticName  the TGSI_SEMANTIC flag for each output
5986  * \param outputSemanticIndex  the semantic index (ex: which texcoord) for
5987  *                             each output
5988  *
5989  * \return  PIPE_OK or PIPE_ERROR_OUT_OF_MEMORY
5990  */
5991 extern "C" enum pipe_error
5992 st_translate_program(
5993    struct gl_context *ctx,
5994    uint procType,
5995    struct ureg_program *ureg,
5996    glsl_to_tgsi_visitor *program,
5997    const struct gl_program *proginfo,
5998    GLuint numInputs,
5999    const GLuint inputMapping[],
6000    const GLuint inputSlotToAttr[],
6001    const ubyte inputSemanticName[],
6002    const ubyte inputSemanticIndex[],
6003    const GLuint interpMode[],
6004    GLuint numOutputs,
6005    const GLuint outputMapping[],
6006    const GLuint outputSlotToAttr[],
6007    const ubyte outputSemanticName[],
6008    const ubyte outputSemanticIndex[])
6009 {
6010    struct st_translate *t;
6011    unsigned i;
6012    struct gl_program_constants *frag_const =
6013       &ctx->Const.Program[MESA_SHADER_FRAGMENT];
6014    enum pipe_error ret = PIPE_OK;
6015
6016    assert(numInputs <= ARRAY_SIZE(t->inputs));
6017    assert(numOutputs <= ARRAY_SIZE(t->outputs));
6018
6019    t = CALLOC_STRUCT(st_translate);
6020    if (!t) {
6021       ret = PIPE_ERROR_OUT_OF_MEMORY;
6022       goto out;
6023    }
6024
6025    t->procType = procType;
6026    t->inputMapping = inputMapping;
6027    t->outputMapping = outputMapping;
6028    t->ureg = ureg;
6029    t->num_temp_arrays = program->next_array;
6030    if (t->num_temp_arrays)
6031       t->arrays = (struct ureg_dst*)
6032                   calloc(t->num_temp_arrays, sizeof(t->arrays[0]));
6033
6034    /*
6035     * Declare input attributes.
6036     */
6037    switch (procType) {
6038    case PIPE_SHADER_FRAGMENT:
6039    case PIPE_SHADER_GEOMETRY:
6040    case PIPE_SHADER_TESS_EVAL:
6041    case PIPE_SHADER_TESS_CTRL:
6042       sort_inout_decls_by_slot(program->inputs, program->num_inputs, inputMapping);
6043
6044       for (i = 0; i < program->num_inputs; ++i) {
6045          struct inout_decl *decl = &program->inputs[i];
6046          unsigned slot = inputMapping[decl->mesa_index];
6047          struct ureg_src src;
6048          ubyte tgsi_usage_mask = decl->usage_mask;
6049
6050          if (glsl_base_type_is_64bit(decl->base_type)) {
6051             if (tgsi_usage_mask == 1)
6052                tgsi_usage_mask = TGSI_WRITEMASK_XY;
6053             else if (tgsi_usage_mask == 2)
6054                tgsi_usage_mask = TGSI_WRITEMASK_ZW;
6055             else
6056                tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
6057          }
6058
6059          unsigned interp_mode = 0;
6060          unsigned interp_location = 0;
6061          if (procType == PIPE_SHADER_FRAGMENT) {
6062             assert(interpMode);
6063             interp_mode = interpMode[slot] != TGSI_INTERPOLATE_COUNT ?
6064                interpMode[slot] :
6065                st_translate_interp(decl->interp, inputSlotToAttr[slot]);
6066
6067             interp_location = decl->interp_loc;
6068          }
6069
6070          src = ureg_DECL_fs_input_cyl_centroid_layout(ureg,
6071                   inputSemanticName[slot], inputSemanticIndex[slot],
6072                   interp_mode, 0, interp_location, slot, tgsi_usage_mask,
6073                   decl->array_id, decl->size);
6074
6075          for (unsigned j = 0; j < decl->size; ++j) {
6076             if (t->inputs[slot + j].File != TGSI_FILE_INPUT) {
6077                /* The ArrayID is set up in dst_register */
6078                t->inputs[slot + j] = src;
6079                t->inputs[slot + j].ArrayID = 0;
6080                t->inputs[slot + j].Index += j;
6081             }
6082          }
6083       }
6084       break;
6085    case PIPE_SHADER_VERTEX:
6086       for (i = 0; i < numInputs; i++) {
6087          t->inputs[i] = ureg_DECL_vs_input(ureg, i);
6088       }
6089       break;
6090    case PIPE_SHADER_COMPUTE:
6091       break;
6092    default:
6093       assert(0);
6094    }
6095
6096    /*
6097     * Declare output attributes.
6098     */
6099    switch (procType) {
6100    case PIPE_SHADER_FRAGMENT:
6101    case PIPE_SHADER_COMPUTE:
6102       break;
6103    case PIPE_SHADER_GEOMETRY:
6104    case PIPE_SHADER_TESS_EVAL:
6105    case PIPE_SHADER_TESS_CTRL:
6106    case PIPE_SHADER_VERTEX:
6107       sort_inout_decls_by_slot(program->outputs, program->num_outputs, outputMapping);
6108
6109       for (i = 0; i < program->num_outputs; ++i) {
6110          struct inout_decl *decl = &program->outputs[i];
6111          unsigned slot = outputMapping[decl->mesa_index];
6112          struct ureg_dst dst;
6113          ubyte tgsi_usage_mask = decl->usage_mask;
6114
6115          if (glsl_base_type_is_64bit(decl->base_type)) {
6116             if (tgsi_usage_mask == 1)
6117                tgsi_usage_mask = TGSI_WRITEMASK_XY;
6118             else if (tgsi_usage_mask == 2)
6119                tgsi_usage_mask = TGSI_WRITEMASK_ZW;
6120             else
6121                tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
6122          }
6123
6124          dst = ureg_DECL_output_layout(ureg,
6125                      outputSemanticName[slot], outputSemanticIndex[slot],
6126                      decl->gs_out_streams,
6127                      slot, tgsi_usage_mask, decl->array_id, decl->size);
6128
6129          for (unsigned j = 0; j < decl->size; ++j) {
6130             if (t->outputs[slot + j].File != TGSI_FILE_OUTPUT) {
6131                /* The ArrayID is set up in dst_register */
6132                t->outputs[slot + j] = dst;
6133                t->outputs[slot + j].ArrayID = 0;
6134                t->outputs[slot + j].Index += j;
6135             }
6136          }
6137       }
6138       break;
6139    default:
6140       assert(0);
6141    }
6142
6143    if (procType == PIPE_SHADER_FRAGMENT) {
6144       if (program->shader->info.EarlyFragmentTests)
6145          ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1);
6146
6147       if (proginfo->info.inputs_read & VARYING_BIT_POS) {
6148           /* Must do this after setting up t->inputs. */
6149           emit_wpos(st_context(ctx), t, proginfo, ureg,
6150                     program->wpos_transform_const);
6151       }
6152
6153       if (proginfo->info.inputs_read & VARYING_BIT_FACE)
6154          emit_face_var(ctx, t);
6155
6156       for (i = 0; i < numOutputs; i++) {
6157          switch (outputSemanticName[i]) {
6158          case TGSI_SEMANTIC_POSITION:
6159             t->outputs[i] = ureg_DECL_output(ureg,
6160                                              TGSI_SEMANTIC_POSITION, /* Z/Depth */
6161                                              outputSemanticIndex[i]);
6162             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Z);
6163             break;
6164          case TGSI_SEMANTIC_STENCIL:
6165             t->outputs[i] = ureg_DECL_output(ureg,
6166                                              TGSI_SEMANTIC_STENCIL, /* Stencil */
6167                                              outputSemanticIndex[i]);
6168             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Y);
6169             break;
6170          case TGSI_SEMANTIC_COLOR:
6171             t->outputs[i] = ureg_DECL_output(ureg,
6172                                              TGSI_SEMANTIC_COLOR,
6173                                              outputSemanticIndex[i]);
6174             break;
6175          case TGSI_SEMANTIC_SAMPLEMASK:
6176             t->outputs[i] = ureg_DECL_output(ureg,
6177                                              TGSI_SEMANTIC_SAMPLEMASK,
6178                                              outputSemanticIndex[i]);
6179             /* TODO: If we ever support more than 32 samples, this will have
6180              * to become an array.
6181              */
6182             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
6183             break;
6184          default:
6185             assert(!"fragment shader outputs must be POSITION/STENCIL/COLOR");
6186             ret = PIPE_ERROR_BAD_INPUT;
6187             goto out;
6188          }
6189       }
6190    }
6191    else if (procType == PIPE_SHADER_VERTEX) {
6192       for (i = 0; i < numOutputs; i++) {
6193          if (outputSemanticName[i] == TGSI_SEMANTIC_FOG) {
6194             /* force register to contain a fog coordinate in the form (F, 0, 0, 1). */
6195             ureg_MOV(ureg,
6196                      ureg_writemask(t->outputs[i], TGSI_WRITEMASK_YZW),
6197                      ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
6198             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
6199          }
6200       }
6201    }
6202
6203    if (procType == PIPE_SHADER_COMPUTE) {
6204       emit_compute_block_size(proginfo, ureg);
6205    }
6206
6207    /* Declare address register.
6208     */
6209    if (program->num_address_regs > 0) {
6210       assert(program->num_address_regs <= 3);
6211       for (int i = 0; i < program->num_address_regs; i++)
6212          t->address[i] = ureg_DECL_address(ureg);
6213    }
6214
6215    /* Declare misc input registers
6216     */
6217    {
6218       GLbitfield sysInputs = proginfo->info.system_values_read;
6219
6220       for (i = 0; sysInputs; i++) {
6221          if (sysInputs & (1 << i)) {
6222             unsigned semName = _mesa_sysval_to_semantic(i);
6223
6224             t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
6225
6226             if (semName == TGSI_SEMANTIC_INSTANCEID ||
6227                 semName == TGSI_SEMANTIC_VERTEXID) {
6228                /* From Gallium perspective, these system values are always
6229                 * integer, and require native integer support.  However, if
6230                 * native integer is supported on the vertex stage but not the
6231                 * pixel stage (e.g, i915g + draw), Mesa will generate IR that
6232                 * assumes these system values are floats. To resolve the
6233                 * inconsistency, we insert a U2F.
6234                 */
6235                struct st_context *st = st_context(ctx);
6236                struct pipe_screen *pscreen = st->pipe->screen;
6237                assert(procType == PIPE_SHADER_VERTEX);
6238                assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
6239                (void) pscreen;
6240                if (!ctx->Const.NativeIntegers) {
6241                   struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
6242                   ureg_U2F( t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X), t->systemValues[i]);
6243                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
6244                }
6245             }
6246
6247             if (procType == PIPE_SHADER_FRAGMENT &&
6248                 semName == TGSI_SEMANTIC_POSITION)
6249                emit_wpos(st_context(ctx), t, proginfo, ureg,
6250                          program->wpos_transform_const);
6251
6252             sysInputs &= ~(1 << i);
6253          }
6254       }
6255    }
6256
6257    t->array_sizes = program->array_sizes;
6258    t->input_decls = program->inputs;
6259    t->num_input_decls = program->num_inputs;
6260    t->output_decls = program->outputs;
6261    t->num_output_decls = program->num_outputs;
6262
6263    /* Emit constants and uniforms.  TGSI uses a single index space for these,
6264     * so we put all the translated regs in t->constants.
6265     */
6266    if (proginfo->Parameters) {
6267       t->constants = (struct ureg_src *)
6268          calloc(proginfo->Parameters->NumParameters, sizeof(t->constants[0]));
6269       if (t->constants == NULL) {
6270          ret = PIPE_ERROR_OUT_OF_MEMORY;
6271          goto out;
6272       }
6273       t->num_constants = proginfo->Parameters->NumParameters;
6274
6275       for (i = 0; i < proginfo->Parameters->NumParameters; i++) {
6276          switch (proginfo->Parameters->Parameters[i].Type) {
6277          case PROGRAM_STATE_VAR:
6278          case PROGRAM_UNIFORM:
6279             t->constants[i] = ureg_DECL_constant(ureg, i);
6280             break;
6281
6282          /* Emit immediates for PROGRAM_CONSTANT only when there's no indirect
6283           * addressing of the const buffer.
6284           * FIXME: Be smarter and recognize param arrays:
6285           * indirect addressing is only valid within the referenced
6286           * array.
6287           */
6288          case PROGRAM_CONSTANT:
6289             if (program->indirect_addr_consts)
6290                t->constants[i] = ureg_DECL_constant(ureg, i);
6291             else
6292                t->constants[i] = emit_immediate(t,
6293                                                 proginfo->Parameters->ParameterValues[i],
6294                                                 proginfo->Parameters->Parameters[i].DataType,
6295                                                 4);
6296             break;
6297          default:
6298             break;
6299          }
6300       }
6301    }
6302
6303    if (program->shader) {
6304       unsigned num_ubos = program->shader->NumUniformBlocks;
6305
6306       for (i = 0; i < num_ubos; i++) {
6307          unsigned size = program->shader->UniformBlocks[i]->UniformBufferSize;
6308          unsigned num_const_vecs = (size + 15) / 16;
6309          unsigned first, last;
6310          assert(num_const_vecs > 0);
6311          first = 0;
6312          last = num_const_vecs > 0 ? num_const_vecs - 1 : 0;
6313          ureg_DECL_constant2D(t->ureg, first, last, i + 1);
6314       }
6315    }
6316
6317    /* Emit immediate values.
6318     */
6319    t->immediates = (struct ureg_src *)
6320       calloc(program->num_immediates, sizeof(struct ureg_src));
6321    if (t->immediates == NULL) {
6322       ret = PIPE_ERROR_OUT_OF_MEMORY;
6323       goto out;
6324    }
6325    t->num_immediates = program->num_immediates;
6326
6327    i = 0;
6328    foreach_in_list(immediate_storage, imm, &program->immediates) {
6329       assert(i < program->num_immediates);
6330       t->immediates[i++] = emit_immediate(t, imm->values, imm->type, imm->size32);
6331    }
6332    assert(i == program->num_immediates);
6333
6334    /* texture samplers */
6335    for (i = 0; i < frag_const->MaxTextureImageUnits; i++) {
6336       if (program->samplers_used & (1u << i)) {
6337          unsigned type;
6338
6339          t->samplers[i] = ureg_DECL_sampler(ureg, i);
6340
6341          switch (program->sampler_types[i]) {
6342          case GLSL_TYPE_INT:
6343             type = TGSI_RETURN_TYPE_SINT;
6344             break;
6345          case GLSL_TYPE_UINT:
6346             type = TGSI_RETURN_TYPE_UINT;
6347             break;
6348          case GLSL_TYPE_FLOAT:
6349             type = TGSI_RETURN_TYPE_FLOAT;
6350             break;
6351          default:
6352             unreachable("not reached");
6353          }
6354
6355          ureg_DECL_sampler_view( ureg, i, program->sampler_targets[i],
6356                                  type, type, type, type );
6357       }
6358    }
6359
6360    for (i = 0; i < frag_const->MaxAtomicBuffers; i++) {
6361       if (program->buffers_used & (1 << i)) {
6362          t->buffers[i] = ureg_DECL_buffer(ureg, i, true);
6363       }
6364    }
6365
6366    for (; i < frag_const->MaxAtomicBuffers + frag_const->MaxShaderStorageBlocks;
6367         i++) {
6368       if (program->buffers_used & (1 << i)) {
6369          t->buffers[i] = ureg_DECL_buffer(ureg, i, false);
6370       }
6371    }
6372
6373    if (program->use_shared_memory)
6374       t->shared_memory = ureg_DECL_memory(ureg, TGSI_MEMORY_TYPE_SHARED);
6375
6376    for (i = 0; i < program->shader->NumImages; i++) {
6377       if (program->images_used & (1 << i)) {
6378          t->images[i] = ureg_DECL_image(ureg, i,
6379                                         program->image_targets[i],
6380                                         program->image_formats[i],
6381                                         true, false);
6382       }
6383    }
6384
6385    /* Emit each instruction in turn:
6386     */
6387    foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions)
6388       compile_tgsi_instruction(t, inst);
6389
6390    /* Set the next shader stage hint for VS and TES. */
6391    switch (procType) {
6392    case PIPE_SHADER_VERTEX:
6393    case PIPE_SHADER_TESS_EVAL:
6394       if (program->shader_program->SeparateShader)
6395          break;
6396
6397       for (i = program->shader->Stage+1; i <= MESA_SHADER_FRAGMENT; i++) {
6398          if (program->shader_program->_LinkedShaders[i]) {
6399             unsigned next;
6400
6401             switch (i) {
6402             case MESA_SHADER_TESS_CTRL:
6403                next = PIPE_SHADER_TESS_CTRL;
6404                break;
6405             case MESA_SHADER_TESS_EVAL:
6406                next = PIPE_SHADER_TESS_EVAL;
6407                break;
6408             case MESA_SHADER_GEOMETRY:
6409                next = PIPE_SHADER_GEOMETRY;
6410                break;
6411             case MESA_SHADER_FRAGMENT:
6412                next = PIPE_SHADER_FRAGMENT;
6413                break;
6414             default:
6415                assert(0);
6416                continue;
6417             }
6418
6419             ureg_set_next_shader_processor(ureg, next);
6420             break;
6421          }
6422       }
6423       break;
6424    }
6425
6426 out:
6427    if (t) {
6428       free(t->arrays);
6429       free(t->temps);
6430       free(t->constants);
6431       t->num_constants = 0;
6432       free(t->immediates);
6433       t->num_immediates = 0;
6434       FREE(t);
6435    }
6436
6437    return ret;
6438 }
6439 /* ----------------------------- End TGSI code ------------------------------ */
6440
6441
6442 /**
6443  * Convert a shader's GLSL IR into a Mesa gl_program, although without
6444  * generating Mesa IR.
6445  */
6446 static struct gl_program *
6447 get_mesa_program_tgsi(struct gl_context *ctx,
6448                       struct gl_shader_program *shader_program,
6449                       struct gl_linked_shader *shader)
6450 {
6451    glsl_to_tgsi_visitor* v;
6452    struct gl_program *prog;
6453    struct gl_shader_compiler_options *options =
6454          &ctx->Const.ShaderCompilerOptions[shader->Stage];
6455    struct pipe_screen *pscreen = ctx->st->pipe->screen;
6456    enum pipe_shader_type ptarget = st_shader_stage_to_ptarget(shader->Stage);
6457
6458    validate_ir_tree(shader->ir);
6459
6460    prog = shader->Program;
6461
6462    prog->Parameters = _mesa_new_parameter_list();
6463    v = new glsl_to_tgsi_visitor();
6464    v->ctx = ctx;
6465    v->prog = prog;
6466    v->shader_program = shader_program;
6467    v->shader = shader;
6468    v->options = options;
6469    v->glsl_version = ctx->Const.GLSLVersion;
6470    v->native_integers = ctx->Const.NativeIntegers;
6471
6472    v->have_sqrt = pscreen->get_shader_param(pscreen, ptarget,
6473                                             PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
6474    v->have_fma = pscreen->get_shader_param(pscreen, ptarget,
6475                                            PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED);
6476
6477    _mesa_generate_parameters_list_for_uniforms(shader_program, shader,
6478                                                prog->Parameters);
6479
6480    /* Remove reads from output registers. */
6481    if (!pscreen->get_param(pscreen, PIPE_CAP_TGSI_CAN_READ_OUTPUTS))
6482       lower_output_reads(shader->Stage, shader->ir);
6483
6484    /* Emit intermediate IR for main(). */
6485    visit_exec_list(shader->ir, v);
6486
6487 #if 0
6488    /* Print out some information (for debugging purposes) used by the
6489     * optimization passes. */
6490    {
6491       int i;
6492       int *first_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
6493       int *first_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
6494       int *last_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
6495       int *last_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
6496
6497       for (i = 0; i < v->next_temp; i++) {
6498          first_writes[i] = -1;
6499          first_reads[i] = -1;
6500          last_writes[i] = -1;
6501          last_reads[i] = -1;
6502       }
6503       v->get_first_temp_read(first_reads);
6504       v->get_last_temp_read_first_temp_write(last_reads, first_writes);
6505       v->get_last_temp_write(last_writes);
6506       for (i = 0; i < v->next_temp; i++)
6507          printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, first_reads[i],
6508                 first_writes[i],
6509                 last_reads[i],
6510                 last_writes[i]);
6511       ralloc_free(first_writes);
6512       ralloc_free(first_reads);
6513       ralloc_free(last_writes);
6514       ralloc_free(last_reads);
6515    }
6516 #endif
6517
6518    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
6519    v->simplify_cmp();
6520
6521    if (shader->Stage != MESA_SHADER_TESS_CTRL &&
6522        shader->Stage != MESA_SHADER_TESS_EVAL)
6523       v->copy_propagate();
6524
6525    while (v->eliminate_dead_code());
6526
6527    v->merge_two_dsts();
6528    v->merge_registers();
6529    v->renumber_registers();
6530
6531    /* Write the END instruction. */
6532    v->emit_asm(NULL, TGSI_OPCODE_END);
6533
6534    if (ctx->_Shader->Flags & GLSL_DUMP) {
6535       _mesa_log("\n");
6536       _mesa_log("GLSL IR for linked %s program %d:\n",
6537              _mesa_shader_stage_to_string(shader->Stage),
6538              shader_program->Name);
6539       _mesa_print_ir(_mesa_get_log_file(), shader->ir, NULL);
6540       _mesa_log("\n\n");
6541    }
6542
6543    do_set_program_inouts(shader->ir, prog, shader->Stage);
6544    _mesa_copy_linked_program_data(shader_program, shader);
6545    shrink_array_declarations(v->inputs, v->num_inputs,
6546                              &prog->info.inputs_read,
6547                              prog->info.double_inputs_read,
6548                              &prog->info.patch_inputs_read);
6549    shrink_array_declarations(v->outputs, v->num_outputs,
6550                              &prog->info.outputs_written, 0ULL,
6551                              &prog->info.patch_outputs_written);
6552    count_resources(v, prog);
6553
6554    /* The GLSL IR won't be needed anymore. */
6555    ralloc_free(shader->ir);
6556    shader->ir = NULL;
6557
6558    /* This must be done before the uniform storage is associated. */
6559    if (shader->Stage == MESA_SHADER_FRAGMENT &&
6560        (prog->info.inputs_read & VARYING_BIT_POS ||
6561         prog->info.system_values_read & (1 << SYSTEM_VALUE_FRAG_COORD))) {
6562       static const gl_state_index wposTransformState[STATE_LENGTH] = {
6563          STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
6564       };
6565
6566       v->wpos_transform_const = _mesa_add_state_reference(prog->Parameters,
6567                                                           wposTransformState);
6568    }
6569
6570    /* Avoid reallocation of the program parameter list, because the uniform
6571     * storage is only associated with the original parameter list.
6572     * This should be enough for Bitmap and DrawPixels constants.
6573     */
6574    _mesa_reserve_parameter_storage(prog->Parameters, 8);
6575
6576    /* This has to be done last.  Any operation the can cause
6577     * prog->ParameterValues to get reallocated (e.g., anything that adds a
6578     * program constant) has to happen before creating this linkage.
6579     */
6580    _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters);
6581    if (!shader_program->data->LinkStatus) {
6582       free_glsl_to_tgsi_visitor(v);
6583       _mesa_reference_program(ctx, &shader->Program, NULL);
6584       return NULL;
6585    }
6586
6587    struct st_vertex_program *stvp;
6588    struct st_fragment_program *stfp;
6589    struct st_geometry_program *stgp;
6590    struct st_tessctrl_program *sttcp;
6591    struct st_tesseval_program *sttep;
6592    struct st_compute_program *stcp;
6593
6594    switch (shader->Stage) {
6595    case MESA_SHADER_VERTEX:
6596       stvp = (struct st_vertex_program *)prog;
6597       stvp->glsl_to_tgsi = v;
6598       break;
6599    case MESA_SHADER_FRAGMENT:
6600       stfp = (struct st_fragment_program *)prog;
6601       stfp->glsl_to_tgsi = v;
6602       break;
6603    case MESA_SHADER_GEOMETRY:
6604       stgp = (struct st_geometry_program *)prog;
6605       stgp->glsl_to_tgsi = v;
6606       break;
6607    case MESA_SHADER_TESS_CTRL:
6608       sttcp = (struct st_tessctrl_program *)prog;
6609       sttcp->glsl_to_tgsi = v;
6610       break;
6611    case MESA_SHADER_TESS_EVAL:
6612       sttep = (struct st_tesseval_program *)prog;
6613       sttep->glsl_to_tgsi = v;
6614       break;
6615    case MESA_SHADER_COMPUTE:
6616       stcp = (struct st_compute_program *)prog;
6617       stcp->glsl_to_tgsi = v;
6618       break;
6619    default:
6620       assert(!"should not be reached");
6621       return NULL;
6622    }
6623
6624    return prog;
6625 }
6626
6627 static void
6628 set_affected_state_flags(uint64_t *states,
6629                          struct gl_program *prog,
6630                          struct gl_linked_shader *shader,
6631                          uint64_t new_constants,
6632                          uint64_t new_sampler_views,
6633                          uint64_t new_samplers,
6634                          uint64_t new_images,
6635                          uint64_t new_ubos,
6636                          uint64_t new_ssbos,
6637                          uint64_t new_atomics)
6638 {
6639    if (prog->Parameters->NumParameters)
6640       *states |= new_constants;
6641
6642    if (shader->num_samplers)
6643       *states |= new_sampler_views | new_samplers;
6644
6645    if (shader->NumImages)
6646       *states |= new_images;
6647
6648    if (shader->NumUniformBlocks)
6649       *states |= new_ubos;
6650
6651    if (shader->NumShaderStorageBlocks)
6652       *states |= new_ssbos;
6653
6654    if (prog->info.num_abos)
6655       *states |= new_atomics;
6656 }
6657
6658 static struct gl_program *
6659 get_mesa_program(struct gl_context *ctx,
6660                  struct gl_shader_program *shader_program,
6661                  struct gl_linked_shader *shader)
6662 {
6663    struct pipe_screen *pscreen = ctx->st->pipe->screen;
6664    enum pipe_shader_type ptarget = st_shader_stage_to_ptarget(shader->Stage);
6665    enum pipe_shader_ir preferred_ir = (enum pipe_shader_ir)
6666       pscreen->get_shader_param(pscreen, ptarget, PIPE_SHADER_CAP_PREFERRED_IR);
6667    struct gl_program *prog = NULL;
6668
6669    if (preferred_ir == PIPE_SHADER_IR_NIR) {
6670       /* TODO only for GLSL VS/FS for now: */
6671       switch (shader->Stage) {
6672       case MESA_SHADER_VERTEX:
6673       case MESA_SHADER_FRAGMENT:
6674          prog = st_nir_get_mesa_program(ctx, shader_program, shader);
6675       default:
6676          break;
6677       }
6678    } else {
6679       prog = get_mesa_program_tgsi(ctx, shader_program, shader);
6680    }
6681
6682    if (prog) {
6683       uint64_t *states;
6684
6685       /* This determines which states will be updated when the shader is
6686        * bound.
6687        */
6688       switch (shader->Stage) {
6689       case MESA_SHADER_VERTEX:
6690          states = &((struct st_vertex_program*)prog)->affected_states;
6691
6692          *states = ST_NEW_VS_STATE |
6693                    ST_NEW_RASTERIZER |
6694                    ST_NEW_VERTEX_ARRAYS;
6695
6696          set_affected_state_flags(states, prog, shader,
6697                                   ST_NEW_VS_CONSTANTS,
6698                                   ST_NEW_VS_SAMPLER_VIEWS,
6699                                   ST_NEW_RENDER_SAMPLERS,
6700                                   ST_NEW_VS_IMAGES,
6701                                   ST_NEW_VS_UBOS,
6702                                   ST_NEW_VS_SSBOS,
6703                                   ST_NEW_VS_ATOMICS);
6704          break;
6705
6706       case MESA_SHADER_TESS_CTRL:
6707          states = &((struct st_tessctrl_program*)prog)->affected_states;
6708
6709          *states = ST_NEW_TCS_STATE;
6710
6711          set_affected_state_flags(states, prog, shader,
6712                                   ST_NEW_TCS_CONSTANTS,
6713                                   ST_NEW_TCS_SAMPLER_VIEWS,
6714                                   ST_NEW_RENDER_SAMPLERS,
6715                                   ST_NEW_TCS_IMAGES,
6716                                   ST_NEW_TCS_UBOS,
6717                                   ST_NEW_TCS_SSBOS,
6718                                   ST_NEW_TCS_ATOMICS);
6719          break;
6720
6721       case MESA_SHADER_TESS_EVAL:
6722          states = &((struct st_tesseval_program*)prog)->affected_states;
6723
6724          *states = ST_NEW_TES_STATE |
6725                    ST_NEW_RASTERIZER;
6726
6727          set_affected_state_flags(states, prog, shader,
6728                                   ST_NEW_TES_CONSTANTS,
6729                                   ST_NEW_TES_SAMPLER_VIEWS,
6730                                   ST_NEW_RENDER_SAMPLERS,
6731                                   ST_NEW_TES_IMAGES,
6732                                   ST_NEW_TES_UBOS,
6733                                   ST_NEW_TES_SSBOS,
6734                                   ST_NEW_TES_ATOMICS);
6735          break;
6736
6737       case MESA_SHADER_GEOMETRY:
6738          states = &((struct st_geometry_program*)prog)->affected_states;
6739
6740          *states = ST_NEW_GS_STATE |
6741                    ST_NEW_RASTERIZER;
6742
6743          set_affected_state_flags(states, prog, shader,
6744                                   ST_NEW_GS_CONSTANTS,
6745                                   ST_NEW_GS_SAMPLER_VIEWS,
6746                                   ST_NEW_RENDER_SAMPLERS,
6747                                   ST_NEW_GS_IMAGES,
6748                                   ST_NEW_GS_UBOS,
6749                                   ST_NEW_GS_SSBOS,
6750                                   ST_NEW_GS_ATOMICS);
6751          break;
6752
6753       case MESA_SHADER_FRAGMENT:
6754          states = &((struct st_fragment_program*)prog)->affected_states;
6755
6756          /* gl_FragCoord and glDrawPixels always use constants. */
6757          *states = ST_NEW_FS_STATE |
6758                    ST_NEW_SAMPLE_SHADING |
6759                    ST_NEW_FS_CONSTANTS;
6760
6761          set_affected_state_flags(states, prog, shader,
6762                                   ST_NEW_FS_CONSTANTS,
6763                                   ST_NEW_FS_SAMPLER_VIEWS,
6764                                   ST_NEW_RENDER_SAMPLERS,
6765                                   ST_NEW_FS_IMAGES,
6766                                   ST_NEW_FS_UBOS,
6767                                   ST_NEW_FS_SSBOS,
6768                                   ST_NEW_FS_ATOMICS);
6769          break;
6770
6771       case MESA_SHADER_COMPUTE:
6772          states = &((struct st_compute_program*)prog)->affected_states;
6773
6774          *states = ST_NEW_CS_STATE;
6775
6776          set_affected_state_flags(states, prog, shader,
6777                                   ST_NEW_CS_CONSTANTS,
6778                                   ST_NEW_CS_SAMPLER_VIEWS,
6779                                   ST_NEW_CS_SAMPLERS,
6780                                   ST_NEW_CS_IMAGES,
6781                                   ST_NEW_CS_UBOS,
6782                                   ST_NEW_CS_SSBOS,
6783                                   ST_NEW_CS_ATOMICS);
6784          break;
6785
6786       default:
6787          unreachable("unhandled shader stage");
6788       }
6789    }
6790
6791    return prog;
6792 }
6793
6794 /* See if there are unsupported control flow statements. */
6795 class ir_control_flow_info_visitor : public ir_hierarchical_visitor {
6796 private:
6797    const struct gl_shader_compiler_options *options;
6798 public:
6799    ir_control_flow_info_visitor(const struct gl_shader_compiler_options *options)
6800       : options(options),
6801         unsupported(false)
6802    {
6803    }
6804
6805    virtual ir_visitor_status visit_enter(ir_function *ir)
6806    {
6807       /* Other functions are skipped (same as glsl_to_tgsi). */
6808       if (strcmp(ir->name, "main") == 0)
6809          return visit_continue;
6810
6811       return visit_continue_with_parent;
6812    }
6813
6814    virtual ir_visitor_status visit_enter(ir_call *ir)
6815    {
6816       if (!ir->callee->is_intrinsic()) {
6817          unsupported = true; /* it's a function call */
6818          return visit_stop;
6819       }
6820       return visit_continue;
6821    }
6822
6823    virtual ir_visitor_status visit_enter(ir_return *ir)
6824    {
6825       if (options->EmitNoMainReturn) {
6826          unsupported = true;
6827          return visit_stop;
6828       }
6829       return visit_continue;
6830    }
6831
6832    bool unsupported;
6833 };
6834
6835 static bool
6836 has_unsupported_control_flow(exec_list *ir,
6837                              const struct gl_shader_compiler_options *options)
6838 {
6839    ir_control_flow_info_visitor visitor(options);
6840    visit_list_elements(&visitor, ir);
6841    return visitor.unsupported;
6842 }
6843
6844 extern "C" {
6845
6846 /**
6847  * Link a shader.
6848  * Called via ctx->Driver.LinkShader()
6849  * This actually involves converting GLSL IR into an intermediate TGSI-like IR
6850  * with code lowering and other optimizations.
6851  */
6852 GLboolean
6853 st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
6854 {
6855    struct pipe_screen *pscreen = ctx->st->pipe->screen;
6856    assert(prog->data->LinkStatus);
6857
6858    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
6859       if (prog->_LinkedShaders[i] == NULL)
6860          continue;
6861
6862       exec_list *ir = prog->_LinkedShaders[i]->ir;
6863       gl_shader_stage stage = prog->_LinkedShaders[i]->Stage;
6864       const struct gl_shader_compiler_options *options =
6865             &ctx->Const.ShaderCompilerOptions[stage];
6866       enum pipe_shader_type ptarget = st_shader_stage_to_ptarget(stage);
6867       bool have_dround = pscreen->get_shader_param(pscreen, ptarget,
6868                                                    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED);
6869       bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget,
6870                                                    PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED);
6871       unsigned if_threshold = pscreen->get_shader_param(pscreen, ptarget,
6872                                                         PIPE_SHADER_CAP_LOWER_IF_THRESHOLD);
6873
6874       /* If there are forms of indirect addressing that the driver
6875        * cannot handle, perform the lowering pass.
6876        */
6877       if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput ||
6878           options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) {
6879          lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir,
6880                                              options->EmitNoIndirectInput,
6881                                              options->EmitNoIndirectOutput,
6882                                              options->EmitNoIndirectTemp,
6883                                              options->EmitNoIndirectUniform);
6884       }
6885
6886       if (ctx->Extensions.ARB_shading_language_packing) {
6887          unsigned lower_inst = LOWER_PACK_SNORM_2x16 |
6888                                LOWER_UNPACK_SNORM_2x16 |
6889                                LOWER_PACK_UNORM_2x16 |
6890                                LOWER_UNPACK_UNORM_2x16 |
6891                                LOWER_PACK_SNORM_4x8 |
6892                                LOWER_UNPACK_SNORM_4x8 |
6893                                LOWER_UNPACK_UNORM_4x8 |
6894                                LOWER_PACK_UNORM_4x8;
6895
6896          if (ctx->Extensions.ARB_gpu_shader5)
6897             lower_inst |= LOWER_PACK_USE_BFI |
6898                           LOWER_PACK_USE_BFE;
6899          if (!ctx->st->has_half_float_packing)
6900             lower_inst |= LOWER_PACK_HALF_2x16 |
6901                           LOWER_UNPACK_HALF_2x16;
6902
6903          lower_packing_builtins(ir, lower_inst);
6904       }
6905
6906       if (!pscreen->get_param(pscreen, PIPE_CAP_TEXTURE_GATHER_OFFSETS))
6907          lower_offset_arrays(ir);
6908       do_mat_op_to_vec(ir);
6909       lower_instructions(ir,
6910                          MOD_TO_FLOOR |
6911                          DIV_TO_MUL_RCP |
6912                          EXP_TO_EXP2 |
6913                          LOG_TO_LOG2 |
6914                          LDEXP_TO_ARITH |
6915                          (have_dfrexp ? 0 : DFREXP_DLDEXP_TO_ARITH) |
6916                          CARRY_TO_ARITH |
6917                          BORROW_TO_ARITH |
6918                          (have_dround ? 0 : DOPS_TO_DFRAC) |
6919                          (options->EmitNoPow ? POW_TO_EXP2 : 0) |
6920                          (!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0) |
6921                          (options->EmitNoSat ? SAT_TO_CLAMP : 0) |
6922                          /* Assume that if ARB_gpu_shader5 is not supported
6923                           * then all of the extended integer functions need
6924                           * lowering.  It may be necessary to add some caps
6925                           * for individual instructions.
6926                           */
6927                          (!ctx->Extensions.ARB_gpu_shader5
6928                           ? BIT_COUNT_TO_MATH |
6929                             EXTRACT_TO_SHIFTS |
6930                             INSERT_TO_SHIFTS |
6931                             REVERSE_TO_SHIFTS |
6932                             FIND_LSB_TO_FLOAT_CAST |
6933                             FIND_MSB_TO_FLOAT_CAST |
6934                             IMUL_HIGH_TO_MUL
6935                           : 0));
6936
6937       do_vec_index_to_cond_assign(ir);
6938       lower_vector_insert(ir, true);
6939       lower_quadop_vector(ir, false);
6940       lower_noise(ir);
6941       if (options->MaxIfDepth == 0) {
6942          lower_discard(ir);
6943       }
6944
6945       if (ctx->Const.GLSLOptimizeConservatively) {
6946          /* Do it once and repeat only if there's unsupported control flow. */
6947          do {
6948             do_common_optimization(ir, true, true, options,
6949                                    ctx->Const.NativeIntegers);
6950             lower_if_to_cond_assign((gl_shader_stage)i, ir,
6951                                     options->MaxIfDepth, if_threshold);
6952          } while (has_unsupported_control_flow(ir, options));
6953       } else {
6954          /* Repeat it until it stops making changes. */
6955          bool progress;
6956          do {
6957             progress = do_common_optimization(ir, true, true, options,
6958                                               ctx->Const.NativeIntegers);
6959             progress |= lower_if_to_cond_assign((gl_shader_stage)i, ir,
6960                                                 options->MaxIfDepth, if_threshold);
6961          } while (progress);
6962       }
6963
6964       validate_ir_tree(ir);
6965    }
6966
6967    build_program_resource_list(ctx, prog);
6968
6969    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
6970       struct gl_program *linked_prog;
6971
6972       if (prog->_LinkedShaders[i] == NULL)
6973          continue;
6974
6975       linked_prog = get_mesa_program(ctx, prog, prog->_LinkedShaders[i]);
6976
6977       if (linked_prog) {
6978          if (!ctx->Driver.ProgramStringNotify(ctx,
6979                                               _mesa_shader_stage_to_program(i),
6980                                               linked_prog)) {
6981             _mesa_reference_program(ctx, &prog->_LinkedShaders[i]->Program,
6982                                     NULL);
6983             return GL_FALSE;
6984          }
6985       }
6986    }
6987
6988    return GL_TRUE;
6989 }
6990
6991 void
6992 st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
6993                                 const GLuint outputMapping[],
6994                                 struct pipe_stream_output_info *so)
6995 {
6996    struct gl_transform_feedback_info *info =
6997       glsl_to_tgsi->shader_program->xfb_program->sh.LinkedTransformFeedback;
6998    st_translate_stream_output_info2(info, outputMapping, so);
6999 }
7000
7001 void
7002 st_translate_stream_output_info2(struct gl_transform_feedback_info *info,
7003                                 const GLuint outputMapping[],
7004                                 struct pipe_stream_output_info *so)
7005 {
7006    unsigned i;
7007
7008    for (i = 0; i < info->NumOutputs; i++) {
7009       so->output[i].register_index =
7010          outputMapping[info->Outputs[i].OutputRegister];
7011       so->output[i].start_component = info->Outputs[i].ComponentOffset;
7012       so->output[i].num_components = info->Outputs[i].NumComponents;
7013       so->output[i].output_buffer = info->Outputs[i].OutputBuffer;
7014       so->output[i].dst_offset = info->Outputs[i].DstOffset;
7015       so->output[i].stream = info->Outputs[i].StreamId;
7016    }
7017
7018    for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
7019       so->stride[i] = info->Buffers[i].Stride;
7020    }
7021    so->num_outputs = info->NumOutputs;
7022 }
7023
7024 } /* extern "C" */