llvmpipe: pass number of images into image soa create
[mesa.git] / src / gallium / drivers / llvmpipe / lp_state_fs.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007 VMware, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * @file
31 * Code generate the whole fragment pipeline.
32 *
33 * The fragment pipeline consists of the following stages:
34 * - early depth test
35 * - fragment shader
36 * - alpha test
37 * - depth/stencil test
38 * - blending
39 *
40 * This file has only the glue to assemble the fragment pipeline. The actual
41 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
42 * lp_bld_*.[ch] files, in a completely generic and reusable way. Here we
43 * muster the LLVM JIT execution engine to create a function that follows an
44 * established binary interface and that can be called from C directly.
45 *
46 * A big source of complexity here is that we often want to run different
47 * stages with different precisions and data types. For example, the
48 * fragment shader typically needs to be done in floats, but the
49 * depth/stencil test and blending are better done in the type that most
50 * closely matches the depth/stencil and color buffer respectively.
51 *
52 * Since the width of a SIMD vector register stays the same regardless of the
53 * element type, different types imply different number of elements, so we must
54 * code generate more instances of the stages with larger types to be able to
55 * feed/consume the stages with smaller types.
56 *
57 * @author Jose Fonseca <jfonseca@vmware.com>
58 */
59
60 #include <limits.h>
61 #include "pipe/p_defines.h"
62 #include "util/u_inlines.h"
63 #include "util/u_memory.h"
64 #include "util/u_pointer.h"
65 #include "util/format/u_format.h"
66 #include "util/u_dump.h"
67 #include "util/u_string.h"
68 #include "util/simple_list.h"
69 #include "util/u_dual_blend.h"
70 #include "util/os_time.h"
71 #include "pipe/p_shader_tokens.h"
72 #include "draw/draw_context.h"
73 #include "tgsi/tgsi_dump.h"
74 #include "tgsi/tgsi_scan.h"
75 #include "tgsi/tgsi_parse.h"
76 #include "gallivm/lp_bld_type.h"
77 #include "gallivm/lp_bld_const.h"
78 #include "gallivm/lp_bld_conv.h"
79 #include "gallivm/lp_bld_init.h"
80 #include "gallivm/lp_bld_intr.h"
81 #include "gallivm/lp_bld_logic.h"
82 #include "gallivm/lp_bld_tgsi.h"
83 #include "gallivm/lp_bld_nir.h"
84 #include "gallivm/lp_bld_swizzle.h"
85 #include "gallivm/lp_bld_flow.h"
86 #include "gallivm/lp_bld_debug.h"
87 #include "gallivm/lp_bld_arit.h"
88 #include "gallivm/lp_bld_bitarit.h"
89 #include "gallivm/lp_bld_pack.h"
90 #include "gallivm/lp_bld_format.h"
91 #include "gallivm/lp_bld_quad.h"
92
93 #include "lp_bld_alpha.h"
94 #include "lp_bld_blend.h"
95 #include "lp_bld_depth.h"
96 #include "lp_bld_interp.h"
97 #include "lp_context.h"
98 #include "lp_debug.h"
99 #include "lp_perf.h"
100 #include "lp_setup.h"
101 #include "lp_state.h"
102 #include "lp_tex_sample.h"
103 #include "lp_flush.h"
104 #include "lp_state_fs.h"
105 #include "lp_rast.h"
106 #include "nir/nir_to_tgsi_info.h"
107
108 #include "lp_screen.h"
109 #include "compiler/nir/nir_serialize.h"
110 #include "util/mesa-sha1.h"
111 /** Fragment shader number (for debugging) */
112 static unsigned fs_no = 0;
113
114
115 /**
116 * Expand the relevant bits of mask_input to an n*4-dword mask for the
117 * n*4 pixels in n 2x2 quads. This will set the n*4 elements of the
118 * quad mask vector to 0 or ~0.
119 * Grouping is 01, 23 for 2-quad mode, hence only 0 and 2 are valid
120 * quad arguments with fs length 8.
121 *
122 * \param first_quad which quad(s) of the quad group to test, in [0,3]
123 * \param mask_input bitwise mask for the whole 4x4 stamp
124 */
125 static LLVMValueRef
126 generate_quad_mask(struct gallivm_state *gallivm,
127 struct lp_type fs_type,
128 unsigned first_quad,
129 unsigned sample,
130 LLVMValueRef mask_input) /* int64 */
131 {
132 LLVMBuilderRef builder = gallivm->builder;
133 struct lp_type mask_type;
134 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
135 LLVMValueRef bits[16];
136 LLVMValueRef mask, bits_vec;
137 int shift, i;
138
139 /*
140 * XXX: We'll need a different path for 16 x u8
141 */
142 assert(fs_type.width == 32);
143 assert(fs_type.length <= ARRAY_SIZE(bits));
144 mask_type = lp_int_type(fs_type);
145
146 /*
147 * mask_input >>= (quad * 4)
148 */
149 switch (first_quad) {
150 case 0:
151 shift = 0;
152 break;
153 case 1:
154 assert(fs_type.length == 4);
155 shift = 2;
156 break;
157 case 2:
158 shift = 8;
159 break;
160 case 3:
161 assert(fs_type.length == 4);
162 shift = 10;
163 break;
164 default:
165 assert(0);
166 shift = 0;
167 }
168
169 mask_input = LLVMBuildLShr(builder, mask_input, lp_build_const_int64(gallivm, 16 * sample), "");
170 mask_input = LLVMBuildTrunc(builder, mask_input,
171 i32t, "");
172 mask_input = LLVMBuildAnd(builder, mask_input, lp_build_const_int32(gallivm, 0xffff), "");
173
174 mask_input = LLVMBuildLShr(builder,
175 mask_input,
176 LLVMConstInt(i32t, shift, 0),
177 "");
178
179 /*
180 * mask = { mask_input & (1 << i), for i in [0,3] }
181 */
182 mask = lp_build_broadcast(gallivm,
183 lp_build_vec_type(gallivm, mask_type),
184 mask_input);
185
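/* Each quad i covers stamp mask bits {j, j+1, j+4, j+5}, i.e. two adjacent
 * pixels in each of two adjacent rows of the 4x4 stamp.
 */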
186 for (i = 0; i < fs_type.length / 4; i++) {
187 unsigned j = 2 * (i % 2) + (i / 2) * 8;
188 bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
189 bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
190 bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
191 bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
192 }
193 bits_vec = LLVMConstVector(bits, fs_type.length);
194 mask = LLVMBuildAnd(builder, mask, bits_vec, "");
195
196 /*
197 * mask = mask == bits ? ~0 : 0
198 */
199 mask = lp_build_compare(gallivm,
200 mask_type, PIPE_FUNC_EQUAL,
201 mask, bits_vec);
202
203 return mask;
204 }
205
206
207 #define EARLY_DEPTH_TEST 0x1
208 #define LATE_DEPTH_TEST 0x2
209 #define EARLY_DEPTH_WRITE 0x4
210 #define LATE_DEPTH_WRITE 0x8
211
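/**
 * Find the fragment shader output with the given TGSI semantic name and
 * index, returning its output slot, or -1 if the shader doesn't write it.
 */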
212 static int
213 find_output_by_semantic( const struct tgsi_shader_info *info,
214 unsigned semantic,
215 unsigned index )
216 {
217 int i;
218
219 for (i = 0; i < info->num_outputs; i++)
220 if (info->output_semantic_name[i] == semantic &&
221 info->output_semantic_index[i] == index)
222 return i;
223
224 return -1;
225 }
226
227
228 /**
229 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
230 */
231 static LLVMValueRef
232 lp_llvm_viewport(LLVMValueRef context_ptr,
233 struct gallivm_state *gallivm,
234 LLVMValueRef viewport_index)
235 {
236 LLVMBuilderRef builder = gallivm->builder;
237 LLVMValueRef ptr;
238 LLVMValueRef res;
239 struct lp_type viewport_type =
240 lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);
241
242 ptr = lp_jit_context_viewports(gallivm, context_ptr);
243 ptr = LLVMBuildPointerCast(builder, ptr,
244 LLVMPointerType(lp_build_vec_type(gallivm, viewport_type), 0), "");
245
246 res = lp_build_pointer_get(builder, ptr, viewport_index);
247
248 return res;
249 }
250
251
252 static LLVMValueRef
253 lp_build_depth_clamp(struct gallivm_state *gallivm,
254 LLVMBuilderRef builder,
255 struct lp_type type,
256 LLVMValueRef context_ptr,
257 LLVMValueRef thread_data_ptr,
258 LLVMValueRef z)
259 {
260 LLVMValueRef viewport, min_depth, max_depth;
261 LLVMValueRef viewport_index;
262 struct lp_build_context f32_bld;
263
264 assert(type.floating);
265 lp_build_context_init(&f32_bld, gallivm, type);
266
267 /*
268 * Assumes clamping of the viewport index will occur in setup/gs. Value
269 * is passed through the rasterization stage via lp_rast_shader_inputs.
270 *
271 * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
272 * semantics.
273 */
274 viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
275 thread_data_ptr);
276
277 /*
278 * Load the min and max depth from the lp_jit_context.viewports
279 * array of lp_jit_viewport structures.
280 */
281 viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index);
282
283 /* viewports[viewport_index].min_depth */
284 min_depth = LLVMBuildExtractElement(builder, viewport,
285 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
286 min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
287
288 /* viewports[viewport_index].max_depth */
289 max_depth = LLVMBuildExtractElement(builder, viewport,
290 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
291 max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
292
293 /*
294 * Clamp to the min and max depth values for the given viewport.
295 */
296 return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
297 }
298
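/*
 * Emulate alpha-to-coverage: for each coverage sample, compare alpha against
 * an increasing threshold (s / coverage_samples) and AND the result into that
 * sample's coverage mask.
 */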
299 static void
300 lp_build_sample_alpha_to_coverage(struct gallivm_state *gallivm,
301 struct lp_type type,
302 unsigned coverage_samples,
303 LLVMValueRef num_loop,
304 LLVMValueRef loop_counter,
305 LLVMValueRef coverage_mask_store,
306 LLVMValueRef alpha)
307 {
308 struct lp_build_context bld;
309 LLVMBuilderRef builder = gallivm->builder;
310 float step = 1.0 / coverage_samples;
311
312 lp_build_context_init(&bld, gallivm, type);
313 for (unsigned s = 0; s < coverage_samples; s++) {
314 LLVMValueRef alpha_ref_value = lp_build_const_vec(gallivm, type, step * s);
315 LLVMValueRef test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value);
316
317 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, lp_build_const_int32(gallivm, s), num_loop, "");
318 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_counter, "");
319 LLVMValueRef s_mask_ptr = LLVMBuildGEP(builder, coverage_mask_store, &s_mask_idx, 1, "");
320 LLVMValueRef s_mask = LLVMBuildLoad(builder, s_mask_ptr, "");
321 s_mask = LLVMBuildAnd(builder, s_mask, test, "");
322 LLVMBuildStore(builder, s_mask, s_mask_ptr);
323 }
324 }
325
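/*
 * Implementation of lp_build_fs_iface: bundles the interpolation context,
 * loop state and coverage mask store so the shader backend can request
 * attribute interpolation (see fs_interp below).
 */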
326 struct lp_build_fs_llvm_iface {
327 struct lp_build_fs_iface base;
328 struct lp_build_interp_soa_context *interp;
329 struct lp_build_for_loop_state *loop_state;
330 LLVMValueRef mask_store;
331 };
332
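/*
 * lp_build_fs_iface interpolation callback: map the centroid/sample flags to
 * a TGSI interpolation location and interpolate the requested attribute
 * channel via lp_build_interp_soa.
 */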
333 static LLVMValueRef fs_interp(const struct lp_build_fs_iface *iface,
334 struct lp_build_context *bld,
335 unsigned attrib, unsigned chan,
336 bool centroid, bool sample,
337 LLVMValueRef attrib_indir,
338 LLVMValueRef offsets[2])
339 {
340 struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
341 struct lp_build_interp_soa_context *interp = fs_iface->interp;
342 unsigned loc = TGSI_INTERPOLATE_LOC_CENTER;
343 if (centroid)
344 loc = TGSI_INTERPOLATE_LOC_CENTROID;
345 if (sample)
346 loc = TGSI_INTERPOLATE_LOC_SAMPLE;
347
348 return lp_build_interp_soa(interp, bld->gallivm, fs_iface->loop_state->counter,
349 fs_iface->mask_store,
350 attrib, chan, loc, attrib_indir, offsets);
351 }
352
353 /**
354 * Generate the fragment shader, depth/stencil test, and alpha tests.
355 */
356 static void
357 generate_fs_loop(struct gallivm_state *gallivm,
358 struct lp_fragment_shader *shader,
359 const struct lp_fragment_shader_variant_key *key,
360 LLVMBuilderRef builder,
361 struct lp_type type,
362 LLVMValueRef context_ptr,
363 LLVMValueRef sample_pos_array,
364 LLVMValueRef num_loop,
365 struct lp_build_interp_soa_context *interp,
366 const struct lp_build_sampler_soa *sampler,
367 const struct lp_build_image_soa *image,
368 LLVMValueRef mask_store,
369 LLVMValueRef (*out_color)[4],
370 LLVMValueRef depth_base_ptr,
371 LLVMValueRef depth_stride,
372 LLVMValueRef depth_sample_stride,
373 LLVMValueRef facing,
374 LLVMValueRef thread_data_ptr)
375 {
376 const struct util_format_description *zs_format_desc = NULL;
377 const struct tgsi_token *tokens = shader->base.tokens;
378 struct lp_type int_type = lp_int_type(type);
379 LLVMTypeRef vec_type, int_vec_type;
380 LLVMValueRef mask_ptr = NULL, mask_val = NULL;
381 LLVMValueRef consts_ptr, num_consts_ptr;
382 LLVMValueRef ssbo_ptr, num_ssbo_ptr;
383 LLVMValueRef z;
384 LLVMValueRef z_value, s_value;
385 LLVMValueRef z_fb, s_fb;
386 LLVMValueRef depth_ptr;
387 LLVMValueRef stencil_refs[2];
388 LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
389 LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples);
390 struct lp_build_for_loop_state loop_state, sample_loop_state;
391 struct lp_build_mask_context mask;
392 /*
393 * TODO: figure out if simple_shader optimization is really worthwhile to
394 * keep. Disabled because it may hide some real bugs in the (depth/stencil)
395 * code since tests tend to take another codepath than real shaders.
396 */
397 boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
398 shader->info.base.num_inputs < 3 &&
399 shader->info.base.num_instructions < 8) && 0;
400 const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
401 util_blend_state_is_dual(&key->blend, 0);
402 unsigned attrib;
403 unsigned chan;
404 unsigned cbuf;
405 unsigned depth_mode;
406
407 struct lp_bld_tgsi_system_values system_values;
408
409 memset(&system_values, 0, sizeof(system_values));
410
411 /* truncate then sign extend. */
412 system_values.front_facing = LLVMBuildTrunc(gallivm->builder, facing, LLVMInt1TypeInContext(gallivm->context), "");
413 system_values.front_facing = LLVMBuildSExt(gallivm->builder, system_values.front_facing, LLVMInt32TypeInContext(gallivm->context), "");
414
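/* Determine whether depth/stencil testing and writing can happen early
 * (before the shader runs) or must be deferred until after it.
 */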
415 if (key->depth.enabled ||
416 key->stencil[0].enabled) {
417
418 zs_format_desc = util_format_description(key->zsbuf_format);
419 assert(zs_format_desc);
420
421 if (shader->info.base.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
422 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
423 else if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
424 if (shader->info.base.writes_memory)
425 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
426 else if (key->alpha.enabled ||
427 key->blend.alpha_to_coverage ||
428 shader->info.base.uses_kill ||
429 shader->info.base.writes_samplemask) {
430 /* With alpha test and kill, can do the depth test early
431 * and hopefully eliminate some quads. But need to do a
432 * special deferred depth write once the final mask value
433 * is known. This only works though if there's either no
434 * stencil test or the stencil value isn't written.
435 */
436 if (key->stencil[0].enabled && (key->stencil[0].writemask ||
437 (key->stencil[1].enabled &&
438 key->stencil[1].writemask)))
439 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
440 else
441 depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
442 }
443 else
444 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
445 }
446 else {
447 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
448 }
449
450 if (!(key->depth.enabled && key->depth.writemask) &&
451 !(key->stencil[0].enabled && (key->stencil[0].writemask ||
452 (key->stencil[1].enabled &&
453 key->stencil[1].writemask))))
454 depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
455 }
456 else {
457 depth_mode = 0;
458 }
459
460 vec_type = lp_build_vec_type(gallivm, type);
461 int_vec_type = lp_build_vec_type(gallivm, int_type);
462
463 stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
464 stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
465 /* convert scalar stencil refs into vectors */
466 stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
467 stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
468
469 consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
470 num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);
471
472 ssbo_ptr = lp_jit_context_ssbos(gallivm, context_ptr);
473 num_ssbo_ptr = lp_jit_context_num_ssbos(gallivm, context_ptr);
474
475 memset(outputs, 0, sizeof outputs);
476
477 /* Allocate color storage for each fragment sample */
478 LLVMValueRef color_store_size = num_loop;
479 if (key->min_samples > 1)
480 color_store_size = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, key->min_samples), "");
481
482 for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
483 for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
484 out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
485 lp_build_vec_type(gallivm,
486 type),
487 color_store_size, "color");
488 }
489 }
490 if (dual_source_blend) {
491 assert(key->nr_cbufs <= 1);
492 for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
493 out_color[1][chan] = lp_build_array_alloca(gallivm,
494 lp_build_vec_type(gallivm,
495 type),
496 color_store_size, "color1");
497 }
498 }
499
500 lp_build_for_loop_begin(&loop_state, gallivm,
501 lp_build_const_int32(gallivm, 0),
502 LLVMIntULT,
503 num_loop,
504 lp_build_const_int32(gallivm, 1));
505
506 LLVMValueRef sample_mask_in;
507 if (key->multisample) {
508 sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
509 /* create shader execution mask by combining all sample masks. */
510 for (unsigned s = 0; s < key->coverage_samples; s++) {
511 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
512 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
513 LLVMValueRef s_mask = lp_build_pointer_get(builder, mask_store, s_mask_idx);
514 if (s == 0)
515 mask_val = s_mask;
516 else
517 mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");
518
519 LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1 << s)), "");
520 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
521 }
522 } else {
523 sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
524 mask_ptr = LLVMBuildGEP(builder, mask_store,
525 &loop_state.counter, 1, "mask_ptr");
526 mask_val = LLVMBuildLoad(builder, mask_ptr, "");
527
528 LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
529 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
530 }
531
532 /* 'mask' will control execution based on quad's pixel alive/killed state */
533 lp_build_mask_begin(&mask, gallivm, type, mask_val);
534
535 if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
536 lp_build_mask_check(&mask);
537
538 /* Create storage for recombining sample masks after early Z pass. */
539 LLVMValueRef s_mask_or = lp_build_alloca(gallivm, lp_build_int_vec_type(gallivm, type), "cov_mask_early_depth");
540 LLVMBuildStore(builder, LLVMConstNull(lp_build_int_vec_type(gallivm, type)), s_mask_or);
541
542 LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
543 LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
544 LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
545 LLVMTypeRef z_type = NULL, z_fb_type = NULL;
546
547 /* Run early depth once per sample */
548 if (key->multisample) {
549
550 if (zs_format_desc) {
551 struct lp_type zs_type = lp_depth_type(zs_format_desc, type.length);
552 struct lp_type z_type = zs_type;
553 struct lp_type s_type = zs_type;
554 if (zs_format_desc->block.bits < type.width)
555 z_type.width = type.width;
556 else if (zs_format_desc->block.bits > 32) {
557 z_type.width = z_type.width / 2;
558 s_type.width = s_type.width / 2;
559 s_type.floating = 0;
560 }
561 z_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
562 zs_samples, "z_sample_store");
563 s_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
564 zs_samples, "s_sample_store");
565 z_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, z_type),
566 zs_samples, "z_fb_store");
567 s_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, s_type),
568 zs_samples, "s_fb_store");
569 }
570 lp_build_for_loop_begin(&sample_loop_state, gallivm,
571 lp_build_const_int32(gallivm, 0),
572 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
573 lp_build_const_int32(gallivm, 1));
574
575 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
576 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
577 s_mask_ptr = LLVMBuildGEP(builder, mask_store, &s_mask_idx, 1, "");
578
579 s_mask = LLVMBuildLoad(builder, s_mask_ptr, "");
580 s_mask = LLVMBuildAnd(builder, s_mask, mask_val, "");
581 }
582
583
584 /* for multisample, Z needs to be interpolated at sample points for testing. */
585 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL);
586 z = interp->pos[2];
587
588 depth_ptr = depth_base_ptr;
589 if (key->multisample) {
590 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
591 depth_ptr = LLVMBuildGEP(builder, depth_ptr, &sample_offset, 1, "");
592 }
593
594 if (depth_mode & EARLY_DEPTH_TEST) {
595 /*
596 * Clamp according to ARB_depth_clamp semantics.
597 */
598 if (key->depth_clamp) {
599 z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
600 thread_data_ptr, z);
601 }
602 lp_build_depth_stencil_load_swizzled(gallivm, type,
603 zs_format_desc, key->resource_1d,
604 depth_ptr, depth_stride,
605 &z_fb, &s_fb, loop_state.counter);
606 lp_build_depth_stencil_test(gallivm,
607 &key->depth,
608 key->stencil,
609 type,
610 zs_format_desc,
611 key->multisample ? NULL : &mask,
612 &s_mask,
613 stencil_refs,
614 z, z_fb, s_fb,
615 facing,
616 &z_value, &s_value,
617 !simple_shader && !key->multisample);
618
619 if (depth_mode & EARLY_DEPTH_WRITE) {
620 lp_build_depth_stencil_write_swizzled(gallivm, type,
621 zs_format_desc, key->resource_1d,
622 NULL, NULL, NULL, loop_state.counter,
623 depth_ptr, depth_stride,
624 z_value, s_value);
625 }
626 /*
627 * Note that when stencil is enabled, the mask check must be done after the
628 * ds write, not after the stencil test, otherwise new stencil values may
629 * not get written if all fragments got killed by the depth/stencil test.
630 */
631 if (!simple_shader && key->stencil[0].enabled && !key->multisample)
632 lp_build_mask_check(&mask);
633
634 if (key->multisample) {
635 z_fb_type = LLVMTypeOf(z_fb);
636 z_type = LLVMTypeOf(z_value);
637 lp_build_pointer_set(builder, z_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, z_value, lp_build_int_vec_type(gallivm, type), ""));
638 lp_build_pointer_set(builder, s_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, s_value, lp_build_int_vec_type(gallivm, type), ""));
639 lp_build_pointer_set(builder, z_fb_store, sample_loop_state.counter, z_fb);
640 lp_build_pointer_set(builder, s_fb_store, sample_loop_state.counter, s_fb);
641 }
642 }
643
644 if (key->multisample) {
645 /*
646 * Store the post-early Z coverage mask.
647 * Recombine the resulting coverage masks post early Z into the fragment
648 * shader execution mask.
649 */
650 LLVMValueRef tmp_s_mask_or = LLVMBuildLoad(builder, s_mask_or, "");
651 tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
652 LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);
653
654 LLVMBuildStore(builder, s_mask, s_mask_ptr);
655
656 lp_build_for_loop_end(&sample_loop_state);
657
658 /* recombine all the coverage masks into the shader exec mask. */
659 tmp_s_mask_or = LLVMBuildLoad(builder, s_mask_or, "");
660 lp_build_mask_update(&mask, tmp_s_mask_or);
661
662 if (key->min_samples == 1) {
663 /* for multisample, Z needs to be re-interpolated at the pixel center */
664 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
665 lp_build_mask_update(&mask, tmp_s_mask_or);
666 }
667 }
668
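/* Storage for the shader-written sample mask output; with per-sample
 * shading the per-sample bits are accumulated into it across iterations.
 */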
669 LLVMValueRef out_sample_mask_storage = NULL;
670 if (shader->info.base.writes_samplemask) {
671 out_sample_mask_storage = lp_build_alloca(gallivm, int_vec_type, "write_mask");
672 if (key->min_samples > 1)
673 LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
674 }
675
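/* With per-sample shading (min_samples > 1), run the remainder of the
 * fragment shader once per sample, forcing the execution mask to that
 * sample's coverage mask.
 */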
676 if (key->multisample && key->min_samples > 1) {
677 lp_build_for_loop_begin(&sample_loop_state, gallivm,
678 lp_build_const_int32(gallivm, 0),
679 LLVMIntULT,
680 lp_build_const_int32(gallivm, key->min_samples),
681 lp_build_const_int32(gallivm, 1));
682
683 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
684 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
685 s_mask_ptr = LLVMBuildGEP(builder, mask_store, &s_mask_idx, 1, "");
686 s_mask = LLVMBuildLoad(builder, s_mask_ptr, "");
687 lp_build_mask_force(&mask, s_mask);
688 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
689 system_values.sample_id = sample_loop_state.counter;
690 } else
691 system_values.sample_id = lp_build_const_int32(gallivm, 0);
692
693 system_values.sample_mask_in = sample_mask_in;
694 system_values.sample_pos = sample_pos_array;
695
696 lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter, mask_store, sample_loop_state.counter);
697
698 struct lp_build_fs_llvm_iface fs_iface = {
699 .base.interp_fn = fs_interp,
700 .interp = interp,
701 .loop_state = &loop_state,
702 .mask_store = mask_store,
703 };
704
705 struct lp_build_tgsi_params params;
706 memset(&params, 0, sizeof(params));
707
708 params.type = type;
709 params.mask = &mask;
710 params.fs_iface = &fs_iface.base;
711 params.consts_ptr = consts_ptr;
712 params.const_sizes_ptr = num_consts_ptr;
713 params.system_values = &system_values;
714 params.inputs = interp->inputs;
715 params.context_ptr = context_ptr;
716 params.thread_data_ptr = thread_data_ptr;
717 params.sampler = sampler;
718 params.info = &shader->info.base;
719 params.ssbo_ptr = ssbo_ptr;
720 params.ssbo_sizes_ptr = num_ssbo_ptr;
721 params.image = image;
722
723 /* Build the actual shader */
724 if (shader->base.type == PIPE_SHADER_IR_TGSI)
725 lp_build_tgsi_soa(gallivm, tokens, &params,
726 outputs);
727 else
728 lp_build_nir_soa(gallivm, shader->base.ir.nir, &params,
729 outputs);
730
731 /* Alpha test */
732 if (key->alpha.enabled) {
733 int color0 = find_output_by_semantic(&shader->info.base,
734 TGSI_SEMANTIC_COLOR,
735 0);
736
737 if (color0 != -1 && outputs[color0][3]) {
738 const struct util_format_description *cbuf_format_desc;
739 LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
740 LLVMValueRef alpha_ref_value;
741
742 alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_ptr);
743 alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);
744
745 cbuf_format_desc = util_format_description(key->cbuf_format[0]);
746
747 lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
748 &mask, alpha, alpha_ref_value,
749 (depth_mode & LATE_DEPTH_TEST) != 0);
750 }
751 }
752
753 /* Emulate Alpha to Coverage with Alpha test */
754 if (key->blend.alpha_to_coverage) {
755 int color0 = find_output_by_semantic(&shader->info.base,
756 TGSI_SEMANTIC_COLOR,
757 0);
758
759 if (color0 != -1 && outputs[color0][3]) {
760 LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
761
762 if (!key->multisample) {
763 lp_build_alpha_to_coverage(gallivm, type,
764 &mask, alpha,
765 (depth_mode & LATE_DEPTH_TEST) != 0);
766 } else {
767 lp_build_sample_alpha_to_coverage(gallivm, type, key->coverage_samples, num_loop,
768 loop_state.counter,
769 mask_store, alpha);
770 }
771 }
772 }
773 if (key->blend.alpha_to_one && key->multisample) {
774 for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib) {
775 unsigned cbuf = shader->info.base.output_semantic_index[attrib];
776 if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
777 ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
778 if (outputs[cbuf][3]) {
779 LLVMBuildStore(builder, lp_build_const_vec(gallivm, type, 1.0), outputs[cbuf][3]);
780 }
781 }
782 }
783 if (shader->info.base.writes_samplemask) {
784 LLVMValueRef output_smask = NULL;
785 int smaski = find_output_by_semantic(&shader->info.base,
786 TGSI_SEMANTIC_SAMPLEMASK,
787 0);
788 struct lp_build_context smask_bld;
789 lp_build_context_init(&smask_bld, gallivm, int_type);
790
791 assert(smaski >= 0);
792 output_smask = LLVMBuildLoad(builder, outputs[smaski][0], "smask");
793 output_smask = LLVMBuildBitCast(builder, output_smask, smask_bld.vec_type, "");
794
795 if (key->min_samples > 1) {
796 /* only the bit corresponding to this sample is to be used. */
797 LLVMValueRef tmp_mask = LLVMBuildLoad(builder, out_sample_mask_storage, "tmp_mask");
798 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
799 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, lp_build_broadcast(gallivm, int_vec_type, out_smask_idx), "");
800 output_smask = LLVMBuildOr(builder, tmp_mask, smask_bit, "");
801 }
802
803 LLVMBuildStore(builder, output_smask, out_sample_mask_storage);
804 }
805
806 /* Color write - per fragment sample */
807 for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
808 {
809 unsigned cbuf = shader->info.base.output_semantic_index[attrib];
810 if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
811 ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
812 {
813 for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
814 if(outputs[attrib][chan]) {
815 /* XXX: just initialize outputs to point at colors[] and
816 * skip this.
817 */
818 LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
819 LLVMValueRef color_ptr;
820 LLVMValueRef color_idx = loop_state.counter;
821 if (key->min_samples > 1)
822 color_idx = LLVMBuildAdd(builder, color_idx,
823 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
824 color_ptr = LLVMBuildGEP(builder, out_color[cbuf][chan],
825 &color_idx, 1, "");
826 lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
827 LLVMBuildStore(builder, out, color_ptr);
828 }
829 }
830 }
831 }
832
833 if (key->multisample && key->min_samples > 1) {
834 LLVMBuildStore(builder, lp_build_mask_value(&mask), s_mask_ptr);
835 lp_build_for_loop_end(&sample_loop_state);
836 }
837
838 if (key->multisample) {
839 /* execute depth test for each sample */
840 lp_build_for_loop_begin(&sample_loop_state, gallivm,
841 lp_build_const_int32(gallivm, 0),
842 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
843 lp_build_const_int32(gallivm, 1));
844
845 /* load the per-sample coverage mask */
846 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
847 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
848 s_mask_ptr = LLVMBuildGEP(builder, mask_store, &s_mask_idx, 1, "");
849
850 /* combine the execution mask post fragment shader with the coverage mask. */
851 s_mask = LLVMBuildLoad(builder, s_mask_ptr, "");
852 if (key->min_samples == 1)
853 s_mask = LLVMBuildAnd(builder, s_mask, lp_build_mask_value(&mask), "");
854
855 /* if the shader writes sample mask use that */
856 if (shader->info.base.writes_samplemask) {
857 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
858 out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
859 LLVMValueRef output_smask = LLVMBuildLoad(builder, out_sample_mask_storage, "");
860 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
861 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
862 smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
863
864 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
865 }
866 }
867
868 depth_ptr = depth_base_ptr;
869 if (key->multisample) {
870 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
871 depth_ptr = LLVMBuildGEP(builder, depth_ptr, &sample_offset, 1, "");
872 }
873
874 /* Late Z test */
875 if (depth_mode & LATE_DEPTH_TEST) {
876 int pos0 = find_output_by_semantic(&shader->info.base,
877 TGSI_SEMANTIC_POSITION,
878 0);
879 int s_out = find_output_by_semantic(&shader->info.base,
880 TGSI_SEMANTIC_STENCIL,
881 0);
882 if (pos0 != -1 && outputs[pos0][2]) {
883 z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
884 }
885 /*
886 * Clamp according to ARB_depth_clamp semantics.
887 */
888 if (key->depth_clamp) {
889 z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
890 thread_data_ptr, z);
891 }
892
893 if (s_out != -1 && outputs[s_out][1]) {
894 /* there's only one value, and spec says to discard additional bits */
895 LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
896 stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
897 stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
898 stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
899 stencil_refs[1] = stencil_refs[0];
900 }
901
902 lp_build_depth_stencil_load_swizzled(gallivm, type,
903 zs_format_desc, key->resource_1d,
904 depth_ptr, depth_stride,
905 &z_fb, &s_fb, loop_state.counter);
906
907 lp_build_depth_stencil_test(gallivm,
908 &key->depth,
909 key->stencil,
910 type,
911 zs_format_desc,
912 key->multisample ? NULL : &mask,
913 &s_mask,
914 stencil_refs,
915 z, z_fb, s_fb,
916 facing,
917 &z_value, &s_value,
918 !simple_shader);
919 /* Late Z write */
920 if (depth_mode & LATE_DEPTH_WRITE) {
921 lp_build_depth_stencil_write_swizzled(gallivm, type,
922 zs_format_desc, key->resource_1d,
923 NULL, NULL, NULL, loop_state.counter,
924 depth_ptr, depth_stride,
925 z_value, s_value);
926 }
927 }
928 else if ((depth_mode & EARLY_DEPTH_TEST) &&
929 (depth_mode & LATE_DEPTH_WRITE))
930 {
931 /* Need to apply a reduced mask to the depth write. Reload the
932 * depth value, update from zs_value with the new mask value and
933 * write that out.
934 */
935 if (key->multisample) {
936 z_value = LLVMBuildBitCast(builder, lp_build_pointer_get(builder, z_sample_value_store, sample_loop_state.counter), z_type, "");
937 s_value = lp_build_pointer_get(builder, s_sample_value_store, sample_loop_state.counter);
938 z_fb = LLVMBuildBitCast(builder, lp_build_pointer_get(builder, z_fb_store, sample_loop_state.counter), z_fb_type, "");
939 s_fb = lp_build_pointer_get(builder, s_fb_store, sample_loop_state.counter);
940 }
941 lp_build_depth_stencil_write_swizzled(gallivm, type,
942 zs_format_desc, key->resource_1d,
943 key->multisample ? s_mask : lp_build_mask_value(&mask), z_fb, s_fb, loop_state.counter,
944 depth_ptr, depth_stride,
945 z_value, s_value);
946 }
947
948 if (key->occlusion_count) {
949 LLVMValueRef counter = lp_jit_thread_data_counter(gallivm, thread_data_ptr);
950 lp_build_name(counter, "counter");
951
952 lp_build_occlusion_count(gallivm, type,
953 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
954 }
955
956 if (key->multisample) {
957 /* store the sample mask for this loop */
958 LLVMBuildStore(builder, s_mask, s_mask_ptr);
959 lp_build_for_loop_end(&sample_loop_state);
960 }
961
962 mask_val = lp_build_mask_end(&mask);
963 if (!key->multisample)
964 LLVMBuildStore(builder, mask_val, mask_ptr);
965 lp_build_for_loop_end(&loop_state);
966 }
967
968
969 /**
970 * This function reorders pixels from the fragment shader SoA layout to the memory AoS layout
971 *
972 * Fragment Shader outputs pixels in small 2x2 blocks
973 * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
974 *
975 * However in memory pixels are stored in rows
976 * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
977 *
978 * @param type fragment shader type (4x or 8x float)
979 * @param num_fs number of fs_src
980 * @param is_1d whether we're outputting to a 1d resource
981 * @param dst_channels number of output channels
982 * @param fs_src output from fragment shader
983 * @param dst pointer to store result
984 * @param pad_inline is channel padding inline or at end of row
985 * @return the number of dsts
986 */
987 static int
988 generate_fs_twiddle(struct gallivm_state *gallivm,
989 struct lp_type type,
990 unsigned num_fs,
991 unsigned dst_channels,
992 LLVMValueRef fs_src[][4],
993 LLVMValueRef* dst,
994 bool pad_inline)
995 {
996 LLVMValueRef src[16];
997
998 bool swizzle_pad;
999 bool twiddle;
1000 bool split;
1001
1002 unsigned pixels = type.length / 4;
1003 unsigned reorder_group;
1004 unsigned src_channels;
1005 unsigned src_count;
1006 unsigned i;
1007
1008 src_channels = dst_channels < 3 ? dst_channels : 4;
1009 src_count = num_fs * src_channels;
1010
1011 assert(pixels == 2 || pixels == 1);
1012 assert(num_fs * src_channels <= ARRAY_SIZE(src));
1013
1014 /*
1015 * Transpose from SoA -> AoS
1016 */
1017 for (i = 0; i < num_fs; ++i) {
1018 lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
1019 }
1020
1021 /*
1022 * Pick transformation options
1023 */
1024 swizzle_pad = false;
1025 twiddle = false;
1026 split = false;
1027 reorder_group = 0;
1028
1029 if (dst_channels == 1) {
1030 twiddle = true;
1031
1032 if (pixels == 2) {
1033 split = true;
1034 }
1035 } else if (dst_channels == 2) {
1036 if (pixels == 1) {
1037 reorder_group = 1;
1038 }
1039 } else if (dst_channels > 2) {
1040 if (pixels == 1) {
1041 reorder_group = 2;
1042 } else {
1043 twiddle = true;
1044 }
1045
1046 if (!pad_inline && dst_channels == 3 && pixels > 1) {
1047 swizzle_pad = true;
1048 }
1049 }
1050
1051 /*
1052 * Split the src in half
1053 */
1054 if (split) {
1055 for (i = num_fs; i > 0; --i) {
1056 src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
1057 src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
1058 }
1059
1060 src_count *= 2;
1061 type.length = 4;
1062 }
1063
1064 /*
1065 * Ensure pixels are in memory order
1066 */
1067 if (reorder_group) {
1068 /* Twiddle pixels by reordering the array, e.g.:
1069 *
1070 * src_count = 8 -> 0 2 1 3 4 6 5 7
1071 * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
1072 */
1073 const unsigned reorder_sw[] = { 0, 2, 1, 3 };
1074
1075 for (i = 0; i < src_count; ++i) {
1076 unsigned group = i / reorder_group;
1077 unsigned block = (group / 4) * 4 * reorder_group;
1078 unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
1079 dst[i] = src[j];
1080 }
1081 } else if (twiddle) {
1082 /* Twiddle pixels across elements of array */
1083 /*
1084 * XXX: we should avoid this in some cases, but would need to tell
1085 * lp_build_conv to reorder (or deal with it ourselves).
1086 */
1087 lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
1088 } else {
1089 /* Do nothing */
1090 memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
1091 }
1092
1093 /*
1094 * Moves any padding between pixels to the end
1095 * e.g. RGBXRGBX -> RGBRGBXX
1096 */
1097 if (swizzle_pad) {
1098 unsigned char swizzles[16];
1099 unsigned elems = pixels * dst_channels;
1100
1101 for (i = 0; i < type.length; ++i) {
1102 if (i < elems)
1103 swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
1104 else
1105 swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
1106 }
1107
1108 for (i = 0; i < src_count; ++i) {
1109 dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
1110 }
1111 }
1112
1113 return src_count;
1114 }
1115
1116
1117 /*
1118 * Untwiddle and transpose, much like the above.
1119 * However, this is after conversion, so we get packed vectors.
1120 * At this time this only handles 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data;
1121 * the vectors will look like:
1122 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
1123 * be swizzled here). Extending to 16bit should be trivial.
1124 * Should also be extended to handle twice wide vectors with AVX2...
1125 */
1126 static void
1127 fs_twiddle_transpose(struct gallivm_state *gallivm,
1128 struct lp_type type,
1129 LLVMValueRef *src,
1130 unsigned src_count,
1131 LLVMValueRef *dst)
1132 {
1133 unsigned i, j;
1134 struct lp_type type64, type16, type32;
1135 LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
1136 LLVMBuilderRef builder = gallivm->builder;
1137 LLVMValueRef tmp[4], shuf[8];
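/* Build the 0 2 1 3 (per group of four) shuffle pattern used below to swap
 * the two middle elements when untwiddling.
 */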
1138 for (j = 0; j < 2; j++) {
1139 shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
1140 shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
1141 shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
1142 shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
1143 }
1144
1145 assert(src_count == 4 || src_count == 2 || src_count == 1);
1146 assert(type.width == 8);
1147 assert(type.length == 16);
1148
1149 type8_t = lp_build_vec_type(gallivm, type);
1150
1151 type64 = type;
1152 type64.length /= 8;
1153 type64.width *= 8;
1154 type64_t = lp_build_vec_type(gallivm, type64);
1155
1156 type16 = type;
1157 type16.length /= 2;
1158 type16.width *= 2;
1159 type16_t = lp_build_vec_type(gallivm, type16);
1160
1161 type32 = type;
1162 type32.length /= 4;
1163 type32.width *= 4;
1164 type32_t = lp_build_vec_type(gallivm, type32);
1165
1166 lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
1167
1168 if (src_count == 1) {
1169 /* transpose was no-op, just untwiddle */
1170 LLVMValueRef shuf_vec;
1171 shuf_vec = LLVMConstVector(shuf, 8);
1172 tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
1173 tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
1174 dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
1175 } else if (src_count == 2) {
1176 LLVMValueRef shuf_vec;
1177 shuf_vec = LLVMConstVector(shuf, 4);
1178
1179 for (i = 0; i < 2; i++) {
1180 tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
1181 tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
1182 dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
1183 }
1184 } else {
1185 for (j = 0; j < 2; j++) {
1186 LLVMValueRef lo, hi, lo2, hi2;
1187 /*
1188 * Note that if we only really have 3 valid channels (rgb)
1189 * and we don't need alpha we could substitute an undef here
1190 * for the respective channel (causing llvm to drop the conversion
1191 * for alpha).
1192 */
1193 /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
1194 lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
1195 hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
1196 lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
1197 hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
1198 dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
1199 dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
1200 }
1201 }
1202 }
1203
1204
1205 /**
1206 * Load an unswizzled block of pixels from memory
1207 */
1208 static void
1209 load_unswizzled_block(struct gallivm_state *gallivm,
1210 LLVMValueRef base_ptr,
1211 LLVMValueRef stride,
1212 unsigned block_width,
1213 unsigned block_height,
1214 LLVMValueRef* dst,
1215 struct lp_type dst_type,
1216 unsigned dst_count,
1217 unsigned dst_alignment)
1218 {
1219 LLVMBuilderRef builder = gallivm->builder;
1220 unsigned row_size = dst_count / block_height;
1221 unsigned i;
1222
1223 /* Ensure block exactly fits into dst */
1224 assert((block_width * block_height) % dst_count == 0);
1225
1226 for (i = 0; i < dst_count; ++i) {
1227 unsigned x = i % row_size;
1228 unsigned y = i / row_size;
1229
1230 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
1231 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1232
1233 LLVMValueRef gep[2];
1234 LLVMValueRef dst_ptr;
1235
1236 gep[0] = lp_build_const_int32(gallivm, 0);
1237 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1238
1239 dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
1240 dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
1241 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
1242
1243 dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
1244
1245 LLVMSetAlignment(dst[i], dst_alignment);
1246 }
1247 }
1248
1249
1250 /**
1251 * Store an unswizzled block of pixels to memory
1252 */
1253 static void
1254 store_unswizzled_block(struct gallivm_state *gallivm,
1255 LLVMValueRef base_ptr,
1256 LLVMValueRef stride,
1257 unsigned block_width,
1258 unsigned block_height,
1259 LLVMValueRef* src,
1260 struct lp_type src_type,
1261 unsigned src_count,
1262 unsigned src_alignment)
1263 {
1264 LLVMBuilderRef builder = gallivm->builder;
1265 unsigned row_size = src_count / block_height;
1266 unsigned i;
1267
1268 /* Ensure src exactly fits into block */
1269 assert((block_width * block_height) % src_count == 0);
1270
1271 for (i = 0; i < src_count; ++i) {
1272 unsigned x = i % row_size;
1273 unsigned y = i / row_size;
1274
1275 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
1276 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1277
1278 LLVMValueRef gep[2];
1279 LLVMValueRef src_ptr;
1280
1281 gep[0] = lp_build_const_int32(gallivm, 0);
1282 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1283
1284 src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
1285 src_ptr = LLVMBuildBitCast(builder, src_ptr,
1286 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
1287
1288 src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
1289
1290 LLVMSetAlignment(src_ptr, src_alignment);
1291 }
1292 }
1293
1294
1295 /**
1296 * Checks if a format description is an arithmetic format
1297 *
1298 * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
1299 */
1300 static inline boolean
1301 is_arithmetic_format(const struct util_format_description *format_desc)
1302 {
1303 boolean arith = false;
1304 unsigned i;
1305
1306 for (i = 0; i < format_desc->nr_channels; ++i) {
1307 arith |= format_desc->channel[i].size != format_desc->channel[0].size;
1308 arith |= (format_desc->channel[i].size % 8) != 0;
1309 }
1310
1311 return arith;
1312 }
1313
1314
1315 /**
1316 * Checks if this format requires special handling due to required expansion
1317 * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
1318 * SoA conversion.
1319 */
1320 static inline boolean
1321 format_expands_to_float_soa(const struct util_format_description *format_desc)
1322 {
1323 if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
1324 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
1325 return true;
1326 }
1327 return false;
1328 }
1329
1330
1331 /**
1332 * Retrieves the type representing the memory layout for a format
1333 *
1334 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
1335 */
1336 static inline void
1337 lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
1338 struct lp_type* type)
1339 {
1340 unsigned i;
1341 unsigned chan;
1342
1343 if (format_expands_to_float_soa(format_desc)) {
1344 /* just make this a uint with width of block */
1345 type->floating = false;
1346 type->fixed = false;
1347 type->sign = false;
1348 type->norm = false;
1349 type->width = format_desc->block.bits;
1350 type->length = 1;
1351 return;
1352 }
1353
1354 for (i = 0; i < 4; i++)
1355 if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
1356 break;
1357 chan = i;
1358
1359 memset(type, 0, sizeof(struct lp_type));
1360 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
1361 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
1362 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
1363 type->norm = format_desc->channel[chan].normalized;
1364
1365 if (is_arithmetic_format(format_desc)) {
1366 type->width = 0;
1367 type->length = 1;
1368
1369 for (i = 0; i < format_desc->nr_channels; ++i) {
1370 type->width += format_desc->channel[i].size;
1371 }
1372 } else {
1373 type->width = format_desc->channel[chan].size;
1374 type->length = format_desc->nr_channels;
1375 }
1376 }
1377
1378
1379 /**
1380 * Retrieves the type for a format which is usable in the blending code.
1381 *
1382 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
1383 */
1384 static inline void
1385 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
1386 struct lp_type* type)
1387 {
1388 unsigned i;
1389 unsigned chan;
1390
1391 if (format_expands_to_float_soa(format_desc)) {
1392 /* always use ordinary floats for blending */
1393 type->floating = true;
1394 type->fixed = false;
1395 type->sign = true;
1396 type->norm = false;
1397 type->width = 32;
1398 type->length = 4;
1399 return;
1400 }
1401
1402 for (i = 0; i < 4; i++)
1403 if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
1404 break;
1405 chan = i;
1406
1407 memset(type, 0, sizeof(struct lp_type));
1408 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
1409 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
1410 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
1411 type->norm = format_desc->channel[chan].normalized;
1412 type->width = format_desc->channel[chan].size;
1413 type->length = format_desc->nr_channels;
1414
1415 for (i = 1; i < format_desc->nr_channels; ++i) {
1416 if (format_desc->channel[i].size > type->width)
1417 type->width = format_desc->channel[i].size;
1418 }
1419
1420 if (type->floating) {
1421 type->width = 32;
1422 } else {
1423 if (type->width <= 8) {
1424 type->width = 8;
1425 } else if (type->width <= 16) {
1426 type->width = 16;
1427 } else {
1428 type->width = 32;
1429 }
1430 }
1431
1432 if (is_arithmetic_format(format_desc) && type->length == 3) {
1433 type->length = 4;
1434 }
1435 }
1436
1437
1438 /**
1439 * Scale a normalized value from src_bits to dst_bits.
1440 *
1441 * The exact calculation is
1442 *
1443 * dst = iround(src * dst_mask / src_mask)
1444 *
1445 * or with integer rounding
1446 *
1447 * dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
1448 *
1449 * where
1450 *
1451 * src_mask = (1 << src_bits) - 1
1452 * dst_mask = (1 << dst_bits) - 1
1453 *
1454 * but we try to avoid division and multiplication through shifts.
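*
* For example, scaling 8 bits to 4 bits maps 255 to iround(255 * 15 / 255) = 15
* and 128 to iround(128 * 15 / 255) = 8.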
1455 */
1456 static inline LLVMValueRef
1457 scale_bits(struct gallivm_state *gallivm,
1458 int src_bits,
1459 int dst_bits,
1460 LLVMValueRef src,
1461 struct lp_type src_type)
1462 {
1463 LLVMBuilderRef builder = gallivm->builder;
1464 LLVMValueRef result = src;
1465
1466 if (dst_bits < src_bits) {
1467 int delta_bits = src_bits - dst_bits;
1468
1469 if (delta_bits <= dst_bits) {
1470 /*
1471 * Approximate the rescaling with a single shift.
1472 *
1473 * This gives the wrong rounding.
1474 */
1475
1476 result = LLVMBuildLShr(builder,
1477 src,
1478 lp_build_const_int_vec(gallivm, src_type, delta_bits),
1479 "");
1480
1481 } else {
1482 /*
1483 * Try more accurate rescaling.
1484 */
1485
1486 /*
1487 * Drop the least significant bits to make space for the multiplication.
1488 *
1489 * XXX: A better approach would be to use a wider integer type as an intermediate. But
1490 * this is enough to convert alpha from 16bits -> 2 when rendering to
1491 * PIPE_FORMAT_R10G10B10A2_UNORM.
1492 */
1493 result = LLVMBuildLShr(builder,
1494 src,
1495 lp_build_const_int_vec(gallivm, src_type, dst_bits),
1496 "");
1497
1498
1499 result = LLVMBuildMul(builder,
1500 result,
1501 lp_build_const_int_vec(gallivm, src_type, (1LL << dst_bits) - 1),
1502 "");
1503
1504 /*
1505 * Add a rounding term before the division.
1506 *
1507 * TODO: Handle signed integers too.
1508 */
1509 if (!src_type.sign) {
1510 result = LLVMBuildAdd(builder,
1511 result,
1512 lp_build_const_int_vec(gallivm, src_type, (1LL << (delta_bits - 1))),
1513 "");
1514 }
1515
1516 /*
1517 * Approximate the division by src_mask with a src_bits shift.
1518 *
1519 * Given the src has already been shifted by dst_bits, all we need
1520 * to do is to shift by the difference.
1521 */
1522
1523 result = LLVMBuildLShr(builder,
1524 result,
1525 lp_build_const_int_vec(gallivm, src_type, delta_bits),
1526 "");
1527 }
1528
1529 } else if (dst_bits > src_bits) {
1530 /* Scale up bits */
1531 int db = dst_bits - src_bits;
1532
1533 /* Shift left by difference in bits */
1534 result = LLVMBuildShl(builder,
1535 src,
1536 lp_build_const_int_vec(gallivm, src_type, db),
1537 "");
1538
1539 if (db <= src_bits) {
1540 /* Enough bits in src to fill the remainder */
1541 LLVMValueRef lower = LLVMBuildLShr(builder,
1542 src,
1543 lp_build_const_int_vec(gallivm, src_type, src_bits - db),
1544 "");
1545
1546 result = LLVMBuildOr(builder, result, lower, "");
1547 } else if (db > src_bits) {
1548 /* Need to repeatedly copy src bits to fill remainder in dst */
1549 unsigned n;
1550
1551 for (n = src_bits; n < dst_bits; n *= 2) {
1552 LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
1553
1554 result = LLVMBuildOr(builder,
1555 result,
1556 LLVMBuildLShr(builder, result, shuv, ""),
1557 "");
1558 }
1559 }
1560 }
1561
1562 return result;
1563 }
1564
1565 /**
1566 * If RT is a smallfloat (needing denorms) format
1567 */
1568 static inline int
1569 have_smallfloat_format(struct lp_type dst_type,
1570 enum pipe_format format)
1571 {
1572 return ((dst_type.floating && dst_type.width != 32) ||
1573 /* due to format handling hacks this format doesn't have floating set
1574 * here (and actually has width set to 32 too) so special case this. */
1575 (format == PIPE_FORMAT_R11G11B10_FLOAT));
1576 }
1577
1578
1579 /**
1580 * Convert from memory format to blending format
1581 *
1582 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
1583 */
1584 static void
1585 convert_to_blend_type(struct gallivm_state *gallivm,
1586 unsigned block_size,
1587 const struct util_format_description *src_fmt,
1588 struct lp_type src_type,
1589 struct lp_type dst_type,
1590 LLVMValueRef* src, // and dst
1591 unsigned num_srcs)
1592 {
1593 LLVMValueRef *dst = src;
1594 LLVMBuilderRef builder = gallivm->builder;
1595 struct lp_type blend_type;
1596 struct lp_type mem_type;
1597 unsigned i, j;
1598 unsigned pixels = block_size / num_srcs;
1599 bool is_arith;
1600
1601 /*
1602 * full custom path for packed floats and srgb formats - none of the later
1603 * functions would do anything useful, and given the lp_type representation they
1604 * can't be fixed. Should really have some SoA blend path for these kinds of
1605 * formats rather than hacking them in here.
1606 */
1607 if (format_expands_to_float_soa(src_fmt)) {
1608 LLVMValueRef tmpsrc[4];
1609 /*
1610 * This is pretty suboptimal for this case; blending in SoA would be much
1611 * better, since conversion gets us SoA values, so we need to convert back.
1612 */
1613 assert(src_type.width == 32 || src_type.width == 16);
1614 assert(dst_type.floating);
1615 assert(dst_type.width == 32);
1616 assert(dst_type.length % 4 == 0);
1617 assert(num_srcs % 4 == 0);
1618
1619 if (src_type.width == 16) {
1620 /* expand 4x16bit values to 4x32bit */
1621 struct lp_type type32x4 = src_type;
1622 LLVMTypeRef ltype32x4;
1623 unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
1624 type32x4.width = 32;
1625 ltype32x4 = lp_build_vec_type(gallivm, type32x4);
1626 for (i = 0; i < num_fetch; i++) {
1627 src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
1628 }
1629 src_type.width = 32;
1630 }
1631 for (i = 0; i < 4; i++) {
1632 tmpsrc[i] = src[i];
1633 }
1634 for (i = 0; i < num_srcs / 4; i++) {
1635 LLVMValueRef tmpsoa[4];
1636 LLVMValueRef tmps = tmpsrc[i];
1637 if (dst_type.length == 8) {
1638 LLVMValueRef shuffles[8];
1639 unsigned j;
1640 /* fetch was 4 values but need 8-wide output values */
1641 tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
1642 /*
1643 * for 8-wide aos, transpose would give us the wrong order, not matching
1644 * the incoming converted fs values and mask. ARGH.
1645 */
1646 for (j = 0; j < 4; j++) {
1647 shuffles[j] = lp_build_const_int32(gallivm, j * 2);
1648 shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
1649 }
1650 tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
1651 LLVMConstVector(shuffles, 8), "");
1652 }
1653 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
1654 lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
1655 }
1656 else {
1657 lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
1658 }
1659 lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
1660 }
1661 return;
1662 }
1663
1664 lp_mem_type_from_format_desc(src_fmt, &mem_type);
1665 lp_blend_type_from_format_desc(src_fmt, &blend_type);
1666
1667 /* Is the format arithmetic? */
1668 is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
1669 is_arith &= !(mem_type.width == 16 && mem_type.floating);
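/*
 * Editor's note (illustrative; the concrete formats named here are just
 * examples): "arithmetic" means the packed memory layout and the per-channel
 * blend layout differ in total size, so channels must be extracted, scaled
 * and re-inserted by hand below. For a format like R3G3B2 the memory type is
 * a single byte while the blend type is three 8-bit channels, so is_arith is
 * true; for R8G8B8A8 both sides are 32 bits and no bit fiddling is needed.
 */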
1670
1671 /* Pad if necessary */
1672 if (!is_arith && src_type.length < dst_type.length) {
1673 for (i = 0; i < num_srcs; ++i) {
1674 dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
1675 }
1676
1677 src_type.length = dst_type.length;
1678 }
1679
1680 /* Special case for half-floats */
1681 if (mem_type.width == 16 && mem_type.floating) {
1682 assert(blend_type.width == 32 && blend_type.floating);
1683 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
1684 is_arith = false;
1685 }
1686
1687 if (!is_arith) {
1688 return;
1689 }
1690
1691 src_type.width = blend_type.width * blend_type.length;
1692 blend_type.length *= pixels;
1693 src_type.length *= pixels / (src_type.length / mem_type.length);
1694
1695 for (i = 0; i < num_srcs; ++i) {
1696 LLVMValueRef chans[4];
1697 LLVMValueRef res = NULL;
1698
1699 dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
1700
1701 for (j = 0; j < src_fmt->nr_channels; ++j) {
1702 unsigned mask = 0;
1703 unsigned sa = src_fmt->channel[j].shift;
1704 #if UTIL_ARCH_LITTLE_ENDIAN
1705 unsigned from_lsb = j;
1706 #else
1707 unsigned from_lsb = src_fmt->nr_channels - j - 1;
1708 #endif
1709
1710 mask = (1 << src_fmt->channel[j].size) - 1;
1711
1712 /* Extract bits from source */
1713 chans[j] = LLVMBuildLShr(builder,
1714 dst[i],
1715 lp_build_const_int_vec(gallivm, src_type, sa),
1716 "");
1717
1718 chans[j] = LLVMBuildAnd(builder,
1719 chans[j],
1720 lp_build_const_int_vec(gallivm, src_type, mask),
1721 "");
1722
1723 /* Scale bits */
1724 if (src_type.norm) {
1725 chans[j] = scale_bits(gallivm, src_fmt->channel[j].size,
1726 blend_type.width, chans[j], src_type);
1727 }
1728
1729 /* Insert bits into correct position */
1730 chans[j] = LLVMBuildShl(builder,
1731 chans[j],
1732 lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
1733 "");
1734
1735 if (j == 0) {
1736 res = chans[j];
1737 } else {
1738 res = LLVMBuildOr(builder, res, chans[j], "");
1739 }
1740 }
1741
1742 dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
1743 }
1744 }
1745
1746
1747 /**
1748 * Convert from blending format to memory format
1749 *
1750 * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
1751 */
1752 static void
1753 convert_from_blend_type(struct gallivm_state *gallivm,
1754 unsigned block_size,
1755 const struct util_format_description *src_fmt,
1756 struct lp_type src_type,
1757 struct lp_type dst_type,
1758 LLVMValueRef* src, // and dst
1759 unsigned num_srcs)
1760 {
1761 LLVMValueRef* dst = src;
1762 unsigned i, j, k;
1763 struct lp_type mem_type;
1764 struct lp_type blend_type;
1765 LLVMBuilderRef builder = gallivm->builder;
1766 unsigned pixels = block_size / num_srcs;
1767 bool is_arith;
1768
1769 /*
1770 * Fully custom path for packed floats and srgb formats - none of the later
1771 * functions would do anything useful, and given the lp_type representation they
1772 * can't be fixed. Should really have some SoA blend path for these kinds of
1773 * formats rather than hacking them in here.
1774 */
1775 if (format_expands_to_float_soa(src_fmt)) {
1776 /*
1777 * This is pretty suboptimal; for this case blending in SoA would be much
1778 * better - we need to transpose the AoS values back to SoA values for
1779 * conversion/packing.
1780 */
1781 assert(src_type.floating);
1782 assert(src_type.width == 32);
1783 assert(src_type.length % 4 == 0);
1784 assert(dst_type.width == 32 || dst_type.width == 16);
1785
1786 for (i = 0; i < num_srcs / 4; i++) {
1787 LLVMValueRef tmpsoa[4], tmpdst;
1788 lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
1789 /* really really need SoA here */
1790
1791 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
1792 tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
1793 }
1794 else {
1795 tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
1796 src_type, tmpsoa);
1797 }
1798
1799 if (src_type.length == 8) {
1800 LLVMValueRef tmpaos, shuffles[8];
1801 unsigned j;
1802 /*
1803 * for 8-wide aos, transpose has given us the wrong order, not matching the
1804 * output order. HMPF. Also need to split the output values manually.
1805 */
1806 for (j = 0; j < 4; j++) {
1807 shuffles[j * 2] = lp_build_const_int32(gallivm, j);
1808 shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
1809 }
1810 tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
1811 LLVMConstVector(shuffles, 8), "");
1812 src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
1813 src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
1814 }
1815 else {
1816 src[i] = tmpdst;
1817 }
1818 }
1819 if (dst_type.width == 16) {
1820 struct lp_type type16x8 = dst_type;
1821 struct lp_type type32x4 = dst_type;
1822 LLVMTypeRef ltype16x4, ltypei64, ltypei128;
1823 unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
1824 type16x8.length = 8;
1825 type32x4.width = 32;
1826 ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
1827 ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
1828 ltype16x4 = lp_build_vec_type(gallivm, dst_type);
1829 /* We could do vector truncation but it doesn't generate very good code */
1830 for (i = 0; i < num_fetch; i++) {
1831 src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
1832 src[i], lp_build_zero(gallivm, type32x4));
1833 src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
1834 src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
1835 src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
1836 }
1837 }
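/*
 * Editor's note: the 16-bit path above packs 4x32bit values with
 * lp_build_pack2 and then drops the zero-filled upper half by bitcasting to
 * a 128-bit integer, truncating to 64 bits and bitcasting back to a 4x16bit
 * vector - per the comment above, this sidesteps the poor code llvm
 * generates for a direct vector truncation.
 */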
1838 return;
1839 }
1840
1841 lp_mem_type_from_format_desc(src_fmt, &mem_type);
1842 lp_blend_type_from_format_desc(src_fmt, &blend_type);
1843
1844 is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
1845
1846 /* Special case for half-floats */
1847 if (mem_type.width == 16 && mem_type.floating) {
1848 int length = dst_type.length;
1849 assert(blend_type.width == 32 && blend_type.floating);
1850
1851 dst_type.length = src_type.length;
1852
1853 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
1854
1855 dst_type.length = length;
1856 is_arith = false;
1857 }
1858
1859 /* Remove any padding */
1860 if (!is_arith && (src_type.length % mem_type.length)) {
1861 src_type.length -= (src_type.length % mem_type.length);
1862
1863 for (i = 0; i < num_srcs; ++i) {
1864 dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
1865 }
1866 }
1867
1868 /* No bit arithmetic to do */
1869 if (!is_arith) {
1870 return;
1871 }
1872
1873 src_type.length = pixels;
1874 src_type.width = blend_type.length * blend_type.width;
1875 dst_type.length = pixels;
1876
1877 for (i = 0; i < num_srcs; ++i) {
1878 LLVMValueRef chans[4];
1879 LLVMValueRef res = NULL;
1880
1881 dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
1882
1883 for (j = 0; j < src_fmt->nr_channels; ++j) {
1884 unsigned mask = 0;
1885 unsigned sa = src_fmt->channel[j].shift;
1886 unsigned sz_a = src_fmt->channel[j].size;
1887 #if UTIL_ARCH_LITTLE_ENDIAN
1888 unsigned from_lsb = j;
1889 #else
1890 unsigned from_lsb = src_fmt->nr_channels - j - 1;
1891 #endif
1892
1893 assert(blend_type.width > src_fmt->channel[j].size);
1894
1895 for (k = 0; k < blend_type.width; ++k) {
1896 mask |= 1 << k;
1897 }
1898
1899 /* Extract bits */
1900 chans[j] = LLVMBuildLShr(builder,
1901 dst[i],
1902 lp_build_const_int_vec(gallivm, src_type,
1903 from_lsb * blend_type.width),
1904 "");
1905
1906 chans[j] = LLVMBuildAnd(builder,
1907 chans[j],
1908 lp_build_const_int_vec(gallivm, src_type, mask),
1909 "");
1910
1911 /* Scale down bits */
1912 if (src_type.norm) {
1913 chans[j] = scale_bits(gallivm, blend_type.width,
1914 src_fmt->channel[j].size, chans[j], src_type);
1915 } else if (!src_type.floating && sz_a < blend_type.width) {
1916 LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
1917 LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans[j], mask_val, "");
1918 chans[j] = LLVMBuildSelect(builder, mask, mask_val, chans[j], "");
1919 }
1920
1921 /* Insert bits */
1922 chans[j] = LLVMBuildShl(builder,
1923 chans[j],
1924 lp_build_const_int_vec(gallivm, src_type, sa),
1925 "");
1926
1927 sa += src_fmt->channel[j].size;
1928
1929 if (j == 0) {
1930 res = chans[j];
1931 } else {
1932 res = LLVMBuildOr(builder, res, chans[j], "");
1933 }
1934 }
1935
1936 assert (dst_type.width != 24);
1937
1938 dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
1939 }
1940 }
1941
1942
1943 /**
1944 * Convert alpha to same blend type as src
1945 */
1946 static void
1947 convert_alpha(struct gallivm_state *gallivm,
1948 struct lp_type row_type,
1949 struct lp_type alpha_type,
1950 const unsigned block_size,
1951 const unsigned block_height,
1952 const unsigned src_count,
1953 const unsigned dst_channels,
1954 const bool pad_inline,
1955 LLVMValueRef* src_alpha)
1956 {
1957 LLVMBuilderRef builder = gallivm->builder;
1958 unsigned i, j;
1959 unsigned length = row_type.length;
1960 row_type.length = alpha_type.length;
1961
1962 /* Twiddle the alpha to match pixels */
1963 lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);
1964
1965 /*
1966 * TODO this should use single lp_build_conv call for
1967 * src_count == 1 && dst_channels == 1 case (dropping the concat below)
1968 */
1969 for (i = 0; i < block_height; ++i) {
1970 lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
1971 }
1972
1973 alpha_type = row_type;
1974 row_type.length = length;
1975
1976 /* If there is only one channel we only need a single alpha value per pixel */
1977 if (src_count == 1 && dst_channels == 1) {
1978
1979 lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height, src_alpha, src_count);
1980 } else {
1981 /* If there are more srcs than rows then we need to split alpha up */
1982 if (src_count > block_height) {
1983 for (i = src_count; i > 0; --i) {
1984 unsigned pixels = block_size / src_count;
1985 unsigned idx = i - 1;
1986
1987 src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
1988 (idx * pixels) % 4, pixels);
1989 }
1990 }
1991
1992 /* If there is a src for each pixel, broadcast the alpha across the whole row */
1993 if (src_count == block_size) {
1994 for (i = 0; i < src_count; ++i) {
1995 src_alpha[i] = lp_build_broadcast(gallivm,
1996 lp_build_vec_type(gallivm, row_type), src_alpha[i]);
1997 }
1998 } else {
1999 unsigned pixels = block_size / src_count;
2000 unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
2001 unsigned alpha_span = 1;
2002 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
2003
2004 /* Check if we need 2 src_alphas for our shuffles */
2005 if (pixels > alpha_type.length) {
2006 alpha_span = 2;
2007 }
2008
2009 /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
2010 for (j = 0; j < row_type.length; ++j) {
2011 if (j < pixels * channels) {
2012 shuffles[j] = lp_build_const_int32(gallivm, j / channels);
2013 } else {
2014 shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2015 }
2016 }
2017
2018 for (i = 0; i < src_count; ++i) {
2019 unsigned idx1 = i, idx2 = i;
2020
2021 if (alpha_span > 1){
2022 idx1 *= alpha_span;
2023 idx2 = idx1 + 1;
2024 }
2025
2026 src_alpha[i] = LLVMBuildShuffleVector(builder,
2027 src_alpha[idx1],
2028 src_alpha[idx2],
2029 LLVMConstVector(shuffles, row_type.length),
2030 "");
2031 }
2032 }
2033 }
2034 }
2035
2036
2037 /**
2038 * Generates the blend function for unswizzled colour buffers
2039 * Also generates the read & write from colour buffer
2040 */
2041 static void
2042 generate_unswizzled_blend(struct gallivm_state *gallivm,
2043 unsigned rt,
2044 struct lp_fragment_shader_variant *variant,
2045 enum pipe_format out_format,
2046 unsigned int num_fs,
2047 struct lp_type fs_type,
2048 LLVMValueRef* fs_mask,
2049 LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
2050 LLVMValueRef context_ptr,
2051 LLVMValueRef color_ptr,
2052 LLVMValueRef stride,
2053 unsigned partial_mask,
2054 boolean do_branch)
2055 {
2056 const unsigned alpha_channel = 3;
2057 const unsigned block_width = LP_RASTER_BLOCK_SIZE;
2058 const unsigned block_height = LP_RASTER_BLOCK_SIZE;
2059 const unsigned block_size = block_width * block_height;
2060 const unsigned lp_integer_vector_width = 128;
2061
2062 LLVMBuilderRef builder = gallivm->builder;
2063 LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
2064 LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
2065 LLVMValueRef src_alpha[4 * 4];
2066 LLVMValueRef src1_alpha[4 * 4] = { NULL };
2067 LLVMValueRef src_mask[4 * 4];
2068 LLVMValueRef src[4 * 4];
2069 LLVMValueRef src1[4 * 4];
2070 LLVMValueRef dst[4 * 4];
2071 LLVMValueRef blend_color;
2072 LLVMValueRef blend_alpha;
2073 LLVMValueRef i32_zero;
2074 LLVMValueRef check_mask;
2075 LLVMValueRef undef_src_val;
2076
2077 struct lp_build_mask_context mask_ctx;
2078 struct lp_type mask_type;
2079 struct lp_type blend_type;
2080 struct lp_type row_type;
2081 struct lp_type dst_type;
2082 struct lp_type ls_type;
2083
2084 unsigned char swizzle[TGSI_NUM_CHANNELS];
2085 unsigned vector_width;
2086 unsigned src_channels = TGSI_NUM_CHANNELS;
2087 unsigned dst_channels;
2088 unsigned dst_count;
2089 unsigned src_count;
2090 unsigned i, j;
2091
2092 const struct util_format_description* out_format_desc = util_format_description(out_format);
2093
2094 unsigned dst_alignment;
2095
2096 bool pad_inline = is_arithmetic_format(out_format_desc);
2097 bool has_alpha = false;
2098 const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable &&
2099 util_blend_state_is_dual(&variant->key.blend, 0);
2100
2101 const boolean is_1d = variant->key.resource_1d;
2102 boolean twiddle_after_convert = FALSE;
2103 unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
2104 LLVMValueRef fpstate = 0;
2105
2106 /* Get type from output format */
2107 lp_blend_type_from_format_desc(out_format_desc, &row_type);
2108 lp_mem_type_from_format_desc(out_format_desc, &dst_type);
2109
2110 /*
2111 * Technically this code should go into lp_build_smallfloat_to_float
2112 * and lp_build_float_to_smallfloat but due to the
2113 * http://llvm.org/bugs/show_bug.cgi?id=6393
2114 * llvm reorders the mxcsr intrinsics in a way that breaks the code.
2115 * So the ordering is important here and there shouldn't be any
2116 * llvm ir instructions in this function before
2117 * this, otherwise half-float format conversions won't work
2118 * (again due to llvm bug #6393).
2119 */
2120 if (have_smallfloat_format(dst_type, out_format)) {
2121 /* We need to make sure that denorms are ok for half float
2122 conversions */
2123 fpstate = lp_build_fpstate_get(gallivm);
2124 lp_build_fpstate_set_denorms_zero(gallivm, FALSE);
2125 }
2126
2127 mask_type = lp_int32_vec4_type();
2128 mask_type.length = fs_type.length;
2129
2130 for (i = num_fs; i < num_fullblock_fs; i++) {
2131 fs_mask[i] = lp_build_zero(gallivm, mask_type);
2132 }
2133
2134 /* Do not bother executing code when the mask is empty. */
2135 if (do_branch) {
2136 check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
2137
2138 for (i = 0; i < num_fullblock_fs; ++i) {
2139 check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
2140 }
2141
2142 lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
2143 lp_build_mask_check(&mask_ctx);
2144 }
2145
2146 partial_mask |= !variant->opaque;
2147 i32_zero = lp_build_const_int32(gallivm, 0);
2148
2149 undef_src_val = lp_build_undef(gallivm, fs_type);
2150
2151 row_type.length = fs_type.length;
2152 vector_width = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
2153
2154 /* Compute correct swizzle and count channels */
2155 memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
2156 dst_channels = 0;
2157
2158 for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
2159 /* Ensure channel is used */
2160 if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
2161 continue;
2162 }
2163
2164 /* Ensure not already written to (happens in case with GL_ALPHA) */
2165 if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
2166 continue;
2167 }
2168
2169 /* Ensure we haven't already found all channels */
2170 if (dst_channels >= out_format_desc->nr_channels) {
2171 continue;
2172 }
2173
2174 swizzle[out_format_desc->swizzle[i]] = i;
2175 ++dst_channels;
2176
2177 if (i == alpha_channel) {
2178 has_alpha = true;
2179 }
2180 }
2181
2182 if (format_expands_to_float_soa(out_format_desc)) {
2183 /*
2184 * the code above can't work for layout_other;
2185 * for srgb it would sort of work but we short-circuit swizzles, etc.,
2186 * as that is done as part of unpack / pack.
2187 */
2188 dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */
2189 has_alpha = true;
2190 swizzle[0] = 0;
2191 swizzle[1] = 1;
2192 swizzle[2] = 2;
2193 swizzle[3] = 3;
2194 pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
2195 }
2196
2197 /* If 3 channels then pad to include alpha for 4 element transpose */
2198 if (dst_channels == 3) {
2199 assert (!has_alpha);
2200 for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
2201 if (swizzle[i] > TGSI_NUM_CHANNELS)
2202 swizzle[i] = 3;
2203 }
2204 if (out_format_desc->nr_channels == 4) {
2205 dst_channels = 4;
2206 /*
2207 * We use alpha from the color conversion, not separate one.
2208 * We had to include it for transpose, hence it will get converted
2209 * too (albeit when doing transpose after conversion, that would
2210 * no longer be the case necessarily).
2211 * (It works only with 4 channel dsts, e.g. rgbx formats, because
2212 * otherwise we really have padding, not alpha, included.)
2213 */
2214 has_alpha = true;
2215 }
2216 }
2217
2218 /*
2219 * Load shader output
2220 */
2221 for (i = 0; i < num_fullblock_fs; ++i) {
2222 /* Always load alpha for use in blending */
2223 LLVMValueRef alpha;
2224 if (i < num_fs) {
2225 alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], "");
2226 }
2227 else {
2228 alpha = undef_src_val;
2229 }
2230
2231 /* Load each channel */
2232 for (j = 0; j < dst_channels; ++j) {
2233 assert(swizzle[j] < 4);
2234 if (i < num_fs) {
2235 fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], "");
2236 }
2237 else {
2238 fs_src[i][j] = undef_src_val;
2239 }
2240 }
2241
2242 /* If 3 channels then pad to include alpha for 4 element transpose */
2243 /*
2244 * XXX If we include that here maybe we could actually use it instead of
2245 * the separate alpha for blending?
2246 * (Difficult though, as we actually convert pad channels, not alpha.)
2247 */
2248 if (dst_channels == 3 && !has_alpha) {
2249 fs_src[i][3] = alpha;
2250 }
2251
2252 /* We split the row_mask and row_alpha as we want 128bit interleave */
2253 if (fs_type.length == 8) {
2254 src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i],
2255 0, src_channels);
2256 src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i],
2257 src_channels, src_channels);
2258
2259 src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2260 src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2261 src_channels, src_channels);
2262 } else {
2263 src_mask[i] = fs_mask[i];
2264 src_alpha[i] = alpha;
2265 }
2266 }
2267 if (dual_source_blend) {
2268 /* same as above except different src/dst, skip masks and comments... */
2269 for (i = 0; i < num_fullblock_fs; ++i) {
2270 LLVMValueRef alpha;
2271 if (i < num_fs) {
2272 alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], "");
2273 }
2274 else {
2275 alpha = undef_src_val;
2276 }
2277
2278 for (j = 0; j < dst_channels; ++j) {
2279 assert(swizzle[j] < 4);
2280 if (i < num_fs) {
2281 fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], "");
2282 }
2283 else {
2284 fs_src1[i][j] = undef_src_val;
2285 }
2286 }
2287 if (dst_channels == 3 && !has_alpha) {
2288 fs_src1[i][3] = alpha;
2289 }
2290 if (fs_type.length == 8) {
2291 src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2292 src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2293 src_channels, src_channels);
2294 } else {
2295 src1_alpha[i] = alpha;
2296 }
2297 }
2298 }
2299
2300 if (util_format_is_pure_integer(out_format)) {
2301 /*
2302 * In this case fs_type was really ints or uints disguised as floats,
2303 * fix that up now.
2304 */
2305 fs_type.floating = 0;
2306 fs_type.sign = dst_type.sign;
2307 for (i = 0; i < num_fullblock_fs; ++i) {
2308 for (j = 0; j < dst_channels; ++j) {
2309 fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
2310 lp_build_vec_type(gallivm, fs_type), "");
2311 }
2312 if (dst_channels == 3 && !has_alpha) {
2313 fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
2314 lp_build_vec_type(gallivm, fs_type), "");
2315 }
2316 }
2317 }
2318
2319 /*
2320 * We actually should generally do conversion first (for non-1d cases)
2321 * when the blend format is 8 or 16 bits. The reason is obvious:
2322 * there are 2 or 4 times fewer vectors to deal with for the interleave...
2323 * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
2324 * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
2325 * unpack only with 128bit vectors).
2326 * Note: for 16bit sizes really need matching pack conversion code
2327 */
2328 if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
2329 twiddle_after_convert = TRUE;
2330 }
2331
2332 /*
2333 * Pixel twiddle from fragment shader order to memory order
2334 */
2335 if (!twiddle_after_convert) {
2336 src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
2337 dst_channels, fs_src, src, pad_inline);
2338 if (dual_source_blend) {
2339 generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
2340 fs_src1, src1, pad_inline);
2341 }
2342 } else {
2343 src_count = num_fullblock_fs * dst_channels;
2344 /*
2345 * We reorder things a bit here, so the cases for 4-wide and 8-wide
2346 * (AVX) turn out the same later when untwiddling/transpose (albeit
2347 * for true AVX2 path untwiddle needs to be different).
2348 * For now just order by colors first (so we can use unpack later).
2349 */
2350 for (j = 0; j < num_fullblock_fs; j++) {
2351 for (i = 0; i < dst_channels; i++) {
2352 src[i*num_fullblock_fs + j] = fs_src[j][i];
2353 if (dual_source_blend) {
2354 src1[i*num_fullblock_fs + j] = fs_src1[j][i];
2355 }
2356 }
2357 }
2358 }
2359
2360 src_channels = dst_channels < 3 ? dst_channels : 4;
2361 if (src_count != num_fullblock_fs * src_channels) {
2362 unsigned ds = src_count / (num_fullblock_fs * src_channels);
2363 row_type.length /= ds;
2364 fs_type.length = row_type.length;
2365 }
2366
2367 blend_type = row_type;
2368 mask_type.length = 4;
2369
2370 /* Convert src to row_type */
2371 if (dual_source_blend) {
2372 struct lp_type old_row_type = row_type;
2373 lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2374 src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type, src1, src_count, src1);
2375 }
2376 else {
2377 src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2378 }
2379
2380 /* If the rows are not an SSE vector, combine them to become SSE size! */
2381 if ((row_type.width * row_type.length) % 128) {
2382 unsigned bits = row_type.width * row_type.length;
2383 unsigned combined;
2384
2385 assert(src_count >= (vector_width / bits));
2386
2387 dst_count = src_count / (vector_width / bits);
2388
2389 combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count);
2390 if (dual_source_blend) {
2391 lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
2392 }
2393
2394 row_type.length *= combined;
2395 src_count /= combined;
2396
2397 bits = row_type.width * row_type.length;
2398 assert(bits == 128 || bits == 256);
2399 }
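/*
 * Editor's note (worked example, widths are illustrative): after converting
 * to an 8-bit unorm blend format with 4-wide fs vectors, each row is only
 * 4x8 = 32 bits, so four such rows get concatenated into a single 128-bit
 * vector here before blending, leaving src_count four times smaller.
 */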
2400
2401 if (twiddle_after_convert) {
2402 fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
2403 if (dual_source_blend) {
2404 fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
2405 }
2406 }
2407
2408 /*
2409 * Blend Colour conversion
2410 */
2411 blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
2412 blend_color = LLVMBuildPointerCast(builder, blend_color,
2413 LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
2414 blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color,
2415 &i32_zero, 1, ""), "");
2416
2417 /* Convert */
2418 lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
2419
2420 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
2421 /*
2422 * since blending is done with floats, there was no conversion.
2423 * However, the rules according to fixed point renderbuffers still
2424 * apply, that is we must clamp inputs to 0.0/1.0.
2425 * (This would apply to separate alpha conversion too but we currently
2426 * force has_alpha to be true.)
2427 * TODO: should skip this with "fake" blend, since post-blend conversion
2428 * will clamp anyway.
2429 * TODO: could also skip this if fragment color clamping is enabled. We
2430 * don't support it natively so it gets baked into the shader however, so
2431 * can't really tell here.
2432 */
2433 struct lp_build_context f32_bld;
2434 assert(row_type.floating);
2435 lp_build_context_init(&f32_bld, gallivm, row_type);
2436 for (i = 0; i < src_count; i++) {
2437 src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
2438 }
2439 if (dual_source_blend) {
2440 for (i = 0; i < src_count; i++) {
2441 src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
2442 }
2443 }
2444 /* probably can't be different than row_type but better safe than sorry... */
2445 lp_build_context_init(&f32_bld, gallivm, blend_type);
2446 blend_color = lp_build_clamp(&f32_bld, blend_color, f32_bld.zero, f32_bld.one);
2447 }
2448
2449 /* Extract alpha */
2450 blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3));
2451
2452 /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
2453 pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width;
2454 if (pad_inline) {
2455 /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
2456 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length);
2457 } else {
2458 /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
2459 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length);
2460 }
2461
2462 /*
2463 * Mask conversion
2464 */
2465 lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], block_height, &src_mask[0]);
2466
2467 if (src_count < block_height) {
2468 lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
2469 } else if (src_count > block_height) {
2470 for (i = src_count; i > 0; --i) {
2471 unsigned pixels = block_size / src_count;
2472 unsigned idx = i - 1;
2473
2474 src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4],
2475 (idx * pixels) % 4, pixels);
2476 }
2477 }
2478
2479 assert(mask_type.width == 32);
2480
2481 for (i = 0; i < src_count; ++i) {
2482 unsigned pixels = block_size / src_count;
2483 unsigned pixel_width = row_type.width * dst_channels;
2484
2485 if (pixel_width == 24) {
2486 mask_type.width = 8;
2487 mask_type.length = vector_width / mask_type.width;
2488 } else {
2489 mask_type.length = pixels;
2490 mask_type.width = row_type.width * dst_channels;
2491
2492 /*
2493 * If mask_type width is smaller than 32bit, this doesn't quite
2494 * generate the most efficient code (could use some pack).
2495 */
2496 src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
2497 lp_build_int_vec_type(gallivm, mask_type), "");
2498
2499 mask_type.length *= dst_channels;
2500 mask_type.width /= dst_channels;
2501 }
2502
2503 src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
2504 lp_build_int_vec_type(gallivm, mask_type), "");
2505 src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
2506 }
2507
2508 /*
2509 * Alpha conversion
2510 */
2511 if (!has_alpha) {
2512 struct lp_type alpha_type = fs_type;
2513 alpha_type.length = 4;
2514 convert_alpha(gallivm, row_type, alpha_type,
2515 block_size, block_height,
2516 src_count, dst_channels,
2517 pad_inline, src_alpha);
2518 if (dual_source_blend) {
2519 convert_alpha(gallivm, row_type, alpha_type,
2520 block_size, block_height,
2521 src_count, dst_channels,
2522 pad_inline, src1_alpha);
2523 }
2524 }
2525
2526
2527 /*
2528 * Load dst from memory
2529 */
2530 if (src_count < block_height) {
2531 dst_count = block_height;
2532 } else {
2533 dst_count = src_count;
2534 }
2535
2536 dst_type.length *= block_size / dst_count;
2537
2538 if (format_expands_to_float_soa(out_format_desc)) {
2539 /*
2540 * we need multiple values at once for the conversion, so we may as well
2541 * load them vectorized here too instead of concatenating later.
2542 * (Still need concatenation later for 8-wide vectors).
2543 */
2544 dst_count = block_height;
2545 dst_type.length = block_width;
2546 }
2547
2548 /*
2549 * Compute the alignment of the destination pointer in bytes.
2550 * We fetch 1-4 pixels; if the format has pot alignment then those fetches
2551 * are always aligned by MIN2(16, fetch_width), except for buffers (not
2552 * 1d tex but we can't distinguish them here), so we need to stick with
2553 * per-pixel alignment in that case.
2554 */
2555 if (is_1d) {
2556 dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
2557 }
2558 else {
2559 dst_alignment = dst_type.length * dst_type.width / 8;
2560 }
2561 /* Force power-of-two alignment by extracting only the least significant set bit */
2562 dst_alignment = 1 << (ffs(dst_alignment) - 1);
2563 /*
2564 * Resource base and stride pointers are aligned to 16 bytes, so that's
2565 * the maximum alignment we can guarantee
2566 */
2567 dst_alignment = MIN2(16, dst_alignment);
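/*
 * Editor's note (worked example): in the non-1d case with a 4x32bit
 * dst_type this gives 16 bytes (ffs(16) - 1 == 4, 1 << 4 == 16, capped at
 * 16), whereas a 1d R8 target gets (8 + 7) / 8 == 1, i.e. per-pixel
 * alignment only.
 */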
2568
2569 ls_type = dst_type;
2570
2571 if (dst_count > src_count) {
2572 if ((dst_type.width == 8 || dst_type.width == 16) &&
2573 util_is_power_of_two_or_zero(dst_type.length) &&
2574 dst_type.length * dst_type.width < 128) {
2575 /*
2576 * Never try to load values as 4xi8 which we will then
2577 * concatenate to larger vectors. This gives llvm a real
2578 * headache (the problem is the type legalizer (?) will
2579 * try to load that as 4xi8 zext to 4xi32 to fill the vector,
2580 * then the shuffles to concatenate are more or less impossible
2581 * - llvm is easily capable of generating a sequence of 32
2582 * pextrb/pinsrb instructions for that. Albeit it appears to
2583 * be fixed in llvm 4.0. So, load and concatenate with 32bit
2584 * width to avoid the trouble (16bit seems not as bad, llvm
2585 * probably recognizes the load+shuffle as only one shuffle
2586 * is necessary, but we can do just the same anyway).
2587 */
2588 ls_type.length = dst_type.length * dst_type.width / 32;
2589 ls_type.width = 32;
2590 }
2591 }
2592
2593 if (is_1d) {
2594 load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
2595 dst, ls_type, dst_count / 4, dst_alignment);
2596 for (i = dst_count / 4; i < dst_count; i++) {
2597 dst[i] = lp_build_undef(gallivm, ls_type);
2598 }
2599
2600 }
2601 else {
2602 load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
2603 dst, ls_type, dst_count, dst_alignment);
2604 }
2605
2606
2607 /*
2608 * Convert from dst/output format to src/blending format.
2609 *
2610 * This is necessary as we can only read 1 row from memory at a time,
2611 * so the minimum dst_count we will ever have at this point is 4.
2612 *
2613 * With, for example, the R8 format you can have all 16 pixels in a 128 bit
2614 * vector; this will take the 4 dsts and combine them into 1 src so we can
2615 * perform blending on all 16 pixels in that single vector at once.
2616 */
2617 if (dst_count > src_count) {
2618 if (ls_type.length != dst_type.length && ls_type.length == 1) {
2619 LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
2620 LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
2621 for (i = 0; i < dst_count; i++) {
2622 dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
2623 }
2624 }
2625
2626 lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
2627
2628 if (ls_type.length != dst_type.length) {
2629 struct lp_type tmp_type = dst_type;
2630 tmp_type.length = dst_type.length * 4 / src_count;
2631 for (i = 0; i < src_count; i++) {
2632 dst[i] = LLVMBuildBitCast(builder, dst[i],
2633 lp_build_vec_type(gallivm, tmp_type), "");
2634 }
2635 }
2636 }
2637
2638 /*
2639 * Blending
2640 */
2641 /* XXX this is broken for RGB8 formats -
2642 * they get expanded from 12 to 16 elements (to include alpha)
2643 * by convert_to_blend_type then reduced to 15 instead of 12
2644 * by convert_from_blend_type (a simple fix though breaks A8...).
2645 * R16G16B16 also crashes differently however something going wrong
2646 * inside llvm handling npot vector sizes seemingly.
2647 * It seems some cleanup could be done here (like skipping conversion/blend
2648 * when not needed).
2649 */
2650 convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
2651 row_type, dst, src_count);
2652
2653 /*
2654 * FIXME: Really should get logic ops / masks out of generic blend / row
2655 * format. Logic ops will definitely not work on the blend float format
2656 * used for SRGB here and I think OpenGL expects this to work as expected
2657 * (that is incoming values converted to srgb then logic op applied).
2658 */
2659 for (i = 0; i < src_count; ++i) {
2660 dst[i] = lp_build_blend_aos(gallivm,
2661 &variant->key.blend,
2662 out_format,
2663 row_type,
2664 rt,
2665 src[i],
2666 has_alpha ? NULL : src_alpha[i],
2667 src1[i],
2668 has_alpha ? NULL : src1_alpha[i],
2669 dst[i],
2670 partial_mask ? src_mask[i] : NULL,
2671 blend_color,
2672 has_alpha ? NULL : blend_alpha,
2673 swizzle,
2674 pad_inline ? 4 : dst_channels);
2675 }
2676
2677 convert_from_blend_type(gallivm, block_size, out_format_desc,
2678 row_type, dst_type, dst, src_count);
2679
2680 /* Split the blend rows back to memory rows */
2681 if (dst_count > src_count) {
2682 row_type.length = dst_type.length * (dst_count / src_count);
2683
2684 if (src_count == 1) {
2685 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
2686 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
2687
2688 row_type.length /= 2;
2689 src_count *= 2;
2690 }
2691
2692 dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2);
2693 dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
2694 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
2695 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
2696
2697 row_type.length /= 2;
2698 src_count *= 2;
2699 }
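/*
 * Editor's note: e.g. for an R8 target where all 16 pixels were blended in
 * one vector (src_count == 1, dst_count == 4), the block above splits that
 * vector in half and then splits both halves again, recovering the four
 * per-row vectors the store below expects.
 */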
2700
2701 /*
2702 * Store blend result to memory
2703 */
2704 if (is_1d) {
2705 store_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
2706 dst, dst_type, dst_count / 4, dst_alignment);
2707 }
2708 else {
2709 store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
2710 dst, dst_type, dst_count, dst_alignment);
2711 }
2712
2713 if (have_smallfloat_format(dst_type, out_format)) {
2714 lp_build_fpstate_set(gallivm, fpstate);
2715 }
2716
2717 if (do_branch) {
2718 lp_build_mask_end(&mask_ctx);
2719 }
2720 }
2721
2722
2723 /**
2724 * Generate the runtime callable function for the whole fragment pipeline.
2725 * Note that the function which we generate operates on a block of 16
2726 * pixels at a time. The block contains 2x2 quads. Each quad contains
2727 * 2x2 pixels.
2728 */
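/*
 * Editor's sketch of the layout described above (quad numbering is
 * illustrative only):
 *
 *   +----+----+   each cell is a 2x2 quad, so the whole stamp
 *   | Q0 | Q1 |   covers 4x4 = 16 pixels; the generated function
 *   +----+----+   consumes fs_type.length pixels per inner loop
 *   | Q2 | Q3 |   iteration (see num_fs below).
 *   +----+----+
 */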
2729 static void
2730 generate_fragment(struct llvmpipe_context *lp,
2731 struct lp_fragment_shader *shader,
2732 struct lp_fragment_shader_variant *variant,
2733 unsigned partial_mask)
2734 {
2735 struct gallivm_state *gallivm = variant->gallivm;
2736 struct lp_fragment_shader_variant_key *key = &variant->key;
2737 struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
2738 char func_name[64];
2739 struct lp_type fs_type;
2740 struct lp_type blend_type;
2741 LLVMTypeRef fs_elem_type;
2742 LLVMTypeRef blend_vec_type;
2743 LLVMTypeRef arg_types[15];
2744 LLVMTypeRef func_type;
2745 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
2746 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
2747 LLVMValueRef context_ptr;
2748 LLVMValueRef x;
2749 LLVMValueRef y;
2750 LLVMValueRef a0_ptr;
2751 LLVMValueRef dadx_ptr;
2752 LLVMValueRef dady_ptr;
2753 LLVMValueRef color_ptr_ptr;
2754 LLVMValueRef stride_ptr;
2755 LLVMValueRef color_sample_stride_ptr;
2756 LLVMValueRef depth_ptr;
2757 LLVMValueRef depth_stride;
2758 LLVMValueRef depth_sample_stride;
2759 LLVMValueRef mask_input;
2760 LLVMValueRef thread_data_ptr;
2761 LLVMBasicBlockRef block;
2762 LLVMBuilderRef builder;
2763 struct lp_build_sampler_soa *sampler;
2764 struct lp_build_image_soa *image;
2765 struct lp_build_interp_soa_context interp;
2766 LLVMValueRef fs_mask[(16 / 4) * LP_MAX_SAMPLES];
2767 LLVMValueRef fs_out_color[LP_MAX_SAMPLES][PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
2768 LLVMValueRef function;
2769 LLVMValueRef facing;
2770 unsigned num_fs;
2771 unsigned i;
2772 unsigned chan;
2773 unsigned cbuf;
2774 boolean cbuf0_write_all;
2775 const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
2776 util_blend_state_is_dual(&key->blend, 0);
2777
2778 assert(lp_native_vector_width / 32 >= 4);
2779
2780 /* Adjust color input interpolation according to flatshade state:
2781 */
2782 memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
2783 for (i = 0; i < shader->info.base.num_inputs; i++) {
2784 if (inputs[i].interp == LP_INTERP_COLOR) {
2785 if (key->flatshade)
2786 inputs[i].interp = LP_INTERP_CONSTANT;
2787 else
2788 inputs[i].interp = LP_INTERP_PERSPECTIVE;
2789 }
2790 }
2791
2792 /* check if writes to cbuf[0] are to be copied to all cbufs */
2793 cbuf0_write_all =
2794 shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
2795
2796 /* TODO: actually pick these based on the fs and color buffer
2797 * characteristics. */
2798
2799 memset(&fs_type, 0, sizeof fs_type);
2800 fs_type.floating = TRUE; /* floating point values */
2801 fs_type.sign = TRUE; /* values are signed */
2802 fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
2803 fs_type.width = 32; /* 32-bit float */
2804 fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
2805
2806 memset(&blend_type, 0, sizeof blend_type);
2807 blend_type.floating = FALSE; /* values are integers */
2808 blend_type.sign = FALSE; /* values are unsigned */
2809 blend_type.norm = TRUE; /* values are in [0,1] or [-1,1] */
2810 blend_type.width = 8; /* 8-bit ubyte values */
2811 blend_type.length = 16; /* 16 elements per vector */
2812
2813 /*
2814 * Generate the function prototype. Any change here must be reflected in
2815 * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
2816 */
2817
2818 fs_elem_type = lp_build_elem_type(gallivm, fs_type);
2819
2820 blend_vec_type = lp_build_vec_type(gallivm, blend_type);
2821
2822 snprintf(func_name, sizeof(func_name), "fs_variant_%s",
2823 partial_mask ? "partial" : "whole");
2824
2825 arg_types[0] = variant->jit_context_ptr_type; /* context */
2826 arg_types[1] = int32_type; /* x */
2827 arg_types[2] = int32_type; /* y */
2828 arg_types[3] = int32_type; /* facing */
2829 arg_types[4] = LLVMPointerType(fs_elem_type, 0); /* a0 */
2830 arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* dadx */
2831 arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dady */
2832 arg_types[7] = LLVMPointerType(LLVMPointerType(int8_type, 0), 0); /* color */
2833 arg_types[8] = LLVMPointerType(int8_type, 0); /* depth */
2834 arg_types[9] = LLVMInt64TypeInContext(gallivm->context); /* mask_input */
2835 arg_types[10] = variant->jit_thread_data_ptr_type; /* per thread data */
2836 arg_types[11] = LLVMPointerType(int32_type, 0); /* stride */
2837 arg_types[12] = int32_type; /* depth_stride */
2838 arg_types[13] = LLVMPointerType(int32_type, 0); /* color sample strides */
2839 arg_types[14] = int32_type; /* depth sample stride */
2840
2841 func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
2842 arg_types, ARRAY_SIZE(arg_types), 0);
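/*
 * Editor's sketch: the prototype built above corresponds roughly to the
 * following C function pointer (parameter names are illustrative; the
 * authoritative definition is the lp_jit_frag_func type in lp_jit.h):
 *
 *   void (*frag_func)(void *context, int32_t x, int32_t y, int32_t facing,
 *                     const float *a0, const float *dadx, const float *dady,
 *                     uint8_t **color, uint8_t *depth, uint64_t mask_input,
 *                     void *thread_data, int32_t *stride,
 *                     int32_t depth_stride, int32_t *color_sample_stride,
 *                     int32_t depth_sample_stride);
 */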
2843
2844 function = LLVMAddFunction(gallivm->module, func_name, func_type);
2845 LLVMSetFunctionCallConv(function, LLVMCCallConv);
2846
2847 variant->function[partial_mask] = function;
2848
2849 /* XXX: need to propagate noalias down into color param now we are
2850 * passing a pointer-to-pointer?
2851 */
2852 for(i = 0; i < ARRAY_SIZE(arg_types); ++i)
2853 if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
2854 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
2855
2856 if (variant->gallivm->cache->data_size)
2857 return;
2858
2859 context_ptr = LLVMGetParam(function, 0);
2860 x = LLVMGetParam(function, 1);
2861 y = LLVMGetParam(function, 2);
2862 facing = LLVMGetParam(function, 3);
2863 a0_ptr = LLVMGetParam(function, 4);
2864 dadx_ptr = LLVMGetParam(function, 5);
2865 dady_ptr = LLVMGetParam(function, 6);
2866 color_ptr_ptr = LLVMGetParam(function, 7);
2867 depth_ptr = LLVMGetParam(function, 8);
2868 mask_input = LLVMGetParam(function, 9);
2869 thread_data_ptr = LLVMGetParam(function, 10);
2870 stride_ptr = LLVMGetParam(function, 11);
2871 depth_stride = LLVMGetParam(function, 12);
2872 color_sample_stride_ptr = LLVMGetParam(function, 13);
2873 depth_sample_stride = LLVMGetParam(function, 14);
2874
2875 lp_build_name(context_ptr, "context");
2876 lp_build_name(x, "x");
2877 lp_build_name(y, "y");
2878 lp_build_name(a0_ptr, "a0");
2879 lp_build_name(dadx_ptr, "dadx");
2880 lp_build_name(dady_ptr, "dady");
2881 lp_build_name(color_ptr_ptr, "color_ptr_ptr");
2882 lp_build_name(depth_ptr, "depth");
2883 lp_build_name(mask_input, "mask_input");
2884 lp_build_name(thread_data_ptr, "thread_data");
2885 lp_build_name(stride_ptr, "stride_ptr");
2886 lp_build_name(depth_stride, "depth_stride");
2887 lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
2888 lp_build_name(depth_sample_stride, "depth_sample_stride");
2889
2890 /*
2891 * Function body
2892 */
2893
2894 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
2895 builder = gallivm->builder;
2896 assert(builder);
2897 LLVMPositionBuilderAtEnd(builder, block);
2898
2899 /*
2900 * Must not count ps invocations if there's a null shader.
2901 * (It would be ok to count with null shader if there's d/s tests,
2902 * but only if there's d/s buffers too, which is different
2903 * to implicit rasterization disable which must not depend
2904 * on the d/s buffers.)
2905 * Could use popcount on mask, but pixel accuracy is not required.
2906 * Could disable if there's no stats query, but maybe not worth it.
2907 */
2908 if (shader->info.base.num_instructions > 1) {
2909 LLVMValueRef invocs, val;
2910 invocs = lp_jit_thread_data_invocations(gallivm, thread_data_ptr);
2911 val = LLVMBuildLoad(builder, invocs, "");
2912 val = LLVMBuildAdd(builder, val,
2913 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 1, 0),
2914 "invoc_count");
2915 LLVMBuildStore(builder, val, invocs);
2916 }
2917
2918 /* code generated texture sampling */
2919 sampler = lp_llvm_sampler_soa_create(key->samplers, key->nr_samplers);
2920 image = lp_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
2921
2922 num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
2923 /* for 1d resources only run "upper half" of stamp */
2924 if (key->resource_1d)
2925 num_fs /= 2;
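/*
 * Editor's note: e.g. with 4-wide SSE vectors num_fs is 4 iterations per
 * 4x4 stamp, with 8-wide AVX it is 2, and for 1d resources it is halved
 * again since only the "upper half" of the stamp is run.
 */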
2926
2927 {
2928 LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
2929 LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
2930 LLVMValueRef num_loop_samp = lp_build_const_int32(gallivm, num_fs * key->coverage_samples);
2931 LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type,
2932 num_loop_samp, "mask_store");
2933
2934 LLVMTypeRef flt_type = LLVMFloatTypeInContext(gallivm->context);
2935 LLVMValueRef glob_sample_pos = LLVMAddGlobal(gallivm->module, LLVMArrayType(flt_type, key->coverage_samples * 2), "");
2936 LLVMValueRef sample_pos_array;
2937
2938 if (key->multisample && key->coverage_samples == 4) {
2939 LLVMValueRef sample_pos_arr[8];
2940 for (unsigned i = 0; i < 4; i++) {
2941 sample_pos_arr[i * 2] = LLVMConstReal(flt_type, lp_sample_pos_4x[i][0]);
2942 sample_pos_arr[i * 2 + 1] = LLVMConstReal(flt_type, lp_sample_pos_4x[i][1]);
2943 }
2944 sample_pos_array = LLVMConstArray(LLVMFloatTypeInContext(gallivm->context), sample_pos_arr, 8);
2945 } else {
2946 LLVMValueRef sample_pos_arr[2];
2947 sample_pos_arr[0] = LLVMConstReal(flt_type, 0.5);
2948 sample_pos_arr[1] = LLVMConstReal(flt_type, 0.5);
2949 sample_pos_array = LLVMConstArray(LLVMFloatTypeInContext(gallivm->context), sample_pos_arr, 2);
2950 }
2951 LLVMSetInitializer(glob_sample_pos, sample_pos_array);
2952
2953 LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
2954 boolean pixel_center_integer =
2955 shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER];
2956
2957 /*
2958 * The shader input interpolation info is not explicitly baked into the
2959 * shader key, but everything it derives from (TGSI, and flatshade) is
2960 * already included in the shader key.
2961 */
2962 lp_build_interp_soa_init(&interp,
2963 gallivm,
2964 shader->info.base.num_inputs,
2965 inputs,
2966 pixel_center_integer,
2967 key->coverage_samples, glob_sample_pos,
2968 num_loop,
2969 key->depth_clamp,
2970 builder, fs_type,
2971 a0_ptr, dadx_ptr, dady_ptr,
2972 x, y);
2973
2974 for (i = 0; i < num_fs; i++) {
2975 if (key->multisample) {
2976 LLVMValueRef smask_val = LLVMBuildLoad(builder, lp_jit_context_sample_mask(gallivm, context_ptr), "");
2977
2978 /*
2979 * For multisampling, extract the per-sample mask from the incoming 64-bit mask,
2980 * store to the per-sample mask storage. OR all of them together to generate
2981 * the fragment shader mask. (sample shading TODO).
2982 * Take the incoming state coverage mask into account.
2983 */
2984 for (unsigned s = 0; s < key->coverage_samples; s++) {
2985 LLVMValueRef sindexi = lp_build_const_int32(gallivm, i + (s * num_fs));
2986 LLVMValueRef sample_mask_ptr = LLVMBuildGEP(builder, mask_store,
2987 &sindexi, 1, "sample_mask_ptr");
2988 LLVMValueRef s_mask = generate_quad_mask(gallivm, fs_type,
2989 i*fs_type.length/4, s, mask_input);
2990
2991 LLVMValueRef smask_bit = LLVMBuildAnd(builder, smask_val, lp_build_const_int32(gallivm, (1 << s)), "");
2992 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int32(gallivm, 0), "");
2993 smask_bit = LLVMBuildSExt(builder, cmp, int32_type, "");
2994 smask_bit = lp_build_broadcast(gallivm, mask_type, smask_bit);
2995
2996 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
2997 LLVMBuildStore(builder, s_mask, sample_mask_ptr);
2998 }
2999 } else {
3000 LLVMValueRef mask;
3001 LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
3002 LLVMValueRef mask_ptr = LLVMBuildGEP(builder, mask_store,
3003 &indexi, 1, "mask_ptr");
3004
3005 if (partial_mask) {
3006 mask = generate_quad_mask(gallivm, fs_type,
3007 i*fs_type.length/4, 0, mask_input);
3008 }
3009 else {
3010 mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
3011 }
3012 LLVMBuildStore(builder, mask, mask_ptr);
3013 }
3014 }
3015
3016 generate_fs_loop(gallivm,
3017 shader, key,
3018 builder,
3019 fs_type,
3020 context_ptr,
3021 glob_sample_pos,
3022 num_loop,
3023 &interp,
3024 sampler,
3025 image,
3026 mask_store, /* output */
3027 color_store,
3028 depth_ptr,
3029 depth_stride,
3030 depth_sample_stride,
3031 facing,
3032 thread_data_ptr);
3033
3034 for (i = 0; i < num_fs; i++) {
3035 LLVMValueRef ptr;
3036 for (unsigned s = 0; s < key->coverage_samples; s++) {
3037 int idx = (i + (s * num_fs));
3038 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3039 ptr = LLVMBuildGEP(builder, mask_store, &sindexi, 1, "");
3040
3041 fs_mask[idx] = LLVMBuildLoad(builder, ptr, "smask");
3042 }
3043
3044 for (unsigned s = 0; s < key->min_samples; s++) {
3045 /* This is messed up; we need to reorganize things here */
3046 int idx = s * num_fs + i;
3047 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3048 for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3049 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3050 ptr = LLVMBuildGEP(builder,
3051 color_store[cbuf * !cbuf0_write_all][chan],
3052 &sindexi, 1, "");
3053 fs_out_color[s][cbuf][chan][i] = ptr;
3054 }
3055 }
3056 if (dual_source_blend) {
3057 /* only support one dual source blend target hence always use output 1 */
3058 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3059 ptr = LLVMBuildGEP(builder,
3060 color_store[1][chan],
3061 &sindexi, 1, "");
3062 fs_out_color[s][1][chan][i] = ptr;
3063 }
3064 }
3065 }
3066 }
3067 }
3068
3069 sampler->destroy(sampler);
3070 image->destroy(image);
3071 /* Loop over color outputs / color buffers to do blending.
3072 */
3073 for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3074 if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE) {
3075 LLVMValueRef color_ptr;
3076 LLVMValueRef stride;
3077 LLVMValueRef sample_stride = NULL;
3078 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
3079
3080 boolean do_branch = ((key->depth.enabled
3081 || key->stencil[0].enabled
3082 || key->alpha.enabled)
3083 && !shader->info.base.uses_kill);
3084
3085 color_ptr = LLVMBuildLoad(builder,
3086 LLVMBuildGEP(builder, color_ptr_ptr,
3087 &index, 1, ""),
3088 "");
3089
3090 stride = LLVMBuildLoad(builder,
3091 LLVMBuildGEP(builder, stride_ptr, &index, 1, ""),
3092 "");
3093
3094 if (key->multisample)
3095 sample_stride = LLVMBuildLoad(builder,
3096 LLVMBuildGEP(builder, color_sample_stride_ptr,
3097 &index, 1, ""), "");
3098
3099 for (unsigned s = 0; s < key->cbuf_nr_samples[cbuf]; s++) {
3100 unsigned mask_idx = num_fs * (key->multisample ? s : 0);
3101 unsigned out_idx = key->min_samples == 1 ? 0 : s;
3102 LLVMValueRef out_ptr = color_ptr;
3103
3104 if (key->multisample) {
3105 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, lp_build_const_int32(gallivm, s), "");
3106 out_ptr = LLVMBuildGEP(builder, out_ptr, &sample_offset, 1, "");
3107 }
3108 out_ptr = LLVMBuildBitCast(builder, out_ptr, LLVMPointerType(blend_vec_type, 0), "");
3109
3110 lp_build_name(out_ptr, "color_ptr%d", cbuf);
3111
3112 generate_unswizzled_blend(gallivm, cbuf, variant,
3113 key->cbuf_format[cbuf],
3114 num_fs, fs_type, &fs_mask[mask_idx], fs_out_color[out_idx],
3115 context_ptr, out_ptr, stride,
3116 partial_mask, do_branch);
3117 }
3118 }
3119 }
3120
3121 LLVMBuildRetVoid(builder);
3122
3123 gallivm_verify_function(gallivm, function);
3124 }
3125
3126
3127 static void
3128 dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
3129 {
3130 unsigned i;
3131
3132 debug_printf("fs variant %p:\n", (void *) key);
3133
3134 if (key->flatshade) {
3135 debug_printf("flatshade = 1\n");
3136 }
3137 if (key->multisample) {
3138 debug_printf("multisample = 1\n");
3139 debug_printf("coverage samples = %d\n", key->coverage_samples);
3140 debug_printf("min samples = %d\n", key->min_samples);
3141 }
3142 for (i = 0; i < key->nr_cbufs; ++i) {
3143 debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
3144 debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
3145 }
3146 if (key->depth.enabled || key->stencil[0].enabled) {
3147 debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
3148 debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
3149 }
3150 if (key->depth.enabled) {
3151 debug_printf("depth.func = %s\n", util_str_func(key->depth.func, TRUE));
3152 debug_printf("depth.writemask = %u\n", key->depth.writemask);
3153 }
3154
3155 for (i = 0; i < 2; ++i) {
3156 if (key->stencil[i].enabled) {
3157 debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, TRUE));
3158 debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, TRUE));
3159 debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, TRUE));
3160 debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, TRUE));
3161 debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
3162 debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
3163 }
3164 }
3165
3166 if (key->alpha.enabled) {
3167 debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, TRUE));
3168 }
3169
3170 if (key->occlusion_count) {
3171 debug_printf("occlusion_count = 1\n");
3172 }
3173
3174 if (key->blend.logicop_enable) {
3175 debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, TRUE));
3176 }
3177 else if (key->blend.rt[0].blend_enable) {
3178 debug_printf("blend.rgb_func = %s\n", util_str_blend_func (key->blend.rt[0].rgb_func, TRUE));
3179 debug_printf("blend.rgb_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
3180 debug_printf("blend.rgb_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
3181 debug_printf("blend.alpha_func = %s\n", util_str_blend_func (key->blend.rt[0].alpha_func, TRUE));
3182 debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
3183 debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
3184 }
3185 debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
3186 if (key->blend.alpha_to_coverage) {
3187 debug_printf("blend.alpha_to_coverage is enabled\n");
3188 }
3189 for (i = 0; i < key->nr_samplers; ++i) {
3190 const struct lp_static_sampler_state *sampler = &key->samplers[i].sampler_state;
3191 debug_printf("sampler[%u] = \n", i);
3192 debug_printf(" .wrap = %s %s %s\n",
3193 util_str_tex_wrap(sampler->wrap_s, TRUE),
3194 util_str_tex_wrap(sampler->wrap_t, TRUE),
3195 util_str_tex_wrap(sampler->wrap_r, TRUE));
3196 debug_printf(" .min_img_filter = %s\n",
3197 util_str_tex_filter(sampler->min_img_filter, TRUE));
3198 debug_printf(" .min_mip_filter = %s\n",
3199 util_str_tex_mipfilter(sampler->min_mip_filter, TRUE));
3200 debug_printf(" .mag_img_filter = %s\n",
3201 util_str_tex_filter(sampler->mag_img_filter, TRUE));
3202 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
3203 debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE));
3204 debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords);
3205 debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
3206 debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
3207 debug_printf(" .apply_min_lod = %u\n", sampler->apply_min_lod);
3208 debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod);
3209 }
3210 for (i = 0; i < key->nr_sampler_views; ++i) {
3211 const struct lp_static_texture_state *texture = &key->samplers[i].texture_state;
3212 debug_printf("texture[%u] = \n", i);
3213 debug_printf(" .format = %s\n",
3214 util_format_name(texture->format));
3215 debug_printf(" .target = %s\n",
3216 util_str_tex_target(texture->target, TRUE));
3217 debug_printf(" .level_zero_only = %u\n",
3218 texture->level_zero_only);
3219 debug_printf(" .pot = %u %u %u\n",
3220 texture->pot_width,
3221 texture->pot_height,
3222 texture->pot_depth);
3223 }
3224 struct lp_image_static_state *images = lp_fs_variant_key_images(key);
3225 for (i = 0; i < key->nr_images; ++i) {
3226 const struct lp_static_texture_state *image = &images[i].image_state;
3227 debug_printf("image[%u] = \n", i);
3228 debug_printf(" .format = %s\n",
3229 util_format_name(image->format));
3230 debug_printf(" .target = %s\n",
3231 util_str_tex_target(image->target, TRUE));
3232 debug_printf(" .level_zero_only = %u\n",
3233 image->level_zero_only);
3234 debug_printf(" .pot = %u %u %u\n",
3235 image->pot_width,
3236 image->pot_height,
3237 image->pot_depth);
3238 }
3239 }
3240
3241
3242 void
3243 lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
3244 {
3245 debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
3246 variant->shader->no, variant->no);
3247 if (variant->shader->base.type == PIPE_SHADER_IR_TGSI)
3248 tgsi_dump(variant->shader->base.tokens, 0);
3249 else
3250 nir_print_shader(variant->shader->base.ir.nir, stderr);
3251 dump_fs_variant_key(&variant->key);
3252 debug_printf("variant->opaque = %u\n", variant->opaque);
3253 debug_printf("\n");
3254 }
3255
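/**
 * Compute the SHA-1 key used to look up this variant in the on-disk shader
 * cache: a hash of the variant key plus the serialized NIR.
 */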
3256 static void
3257 lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
3258 unsigned char ir_sha1_cache_key[20])
3259 {
3260 struct blob blob = { 0 };
3261 unsigned ir_size;
3262 void *ir_binary;
3263
3264 blob_init(&blob);
3265 nir_serialize(&blob, variant->shader->base.ir.nir, true);
3266 ir_binary = blob.data;
3267 ir_size = blob.size;
3268
3269 struct mesa_sha1 ctx;
3270 _mesa_sha1_init(&ctx);
3271 _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size);
3272 _mesa_sha1_update(&ctx, ir_binary, ir_size);
3273 _mesa_sha1_final(&ctx, ir_sha1_cache_key);
3274
3275 blob_finish(&blob);
3276 }
3277
3278 /**
3279 * Generate a new fragment shader variant from the shader code and
3280 * other state indicated by the key.
3281 */
3282 static struct lp_fragment_shader_variant *
3283 generate_variant(struct llvmpipe_context *lp,
3284 struct lp_fragment_shader *shader,
3285 const struct lp_fragment_shader_variant_key *key)
3286 {
3287 struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
3288 struct lp_fragment_shader_variant *variant;
3289 const struct util_format_description *cbuf0_format_desc = NULL;
3290 boolean fullcolormask;
3291 char module_name[64];
3292 unsigned char ir_sha1_cache_key[20];
3293 struct lp_cached_code cached = { 0 };
3294 bool needs_caching = false;
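   /* The variant is allocated with extra space so that the variable-size
    * key (per-sampler/view/image static state) is stored inline at the end.
    */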
3295 variant = MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
3296 if (!variant)
3297 return NULL;
3298
3299 memset(variant, 0, sizeof(*variant));
3300 snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
3301 shader->no, shader->variants_created);
3302
3303 variant->shader = shader;
3304 memcpy(&variant->key, key, shader->variant_key_size);
3305
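   /*
    * For NIR shaders, try to fetch previously compiled code from the disk
    * cache; if nothing is found, remember to insert it after compilation.
    */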
3306 if (shader->base.ir.nir) {
3307 lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
3308
3309 lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
3310 if (!cached.data_size)
3311 needs_caching = true;
3312 }
3313 variant->gallivm = gallivm_create(module_name, lp->context, &cached);
3314 if (!variant->gallivm) {
3315 FREE(variant);
3316 return NULL;
3317 }
3318
3319 variant->list_item_global.base = variant;
3320 variant->list_item_local.base = variant;
3321 variant->no = shader->variants_created++;
3322
3323
3324
3325 /*
3326 * Determine whether we are touching all channels in the color buffer.
3327 */
3328 fullcolormask = FALSE;
3329 if (key->nr_cbufs == 1) {
3330 cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
3331 fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask);
3332 }
3333
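   /*
    * An "opaque" variant overwrites every covered pixel without blending,
    * masking, depth/stencil/alpha testing or kill, so it never has to read
    * back the destination color buffer (see the RAST_WHOLE specialization
    * below).
    */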
3334 variant->opaque =
3335 !key->blend.logicop_enable &&
3336 !key->blend.rt[0].blend_enable &&
3337 fullcolormask &&
3338 !key->stencil[0].enabled &&
3339 !key->alpha.enabled &&
3340 !key->multisample &&
3341 !key->blend.alpha_to_coverage &&
3342 !key->depth.enabled &&
3343 !shader->info.base.uses_kill &&
3344 !shader->info.base.writes_samplemask
3345 ? TRUE : FALSE;
3346
3347 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
3348 lp_debug_fs_variant(variant);
3349 }
3350
3351 lp_jit_init_types(variant);
3352
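   /* Always generate the partial-block (edge test) path; a separate
    * whole-block path is only generated for opaque variants, otherwise
    * whole blocks simply reuse the edge-test function (see below).
    */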
3353 if (variant->jit_function[RAST_EDGE_TEST] == NULL)
3354 generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
3355
3356 if (variant->jit_function[RAST_WHOLE] == NULL) {
3357 if (variant->opaque) {
3358 /* Specialized shader, which doesn't need to read the color buffer. */
3359 generate_fragment(lp, shader, variant, RAST_WHOLE);
3360 }
3361 }
3362
3363 /*
3364 * Compile everything
3365 */
3366
3367 gallivm_compile_module(variant->gallivm);
3368
3369 variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3370
3371 if (variant->function[RAST_EDGE_TEST]) {
3372 variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
3373 gallivm_jit_function(variant->gallivm,
3374 variant->function[RAST_EDGE_TEST]);
3375 }
3376
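   /* JIT the whole-block function if one was generated; otherwise fall back
    * to the edge-test function, which handles fully covered blocks as well.
    */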
3377 if (variant->function[RAST_WHOLE]) {
3378 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3379 gallivm_jit_function(variant->gallivm,
3380 variant->function[RAST_WHOLE]);
3381 } else if (!variant->jit_function[RAST_WHOLE]) {
3382 variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
3383 }
3384
3385 if (needs_caching) {
3386 lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
3387 }
3388
3389 gallivm_free_ir(variant->gallivm);
3390
3391 return variant;
3392 }
3393
3394
3395 static void *
3396 llvmpipe_create_fs_state(struct pipe_context *pipe,
3397 const struct pipe_shader_state *templ)
3398 {
3399 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3400 struct lp_fragment_shader *shader;
3401 int nr_samplers;
3402 int nr_sampler_views;
3403 int nr_images;
3404 int i;
3405
3406 shader = CALLOC_STRUCT(lp_fragment_shader);
3407 if (!shader)
3408 return NULL;
3409
3410 shader->no = fs_no++;
3411 make_empty_list(&shader->variants);
3412
3413 shader->base.type = templ->type;
3414 if (templ->type == PIPE_SHADER_IR_TGSI) {
3415 /* get/save the summary info for this shader */
3416 lp_build_tgsi_info(templ->tokens, &shader->info);
3417
3418 /* we need to keep a local copy of the tokens */
3419 shader->base.tokens = tgsi_dup_tokens(templ->tokens);
3420 } else {
3421 shader->base.ir.nir = templ->ir.nir;
3422 nir_tgsi_scan_shader(templ->ir.nir, &shader->info.base, true);
3423 }
3424
3425 shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
3426 if (shader->draw_data == NULL) {
3427 FREE((void *) shader->base.tokens);
3428 FREE(shader);
3429 return NULL;
3430 }
3431
3432 nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
3433 nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
3434 nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1;
3435 shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers, nr_sampler_views), nr_images);
3436
3437 for (i = 0; i < shader->info.base.num_inputs; i++) {
3438 shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
3439 shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i];
3440 shader->inputs[i].location = shader->info.base.input_interpolate_loc[i];
3441
3442 switch (shader->info.base.input_interpolate[i]) {
3443 case TGSI_INTERPOLATE_CONSTANT:
3444 shader->inputs[i].interp = LP_INTERP_CONSTANT;
3445 break;
3446 case TGSI_INTERPOLATE_LINEAR:
3447 shader->inputs[i].interp = LP_INTERP_LINEAR;
3448 break;
3449 case TGSI_INTERPOLATE_PERSPECTIVE:
3450 shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
3451 break;
3452 case TGSI_INTERPOLATE_COLOR:
3453 shader->inputs[i].interp = LP_INTERP_COLOR;
3454 break;
3455 default:
3456 assert(0);
3457 break;
3458 }
3459
3460 switch (shader->info.base.input_semantic_name[i]) {
3461 case TGSI_SEMANTIC_FACE:
3462 shader->inputs[i].interp = LP_INTERP_FACING;
3463 break;
3464 case TGSI_SEMANTIC_POSITION:
3465 /* Position was already emitted above
3466 */
3467 shader->inputs[i].interp = LP_INTERP_POSITION;
3468 shader->inputs[i].src_index = 0;
3469 continue;
3470 }
3471
3472 /* XXX this is a completely pointless index map... */
3473 shader->inputs[i].src_index = i+1;
3474 }
3475
3476 if (LP_DEBUG & DEBUG_TGSI) {
3477 unsigned attrib;
3478 debug_printf("llvmpipe: Create fragment shader #%u %p:\n",
3479 shader->no, (void *) shader);
3480 tgsi_dump(templ->tokens, 0);
3481 debug_printf("usage masks:\n");
3482 for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
3483 unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
3484 debug_printf(" IN[%u].%s%s%s%s\n",
3485 attrib,
3486 usage_mask & TGSI_WRITEMASK_X ? "x" : "",
3487 usage_mask & TGSI_WRITEMASK_Y ? "y" : "",
3488 usage_mask & TGSI_WRITEMASK_Z ? "z" : "",
3489 usage_mask & TGSI_WRITEMASK_W ? "w" : "");
3490 }
3491 debug_printf("\n");
3492 }
3493
3494 return shader;
3495 }
3496
3497
3498 static void
3499 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
3500 {
3501 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3502 struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
3503 if (llvmpipe->fs == lp_fs)
3504 return;
3505
3506 draw_bind_fragment_shader(llvmpipe->draw,
3507 (lp_fs ? lp_fs->draw_data : NULL));
3508
3509 llvmpipe->fs = lp_fs;
3510
3511 llvmpipe->dirty |= LP_NEW_FS;
3512 }
3513
3514
3515 /**
3516 * Remove shader variant from two lists: the shader's variant list
3517 * and the context's variant list.
3518 */
3519 static void
3520 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
3521 struct lp_fragment_shader_variant *variant)
3522 {
3523 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
3524 debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
3525 "v total cached %u inst %u total inst %u\n",
3526 variant->shader->no, variant->no,
3527 variant->shader->variants_created,
3528 variant->shader->variants_cached,
3529 lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
3530 }
3531
3532 gallivm_destroy(variant->gallivm);
3533
3534 /* remove from shader's list */
3535 remove_from_list(&variant->list_item_local);
3536 variant->shader->variants_cached--;
3537
3538 /* remove from context's list */
3539 remove_from_list(&variant->list_item_global);
3540 lp->nr_fs_variants--;
3541 lp->nr_fs_instrs -= variant->nr_instrs;
3542
3543 FREE(variant);
3544 }
3545
3546
3547 static void
3548 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
3549 {
3550 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3551 struct lp_fragment_shader *shader = fs;
3552 struct lp_fs_variant_list_item *li;
3553
3554 assert(fs != llvmpipe->fs);
3555
3556 /*
3557 * XXX: we need to flush the context until we have some sort of reference
3558     * counting in fragment shaders as they may still be binned.
3559     * Flushing alone might not be sufficient; we need to wait on it too.
3560 */
3561 llvmpipe_finish(pipe, __FUNCTION__);
3562
3563 /* Delete all the variants */
3564 li = first_elem(&shader->variants);
3565 while(!at_end(&shader->variants, li)) {
3566 struct lp_fs_variant_list_item *next = next_elem(li);
3567 llvmpipe_remove_shader_variant(llvmpipe, li->base);
3568 li = next;
3569 }
3570
3571 /* Delete draw module's data */
3572 draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
3573
3574 if (shader->base.ir.nir)
3575 ralloc_free(shader->base.ir.nir);
3576 assert(shader->variants_cached == 0);
3577 FREE((void *) shader->base.tokens);
3578 FREE(shader);
3579 }
3580
3581
3582
3583 static void
3584 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
3585 enum pipe_shader_type shader, uint index,
3586 const struct pipe_constant_buffer *cb)
3587 {
3588 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3589 struct pipe_resource *constants = cb ? cb->buffer : NULL;
3590
3591 assert(shader < PIPE_SHADER_TYPES);
3592 assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
3593
3594 /* note: reference counting */
3595 util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb);
3596
3597 if (constants) {
3598 if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) {
3599 debug_printf("Illegal set constant without bind flag\n");
3600 constants->bind |= PIPE_BIND_CONSTANT_BUFFER;
3601 }
3602 }
3603
3604 if (shader == PIPE_SHADER_VERTEX ||
3605 shader == PIPE_SHADER_GEOMETRY ||
3606 shader == PIPE_SHADER_TESS_CTRL ||
3607 shader == PIPE_SHADER_TESS_EVAL) {
3608 /* Pass the constants to the 'draw' module */
3609 const unsigned size = cb ? cb->buffer_size : 0;
3610 const ubyte *data;
3611
3612 if (constants) {
3613 data = (ubyte *) llvmpipe_resource_data(constants);
3614 }
3615 else if (cb && cb->user_buffer) {
3616 data = (ubyte *) cb->user_buffer;
3617 }
3618 else {
3619 data = NULL;
3620 }
3621
3622 if (data)
3623 data += cb->buffer_offset;
3624
3625 draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
3626 index, data, size);
3627 }
3628 else if (shader == PIPE_SHADER_COMPUTE)
3629 llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
3630 else
3631 llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
3632
3633 if (cb && cb->user_buffer) {
3634 pipe_resource_reference(&constants, NULL);
3635 }
3636 }
3637
3638 static void
3639 llvmpipe_set_shader_buffers(struct pipe_context *pipe,
3640 enum pipe_shader_type shader, unsigned start_slot,
3641 unsigned count, const struct pipe_shader_buffer *buffers,
3642 unsigned writable_bitmask)
3643 {
3644 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3645 unsigned i, idx;
3646 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
3647 const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
3648
3649 util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
3650
3651 if (shader == PIPE_SHADER_VERTEX ||
3652 shader == PIPE_SHADER_GEOMETRY ||
3653 shader == PIPE_SHADER_TESS_CTRL ||
3654 shader == PIPE_SHADER_TESS_EVAL) {
3655 const unsigned size = buffer ? buffer->buffer_size : 0;
3656 const ubyte *data = NULL;
3657 if (buffer && buffer->buffer)
3658 data = (ubyte *) llvmpipe_resource_data(buffer->buffer);
3659 if (data)
3660 data += buffer->buffer_offset;
3661 draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
3662 i, data, size);
3663 } else if (shader == PIPE_SHADER_COMPUTE) {
3664 llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
3665 } else if (shader == PIPE_SHADER_FRAGMENT) {
3666 llvmpipe->dirty |= LP_NEW_FS_SSBOS;
3667 }
3668 }
3669 }
3670
3671 static void
3672 llvmpipe_set_shader_images(struct pipe_context *pipe,
3673 enum pipe_shader_type shader, unsigned start_slot,
3674 unsigned count, const struct pipe_image_view *images)
3675 {
3676 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3677 unsigned i, idx;
3678
3679 draw_flush(llvmpipe->draw);
3680 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
3681 const struct pipe_image_view *image = images ? &images[idx] : NULL;
3682
3683 util_copy_image_view(&llvmpipe->images[shader][i], image);
3684 }
3685
3686 llvmpipe->num_images[shader] = start_slot + count;
3687 if (shader == PIPE_SHADER_VERTEX ||
3688 shader == PIPE_SHADER_GEOMETRY ||
3689 shader == PIPE_SHADER_TESS_CTRL ||
3690 shader == PIPE_SHADER_TESS_EVAL) {
3691 draw_set_images(llvmpipe->draw,
3692 shader,
3693 llvmpipe->images[shader],
3694 start_slot + count);
3695 } else if (shader == PIPE_SHADER_COMPUTE)
3696 llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
3697 else
3698 llvmpipe->dirty |= LP_NEW_FS_IMAGES;
3699 }
3700
3701 /**
3702 * Return the blend factor equivalent to a destination alpha of one.
3703 */
3704 static inline unsigned
3705 force_dst_alpha_one(unsigned factor, boolean clamped_zero)
3706 {
3707 switch(factor) {
3708 case PIPE_BLENDFACTOR_DST_ALPHA:
3709 return PIPE_BLENDFACTOR_ONE;
3710 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
3711 return PIPE_BLENDFACTOR_ZERO;
3712 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
3713 if (clamped_zero)
3714 return PIPE_BLENDFACTOR_ZERO;
3715 else
3716 return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
3717 }
3718
3719 return factor;
3720 }
3721
3722
3723 /**
3724 * We need to generate several variants of the fragment pipeline to match
3725 * all the combinations of the contributing state atoms.
3726 *
3727 * TODO: there is actually no reason to tie this to context state -- the
3728 * generated code could be cached globally in the screen.
3729 */
3730 static struct lp_fragment_shader_variant_key *
3731 make_variant_key(struct llvmpipe_context *lp,
3732 struct lp_fragment_shader *shader,
3733 char *store)
3734 {
3735 unsigned i;
3736 struct lp_fragment_shader_variant_key *key;
3737
3738 key = (struct lp_fragment_shader_variant_key *)store;
3739
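   /* Zero only the fixed-size part of the key (up to and including the
    * first sampler slot); the variable-size sampler/view and image state
    * is filled in further below.
    */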
3740 memset(key, 0, offsetof(struct lp_fragment_shader_variant_key, samplers[1]));
3741
3742 if (lp->framebuffer.zsbuf) {
3743 enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
3744 const struct util_format_description *zsbuf_desc =
3745 util_format_description(zsbuf_format);
3746
3747 if (lp->depth_stencil->depth.enabled &&
3748 util_format_has_depth(zsbuf_desc)) {
3749 key->zsbuf_format = zsbuf_format;
3750 memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
3751 }
3752 if (lp->depth_stencil->stencil[0].enabled &&
3753 util_format_has_stencil(zsbuf_desc)) {
3754 key->zsbuf_format = zsbuf_format;
3755 memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil);
3756 }
3757 if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
3758 key->resource_1d = TRUE;
3759 }
3760 key->zsbuf_nr_samples = util_res_sample_count(lp->framebuffer.zsbuf->texture);
3761 }
3762
3763 /*
3764 * Propagate the depth clamp setting from the rasterizer state.
3765 * depth_clip == 0 implies depth clamping is enabled.
3766 *
3767     * When clip_halfz is enabled, always clamp the depth values.
3768 *
3769 * XXX: This is incorrect for GL, but correct for d3d10 (depth
3770 * clamp is always active in d3d10, regardless if depth clip is
3771 * enabled or not).
3772 * (GL has an always-on [0,1] clamp on fs depth output instead
3773 * to ensure the depth values stay in range. Doesn't look like
3774 * we do that, though...)
3775 */
3776 if (lp->rasterizer->clip_halfz) {
3777 key->depth_clamp = 1;
3778 } else {
3779 key->depth_clamp = (lp->rasterizer->depth_clip_near == 0) ? 1 : 0;
3780 }
3781
3782 /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
3783 if (!lp->framebuffer.nr_cbufs ||
3784 !lp->framebuffer.cbufs[0] ||
3785 !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
3786 key->alpha.enabled = lp->depth_stencil->alpha.enabled;
3787 }
3788 if(key->alpha.enabled)
3789 key->alpha.func = lp->depth_stencil->alpha.func;
3790 /* alpha.ref_value is passed in jit_context */
3791
3792 key->flatshade = lp->rasterizer->flatshade;
3793 key->multisample = lp->rasterizer->multisample;
3794 if (lp->active_occlusion_queries && !lp->queries_disabled) {
3795 key->occlusion_count = TRUE;
3796 }
3797
3798 if (lp->framebuffer.nr_cbufs) {
3799 memcpy(&key->blend, lp->blend, sizeof key->blend);
3800 }
3801
3802 key->coverage_samples = 1;
3803 key->min_samples = 1;
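   /* min_samples > 1 requests per-sample fragment shader execution (sample
    * shading); with ordinary multisampling the shader runs once per pixel.
    */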
3804 if (key->multisample) {
3805 key->coverage_samples = util_framebuffer_get_num_samples(&lp->framebuffer);
3806 key->min_samples = lp->min_samples == 1 ? 1 : key->coverage_samples;
3807 }
3808 key->nr_cbufs = lp->framebuffer.nr_cbufs;
3809
3810 if (!key->blend.independent_blend_enable) {
3811       /* we always need independent blend, otherwise the fixups below won't work */
3812 for (i = 1; i < key->nr_cbufs; i++) {
3813 memcpy(&key->blend.rt[i], &key->blend.rt[0], sizeof(key->blend.rt[0]));
3814 }
3815 key->blend.independent_blend_enable = 1;
3816 }
3817
3818 for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
3819 struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
3820
3821 if (lp->framebuffer.cbufs[i]) {
3822 enum pipe_format format = lp->framebuffer.cbufs[i]->format;
3823 const struct util_format_description *format_desc;
3824
3825 key->cbuf_format[i] = format;
3826 key->cbuf_nr_samples[i] = util_res_sample_count(lp->framebuffer.cbufs[i]->texture);
3827
3828 /*
3829 * Figure out if this is a 1d resource. Note that OpenGL allows crazy
3830 * mixing of 2d textures with height 1 and 1d textures, so make sure
3831 * we pick 1d if any cbuf or zsbuf is 1d.
3832 */
3833 if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
3834 key->resource_1d = TRUE;
3835 }
3836
3837 format_desc = util_format_description(format);
3838 assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
3839 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
3840
3841 /*
3842 * Mask out color channels not present in the color buffer.
3843 */
3844 blend_rt->colormask &= util_format_colormask(format_desc);
3845
3846 /*
3847 * Disable blend for integer formats.
3848 */
3849 if (util_format_is_pure_integer(format)) {
3850 blend_rt->blend_enable = 0;
3851 }
3852
3853 /*
3854 * Our swizzled render tiles always have an alpha channel, but the
3855          * linear render target format often does not, so force the dst
3856          * alpha to one here.
3857 *
3858 * This is not a mere optimization. Wrong results will be produced if
3859 * the dst alpha is used, the dst format does not have alpha, and the
3860 * previous rendering was not flushed from the swizzled to linear
3861 * buffer. For example, NonPowTwo DCT.
3862 *
3863 * TODO: This should be generalized to all channels for better
3864 * performance, but only alpha causes correctness issues.
3865 *
3866 * Also, force rgb/alpha func/factors match, to make AoS blending
3867          * Also, force rgb/alpha func/factors to match, to make AoS blending
3868 */
3869 if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
3870 format_desc->swizzle[3] == format_desc->swizzle[0]) {
3871 /* Doesn't cover mixed snorm/unorm but can't render to them anyway */
3872 boolean clamped_zero = !util_format_is_float(format) &&
3873 !util_format_is_snorm(format);
3874 blend_rt->rgb_src_factor =
3875 force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
3876 blend_rt->rgb_dst_factor =
3877 force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
3878 blend_rt->alpha_func = blend_rt->rgb_func;
3879 blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
3880 blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
3881 }
3882 }
3883 else {
3884 /* no color buffer for this fragment output */
3885 key->cbuf_format[i] = PIPE_FORMAT_NONE;
3886 key->cbuf_nr_samples[i] = 0;
3887 blend_rt->colormask = 0x0;
3888 blend_rt->blend_enable = 0;
3889 }
3890 }
3891
3892 /* This value will be the same for all the variants of a given shader:
3893 */
3894 key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
3895
3896 struct lp_sampler_static_state *fs_sampler;
3897
3898 fs_sampler = key->samplers;
3899
3900 memset(fs_sampler, 0, MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);
3901
3902 for(i = 0; i < key->nr_samplers; ++i) {
3903 if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
3904 lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
3905 lp->samplers[PIPE_SHADER_FRAGMENT][i]);
3906 }
3907 }
3908
3909 /*
3910    * XXX: If TGSI_FILE_SAMPLER_VIEW exists, assume all texture opcodes
3911 * are dx10-style? Can't really have mixed opcodes, at least not
3912 * if we want to skip the holes here (without rescanning tgsi).
3913 */
3914 if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
3915 key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
3916 for(i = 0; i < key->nr_sampler_views; ++i) {
3917 /*
3918 * Note sview may exceed what's representable by file_mask.
3919 * This will still work, the only downside is that not actually
3920           * This will still work; the only downside is that views which aren't
3921           * actually used may be included in the shader key.
3922 if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) {
3923 lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
3924 lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
3925 }
3926 }
3927 }
3928 else {
3929 key->nr_sampler_views = key->nr_samplers;
3930 for(i = 0; i < key->nr_sampler_views; ++i) {
3931 if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
3932 lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
3933 lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
3934 }
3935 }
3936 }
3937
3938 struct lp_image_static_state *lp_image;
3939 lp_image = lp_fs_variant_key_images(key);
3940 key->nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1;
3941 for (i = 0; i < key->nr_images; ++i) {
3942 if (shader->info.base.file_mask[TGSI_FILE_IMAGE] & (1 << i)) {
3943 lp_sampler_static_texture_state_image(&lp_image[i].image_state,
3944 &lp->images[PIPE_SHADER_FRAGMENT][i]);
3945 }
3946 }
3947 return key;
3948 }
3949
3950
3951
3952 /**
3953 * Update fragment shader state. This is called just prior to drawing
3954 * something when some fragment-related state has changed.
3955 */
3956 void
3957 llvmpipe_update_fs(struct llvmpipe_context *lp)
3958 {
3959 struct lp_fragment_shader *shader = lp->fs;
3960 struct lp_fragment_shader_variant_key *key;
3961 struct lp_fragment_shader_variant *variant = NULL;
3962 struct lp_fs_variant_list_item *li;
3963 char store[LP_FS_MAX_VARIANT_KEY_SIZE];
3964
3965 key = make_variant_key(lp, shader, store);
3966
3967 /* Search the variants for one which matches the key */
3968 li = first_elem(&shader->variants);
3969 while(!at_end(&shader->variants, li)) {
3970 if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
3971 variant = li->base;
3972 break;
3973 }
3974 li = next_elem(li);
3975 }
3976
3977 if (variant) {
3978 /* Move this variant to the head of the list to implement LRU
3979 * deletion of shader's when we have too many.
3980       * deletion of shaders when we have too many.
3981 move_to_head(&lp->fs_variants_list, &variant->list_item_global);
3982 }
3983 else {
3984 /* variant not found, create it now */
3985 int64_t t0, t1, dt;
3986 unsigned i;
3987 unsigned variants_to_cull;
3988
3989 if (LP_DEBUG & DEBUG_FS) {
3990 debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
3991 lp->nr_fs_variants,
3992 lp->nr_fs_instrs,
3993 lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
3994 }
3995
3996 /* First, check if we've exceeded the max number of shader variants.
3997 * If so, free 6.25% of them (the least recently used ones).
3998 */
3999 variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 16 : 0;
4000
4001 if (variants_to_cull ||
4002 lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
4003 struct pipe_context *pipe = &lp->pipe;
4004
4005 if (gallivm_debug & GALLIVM_DEBUG_PERF) {
4006 debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
4007 "\t%u instrs,\t%u instrs/variant\n",
4008 shader->variants_cached,
4009 lp->nr_fs_variants, lp->nr_fs_instrs,
4010 lp->nr_fs_instrs / lp->nr_fs_variants);
4011 }
4012
4013 /*
4014 * XXX: we need to flush the context until we have some sort of
4015      * reference counting in fragment shaders as they may still be binned.
4016      * Flushing alone might not be sufficient; we need to wait on it too.
4017 */
4018 llvmpipe_finish(pipe, __FUNCTION__);
4019
4020 /*
4021      * We need to re-check lp->nr_fs_variants because an arbitrarily large
4022 * number of shader variants (potentially all of them) could be
4023 * pending for destruction on flush.
4024 */
4025
4026 for (i = 0; i < variants_to_cull || lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS; i++) {
4027 struct lp_fs_variant_list_item *item;
4028 if (is_empty_list(&lp->fs_variants_list)) {
4029 break;
4030 }
4031 item = last_elem(&lp->fs_variants_list);
4032 assert(item);
4033 assert(item->base);
4034 llvmpipe_remove_shader_variant(lp, item->base);
4035 }
4036 }
4037
4038 /*
4039 * Generate the new variant.
4040 */
4041 t0 = os_time_get();
4042 variant = generate_variant(lp, shader, key);
4043 t1 = os_time_get();
4044 dt = t1 - t0;
4045 LP_COUNT_ADD(llvm_compile_time, dt);
4046 LP_COUNT_ADD(nr_llvm_compiles, 2); /* emit vs. omit in/out test */
4047
4048 /* Put the new variant into the list */
4049 if (variant) {
4050 insert_at_head(&shader->variants, &variant->list_item_local);
4051 insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
4052 lp->nr_fs_variants++;
4053 lp->nr_fs_instrs += variant->nr_instrs;
4054 shader->variants_cached++;
4055 }
4056 }
4057
4058 /* Bind this variant */
4059 lp_setup_set_fs_variant(lp->setup, variant);
4060 }
4061
4062
4063
4064
4065
4066 void
4067 llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
4068 {
4069 llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
4070 llvmpipe->pipe.bind_fs_state = llvmpipe_bind_fs_state;
4071 llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
4072
4073 llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
4074
4075 llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
4076 llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
4077 }
4078
4079