/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- AoS.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <brianp@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_debug.h"
#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_arit.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"
#include "lp_bld_quad.h"


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for scaled integer texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord the incoming texcoord (s,t,r or q) scaled to the texture size
 * \param length the texture size along one dimension
 * \param stride pixel stride along the coordinate axis (in bytes)
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param out_offset byte offset for the wrapped coordinate
 * \param out_i resulting sub-block pixel coordinate for coord
 */
static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                 unsigned block_length,
                                 LLVMValueRef coord,
                                 LLVMValueRef length,
                                 LLVMValueRef stride,
                                 boolean is_pot,
                                 unsigned wrap_mode,
                                 LLVMValueRef *out_offset,
                                 LLVMValueRef *out_i)
{
   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMValueRef length_minus_one;

   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if(is_pot)
         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
      else {
         /* Add a bias to the texcoord to handle negative coords */
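         /*
          * Note: the bias simply shifts the texcoord into non-negative range
          * before the unsigned modulo below; e.g. with length 7 and coord -3,
          * (-3 + 7*1024) % 7 == 4, which is -3 mod 7.  This assumes incoming
          * coords never fall below -1024*length.
          */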
         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
         coord = LLVMBuildAdd(bld->builder, coord, bias, "");
         coord = LLVMBuildURem(bld->builder, coord, length, "");
      }
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
   }

   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
                                  out_offset, out_i);
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord0 the incoming texcoord (s,t,r or q) scaled to the texture size
 * \param length the texture size along one dimension
 * \param stride pixel stride along the coordinate axis (in bytes)
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param offset0 resulting relative offset for coord0
 * \param offset1 resulting relative offset for coord0 + 1
 * \param i0 resulting sub-block pixel coordinate for coord0
 * \param i1 resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   if (block_length != 1) {
      /*
       * If the pixel block covers more than one pixel then there is no easy
       * way to calculate offset1 relative to offset0. Instead, compute them
       * independently.
       */

      LLVMValueRef coord1;

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord0,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset0, i0);

      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord1,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset1, i1);

      return;
   }

   /*
    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
    * multiplication.
    */

   *i0 = uint_coord_bld->zero;
   *i1 = uint_coord_bld->zero;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
      }
      else {
         /* Add a bias to the texcoord to handle negative coords */
         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
         coord0 = LLVMBuildAdd(bld->builder, coord0, bias, "");
         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
      }

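      /*
       * Sketch of the trick below: offset1 normally points one texel past
       * offset0, but when coord0 == length-1 the neighbour wraps around to
       * texel 0.  Rather than recomputing the wrap, build an all-ones /
       * all-zeros mask from (coord0 != length-1) and AND it into
       * offset0 + stride, which forces the wrapped offset to 0.
       */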
      mask = lp_build_compare(bld->builder, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(bld->builder,
                              lp_build_add(uint_coord_bld, *offset0, stride),
                              mask, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      mask = LLVMBuildAnd(bld->builder, lmask, umask, "");

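      /*
       * mask is all ones only while coord0 lies strictly inside
       * [0, length-1); at the upper edge (or for out-of-range coords) the
       * stride is masked to zero so offset1 == offset0 and both filter taps
       * read the same edge texel.
       */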
      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
      *offset1 = lp_build_add(uint_coord_bld,
                              *offset0,
                              LLVMBuildAnd(bld->builder, stride, mask, ""));
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      *offset0 = uint_coord_bld->zero;
      *offset1 = uint_coord_bld->zero;
      break;
   }
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef width_vec,
                              LLVMValueRef height_vec,
                              LLVMValueRef depth_vec,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef s,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              LLVMValueRef *colors_lo,
                              LLVMValueRef *colors_hi)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->builder;
   struct lp_build_context i32, h16, u8n;
   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
   LLVMValueRef i32_c8;
   LLVMValueRef s_ipart, t_ipart, r_ipart;
   LLVMValueRef x_stride;
   LLVMValueRef x_offset, offset;
   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;

   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
   lp_build_context_init(&u8n, builder, lp_type_unorm(8));

   i32_vec_type = lp_build_vec_type(i32.type);
   h16_vec_type = lp_build_vec_type(h16.type);
   u8n_vec_type = lp_build_vec_type(u8n.type);

   if (bld->static_state->normalized_coords) {
      /* s = s * width, t = t * height */
      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
                                              coord_vec_type, "");
      s = lp_build_mul(&bld->coord_bld, s, fp_width);
      if (dims >= 2) {
         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
                                                  coord_vec_type, "");
         t = lp_build_mul(&bld->coord_bld, t, fp_height);
         if (dims >= 3) {
            LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
                                                    coord_vec_type, "");
            r = lp_build_mul(&bld->coord_bld, r, fp_depth);
         }
      }
   }

   /* scale coords by 256 (8 fractional bits) */
   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
   if (dims >= 2)
      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
   if (dims >= 3)
      r = lp_build_mul_imm(&bld->coord_bld, r, 256);

   /* convert float to int */
   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
   if (dims >= 2)
      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
   if (dims >= 3)
      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");

   /* compute floor (shift right 8) */
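   /*
    * At this point the coords are 24.8 fixed point, so an arithmetic shift
    * by 8 is floor(): e.g. 2.5 becomes 640 and 640 >> 8 == 2, while -0.5
    * becomes -128 and -128 >> 8 == -1 (rounding towards -inf, as wanted).
    */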
   i32_c8 = lp_build_const_int_vec(i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* get pixel, row, image strides */
   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
                                 bld->format_desc->block.bits/8);

   /* Do texcoord wrapping, compute texel offset */
   lp_build_sample_wrap_nearest_int(bld,
                                    bld->format_desc->block.width,
                                    s_ipart, width_vec, x_stride,
                                    bld->static_state->pot_width,
                                    bld->static_state->wrap_s,
                                    &x_offset, &x_subcoord);
   offset = x_offset;
   if (dims >= 2) {
      LLVMValueRef y_offset;
      lp_build_sample_wrap_nearest_int(bld,
                                       bld->format_desc->block.height,
                                       t_ipart, height_vec, row_stride_vec,
                                       bld->static_state->pot_height,
                                       bld->static_state->wrap_t,
                                       &y_offset, &y_subcoord);
      offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset);
      if (dims >= 3) {
         LLVMValueRef z_offset;
         lp_build_sample_wrap_nearest_int(bld,
                                          1, /* block length (depth) */
                                          r_ipart, depth_vec, img_stride_vec,
                                          bld->static_state->pot_depth,
                                          bld->static_state->wrap_r,
                                          &z_offset, &z_subcoord);
         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
      }
      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
         LLVMValueRef z_offset;
         /* The r coord is the cube face in [0,5] */
         z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
      }
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   {
      LLVMValueRef rgba8;

      if (util_format_is_rgba8_variant(bld->format_desc)) {
         /*
          * Given the format is a rgba8, just read the pixels as is,
          * without any swizzling. Swizzling will be done later.
          */
         rgba8 = lp_build_gather(bld->builder,
                                 bld->texel_type.length,
                                 bld->format_desc->block.bits,
                                 bld->texel_type.width,
                                 data_ptr, offset);

         rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
      }
      else {
         rgba8 = lp_build_fetch_rgba_aos(bld->builder,
                                         bld->format_desc,
                                         u8n.type,
                                         data_ptr, offset,
                                         x_subcoord,
                                         y_subcoord);
      }

      /* Expand one 4*rgba8 to two 2*rgba16 */
      lp_build_unpack2(builder, u8n.type, h16.type,
                       rgba8,
                       colors_lo, colors_hi);
   }
}


/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef width_vec,
                             LLVMValueRef height_vec,
                             LLVMValueRef depth_vec,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef s,
                             LLVMValueRef t,
                             LLVMValueRef r,
                             LLVMValueRef *colors_lo,
                             LLVMValueRef *colors_hi)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->builder;
   struct lp_build_context i32, h16, u8n;
   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
   LLVMValueRef i32_c8, i32_c128, i32_c255;
   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
   LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
   LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
   LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
   LLVMValueRef packed_lo, packed_hi;
   unsigned x, y, z;
   unsigned i, j, k;
   unsigned numj, numk;

   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
   lp_build_context_init(&u8n, builder, lp_type_unorm(8));

   i32_vec_type = lp_build_vec_type(i32.type);
   h16_vec_type = lp_build_vec_type(h16.type);
   u8n_vec_type = lp_build_vec_type(u8n.type);

   if (bld->static_state->normalized_coords) {
      /* s = s * width, t = t * height */
      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
                                              coord_vec_type, "");
      s = lp_build_mul(&bld->coord_bld, s, fp_width);
      if (dims >= 2) {
         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
                                                  coord_vec_type, "");
         t = lp_build_mul(&bld->coord_bld, t, fp_height);
      }
      if (dims >= 3) {
         LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
                                                 coord_vec_type, "");
         r = lp_build_mul(&bld->coord_bld, r, fp_depth);
      }
   }

   /* scale coords by 256 (8 fractional bits) */
   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
   if (dims >= 2)
      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
   if (dims >= 3)
      r = lp_build_mul_imm(&bld->coord_bld, r, 256);

   /* convert float to int */
   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
   if (dims >= 2)
      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
   if (dims >= 3)
      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");

   /* subtract 0.5 (add -128) */
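   /*
    * Subtracting half a texel here means the integer part below selects the
    * texel whose center lies at or below the sample point, and the
    * fractional part becomes the linear blend weight towards the next texel.
    */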
   i32_c128 = lp_build_const_int_vec(i32.type, -128);
   s = LLVMBuildAdd(builder, s, i32_c128, "");
   if (dims >= 2) {
      t = LLVMBuildAdd(builder, t, i32_c128, "");
   }
   if (dims >= 3) {
      r = LLVMBuildAdd(builder, r, i32_c128, "");
   }

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(i32.type, 255);
   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
   if (dims >= 2)
      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
   if (dims >= 3)
      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_int(bld,
                                   bld->format_desc->block.width,
                                   s_ipart, width_vec, x_stride,
                                   bld->static_state->pot_width,
                                   bld->static_state->wrap_s,
                                   &x_offset0, &x_offset1,
                                   &x_subcoord[0], &x_subcoord[1]);
   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      t_ipart, height_vec, y_stride,
                                      bld->static_state->pot_height,
                                      bld->static_state->wrap_t,
                                      &y_offset0, &y_offset1,
                                      &y_subcoord[0], &y_subcoord[1]);

      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      r_ipart, depth_vec, z_stride,
                                      bld->static_state->pot_depth,
                                      bld->static_state->wrap_r,
                                      &z_offset0, &z_offset1,
                                      &z_subcoord[0], &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }
   else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            /* The r coord is the cube face in [0,5] */
            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[0][y][x], z_offset);
         }
      }
   }

   /*
    * Transform 4 x i32 in
    *
    *   s_fpart = {s0, s1, s2, s3}
    *
    * into 8 x i16
    *
    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
    *
    * into two 8 x i16
    *
    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
    *
    * and likewise for t_fpart. There is no risk of losing precision here
    * since the fractional parts only use the lower 8bits.
    */
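   /*
    * For the usual 4-wide float case (8 x i16 here, little endian) the
    * shuffle vectors built below are {0,0,0,0, 2,2,2,2} and
    * {4,4,4,4, 6,6,6,6}: they pick the low 16-bit halves holding s0..s3 and
    * replicate each one four times, once per color channel.
    */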
   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
   if (dims >= 2)
      t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
   if (dims >= 3)
      r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");

   {
      LLVMTypeRef elem_type = LLVMInt32Type();
      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef shuffle_lo;
      LLVMValueRef shuffle_hi;

      for (j = 0; j < h16.type.length; j += 4) {
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned subindex = 0;
#else
         unsigned subindex = 1;
#endif
         LLVMValueRef index;

         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
         for (i = 0; i < 4; ++i)
            shuffles_lo[j + i] = index;

         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
         for (i = 0; i < 4; ++i)
            shuffles_hi[j + i] = index;
      }

      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);

      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
                                          shuffle_lo, "");
      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
                                          shuffle_hi, "");
      if (dims >= 2) {
         t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
                                             shuffle_lo, "");
         t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
                                             shuffle_hi, "");
      }
      if (dims >= 3) {
         r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
                                             shuffle_lo, "");
         r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
                                             shuffle_hi, "");
      }
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   numj = 1 + (dims >= 2);
   numk = 1 + (dims >= 3);

   for (k = 0; k < numk; k++) {
      for (j = 0; j < numj; j++) {
         for (i = 0; i < 2; i++) {
            LLVMValueRef rgba8;

            if (util_format_is_rgba8_variant(bld->format_desc)) {
               /*
                * Given the format is a rgba8, just read the pixels as is,
                * without any swizzling. Swizzling will be done later.
                */
               rgba8 = lp_build_gather(bld->builder,
                                       bld->texel_type.length,
                                       bld->format_desc->block.bits,
                                       bld->texel_type.width,
                                       data_ptr, offset[k][j][i]);

               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
            }
            else {
               rgba8 = lp_build_fetch_rgba_aos(bld->builder,
                                               bld->format_desc,
                                               u8n.type,
                                               data_ptr, offset[k][j][i],
                                               x_subcoord[i],
                                               y_subcoord[j]);
            }

            /* Expand one 4*rgba8 to two 2*rgba16 */
            lp_build_unpack2(builder, u8n.type, h16.type,
                             rgba8,
                             &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
         }
      }
   }

   /*
    * Linear interpolation with 8.8 fixed point.
    */
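   /*
    * Roughly: each 16-bit lane holds an 8-bit quantity, and the weights
    * (s_fpart_lo/hi etc.) are 0..255 standing for the fraction w/256, so
    * each lerp below is conceptually v0 + w/256 * (v1 - v0) evaluated in
    * 8.8 fixed point.
    */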
   if (dims == 1) {
      /* 1-D lerp */
      packed_lo = lp_build_lerp(&h16,
                                s_fpart_lo,
                                neighbors_lo[0][0][0],
                                neighbors_lo[0][0][1]);

      packed_hi = lp_build_lerp(&h16,
                                s_fpart_hi,
                                neighbors_hi[0][0][0],
                                neighbors_hi[0][0][1]);
   }
   else {
      /* 2-D lerp */
      packed_lo = lp_build_lerp_2d(&h16,
                                   s_fpart_lo, t_fpart_lo,
                                   neighbors_lo[0][0][0],
                                   neighbors_lo[0][0][1],
                                   neighbors_lo[0][1][0],
                                   neighbors_lo[0][1][1]);

      packed_hi = lp_build_lerp_2d(&h16,
                                   s_fpart_hi, t_fpart_hi,
                                   neighbors_hi[0][0][0],
                                   neighbors_hi[0][0][1],
                                   neighbors_hi[0][1][0],
                                   neighbors_hi[0][1][1]);

      if (dims >= 3) {
         LLVMValueRef packed_lo2, packed_hi2;

         /* lerp in the second z slice */
         packed_lo2 = lp_build_lerp_2d(&h16,
                                       s_fpart_lo, t_fpart_lo,
                                       neighbors_lo[1][0][0],
                                       neighbors_lo[1][0][1],
                                       neighbors_lo[1][1][0],
                                       neighbors_lo[1][1][1]);

         packed_hi2 = lp_build_lerp_2d(&h16,
                                       s_fpart_hi, t_fpart_hi,
                                       neighbors_hi[1][0][0],
                                       neighbors_hi[1][0][1],
                                       neighbors_hi[1][1][0],
                                       neighbors_hi[1][1][1]);
         /* interp between two z slices */
         packed_lo = lp_build_lerp(&h16, r_fpart_lo,
                                   packed_lo, packed_lo2);
         packed_hi = lp_build_lerp(&h16, r_fpart_hi,
                                   packed_hi, packed_hi2);
      }
   }

   *colors_lo = packed_lo;
   *colors_hi = packed_hi;
}


/**
 * Sample the texture/mipmap using given image filter and mip filter.
 * ilevel0 and ilevel1 indicate the two mipmap levels to sample from;
 * the corresponding sizes, strides and data pointers are looked up here.
 * If we're using nearest miplevel sampling, ilevel1 will be NULL/unused.
 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       LLVMValueRef s,
                       LLVMValueRef t,
                       LLVMValueRef r,
                       LLVMValueRef ilevel0,
                       LLVMValueRef ilevel1,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef colors_lo_var,
                       LLVMValueRef colors_hi_var)
{
   LLVMBuilderRef builder = bld->builder;
   LLVMValueRef width0_vec;
   LLVMValueRef width1_vec;
   LLVMValueRef height0_vec;
   LLVMValueRef height1_vec;
   LLVMValueRef depth0_vec;
   LLVMValueRef depth1_vec;
   LLVMValueRef row_stride0_vec;
   LLVMValueRef row_stride1_vec;
   LLVMValueRef img_stride0_vec;
   LLVMValueRef img_stride1_vec;
   LLVMValueRef data_ptr0;
   LLVMValueRef data_ptr1;
   LLVMValueRef colors0_lo, colors0_hi;
   LLVMValueRef colors1_lo, colors1_hi;


   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &width0_vec, &height0_vec, &depth0_vec,
                               &row_stride0_vec, &img_stride0_vec);
   data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
      lp_build_sample_image_nearest(bld,
                                    width0_vec, height0_vec, depth0_vec,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r,
                                    &colors0_lo, &colors0_hi);
   }
   else {
      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
      lp_build_sample_image_linear(bld,
                                   width0_vec, height0_vec, depth0_vec,
                                   row_stride0_vec, img_stride0_vec,
                                   data_ptr0, s, t, r,
                                   &colors0_lo, &colors0_hi);
   }

   /* Store the first level's colors in the output variables */
   LLVMBuildStore(builder, colors0_lo, colors_lo_var);
   LLVMBuildStore(builder, colors0_hi, colors_hi_var);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      LLVMValueRef h16_scale = LLVMConstReal(LLVMFloatType(), 256.0);
      LLVMTypeRef i32_type = LLVMIntType(32);
      struct lp_build_flow_context *flow_ctx;
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;

      flow_ctx = lp_build_flow_create(builder);

      lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, "");
      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16");
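      /* lod_fpart is now in the same 8.8 fixed-point form as the filter
       * weights: an integer in [0,255] standing for the [0,1) fraction
       * between the two mipmap levels.
       */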

      /* need_lerp = lod_fpart > 0 */
      need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
                                lod_fpart, LLVMConstNull(i32_type),
                                "need_lerp");

      lp_build_if(&if_ctx, flow_ctx, builder, need_lerp);
      {
         struct lp_build_context h16_bld;

         lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16));

         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &width1_vec, &height1_vec, &depth1_vec,
                                     &row_stride1_vec, &img_stride1_vec);
         data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         if (img_filter == PIPE_TEX_FILTER_NEAREST) {
            lp_build_sample_image_nearest(bld,
                                          width1_vec, height1_vec, depth1_vec,
                                          row_stride1_vec, img_stride1_vec,
                                          data_ptr1, s, t, r,
                                          &colors1_lo, &colors1_hi);
         }
         else {
            lp_build_sample_image_linear(bld,
                                         width1_vec, height1_vec, depth1_vec,
                                         row_stride1_vec, img_stride1_vec,
                                         data_ptr1, s, t, r,
                                         &colors1_lo, &colors1_hi);
         }

         /* interpolate samples from the two mipmap levels */

         lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
         lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);

         colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
                                    colors0_lo, colors1_lo);
         colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
                                    colors0_hi, colors1_hi);

         LLVMBuildStore(builder, colors0_lo, colors_lo_var);
         LLVMBuildStore(builder, colors0_hi, colors_hi_var);
      }
      lp_build_endif(&if_ctx);

      lp_build_flow_destroy(flow_ctx);
   }
}



/**
 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
 * formats. 1D/2D/3D/cube textures are supported, with all mipmap sampling
 * modes, but only the simple texture coord wrap modes.
 */
void
lp_build_sample_aos(struct lp_build_sample_context *bld,
                    unsigned unit,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
                    const LLVMValueRef *ddx,
                    const LLVMValueRef *ddy,
                    LLVMValueRef lod_bias, /* optional */
                    LLVMValueRef explicit_lod, /* optional */
                    LLVMValueRef texel_out[4])
{
   struct lp_build_context *int_bld = &bld->int_bld;
   LLVMBuilderRef builder = bld->builder;
   const unsigned mip_filter = bld->static_state->min_mip_filter;
   const unsigned min_filter = bld->static_state->min_img_filter;
   const unsigned mag_filter = bld->static_state->mag_img_filter;
   const unsigned dims = bld->dims;
   LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
   LLVMValueRef ilevel0, ilevel1 = NULL;
   LLVMValueRef packed, packed_lo, packed_hi;
   LLVMValueRef unswizzled[4];
   LLVMValueRef face_ddx[4], face_ddy[4];
   struct lp_build_context h16_bld;
   LLVMTypeRef i32t = LLVMInt32Type();
   LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0);

   /* we only support the common/simple wrap modes at this time */
   assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
   if (dims >= 2)
      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t));
   if (dims >= 3)
      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r));


   /* make 16-bit fixed-pt builder context */
   lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16));

   /* cube face selection, compute pre-face coords, etc. */
   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
      LLVMValueRef face, face_s, face_t;
      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
      s = face_s; /* vec */
      t = face_t; /* vec */
      /* use 'r' to indicate cube face */
      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */

      /* recompute ddx, ddy using the new (s,t) face texcoords */
      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
      face_ddx[2] = NULL;
      face_ddx[3] = NULL;
      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
      face_ddy[2] = NULL;
      face_ddy[3] = NULL;
      ddx = face_ddx;
      ddy = face_ddy;
   }

   /*
    * Compute the level of detail (float).
    */
   if (min_filter != mag_filter ||
       mip_filter != PIPE_TEX_MIPFILTER_NONE) {
      /* Need to compute lod either to choose mipmap levels or to
       * distinguish between minification/magnification with one mipmap level.
       */
      lp_build_lod_selector(bld, unit, ddx, ddy,
                            lod_bias, explicit_lod,
                            mip_filter,
                            &lod_ipart, &lod_fpart);
   } else {
      lod_ipart = i32t_zero;
   }

   /*
    * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
    */
   switch (mip_filter) {
   default:
      assert(0 && "bad mip_filter value in lp_build_sample_aos()");
      /* fall-through */
   case PIPE_TEX_MIPFILTER_NONE:
      /* always use mip level 0 */
      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
          * We should be able to set ilevel0 = const(0) but that causes
          * bad x86 code to be emitted.
          */
         assert(lod_ipart);
         lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
      }
      else {
         ilevel0 = i32t_zero;
      }
      break;
   case PIPE_TEX_MIPFILTER_NEAREST:
      assert(lod_ipart);
      lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
      break;
   case PIPE_TEX_MIPFILTER_LINEAR:
      assert(lod_ipart);
      assert(lod_fpart);
      lp_build_linear_mip_levels(bld, unit,
                                 lod_ipart, &lod_fpart,
                                 &ilevel0, &ilevel1);
      break;
   }

   /*
    * Get/interpolate texture colors.
    */

   packed_lo = lp_build_alloca(builder, h16_bld.vec_type, "packed_lo");
   packed_hi = lp_build_alloca(builder, h16_bld.vec_type, "packed_hi");
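   /*
    * The results come back through these allocas because
    * lp_build_sample_mipmap() may store them from within generated if/else
    * branches; LLVM's mem2reg pass can promote them back to SSA values.
    */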

   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld,
                             min_filter, mip_filter,
                             s, t, r,
                             ilevel0, ilevel1, lod_fpart,
                             packed_lo, packed_hi);
   }
   else {
      /* Emit conditional to choose min image filter or mag image filter
       * depending on the lod being >= 0 or < 0, respectively.
       */
      struct lp_build_flow_context *flow_ctx;
      struct lp_build_if_state if_ctx;
      LLVMValueRef minify;

      flow_ctx = lp_build_flow_create(builder);

      /* minify = lod_ipart >= 0 */
      minify = LLVMBuildICmp(builder, LLVMIntSGE,
                             lod_ipart, int_bld->zero, "");

      lp_build_if(&if_ctx, flow_ctx, builder, minify);
      {
         /* Use the minification filter */
         lp_build_sample_mipmap(bld,
                                min_filter, mip_filter,
                                s, t, r,
                                ilevel0, ilevel1, lod_fpart,
                                packed_lo, packed_hi);
      }
      lp_build_else(&if_ctx);
      {
         /* Use the magnification filter */
         lp_build_sample_mipmap(bld,
                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                s, t, r,
                                i32t_zero, NULL, NULL,
                                packed_lo, packed_hi);
      }
      lp_build_endif(&if_ctx);

      lp_build_flow_destroy(flow_ctx);
   }

   /*
    * combine the values stored in 'packed_lo' and 'packed_hi' variables
    * into 'packed'
    */
   packed = lp_build_pack2(builder,
                           h16_bld.type, lp_type_unorm(8),
                           LLVMBuildLoad(builder, packed_lo, ""),
                           LLVMBuildLoad(builder, packed_hi, ""));
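   /* packed now holds all four pixels as 16 x u8: the two 8 x i16 halves
    * (values 0..255) are narrowed back to bytes and concatenated.
    */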

   /*
    * Convert to SoA and swizzle.
    */
   lp_build_rgba8_to_f32_soa(builder,
                             bld->texel_type,
                             packed, unswizzled);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      lp_build_format_swizzle_soa(bld->format_desc,
                                  &bld->texel_bld,
                                  unswizzled, texel_out);
   }
   else {
      texel_out[0] = unswizzled[0];
      texel_out[1] = unswizzled[1];
      texel_out[2] = unswizzled[2];
      texel_out[3] = unswizzled[3];
   }

   apply_sampler_swizzle(bld, texel_out);
}