src/gallium/drivers/cell/spu/spu_texture.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 #include <math.h>
  30
  31 #include "pipe/p_compiler.h"
  32 #include "spu_main.h"
  33 #include "spu_texture.h"
  34 #include "spu_tile.h"
  35 #include "spu_colorpack.h"
  36 #include "spu_dcache.h"
  37
  38
  39 /**
  40  * Mark all tex cache entries as invalid.
  41  */
  42 void
  43 invalidate_tex_cache(void)
  44 {
  45    uint lvl;
  46    for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
  47       uint unit = 0;
  48       uint bytes = 4 * spu.texture[unit].level[lvl].width
  49          * spu.texture[unit].level[lvl].height;
  50
  51       if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
  52          bytes *= 6;
  53       else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
  54          bytes *= spu.texture[unit].level[lvl].depth;
  55
  56       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
  57    }
  58 }
  59
  60
  61 /**
  62  * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
  63  *
  64  * NOTE: in the typical case of bilinear filtering, the four texels
  65  * are in a 2x2 group so we could get by with just two dcache fetches
  66  * (two side-by-side texels per fetch).  But when bilinear filtering
  67  * wraps around a texture edge, we'll probably need code like we have
  68  * now.
  69  * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
  70  * it's quite likely that the four pixels in a quad will need some of the
  71  * same texels.  So look into doing texture fetches for four pixels at
  72  * a time.
  73  */
  74 static void
  75 get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
  76                 vec_uint4 *texels)
  77 {
  78    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
  79    unsigned texture_ea = (uintptr_t) tlevel->start;
  80    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
  81    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
  82    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
  83    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
  84
  85    const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
  86    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
  87
  88    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
  89    tile_offset = si_mpy((qword) tile_offset, tile_size);
  90
  91    qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
  92    texel_offset = si_mpyui(texel_offset, 4);
  93
  94    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
  95
  96    texture_ea = texture_ea + face * tlevel->bytes_per_image;
  97
  98    spu_dcache_fetch_unaligned((qword *) & texels[0],
  99                               texture_ea + spu_extract(offset, 0), 4);
 100    spu_dcache_fetch_unaligned((qword *) & texels[1],
 101                               texture_ea + spu_extract(offset, 1), 4);
 102    spu_dcache_fetch_unaligned((qword *) & texels[2],
 103                               texture_ea + spu_extract(offset, 2), 4);
 104    spu_dcache_fetch_unaligned((qword *) & texels[3],
 105                               texture_ea + spu_extract(offset, 3), 4);
 106 }
 107
 108
 109 /** clamp vec to [0, max] */
 110 static INLINE vector signed int
 111 spu_clamp(vector signed int vec, vector signed int max)
 112 {
 113    static const vector signed int zero = {0,0,0,0};
 114    vector unsigned int c;
 115    c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
 116    vec = spu_sel(zero, vec, c);
 117    c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
 118    vec = spu_sel(vec, max, c);
 119    return vec;
 120 }
 121
 122
 123
 124 /**
 125  * Do nearest texture sampling for four pixels.
 126  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
 127  */
 128 void
 129 sample_texture4_nearest(vector float s, vector float t,
 130                         vector float r, vector float q,
 131                         uint unit, uint level, uint face,
 132                         vector float colors[4])
 133 {
 134    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
 135    vector float ss = spu_mul(s, tlevel->scale_s);
 136    vector float tt = spu_mul(t, tlevel->scale_t);
 137    vector signed int is = spu_convts(ss, 0);
 138    vector signed int it = spu_convts(tt, 0);
 139    vec_uint4 texels[4];
 140
 141    /* PIPE_TEX_WRAP_REPEAT */
 142    is = spu_and(is, tlevel->mask_s);
 143    it = spu_and(it, tlevel->mask_t);
 144
 145    /* PIPE_TEX_WRAP_CLAMP */
 146    is = spu_clamp(is, tlevel->max_s);
 147    it = spu_clamp(it, tlevel->max_t);
 148
 149    get_four_texels(unit, level, face, is, it, texels);
 150
 151    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
 152    spu_unpack_A8R8G8B8_transpose4(texels, colors);
 153 }
 154
 155
 156 /**
 157  * Do bilinear texture sampling for four pixels.
 158  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
 159  */
 160 void
 161 sample_texture4_bilinear(vector float s, vector float t,
 162                          vector float r, vector float q,
 163                          uint unit, uint level, uint face,
 164                          vector float colors[4])
 165 {
 166    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
 167    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
 168
 169    vector float ss = spu_madd(s, tlevel->scale_s, half);
 170    vector float tt = spu_madd(t, tlevel->scale_t, half);
 171
 172    vector signed int is0 = spu_convts(ss, 0);
 173    vector signed int it0 = spu_convts(tt, 0);
 174
 175    /* is + 1, it + 1 */
 176    vector signed int is1 = spu_add(is0, 1);
 177    vector signed int it1 = spu_add(it0, 1);
 178
 179    /* PIPE_TEX_WRAP_REPEAT */
 180    is0 = spu_and(is0, tlevel->mask_s);
 181    it0 = spu_and(it0, tlevel->mask_t);
 182    is1 = spu_and(is1, tlevel->mask_s);
 183    it1 = spu_and(it1, tlevel->mask_t);
 184
 185    /* PIPE_TEX_WRAP_CLAMP */
 186    is0 = spu_clamp(is0, tlevel->max_s);
 187    it0 = spu_clamp(it0, tlevel->max_t);
 188    is1 = spu_clamp(is1, tlevel->max_s);
 189    it1 = spu_clamp(it1, tlevel->max_t);
 190
 191    /* get packed int texels */
 192    vector unsigned int texels[16];
 193    get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
 194    get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
 195    get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
 196    get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 197
 198    /* XXX possibly rework following code to compute the weighted sample
 199     * colors with integer arithmetic for fewer int->float conversions.
 200     */
 201
 202    /* convert packed int texels to float colors */
 203    vector float ftexels[16];
 204    spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
 205    spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
 206    spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
 207    spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
 208
 209    /* Compute weighting factors in [0,1]
 210     * Multiply texcoord by 1024, AND with 1023, convert back to float.
 211     */
 212    vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
 213    vector signed int iss1024 = spu_convts(ss1024, 0);
 214    iss1024 = spu_and(iss1024, 1023);
 215    vector float sWeights0 = spu_convtf(iss1024, 10);
 216
 217    vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
 218    vector signed int itt1024 = spu_convts(tt1024, 0);
 219    itt1024 = spu_and(itt1024, 1023);
 220    vector float tWeights0 = spu_convtf(itt1024, 10);
 221
 222    /* 1 - sWeight and 1 - tWeight */
 223    vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
 224    vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
 225
 226    /* reds, for four pixels */
 227    ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
 228    ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
 229    ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
 230    ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
 231    colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
 232                        spu_add(ftexels[8], ftexels[12]));
 233
 234    /* greens, for four pixels */
 235    ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
 236    ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
 237    ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
 238    ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
 239    colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
 240                        spu_add(ftexels[9], ftexels[13]));
 241
 242    /* blues, for four pixels */
 243    ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
 244    ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
 245    ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
 246    ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
 247    colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
 248                        spu_add(ftexels[10], ftexels[14]));
 249
 250    /* alphas, for four pixels */
 251    ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
 252    ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
 253    ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
 254    ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
 255    colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
 256                        spu_add(ftexels[11], ftexels[15]));
 257 }
 258
 259
 260
 261 /**
 262  * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
 263  */
 264 static INLINE void
 265 transpose(vector unsigned int *mOut0,
 266           vector unsigned int *mOut1,
 267           vector unsigned int *mOut2,
 268           vector unsigned int *mOut3,
 269           vector unsigned int *mIn)
 270 {
 271   vector unsigned int abcd, efgh, ijkl, mnop;   /* input vectors */
 272   vector unsigned int aeim, bfjn, cgko, dhlp;   /* output vectors */
 273   vector unsigned int aibj, ckdl, emfn, gohp;   /* intermediate vectors */
 274
 275   vector unsigned char shufflehi = ((vector unsigned char) {
 276                                                0x00, 0x01, 0x02, 0x03,
 277                                                0x10, 0x11, 0x12, 0x13,
 278                                                0x04, 0x05, 0x06, 0x07,
 279                                                0x14, 0x15, 0x16, 0x17});
 280   vector unsigned char shufflelo = ((vector unsigned char) {
 281                                                0x08, 0x09, 0x0A, 0x0B,
 282                                                0x18, 0x19, 0x1A, 0x1B,
 283                                                0x0C, 0x0D, 0x0E, 0x0F,
 284                                                0x1C, 0x1D, 0x1E, 0x1F});
 285   abcd = *(mIn+0);
 286   efgh = *(mIn+1);
 287   ijkl = *(mIn+2);
 288   mnop = *(mIn+3);
 289
 290   aibj = spu_shuffle(abcd, ijkl, shufflehi);
 291   ckdl = spu_shuffle(abcd, ijkl, shufflelo);
 292   emfn = spu_shuffle(efgh, mnop, shufflehi);
 293   gohp = spu_shuffle(efgh, mnop, shufflelo);
 294
 295   aeim = spu_shuffle(aibj, emfn, shufflehi);
 296   bfjn = spu_shuffle(aibj, emfn, shufflelo);
 297   cgko = spu_shuffle(ckdl, gohp, shufflehi);
 298   dhlp = spu_shuffle(ckdl, gohp, shufflelo);
 299
 300   *mOut0 = aeim;
 301   *mOut1 = bfjn;
 302   *mOut2 = cgko;
 303   *mOut3 = dhlp;
 304 }
 305
 306
 307 /**
 308  * Bilinear filtering, using int intead of float arithmetic
 309  */
 310 void
 311 sample_texture4_bilinear_2(vector float s, vector float t,
 312                            vector float r, vector float q,
 313                            uint unit, uint level, uint face,
 314                            vector float colors[4])
 315 {
 316    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
 317    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
 318
 319    /* Scale texcoords by size of texture, and add half pixel bias */
 320    vector float ss = spu_madd(s, tlevel->scale_s, half);
 321    vector float tt = spu_madd(t, tlevel->scale_t, half);
 322
 323    /* convert float coords to fixed-pt coords with 8 fraction bits */
 324    vector signed int is = spu_convts(ss, 8);
 325    vector signed int it = spu_convts(tt, 8);
 326
 327    /* compute integer texel weights in [0, 255] */
 328    vector signed int sWeights0 = spu_and(is, 255);
 329    vector signed int tWeights0 = spu_and(it, 255);
 330    vector signed int sWeights1 = spu_sub(255, sWeights0);
 331    vector signed int tWeights1 = spu_sub(255, tWeights0);
 332
 333    /* texel coords: is0 = is / 256, it0 = is / 256 */
 334    vector signed int is0 = spu_rlmask(is, -8);
 335    vector signed int it0 = spu_rlmask(it, -8);
 336
 337    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
 338    vector signed int is1 = spu_add(is0, 1);
 339    vector signed int it1 = spu_add(it0, 1);
 340
 341    /* PIPE_TEX_WRAP_REPEAT */
 342    is0 = spu_and(is0, tlevel->mask_s);
 343    it0 = spu_and(it0, tlevel->mask_t);
 344    is1 = spu_and(is1, tlevel->mask_s);
 345    it1 = spu_and(it1, tlevel->mask_t);
 346
 347    /* PIPE_TEX_WRAP_CLAMP */
 348    is0 = spu_clamp(is0, tlevel->max_s);
 349    it0 = spu_clamp(it0, tlevel->max_t);
 350    is1 = spu_clamp(is1, tlevel->max_s);
 351    it1 = spu_clamp(it1, tlevel->max_t);
 352
 353    /* get packed int texels */
 354    vector unsigned int texels[16];
 355    get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
 356    get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
 357    get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
 358    get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 359
 360    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
 361    {
 362       static const unsigned char ZERO = 0x80;
 363       int i;
 364       for (i = 0; i < 16; i++) {
 365          texels[i] = spu_shuffle(texels[i], texels[i],
 366                                  ((vector unsigned char) {
 367                                     ZERO, ZERO, ZERO, 1,
 368                                     ZERO, ZERO, ZERO, 2,
 369                                     ZERO, ZERO, ZERO, 3,
 370                                     ZERO, ZERO, ZERO, 0}));
 371       }
 372    }
 373
 374    /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
 375    vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
 376       texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
 377    transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
 378    transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
 379    transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
 380    transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
 381
 382    /* computed weighted colors */
 383    vector unsigned int c0, c1, c2, c3, cSum;
 384
 385    /* red */
 386    c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 387    c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 388    c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 389    c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 390    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 391    colors[0] = spu_convtf(cSum, 24);
 392
 393    /* green */
 394    c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 395    c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 396    c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 397    c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 398    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 399    colors[1] = spu_convtf(cSum, 24);
 400
 401    /* blue */
 402    c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 403    c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 404    c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 405    c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 406    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 407    colors[2] = spu_convtf(cSum, 24);
 408
 409    /* alpha */
 410    c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 411    c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 412    c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 413    c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 414    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 415    colors[3] = spu_convtf(cSum, 24);
 416 }
 417
 418
 419
 420 /**
 421  * Compute level of detail factor from texcoords.
 422  */
 423 static float
 424 compute_lambda(uint unit, vector float s, vector float t)
 425 {
 426    uint baseLevel = 0;
 427    float width = spu.texture[unit].level[baseLevel].width;
 428    float height = spu.texture[unit].level[baseLevel].width;
 429    float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
 430    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
 431    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
 432    float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
 433    float x = dsdx * dsdx + dtdx * dtdx;
 434    float y = dsdy * dsdy + dtdy * dtdy;
 435    float rho = x > y ? x : y;
 436    rho = sqrtf(rho);
 437    float lambda = logf(rho) * 1.442695f;
 438    return lambda;
 439 }
 440
 441
 442
 443 /**
 444  * Texture sampling with level of detail selection.
 445  */
 446 void
 447 sample_texture4_lod(vector float s, vector float t,
 448                     vector float r, vector float q,
 449                     uint unit, uint level_ignored, uint face,
 450                     vector float colors[4])
 451 {
 452    /*
 453     * Note that we're computing a lambda/lod here that's used for all
 454     * four pixels in the quad.
 455     */
 456    float lambda = compute_lambda(unit, s, t);
 457
 458    /* apply lod bias */
 459    lambda += spu.sampler[unit].lod_bias;
 460
 461    /* clamp */
 462    if (lambda < spu.sampler[unit].min_lod)
 463       lambda = spu.sampler[unit].min_lod;
 464    else if (lambda > spu.sampler[unit].max_lod)
 465       lambda = spu.sampler[unit].max_lod;
 466
 467    if (lambda <= 0.0f) {
 468       /* magnify */
 469       spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
 470    }
 471    else {
 472       /* minify */
 473       int level = (int) (lambda + 0.5f);
 474       if (level > (int) spu.texture[unit].max_level)
 475          level = spu.texture[unit].max_level;
 476       spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
 477       /* XXX to do: mipmap level interpolation */
 478    }
 479 }
 480
 481
 482 /** XXX need a SIMD version of this */
 483 static unsigned
 484 choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 485 {
 486    /*
 487       major axis
 488       direction     target                             sc     tc    ma
 489       ----------    -------------------------------    ---    ---   ---
 490        +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
 491        -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
 492        +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
 493        -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
 494        +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
 495        -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
 496    */
 497    const float arx = fabsf(rx);
 498    const float ary = fabsf(ry);
 499    const float arz = fabsf(rz);
 500    unsigned face;
 501    float sc, tc, ma;
 502
 503    if (arx > ary && arx > arz) {
 504       if (rx >= 0.0F) {
 505          face = PIPE_TEX_FACE_POS_X;
 506          sc = -rz;
 507          tc = -ry;
 508          ma = arx;
 509       }
 510       else {
 511          face = PIPE_TEX_FACE_NEG_X;
 512          sc = rz;
 513          tc = -ry;
 514          ma = arx;
 515       }
 516    }
 517    else if (ary > arx && ary > arz) {
 518       if (ry >= 0.0F) {
 519          face = PIPE_TEX_FACE_POS_Y;
 520          sc = rx;
 521          tc = rz;
 522          ma = ary;
 523       }
 524       else {
 525          face = PIPE_TEX_FACE_NEG_Y;
 526          sc = rx;
 527          tc = -rz;
 528          ma = ary;
 529       }
 530    }
 531    else {
 532       if (rz > 0.0F) {
 533          face = PIPE_TEX_FACE_POS_Z;
 534          sc = rx;
 535          tc = -ry;
 536          ma = arz;
 537       }
 538       else {
 539          face = PIPE_TEX_FACE_NEG_Z;
 540          sc = -rx;
 541          tc = -ry;
 542          ma = arz;
 543       }
 544    }
 545
 546    *newS = (sc / ma + 1.0F) * 0.5F;
 547    *newT = (tc / ma + 1.0F) * 0.5F;
 548
 549    return face;
 550 }
 551
 552
 553
 554 void
 555 sample_texture4_cube(vector float s, vector float t,
 556                      vector float r, vector float q,
 557                      uint unit, uint level, uint face_ignored,
 558                      vector float colors[4])
 559 {
 560    static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
 561    uint p, faces[4];
 562    float newS[4], newT[4];
 563
 564    /* Compute cube face referenced by the four sets of texcoords.
 565     * XXX we should SIMD-ize this.
 566     */
 567    for (p = 0; p < 4; p++) {
 568       float rx = spu_extract(s, p);
 569       float ry = spu_extract(t, p);
 570       float rz = spu_extract(r, p);
 571       faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
 572    }
 573
 574    if (faces[0] == faces[1] &&
 575        faces[0] == faces[2] &&
 576        faces[0] == faces[3]) {
 577       /* GOOD!  All four texcoords refer to the same cube face */
 578       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
 579       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
 580       sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
 581    }
 582    else {
 583       /* BAD!  The four texcoords refer to different faces */
 584       for (p = 0; p < 4; p++) {
 585          vector float c[4];
 586
 587          sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
 588                                  zero, zero, unit, level, faces[p], c);
 589
 590          float red = spu_extract(c[0], p);
 591          float green = spu_extract(c[1], p);
 592          float blue = spu_extract(c[2], p);
 593          float alpha = spu_extract(c[3], p);
 594
 595          colors[0] = spu_insert(red,   colors[0], p);
 596          colors[1] = spu_insert(green, colors[1], p);
 597          colors[2] = spu_insert(blue,  colors[2], p);
 598          colors[3] = spu_insert(alpha, colors[3], p);
 599       }
 600    }
 601 }