src/gallium/drivers/cell/spu/spu_texture.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 #include <math.h>
  30
  31 #include "pipe/p_compiler.h"
  32 #include "spu_main.h"
  33 #include "spu_texture.h"
  34 #include "spu_tile.h"
  35 #include "spu_colorpack.h"
  36 #include "spu_dcache.h"
  37
  38
  39 /**
  40  * Mark all tex cache entries as invalid.
  41  */
  42 void
  43 invalidate_tex_cache(void)
  44 {
  45    uint lvl;
  46    for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
  47       uint unit = 0;
  48       uint bytes = 4 * spu.texture[unit].level[lvl].width
  49          * spu.texture[unit].level[lvl].height;
  50
  51       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
  52    }
  53 }
  54
  55
  56 /**
  57  * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
  58  *
  59  * NOTE: in the typical case of bilinear filtering, the four texels
  60  * are in a 2x2 group so we could get by with just two dcache fetches
  61  * (two side-by-side texels per fetch).  But when bilinear filtering
  62  * wraps around a texture edge, we'll probably need code like we have
  63  * now.
  64  * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
  65  * it's quite likely that the four pixels in a quad will need some of the
  66  * same texels.  So look into doing texture fetches for four pixels at
  67  * a time.
  68  */
  69 static void
  70 get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
  71                 vec_uint4 *texels)
  72 {
  73    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
  74    const unsigned texture_ea = (uintptr_t) tlevel->start;
  75    const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
  76    const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
  77    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
  78    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
  79
  80    const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
  81    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
  82
  83    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
  84    tile_offset = si_mpy((qword) tile_offset, tile_size);
  85
  86    qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
  87    texel_offset = si_mpyui(texel_offset, 4);
  88
  89    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
  90
  91    spu_dcache_fetch_unaligned((qword *) & texels[0],
  92                               texture_ea + spu_extract(offset, 0), 4);
  93    spu_dcache_fetch_unaligned((qword *) & texels[1],
  94                               texture_ea + spu_extract(offset, 1), 4);
  95    spu_dcache_fetch_unaligned((qword *) & texels[2],
  96                               texture_ea + spu_extract(offset, 2), 4);
  97    spu_dcache_fetch_unaligned((qword *) & texels[3],
  98                               texture_ea + spu_extract(offset, 3), 4);
  99 }
 100
 101
 102
 103 /**
 104  * Do nearest texture sampling for four pixels.
 105  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
 106  */
 107 void
 108 sample_texture4_nearest(vector float s, vector float t,
 109                         vector float r, vector float q,
 110                         uint unit, uint level, vector float colors[4])
 111 {
 112    vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
 113    vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
 114    vector unsigned int is = spu_convtu(ss, 0);
 115    vector unsigned int it = spu_convtu(tt, 0);
 116    vec_uint4 texels[4];
 117
 118    /* PIPE_TEX_WRAP_REPEAT */
 119    is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
 120    it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
 121
 122    get_four_texels(unit, level, is, it, texels);
 123
 124    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
 125    spu_unpack_A8R8G8B8_transpose4(texels, colors);
 126 }
 127
 128
 129 /**
 130  * Do bilinear texture sampling for four pixels.
 131  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
 132  */
 133 void
 134 sample_texture4_bilinear(vector float s, vector float t,
 135                          vector float r, vector float q,
 136                          uint unit, uint level, vector float colors[4])
 137 {
 138    vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
 139    vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
 140
 141    vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
 142    vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
 143
 144    /* is + 1, it + 1 */
 145    vector unsigned int is1 = spu_add(is0, 1);
 146    vector unsigned int it1 = spu_add(it0, 1);
 147
 148    /* PIPE_TEX_WRAP_REPEAT */
 149    is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
 150    it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
 151    is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
 152    it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 153
 154    /* get packed int texels */
 155    vector unsigned int texels[16];
 156    get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
 157    get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
 158    get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
 159    get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 160
 161    /* XXX possibly rework following code to compute the weighted sample
 162     * colors with integer arithmetic for fewer int->float conversions.
 163     */
 164
 165    /* convert packed int texels to float colors */
 166    vector float ftexels[16];
 167    spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
 168    spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
 169    spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
 170    spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
 171
 172    /* Compute weighting factors in [0,1]
 173     * Multiply texcoord by 1024, AND with 1023, convert back to float.
 174     */
 175    vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
 176    vector signed int iss1024 = spu_convts(ss1024, 0);
 177    iss1024 = spu_and(iss1024, 1023);
 178    vector float sWeights0 = spu_convtf(iss1024, 10);
 179
 180    vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
 181    vector signed int itt1024 = spu_convts(tt1024, 0);
 182    itt1024 = spu_and(itt1024, 1023);
 183    vector float tWeights0 = spu_convtf(itt1024, 10);
 184
 185    /* 1 - sWeight and 1 - tWeight */
 186    vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
 187    vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
 188
 189    /* reds, for four pixels */
 190    ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
 191    ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
 192    ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
 193    ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
 194    colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
 195                        spu_add(ftexels[8], ftexels[12]));
 196
 197    /* greens, for four pixels */
 198    ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
 199    ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
 200    ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
 201    ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
 202    colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
 203                        spu_add(ftexels[9], ftexels[13]));
 204
 205    /* blues, for four pixels */
 206    ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
 207    ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
 208    ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
 209    ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
 210    colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
 211                        spu_add(ftexels[10], ftexels[14]));
 212
 213    /* alphas, for four pixels */
 214    ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
 215    ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
 216    ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
 217    ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
 218    colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
 219                        spu_add(ftexels[11], ftexels[15]));
 220 }
 221
 222
 223
 224 /**
 225  * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
 226  */
 227 static INLINE void
 228 transpose(vector unsigned int *mOut0,
 229           vector unsigned int *mOut1,
 230           vector unsigned int *mOut2,
 231           vector unsigned int *mOut3,
 232           vector unsigned int *mIn)
 233 {
 234   vector unsigned int abcd, efgh, ijkl, mnop;   /* input vectors */
 235   vector unsigned int aeim, bfjn, cgko, dhlp;   /* output vectors */
 236   vector unsigned int aibj, ckdl, emfn, gohp;   /* intermediate vectors */
 237
 238   vector unsigned char shufflehi = ((vector unsigned char) {
 239                                                0x00, 0x01, 0x02, 0x03,
 240                                                0x10, 0x11, 0x12, 0x13,
 241                                                0x04, 0x05, 0x06, 0x07,
 242                                                0x14, 0x15, 0x16, 0x17});
 243   vector unsigned char shufflelo = ((vector unsigned char) {
 244                                                0x08, 0x09, 0x0A, 0x0B,
 245                                                0x18, 0x19, 0x1A, 0x1B,
 246                                                0x0C, 0x0D, 0x0E, 0x0F,
 247                                                0x1C, 0x1D, 0x1E, 0x1F});
 248   abcd = *(mIn+0);
 249   efgh = *(mIn+1);
 250   ijkl = *(mIn+2);
 251   mnop = *(mIn+3);
 252
 253   aibj = spu_shuffle(abcd, ijkl, shufflehi);
 254   ckdl = spu_shuffle(abcd, ijkl, shufflelo);
 255   emfn = spu_shuffle(efgh, mnop, shufflehi);
 256   gohp = spu_shuffle(efgh, mnop, shufflelo);
 257
 258   aeim = spu_shuffle(aibj, emfn, shufflehi);
 259   bfjn = spu_shuffle(aibj, emfn, shufflelo);
 260   cgko = spu_shuffle(ckdl, gohp, shufflehi);
 261   dhlp = spu_shuffle(ckdl, gohp, shufflelo);
 262
 263   *mOut0 = aeim;
 264   *mOut1 = bfjn;
 265   *mOut2 = cgko;
 266   *mOut3 = dhlp;
 267 }
 268
 269
 270 /**
 271  * Bilinear filtering, using int intead of float arithmetic
 272  */
 273 void
 274 sample_texture4_bilinear_2(vector float s, vector float t,
 275                            vector float r, vector float q,
 276                            uint unit, uint level, vector float colors[4])
 277 {
 278    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
 279    /* Scale texcoords by size of texture, and add half pixel bias */
 280    vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
 281    vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
 282
 283    /* convert float coords to fixed-pt coords with 8 fraction bits */
 284    vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
 285    vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
 286
 287    /* compute integer texel weights in [0, 255] */
 288    vector signed int sWeights0 = spu_and((vector signed int) is, 255);
 289    vector signed int tWeights0 = spu_and((vector signed int) it, 255);
 290    vector signed int sWeights1 = spu_sub(255, sWeights0);
 291    vector signed int tWeights1 = spu_sub(255, tWeights0);
 292
 293    /* texel coords: is0 = is / 256, it0 = is / 256 */
 294    vector unsigned int is0 = spu_rlmask(is, -8);
 295    vector unsigned int it0 = spu_rlmask(it, -8);
 296
 297    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
 298    vector unsigned int is1 = spu_add(is0, 1);
 299    vector unsigned int it1 = spu_add(it0, 1);
 300
 301    /* PIPE_TEX_WRAP_REPEAT */
 302    is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
 303    it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
 304    is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
 305    it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 306
 307    /* get packed int texels */
 308    vector unsigned int texels[16];
 309    get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
 310    get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
 311    get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
 312    get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 313
 314    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
 315    {
 316       static const unsigned char ZERO = 0x80;
 317       int i;
 318       for (i = 0; i < 16; i++) {
 319          texels[i] = spu_shuffle(texels[i], texels[i],
 320                                  ((vector unsigned char) {
 321                                     ZERO, ZERO, ZERO, 1,
 322                                     ZERO, ZERO, ZERO, 2,
 323                                     ZERO, ZERO, ZERO, 3,
 324                                     ZERO, ZERO, ZERO, 0}));
 325       }
 326    }
 327
 328    /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
 329    vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
 330       texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
 331    transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
 332    transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
 333    transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
 334    transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
 335
 336    /* computed weighted colors */
 337    vector unsigned int c0, c1, c2, c3, cSum;
 338
 339    /* red */
 340    c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 341    c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 342    c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 343    c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 344    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 345    colors[0] = spu_convtf(cSum, 24);
 346
 347    /* green */
 348    c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 349    c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 350    c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 351    c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 352    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 353    colors[1] = spu_convtf(cSum, 24);
 354
 355    /* blue */
 356    c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 357    c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 358    c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 359    c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 360    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 361    colors[2] = spu_convtf(cSum, 24);
 362
 363    /* alpha */
 364    c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
 365    c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
 366    c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
 367    c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
 368    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 369    colors[3] = spu_convtf(cSum, 24);
 370 }
 371
 372
 373
 374 /**
 375  * Compute level of detail factor from texcoords.
 376  */
 377 static float
 378 compute_lambda(uint unit, vector float s, vector float t)
 379 {
 380    uint baseLevel = 0;
 381    float width = spu.texture[unit].level[baseLevel].width;
 382    float height = spu.texture[unit].level[baseLevel].width;
 383    float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
 384    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
 385    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
 386    float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
 387    float x = dsdx * dsdx + dtdx * dtdx;
 388    float y = dsdy * dsdy + dtdy * dtdy;
 389    float rho = x > y ? x : y;
 390    rho = sqrtf(rho);
 391    float lambda = logf(rho) * 1.442695f;
 392    return lambda;
 393 }
 394
 395
 396
 397 /**
 398  * Texture sampling with level of detail selection.
 399  */
 400 void
 401 sample_texture4_lod(vector float s, vector float t,
 402                     vector float r, vector float q,
 403                     uint unit, uint level_ignored, vector float colors[4])
 404 {
 405    /*
 406     * Note that we're computing a lambda/lod here that's used for all
 407     * four pixels in the quad.
 408     */
 409    float lambda = compute_lambda(unit, s, t);
 410
 411    /* apply lod bias */
 412    lambda += spu.sampler[unit].lod_bias;
 413
 414    /* clamp */
 415    if (lambda < spu.sampler[unit].min_lod)
 416       lambda = spu.sampler[unit].min_lod;
 417    else if (lambda > spu.sampler[unit].max_lod)
 418       lambda = spu.sampler[unit].max_lod;
 419
 420    if (lambda <= 0.0f) {
 421       /* magnify */
 422       spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
 423    }
 424    else {
 425       /* minify */
 426       int level = (int) (lambda + 0.5f);
 427       if (level > (int) spu.texture[unit].max_level)
 428          level = spu.texture[unit].max_level;
 429       spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
 430       /* XXX to do: mipmap level interpolation */
 431    }
 432 }
 433