83cf7dc39423388719bde31b3ccab122a51007f3
[mesa.git] / src / gallium / drivers / cell / spu / spu_texture.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include <math.h>
30
31 #include "pipe/p_compiler.h"
32 #include "spu_main.h"
33 #include "spu_texture.h"
34 #include "spu_tile.h"
35 #include "spu_colorpack.h"
36 #include "spu_dcache.h"
37
38
39 /**
40 * Mark all tex cache entries as invalid.
41 */
42 void
43 invalidate_tex_cache(void)
44 {
45 uint lvl;
46 for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
47 uint unit = 0;
48 uint bytes = 4 * spu.texture[unit].level[lvl].width
49 * spu.texture[unit].level[lvl].height;
50
51 spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
52 }
53 }
54
55
56 /**
57 * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
58 *
59 * NOTE: in the typical case of bilinear filtering, the four texels
60 * are in a 2x2 group so we could get by with just two dcache fetches
61 * (two side-by-side texels per fetch). But when bilinear filtering
62 * wraps around a texture edge, we'll probably need code like we have
63 * now.
64 * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
65 * it's quite likely that the four pixels in a quad will need some of the
66 * same texels. So look into doing texture fetches for four pixels at
67 * a time.
68 */
69 static void
70 get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
71 vec_uint4 *texels)
72 {
73 const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
74 const unsigned texture_ea = (uintptr_t) tlevel->start;
75 const vec_uint4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */
76 const vec_uint4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */
77 const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
78 const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
79
80 const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
81 const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
82
83 qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
84 tile_offset = si_mpy((qword) tile_offset, tile_size);
85
86 qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
87 texel_offset = si_mpyui(texel_offset, 4);
88
89 vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
90
91 spu_dcache_fetch_unaligned((qword *) & texels[0],
92 texture_ea + spu_extract(offset, 0), 4);
93 spu_dcache_fetch_unaligned((qword *) & texels[1],
94 texture_ea + spu_extract(offset, 1), 4);
95 spu_dcache_fetch_unaligned((qword *) & texels[2],
96 texture_ea + spu_extract(offset, 2), 4);
97 spu_dcache_fetch_unaligned((qword *) & texels[3],
98 texture_ea + spu_extract(offset, 3), 4);
99 }
100
101
102
103 /**
104 * Do nearest texture sampling for four pixels.
105 * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
106 */
107 void
108 sample_texture4_nearest(vector float s, vector float t,
109 vector float r, vector float q,
110 uint unit, uint level, vector float colors[4])
111 {
112 vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
113 vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
114 vector unsigned int is = spu_convtu(ss, 0);
115 vector unsigned int it = spu_convtu(tt, 0);
116 vec_uint4 texels[4];
117
118 /* PIPE_TEX_WRAP_REPEAT */
119 is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
120 it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
121
122 get_four_texels(unit, level, is, it, texels);
123
124 /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
125 spu_unpack_A8R8G8B8_transpose4(texels, colors);
126 }
127
128
129 /**
130 * Do bilinear texture sampling for four pixels.
131 * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
132 */
133 void
134 sample_texture4_bilinear(vector float s, vector float t,
135 vector float r, vector float q,
136 uint unit, uint level, vector float colors[4])
137 {
138 vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, spu_splats(-0.5f));
139 vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
140
141 vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
142 vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
143
144 /* is + 1, it + 1 */
145 vector unsigned int is1 = spu_add(is0, 1);
146 vector unsigned int it1 = spu_add(it0, 1);
147
148 /* PIPE_TEX_WRAP_REPEAT */
149 is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
150 it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
151 is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
152 it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
153
154 /* get packed int texels */
155 vector unsigned int texels[16];
156 get_four_texels(unit, level, is0, it0, texels + 0); /* upper-left */
157 get_four_texels(unit, level, is1, it0, texels + 4); /* upper-right */
158 get_four_texels(unit, level, is0, it1, texels + 8); /* lower-left */
159 get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
160
161 /* XXX possibly rework following code to compute the weighted sample
162 * colors with integer arithmetic for fewer int->float conversions.
163 */
164
165 /* convert packed int texels to float colors */
166 vector float ftexels[16];
167 spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
168 spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
169 spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
170 spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
171
172 /* Compute weighting factors in [0,1]
173 * Multiply texcoord by 1024, AND with 1023, convert back to float.
174 */
175 vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
176 vector signed int iss1024 = spu_convts(ss1024, 0);
177 iss1024 = spu_and(iss1024, 1023);
178 vector float sWeights0 = spu_convtf(iss1024, 10);
179
180 vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
181 vector signed int itt1024 = spu_convts(tt1024, 0);
182 itt1024 = spu_and(itt1024, 1023);
183 vector float tWeights0 = spu_convtf(itt1024, 10);
184
185 /* 1 - sWeight and 1 - tWeight */
186 vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
187 vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
188
189 /* reds, for four pixels */
190 ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
191 ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
192 ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
193 ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
194 colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
195 spu_add(ftexels[8], ftexels[12]));
196
197 /* greens, for four pixels */
198 ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
199 ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
200 ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
201 ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
202 colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
203 spu_add(ftexels[9], ftexels[13]));
204
205 /* blues, for four pixels */
206 ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
207 ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
208 ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
209 ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
210 colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
211 spu_add(ftexels[10], ftexels[14]));
212
213 /* alphas, for four pixels */
214 ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
215 ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
216 ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
217 ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
218 colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
219 spu_add(ftexels[11], ftexels[15]));
220 }
221
222
223
224 /**
225 * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
226 */
227 static INLINE void
228 transpose(vector unsigned int *mOut0,
229 vector unsigned int *mOut1,
230 vector unsigned int *mOut2,
231 vector unsigned int *mOut3,
232 vector unsigned int *mIn)
233 {
234 vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */
235 vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */
236 vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */
237
238 vector unsigned char shufflehi = ((vector unsigned char) {
239 0x00, 0x01, 0x02, 0x03,
240 0x10, 0x11, 0x12, 0x13,
241 0x04, 0x05, 0x06, 0x07,
242 0x14, 0x15, 0x16, 0x17});
243 vector unsigned char shufflelo = ((vector unsigned char) {
244 0x08, 0x09, 0x0A, 0x0B,
245 0x18, 0x19, 0x1A, 0x1B,
246 0x0C, 0x0D, 0x0E, 0x0F,
247 0x1C, 0x1D, 0x1E, 0x1F});
248 abcd = *(mIn+0);
249 efgh = *(mIn+1);
250 ijkl = *(mIn+2);
251 mnop = *(mIn+3);
252
253 aibj = spu_shuffle(abcd, ijkl, shufflehi);
254 ckdl = spu_shuffle(abcd, ijkl, shufflelo);
255 emfn = spu_shuffle(efgh, mnop, shufflehi);
256 gohp = spu_shuffle(efgh, mnop, shufflelo);
257
258 aeim = spu_shuffle(aibj, emfn, shufflehi);
259 bfjn = spu_shuffle(aibj, emfn, shufflelo);
260 cgko = spu_shuffle(ckdl, gohp, shufflehi);
261 dhlp = spu_shuffle(ckdl, gohp, shufflelo);
262
263 *mOut0 = aeim;
264 *mOut1 = bfjn;
265 *mOut2 = cgko;
266 *mOut3 = dhlp;
267 }
268
269
270 /**
271 * Bilinear filtering, using int intead of float arithmetic
272 */
273 void
274 sample_texture4_bilinear_2(vector float s, vector float t,
275 vector float r, vector float q,
276 uint unit, uint level, vector float colors[4])
277 {
278 static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
279 /* Scale texcoords by size of texture, and add half pixel bias */
280 vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
281 vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
282
283 /* convert float coords to fixed-pt coords with 8 fraction bits */
284 vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
285 vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
286
287 /* compute integer texel weights in [0, 255] */
288 vector signed int sWeights0 = spu_and((vector signed int) is, 255);
289 vector signed int tWeights0 = spu_and((vector signed int) it, 255);
290 vector signed int sWeights1 = spu_sub(255, sWeights0);
291 vector signed int tWeights1 = spu_sub(255, tWeights0);
292
293 /* texel coords: is0 = is / 256, it0 = is / 256 */
294 vector unsigned int is0 = spu_rlmask(is, -8);
295 vector unsigned int it0 = spu_rlmask(it, -8);
296
297 /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
298 vector unsigned int is1 = spu_add(is0, 1);
299 vector unsigned int it1 = spu_add(it0, 1);
300
301 /* PIPE_TEX_WRAP_REPEAT */
302 is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
303 it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
304 is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
305 it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
306
307 /* get packed int texels */
308 vector unsigned int texels[16];
309 get_four_texels(unit, level, is0, it0, texels + 0); /* upper-left */
310 get_four_texels(unit, level, is1, it0, texels + 4); /* upper-right */
311 get_four_texels(unit, level, is0, it1, texels + 8); /* lower-left */
312 get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
313
314 /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
315 {
316 static const unsigned char ZERO = 0x80;
317 int i;
318 for (i = 0; i < 16; i++) {
319 texels[i] = spu_shuffle(texels[i], texels[i],
320 ((vector unsigned char) {
321 ZERO, ZERO, ZERO, 1,
322 ZERO, ZERO, ZERO, 2,
323 ZERO, ZERO, ZERO, 3,
324 ZERO, ZERO, ZERO, 0}));
325 }
326 }
327
328 /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
329 vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
330 texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
331 transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
332 transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
333 transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
334 transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
335
336 /* computed weighted colors */
337 vector unsigned int c0, c1, c2, c3, cSum;
338
339 /* red */
340 c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
341 c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
342 c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
343 c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
344 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
345 colors[0] = spu_convtf(cSum, 24);
346
347 /* green */
348 c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
349 c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
350 c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
351 c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
352 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
353 colors[1] = spu_convtf(cSum, 24);
354
355 /* blue */
356 c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
357 c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
358 c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
359 c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
360 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
361 colors[2] = spu_convtf(cSum, 24);
362
363 /* alpha */
364 c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
365 c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
366 c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
367 c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
368 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
369 colors[3] = spu_convtf(cSum, 24);
370 }
371
372
373
374 /**
375 * Compute level of detail factor from texcoords.
376 */
377 static float
378 compute_lambda(uint unit, vector float s, vector float t)
379 {
380 uint baseLevel = 0;
381 float width = spu.texture[unit].level[baseLevel].width;
382 float height = spu.texture[unit].level[baseLevel].width;
383 float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
384 float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
385 float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
386 float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
387 float x = dsdx * dsdx + dtdx * dtdx;
388 float y = dsdy * dsdy + dtdy * dtdy;
389 float rho = x > y ? x : y;
390 rho = sqrtf(rho);
391 float lambda = logf(rho) * 1.442695f;
392 return lambda;
393 }
394
395
396
397 /**
398 * Texture sampling with level of detail selection.
399 */
400 void
401 sample_texture4_lod(vector float s, vector float t,
402 vector float r, vector float q,
403 uint unit, uint level_ignored, vector float colors[4])
404 {
405 /*
406 * Note that we're computing a lambda/lod here that's used for all
407 * four pixels in the quad.
408 */
409 float lambda = compute_lambda(unit, s, t);
410
411 /* apply lod bias */
412 lambda += spu.sampler[unit].lod_bias;
413
414 /* clamp */
415 if (lambda < spu.sampler[unit].min_lod)
416 lambda = spu.sampler[unit].min_lod;
417 else if (lambda > spu.sampler[unit].max_lod)
418 lambda = spu.sampler[unit].max_lod;
419
420 if (lambda <= 0.0f) {
421 /* magnify */
422 spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
423 }
424 else {
425 /* minify */
426 int level = (int) (lambda + 0.5f);
427 if (level > (int) spu.texture[unit].max_level)
428 level = spu.texture[unit].max_level;
429 spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
430 /* XXX to do: mipmap level interpolation */
431 }
432 }
433