cell: use fewer memory references in sample_texture4_bilinear_2()
[mesa.git] / src / gallium / drivers / cell / spu / spu_texture.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include <transpose_matrix4x4.h>
30
31 #include "pipe/p_compiler.h"
32 #include "spu_main.h"
33 #include "spu_texture.h"
34 #include "spu_tile.h"
35 #include "spu_colorpack.h"
36 #include "spu_dcache.h"
37
38
39 /**
40 * Mark all tex cache entries as invalid.
41 */
42 void
43 invalidate_tex_cache(void)
44 {
45 uint unit = 0;
46 uint bytes = 4 * spu.texture[unit].width
47 * spu.texture[unit].height;
48
49 spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
50 }
51
52
53 /**
54 * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
55 *
56 * NOTE: in the typical case of bilinear filtering, the four texels
57 * are in a 2x2 group so we could get by with just two dcache fetches
58 * (two side-by-side texels per fetch). But when bilinear filtering
59 * wraps around a texture edge, we'll probably need code like we have
60 * now.
61 * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
62 * it's quite likely that the four pixels in a quad will need some of the
63 * same texels. So look into doing texture fetches for four pixels at
64 * a time.
65 */
66 static void
67 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
68 {
69 const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
70 vec_uint4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */
71 vec_uint4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */
72 const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
73 const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
74
75 const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
76 const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
77
78 qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
79 tile_offset = si_mpy((qword) tile_offset, tile_size);
80
81 qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
82 texel_offset = si_mpyui(texel_offset, 4);
83
84 vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
85
86 spu_dcache_fetch_unaligned((qword *) & texels[0],
87 texture_ea + spu_extract(offset, 0), 4);
88 spu_dcache_fetch_unaligned((qword *) & texels[1],
89 texture_ea + spu_extract(offset, 1), 4);
90 spu_dcache_fetch_unaligned((qword *) & texels[2],
91 texture_ea + spu_extract(offset, 2), 4);
92 spu_dcache_fetch_unaligned((qword *) & texels[3],
93 texture_ea + spu_extract(offset, 3), 4);
94 }
95
96
97
98 /**
99 * Do nearest texture sampling for four pixels.
100 * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
101 */
102 void
103 sample_texture4_nearest(vector float s, vector float t,
104 vector float r, vector float q,
105 uint unit, vector float colors[4])
106 {
107 vector float ss = spu_mul(s, spu.texture[unit].width4);
108 vector float tt = spu_mul(t, spu.texture[unit].height4);
109 vector unsigned int is = spu_convtu(ss, 0);
110 vector unsigned int it = spu_convtu(tt, 0);
111 vec_uint4 texels[4];
112
113 /* PIPE_TEX_WRAP_REPEAT */
114 is = spu_and(is, spu.texture[unit].tex_size_x_mask);
115 it = spu_and(it, spu.texture[unit].tex_size_y_mask);
116
117 get_four_texels(unit, is, it, texels);
118
119 /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
120 spu_unpack_A8R8G8B8_transpose4(texels, colors);
121 }
122
123
124 /**
125 * Do bilinear texture sampling for four pixels.
126 * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
127 */
128 void
129 sample_texture4_bilinear(vector float s, vector float t,
130 vector float r, vector float q,
131 uint unit, vector float colors[4])
132 {
133 vector float ss = spu_madd(s, spu.texture[unit].width4, spu_splats(-0.5f));
134 vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
135
136 vector unsigned int is0 = spu_convtu(ss, 0);
137 vector unsigned int it0 = spu_convtu(tt, 0);
138
139 /* is + 1, it + 1 */
140 vector unsigned int is1 = spu_add(is0, 1);
141 vector unsigned int it1 = spu_add(it0, 1);
142
143 /* PIPE_TEX_WRAP_REPEAT */
144 is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
145 it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
146 is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
147 it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
148
149 /* get packed int texels */
150 vector unsigned int texels[16];
151 get_four_texels(unit, is0, it0, texels + 0); /* upper-left */
152 get_four_texels(unit, is1, it0, texels + 4); /* upper-right */
153 get_four_texels(unit, is0, it1, texels + 8); /* lower-left */
154 get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
155
156 /* XXX possibly rework following code to compute the weighted sample
157 * colors with integer arithmetic for fewer int->float conversions.
158 */
159
160 /* convert packed int texels to float colors */
161 vector float ftexels[16];
162 spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
163 spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
164 spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
165 spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
166
167 /* Compute weighting factors in [0,1]
168 * Multiply texcoord by 1024, AND with 1023, convert back to float.
169 */
170 vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
171 vector signed int iss1024 = spu_convts(ss1024, 0);
172 iss1024 = spu_and(iss1024, 1023);
173 vector float sWeights0 = spu_convtf(iss1024, 10);
174
175 vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
176 vector signed int itt1024 = spu_convts(tt1024, 0);
177 itt1024 = spu_and(itt1024, 1023);
178 vector float tWeights0 = spu_convtf(itt1024, 10);
179
180 /* 1 - sWeight and 1 - tWeight */
181 vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
182 vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
183
184 /* reds, for four pixels */
185 ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
186 ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
187 ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
188 ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
189 colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
190 spu_add(ftexels[8], ftexels[12]));
191
192 /* greens, for four pixels */
193 ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
194 ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
195 ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
196 ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
197 colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
198 spu_add(ftexels[9], ftexels[13]));
199
200 /* blues, for four pixels */
201 ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
202 ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
203 ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
204 ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
205 colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
206 spu_add(ftexels[10], ftexels[14]));
207
208 /* alphas, for four pixels */
209 ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
210 ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
211 ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
212 ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
213 colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
214 spu_add(ftexels[11], ftexels[15]));
215 }
216
217
218
219 /**
220 * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
221 */
222 static INLINE void
223 transpose(vector unsigned int *mOut0,
224 vector unsigned int *mOut1,
225 vector unsigned int *mOut2,
226 vector unsigned int *mOut3,
227 vector unsigned int *mIn)
228 {
229 vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */
230 vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */
231 vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */
232
233 vector unsigned char shufflehi = ((vector unsigned char) {
234 0x00, 0x01, 0x02, 0x03,
235 0x10, 0x11, 0x12, 0x13,
236 0x04, 0x05, 0x06, 0x07,
237 0x14, 0x15, 0x16, 0x17});
238 vector unsigned char shufflelo = ((vector unsigned char) {
239 0x08, 0x09, 0x0A, 0x0B,
240 0x18, 0x19, 0x1A, 0x1B,
241 0x0C, 0x0D, 0x0E, 0x0F,
242 0x1C, 0x1D, 0x1E, 0x1F});
243 abcd = *(mIn+0);
244 efgh = *(mIn+1);
245 ijkl = *(mIn+2);
246 mnop = *(mIn+3);
247
248 aibj = spu_shuffle(abcd, ijkl, shufflehi);
249 ckdl = spu_shuffle(abcd, ijkl, shufflelo);
250 emfn = spu_shuffle(efgh, mnop, shufflehi);
251 gohp = spu_shuffle(efgh, mnop, shufflelo);
252
253 aeim = spu_shuffle(aibj, emfn, shufflehi);
254 bfjn = spu_shuffle(aibj, emfn, shufflelo);
255 cgko = spu_shuffle(ckdl, gohp, shufflehi);
256 dhlp = spu_shuffle(ckdl, gohp, shufflelo);
257
258 *mOut0 = aeim;
259 *mOut1 = bfjn;
260 *mOut2 = cgko;
261 *mOut3 = dhlp;
262 }
263
264
265 /**
266 * Bilinear filtering, using int intead of float arithmetic
267 */
268 void
269 sample_texture4_bilinear_2(vector float s, vector float t,
270 vector float r, vector float q,
271 uint unit, vector float colors[4])
272 {
273 static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
274 /* Scale texcoords by size of texture, and add half pixel bias */
275 vector float ss = spu_madd(s, spu.texture[unit].width4, half);
276 vector float tt = spu_madd(t, spu.texture[unit].height4, half);
277
278 /* convert float coords to fixed-pt coords with 8 fraction bits */
279 vector unsigned int is = spu_convtu(ss, 8);
280 vector unsigned int it = spu_convtu(tt, 8);
281
282 /* compute integer texel weights in [0, 255] */
283 vector signed int sWeights0 = spu_and((vector signed int) is, 255);
284 vector signed int tWeights0 = spu_and((vector signed int) it, 255);
285 vector signed int sWeights1 = spu_sub(255, sWeights0);
286 vector signed int tWeights1 = spu_sub(255, tWeights0);
287
288 /* texel coords: is0 = is / 256, it0 = is / 256 */
289 vector unsigned int is0 = spu_rlmask(is, -8);
290 vector unsigned int it0 = spu_rlmask(it, -8);
291
292 /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
293 vector unsigned int is1 = spu_add(is0, 1);
294 vector unsigned int it1 = spu_add(it0, 1);
295
296 /* PIPE_TEX_WRAP_REPEAT */
297 is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
298 it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
299 is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
300 it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
301
302 /* get packed int texels */
303 vector unsigned int texels[16];
304 get_four_texels(unit, is0, it0, texels + 0); /* upper-left */
305 get_four_texels(unit, is1, it0, texels + 4); /* upper-right */
306 get_four_texels(unit, is0, it1, texels + 8); /* lower-left */
307 get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
308
309 /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
310 {
311 static const unsigned char ZERO = 0x80;
312 int i;
313 for (i = 0; i < 16; i++) {
314 texels[i] = spu_shuffle(texels[i], texels[i],
315 ((vector unsigned char) {
316 ZERO, ZERO, ZERO, 1,
317 ZERO, ZERO, ZERO, 2,
318 ZERO, ZERO, ZERO, 3,
319 ZERO, ZERO, ZERO, 0}));
320 }
321 }
322
323 /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
324 vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
325 texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
326 transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
327 transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
328 transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
329 transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
330
331 /* computed weighted colors */
332 vector unsigned int c0, c1, c2, c3, cSum;
333
334 /* red */
335 c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
336 c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
337 c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
338 c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
339 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
340 colors[0] = spu_convtf(cSum, 24);
341
342 /* green */
343 c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
344 c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
345 c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
346 c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
347 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
348 colors[1] = spu_convtf(cSum, 24);
349
350 /* blue */
351 c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
352 c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
353 c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
354 c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
355 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
356 colors[2] = spu_convtf(cSum, 24);
357
358 /* alpha */
359 c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
360 c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
361 c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
362 c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
363 cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
364 colors[3] = spu_convtf(cSum, 24);
365 }