i965: Make brw_vs_outputs_written static.
[mesa.git] / src / mesa / drivers / dri / i965 / intel_tiled_memcpy.c
1 /*
2 * Mesa 3-D graphics library
3 *
4 * Copyright 2012 Intel Corporation
5 * Copyright 2013 Google
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Authors:
28 * Chad Versace <chad.versace@linux.intel.com>
29 * Frank Henigman <fjhenigman@google.com>
30 */
31
32 #include <string.h>
33
34 #include "util/macros.h"
35
36 #include "brw_context.h"
37 #include "intel_tiled_memcpy.h"
38
39 #if defined(__SSSE3__)
40 #include <tmmintrin.h>
41 #elif defined(__SSE2__)
42 #include <emmintrin.h>
43 #endif
44
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Local spellings for rounding a value down/up to a multiple of an
 * alignment; both simply forward to the project's rounding macros.
 */
#define ALIGN_DOWN(value, alignment) ROUND_DOWN_TO(value, alignment)
#define ALIGN_UP(value, alignment) ALIGN(value, alignment)

/* Tile geometry.
 *
 * Widths and spans are byte counts; heights are row counts (pixels, i.e.
 * unitless).  A "span" is the largest number of bytes that can be copied
 * from linear to tiled memory before a new destination address has to be
 * computed.
 */

/* X tiling */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;

/* Y tiling */
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
60
/**
 * Rotate the 32-bit value \p n right by \p d bit positions.
 *
 * The rotate count is reduced modulo 32.  For 1 <= d <= 31 the result is
 * the usual rotate; for d == 0 (or any multiple of 32) the value is returned
 * unchanged.  Masking both shift counts keeps every shift strictly below
 * the operand width, which a plain "n << (32 - d)" does not when d == 0.
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   d &= 31;
   /* -d is computed in unsigned arithmetic, so (-d & 31) == (32 - d) % 32. */
   return (n >> d) | (n << (-d & 31));
}
66
/**
 * Reverse the byte order of a 32-bit value.
 *
 * When HAVE___BUILTIN_BSWAP32 is defined the compiler builtin is used;
 * otherwise the four bytes are moved with shifts and masks.
 */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   const uint32_t top_to_bottom = n >> 24;
   const uint32_t upper_mid_down = (n >> 8) & 0x0000ff00;
   const uint32_t lower_mid_up = (n << 8) & 0x00ff0000;
   const uint32_t bottom_to_top = n << 24;

   return top_to_bottom | upper_mid_down | lower_mid_up | bottom_to_top;
#endif
}
79
/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Copies \p bytes bytes from \p src to \p dst one 4-byte pixel at a time,
 * applying ror(bswap32(pixel), 8) to each pixel.  \p bytes must be a
 * multiple of 4 (asserted).
 *
 * Neither pointer needs any particular alignment: each pixel is moved in
 * and out of a local uint32_t with memcpy rather than by dereferencing a
 * casted uint32_t pointer, so unaligned buffers are handled without
 * misaligned or type-punned accesses.
 *
 * \return \p dst
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   unsigned char *d = dst;
   const unsigned char *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      uint32_t pixel;

      memcpy(&pixel, s, sizeof(pixel));
      pixel = ror(bswap32(pixel), 8);
      memcpy(d, &pixel, sizeof(pixel));

      d += 4;
      s += 4;
      bytes -= 4;
   }
   return dst;
}
99
#ifdef __SSSE3__
/* Byte-shuffle control for _mm_shuffle_epi8: within each 4-byte pixel,
 * bytes 0 and 2 trade places while bytes 1 and 3 stay put.
 */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/**
 * Swap R and B in 16 bytes (four pixels); \p dst must be 16-byte aligned,
 * \p src may have any alignment.
 *
 * The shuffle control is fetched with an unaligned load because a uint8_t
 * array carries no 16-byte alignment guarantee.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   const __m128i control =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);
   const __m128i pixels = _mm_loadu_si128((const __m128i *)src);

   _mm_store_si128((__m128i *)dst, _mm_shuffle_epi8(pixels, control));
}

/**
 * Swap R and B in 16 bytes (four pixels); \p src must be 16-byte aligned,
 * \p dst may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   const __m128i control =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);
   const __m128i pixels = _mm_load_si128((const __m128i *)src);

   _mm_storeu_si128((__m128i *)dst, _mm_shuffle_epi8(pixels, control));
}

#elif defined(__SSE2__)
/* Swap bytes 0 and 2 of each 32-bit lane using SSE2 only: the two bytes to
 * be exchanged are isolated (one per 16-bit word), adjacent 16-bit words are
 * swapped, and the untouched bytes 1 and 3 are merged back in.
 */
static inline __m128i
rgba8_swap_rb_sse2(__m128i srcreg)
{
   const __m128i agmask = _mm_set1_epi32(0xFF00FF00);
   const __m128i rb = _mm_andnot_si128(agmask, srcreg);
   const __m128i ag = _mm_and_si128(agmask, srcreg);
   const __m128i br =
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                          _MM_SHUFFLE(2, 3, 0, 1));

   return _mm_or_si128(ag, br);
}

/**
 * Swap R and B in 16 bytes (four pixels); \p dst must be 16-byte aligned,
 * \p src may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   const __m128i srcreg = _mm_loadu_si128((const __m128i *)src);

   _mm_store_si128((__m128i *)dst, rgba8_swap_rb_sse2(srcreg));
}

/**
 * Swap R and B in 16 bytes (four pixels); \p src must be 16-byte aligned,
 * \p dst may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   const __m128i srcreg = _mm_load_si128((const __m128i *)src);

   _mm_storeu_si128((__m128i *)dst, rgba8_swap_rb_sse2(srcreg));
}
#endif
155
/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * When SSSE3 or SSE2 is available the bulk of the copy is done 16 bytes at
 * a time (with a fully unrolled path for exactly 64 bytes); whatever is left
 * over is handed to rgba8_copy().  Without those instruction sets the whole
 * copy goes through rgba8_copy().
 *
 * \p dst must be 16-byte aligned unless \p bytes is 0 (asserted).
 *
 * \return \p dst as passed in, on every path.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   /* Byte pointers: arithmetic on void * is a compiler extension. */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(d +  0, s +  0);
      rgba8_copy_16_aligned_dst(d + 16, s + 16);
      rgba8_copy_16_aligned_dst(d + 32, s + 32);
      rgba8_copy_16_aligned_dst(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail shorter than 16 bytes (or everything, without SSE). */
   rgba8_copy(d, s, bytes);

   return dst;
}
185
/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * When SSSE3 or SSE2 is available the bulk of the copy is done 16 bytes at
 * a time (with a fully unrolled path for exactly 64 bytes); whatever is left
 * over is handed to rgba8_copy().  Without those instruction sets the whole
 * copy goes through rgba8_copy().
 *
 * \p src must be 16-byte aligned unless \p bytes is 0 (asserted).
 *
 * \return \p dst as passed in, on every path.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   /* Byte pointers: arithmetic on void * is a compiler extension. */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(d +  0, s +  0);
      rgba8_copy_16_aligned_src(d + 16, s + 16);
      rgba8_copy_16_aligned_src(d + 32, s + 32);
      rgba8_copy_16_aligned_src(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail shorter than 16 bytes (or everything, without SSE). */
   rgba8_copy(d, s, bytes);

   return dst;
}
215
216 /**
217 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
218 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
219 * The first and last ranges must be shorter than a "span" (the longest linear
220 * stretch within a tile) and the middle must equal a whole number of spans.
221 * Ranges may be empty. The region copied must land entirely within one tile.
222 * 'dst' is the start of the tile and 'src' is the corresponding
223 * address to copy from, though copying begins at (x0, y0).
224 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
225 * Swizzling flips bit 6 in the copy destination offset, when certain other
226 * bits are set in it.
227 */
228 typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
229 uint32_t y0, uint32_t y1,
230 char *dst, const char *src,
231 int32_t linear_pitch,
232 uint32_t swizzle_bit,
233 mem_copy_fn mem_copy);
234
235 /**
236 * Copy texture data from linear to X tile layout.
237 *
238 * \copydoc tile_copy_fn
239 *
240 * The mem_copy parameters allow the user to specify an alternative mem_copy
241 * function that, for instance, may do RGBA -> BGRA swizzling. The first
242 * function must handle any memory alignment while the second function must
243 * only handle 16-byte alignment in whichever side (source or destination) is
244 * tiled.
245 */
246 static inline void
247 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
248 uint32_t y0, uint32_t y1,
249 char *dst, const char *src,
250 int32_t src_pitch,
251 uint32_t swizzle_bit,
252 mem_copy_fn mem_copy,
253 mem_copy_fn mem_copy_align16)
254 {
255 /* The copy destination offset for each range copied is the sum of
256 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
257 */
258 uint32_t xo, yo;
259
260 src += (ptrdiff_t)y0 * src_pitch;
261
262 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
263 /* Bits 9 and 10 of the copy destination offset control swizzling.
264 * Only 'yo' contributes to those bits in the total offset,
265 * so calculate 'swizzle' just once per row.
266 * Move bits 9 and 10 three and four places respectively down
267 * to bit 6 and xor them.
268 */
269 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
270
271 mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
272
273 for (xo = x1; xo < x2; xo += xtile_span) {
274 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
275 }
276
277 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
278
279 src += src_pitch;
280 }
281 }
282
283 /**
284 * Copy texture data from linear to Y tile layout.
285 *
286 * \copydoc tile_copy_fn
287 */
288 static inline void
289 linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
290 uint32_t y0, uint32_t y3,
291 char *dst, const char *src,
292 int32_t src_pitch,
293 uint32_t swizzle_bit,
294 mem_copy_fn mem_copy,
295 mem_copy_fn mem_copy_align16)
296 {
297 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
298 * as the tile). Thus the destination offset for (x,y) is the sum of:
299 * (x % column_width) // position within column
300 * (x / column_width) * bytes_per_column // column number * bytes per column
301 * y * column_width
302 *
303 * The copy destination offset for each range copied is the sum of
304 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
305 */
306 const uint32_t column_width = ytile_span;
307 const uint32_t bytes_per_column = column_width * ytile_height;
308
309 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
310 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
311
312 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
313 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
314
315 /* Bit 9 of the destination offset control swizzling.
316 * Only the X offset contributes to bit 9 of the total offset,
317 * so swizzle can be calculated in advance for these X positions.
318 * Move bit 9 three places down to bit 6.
319 */
320 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
321 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
322
323 uint32_t x, yo;
324
325 src += (ptrdiff_t)y0 * src_pitch;
326
327 if (y0 != y1) {
328 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
329 uint32_t xo = xo1;
330 uint32_t swizzle = swizzle1;
331
332 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
333
334 /* Step by spans/columns. As it happens, the swizzle bit flips
335 * at each step so we don't need to calculate it explicitly.
336 */
337 for (x = x1; x < x2; x += ytile_span) {
338 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
339 xo += bytes_per_column;
340 swizzle ^= swizzle_bit;
341 }
342
343 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
344
345 src += src_pitch;
346 }
347 }
348
349 for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
350 uint32_t xo = xo1;
351 uint32_t swizzle = swizzle1;
352
353 if (x0 != x1) {
354 mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
355 mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
356 mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
357 mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
358 }
359
360 /* Step by spans/columns. As it happens, the swizzle bit flips
361 * at each step so we don't need to calculate it explicitly.
362 */
363 for (x = x1; x < x2; x += ytile_span) {
364 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
365 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
366 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
367 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
368 xo += bytes_per_column;
369 swizzle ^= swizzle_bit;
370 }
371
372 if (x2 != x3) {
373 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
374 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
375 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
376 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
377 }
378
379 src += 4 * src_pitch;
380 }
381
382 if (y2 != y3) {
383 for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
384 uint32_t xo = xo1;
385 uint32_t swizzle = swizzle1;
386
387 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
388
389 /* Step by spans/columns. As it happens, the swizzle bit flips
390 * at each step so we don't need to calculate it explicitly.
391 */
392 for (x = x1; x < x2; x += ytile_span) {
393 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
394 xo += bytes_per_column;
395 swizzle ^= swizzle_bit;
396 }
397
398 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
399
400 src += src_pitch;
401 }
402 }
403 }
404
405 /**
406 * Copy texture data from X tile layout to linear.
407 *
408 * \copydoc tile_copy_fn
409 */
410 static inline void
411 xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
412 uint32_t y0, uint32_t y1,
413 char *dst, const char *src,
414 int32_t dst_pitch,
415 uint32_t swizzle_bit,
416 mem_copy_fn mem_copy,
417 mem_copy_fn mem_copy_align16)
418 {
419 /* The copy destination offset for each range copied is the sum of
420 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
421 */
422 uint32_t xo, yo;
423
424 dst += (ptrdiff_t)y0 * dst_pitch;
425
426 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
427 /* Bits 9 and 10 of the copy destination offset control swizzling.
428 * Only 'yo' contributes to those bits in the total offset,
429 * so calculate 'swizzle' just once per row.
430 * Move bits 9 and 10 three and four places respectively down
431 * to bit 6 and xor them.
432 */
433 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
434
435 mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
436
437 for (xo = x1; xo < x2; xo += xtile_span) {
438 mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
439 }
440
441 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
442
443 dst += dst_pitch;
444 }
445 }
446
447 /**
448 * Copy texture data from Y tile layout to linear.
449 *
450 * \copydoc tile_copy_fn
451 */
452 static inline void
453 ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
454 uint32_t y0, uint32_t y3,
455 char *dst, const char *src,
456 int32_t dst_pitch,
457 uint32_t swizzle_bit,
458 mem_copy_fn mem_copy,
459 mem_copy_fn mem_copy_align16)
460 {
461 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
462 * as the tile). Thus the destination offset for (x,y) is the sum of:
463 * (x % column_width) // position within column
464 * (x / column_width) * bytes_per_column // column number * bytes per column
465 * y * column_width
466 *
467 * The copy destination offset for each range copied is the sum of
468 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
469 */
470 const uint32_t column_width = ytile_span;
471 const uint32_t bytes_per_column = column_width * ytile_height;
472
473 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
474 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
475
476 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
477 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
478
479 /* Bit 9 of the destination offset control swizzling.
480 * Only the X offset contributes to bit 9 of the total offset,
481 * so swizzle can be calculated in advance for these X positions.
482 * Move bit 9 three places down to bit 6.
483 */
484 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
485 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
486
487 uint32_t x, yo;
488
489 dst += (ptrdiff_t)y0 * dst_pitch;
490
491 if (y0 != y1) {
492 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
493 uint32_t xo = xo1;
494 uint32_t swizzle = swizzle1;
495
496 mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
497
498 /* Step by spans/columns. As it happens, the swizzle bit flips
499 * at each step so we don't need to calculate it explicitly.
500 */
501 for (x = x1; x < x2; x += ytile_span) {
502 mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
503 xo += bytes_per_column;
504 swizzle ^= swizzle_bit;
505 }
506
507 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
508
509 dst += dst_pitch;
510 }
511 }
512
513 for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
514 uint32_t xo = xo1;
515 uint32_t swizzle = swizzle1;
516
517 if (x0 != x1) {
518 mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
519 mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
520 mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
521 mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
522 }
523
524 /* Step by spans/columns. As it happens, the swizzle bit flips
525 * at each step so we don't need to calculate it explicitly.
526 */
527 for (x = x1; x < x2; x += ytile_span) {
528 mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
529 mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
530 mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
531 mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
532 xo += bytes_per_column;
533 swizzle ^= swizzle_bit;
534 }
535
536 if (x2 != x3) {
537 mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
538 mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
539 mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
540 mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
541 }
542
543 dst += 4 * dst_pitch;
544 }
545
546 if (y2 != y3) {
547 for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
548 uint32_t xo = xo1;
549 uint32_t swizzle = swizzle1;
550
551 mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
552
553 /* Step by spans/columns. As it happens, the swizzle bit flips
554 * at each step so we don't need to calculate it explicitly.
555 */
556 for (x = x1; x < x2; x += ytile_span) {
557 mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
558 xo += bytes_per_column;
559 swizzle ^= swizzle_bit;
560 }
561
562 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
563
564 dst += dst_pitch;
565 }
566 }
567 }
568
569
570 /**
571 * Copy texture data from linear to X tile layout, faster.
572 *
573 * Same as \ref linear_to_xtiled but faster, because it passes constant
574 * parameters for common cases, allowing the compiler to inline code
575 * optimized for those cases.
576 *
577 * \copydoc tile_copy_fn
578 */
579 static FLATTEN void
580 linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
581 uint32_t y0, uint32_t y1,
582 char *dst, const char *src,
583 int32_t src_pitch,
584 uint32_t swizzle_bit,
585 mem_copy_fn mem_copy)
586 {
587 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
588 if (mem_copy == memcpy)
589 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
590 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
591 else if (mem_copy == rgba8_copy)
592 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
593 dst, src, src_pitch, swizzle_bit,
594 rgba8_copy, rgba8_copy_aligned_dst);
595 else
596 unreachable("not reached");
597 } else {
598 if (mem_copy == memcpy)
599 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
600 dst, src, src_pitch, swizzle_bit,
601 memcpy, memcpy);
602 else if (mem_copy == rgba8_copy)
603 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
604 dst, src, src_pitch, swizzle_bit,
605 rgba8_copy, rgba8_copy_aligned_dst);
606 else
607 unreachable("not reached");
608 }
609 linear_to_xtiled(x0, x1, x2, x3, y0, y1,
610 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
611 }
612
613 /**
614 * Copy texture data from linear to Y tile layout, faster.
615 *
616 * Same as \ref linear_to_ytiled but faster, because it passes constant
617 * parameters for common cases, allowing the compiler to inline code
618 * optimized for those cases.
619 *
620 * \copydoc tile_copy_fn
621 */
622 static FLATTEN void
623 linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
624 uint32_t y0, uint32_t y1,
625 char *dst, const char *src,
626 int32_t src_pitch,
627 uint32_t swizzle_bit,
628 mem_copy_fn mem_copy)
629 {
630 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
631 if (mem_copy == memcpy)
632 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
633 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
634 else if (mem_copy == rgba8_copy)
635 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
636 dst, src, src_pitch, swizzle_bit,
637 rgba8_copy, rgba8_copy_aligned_dst);
638 else
639 unreachable("not reached");
640 } else {
641 if (mem_copy == memcpy)
642 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
643 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
644 else if (mem_copy == rgba8_copy)
645 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
646 dst, src, src_pitch, swizzle_bit,
647 rgba8_copy, rgba8_copy_aligned_dst);
648 else
649 unreachable("not reached");
650 }
651 linear_to_ytiled(x0, x1, x2, x3, y0, y1,
652 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
653 }
654
655 /**
656 * Copy texture data from X tile layout to linear, faster.
657 *
658 * Same as \ref xtile_to_linear but faster, because it passes constant
659 * parameters for common cases, allowing the compiler to inline code
660 * optimized for those cases.
661 *
662 * \copydoc tile_copy_fn
663 */
664 static FLATTEN void
665 xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
666 uint32_t y0, uint32_t y1,
667 char *dst, const char *src,
668 int32_t dst_pitch,
669 uint32_t swizzle_bit,
670 mem_copy_fn mem_copy)
671 {
672 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
673 if (mem_copy == memcpy)
674 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
675 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
676 else if (mem_copy == rgba8_copy)
677 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
678 dst, src, dst_pitch, swizzle_bit,
679 rgba8_copy, rgba8_copy_aligned_src);
680 else
681 unreachable("not reached");
682 } else {
683 if (mem_copy == memcpy)
684 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
685 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
686 else if (mem_copy == rgba8_copy)
687 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
688 dst, src, dst_pitch, swizzle_bit,
689 rgba8_copy, rgba8_copy_aligned_src);
690 else
691 unreachable("not reached");
692 }
693 xtiled_to_linear(x0, x1, x2, x3, y0, y1,
694 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
695 }
696
697 /**
698 * Copy texture data from Y tile layout to linear, faster.
699 *
700 * Same as \ref ytile_to_linear but faster, because it passes constant
701 * parameters for common cases, allowing the compiler to inline code
702 * optimized for those cases.
703 *
704 * \copydoc tile_copy_fn
705 */
706 static FLATTEN void
707 ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
708 uint32_t y0, uint32_t y1,
709 char *dst, const char *src,
710 int32_t dst_pitch,
711 uint32_t swizzle_bit,
712 mem_copy_fn mem_copy)
713 {
714 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
715 if (mem_copy == memcpy)
716 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
717 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
718 else if (mem_copy == rgba8_copy)
719 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
720 dst, src, dst_pitch, swizzle_bit,
721 rgba8_copy, rgba8_copy_aligned_src);
722 else
723 unreachable("not reached");
724 } else {
725 if (mem_copy == memcpy)
726 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
727 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
728 else if (mem_copy == rgba8_copy)
729 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
730 dst, src, dst_pitch, swizzle_bit,
731 rgba8_copy, rgba8_copy_aligned_src);
732 else
733 unreachable("not reached");
734 }
735 ytiled_to_linear(x0, x1, x2, x3, y0, y1,
736 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
737 }
738
739 /**
740 * Copy from linear to tiled texture.
741 *
742 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
743 * pieces that do not cross tile boundaries and copy each piece with a tile
744 * copy function (\ref tile_copy_fn).
745 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
746 * The Y range is in pixels (i.e. unitless).
747 * 'dst' is the address of (0, 0) in the destination tiled texture.
748 * 'src' is the address of (xt1, yt1) in the source linear texture.
749 */
750 void
751 linear_to_tiled(uint32_t xt1, uint32_t xt2,
752 uint32_t yt1, uint32_t yt2,
753 char *dst, const char *src,
754 uint32_t dst_pitch, int32_t src_pitch,
755 bool has_swizzling,
756 enum isl_tiling tiling,
757 mem_copy_fn mem_copy)
758 {
759 tile_copy_fn tile_copy;
760 uint32_t xt0, xt3;
761 uint32_t yt0, yt3;
762 uint32_t xt, yt;
763 uint32_t tw, th, span;
764 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
765
766 if (tiling == ISL_TILING_X) {
767 tw = xtile_width;
768 th = xtile_height;
769 span = xtile_span;
770 tile_copy = linear_to_xtiled_faster;
771 } else if (tiling == ISL_TILING_Y0) {
772 tw = ytile_width;
773 th = ytile_height;
774 span = ytile_span;
775 tile_copy = linear_to_ytiled_faster;
776 } else {
777 unreachable("unsupported tiling");
778 }
779
780 /* Round out to tile boundaries. */
781 xt0 = ALIGN_DOWN(xt1, tw);
782 xt3 = ALIGN_UP (xt2, tw);
783 yt0 = ALIGN_DOWN(yt1, th);
784 yt3 = ALIGN_UP (yt2, th);
785
786 /* Loop over all tiles to which we have something to copy.
787 * 'xt' and 'yt' are the origin of the destination tile, whether copying
788 * copying a full or partial tile.
789 * tile_copy() copies one tile or partial tile.
790 * Looping x inside y is the faster memory access pattern.
791 */
792 for (yt = yt0; yt < yt3; yt += th) {
793 for (xt = xt0; xt < xt3; xt += tw) {
794 /* The area to update is [x0,x3) x [y0,y1).
795 * May not want the whole tile, hence the min and max.
796 */
797 uint32_t x0 = MAX2(xt1, xt);
798 uint32_t y0 = MAX2(yt1, yt);
799 uint32_t x3 = MIN2(xt2, xt + tw);
800 uint32_t y1 = MIN2(yt2, yt + th);
801
802 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
803 * the middle interval is the longest span-aligned part.
804 * The sub-ranges could be empty.
805 */
806 uint32_t x1, x2;
807 x1 = ALIGN_UP(x0, span);
808 if (x1 > x3)
809 x1 = x2 = x3;
810 else
811 x2 = ALIGN_DOWN(x3, span);
812
813 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
814 assert(x1 - x0 < span && x3 - x2 < span);
815 assert(x3 - x0 <= tw);
816 assert((x2 - x1) % span == 0);
817
818 /* Translate by (xt,yt) for single-tile copier. */
819 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
820 y0-yt, y1-yt,
821 dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
822 src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
823 src_pitch,
824 swizzle_bit,
825 mem_copy);
826 }
827 }
828 }
829
830 /**
831 * Copy from tiled to linear texture.
832 *
833 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
834 * pieces that do not cross tile boundaries and copy each piece with a tile
835 * copy function (\ref tile_copy_fn).
836 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
837 * The Y range is in pixels (i.e. unitless).
838 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
839 * 'src' is the address of (0, 0) in the source tiled texture.
840 */
841 void
842 tiled_to_linear(uint32_t xt1, uint32_t xt2,
843 uint32_t yt1, uint32_t yt2,
844 char *dst, const char *src,
845 int32_t dst_pitch, uint32_t src_pitch,
846 bool has_swizzling,
847 enum isl_tiling tiling,
848 mem_copy_fn mem_copy)
849 {
850 tile_copy_fn tile_copy;
851 uint32_t xt0, xt3;
852 uint32_t yt0, yt3;
853 uint32_t xt, yt;
854 uint32_t tw, th, span;
855 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
856
857 if (tiling == ISL_TILING_X) {
858 tw = xtile_width;
859 th = xtile_height;
860 span = xtile_span;
861 tile_copy = xtiled_to_linear_faster;
862 } else if (tiling == ISL_TILING_Y0) {
863 tw = ytile_width;
864 th = ytile_height;
865 span = ytile_span;
866 tile_copy = ytiled_to_linear_faster;
867 } else {
868 unreachable("unsupported tiling");
869 }
870
871 /* Round out to tile boundaries. */
872 xt0 = ALIGN_DOWN(xt1, tw);
873 xt3 = ALIGN_UP (xt2, tw);
874 yt0 = ALIGN_DOWN(yt1, th);
875 yt3 = ALIGN_UP (yt2, th);
876
877 /* Loop over all tiles to which we have something to copy.
878 * 'xt' and 'yt' are the origin of the destination tile, whether copying
879 * copying a full or partial tile.
880 * tile_copy() copies one tile or partial tile.
881 * Looping x inside y is the faster memory access pattern.
882 */
883 for (yt = yt0; yt < yt3; yt += th) {
884 for (xt = xt0; xt < xt3; xt += tw) {
885 /* The area to update is [x0,x3) x [y0,y1).
886 * May not want the whole tile, hence the min and max.
887 */
888 uint32_t x0 = MAX2(xt1, xt);
889 uint32_t y0 = MAX2(yt1, yt);
890 uint32_t x3 = MIN2(xt2, xt + tw);
891 uint32_t y1 = MIN2(yt2, yt + th);
892
893 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
894 * the middle interval is the longest span-aligned part.
895 * The sub-ranges could be empty.
896 */
897 uint32_t x1, x2;
898 x1 = ALIGN_UP(x0, span);
899 if (x1 > x3)
900 x1 = x2 = x3;
901 else
902 x2 = ALIGN_DOWN(x3, span);
903
904 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
905 assert(x1 - x0 < span && x3 - x2 < span);
906 assert(x3 - x0 <= tw);
907 assert((x2 - x1) % span == 0);
908
909 /* Translate by (xt,yt) for single-tile copier. */
910 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
911 y0-yt, y1-yt,
912 dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
913 src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
914 dst_pitch,
915 swizzle_bit,
916 mem_copy);
917 }
918 }
919 }
920
921
922 /**
923 * Determine which copy function to use for the given format combination
924 *
925 * The only two possible copy functions which are ever returned are a
926 * direct memcpy and a RGBA <-> BGRA copy function. Since RGBA -> BGRA and
927 * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
928 * symmetric), it doesn't matter whether the copy is from the tiled image
929 * to the untiled or vice versa. The copy function required is the same in
930 * either case so this function can be used.
931 *
932 * \param[in] tiledFormat The format of the tiled image
933 * \param[in] format The GL format of the client data
934 * \param[in] type The GL type of the client data
935 * \param[out] mem_copy Will be set to one of either the standard
936 * library's memcpy or a different copy function
937 * that performs an RGBA to BGRA conversion
938 * \param[out] cpp Number of bytes per channel
939 *
940 * \return true if the format and type combination are valid
941 */
942 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
943 GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp)
944 {
945 if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
946 !(format == GL_RGBA || format == GL_BGRA))
947 return false; /* Invalid type/format combination */
948
949 if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
950 (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
951 *cpp = 1;
952 *mem_copy = memcpy;
953 } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
954 (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
955 (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
956 (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
957 *cpp = 4;
958 if (format == GL_BGRA) {
959 *mem_copy = memcpy;
960 } else if (format == GL_RGBA) {
961 *mem_copy = rgba8_copy;
962 }
963 } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
964 (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
965 (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
966 (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
967 *cpp = 4;
968 if (format == GL_BGRA) {
969 /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
970 * use the same function.
971 */
972 *mem_copy = rgba8_copy;
973 } else if (format == GL_RGBA) {
974 *mem_copy = memcpy;
975 }
976 }
977
978 if (!(*mem_copy))
979 return false;
980
981 return true;
982 }