i965/icl: Update the assert in brw_memory_barrier()
[mesa.git] / src / mesa / drivers / dri / i965 / intel_tiled_memcpy.c
1 /*
2 * Mesa 3-D graphics library
3 *
4 * Copyright 2012 Intel Corporation
5 * Copyright 2013 Google
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Authors:
28 * Chad Versace <chad.versace@linux.intel.com>
29 * Frank Henigman <fjhenigman@google.com>
30 */
31
32 #include <string.h>
33
34 #include "util/macros.h"
35
36 #include "brw_context.h"
37 #include "intel_tiled_memcpy.h"
38
39 #if defined(__SSSE3__)
40 #include <tmmintrin.h>
41 #elif defined(__SSE2__)
42 #include <emmintrin.h>
43 #endif
44
/* Debug category selected for this file.
 * NOTE(review): presumably consumed by the driver's debug macros; it is not
 * referenced directly by the code visible in this file.
 */
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Local spellings for the rounding helpers ROUND_DOWN_TO() and ALIGN(). */
#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile geometry.
 *
 * Widths and spans are measured in bytes; heights are measured in rows
 * (pixels, i.e. unitless).  A "span" is the largest number of bytes that can
 * be copied between linear and tiled memory before a new tiled address has
 * to be computed.
 */

/* X tiling. */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;

/* Y tiling. */
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
60
/**
 * Rotate the 32-bit value \p n right by \p d bit positions.
 *
 * Both shift counts are reduced modulo 32.  Without that, a rotation by 0
 * would evaluate "n << 32", and shifting by the full width of the type is
 * undefined behaviour in C.
 *
 * \param n  value to rotate
 * \param d  rotation distance in bits (taken modulo 32)
 * \return   \p n rotated right by \p d bits
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   d &= 31;
   return (n >> d) | (n << ((32 - d) & 31));
}
66
/**
 * Reverse the byte order of a 32-bit value.
 *
 * Uses the compiler builtin when the build system advertises it through
 * HAVE___BUILTIN_BSWAP32, otherwise moves each byte to its mirrored position
 * by hand.
 */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   uint32_t swapped = 0;

   swapped |= (n & 0x000000ffu) << 24;   /* byte 0 -> byte 3 */
   swapped |= (n & 0x0000ff00u) << 8;    /* byte 1 -> byte 2 */
   swapped |= (n & 0x00ff0000u) >> 8;    /* byte 2 -> byte 1 */
   swapped |= (n & 0xff000000u) >> 24;   /* byte 3 -> byte 0 */

   return swapped;
#endif
}
79
/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Copies \p bytes bytes from \p src to \p dst in groups of four, exchanging
 * the first and third byte of every group and leaving the second and fourth
 * untouched.
 *
 * The copy is done one byte at a time, so neither pointer needs any
 * particular alignment and the result does not depend on host endianness.
 *
 * \param dst    destination buffer
 * \param src    source buffer
 * \param bytes  number of bytes to copy; asserted to be a multiple of 4
 *               (with asserts compiled out, trailing bytes are not copied)
 * \return       \p dst, like memcpy()
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      d[0] = s[2];
      d[1] = s[1];
      d[2] = s[0];
      d[3] = s[3];
      d += 4;
      s += 4;
      bytes -= 4;
   }
   return dst;
}
99
#if defined(__SSSE3__)
/* Byte-shuffle control for _mm_shuffle_epi8: within every 4-byte pixel,
 * bytes 0 and 2 trade places while bytes 1 and 3 stay where they are.
 */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/* Swap R and B in the four 32-bit pixels held in 'pixels' (SSSE3 version). */
static inline __m128i
rgba8_swap_rb_16(__m128i pixels)
{
   /* The table is a plain byte array with no 16-byte alignment guarantee,
    * so fetch it with an unaligned load rather than dereferencing it as a
    * __m128i.
    */
   const __m128i control =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);

   return _mm_shuffle_epi8(pixels, control);
}

#elif defined(__SSE2__)
/* Swap R and B in the four 32-bit pixels held in 'pixels' (SSE2 version). */
static inline __m128i
rgba8_swap_rb_16(__m128i pixels)
{
   /* Selects bytes 1 and 3 (G and A) of every pixel. */
   const __m128i agmask = _mm_set1_epi32((int)0xFF00FF00u);

   const __m128i ag = _mm_and_si128(agmask, pixels);
   const __m128i rb = _mm_andnot_si128(agmask, pixels);

   /* Exchange the two 16-bit halves of every pixel, which moves R into
    * B's byte and vice versa (the G/A bytes are zero in 'rb').
    */
   const __m128i br =
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                          _MM_SHUFFLE(2, 3, 0, 1));

   return _mm_or_si128(ag, br);
}
#endif

#if defined(__SSSE3__) || defined(__SSE2__)
/**
 * Copy 16 bytes (four pixels) from \p src to \p dst, swapping R and B.
 * \p dst must be 16-byte aligned (aligned store); \p src may be unaligned.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   const __m128i pixels = _mm_loadu_si128((const __m128i *)src);

   _mm_store_si128((__m128i *)dst, rgba8_swap_rb_16(pixels));
}

/**
 * Copy 16 bytes (four pixels) from \p src to \p dst, swapping R and B.
 * \p src must be 16-byte aligned (aligned load); \p dst may be unaligned.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   const __m128i pixels = _mm_load_si128((const __m128i *)src);

   _mm_storeu_si128((__m128i *)dst, rgba8_swap_rb_16(pixels));
}
#endif
155
/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * When SSE2/SSSE3 is available the bulk of the data is moved 16 bytes at a
 * time (with a fully unrolled path for exactly 64 bytes); whatever is left,
 * or everything on other targets, is handed to rgba8_copy().
 *
 * \param dst    destination; asserted to be 16-byte aligned unless
 *               \p bytes is 0
 * \param src    source, any alignment
 * \param bytes  number of bytes to copy
 * \return       the original \p dst on every path, like memcpy()
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   /* Walk the buffers through byte pointers: arithmetic on void pointers is
    * a compiler extension, and keeping 'dst' untouched lets every path
    * return the same value.
    */
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(d +  0, s +  0);
      rgba8_copy_16_aligned_dst(d + 16, s + 16);
      rgba8_copy_16_aligned_dst(d + 32, s + 32);
      rgba8_copy_16_aligned_dst(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail (fewer than 16 bytes when the SIMD loop ran). */
   rgba8_copy(d, s, bytes);

   return dst;
}
185
/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * When SSE2/SSSE3 is available the bulk of the data is moved 16 bytes at a
 * time (with a fully unrolled path for exactly 64 bytes); whatever is left,
 * or everything on other targets, is handed to rgba8_copy().
 *
 * \param dst    destination, any alignment
 * \param src    source; asserted to be 16-byte aligned unless \p bytes is 0
 * \param bytes  number of bytes to copy
 * \return       the original \p dst on every path, like memcpy()
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   /* Walk the buffers through byte pointers: arithmetic on void pointers is
    * a compiler extension, and keeping 'dst' untouched lets every path
    * return the same value.
    */
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(d +  0, s +  0);
      rgba8_copy_16_aligned_src(d + 16, s + 16);
      rgba8_copy_16_aligned_src(d + 32, s + 32);
      rgba8_copy_16_aligned_src(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail (fewer than 16 bytes when the SIMD loop ran). */
   rgba8_copy(d, s, bytes);

   return dst;
}
215
216 /**
217 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
218 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
219 * The first and last ranges must be shorter than a "span" (the longest linear
220 * stretch within a tile) and the middle must equal a whole number of spans.
221 * Ranges may be empty. The region copied must land entirely within one tile.
222 * 'dst' is the start of the tile and 'src' is the corresponding
223 * address to copy from, though copying begins at (x0, y0).
224 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
225 * Swizzling flips bit 6 in the copy destination offset, when certain other
226 * bits are set in it.
227 */
228 typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
229 uint32_t y0, uint32_t y1,
230 char *dst, const char *src,
231 int32_t linear_pitch,
232 uint32_t swizzle_bit,
233 mem_copy_fn mem_copy);
234
235 /**
236 * Copy texture data from linear to X tile layout.
237 *
238 * \copydoc tile_copy_fn
239 *
240 * The mem_copy parameters allow the user to specify an alternative mem_copy
241 * function that, for instance, may do RGBA -> BGRA swizzling. The first
242 * function must handle any memory alignment while the second function must
243 * only handle 16-byte alignment in whichever side (source or destination) is
244 * tiled.
245 */
246 static inline void
247 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
248 uint32_t y0, uint32_t y1,
249 char *dst, const char *src,
250 int32_t src_pitch,
251 uint32_t swizzle_bit,
252 mem_copy_fn mem_copy,
253 mem_copy_fn mem_copy_align16)
254 {
255 /* The copy destination offset for each range copied is the sum of
256 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
257 */
258 uint32_t xo, yo;
259
260 src += (ptrdiff_t)y0 * src_pitch;
261
262 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
263 /* Bits 9 and 10 of the copy destination offset control swizzling.
264 * Only 'yo' contributes to those bits in the total offset,
265 * so calculate 'swizzle' just once per row.
266 * Move bits 9 and 10 three and four places respectively down
267 * to bit 6 and xor them.
268 */
269 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
270
271 mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
272
273 for (xo = x1; xo < x2; xo += xtile_span) {
274 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
275 }
276
277 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
278
279 src += src_pitch;
280 }
281 }
282
283 /**
284 * Copy texture data from linear to Y tile layout.
285 *
286 * \copydoc tile_copy_fn
287 */
288 static inline void
289 linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
290 uint32_t y0, uint32_t y3,
291 char *dst, const char *src,
292 int32_t src_pitch,
293 uint32_t swizzle_bit,
294 mem_copy_fn mem_copy,
295 mem_copy_fn mem_copy_align16)
296 {
297 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
298 * as the tile). Thus the destination offset for (x,y) is the sum of:
299 * (x % column_width) // position within column
300 * (x / column_width) * bytes_per_column // column number * bytes per column
301 * y * column_width
302 *
303 * The copy destination offset for each range copied is the sum of
304 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
305 */
306 const uint32_t column_width = ytile_span;
307 const uint32_t bytes_per_column = column_width * ytile_height;
308
309 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
310 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
311
312 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
313 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
314
315 /* Bit 9 of the destination offset control swizzling.
316 * Only the X offset contributes to bit 9 of the total offset,
317 * so swizzle can be calculated in advance for these X positions.
318 * Move bit 9 three places down to bit 6.
319 */
320 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
321 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
322
323 uint32_t x, yo;
324
325 src += (ptrdiff_t)y0 * src_pitch;
326
327 if (y0 != y1) {
328 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
329 uint32_t xo = xo1;
330 uint32_t swizzle = swizzle1;
331
332 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
333
334 /* Step by spans/columns. As it happens, the swizzle bit flips
335 * at each step so we don't need to calculate it explicitly.
336 */
337 for (x = x1; x < x2; x += ytile_span) {
338 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
339 xo += bytes_per_column;
340 swizzle ^= swizzle_bit;
341 }
342
343 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
344
345 src += src_pitch;
346 }
347 }
348
349 for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
350 uint32_t xo = xo1;
351 uint32_t swizzle = swizzle1;
352
353 if (x0 != x1) {
354 mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
355 mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
356 mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
357 mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
358 }
359
360 /* Step by spans/columns. As it happens, the swizzle bit flips
361 * at each step so we don't need to calculate it explicitly.
362 */
363 for (x = x1; x < x2; x += ytile_span) {
364 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
365 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
366 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
367 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
368 xo += bytes_per_column;
369 swizzle ^= swizzle_bit;
370 }
371
372 if (x2 != x3) {
373 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
374 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
375 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
376 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
377 }
378
379 src += 4 * src_pitch;
380 }
381
382 if (y2 != y3) {
383 for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
384 uint32_t xo = xo1;
385 uint32_t swizzle = swizzle1;
386
387 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
388
389 /* Step by spans/columns. As it happens, the swizzle bit flips
390 * at each step so we don't need to calculate it explicitly.
391 */
392 for (x = x1; x < x2; x += ytile_span) {
393 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
394 xo += bytes_per_column;
395 swizzle ^= swizzle_bit;
396 }
397
398 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
399
400 src += src_pitch;
401 }
402 }
403 }
404
405 /**
406 * Copy texture data from X tile layout to linear.
407 *
408 * \copydoc tile_copy_fn
409 */
410 static inline void
411 xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
412 uint32_t y0, uint32_t y1,
413 char *dst, const char *src,
414 int32_t dst_pitch,
415 uint32_t swizzle_bit,
416 mem_copy_fn mem_copy,
417 mem_copy_fn mem_copy_align16)
418 {
419 /* The copy destination offset for each range copied is the sum of
420 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
421 */
422 uint32_t xo, yo;
423
424 dst += (ptrdiff_t)y0 * dst_pitch;
425
426 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
427 /* Bits 9 and 10 of the copy destination offset control swizzling.
428 * Only 'yo' contributes to those bits in the total offset,
429 * so calculate 'swizzle' just once per row.
430 * Move bits 9 and 10 three and four places respectively down
431 * to bit 6 and xor them.
432 */
433 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
434
435 mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
436
437 for (xo = x1; xo < x2; xo += xtile_span) {
438 mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
439 }
440
441 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
442
443 dst += dst_pitch;
444 }
445 }
446
447 /**
448 * Copy texture data from Y tile layout to linear.
449 *
450 * \copydoc tile_copy_fn
451 */
452 static inline void
453 ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
454 uint32_t y0, uint32_t y1,
455 char *dst, const char *src,
456 int32_t dst_pitch,
457 uint32_t swizzle_bit,
458 mem_copy_fn mem_copy,
459 mem_copy_fn mem_copy_align16)
460 {
461 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
462 * as the tile). Thus the destination offset for (x,y) is the sum of:
463 * (x % column_width) // position within column
464 * (x / column_width) * bytes_per_column // column number * bytes per column
465 * y * column_width
466 *
467 * The copy destination offset for each range copied is the sum of
468 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
469 */
470 const uint32_t column_width = ytile_span;
471 const uint32_t bytes_per_column = column_width * ytile_height;
472
473 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
474 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
475
476 /* Bit 9 of the destination offset control swizzling.
477 * Only the X offset contributes to bit 9 of the total offset,
478 * so swizzle can be calculated in advance for these X positions.
479 * Move bit 9 three places down to bit 6.
480 */
481 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
482 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
483
484 uint32_t x, yo;
485
486 dst += (ptrdiff_t)y0 * dst_pitch;
487
488 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
489 uint32_t xo = xo1;
490 uint32_t swizzle = swizzle1;
491
492 mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
493
494 /* Step by spans/columns. As it happens, the swizzle bit flips
495 * at each step so we don't need to calculate it explicitly.
496 */
497 for (x = x1; x < x2; x += ytile_span) {
498 mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
499 xo += bytes_per_column;
500 swizzle ^= swizzle_bit;
501 }
502
503 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
504
505 dst += dst_pitch;
506 }
507 }
508
509
510 /**
511 * Copy texture data from linear to X tile layout, faster.
512 *
513 * Same as \ref linear_to_xtiled but faster, because it passes constant
514 * parameters for common cases, allowing the compiler to inline code
515 * optimized for those cases.
516 *
517 * \copydoc tile_copy_fn
518 */
519 static FLATTEN void
520 linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
521 uint32_t y0, uint32_t y1,
522 char *dst, const char *src,
523 int32_t src_pitch,
524 uint32_t swizzle_bit,
525 mem_copy_fn mem_copy)
526 {
527 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
528 if (mem_copy == memcpy)
529 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
530 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
531 else if (mem_copy == rgba8_copy)
532 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
533 dst, src, src_pitch, swizzle_bit,
534 rgba8_copy, rgba8_copy_aligned_dst);
535 else
536 unreachable("not reached");
537 } else {
538 if (mem_copy == memcpy)
539 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
540 dst, src, src_pitch, swizzle_bit,
541 memcpy, memcpy);
542 else if (mem_copy == rgba8_copy)
543 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
544 dst, src, src_pitch, swizzle_bit,
545 rgba8_copy, rgba8_copy_aligned_dst);
546 else
547 unreachable("not reached");
548 }
549 linear_to_xtiled(x0, x1, x2, x3, y0, y1,
550 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
551 }
552
553 /**
554 * Copy texture data from linear to Y tile layout, faster.
555 *
556 * Same as \ref linear_to_ytiled but faster, because it passes constant
557 * parameters for common cases, allowing the compiler to inline code
558 * optimized for those cases.
559 *
560 * \copydoc tile_copy_fn
561 */
562 static FLATTEN void
563 linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
564 uint32_t y0, uint32_t y1,
565 char *dst, const char *src,
566 int32_t src_pitch,
567 uint32_t swizzle_bit,
568 mem_copy_fn mem_copy)
569 {
570 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
571 if (mem_copy == memcpy)
572 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
573 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
574 else if (mem_copy == rgba8_copy)
575 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
576 dst, src, src_pitch, swizzle_bit,
577 rgba8_copy, rgba8_copy_aligned_dst);
578 else
579 unreachable("not reached");
580 } else {
581 if (mem_copy == memcpy)
582 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
583 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
584 else if (mem_copy == rgba8_copy)
585 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
586 dst, src, src_pitch, swizzle_bit,
587 rgba8_copy, rgba8_copy_aligned_dst);
588 else
589 unreachable("not reached");
590 }
591 linear_to_ytiled(x0, x1, x2, x3, y0, y1,
592 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
593 }
594
595 /**
596 * Copy texture data from X tile layout to linear, faster.
597 *
598 * Same as \ref xtile_to_linear but faster, because it passes constant
599 * parameters for common cases, allowing the compiler to inline code
600 * optimized for those cases.
601 *
602 * \copydoc tile_copy_fn
603 */
604 static FLATTEN void
605 xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
606 uint32_t y0, uint32_t y1,
607 char *dst, const char *src,
608 int32_t dst_pitch,
609 uint32_t swizzle_bit,
610 mem_copy_fn mem_copy)
611 {
612 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
613 if (mem_copy == memcpy)
614 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
615 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
616 else if (mem_copy == rgba8_copy)
617 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
618 dst, src, dst_pitch, swizzle_bit,
619 rgba8_copy, rgba8_copy_aligned_src);
620 else
621 unreachable("not reached");
622 } else {
623 if (mem_copy == memcpy)
624 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
625 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
626 else if (mem_copy == rgba8_copy)
627 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
628 dst, src, dst_pitch, swizzle_bit,
629 rgba8_copy, rgba8_copy_aligned_src);
630 else
631 unreachable("not reached");
632 }
633 xtiled_to_linear(x0, x1, x2, x3, y0, y1,
634 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
635 }
636
637 /**
638 * Copy texture data from Y tile layout to linear, faster.
639 *
640 * Same as \ref ytile_to_linear but faster, because it passes constant
641 * parameters for common cases, allowing the compiler to inline code
642 * optimized for those cases.
643 *
644 * \copydoc tile_copy_fn
645 */
646 static FLATTEN void
647 ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
648 uint32_t y0, uint32_t y1,
649 char *dst, const char *src,
650 int32_t dst_pitch,
651 uint32_t swizzle_bit,
652 mem_copy_fn mem_copy)
653 {
654 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
655 if (mem_copy == memcpy)
656 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
657 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
658 else if (mem_copy == rgba8_copy)
659 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
660 dst, src, dst_pitch, swizzle_bit,
661 rgba8_copy, rgba8_copy_aligned_src);
662 else
663 unreachable("not reached");
664 } else {
665 if (mem_copy == memcpy)
666 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
667 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
668 else if (mem_copy == rgba8_copy)
669 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
670 dst, src, dst_pitch, swizzle_bit,
671 rgba8_copy, rgba8_copy_aligned_src);
672 else
673 unreachable("not reached");
674 }
675 ytiled_to_linear(x0, x1, x2, x3, y0, y1,
676 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
677 }
678
679 /**
680 * Copy from linear to tiled texture.
681 *
682 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
683 * pieces that do not cross tile boundaries and copy each piece with a tile
684 * copy function (\ref tile_copy_fn).
685 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
686 * The Y range is in pixels (i.e. unitless).
687 * 'dst' is the address of (0, 0) in the destination tiled texture.
688 * 'src' is the address of (xt1, yt1) in the source linear texture.
689 */
690 void
691 linear_to_tiled(uint32_t xt1, uint32_t xt2,
692 uint32_t yt1, uint32_t yt2,
693 char *dst, const char *src,
694 uint32_t dst_pitch, int32_t src_pitch,
695 bool has_swizzling,
696 enum isl_tiling tiling,
697 mem_copy_fn mem_copy)
698 {
699 tile_copy_fn tile_copy;
700 uint32_t xt0, xt3;
701 uint32_t yt0, yt3;
702 uint32_t xt, yt;
703 uint32_t tw, th, span;
704 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
705
706 if (tiling == ISL_TILING_X) {
707 tw = xtile_width;
708 th = xtile_height;
709 span = xtile_span;
710 tile_copy = linear_to_xtiled_faster;
711 } else if (tiling == ISL_TILING_Y0) {
712 tw = ytile_width;
713 th = ytile_height;
714 span = ytile_span;
715 tile_copy = linear_to_ytiled_faster;
716 } else {
717 unreachable("unsupported tiling");
718 }
719
720 /* Round out to tile boundaries. */
721 xt0 = ALIGN_DOWN(xt1, tw);
722 xt3 = ALIGN_UP (xt2, tw);
723 yt0 = ALIGN_DOWN(yt1, th);
724 yt3 = ALIGN_UP (yt2, th);
725
726 /* Loop over all tiles to which we have something to copy.
727 * 'xt' and 'yt' are the origin of the destination tile, whether copying
728 * copying a full or partial tile.
729 * tile_copy() copies one tile or partial tile.
730 * Looping x inside y is the faster memory access pattern.
731 */
732 for (yt = yt0; yt < yt3; yt += th) {
733 for (xt = xt0; xt < xt3; xt += tw) {
734 /* The area to update is [x0,x3) x [y0,y1).
735 * May not want the whole tile, hence the min and max.
736 */
737 uint32_t x0 = MAX2(xt1, xt);
738 uint32_t y0 = MAX2(yt1, yt);
739 uint32_t x3 = MIN2(xt2, xt + tw);
740 uint32_t y1 = MIN2(yt2, yt + th);
741
742 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
743 * the middle interval is the longest span-aligned part.
744 * The sub-ranges could be empty.
745 */
746 uint32_t x1, x2;
747 x1 = ALIGN_UP(x0, span);
748 if (x1 > x3)
749 x1 = x2 = x3;
750 else
751 x2 = ALIGN_DOWN(x3, span);
752
753 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
754 assert(x1 - x0 < span && x3 - x2 < span);
755 assert(x3 - x0 <= tw);
756 assert((x2 - x1) % span == 0);
757
758 /* Translate by (xt,yt) for single-tile copier. */
759 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
760 y0-yt, y1-yt,
761 dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
762 src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
763 src_pitch,
764 swizzle_bit,
765 mem_copy);
766 }
767 }
768 }
769
770 /**
771 * Copy from tiled to linear texture.
772 *
773 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
774 * pieces that do not cross tile boundaries and copy each piece with a tile
775 * copy function (\ref tile_copy_fn).
776 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
777 * The Y range is in pixels (i.e. unitless).
778 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
779 * 'src' is the address of (0, 0) in the source tiled texture.
780 */
781 void
782 tiled_to_linear(uint32_t xt1, uint32_t xt2,
783 uint32_t yt1, uint32_t yt2,
784 char *dst, const char *src,
785 int32_t dst_pitch, uint32_t src_pitch,
786 bool has_swizzling,
787 enum isl_tiling tiling,
788 mem_copy_fn mem_copy)
789 {
790 tile_copy_fn tile_copy;
791 uint32_t xt0, xt3;
792 uint32_t yt0, yt3;
793 uint32_t xt, yt;
794 uint32_t tw, th, span;
795 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
796
797 if (tiling == ISL_TILING_X) {
798 tw = xtile_width;
799 th = xtile_height;
800 span = xtile_span;
801 tile_copy = xtiled_to_linear_faster;
802 } else if (tiling == ISL_TILING_Y0) {
803 tw = ytile_width;
804 th = ytile_height;
805 span = ytile_span;
806 tile_copy = ytiled_to_linear_faster;
807 } else {
808 unreachable("unsupported tiling");
809 }
810
811 /* Round out to tile boundaries. */
812 xt0 = ALIGN_DOWN(xt1, tw);
813 xt3 = ALIGN_UP (xt2, tw);
814 yt0 = ALIGN_DOWN(yt1, th);
815 yt3 = ALIGN_UP (yt2, th);
816
817 /* Loop over all tiles to which we have something to copy.
818 * 'xt' and 'yt' are the origin of the destination tile, whether copying
819 * copying a full or partial tile.
820 * tile_copy() copies one tile or partial tile.
821 * Looping x inside y is the faster memory access pattern.
822 */
823 for (yt = yt0; yt < yt3; yt += th) {
824 for (xt = xt0; xt < xt3; xt += tw) {
825 /* The area to update is [x0,x3) x [y0,y1).
826 * May not want the whole tile, hence the min and max.
827 */
828 uint32_t x0 = MAX2(xt1, xt);
829 uint32_t y0 = MAX2(yt1, yt);
830 uint32_t x3 = MIN2(xt2, xt + tw);
831 uint32_t y1 = MIN2(yt2, yt + th);
832
833 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
834 * the middle interval is the longest span-aligned part.
835 * The sub-ranges could be empty.
836 */
837 uint32_t x1, x2;
838 x1 = ALIGN_UP(x0, span);
839 if (x1 > x3)
840 x1 = x2 = x3;
841 else
842 x2 = ALIGN_DOWN(x3, span);
843
844 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
845 assert(x1 - x0 < span && x3 - x2 < span);
846 assert(x3 - x0 <= tw);
847 assert((x2 - x1) % span == 0);
848
849 /* Translate by (xt,yt) for single-tile copier. */
850 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
851 y0-yt, y1-yt,
852 dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
853 src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
854 dst_pitch,
855 swizzle_bit,
856 mem_copy);
857 }
858 }
859 }
860
861
862 /**
863 * Determine which copy function to use for the given format combination
864 *
865 * The only two possible copy functions which are ever returned are a
866 * direct memcpy and a RGBA <-> BGRA copy function. Since RGBA -> BGRA and
867 * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
868 * symmetric), it doesn't matter whether the copy is from the tiled image
869 * to the untiled or vice versa. The copy function required is the same in
870 * either case so this function can be used.
871 *
872 * \param[in] tiledFormat The format of the tiled image
873 * \param[in] format The GL format of the client data
874 * \param[in] type The GL type of the client data
875 * \param[out] mem_copy Will be set to one of either the standard
876 * library's memcpy or a different copy function
877 * that performs an RGBA to BGRA conversion
878 * \param[out] cpp Number of bytes per channel
879 *
880 * \return true if the format and type combination are valid
881 */
882 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
883 GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp)
884 {
885 if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
886 !(format == GL_RGBA || format == GL_BGRA))
887 return false; /* Invalid type/format combination */
888
889 if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
890 (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
891 *cpp = 1;
892 *mem_copy = memcpy;
893 } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
894 (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
895 (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
896 (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
897 *cpp = 4;
898 if (format == GL_BGRA) {
899 *mem_copy = memcpy;
900 } else if (format == GL_RGBA) {
901 *mem_copy = rgba8_copy;
902 }
903 } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
904 (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
905 (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
906 (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
907 *cpp = 4;
908 if (format == GL_BGRA) {
909 /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
910 * use the same function.
911 */
912 *mem_copy = rgba8_copy;
913 } else if (format == GL_RGBA) {
914 *mem_copy = memcpy;
915 }
916 }
917
918 if (!(*mem_copy))
919 return false;
920
921 return true;
922 }