intel/device: rename gen_get_device_info
[mesa.git] / src / intel / isl / isl_tiled_memcpy.c
1 /*
2 * Mesa 3-D graphics library
3 *
4 * Copyright 2012 Intel Corporation
5 * Copyright 2013 Google
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Authors:
28 * Chad Versace <chad.versace@linux.intel.com>
29 * Frank Henigman <fjhenigman@google.com>
30 */
31
32 #include <string.h>
33
34 #include "util/macros.h"
35 #include "main/macros.h"
36
37 #include "isl_priv.h"
38
39 #if defined(__SSSE3__)
40 #include <tmmintrin.h>
41 #elif defined(__SSE2__)
42 #include <emmintrin.h>
43 #endif
44
/* Debug category for this file -- presumably picked up by the project's
 * debug-print macros; nothing in the code shown here references it directly.
 */
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Local spellings for the rounding helpers pulled in from the macro headers:
 * ALIGN_UP(a, b) forwards to ALIGN() and ALIGN_DOWN(a, b) forwards to
 * ROUND_DOWN_TO().  They are used below to round coordinates out to tile and
 * span boundaries.
 */
#define ALIGN_UP(a, b) ALIGN(a, b)
#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
49
/* Tile geometry.
 *
 * 'width' and 'span' are measured in bytes; 'height' is a row count (pixels,
 * i.e. unitless).  A "span" is the longest run of bytes that can be copied
 * between linear and tiled memory without recomputing the tiled address.
 */

/* X tiling */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;

/* Y tiling */
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
60
/**
 * Rotate the 32-bit value 'n' right by 'd' bit positions.
 *
 * The rotate count is reduced modulo 32, so d == 0 (and any multiple of 32)
 * returns 'n' unchanged instead of evaluating a shift by the full width of
 * the type, which C leaves undefined.
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   const uint32_t right = d & 31;
   /* (32 - right) & 31 maps right == 0 to a left shift of 0, not 32. */
   const uint32_t left = (32 - right) & 31;

   return (n >> right) | (n << left);
}
66
/**
 * Reverse the byte order of a 32-bit value.
 *
 * Uses the compiler builtin when the build system advertises it through
 * HAVE___BUILTIN_BSWAP32, otherwise moves each byte into place by hand.
 */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   const uint32_t byte3_to_0 = n >> 24;
   const uint32_t byte2_to_1 = (n & 0x00ff0000) >> 8;
   const uint32_t byte1_to_2 = (n & 0x0000ff00) << 8;
   const uint32_t byte0_to_3 = n << 24;

   return byte3_to_0 | byte2_to_1 | byte1_to_2 | byte0_to_3;
#endif
}
79
/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Copies 'bytes' bytes (asserted to be a multiple of 4) from 'src' to 'dst',
 * converting each 32-bit pixel with ror(bswap32(pixel), 8), which exchanges
 * the lowest and third-lowest bytes of the 32-bit value and leaves the other
 * two where they were.
 *
 * This is the copier used for ranges with no alignment guarantee, so each
 * pixel is loaded and stored through memcpy() rather than by dereferencing a
 * uint32_t pointer: neither 'dst' nor 'src' has to be 4-byte aligned.
 *
 * \return 'dst', like memcpy().
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      uint32_t pixel;

      memcpy(&pixel, s, sizeof(pixel));
      pixel = ror(bswap32(pixel), 8);
      memcpy(d, &pixel, sizeof(pixel));

      d += sizeof(pixel);
      s += sizeof(pixel);
      bytes -= sizeof(pixel);
   }
   return dst;
}
99
#ifdef __SSSE3__
/* Byte-shuffle control for _mm_shuffle_epi8: within each 4-byte pixel,
 * exchange bytes 0 and 2 and keep bytes 1 and 3.
 */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/**
 * Swap R and B in 16 bytes (four pixels).  'dst' must be 16-byte aligned
 * (aligned store); 'src' may have any alignment (unaligned load).
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   /* The table is a plain uint8_t array with no alignment guarantee, so it
    * is read with an unaligned load rather than dereferenced as __m128i.
    */
   const __m128i shuffle =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);
   const __m128i pixels = _mm_loadu_si128((const __m128i *)src);

   _mm_store_si128((__m128i *)dst, _mm_shuffle_epi8(pixels, shuffle));
}

/**
 * Swap R and B in 16 bytes (four pixels).  'src' must be 16-byte aligned
 * (aligned load); 'dst' may have any alignment (unaligned store).
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   /* See rgba8_copy_16_aligned_dst: unaligned load of the control mask. */
   const __m128i shuffle =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);
   const __m128i pixels = _mm_load_si128((const __m128i *)src);

   _mm_storeu_si128((__m128i *)dst, _mm_shuffle_epi8(pixels, shuffle));
}

#elif defined(__SSE2__)
/**
 * Swap R and B in 16 bytes (four pixels) without byte shuffles.  'dst' must
 * be 16-byte aligned (aligned store); 'src' may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   /* Selects bytes 1 and 3 of every pixel, which stay where they are. */
   const __m128i agmask = _mm_set1_epi32((int)0xFF00FF00);
   const __m128i srcreg = _mm_loadu_si128((const __m128i *)src);

   const __m128i rb = _mm_andnot_si128(agmask, srcreg);
   const __m128i ag = _mm_and_si128(agmask, srcreg);
   /* Exchange the two 16-bit halves of every pixel, moving bytes 0 and 2
    * into each other's place.
    */
   const __m128i br =
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                          _MM_SHUFFLE(2, 3, 0, 1));
   const __m128i dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

/**
 * Swap R and B in 16 bytes (four pixels) without byte shuffles.  'src' must
 * be 16-byte aligned (aligned load); 'dst' may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   /* Selects bytes 1 and 3 of every pixel, which stay where they are. */
   const __m128i agmask = _mm_set1_epi32((int)0xFF00FF00);
   const __m128i srcreg = _mm_load_si128((const __m128i *)src);

   const __m128i rb = _mm_andnot_si128(agmask, srcreg);
   const __m128i ag = _mm_and_si128(agmask, srcreg);
   /* Exchange the two 16-bit halves of every pixel, moving bytes 0 and 2
    * into each other's place.
    */
   const __m128i br =
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                          _MM_SHUFFLE(2, 3, 0, 1));
   const __m128i dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif
155
/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * When SSSE3 or SSE2 is available the bulk of the copy is done 16 bytes at a
 * time with rgba8_copy_16_aligned_dst(); a copy of exactly 64 bytes takes a
 * fully unrolled path.  Whatever is left over (or everything, without SSE)
 * is handed to rgba8_copy().
 *
 * 'dst' must be 16-byte aligned unless 'bytes' is zero (asserted).
 *
 * \return the original 'dst', like memcpy().
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   /* Byte cursors: arithmetic on void pointers is a GNU extension. */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(d +  0, s +  0);
      rgba8_copy_16_aligned_dst(d + 16, s + 16);
      rgba8_copy_16_aligned_dst(d + 32, s + 32);
      rgba8_copy_16_aligned_dst(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail shorter than 16 bytes (or the whole copy when SSE is absent). */
   rgba8_copy(d, s, bytes);

   return dst;
}
185
/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * When SSSE3 or SSE2 is available the bulk of the copy is done 16 bytes at a
 * time with rgba8_copy_16_aligned_src(); a copy of exactly 64 bytes takes a
 * fully unrolled path.  Whatever is left over (or everything, without SSE)
 * is handed to rgba8_copy().
 *
 * 'src' must be 16-byte aligned unless 'bytes' is zero (asserted).
 *
 * \return the original 'dst', like memcpy().
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   /* Byte cursors: arithmetic on void pointers is a GNU extension. */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(d +  0, s +  0);
      rgba8_copy_16_aligned_src(d + 16, s + 16);
      rgba8_copy_16_aligned_src(d + 32, s + 32);
      rgba8_copy_16_aligned_src(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail shorter than 16 bytes (or the whole copy when SSE is absent). */
   rgba8_copy(d, s, bytes);

   return dst;
}
215
216 /**
217 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
218 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
219 * The first and last ranges must be shorter than a "span" (the longest linear
220 * stretch within a tile) and the middle must equal a whole number of spans.
221 * Ranges may be empty. The region copied must land entirely within one tile.
222 * 'dst' is the start of the tile and 'src' is the corresponding
223 * address to copy from, though copying begins at (x0, y0).
224 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
225 * Swizzling flips bit 6 in the copy destination offset, when certain other
226 * bits are set in it.
227 */
228 typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
229 uint32_t y0, uint32_t y1,
230 char *dst, const char *src,
231 int32_t linear_pitch,
232 uint32_t swizzle_bit,
233 isl_memcpy_type copy_type);
234
235 /**
236 * Copy texture data from linear to X tile layout.
237 *
238 * \copydoc tile_copy_fn
239 *
240 * The mem_copy parameters allow the user to specify an alternative mem_copy
241 * function that, for instance, may do RGBA -> BGRA swizzling. The first
242 * function must handle any memory alignment while the second function must
243 * only handle 16-byte alignment in whichever side (source or destination) is
244 * tiled.
245 */
246 static inline void
247 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
248 uint32_t y0, uint32_t y1,
249 char *dst, const char *src,
250 int32_t src_pitch,
251 uint32_t swizzle_bit,
252 isl_mem_copy_fn mem_copy,
253 isl_mem_copy_fn mem_copy_align16)
254 {
255 /* The copy destination offset for each range copied is the sum of
256 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
257 */
258 uint32_t xo, yo;
259
260 src += (ptrdiff_t)y0 * src_pitch;
261
262 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
263 /* Bits 9 and 10 of the copy destination offset control swizzling.
264 * Only 'yo' contributes to those bits in the total offset,
265 * so calculate 'swizzle' just once per row.
266 * Move bits 9 and 10 three and four places respectively down
267 * to bit 6 and xor them.
268 */
269 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
270
271 mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
272
273 for (xo = x1; xo < x2; xo += xtile_span) {
274 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
275 }
276
277 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
278
279 src += src_pitch;
280 }
281 }
282
283 /**
284 * Copy texture data from linear to Y tile layout.
285 *
286 * \copydoc tile_copy_fn
287 */
288 static inline void
289 linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
290 uint32_t y0, uint32_t y3,
291 char *dst, const char *src,
292 int32_t src_pitch,
293 uint32_t swizzle_bit,
294 isl_mem_copy_fn mem_copy,
295 isl_mem_copy_fn mem_copy_align16)
296 {
297 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
298 * as the tile). Thus the destination offset for (x,y) is the sum of:
299 * (x % column_width) // position within column
300 * (x / column_width) * bytes_per_column // column number * bytes per column
301 * y * column_width
302 *
303 * The copy destination offset for each range copied is the sum of
304 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
305 */
306 const uint32_t column_width = ytile_span;
307 const uint32_t bytes_per_column = column_width * ytile_height;
308
309 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
310 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
311
312 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
313 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
314
315 /* Bit 9 of the destination offset control swizzling.
316 * Only the X offset contributes to bit 9 of the total offset,
317 * so swizzle can be calculated in advance for these X positions.
318 * Move bit 9 three places down to bit 6.
319 */
320 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
321 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
322
323 uint32_t x, yo;
324
325 src += (ptrdiff_t)y0 * src_pitch;
326
327 if (y0 != y1) {
328 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
329 uint32_t xo = xo1;
330 uint32_t swizzle = swizzle1;
331
332 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
333
334 /* Step by spans/columns. As it happens, the swizzle bit flips
335 * at each step so we don't need to calculate it explicitly.
336 */
337 for (x = x1; x < x2; x += ytile_span) {
338 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
339 xo += bytes_per_column;
340 swizzle ^= swizzle_bit;
341 }
342
343 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
344
345 src += src_pitch;
346 }
347 }
348
349 for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
350 uint32_t xo = xo1;
351 uint32_t swizzle = swizzle1;
352
353 if (x0 != x1) {
354 mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
355 mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
356 mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
357 mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
358 }
359
360 /* Step by spans/columns. As it happens, the swizzle bit flips
361 * at each step so we don't need to calculate it explicitly.
362 */
363 for (x = x1; x < x2; x += ytile_span) {
364 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
365 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
366 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
367 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
368 xo += bytes_per_column;
369 swizzle ^= swizzle_bit;
370 }
371
372 if (x2 != x3) {
373 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
374 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
375 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
376 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
377 }
378
379 src += 4 * src_pitch;
380 }
381
382 if (y2 != y3) {
383 for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
384 uint32_t xo = xo1;
385 uint32_t swizzle = swizzle1;
386
387 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
388
389 /* Step by spans/columns. As it happens, the swizzle bit flips
390 * at each step so we don't need to calculate it explicitly.
391 */
392 for (x = x1; x < x2; x += ytile_span) {
393 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
394 xo += bytes_per_column;
395 swizzle ^= swizzle_bit;
396 }
397
398 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
399
400 src += src_pitch;
401 }
402 }
403 }
404
405 /**
406 * Copy texture data from X tile layout to linear.
407 *
408 * \copydoc tile_copy_fn
409 */
410 static inline void
411 xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
412 uint32_t y0, uint32_t y1,
413 char *dst, const char *src,
414 int32_t dst_pitch,
415 uint32_t swizzle_bit,
416 isl_mem_copy_fn mem_copy,
417 isl_mem_copy_fn mem_copy_align16)
418 {
419 /* The copy destination offset for each range copied is the sum of
420 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
421 */
422 uint32_t xo, yo;
423
424 dst += (ptrdiff_t)y0 * dst_pitch;
425
426 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
427 /* Bits 9 and 10 of the copy destination offset control swizzling.
428 * Only 'yo' contributes to those bits in the total offset,
429 * so calculate 'swizzle' just once per row.
430 * Move bits 9 and 10 three and four places respectively down
431 * to bit 6 and xor them.
432 */
433 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
434
435 mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
436
437 for (xo = x1; xo < x2; xo += xtile_span) {
438 mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
439 }
440
441 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
442
443 dst += dst_pitch;
444 }
445 }
446
447 /**
448 * Copy texture data from Y tile layout to linear.
449 *
450 * \copydoc tile_copy_fn
451 */
452 static inline void
453 ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
454 uint32_t y0, uint32_t y3,
455 char *dst, const char *src,
456 int32_t dst_pitch,
457 uint32_t swizzle_bit,
458 isl_mem_copy_fn mem_copy,
459 isl_mem_copy_fn mem_copy_align16)
460 {
461 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
462 * as the tile). Thus the destination offset for (x,y) is the sum of:
463 * (x % column_width) // position within column
464 * (x / column_width) * bytes_per_column // column number * bytes per column
465 * y * column_width
466 *
467 * The copy destination offset for each range copied is the sum of
468 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
469 */
470 const uint32_t column_width = ytile_span;
471 const uint32_t bytes_per_column = column_width * ytile_height;
472
473 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
474 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
475
476 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
477 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
478
479 /* Bit 9 of the destination offset control swizzling.
480 * Only the X offset contributes to bit 9 of the total offset,
481 * so swizzle can be calculated in advance for these X positions.
482 * Move bit 9 three places down to bit 6.
483 */
484 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
485 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
486
487 uint32_t x, yo;
488
489 dst += (ptrdiff_t)y0 * dst_pitch;
490
491 if (y0 != y1) {
492 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
493 uint32_t xo = xo1;
494 uint32_t swizzle = swizzle1;
495
496 mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
497
498 /* Step by spans/columns. As it happens, the swizzle bit flips
499 * at each step so we don't need to calculate it explicitly.
500 */
501 for (x = x1; x < x2; x += ytile_span) {
502 mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
503 xo += bytes_per_column;
504 swizzle ^= swizzle_bit;
505 }
506
507 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
508
509 dst += dst_pitch;
510 }
511 }
512
513 for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
514 uint32_t xo = xo1;
515 uint32_t swizzle = swizzle1;
516
517 if (x0 != x1) {
518 mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
519 mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
520 mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
521 mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
522 }
523
524 /* Step by spans/columns. As it happens, the swizzle bit flips
525 * at each step so we don't need to calculate it explicitly.
526 */
527 for (x = x1; x < x2; x += ytile_span) {
528 mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
529 mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
530 mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
531 mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
532 xo += bytes_per_column;
533 swizzle ^= swizzle_bit;
534 }
535
536 if (x2 != x3) {
537 mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
538 mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
539 mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
540 mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
541 }
542
543 dst += 4 * dst_pitch;
544 }
545
546 if (y2 != y3) {
547 for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
548 uint32_t xo = xo1;
549 uint32_t swizzle = swizzle1;
550
551 mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
552
553 /* Step by spans/columns. As it happens, the swizzle bit flips
554 * at each step so we don't need to calculate it explicitly.
555 */
556 for (x = x1; x < x2; x += ytile_span) {
557 mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
558 xo += bytes_per_column;
559 swizzle ^= swizzle_bit;
560 }
561
562 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
563
564 dst += dst_pitch;
565 }
566 }
567 }
568
#if defined(INLINE_SSE41)
/**
 * memcpy-shaped copier that reads its source with streaming loads.
 *
 * Copies of exactly 16 or exactly 64 bytes are done with
 * _mm_stream_load_si128() followed by unaligned stores; for 64 bytes all
 * four loads are issued before the first store.  Any other size is asserted
 * to be below 64 bytes and is passed to plain memcpy().
 *
 * \return 'dest', like memcpy().
 */
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   __m128i *in = (__m128i *)src;
   __m128i *out = (__m128i *)dest;

   switch (count) {
   case 16: {
      __m128i val = _mm_stream_load_si128(in);

      _mm_storeu_si128(out, val);
      return dest;
   }
   case 64: {
      __m128i val0 = _mm_stream_load_si128(in + 0);
      __m128i val1 = _mm_stream_load_si128(in + 1);
      __m128i val2 = _mm_stream_load_si128(in + 2);
      __m128i val3 = _mm_stream_load_si128(in + 3);

      _mm_storeu_si128(out + 0, val0);
      _mm_storeu_si128(out + 1, val1);
      _mm_storeu_si128(out + 2, val2);
      _mm_storeu_si128(out + 3, val3);
      return dest;
   }
   default:
      assert(count < 64); /* and (count < 16) for ytiled */
      return memcpy(dest, src, count);
   }
}
#endif
593
594 static isl_mem_copy_fn
595 choose_copy_function(isl_memcpy_type copy_type)
596 {
597 switch(copy_type) {
598 case ISL_MEMCPY:
599 return memcpy;
600 case ISL_MEMCPY_BGRA8:
601 return rgba8_copy;
602 case ISL_MEMCPY_STREAMING_LOAD:
603 #if defined(INLINE_SSE41)
604 return _memcpy_streaming_load;
605 #else
606 unreachable("ISL_MEMCOPY_STREAMING_LOAD requires sse4.1");
607 #endif
608 case ISL_MEMCPY_INVALID:
609 unreachable("invalid copy_type");
610 }
611 unreachable("unhandled copy_type");
612 return NULL;
613 }
614
615 /**
616 * Copy texture data from linear to X tile layout, faster.
617 *
618 * Same as \ref linear_to_xtiled but faster, because it passes constant
619 * parameters for common cases, allowing the compiler to inline code
620 * optimized for those cases.
621 *
622 * \copydoc tile_copy_fn
623 */
624 static FLATTEN void
625 linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
626 uint32_t y0, uint32_t y1,
627 char *dst, const char *src,
628 int32_t src_pitch,
629 uint32_t swizzle_bit,
630 isl_memcpy_type copy_type)
631 {
632 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
633
634 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
635 if (mem_copy == memcpy)
636 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
637 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
638 else if (mem_copy == rgba8_copy)
639 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
640 dst, src, src_pitch, swizzle_bit,
641 rgba8_copy, rgba8_copy_aligned_dst);
642 else
643 unreachable("not reached");
644 } else {
645 if (mem_copy == memcpy)
646 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
647 dst, src, src_pitch, swizzle_bit,
648 memcpy, memcpy);
649 else if (mem_copy == rgba8_copy)
650 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
651 dst, src, src_pitch, swizzle_bit,
652 rgba8_copy, rgba8_copy_aligned_dst);
653 else
654 unreachable("not reached");
655 }
656 linear_to_xtiled(x0, x1, x2, x3, y0, y1,
657 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
658 }
659
660 /**
661 * Copy texture data from linear to Y tile layout, faster.
662 *
663 * Same as \ref linear_to_ytiled but faster, because it passes constant
664 * parameters for common cases, allowing the compiler to inline code
665 * optimized for those cases.
666 *
667 * \copydoc tile_copy_fn
668 */
669 static FLATTEN void
670 linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
671 uint32_t y0, uint32_t y1,
672 char *dst, const char *src,
673 int32_t src_pitch,
674 uint32_t swizzle_bit,
675 isl_memcpy_type copy_type)
676 {
677 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
678
679 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
680 if (mem_copy == memcpy)
681 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
682 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
683 else if (mem_copy == rgba8_copy)
684 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
685 dst, src, src_pitch, swizzle_bit,
686 rgba8_copy, rgba8_copy_aligned_dst);
687 else
688 unreachable("not reached");
689 } else {
690 if (mem_copy == memcpy)
691 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
692 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
693 else if (mem_copy == rgba8_copy)
694 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
695 dst, src, src_pitch, swizzle_bit,
696 rgba8_copy, rgba8_copy_aligned_dst);
697 else
698 unreachable("not reached");
699 }
700 linear_to_ytiled(x0, x1, x2, x3, y0, y1,
701 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
702 }
703
704 /**
705 * Copy texture data from X tile layout to linear, faster.
706 *
707 * Same as \ref xtile_to_linear but faster, because it passes constant
708 * parameters for common cases, allowing the compiler to inline code
709 * optimized for those cases.
710 *
711 * \copydoc tile_copy_fn
712 */
713 static FLATTEN void
714 xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
715 uint32_t y0, uint32_t y1,
716 char *dst, const char *src,
717 int32_t dst_pitch,
718 uint32_t swizzle_bit,
719 isl_memcpy_type copy_type)
720 {
721 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
722
723 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
724 if (mem_copy == memcpy)
725 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
726 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
727 else if (mem_copy == rgba8_copy)
728 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
729 dst, src, dst_pitch, swizzle_bit,
730 rgba8_copy, rgba8_copy_aligned_src);
731 #if defined(INLINE_SSE41)
732 else if (mem_copy == _memcpy_streaming_load)
733 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
734 dst, src, dst_pitch, swizzle_bit,
735 memcpy, _memcpy_streaming_load);
736 #endif
737 else
738 unreachable("not reached");
739 } else {
740 if (mem_copy == memcpy)
741 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
742 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
743 else if (mem_copy == rgba8_copy)
744 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
745 dst, src, dst_pitch, swizzle_bit,
746 rgba8_copy, rgba8_copy_aligned_src);
747 #if defined(INLINE_SSE41)
748 else if (mem_copy == _memcpy_streaming_load)
749 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
750 dst, src, dst_pitch, swizzle_bit,
751 memcpy, _memcpy_streaming_load);
752 #endif
753 else
754 unreachable("not reached");
755 }
756 xtiled_to_linear(x0, x1, x2, x3, y0, y1,
757 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
758 }
759
760 /**
761 * Copy texture data from Y tile layout to linear, faster.
762 *
763 * Same as \ref ytile_to_linear but faster, because it passes constant
764 * parameters for common cases, allowing the compiler to inline code
765 * optimized for those cases.
766 *
767 * \copydoc tile_copy_fn
768 */
769 static FLATTEN void
770 ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
771 uint32_t y0, uint32_t y1,
772 char *dst, const char *src,
773 int32_t dst_pitch,
774 uint32_t swizzle_bit,
775 isl_memcpy_type copy_type)
776 {
777 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
778
779 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
780 if (mem_copy == memcpy)
781 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
782 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
783 else if (mem_copy == rgba8_copy)
784 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
785 dst, src, dst_pitch, swizzle_bit,
786 rgba8_copy, rgba8_copy_aligned_src);
787 #if defined(INLINE_SSE41)
788 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
789 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
790 dst, src, dst_pitch, swizzle_bit,
791 memcpy, _memcpy_streaming_load);
792 #endif
793 else
794 unreachable("not reached");
795 } else {
796 if (mem_copy == memcpy)
797 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
798 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
799 else if (mem_copy == rgba8_copy)
800 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
801 dst, src, dst_pitch, swizzle_bit,
802 rgba8_copy, rgba8_copy_aligned_src);
803 #if defined(INLINE_SSE41)
804 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
805 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
806 dst, src, dst_pitch, swizzle_bit,
807 memcpy, _memcpy_streaming_load);
808 #endif
809 else
810 unreachable("not reached");
811 }
812 ytiled_to_linear(x0, x1, x2, x3, y0, y1,
813 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
814 }
815
816 /**
817 * Copy from linear to tiled texture.
818 *
819 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
820 * pieces that do not cross tile boundaries and copy each piece with a tile
821 * copy function (\ref tile_copy_fn).
822 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
823 * The Y range is in pixels (i.e. unitless).
824 * 'dst' is the address of (0, 0) in the destination tiled texture.
825 * 'src' is the address of (xt1, yt1) in the source linear texture.
826 */
827 static void
828 intel_linear_to_tiled(uint32_t xt1, uint32_t xt2,
829 uint32_t yt1, uint32_t yt2,
830 char *dst, const char *src,
831 uint32_t dst_pitch, int32_t src_pitch,
832 bool has_swizzling,
833 enum isl_tiling tiling,
834 isl_memcpy_type copy_type)
835 {
836 tile_copy_fn tile_copy;
837 uint32_t xt0, xt3;
838 uint32_t yt0, yt3;
839 uint32_t xt, yt;
840 uint32_t tw, th, span;
841 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
842
843 if (tiling == ISL_TILING_X) {
844 tw = xtile_width;
845 th = xtile_height;
846 span = xtile_span;
847 tile_copy = linear_to_xtiled_faster;
848 } else if (tiling == ISL_TILING_Y0) {
849 tw = ytile_width;
850 th = ytile_height;
851 span = ytile_span;
852 tile_copy = linear_to_ytiled_faster;
853 } else {
854 unreachable("unsupported tiling");
855 }
856
857 /* Round out to tile boundaries. */
858 xt0 = ALIGN_DOWN(xt1, tw);
859 xt3 = ALIGN_UP (xt2, tw);
860 yt0 = ALIGN_DOWN(yt1, th);
861 yt3 = ALIGN_UP (yt2, th);
862
863 /* Loop over all tiles to which we have something to copy.
864 * 'xt' and 'yt' are the origin of the destination tile, whether copying
865 * copying a full or partial tile.
866 * tile_copy() copies one tile or partial tile.
867 * Looping x inside y is the faster memory access pattern.
868 */
869 for (yt = yt0; yt < yt3; yt += th) {
870 for (xt = xt0; xt < xt3; xt += tw) {
871 /* The area to update is [x0,x3) x [y0,y1).
872 * May not want the whole tile, hence the min and max.
873 */
874 uint32_t x0 = MAX2(xt1, xt);
875 uint32_t y0 = MAX2(yt1, yt);
876 uint32_t x3 = MIN2(xt2, xt + tw);
877 uint32_t y1 = MIN2(yt2, yt + th);
878
879 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
880 * the middle interval is the longest span-aligned part.
881 * The sub-ranges could be empty.
882 */
883 uint32_t x1, x2;
884 x1 = ALIGN_UP(x0, span);
885 if (x1 > x3)
886 x1 = x2 = x3;
887 else
888 x2 = ALIGN_DOWN(x3, span);
889
890 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
891 assert(x1 - x0 < span && x3 - x2 < span);
892 assert(x3 - x0 <= tw);
893 assert((x2 - x1) % span == 0);
894
895 /* Translate by (xt,yt) for single-tile copier. */
896 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
897 y0-yt, y1-yt,
898 dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
899 src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
900 src_pitch,
901 swizzle_bit,
902 copy_type);
903 }
904 }
905 }
906
907 /**
908 * Copy from tiled to linear texture.
909 *
910 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
911 * pieces that do not cross tile boundaries and copy each piece with a tile
912 * copy function (\ref tile_copy_fn).
913 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
914 * The Y range is in pixels (i.e. unitless).
915 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
916 * 'src' is the address of (0, 0) in the source tiled texture.
917 */
918 static void
919 intel_tiled_to_linear(uint32_t xt1, uint32_t xt2,
920 uint32_t yt1, uint32_t yt2,
921 char *dst, const char *src,
922 int32_t dst_pitch, uint32_t src_pitch,
923 bool has_swizzling,
924 enum isl_tiling tiling,
925 isl_memcpy_type copy_type)
926 {
927 tile_copy_fn tile_copy;
928 uint32_t xt0, xt3;
929 uint32_t yt0, yt3;
930 uint32_t xt, yt;
931 uint32_t tw, th, span;
932 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
933
934 if (tiling == ISL_TILING_X) {
935 tw = xtile_width;
936 th = xtile_height;
937 span = xtile_span;
938 tile_copy = xtiled_to_linear_faster;
939 } else if (tiling == ISL_TILING_Y0) {
940 tw = ytile_width;
941 th = ytile_height;
942 span = ytile_span;
943 tile_copy = ytiled_to_linear_faster;
944 } else {
945 unreachable("unsupported tiling");
946 }
947
948 #if defined(INLINE_SSE41)
949 if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
950 /* The hidden cacheline sized register used by movntdqa can apparently
951 * give you stale data, so do an mfence to invalidate it.
952 */
953 _mm_mfence();
954 }
955 #endif
956
957 /* Round out to tile boundaries. */
958 xt0 = ALIGN_DOWN(xt1, tw);
959 xt3 = ALIGN_UP (xt2, tw);
960 yt0 = ALIGN_DOWN(yt1, th);
961 yt3 = ALIGN_UP (yt2, th);
962
963 /* Loop over all tiles to which we have something to copy.
964 * 'xt' and 'yt' are the origin of the destination tile, whether copying
965 * copying a full or partial tile.
966 * tile_copy() copies one tile or partial tile.
967 * Looping x inside y is the faster memory access pattern.
968 */
969 for (yt = yt0; yt < yt3; yt += th) {
970 for (xt = xt0; xt < xt3; xt += tw) {
971 /* The area to update is [x0,x3) x [y0,y1).
972 * May not want the whole tile, hence the min and max.
973 */
974 uint32_t x0 = MAX2(xt1, xt);
975 uint32_t y0 = MAX2(yt1, yt);
976 uint32_t x3 = MIN2(xt2, xt + tw);
977 uint32_t y1 = MIN2(yt2, yt + th);
978
979 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
980 * the middle interval is the longest span-aligned part.
981 * The sub-ranges could be empty.
982 */
983 uint32_t x1, x2;
984 x1 = ALIGN_UP(x0, span);
985 if (x1 > x3)
986 x1 = x2 = x3;
987 else
988 x2 = ALIGN_DOWN(x3, span);
989
990 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
991 assert(x1 - x0 < span && x3 - x2 < span);
992 assert(x3 - x0 <= tw);
993 assert((x2 - x1) % span == 0);
994
995 /* Translate by (xt,yt) for single-tile copier. */
996 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
997 y0-yt, y1-yt,
998 dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
999 src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
1000 dst_pitch,
1001 swizzle_bit,
1002 copy_type);
1003 }
1004 }
1005 }