src/mesa/drivers/dri/i965/intel_tiled_memcpy.c

   1 /*
   2  * Mesa 3-D graphics library
   3  *
   4  * Copyright 2012 Intel Corporation
   5  * Copyright 2013 Google
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sublicense, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  * Authors:
  28  *    Chad Versace <chad.versace@linux.intel.com>
  29  *    Frank Henigman <fjhenigman@google.com>
  30  */
  31
  32 #include <string.h>
  33
  34 #include "util/macros.h"
  35
  36 #include "brw_context.h"
  37 #include "intel_tiled_memcpy.h"
  38
  39 #if defined(__SSSE3__)
  40 #include <tmmintrin.h>
  41 #elif defined(__SSE2__)
  42 #include <emmintrin.h>
  43 #endif
  44
  45
/* Debug category for this file.  Nothing in this file references it directly;
 * presumably it is consumed by the driver's debug-print macros -- confirm
 * against brw_context.h.
 */
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Local aliases that make the rounding direction explicit at the call sites.
 * They simply forward to ROUND_DOWN_TO() and ALIGN(), which are defined
 * outside this file.
 */
#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)
  50
/* Tile dimensions.  Width and span are in bytes, height is in rows of pixels
 * (i.e. unitless).  A "span" is the largest number of bytes we can copy
 * between linear and tiled memory without needing to calculate a new tiled
 * address.
 *
 * Both layouts come to the same tile size: 512 * 8 == 128 * 32 == 4096 bytes.
 */
static const uint32_t xtile_width = 512;   /* bytes per tile row */
static const uint32_t xtile_height = 8;    /* rows per tile */
static const uint32_t xtile_span = 64;     /* contiguous bytes within a row */
static const uint32_t ytile_width = 128;   /* bytes per tile row */
static const uint32_t ytile_height = 32;   /* rows per tile */
static const uint32_t ytile_span = 16;     /* contiguous bytes within a row */
  61
  62 #if defined(__SSSE3__)
  63 static const uint8_t rgba8_permutation[16] =
  64    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
  65
  66 /* NOTE: dst must be 16-byte aligned. src may be unaligned. */
  67 static inline void
  68 rgba8_copy_16_aligned_dst(void *dst, const void *src)
  69 {
  70    _mm_store_si128((__m128i *)(dst),
  71                    _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)),
  72                                     *(__m128i *)rgba8_permutation));
  73 }
  74
  75 /* NOTE: src must be 16-byte aligned. dst may be unaligned. */
  76 static inline void
  77 rgba8_copy_16_aligned_src(void *dst, const void *src)
  78 {
  79    _mm_storeu_si128((__m128i *)(dst),
  80                     _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)),
  81                                      *(__m128i *)rgba8_permutation));
  82 }
  83
  84 #elif defined(__SSE2__)
  85 static inline void
  86 rgba8_copy_16_aligned_dst(void *dst, const void *src)
  87 {
  88    __m128i srcreg, dstreg, agmask, ag, rb, br;
  89
  90    agmask = _mm_set1_epi32(0xFF00FF00);
  91    srcreg = _mm_loadu_si128((__m128i *)src);
  92
  93    rb = _mm_andnot_si128(agmask, srcreg);
  94    ag = _mm_and_si128(agmask, srcreg);
  95    br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
  96                             _MM_SHUFFLE(2, 3, 0, 1));
  97    dstreg = _mm_or_si128(ag, br);
  98
  99    _mm_store_si128((__m128i *)dst, dstreg);
 100 }
 101
 102 static inline void
 103 rgba8_copy_16_aligned_src(void *dst, const void *src)
 104 {
 105    __m128i srcreg, dstreg, agmask, ag, rb, br;
 106
 107    agmask = _mm_set1_epi32(0xFF00FF00);
 108    srcreg = _mm_load_si128((__m128i *)src);
 109
 110    rb = _mm_andnot_si128(agmask, srcreg);
 111    ag = _mm_and_si128(agmask, srcreg);
 112    br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
 113                             _MM_SHUFFLE(2, 3, 0, 1));
 114    dstreg = _mm_or_si128(ag, br);
 115
 116    _mm_storeu_si128((__m128i *)dst, dstreg);
 117 }
 118 #endif
 119
 120
 121 /**
 122  * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 123  */
 124 static inline void *
 125 rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
 126 {
 127    uint8_t *d = dst;
 128    uint8_t const *s = src;
 129
 130 #if defined(__SSSE3__) || defined(__SSE2__)
 131    if (bytes == 16) {
 132       assert(!(((uintptr_t)dst) & 0xf));
 133       rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
 134       return dst;
 135    }
 136
 137    if (bytes == 64) {
 138       assert(!(((uintptr_t)dst) & 0xf));
 139       rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
 140       rgba8_copy_16_aligned_dst(d+16, s+16);
 141       rgba8_copy_16_aligned_dst(d+32, s+32);
 142       rgba8_copy_16_aligned_dst(d+48, s+48);
 143       return dst;
 144    }
 145 #endif
 146
 147    while (bytes >= 4) {
 148       d[0] = s[2];
 149       d[1] = s[1];
 150       d[2] = s[0];
 151       d[3] = s[3];
 152       d += 4;
 153       s += 4;
 154       bytes -= 4;
 155    }
 156    return dst;
 157 }
 158
 159 /**
 160  * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 161  */
 162 static inline void *
 163 rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
 164 {
 165    uint8_t *d = dst;
 166    uint8_t const *s = src;
 167
 168 #if defined(__SSSE3__) || defined(__SSE2__)
 169    if (bytes == 16) {
 170       assert(!(((uintptr_t)src) & 0xf));
 171       rgba8_copy_16_aligned_src(d+ 0, s+ 0);
 172       return dst;
 173    }
 174
 175    if (bytes == 64) {
 176       assert(!(((uintptr_t)src) & 0xf));
 177       rgba8_copy_16_aligned_src(d+ 0, s+ 0);
 178       rgba8_copy_16_aligned_src(d+16, s+16);
 179       rgba8_copy_16_aligned_src(d+32, s+32);
 180       rgba8_copy_16_aligned_src(d+48, s+48);
 181       return dst;
 182    }
 183 #endif
 184
 185    while (bytes >= 4) {
 186       d[0] = s[2];
 187       d[1] = s[1];
 188       d[2] = s[0];
 189       d[3] = s[3];
 190       d += 4;
 191       s += 4;
 192       bytes -= 4;
 193    }
 194    return dst;
 195 }
 196
/**
 * Signature shared by the single-tile copy routines in this file.
 *
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty.  The region copied must land entirely within one tile.
 *
 * One of 'dst' and 'src' is the start of the tile and the other is the
 * corresponding address in the linear image; which is which depends on the
 * direction of the implementing function (linear_to_* write to a tile,
 * *_to_linear read from one).  Copying begins at (x0, y0).
 * 'linear_pitch' is the row stride, in bytes, of the linear image.
 *
 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 in the offset on the tiled side of the copy, when
 * certain other bits are set in it.
 *
 * 'mem_copy' performs each contiguous sub-copy.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             mem_copy_fn mem_copy);
 215
 216 /**
 217  * Copy texture data from linear to X tile layout.
 218  *
 219  * \copydoc tile_copy_fn
 220  */
 221 static inline void
 222 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 223                  uint32_t y0, uint32_t y1,
 224                  char *dst, const char *src,
 225                  int32_t src_pitch,
 226                  uint32_t swizzle_bit,
 227                  mem_copy_fn mem_copy)
 228 {
 229    /* The copy destination offset for each range copied is the sum of
 230     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
 231     */
 232    uint32_t xo, yo;
 233
 234    src += (ptrdiff_t)y0 * src_pitch;
 235
 236    for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
 237       /* Bits 9 and 10 of the copy destination offset control swizzling.
 238        * Only 'yo' contributes to those bits in the total offset,
 239        * so calculate 'swizzle' just once per row.
 240        * Move bits 9 and 10 three and four places respectively down
 241        * to bit 6 and xor them.
 242        */
 243       uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
 244
 245       mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
 246
 247       for (xo = x1; xo < x2; xo += xtile_span) {
 248          mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
 249       }
 250
 251       mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
 252
 253       src += src_pitch;
 254    }
 255 }
 256
 257 /**
 258  * Copy texture data from linear to Y tile layout.
 259  *
 260  * \copydoc tile_copy_fn
 261  */
 262 static inline void
 263 linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 264                  uint32_t y0, uint32_t y1,
 265                  char *dst, const char *src,
 266                  int32_t src_pitch,
 267                  uint32_t swizzle_bit,
 268                  mem_copy_fn mem_copy)
 269 {
 270    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
 271     * as the tile).  Thus the destination offset for (x,y) is the sum of:
 272     *   (x % column_width)                    // position within column
 273     *   (x / column_width) * bytes_per_column // column number * bytes per column
 274     *   y * column_width
 275     *
 276     * The copy destination offset for each range copied is the sum of
 277     * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
 278     */
 279    const uint32_t column_width = ytile_span;
 280    const uint32_t bytes_per_column = column_width * ytile_height;
 281
 282    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
 283    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
 284
 285    /* Bit 9 of the destination offset control swizzling.
 286     * Only the X offset contributes to bit 9 of the total offset,
 287     * so swizzle can be calculated in advance for these X positions.
 288     * Move bit 9 three places down to bit 6.
 289     */
 290    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
 291    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
 292
 293    uint32_t x, yo;
 294
 295    src += (ptrdiff_t)y0 * src_pitch;
 296
 297    for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
 298       uint32_t xo = xo1;
 299       uint32_t swizzle = swizzle1;
 300
 301       mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
 302
 303       /* Step by spans/columns.  As it happens, the swizzle bit flips
 304        * at each step so we don't need to calculate it explicitly.
 305        */
 306       for (x = x1; x < x2; x += ytile_span) {
 307          mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
 308          xo += bytes_per_column;
 309          swizzle ^= swizzle_bit;
 310       }
 311
 312       mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
 313
 314       src += src_pitch;
 315    }
 316 }
 317
 318 /**
 319  * Copy texture data from X tile layout to linear.
 320  *
 321  * \copydoc tile_copy_fn
 322  */
 323 static inline void
 324 xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 325                  uint32_t y0, uint32_t y1,
 326                  char *dst, const char *src,
 327                  int32_t dst_pitch,
 328                  uint32_t swizzle_bit,
 329                  mem_copy_fn mem_copy)
 330 {
 331    /* The copy destination offset for each range copied is the sum of
 332     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
 333     */
 334    uint32_t xo, yo;
 335
 336    dst += (ptrdiff_t)y0 * dst_pitch;
 337
 338    for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
 339       /* Bits 9 and 10 of the copy destination offset control swizzling.
 340        * Only 'yo' contributes to those bits in the total offset,
 341        * so calculate 'swizzle' just once per row.
 342        * Move bits 9 and 10 three and four places respectively down
 343        * to bit 6 and xor them.
 344        */
 345       uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
 346
 347       mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
 348
 349       for (xo = x1; xo < x2; xo += xtile_span) {
 350          mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
 351       }
 352
 353       mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
 354
 355       dst += dst_pitch;
 356    }
 357 }
 358
 359  /**
 360  * Copy texture data from Y tile layout to linear.
 361  *
 362  * \copydoc tile_copy_fn
 363  */
 364 static inline void
 365 ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 366                  uint32_t y0, uint32_t y1,
 367                  char *dst, const char *src,
 368                  int32_t dst_pitch,
 369                  uint32_t swizzle_bit,
 370                  mem_copy_fn mem_copy)
 371 {
 372    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
 373     * as the tile).  Thus the destination offset for (x,y) is the sum of:
 374     *   (x % column_width)                    // position within column
 375     *   (x / column_width) * bytes_per_column // column number * bytes per column
 376     *   y * column_width
 377     *
 378     * The copy destination offset for each range copied is the sum of
 379     * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
 380     */
 381    const uint32_t column_width = ytile_span;
 382    const uint32_t bytes_per_column = column_width * ytile_height;
 383
 384    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
 385    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
 386
 387    /* Bit 9 of the destination offset control swizzling.
 388     * Only the X offset contributes to bit 9 of the total offset,
 389     * so swizzle can be calculated in advance for these X positions.
 390     * Move bit 9 three places down to bit 6.
 391     */
 392    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
 393    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
 394
 395    uint32_t x, yo;
 396
 397    dst += (ptrdiff_t)y0 * dst_pitch;
 398
 399    for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
 400       uint32_t xo = xo1;
 401       uint32_t swizzle = swizzle1;
 402
 403       mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
 404
 405       /* Step by spans/columns.  As it happens, the swizzle bit flips
 406        * at each step so we don't need to calculate it explicitly.
 407        */
 408       for (x = x1; x < x2; x += ytile_span) {
 409          mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
 410          xo += bytes_per_column;
 411          swizzle ^= swizzle_bit;
 412       }
 413
 414       mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
 415
 416       dst += dst_pitch;
 417    }
 418 }
 419
 420
 421 /**
 422  * Copy texture data from linear to X tile layout, faster.
 423  *
 424  * Same as \ref linear_to_xtiled but faster, because it passes constant
 425  * parameters for common cases, allowing the compiler to inline code
 426  * optimized for those cases.
 427  *
 428  * \copydoc tile_copy_fn
 429  */
 430 static FLATTEN void
 431 linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 432                         uint32_t y0, uint32_t y1,
 433                         char *dst, const char *src,
 434                         int32_t src_pitch,
 435                         uint32_t swizzle_bit,
 436                         mem_copy_fn mem_copy)
 437 {
 438    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
 439       if (mem_copy == memcpy)
 440          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
 441                                  dst, src, src_pitch, swizzle_bit, memcpy);
 442       else if (mem_copy == rgba8_copy_aligned_dst)
 443          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
 444                                  dst, src, src_pitch, swizzle_bit,
 445                                  rgba8_copy_aligned_dst);
 446       else
 447          unreachable("not reached");
 448    } else {
 449       if (mem_copy == memcpy)
 450          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
 451                                  dst, src, src_pitch, swizzle_bit, memcpy);
 452       else if (mem_copy == rgba8_copy_aligned_dst)
 453          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
 454                                  dst, src, src_pitch, swizzle_bit,
 455                                  rgba8_copy_aligned_dst);
 456       else
 457          unreachable("not reached");
 458    }
 459    linear_to_xtiled(x0, x1, x2, x3, y0, y1,
 460                     dst, src, src_pitch, swizzle_bit, mem_copy);
 461 }
 462
 463 /**
 464  * Copy texture data from linear to Y tile layout, faster.
 465  *
 466  * Same as \ref linear_to_ytiled but faster, because it passes constant
 467  * parameters for common cases, allowing the compiler to inline code
 468  * optimized for those cases.
 469  *
 470  * \copydoc tile_copy_fn
 471  */
 472 static FLATTEN void
 473 linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 474                         uint32_t y0, uint32_t y1,
 475                         char *dst, const char *src,
 476                         int32_t src_pitch,
 477                         uint32_t swizzle_bit,
 478                         mem_copy_fn mem_copy)
 479 {
 480    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
 481       if (mem_copy == memcpy)
 482          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
 483                                  dst, src, src_pitch, swizzle_bit, memcpy);
 484       else if (mem_copy == rgba8_copy_aligned_dst)
 485          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
 486                                  dst, src, src_pitch, swizzle_bit,
 487                                  rgba8_copy_aligned_dst);
 488       else
 489          unreachable("not reached");
 490    } else {
 491       if (mem_copy == memcpy)
 492          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
 493                                  dst, src, src_pitch, swizzle_bit, memcpy);
 494       else if (mem_copy == rgba8_copy_aligned_dst)
 495          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
 496                                  dst, src, src_pitch, swizzle_bit,
 497                                  rgba8_copy_aligned_dst);
 498       else
 499          unreachable("not reached");
 500    }
 501    linear_to_ytiled(x0, x1, x2, x3, y0, y1,
 502                     dst, src, src_pitch, swizzle_bit, mem_copy);
 503 }
 504
 505 /**
 506  * Copy texture data from X tile layout to linear, faster.
 507  *
 508  * Same as \ref xtile_to_linear but faster, because it passes constant
 509  * parameters for common cases, allowing the compiler to inline code
 510  * optimized for those cases.
 511  *
 512  * \copydoc tile_copy_fn
 513  */
 514 static FLATTEN void
 515 xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 516                         uint32_t y0, uint32_t y1,
 517                         char *dst, const char *src,
 518                         int32_t dst_pitch,
 519                         uint32_t swizzle_bit,
 520                         mem_copy_fn mem_copy)
 521 {
 522    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
 523       if (mem_copy == memcpy)
 524          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
 525                                  dst, src, dst_pitch, swizzle_bit, memcpy);
 526       else if (mem_copy == rgba8_copy_aligned_src)
 527          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
 528                                  dst, src, dst_pitch, swizzle_bit,
 529                                  rgba8_copy_aligned_src);
 530       else
 531          unreachable("not reached");
 532    } else {
 533       if (mem_copy == memcpy)
 534          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
 535                                  dst, src, dst_pitch, swizzle_bit, memcpy);
 536       else if (mem_copy == rgba8_copy_aligned_src)
 537          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
 538                                  dst, src, dst_pitch, swizzle_bit,
 539                                  rgba8_copy_aligned_src);
 540       else
 541          unreachable("not reached");
 542    }
 543    xtiled_to_linear(x0, x1, x2, x3, y0, y1,
 544                     dst, src, dst_pitch, swizzle_bit, mem_copy);
 545 }
 546
 547 /**
 548  * Copy texture data from Y tile layout to linear, faster.
 549  *
 550  * Same as \ref ytile_to_linear but faster, because it passes constant
 551  * parameters for common cases, allowing the compiler to inline code
 552  * optimized for those cases.
 553  *
 554  * \copydoc tile_copy_fn
 555  */
 556 static FLATTEN void
 557 ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 558                         uint32_t y0, uint32_t y1,
 559                         char *dst, const char *src,
 560                         int32_t dst_pitch,
 561                         uint32_t swizzle_bit,
 562                         mem_copy_fn mem_copy)
 563 {
 564    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
 565       if (mem_copy == memcpy)
 566          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
 567                                  dst, src, dst_pitch, swizzle_bit, memcpy);
 568       else if (mem_copy == rgba8_copy_aligned_src)
 569          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
 570                                  dst, src, dst_pitch, swizzle_bit,
 571                                  rgba8_copy_aligned_src);
 572       else
 573          unreachable("not reached");
 574    } else {
 575       if (mem_copy == memcpy)
 576          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
 577                                  dst, src, dst_pitch, swizzle_bit, memcpy);
 578       else if (mem_copy == rgba8_copy_aligned_src)
 579          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
 580                                  dst, src, dst_pitch, swizzle_bit,
 581                                  rgba8_copy_aligned_src);
 582       else
 583          unreachable("not reached");
 584    }
 585    ytiled_to_linear(x0, x1, x2, x3, y0, y1,
 586                     dst, src, dst_pitch, swizzle_bit, mem_copy);
 587 }
 588
 589 /**
 590  * Copy from linear to tiled texture.
 591  *
 592  * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 593  * pieces that do not cross tile boundaries and copy each piece with a tile
 594  * copy function (\ref tile_copy_fn).
 595  * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 596  * The Y range is in pixels (i.e. unitless).
 597  * 'dst' is the start of the texture and 'src' is the corresponding
 598  * address to copy from, though copying begins at (xt1, yt1).
 599  */
 600 void
 601 linear_to_tiled(uint32_t xt1, uint32_t xt2,
 602                 uint32_t yt1, uint32_t yt2,
 603                 char *dst, const char *src,
 604                 uint32_t dst_pitch, int32_t src_pitch,
 605                 bool has_swizzling,
 606                 uint32_t tiling,
 607                 mem_copy_fn mem_copy)
 608 {
 609    tile_copy_fn tile_copy;
 610    uint32_t xt0, xt3;
 611    uint32_t yt0, yt3;
 612    uint32_t xt, yt;
 613    uint32_t tw, th, span;
 614    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
 615
 616    if (tiling == I915_TILING_X) {
 617       tw = xtile_width;
 618       th = xtile_height;
 619       span = xtile_span;
 620       tile_copy = linear_to_xtiled_faster;
 621    } else if (tiling == I915_TILING_Y) {
 622       tw = ytile_width;
 623       th = ytile_height;
 624       span = ytile_span;
 625       tile_copy = linear_to_ytiled_faster;
 626    } else {
 627       unreachable("unsupported tiling");
 628    }
 629
 630    /* Round out to tile boundaries. */
 631    xt0 = ALIGN_DOWN(xt1, tw);
 632    xt3 = ALIGN_UP  (xt2, tw);
 633    yt0 = ALIGN_DOWN(yt1, th);
 634    yt3 = ALIGN_UP  (yt2, th);
 635
 636    /* Loop over all tiles to which we have something to copy.
 637     * 'xt' and 'yt' are the origin of the destination tile, whether copying
 638     * copying a full or partial tile.
 639     * tile_copy() copies one tile or partial tile.
 640     * Looping x inside y is the faster memory access pattern.
 641     */
 642    for (yt = yt0; yt < yt3; yt += th) {
 643       for (xt = xt0; xt < xt3; xt += tw) {
 644          /* The area to update is [x0,x3) x [y0,y1).
 645           * May not want the whole tile, hence the min and max.
 646           */
 647          uint32_t x0 = MAX2(xt1, xt);
 648          uint32_t y0 = MAX2(yt1, yt);
 649          uint32_t x3 = MIN2(xt2, xt + tw);
 650          uint32_t y1 = MIN2(yt2, yt + th);
 651
 652          /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
 653           * the middle interval is the longest span-aligned part.
 654           * The sub-ranges could be empty.
 655           */
 656          uint32_t x1, x2;
 657          x1 = ALIGN_UP(x0, span);
 658          if (x1 > x3)
 659             x1 = x2 = x3;
 660          else
 661             x2 = ALIGN_DOWN(x3, span);
 662
 663          assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
 664          assert(x1 - x0 < span && x3 - x2 < span);
 665          assert(x3 - x0 <= tw);
 666          assert((x2 - x1) % span == 0);
 667
 668          /* Translate by (xt,yt) for single-tile copier. */
 669          tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
 670                    y0-yt, y1-yt,
 671                    dst + (ptrdiff_t) xt * th + (ptrdiff_t) yt * dst_pitch,
 672                    src + (ptrdiff_t) xt      + (ptrdiff_t) yt * src_pitch,
 673                    src_pitch,
 674                    swizzle_bit,
 675                    mem_copy);
 676       }
 677    }
 678 }
 679
 680 /**
 681  * Copy from tiled to linear texture.
 682  *
 683  * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 684  * pieces that do not cross tile boundaries and copy each piece with a tile
 685  * copy function (\ref tile_copy_fn).
 686  * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 687  * The Y range is in pixels (i.e. unitless).
 688  * 'dst' is the start of the texture and 'src' is the corresponding
 689  * address to copy from, though copying begins at (xt1, yt1).
 690  */
 691 void
 692 tiled_to_linear(uint32_t xt1, uint32_t xt2,
 693                 uint32_t yt1, uint32_t yt2,
 694                 char *dst, const char *src,
 695                 int32_t dst_pitch, uint32_t src_pitch,
 696                 bool has_swizzling,
 697                 uint32_t tiling,
 698                 mem_copy_fn mem_copy)
 699 {
 700    tile_copy_fn tile_copy;
 701    uint32_t xt0, xt3;
 702    uint32_t yt0, yt3;
 703    uint32_t xt, yt;
 704    uint32_t tw, th, span;
 705    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
 706
 707    if (tiling == I915_TILING_X) {
 708       tw = xtile_width;
 709       th = xtile_height;
 710       span = xtile_span;
 711       tile_copy = xtiled_to_linear_faster;
 712    } else if (tiling == I915_TILING_Y) {
 713       tw = ytile_width;
 714       th = ytile_height;
 715       span = ytile_span;
 716       tile_copy = ytiled_to_linear_faster;
 717    } else {
 718       unreachable("unsupported tiling");
 719    }
 720
 721    /* Round out to tile boundaries. */
 722    xt0 = ALIGN_DOWN(xt1, tw);
 723    xt3 = ALIGN_UP  (xt2, tw);
 724    yt0 = ALIGN_DOWN(yt1, th);
 725    yt3 = ALIGN_UP  (yt2, th);
 726
 727    /* Loop over all tiles to which we have something to copy.
 728     * 'xt' and 'yt' are the origin of the destination tile, whether copying
 729     * copying a full or partial tile.
 730     * tile_copy() copies one tile or partial tile.
 731     * Looping x inside y is the faster memory access pattern.
 732     */
 733    for (yt = yt0; yt < yt3; yt += th) {
 734       for (xt = xt0; xt < xt3; xt += tw) {
 735          /* The area to update is [x0,x3) x [y0,y1).
 736           * May not want the whole tile, hence the min and max.
 737           */
 738          uint32_t x0 = MAX2(xt1, xt);
 739          uint32_t y0 = MAX2(yt1, yt);
 740          uint32_t x3 = MIN2(xt2, xt + tw);
 741          uint32_t y1 = MIN2(yt2, yt + th);
 742
 743          /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
 744           * the middle interval is the longest span-aligned part.
 745           * The sub-ranges could be empty.
 746           */
 747          uint32_t x1, x2;
 748          x1 = ALIGN_UP(x0, span);
 749          if (x1 > x3)
 750             x1 = x2 = x3;
 751          else
 752             x2 = ALIGN_DOWN(x3, span);
 753
 754          assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
 755          assert(x1 - x0 < span && x3 - x2 < span);
 756          assert(x3 - x0 <= tw);
 757          assert((x2 - x1) % span == 0);
 758
 759          /* Translate by (xt,yt) for single-tile copier. */
 760          tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
 761                    y0-yt, y1-yt,
 762                    dst + (ptrdiff_t) xt      + (ptrdiff_t) yt * dst_pitch,
 763                    src + (ptrdiff_t) xt * th + (ptrdiff_t) yt * src_pitch,
 764                    dst_pitch,
 765                    swizzle_bit,
 766                    mem_copy);
 767       }
 768    }
 769 }
 770
 771
 772 /**
 773  * Determine which copy function to use for the given format combination
 774  *
 775  * The only two possible copy functions which are ever returned are a
 776  * direct memcpy and a RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
 777  * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
 778  * symmetric), it doesn't matter whether the copy is from the tiled image
 779  * to the untiled or vice versa.  The copy function required is the same in
 780  * either case so this function can be used.
 781  *
 782  * \param[in]  tiledFormat The format of the tiled image
 783  * \param[in]  format      The GL format of the client data
 784  * \param[in]  type        The GL type of the client data
 785  * \param[out] mem_copy    Will be set to one of either the standard
 786  *                         library's memcpy or a different copy function
 787  *                         that performs an RGBA to BGRA conversion
 788  * \param[out] cpp         Number of bytes per channel
 789  *
 790  * \return true if the format and type combination are valid
 791  */
 792 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
 793                       GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
 794                       enum intel_memcpy_direction direction)
 795 {
 796    if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
 797        !(format == GL_RGBA || format == GL_BGRA))
 798       return false; /* Invalid type/format combination */
 799
 800    if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
 801        (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
 802       *cpp = 1;
 803       *mem_copy = memcpy;
 804    } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
 805               (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM)) {
 806       *cpp = 4;
 807       if (format == GL_BGRA) {
 808          *mem_copy = memcpy;
 809       } else if (format == GL_RGBA) {
 810          *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
 811                                                : rgba8_copy_aligned_src;
 812       }
 813    } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
 814               (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
 815       *cpp = 4;
 816       if (format == GL_BGRA) {
 817          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
 818           * use the same function.
 819           */
 820          *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
 821                                                : rgba8_copy_aligned_src;
 822       } else if (format == GL_RGBA) {
 823          *mem_copy = memcpy;
 824       }
 825    }
 826
 827    if (!(*mem_copy))
 828       return false;
 829
 830    return true;
 831 }