src/gallium/drivers/llvmpipe/lp_tile_soa.py

   1 #!/usr/bin/env python
   2
   3 '''
   4 /**************************************************************************
   5  *
   6  * Copyright 2009 VMware, Inc.
   7  * All Rights Reserved.
   8  *
   9  * Permission is hereby granted, free of charge, to any person obtaining a
  10  * copy of this software and associated documentation files (the
  11  * "Software"), to deal in the Software without restriction, including
  12  * without limitation the rights to use, copy, modify, merge, publish,
  13  * distribute, sub license, and/or sell copies of the Software, and to
  14  * permit persons to whom the Software is furnished to do so, subject to
  15  * the following conditions:
  16  *
  17  * The above copyright notice and this permission notice (including the
  18  * next paragraph) shall be included in all copies or substantial portions
  19  * of the Software.
  20  *
  21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  23  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  24  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  25  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  26  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  27  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  28  *
  29  **************************************************************************/
  30
  31 /**
  32  * @file
  33  * Pixel format accessor functions.
  34  *
  35  * @author Jose Fonseca <jfonseca@vmware.com>
  36  */
  37 '''
  38
  39
  40 import sys
  41 import os.path
  42
  43 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '../../auxiliary/util'))
  44
  45 from u_format_pack import *
  46
  47
  48 def is_format_supported(format):
  49     '''Determines whether we actually have the plumbing necessary to generate the
  50     to read/write to/from this format.'''
  51
  52     # FIXME: Ideally we would support any format combination here.
  53
  54     if format.layout != PLAIN:
  55         return False
  56
  57     for i in range(4):
  58         channel = format.channels[i]
  59         if channel.type not in (VOID, UNSIGNED, SIGNED, FLOAT):
  60             return False
  61         if channel.type == FLOAT and channel.size not in (16, 32 ,64):
  62             return False
  63
  64     if format.colorspace not in ('rgb', 'srgb'):
  65         return False
  66
  67     return True
  68
  69
  70 def generate_format_read(format, dst_channel, dst_native_type, dst_suffix):
  71     '''Generate the function to read pixels from a particular format'''
  72
  73     name = format.short_name()
  74
  75     src_native_type = native_type(format)
  76
  77     print 'static void'
  78     print 'lp_tile_%s_swizzle_%s(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)' % (name, dst_suffix, dst_native_type)
  79     print '{'
  80     print '   unsigned x, y;'
  81     print '   const uint8_t *src_row = src + y0*src_stride;'
  82     print '   for (y = 0; y < TILE_SIZE; ++y) {'
  83     print '      const %s *src_pixel = (const %s *)(src_row + x0*%u);' % (src_native_type, src_native_type, format.stride())
  84     print '      for (x = 0; x < TILE_SIZE; ++x) {'
  85
  86     names = ['']*4
  87     if format.colorspace in ('rgb', 'srgb'):
  88         for i in range(4):
  89             swizzle = format.swizzles[i]
  90             if swizzle < 4:
  91                 names[swizzle] += 'rgba'[i]
  92     elif format.colorspace == 'zs':
  93         swizzle = format.swizzles[0]
  94         if swizzle < 4:
  95             names[swizzle] = 'z'
  96         else:
  97             assert False
  98     else:
  99         assert False
 100
 101     if format.layout == PLAIN:
 102         if not format.is_array():
 103             print '         %s pixel = *src_pixel++;' % src_native_type
 104             shift = 0;
 105             for i in range(4):
 106                 src_channel = format.channels[i]
 107                 width = src_channel.size
 108                 if names[i]:
 109                     value = 'pixel'
 110                     mask = (1 << width) - 1
 111                     if shift:
 112                         value = '(%s >> %u)' % (value, shift)
 113                     if shift + width < format.block_size():
 114                         value = '(%s & 0x%x)' % (value, mask)
 115                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 116                     print '         %s %s = %s;' % (dst_native_type, names[i], value)
 117                 shift += width
 118         else:
 119             for i in range(4):
 120                 if names[i]:
 121                     print '         %s %s;' % (dst_native_type, names[i])
 122             for i in range(4):
 123                 src_channel = format.channels[i]
 124                 if names[i]:
 125                     value = '(*src_pixel++)'
 126                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 127                     print '         %s = %s;' % (names[i], value)
 128                 elif src_channel.size:
 129                     print '         ++src_pixel;'
 130     else:
 131         assert False
 132
 133     for i in range(4):
 134         if format.colorspace in ('rgb', 'srgb'):
 135             swizzle = format.swizzles[i]
 136             if swizzle < 4:
 137                 value = names[swizzle]
 138             elif swizzle == SWIZZLE_0:
 139                 value = '0'
 140             elif swizzle == SWIZZLE_1:
 141                 value = get_one(dst_channel)
 142             else:
 143                 assert False
 144         elif format.colorspace == 'zs':
 145             if i < 3:
 146                 value = 'z'
 147             else:
 148                 value = get_one(dst_channel)
 149         else:
 150             assert False
 151         print '         TILE_PIXEL(dst, x, y, %u) = %s; /* %s */' % (i, value, 'rgba'[i])
 152
 153     print '      }'
 154     print '      src_row += src_stride;'
 155     print '   }'
 156     print '}'
 157     print
 158
 159
 160 def pack_rgba(format, src_channel, r, g, b, a):
 161     """Return an expression for packing r, g, b, a into a pixel of the
 162     given format.  Ex: '(b << 24) | (g << 16) | (r << 8) | (a << 0)'
 163     """
 164     assert format.colorspace in ('rgb', 'srgb')
 165     inv_swizzle = format.inv_swizzles()
 166     shift = 0
 167     expr = None
 168     for i in range(4):
 169         # choose r, g, b, or a depending on the inverse swizzle term
 170         if inv_swizzle[i] == 0:
 171             value = r
 172         elif inv_swizzle[i] == 1:
 173             value = g
 174         elif inv_swizzle[i] == 2:
 175             value = b
 176         elif inv_swizzle[i] == 3:
 177             value = a
 178         else:
 179             value = None
 180
 181         if value:
 182             dst_channel = format.channels[i]
 183             dst_native_type = native_type(format)
 184             value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 185             term = "((%s) << %d)" % (value, shift)
 186             if expr:
 187                 expr = expr + " | " + term
 188             else:
 189                 expr = term
 190
 191         width = format.channels[i].size
 192         shift = shift + width
 193     return expr
 194
 195
 196 def emit_unrolled_unswizzle_code(format, src_channel):
 197     '''Emit code for writing a block based on unrolled loops.
 198     This is considerably faster than the TILE_PIXEL-based code below.
 199     '''
 200     dst_native_type = 'uint%u_t' % format.block_size()
 201     print '   const unsigned dstpix_stride = dst_stride / %d;' % format.stride()
 202     print '   %s *dstpix = (%s *) dst;' % (dst_native_type, dst_native_type)
 203     print '   unsigned int qx, qy, i;'
 204     print
 205     print '   for (qy = 0; qy < TILE_SIZE; qy += TILE_VECTOR_HEIGHT) {'
 206     print '      const unsigned py = y0 + qy;'
 207     print '      for (qx = 0; qx < TILE_SIZE; qx += TILE_VECTOR_WIDTH) {'
 208     print '         const unsigned px = x0 + qx;'
 209     print '         const uint8_t *r = src + 0 * TILE_C_STRIDE;'
 210     print '         const uint8_t *g = src + 1 * TILE_C_STRIDE;'
 211     print '         const uint8_t *b = src + 2 * TILE_C_STRIDE;'
 212     print '         const uint8_t *a = src + 3 * TILE_C_STRIDE;'
 213     print '         (void) r; (void) g; (void) b; (void) a; /* silence warnings */'
 214     print '         for (i = 0; i < TILE_C_STRIDE; i += 2) {'
 215     print '            const uint32_t pixel0 = %s;' % pack_rgba(format, src_channel, "r[i+0]", "g[i+0]", "b[i+0]", "a[i+0]")
 216     print '            const uint32_t pixel1 = %s;' % pack_rgba(format, src_channel, "r[i+1]", "g[i+1]", "b[i+1]", "a[i+1]")
 217     print '            const unsigned offset = (py + tile_y_offset[i]) * dstpix_stride + (px + tile_x_offset[i]);'
 218     print '            dstpix[offset + 0] = pixel0;'
 219     print '            dstpix[offset + 1] = pixel1;'
 220     print '         }'
 221     print '         src += TILE_X_STRIDE;'
 222     print '      }'
 223     print '   }'
 224
 225
 226 def emit_tile_pixel_unswizzle_code(format, src_channel):
 227     '''Emit code for writing a block based on the TILE_PIXEL macro.'''
 228     dst_native_type = native_type(format)
 229
 230     inv_swizzle = format.inv_swizzles()
 231
 232     print '   unsigned x, y;'
 233     print '   uint8_t *dst_row = dst + y0*dst_stride;'
 234     print '   for (y = 0; y < TILE_SIZE; ++y) {'
 235     print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
 236     print '      for (x = 0; x < TILE_SIZE; ++x) {'
 237
 238     if format.layout == PLAIN:
 239         if not format.is_array():
 240             print '         %s pixel = 0;' % dst_native_type
 241             shift = 0;
 242             for i in range(4):
 243                 dst_channel = format.channels[i]
 244                 width = dst_channel.size
 245                 if inv_swizzle[i] is not None:
 246                     value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
 247                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 248                     if shift:
 249                         value = '(%s << %u)' % (value, shift)
 250                     print '         pixel |= %s;' % value
 251                 shift += width
 252             print '         *dst_pixel++ = pixel;'
 253         else:
 254             for i in range(4):
 255                 dst_channel = format.channels[i]
 256                 if inv_swizzle[i] is not None:
 257                     value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
 258                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 259                     print '         *dst_pixel++ = %s;' % value
 260                 elif dst_channel.size:
 261                     print '         ++dst_pixel;'
 262     else:
 263         assert False
 264
 265     print '      }'
 266     print '      dst_row += dst_stride;'
 267     print '   }'
 268
 269
 270 def generate_format_write(format, src_channel, src_native_type, src_suffix):
 271     '''Generate the function to write pixels to a particular format'''
 272
 273     name = format.short_name()
 274
 275     print 'static void'
 276     print 'lp_tile_%s_unswizzle_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)' % (name, src_suffix, src_native_type)
 277     print '{'
 278     if format.layout == PLAIN \
 279         and format.colorspace == 'rgb' \
 280         and format.block_size() <= 32 \
 281         and format.is_pot() \
 282         and not format.is_mixed() \
 283         and (format.channels[0].type == UNSIGNED \
 284              or format.channels[1].type == UNSIGNED):
 285         emit_unrolled_unswizzle_code(format, src_channel)
 286     else:
 287         emit_tile_pixel_unswizzle_code(format, src_channel)
 288     print '}'
 289     print
 290
 291
 292 def generate_ssse3():
 293     print '''
 294 #if defined(PIPE_ARCH_SSE)
 295
 296
 297 #if defined(PIPE_ARCH_SSSE3)
 298
 299 #include <tmmintrin.h>
 300
 301 #else
 302
 303 #include <emmintrin.h>
 304
 305 /**
 306  * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases
 307  * where -mssse3 is not supported/enabled.
 308  *
 309  * MSVC will never get in here as its intrinsics support do not rely on
 310  * compiler command line options.
 311  */
 312 static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 313 _mm_shuffle_epi8(__m128i a, __m128i mask)
 314 {
 315     __m128i result;
 316     __asm__("pshufb %1, %0"
 317             : "=x" (result)
 318             : "xm" (mask), "0" (a));
 319     return result;
 320 }
 321
 322 #endif
 323
 324
 325 static void
 326 lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
 327                                          const uint8_t *src, unsigned src_stride,
 328                                          unsigned x0, unsigned y0)
 329 {
 330
 331    unsigned x, y;
 332    __m128i *pdst = (__m128i*) dst;
 333    const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t);
 334    unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH;
 335    unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT;
 336
 337    const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 338    const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 339    const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 340    const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 341
 342    const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 343    const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 344    const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 345    const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 346
 347    const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff);
 348    const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff);
 349    const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff);
 350    const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff);
 351
 352    const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e);
 353    const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d);
 354    const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c);
 355    const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f);
 356
 357    for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
 358       __m128i line0 = *(__m128i*)ysrc0;
 359       const uint8_t *ysrc = ysrc0 + src_stride;
 360       ysrc0 += tile_stridey;
 361
 362       for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
 363          __m128i r, g, b, a, line1;
 364          line1 = *(__m128i*)ysrc;
 365          PIPE_READ_WRITE_BARRIER();
 366          ysrc += src_stride;
 367          r = _mm_shuffle_epi8(line0, shuffle00);
 368          g = _mm_shuffle_epi8(line0, shuffle01);
 369          b = _mm_shuffle_epi8(line0, shuffle02);
 370          a = _mm_shuffle_epi8(line0, shuffle03);
 371
 372          line0 = *(__m128i*)ysrc;
 373          PIPE_READ_WRITE_BARRIER();
 374          ysrc += src_stride;
 375          r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10));
 376          g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11));
 377          b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12));
 378          a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13));
 379
 380          line1 = *(__m128i*)ysrc;
 381          PIPE_READ_WRITE_BARRIER();
 382          ysrc -= tile_stridex;
 383          r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20));
 384          g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21));
 385          b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22));
 386          a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23));
 387
 388          if (x + 1 < TILE_SIZE) {
 389             line0 = *(__m128i*)ysrc;
 390             ysrc += src_stride;
 391          }
 392
 393          PIPE_READ_WRITE_BARRIER();
 394          r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30));
 395          g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31));
 396          b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32));
 397          a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33));
 398
 399          *pdst++ = r;
 400          *pdst++ = g;
 401          *pdst++ = b;
 402          *pdst++ = a;
 403       }
 404    }
 405
 406 }
 407
 408 static void
 409 lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
 410                                           uint8_t *dst, unsigned dst_stride,
 411                                           unsigned x0, unsigned y0)
 412 {
 413    unsigned int x, y;
 414    const __m128i *psrc = (__m128i*) src;
 415    const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t));
 416    uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t);
 417    __m128i c0 = *psrc++;
 418    __m128i c1;
 419
 420    const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff);
 421    const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff);
 422    const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff);
 423    const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff);
 424
 425    const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff);
 426    const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff);
 427    const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff);
 428    const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff);
 429
 430    const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff);
 431    const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff);
 432    const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff);
 433    const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff);
 434
 435    const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05);
 436    const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07);
 437    const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d);
 438    const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f);
 439
 440    for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
 441       __m128i *tile = (__m128i*) pdst;
 442       pdst += dst_stride * TILE_VECTOR_HEIGHT;
 443       for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
 444          uint8_t *linep = (uint8_t*) (tile++);
 445          __m128i line0, line1, line2, line3;
 446
 447          c1 = *psrc++; /* r */
 448          PIPE_READ_WRITE_BARRIER();
 449          line0 = _mm_shuffle_epi8(c0, shuffle00);
 450          line1 = _mm_shuffle_epi8(c0, shuffle01);
 451          line2 = _mm_shuffle_epi8(c0, shuffle02);
 452          line3 = _mm_shuffle_epi8(c0, shuffle03);
 453
 454          c0 = *psrc++; /* g */
 455          PIPE_READ_WRITE_BARRIER();
 456          line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10));
 457          line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11));
 458          line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12));
 459          line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13));
 460
 461          c1 = *psrc++; /* b */
 462          PIPE_READ_WRITE_BARRIER();
 463          line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20));
 464          line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21));
 465          line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22));
 466          line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23));
 467
 468          if (psrc != end)
 469                  c0 = *psrc++; /* a */
 470          PIPE_READ_WRITE_BARRIER();
 471          line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30));
 472          line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31));
 473          line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32));
 474          line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33));
 475
 476          *(__m128i*) (linep) = line0;
 477          *(__m128i*) (((char*)linep) + dst_stride) = line1;
 478          *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2;
 479          *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3;
 480       }
 481    }
 482 }
 483
 484 #endif /* PIPE_ARCH_SSSE3 */
 485 '''
 486
 487
 488 def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
 489     '''Generate the dispatch function to read pixels from any format'''
 490
 491     for format in formats:
 492         if is_format_supported(format):
 493             generate_format_read(format, dst_channel, dst_native_type, dst_suffix)
 494
 495     print 'void'
 496     print 'lp_tile_swizzle_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y)' % (dst_suffix, dst_native_type)
 497     print '{'
 498     print '   void (*func)(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0);' % dst_native_type
 499     print '#ifdef DEBUG'
 500     print '   lp_tile_swizzle_count += 1;'
 501     print '#endif'
 502     print '   switch(format) {'
 503     for format in formats:
 504         if is_format_supported(format):
 505             print '   case %s:' % format.name
 506             func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix)
 507             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
 508                 print '#ifdef PIPE_ARCH_SSE'
 509                 print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
 510                 print '#else'
 511                 print '      func = %s;' % (func_name,)
 512                 print '#endif'
 513             else:
 514                 print '      func = %s;' % (func_name,)
 515             print '      break;'
 516     print '   default:'
 517     print '      debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));'
 518     print '      return;'
 519     print '   }'
 520     print '   func(dst, (const uint8_t *)src, src_stride, x, y);'
 521     print '}'
 522     print
 523
 524
 525 def generate_unswizzle(formats, src_channel, src_native_type, src_suffix):
 526     '''Generate the dispatch function to write pixels to any format'''
 527
 528     for format in formats:
 529         if is_format_supported(format):
 530             generate_format_write(format, src_channel, src_native_type, src_suffix)
 531
 532     print 'void'
 533     print 'lp_tile_unswizzle_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y)' % (src_suffix, src_native_type)
 534
 535     print '{'
 536     print '   void (*func)(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0);' % src_native_type
 537     print '#ifdef DEBUG'
 538     print '   lp_tile_unswizzle_count += 1;'
 539     print '#endif'
 540     print '   switch(format) {'
 541     for format in formats:
 542         if is_format_supported(format):
 543             print '   case %s:' % format.name
 544             func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix)
 545             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
 546                 print '#ifdef PIPE_ARCH_SSE'
 547                 print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
 548                 print '#else'
 549                 print '      func = %s;' % (func_name,)
 550                 print '#endif'
 551             else:
 552                 print '      func = %s;' % (func_name,)
 553             print '      break;'
 554     print '   default:'
 555     print '      debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));'
 556     print '      return;'
 557     print '   }'
 558     print '   func(src, (uint8_t *)dst, dst_stride, x, y);'
 559     print '}'
 560     print
 561
 562
 563 def main():
 564     formats = []
 565     for arg in sys.argv[1:]:
 566         formats.extend(parse(arg))
 567
 568     print '/* This file is autogenerated by lp_tile_soa.py from u_format.csv. Do not edit directly. */'
 569     print
 570     # This will print the copyright message on the top of this file
 571     print __doc__.strip()
 572     print
 573     print '#include "pipe/p_compiler.h"'
 574     print '#include "util/u_format.h"'
 575     print '#include "util/u_math.h"'
 576     print '#include "util/u_half.h"'
 577     print '#include "util/u_cpu_detect.h"'
 578     print '#include "lp_tile_soa.h"'
 579     print
 580     print '#ifdef DEBUG'
 581     print 'unsigned lp_tile_unswizzle_count = 0;'
 582     print 'unsigned lp_tile_swizzle_count = 0;'
 583     print '#endif'
 584     print
 585     print 'const unsigned char'
 586     print 'tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {'
 587     print '   {  0,  1,  4,  5},'
 588     print '   {  2,  3,  6,  7},'
 589     print '   {  8,  9, 12, 13},'
 590     print '   { 10, 11, 14, 15}'
 591     print '};'
 592     print
 593     print '/* Note: these lookup tables could be replaced with some'
 594     print ' * bit-twiddling code, but this is a little faster.'
 595     print ' */'
 596     print 'static unsigned tile_x_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'
 597     print '   0, 1, 0, 1, 2, 3, 2, 3,'
 598     print '   0, 1, 0, 1, 2, 3, 2, 3'
 599     print '};'
 600     print
 601     print 'static unsigned tile_y_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'
 602     print '   0, 0, 1, 1, 0, 0, 1, 1,'
 603     print '   2, 2, 3, 3, 2, 2, 3, 3'
 604     print '};'
 605     print
 606
 607     generate_ssse3()
 608
 609     channel = Channel(UNSIGNED, True, 8)
 610     native_type = 'uint8_t'
 611     suffix = '4ub'
 612
 613     generate_swizzle(formats, channel, native_type, suffix)
 614     generate_unswizzle(formats, channel, native_type, suffix)
 615
 616
 617 if __name__ == '__main__':
 618     main()