src/gallium/drivers/llvmpipe/lp_tile_soa.py

   1 #!/usr/bin/env python
   2
   3 '''
   4 /**************************************************************************
   5  *
   6  * Copyright 2009 VMware, Inc.
   7  * All Rights Reserved.
   8  *
   9  * Permission is hereby granted, free of charge, to any person obtaining a
  10  * copy of this software and associated documentation files (the
  11  * "Software"), to deal in the Software without restriction, including
  12  * without limitation the rights to use, copy, modify, merge, publish,
  13  * distribute, sub license, and/or sell copies of the Software, and to
  14  * permit persons to whom the Software is furnished to do so, subject to
  15  * the following conditions:
  16  *
  17  * The above copyright notice and this permission notice (including the
  18  * next paragraph) shall be included in all copies or substantial portions
  19  * of the Software.
  20  *
  21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  23  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  24  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  25  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  26  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  27  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  28  *
  29  **************************************************************************/
  30
  31 /**
  32  * @file
  33  * Pixel format accessor functions.
  34  *
  35  * @author Jose Fonseca <jfonseca@vmware.com>
  36  */
  37 '''
  38
  39
  40 import sys
  41 import os.path
  42
  43 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '../../auxiliary/util'))
  44
  45 from u_format_pack import *
  46
  47
  48 def is_format_supported(format):
  49     '''Determines whether we actually have the plumbing necessary to generate the
  50     to read/write to/from this format.'''
  51
  52     # FIXME: Ideally we would support any format combination here.
  53
  54     if format.layout != PLAIN:
  55         return False
  56
  57     for i in range(4):
  58         channel = format.channels[i]
  59         if channel.type not in (VOID, UNSIGNED, SIGNED, FLOAT):
  60             return False
  61         if channel.type == FLOAT and channel.size not in (16, 32 ,64):
  62             return False
  63
  64     if format.colorspace not in ('rgb', 'srgb'):
  65         return False
  66
  67     return True
  68
  69
  70 def generate_format_read(format, dst_channel, dst_native_type, dst_suffix):
  71     '''Generate the function to read pixels from a particular format'''
  72
  73     name = format.short_name()
  74
  75     src_native_type = native_type(format)
  76
  77     print 'static void'
  78     print 'lp_tile_%s_swizzle_%s(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)' % (name, dst_suffix, dst_native_type)
  79     print '{'
  80     print '   unsigned x, y;'
  81     print '   const uint8_t *src_row = src + y0*src_stride;'
  82     print '   for (y = 0; y < TILE_SIZE; ++y) {'
  83     print '      const %s *src_pixel = (const %s *)(src_row + x0*%u);' % (src_native_type, src_native_type, format.stride())
  84     print '      for (x = 0; x < TILE_SIZE; ++x) {'
  85
  86     names = ['']*4
  87     if format.colorspace in ('rgb', 'srgb'):
  88         for i in range(4):
  89             swizzle = format.swizzles[i]
  90             if swizzle < 4:
  91                 names[swizzle] += 'rgba'[i]
  92     elif format.colorspace == 'zs':
  93         swizzle = format.swizzles[0]
  94         if swizzle < 4:
  95             names[swizzle] = 'z'
  96         else:
  97             assert False
  98     else:
  99         assert False
 100
 101     if format.layout == PLAIN:
 102         if not format.is_array():
 103             print '         %s pixel = *src_pixel++;' % src_native_type
 104             shift = 0;
 105             for i in range(4):
 106                 src_channel = format.channels[i]
 107                 width = src_channel.size
 108                 if names[i]:
 109                     value = 'pixel'
 110                     mask = (1 << width) - 1
 111                     if shift:
 112                         value = '(%s >> %u)' % (value, shift)
 113                     if shift + width < format.block_size():
 114                         value = '(%s & 0x%x)' % (value, mask)
 115                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 116                     print '         %s %s = %s;' % (dst_native_type, names[i], value)
 117                 shift += width
 118         else:
 119             for i in range(4):
 120                 if names[i]:
 121                     print '         %s %s;' % (dst_native_type, names[i])
 122             for i in range(4):
 123                 src_channel = format.channels[i]
 124                 if names[i]:
 125                     value = '(*src_pixel++)'
 126                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 127                     print '         %s = %s;' % (names[i], value)
 128                 elif src_channel.size:
 129                     print '         ++src_pixel;'
 130     else:
 131         assert False
 132
 133     for i in range(4):
 134         if format.colorspace in ('rgb', 'srgb'):
 135             swizzle = format.swizzles[i]
 136             if swizzle < 4:
 137                 value = names[swizzle]
 138             elif swizzle == SWIZZLE_0:
 139                 value = '0'
 140             elif swizzle == SWIZZLE_1:
 141                 value = get_one(dst_channel)
 142             else:
 143                 assert False
 144         elif format.colorspace == 'zs':
 145             if i < 3:
 146                 value = 'z'
 147             else:
 148                 value = get_one(dst_channel)
 149         else:
 150             assert False
 151         print '         TILE_PIXEL(dst, x, y, %u) = %s; /* %s */' % (i, value, 'rgba'[i])
 152
 153     print '      }'
 154     print '      src_row += src_stride;'
 155     print '   }'
 156     print '}'
 157     print
 158
 159
 160 def pack_rgba(format, src_channel, r, g, b, a):
 161     """Return an expression for packing r, g, b, a into a pixel of the
 162     given format.  Ex: '(b << 24) | (g << 16) | (r << 8) | (a << 0)'
 163     """
 164     assert format.colorspace in ('rgb', 'srgb')
 165     inv_swizzle = format.inv_swizzles()
 166     shift = 0
 167     expr = None
 168     for i in range(4):
 169         # choose r, g, b, or a depending on the inverse swizzle term
 170         if inv_swizzle[i] == 0:
 171             value = r
 172         elif inv_swizzle[i] == 1:
 173             value = g
 174         elif inv_swizzle[i] == 2:
 175             value = b
 176         elif inv_swizzle[i] == 3:
 177             value = a
 178         else:
 179             value = None
 180
 181         if value:
 182             dst_channel = format.channels[i]
 183             dst_native_type = native_type(format)
 184             value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 185             term = "((%s) << %d)" % (value, shift)
 186             if expr:
 187                 expr = expr + " | " + term
 188             else:
 189                 expr = term
 190
 191         width = format.channels[i].size
 192         shift = shift + width
 193     return expr
 194
 195
 196 def emit_unrolled_unswizzle_code(format, src_channel):
 197     '''Emit code for writing a block based on unrolled loops.
 198     This is considerably faster than the TILE_PIXEL-based code below.
 199     '''
 200     dst_native_type = 'uint%u_t' % format.block_size()
 201     print '   const unsigned dstpix_stride = dst_stride / %d;' % format.stride()
 202     print '   %s *dstpix = (%s *) dst;' % (dst_native_type, dst_native_type)
 203     print '   unsigned int qx, qy, i;'
 204     print
 205     print '   for (qy = 0; qy < TILE_SIZE; qy += TILE_VECTOR_HEIGHT) {'
 206     print '      const unsigned py = y0 + qy;'
 207     print '      for (qx = 0; qx < TILE_SIZE; qx += TILE_VECTOR_WIDTH) {'
 208     print '         const unsigned px = x0 + qx;'
 209     print '         const uint8_t *r = src + 0 * TILE_C_STRIDE;'
 210     print '         const uint8_t *g = src + 1 * TILE_C_STRIDE;'
 211     print '         const uint8_t *b = src + 2 * TILE_C_STRIDE;'
 212     print '         const uint8_t *a = src + 3 * TILE_C_STRIDE;'
 213     print '         (void) r; (void) g; (void) b; (void) a; /* silence warnings */'
 214     print '         for (i = 0; i < TILE_C_STRIDE; i += 2) {'
 215     print '            const uint32_t pixel0 = %s;' % pack_rgba(format, src_channel, "r[i+0]", "g[i+0]", "b[i+0]", "a[i+0]")
 216     print '            const uint32_t pixel1 = %s;' % pack_rgba(format, src_channel, "r[i+1]", "g[i+1]", "b[i+1]", "a[i+1]")
 217     print '            const unsigned offset = (py + tile_y_offset[i]) * dstpix_stride + (px + tile_x_offset[i]);'
 218     print '            dstpix[offset + 0] = pixel0;'
 219     print '            dstpix[offset + 1] = pixel1;'
 220     print '         }'
 221     print '         src += TILE_X_STRIDE;'
 222     print '      }'
 223     print '   }'
 224
 225
 226 def emit_tile_pixel_unswizzle_code(format, src_channel):
 227     '''Emit code for writing a block based on the TILE_PIXEL macro.'''
 228     dst_native_type = native_type(format)
 229
 230     inv_swizzle = format.inv_swizzles()
 231
 232     print '   unsigned x, y;'
 233     print '   uint8_t *dst_row = dst + y0*dst_stride;'
 234     print '   for (y = 0; y < TILE_SIZE; ++y) {'
 235     print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
 236     print '      for (x = 0; x < TILE_SIZE; ++x) {'
 237
 238     if format.layout == PLAIN:
 239         if not format.is_array():
 240             print '         %s pixel = 0;' % dst_native_type
 241             shift = 0;
 242             for i in range(4):
 243                 dst_channel = format.channels[i]
 244                 width = dst_channel.size
 245                 if inv_swizzle[i] is not None:
 246                     value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
 247                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 248                     if shift:
 249                         value = '(%s << %u)' % (value, shift)
 250                     print '         pixel |= %s;' % value
 251                 shift += width
 252             print '         *dst_pixel++ = pixel;'
 253         else:
 254             for i in range(4):
 255                 dst_channel = format.channels[i]
 256                 if inv_swizzle[i] is not None:
 257                     value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
 258                     value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
 259                     print '         *dst_pixel++ = %s;' % value
 260                 elif dst_channel.size:
 261                     print '         ++dst_pixel;'
 262     else:
 263         assert False
 264
 265     print '      }'
 266     print '      dst_row += dst_stride;'
 267     print '   }'
 268
 269
 270 def generate_format_write(format, src_channel, src_native_type, src_suffix):
 271     '''Generate the function to write pixels to a particular format'''
 272
 273     name = format.short_name()
 274
 275     print 'static void'
 276     print 'lp_tile_%s_unswizzle_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)' % (name, src_suffix, src_native_type)
 277     print '{'
 278     if format.layout == PLAIN \
 279         and format.colorspace == 'rgb' \
 280         and format.block_size() <= 32 \
 281         and format.is_pot() \
 282         and not format.is_mixed() \
 283         and (format.channels[0].type == UNSIGNED \
 284              or format.channels[1].type == UNSIGNED):
 285         emit_unrolled_unswizzle_code(format, src_channel)
 286     else:
 287         emit_tile_pixel_unswizzle_code(format, src_channel)
 288     print '}'
 289     print
 290
 291
 292 def generate_ssse3():
 293     print '''
 294 #if defined(PIPE_ARCH_SSE)
 295
 296 #include "util/u_sse.h"
 297
 298 static void
 299 lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
 300                                          const uint8_t *src, unsigned src_stride,
 301                                          unsigned x0, unsigned y0)
 302 {
 303
 304    unsigned x, y;
 305    __m128i *pdst = (__m128i*) dst;
 306    const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t);
 307    unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH;
 308    unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT;
 309
 310    const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 311    const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 312    const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 313    const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 314
 315    const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 316    const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 317    const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 318    const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
 319
 320    const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff);
 321    const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff);
 322    const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff);
 323    const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff);
 324
 325    const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e);
 326    const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d);
 327    const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c);
 328    const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f);
 329
 330    for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
 331       __m128i line0 = *(__m128i*)ysrc0;
 332       const uint8_t *ysrc = ysrc0 + src_stride;
 333       ysrc0 += tile_stridey;
 334
 335       for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
 336          __m128i r, g, b, a, line1;
 337          line1 = *(__m128i*)ysrc;
 338          PIPE_READ_WRITE_BARRIER();
 339          ysrc += src_stride;
 340          r = _mm_shuffle_epi8(line0, shuffle00);
 341          g = _mm_shuffle_epi8(line0, shuffle01);
 342          b = _mm_shuffle_epi8(line0, shuffle02);
 343          a = _mm_shuffle_epi8(line0, shuffle03);
 344
 345          line0 = *(__m128i*)ysrc;
 346          PIPE_READ_WRITE_BARRIER();
 347          ysrc += src_stride;
 348          r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10));
 349          g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11));
 350          b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12));
 351          a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13));
 352
 353          line1 = *(__m128i*)ysrc;
 354          PIPE_READ_WRITE_BARRIER();
 355          ysrc -= tile_stridex;
 356          r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20));
 357          g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21));
 358          b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22));
 359          a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23));
 360
 361          if (x + 1 < TILE_SIZE) {
 362             line0 = *(__m128i*)ysrc;
 363             ysrc += src_stride;
 364          }
 365
 366          PIPE_READ_WRITE_BARRIER();
 367          r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30));
 368          g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31));
 369          b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32));
 370          a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33));
 371
 372          *pdst++ = r;
 373          *pdst++ = g;
 374          *pdst++ = b;
 375          *pdst++ = a;
 376       }
 377    }
 378
 379 }
 380
 381 static void
 382 lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
 383                                           uint8_t *dst, unsigned dst_stride,
 384                                           unsigned x0, unsigned y0)
 385 {
 386    unsigned int x, y;
 387    const __m128i *psrc = (__m128i*) src;
 388    const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t));
 389    uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t);
 390    __m128i c0 = *psrc++;
 391    __m128i c1;
 392
 393    const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff);
 394    const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff);
 395    const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff);
 396    const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff);
 397
 398    const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff);
 399    const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff);
 400    const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff);
 401    const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff);
 402
 403    const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff);
 404    const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff);
 405    const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff);
 406    const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff);
 407
 408    const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05);
 409    const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07);
 410    const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d);
 411    const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f);
 412
 413    for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
 414       __m128i *tile = (__m128i*) pdst;
 415       pdst += dst_stride * TILE_VECTOR_HEIGHT;
 416       for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
 417          uint8_t *linep = (uint8_t*) (tile++);
 418          __m128i line0, line1, line2, line3;
 419
 420          c1 = *psrc++; /* r */
 421          PIPE_READ_WRITE_BARRIER();
 422          line0 = _mm_shuffle_epi8(c0, shuffle00);
 423          line1 = _mm_shuffle_epi8(c0, shuffle01);
 424          line2 = _mm_shuffle_epi8(c0, shuffle02);
 425          line3 = _mm_shuffle_epi8(c0, shuffle03);
 426
 427          c0 = *psrc++; /* g */
 428          PIPE_READ_WRITE_BARRIER();
 429          line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10));
 430          line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11));
 431          line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12));
 432          line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13));
 433
 434          c1 = *psrc++; /* b */
 435          PIPE_READ_WRITE_BARRIER();
 436          line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20));
 437          line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21));
 438          line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22));
 439          line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23));
 440
 441          if (psrc != end)
 442                  c0 = *psrc++; /* a */
 443          PIPE_READ_WRITE_BARRIER();
 444          line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30));
 445          line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31));
 446          line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32));
 447          line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33));
 448
 449          *(__m128i*) (linep) = line0;
 450          *(__m128i*) (((char*)linep) + dst_stride) = line1;
 451          *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2;
 452          *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3;
 453       }
 454    }
 455 }
 456
 457 #endif /* PIPE_ARCH_SSSE3 */
 458 '''
 459
 460
 461 def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
 462     '''Generate the dispatch function to read pixels from any format'''
 463
 464     for format in formats:
 465         if is_format_supported(format):
 466             generate_format_read(format, dst_channel, dst_native_type, dst_suffix)
 467
 468     print 'void'
 469     print 'lp_tile_swizzle_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y)' % (dst_suffix, dst_native_type)
 470     print '{'
 471     print '   void (*func)(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0);' % dst_native_type
 472     print '#ifdef DEBUG'
 473     print '   lp_tile_swizzle_count += 1;'
 474     print '#endif'
 475     print '   switch(format) {'
 476     for format in formats:
 477         if is_format_supported(format):
 478             print '   case %s:' % format.name
 479             func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix)
 480             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
 481                 print '#ifdef PIPE_ARCH_SSE'
 482                 print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
 483                 print '#else'
 484                 print '      func = %s;' % (func_name,)
 485                 print '#endif'
 486             else:
 487                 print '      func = %s;' % (func_name,)
 488             print '      break;'
 489     print '   default:'
 490     print '      debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));'
 491     print '      return;'
 492     print '   }'
 493     print '   func(dst, (const uint8_t *)src, src_stride, x, y);'
 494     print '}'
 495     print
 496
 497
 498 def generate_unswizzle(formats, src_channel, src_native_type, src_suffix):
 499     '''Generate the dispatch function to write pixels to any format'''
 500
 501     for format in formats:
 502         if is_format_supported(format):
 503             generate_format_write(format, src_channel, src_native_type, src_suffix)
 504
 505     print 'void'
 506     print 'lp_tile_unswizzle_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y)' % (src_suffix, src_native_type)
 507
 508     print '{'
 509     print '   void (*func)(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0);' % src_native_type
 510     print '#ifdef DEBUG'
 511     print '   lp_tile_unswizzle_count += 1;'
 512     print '#endif'
 513     print '   switch(format) {'
 514     for format in formats:
 515         if is_format_supported(format):
 516             print '   case %s:' % format.name
 517             func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix)
 518             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
 519                 print '#ifdef PIPE_ARCH_SSE'
 520                 print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
 521                 print '#else'
 522                 print '      func = %s;' % (func_name,)
 523                 print '#endif'
 524             else:
 525                 print '      func = %s;' % (func_name,)
 526             print '      break;'
 527     print '   default:'
 528     print '      debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));'
 529     print '      return;'
 530     print '   }'
 531     print '   func(src, (uint8_t *)dst, dst_stride, x, y);'
 532     print '}'
 533     print
 534
 535
 536 def main():
 537     formats = []
 538     for arg in sys.argv[1:]:
 539         formats.extend(parse(arg))
 540
 541     print '/* This file is autogenerated by lp_tile_soa.py from u_format.csv. Do not edit directly. */'
 542     print
 543     # This will print the copyright message on the top of this file
 544     print __doc__.strip()
 545     print
 546     print '#include "pipe/p_compiler.h"'
 547     print '#include "util/u_format.h"'
 548     print '#include "util/u_math.h"'
 549     print '#include "util/u_half.h"'
 550     print '#include "util/u_cpu_detect.h"'
 551     print '#include "lp_tile_soa.h"'
 552     print
 553     print '#ifdef DEBUG'
 554     print 'unsigned lp_tile_unswizzle_count = 0;'
 555     print 'unsigned lp_tile_swizzle_count = 0;'
 556     print '#endif'
 557     print
 558     print 'const unsigned char'
 559     print 'tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {'
 560     print '   {  0,  1,  4,  5},'
 561     print '   {  2,  3,  6,  7},'
 562     print '   {  8,  9, 12, 13},'
 563     print '   { 10, 11, 14, 15}'
 564     print '};'
 565     print
 566     print '/* Note: these lookup tables could be replaced with some'
 567     print ' * bit-twiddling code, but this is a little faster.'
 568     print ' */'
 569     print 'static unsigned tile_x_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'
 570     print '   0, 1, 0, 1, 2, 3, 2, 3,'
 571     print '   0, 1, 0, 1, 2, 3, 2, 3'
 572     print '};'
 573     print
 574     print 'static unsigned tile_y_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'
 575     print '   0, 0, 1, 1, 0, 0, 1, 1,'
 576     print '   2, 2, 3, 3, 2, 2, 3, 3'
 577     print '};'
 578     print
 579
 580     generate_ssse3()
 581
 582     channel = Channel(UNSIGNED, True, 8)
 583     native_type = 'uint8_t'
 584     suffix = '4ub'
 585
 586     generate_swizzle(formats, channel, native_type, suffix)
 587     generate_unswizzle(formats, channel, native_type, suffix)
 588
 589
 590 if __name__ == '__main__':
 591     main()