1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_debug.h"
50 #include "lp_bld_type.h"
51 #include "lp_bld_const.h"
52 #include "lp_bld_intr.h"
53 #include "lp_bld_conv.h"
57 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
60 lp_build_const_expand_shuffle(unsigned n
, unsigned lo_hi
)
62 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
65 assert(n
<= LP_MAX_VECTOR_LENGTH
);
68 /* TODO: cache results in a static table */
70 for(i
= 0, j
= lo_hi
*n
/2; i
< n
; i
+= 2, ++j
) {
71 elems
[i
+ 0] = LLVMConstInt(LLVMInt32Type(), 0 + j
, 0);
72 elems
[i
+ 1] = LLVMConstInt(LLVMInt32Type(), n
+ j
, 0);
75 return LLVMConstVector(elems
, n
);
80 lp_build_expand(LLVMBuilderRef builder
,
81 union lp_type src_type
,
82 union lp_type dst_type
,
84 LLVMValueRef
*dst
, unsigned num_dsts
)
89 /* Register width must remain constant */
90 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
92 /* We must not loose or gain channels. Only precision */
93 assert(src_type
.length
== dst_type
.length
* num_dsts
);
98 while(src_type
.width
< dst_type
.width
) {
99 union lp_type new_type
= src_type
;
100 LLVMTypeRef new_vec_type
;
103 new_type
.length
/= 2;
104 new_vec_type
= lp_build_vec_type(new_type
);
106 for(i
= num_tmps
; i
--; ) {
108 LLVMValueRef shuffle_lo
;
109 LLVMValueRef shuffle_hi
;
113 zero
= lp_build_zero(src_type
);
114 shuffle_lo
= lp_build_const_expand_shuffle(src_type
.length
, 0);
115 shuffle_hi
= lp_build_const_expand_shuffle(src_type
.length
, 1);
117 /* PUNPCKLBW, PUNPCKHBW */
118 lo
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_lo
, "");
119 hi
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_hi
, "");
121 dst
[2*i
+ 0] = LLVMBuildBitCast(builder
, lo
, new_vec_type
, "");
122 dst
[2*i
+ 1] = LLVMBuildBitCast(builder
, hi
, new_vec_type
, "");
130 assert(num_tmps
== num_dsts
);
135 lp_build_trunc(LLVMBuilderRef builder
,
136 union lp_type src_type
,
137 union lp_type dst_type
,
138 const LLVMValueRef
*src
, unsigned num_srcs
)
140 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
143 /* Register width must remain constant */
144 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
146 /* We must not loose or gain channels. Only precision */
147 assert(src_type
.length
* num_srcs
== dst_type
.length
);
149 for(i
= 0; i
< num_srcs
; ++i
)
152 while(src_type
.width
> dst_type
.width
) {
153 LLVMTypeRef tmp_vec_type
= lp_build_vec_type(src_type
);
154 union lp_type new_type
= src_type
;
155 LLVMTypeRef new_vec_type
;
158 new_type
.length
*= 2;
159 new_vec_type
= lp_build_vec_type(new_type
);
161 for(i
= 0; i
< num_srcs
/2; ++i
) {
162 LLVMValueRef lo
= tmp
[2*i
+ 0];
163 LLVMValueRef hi
= tmp
[2*i
+ 1];
164 LLVMValueRef packed
= NULL
;
166 if(src_type
.width
== 32) {
167 /* FIXME: we only have a packed signed intrinsic */
168 packed
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packssdw.128", tmp_vec_type
, lo
, hi
);
170 else if(src_type
.width
== 16) {
172 packed
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packsswb.128", tmp_vec_type
, lo
, hi
);
174 packed
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packuswb.128", tmp_vec_type
, lo
, hi
);
179 tmp
[i
] = LLVMBuildBitCast(builder
, packed
, new_vec_type
, "");
187 assert(num_srcs
== 1);
194 * Convert between two SIMD types.
196 * Converting between SIMD types of different element width poses a problem:
197 * SIMD registers have a fixed number of bits, so different element widths
198 * imply different vector lengths. Therefore we must multiplex the multiple
199 * incoming sources into a single destination vector, or demux a single incoming
200 * vector into multiple vectors.
203 lp_build_conv(LLVMBuilderRef builder
,
204 union lp_type src_type
,
205 union lp_type dst_type
,
206 const LLVMValueRef
*src
, unsigned num_srcs
,
207 LLVMValueRef
*dst
, unsigned num_dsts
)
209 union lp_type tmp_type
;
210 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
214 /* Register width must remain constant */
215 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
217 /* We must not loose or gain channels. Only precision */
218 assert(src_type
.length
* num_srcs
== dst_type
.length
* num_dsts
);
220 assert(src_type
.length
<= LP_MAX_VECTOR_LENGTH
);
221 assert(dst_type
.length
<= LP_MAX_VECTOR_LENGTH
);
224 for(i
= 0; i
< num_srcs
; ++i
)
232 if(!tmp_type
.norm
&& dst_type
.norm
) {
237 * Scale to the narrowest range
240 if(dst_type
.floating
) {
243 else if(tmp_type
.floating
) {
244 double dst_scale
= lp_const_scale(dst_type
);
245 LLVMTypeRef tmp_vec_type
;
247 if (dst_scale
!= 1.0) {
248 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, dst_scale
);
249 for(i
= 0; i
< num_tmps
; ++i
)
250 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
253 /* Use an equally sized integer for intermediate computations */
254 tmp_type
.floating
= FALSE
;
255 tmp_vec_type
= lp_build_vec_type(tmp_type
);
256 for(i
= 0; i
< num_tmps
; ++i
) {
259 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
261 tmp
[i
] = LLVMBuildFPToUI(builder
, tmp
[i
], tmp_vec_type
, "");
263 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
264 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
269 unsigned src_shift
= lp_const_shift(src_type
);
270 unsigned dst_shift
= lp_const_shift(dst_type
);
272 /* FIXME: compensate different offsets too */
273 if(src_shift
> dst_shift
) {
274 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, src_shift
- dst_shift
);
275 for(i
= 0; i
< num_tmps
; ++i
)
277 tmp
[i
] = LLVMBuildAShr(builder
, tmp
[i
], shift
, "");
279 tmp
[i
] = LLVMBuildLShr(builder
, tmp
[i
], shift
, "");
284 * Truncate or expand bit width
287 assert(!tmp_type
.floating
);
289 if(tmp_type
.width
> dst_type
.width
) {
290 assert(num_dsts
== 1);
291 tmp
[0] = lp_build_trunc(builder
, tmp_type
, dst_type
, tmp
, num_tmps
);
292 tmp_type
.width
= dst_type
.width
;
293 tmp_type
.length
= dst_type
.length
;
297 if(tmp_type
.width
< dst_type
.width
) {
298 assert(num_tmps
== 1);
299 lp_build_expand(builder
, tmp_type
, dst_type
, tmp
[0], tmp
, num_dsts
);
300 tmp_type
.width
= dst_type
.width
;
301 tmp_type
.length
= dst_type
.length
;
305 assert(tmp_type
.width
== dst_type
.width
);
306 assert(tmp_type
.length
== dst_type
.length
);
307 assert(num_tmps
== num_dsts
);
310 * Scale to the widest range
313 if(src_type
.floating
) {
316 else if(!src_type
.floating
&& dst_type
.floating
) {
317 double src_scale
= lp_const_scale(src_type
);
318 LLVMTypeRef tmp_vec_type
;
320 /* Use an equally sized integer for intermediate computations */
321 tmp_type
.floating
= TRUE
;
322 tmp_type
.sign
= TRUE
;
323 tmp_vec_type
= lp_build_vec_type(tmp_type
);
324 for(i
= 0; i
< num_tmps
; ++i
) {
327 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
329 tmp
[i
] = LLVMBuildUIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
331 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
332 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
336 if (src_scale
!= 1.0) {
337 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, 1.0/src_scale
);
338 for(i
= 0; i
< num_tmps
; ++i
)
339 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
343 unsigned src_shift
= lp_const_shift(src_type
);
344 unsigned dst_shift
= lp_const_shift(dst_type
);
346 /* FIXME: compensate different offsets too */
347 if(src_shift
< dst_shift
) {
348 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, src_shift
- dst_shift
);
349 for(i
= 0; i
< num_tmps
; ++i
)
350 tmp
[i
] = LLVMBuildShl(builder
, tmp
[i
], shift
, "");
354 for(i
= 0; i
< num_dsts
; ++i
)