From b101a094b5576337f5bb7aecf525eb1755757b9c Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Fri, 22 Mar 2013 20:09:18 +0100
Subject: [PATCH] llvmpipe: add EXT_packed_float render target format support

New conversion code to handle conversion between r11g11b10 AoS and SoA
floats, and also add code for conversion from rgb9e5 AoS to float SoA (which
works pretty much the same as r11g11b10 except for the packing).
(This code should also be used for texture sampling instead of relying on
u_format conversion but it's not yet, so rgb9e5 is unused.)
Unfortunately a crazy amount of hacks is necessary to get the conversion code
running in llvmpipe's generate_unswizzled_blend, which isn't well suited for
formats where the storage representation has nothing to do with what's needed
for blending. (Moreover, the conversion will convert from packed AoS values,
which is the storage format, to float SoA values, because this is much more
natural for the conversion, and likewise from SoA values to packed AoS values
- but the "blend" (which includes trivial things like partial mask) works on
AoS values, so incoming fs values will go SoA->AoS, values from destination
will go packed AoS->SoA->AoS, then do blend, then AoS->SoA->packed AoS, which
probably isn't the most efficient way though the shuffles are probably
bearable.)
Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter),
still need to verify Inf/NaNs (where most of the complexity in the conversion
comes from actually).

v2: drop the (very bogus) rgb9e5 part, and do component extraction in the
helper code for r11g11b10 to float conversion, making the code slightly more
compact (suggested by Jose); now that there are no other callers left this
works quite well. (Could do the same for the opposite way but it's less than
ideal there, final part of packing needs to be done in the caller anyway and
there'd be another conditional.)

v3: minor style and comment fixes. Also fix a potential issue with negative
zero being potentially returned by max(src, zero) as we don't have
well-defined min/max behavior (fortunately no additional cost).

Reviewed-by: Jose Fonseca
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c | 243 ++++++++++++++++++++
 src/gallium/auxiliary/gallivm/lp_bld_conv.h |   9 +
 src/gallium/drivers/llvmpipe/lp_screen.c    |   6 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  | 126 ++++++++++
 4 files changed, 382 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index dc3649db363..053f4132080 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -154,6 +154,249 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 }
 
 
+/**
+ * Convert float32 to a float-like value with less exponent and mantissa
+ * bits. The exponent is still biased, and the mantissa still has an implied
+ * 1, but there's no sign bit.
+ *
+ * @param src            (vector) float value to convert
+ * @param mantissa_bits  the number of mantissa bits
+ * @param exponent_bits  the number of exponent bits
+ *
+ * Unlike float_to_half, this uses an accurate method.
+ * This implements round-towards-zero (trunc), hence too large numbers get
+ * converted to the largest representable number, not infinity.
+ * Small numbers may get converted to denorms, depending on normal
+ * float denorm handling of the cpu.
+ * Note that compared to the references below, we skip any rounding bias
+ * since we do rounding towards zero - OpenGL allows rounding towards zero
+ * (though not preferred) and DX10 even seems to require it.
+ * Note that this will not do any packing - the value will
+ * look like a "rescaled float" (except for Inf/NaN) but be returned
+ * as int32.
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
+                                    struct lp_type i32_type,
+                                    LLVMValueRef src,
+                                    unsigned mantissa_bits,
+                                    unsigned exponent_bits)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
+   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
+   LLVMValueRef is_nan, is_posinf, is_nan_or_posinf, i32_qnanbit, nan_or_posinf;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length);
+   struct lp_build_context f32_bld, i32_bld;
+   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+                                             ((1 << exponent_bits) - 1) << 23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+
+   /* "ordinary" number */
+   /* clamp to pos range (can still have sign bit if NaN or negative zero) */
+   clamped = lp_build_max(&f32_bld, src, zero);
+   clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, "");
+   /* get rid of excess mantissa bits, and while here also potential sign bit */
+   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
+                                          ~((1 << (23 - mantissa_bits)) - 1) &
+                                          0x7fffffff);
+
+   tmp = lp_build_and(&i32_bld, clamped, i32_roundmask);
+   tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, "");
+   /* bias exponent (and denormalize if necessary) */
+   magic = lp_build_const_int_vec(gallivm, i32_type,
+                                  ((1 << (exponent_bits - 1)) - 1) << 23);
+   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
+   normal = lp_build_mul(&f32_bld, tmp, magic);
+
+   /* clamp to max value */
+   small_max = lp_build_const_int_vec(gallivm, i32_type,
+                                      (((1 << exponent_bits) - 2) << 23) |
+                                      (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
+   small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
+   normal = lp_build_min(&f32_bld, normal, small_max);
+   normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
+
+   /*
+    * handle nan/inf cases
+    * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-Nan -> +Nan
+    * Note that on a lucky day, we could simplify this a bit,
+    * by just using the max(src, zero) result - this will have -Inf
+    * clamped to 0, and MIGHT preserve the NaNs.
+    */
+   src_abs = lp_build_abs(&f32_bld, src);
+   src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");
+   src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");
+   is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
+                             src_abs, i32_floatexpmask);
+   is_posinf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
+                                src, i32_floatexpmask);
+   is_nan_or_posinf = lp_build_or(&i32_bld, is_nan, is_posinf);
+   /* could also set more mantissa bits but need at least the highest mantissa bit */
+   i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
+   /* combine maxexp with qnanbit */
+   nan_or_posinf = lp_build_or(&i32_bld, i32_smallexpmask,
+                               lp_build_and(&i32_bld, is_nan, i32_qnanbit));
+
+   return lp_build_select(&i32_bld, is_nan_or_posinf, nan_or_posinf, normal);
+}
+
+
+/**
+ * Convert rgba float SoA values to packed r11g11b10 values.
+ *
+ * @param src   SoA float (vector) values to convert.
+ */
+LLVMValueRef
+lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
+                            LLVMValueRef *src)
+{
+   LLVMValueRef dst, rcomp, bcomp, gcomp, shift, mask;
+   struct lp_build_context i32_bld;
+   LLVMTypeRef src_type = LLVMTypeOf(*src);
+   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+                         LLVMGetVectorSize(src_type) : 1;
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
+
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   /* "rescale" - this does the actual conversion except the packing */
+   rcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[0], 6, 5);
+   gcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[1], 6, 5);
+   bcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[2], 5, 5);
+
+   /* pack rescaled SoA floats to r11g11b10 AoS values */
+   shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
+   rcomp = lp_build_shr(&i32_bld, rcomp, shift);
+
+   shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
+   mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << 11);
+   gcomp = lp_build_shr(&i32_bld, gcomp, shift);
+   gcomp = lp_build_and(&i32_bld, gcomp, mask);
+
+   shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
+   mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << 22);
+   bcomp = lp_build_shl(&i32_bld, bcomp, shift);
+   bcomp = lp_build_and(&i32_bld, bcomp, mask);
+
+   dst = lp_build_or(&i32_bld, rcomp, gcomp);
+   return lp_build_or(&i32_bld, dst, bcomp);
+}
+
+
+/**
+ * Convert a float-like value with less exponent and mantissa
+ * bits than a normal float32 to a float32. The mantissa of
+ * the source value is assumed to have an implied 1, and the exponent
+ * is biased. There are no negative values.
+ * The source value to extract must be in a 32bit int.
+ * While this helper is generic, it is only ever going to be useful for
+ * r11g11b10 (no other common format exists with the same properties).
+ *
+ * @param src             (vector) value to convert
+ * @param mantissa_bits   the number of mantissa bits
+ * @param exponent_bits   the number of exponent bits
+ * @param mantissa_start  the bit start position of the packed component
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_smallfloat_nosign_to_float(struct gallivm_state *gallivm,
+                                    struct lp_type f32_type,
+                                    LLVMValueRef src,
+                                    unsigned mantissa_bits,
+                                    unsigned exponent_bits,
+                                    unsigned mantissa_start)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef smallexpmask, i32_floatexpmask, magic;
+   LLVMValueRef wasinfnan, tmp, res, shift, mask;
+   unsigned exponent_start = mantissa_start + mantissa_bits;
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
+   struct lp_build_context f32_bld, i32_bld;
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   /* extract the component to "float position" */
+   if (exponent_start < 23) {
+      shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
+      src = lp_build_shl(&i32_bld, src, shift);
+   }
+   else {
+      shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
+      src = lp_build_shr(&i32_bld, src, shift);
+   }
+   mask = lp_build_const_int_vec(gallivm, i32_type,
                                 ((1 << (mantissa_bits + exponent_bits)) - 1) <<
+                                 (23 - mantissa_bits));
+   src = lp_build_and(&i32_bld, src, mask);
+   src = LLVMBuildBitCast(builder, src, f32_bld.vec_type, "");
+
+   /* now do the actual scaling */
+   smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+                                         ((1 << exponent_bits) - 1) << 23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+   /*
+    * magic number has exponent old exp bias + (old exp bias - new exp bias),
+    * mantissa is 0, i.e. the multiply rescales by 2^(old bias - new bias).
+    */
+   magic = lp_build_const_int_vec(gallivm, i32_type,
+                                  (255 - (1 << (exponent_bits - 1))) << 23);
+   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
+
+   /* adjust exponent and fix denorms */
+   res = lp_build_mul(&f32_bld, src, magic);
+
+   /*
+    * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
+    * so a simple "or" will do (because exp adjust will leave mantissa intact)
+    */
+   /* use a float compare (better for 8-wide AVX without AVX2, otherwise an int compare would do) */
+   smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
+   wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, src, smallexpmask);
+   res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
+   tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
+   res = lp_build_or(&i32_bld, tmp, res);
+
+   return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
+}
+
+
+/**
+ * Convert packed float format (r11g11b10) value(s) to rgba float SoA values.
+ *
+ * @param src   packed AoS r11g11b10 values (as (vector) int32)
+ * @param dst   pointer to the SoA result values
+ */
+void
+lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
+                            LLVMValueRef src,
+                            LLVMValueRef *dst)
+{
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+                         LLVMGetVectorSize(src_type) : 1;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
+
+   dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 6, 5, 0);
+   dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 6, 5, 11);
+   dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 5, 5, 22);
+
+   /* Just set alpha to one */
+   dst[3] = lp_build_one(gallivm, f32_type);
+}
+
+
 /**
  * Converts int16 half-float to float32
  * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index d7dfed85187..5bd6f4f1d75 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -61,6 +61,15 @@ LLVMValueRef
 lp_build_float_to_half(struct gallivm_state *gallivm,
                        LLVMValueRef src);
 
+LLVMValueRef
+lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
+                            LLVMValueRef *src);
+
+void
+lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
+                            LLVMValueRef src,
+                            LLVMValueRef *dst);
+
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                         struct lp_type src_type,
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 5ec1df659b8..6760db0b38f 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -322,7 +322,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
       if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB)
         return FALSE;
 
-      if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN &&
+          format != PIPE_FORMAT_R11G11B10_FLOAT)
         return FALSE;
 
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);
@@ -330,7 +331,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
       if (format_desc->is_mixed)
         return FALSE;
 
-      if (!format_desc->is_array && !format_desc->is_bitmask)
+      if (!format_desc->is_array && !format_desc->is_bitmask &&
+          format != PIPE_FORMAT_R11G11B10_FLOAT)
         return FALSE;
 
      /*
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index d8369b4d807..953a5c1aa44 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -972,6 +972,17 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
    unsigned i;
    unsigned chan;
 
+   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /* just make this a 32bit uint */
+      type->floating = false;
+      type->fixed = false;
+      type->sign = false;
+      type->norm = false;
+      type->width = 32;
+      type->length = 1;
+      return;
+   }
+
    for (i = 0; i < 4; i++)
       if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
         break;
@@ -1009,6 +1020,17 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc
    unsigned i;
    unsigned chan;
 
+   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /* always use ordinary floats for blending */
+      type->floating = true;
+      type->fixed = false;
+      type->sign = true;
+      type->norm = false;
+      type->width = 32;
+      type->length = 4;
+      return;
+   }
+
    for (i = 0; i < 4; i++)
      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
        break;
@@ -1122,6 +1144,48 @@ convert_to_blend_type(struct gallivm_state *gallivm,
    unsigned pixels = 16 / num_srcs;
    bool is_arith;
 
+   /*
+    * full custom path for packed floats - none of the later functions would do
+    * anything useful, and
+    * given the lp_type representation they can't be fixed.
+    */
+   if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      LLVMValueRef tmpsrc[4];
+      /*
+       * This is pretty suboptimal for this case - blending in SoA would be
+       * much better, since the conversion gives us SoA values which we then
+       * have to convert back to AoS.
+       */
+      assert(src_type.width == 32);
+      assert(dst_type.floating);
+      assert(dst_type.width == 32);
+      assert(dst_type.length % 4 == 0);
+      for (i = 0; i < 4; i++) {
+         tmpsrc[i] = src[i];
+      }
+      for (i = 0; i < num_srcs / 4; i++) {
+         LLVMValueRef tmpsoa[4];
+         LLVMValueRef tmps = tmpsrc[i];
+         if (num_srcs == 8) {
+            LLVMValueRef shuffles[8];
+            unsigned j;
+            /* fetch was 4 values but need 8-wide output values */
+            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
+            /*
+             * for 8-wide aos transpose would give us wrong order not matching
+             * incoming converted fs values and mask. ARGH.
+             */
+            for (j = 0; j < 4; j++) {
+               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
+               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
+            }
+            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
+                                          LLVMConstVector(shuffles, 8), "");
+         }
+         lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
+         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
+      }
+      return;
+   }
+
    lp_mem_type_from_format_desc(src_fmt, &mem_type);
    lp_blend_type_from_format_desc(src_fmt, &blend_type);
 
@@ -1225,6 +1289,47 @@ convert_from_blend_type(struct gallivm_state *gallivm,
    unsigned pixels = 16 / num_srcs;
    bool is_arith;
 
+   /*
+    * full custom path for packed floats - none of the later functions would
+    * do anything useful, and given the lp_type representation they can't be
+    * fixed.
+    */
+   if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /*
+       * This is pretty suboptimal for this case - blending in SoA would be
+       * much better; we need to transpose the AoS values back to SoA values
+       * for conversion/packing.
+       */
+      assert(src_type.floating);
+      assert(src_type.width == 32);
+      assert(src_type.length % 4 == 0);
+      assert(dst_type.width == 32);
+      for (i = 0; i < num_srcs / 4; i++) {
+         LLVMValueRef tmpsoa[4], tmpdst;
+         lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
+         tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
+         if (num_srcs == 8) {
+            LLVMValueRef tmpaos, shuffles[8];
+            unsigned j;
+            /*
+             * for 8-wide aos transpose has given us wrong order not matching
+             * output order. HMPF. Also need to split the output values manually.
+             */
+            for (j = 0; j < 4; j++) {
+               shuffles[j * 2] = lp_build_const_int32(gallivm, j);
+               shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
+            }
+            tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
+                                            LLVMConstVector(shuffles, 8), "");
+            src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
+            src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
+         }
+         else {
+            src[i] = tmpdst;
+         }
+      }
+      return;
+   }
+
    lp_mem_type_from_format_desc(src_fmt, &mem_type);
    lp_blend_type_from_format_desc(src_fmt, &blend_type);
 
@@ -1532,6 +1637,17 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       }
    }
 
+   if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /* the code above can't work for layout_other */
+      dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */
+      has_alpha = true;
+      swizzle[0] = 0;
+      swizzle[1] = 1;
+      swizzle[2] = 2;
+      swizzle[3] = 3;
+      pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
+   }
+
    /* If 3 channels then pad to include alpha for 4 element transpose */
    if (dst_channels == 3 && !has_alpha) {
       for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
@@ -1756,6 +1872,16 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
    dst_type.length *= 16 / dst_count;
 
+   if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /*
+       * we need multiple values at once for the conversion, so we can just as
+       * well load them vectorized here too instead of concatenating later.
+       * (Still need concatenation later for 8-wide vectors).
+       */
+      dst_count = block_height;
+      dst_type.length = block_width;
+   }
+
    load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
                          dst, dst_type, dst_count, dst_alignment);
-- 
2.30.2
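
For reference (this note and the snippets below are not part of the patch): the "rescale"
trick that lp_build_float_to_smallfloat_nosign implements can be sketched per lane in plain
scalar C roughly as follows. This is only a sketch under the assumptions stated in the
comments above (IEEE-754 float32, round towards zero for the pack direction); the helper
names (f2u, u2f, float_to_smallfloat_nosign, pack_r11g11b10) are made up for illustration
and do not exist in gallivm.

/*
 * Illustrative scalar sketch of what the new gallivm code does per SIMD lane.
 * Assumes IEEE-754 float32 and round-towards-zero, as the patch describes.
 */
#include <stdint.h>
#include <string.h>
#include <math.h>

static uint32_t f2u(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
static float u2f(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

/* Rescale a float32 to a no-sign small float; the result stays at float32
 * bit positions (exponent at bit 23), packing is left to the caller. */
static uint32_t
float_to_smallfloat_nosign(float src, unsigned mantissa_bits, unsigned exponent_bits)
{
   const uint32_t smallexpmask = ((1u << exponent_bits) - 1) << 23;
   const uint32_t floatexpmask = 0xffu << 23;
   uint32_t srcbits = f2u(src);
   uint32_t clamped, small_max;
   float normal;

   /* NaN -> max exponent plus a quiet bit, +Inf -> max exponent,
    * -Inf falls through to the clamp below and becomes 0 */
   if ((srcbits & 0x7fffffffu) > floatexpmask)
      return smallexpmask | (1u << 22);
   if (srcbits == floatexpmask)
      return smallexpmask;

   /* clamp negatives (and -0.0) to +0.0, drop excess mantissa and sign bits */
   clamped = f2u(fmaxf(src, 0.0f)) &
             (~((1u << (23 - mantissa_bits)) - 1) & 0x7fffffffu);
   /* re-bias the exponent: multiply by 2^(new_bias - 127); this also
    * produces the right denormal bit pattern for tiny inputs */
   normal = u2f(clamped) * u2f(((1u << (exponent_bits - 1)) - 1) << 23);
   /* clamp to the largest finite small-float value (round towards zero) */
   small_max = (((1u << exponent_bits) - 2) << 23) |
               (((1u << mantissa_bits) - 1) << (23 - mantissa_bits));
   return f2u(fminf(normal, u2f(small_max)));
}

/* Pack three non-negative floats as r11g11b10 (UF11/UF11/UF10). */
static uint32_t
pack_r11g11b10(float r, float g, float b)
{
   uint32_t rc = float_to_smallfloat_nosign(r, 6, 5) >> (23 - 6);
   uint32_t gc = (float_to_smallfloat_nosign(g, 6, 5) >> (23 - 17)) & (0x7ffu << 11);
   uint32_t bc = (float_to_smallfloat_nosign(b, 5, 5) << (27 - 23)) & (0x3ffu << 22);
   return rc | gc | bc;
}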
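
The unpack direction, mirroring lp_build_smallfloat_nosign_to_float and
lp_build_r11g11b10_to_float, looks like this in the same scalar sketch (reusing f2u/u2f
from the sketch above; again, the names are illustrative only):

/* Extract one small-float component from a packed word and widen it to float32. */
static float
smallfloat_nosign_to_float(uint32_t packed, unsigned mantissa_bits,
                           unsigned exponent_bits, unsigned mantissa_start)
{
   unsigned exponent_start = mantissa_start + mantissa_bits;
   uint32_t smallexpmask = ((1u << exponent_bits) - 1) << 23;
   uint32_t bits;
   float res;

   /* move the component so its exponent field starts at bit 23, then mask it */
   bits = (exponent_start < 23) ? packed << (23 - exponent_start)
                                : packed >> (exponent_start - 23);
   bits &= ((1u << (mantissa_bits + exponent_bits)) - 1) << (23 - mantissa_bits);

   /* re-bias: multiply by 2^(127 - new_bias); this also fixes up denormals */
   res = u2f(bits) * u2f((255u - (1u << (exponent_bits - 1))) << 23);

   /* max source exponent means Inf/NaN: force the float32 exponent to max,
    * the mantissa (qnan bits) survives the multiply unchanged */
   if (bits >= smallexpmask)
      res = u2f(f2u(res) | (0xffu << 23));
   return res;
}

static void
r11g11b10_to_float(uint32_t src, float rgba[4])
{
   rgba[0] = smallfloat_nosign_to_float(src, 6, 5, 0);
   rgba[1] = smallfloat_nosign_to_float(src, 6, 5, 11);
   rgba[2] = smallfloat_nosign_to_float(src, 5, 5, 22);
   rgba[3] = 1.0f; /* alpha is always one */
}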
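
A tiny self-check tying the two sketches together: values that are exactly representable
in the small formats must round-trip through pack/unpack unchanged, and alpha always comes
back as 1.0. This only illustrates the expected behaviour and is not a substitute for the
piglit run mentioned in the commit message:

#include <assert.h>
#include <stdio.h>

int main(void)
{
   float out[4];
   uint32_t packed = pack_r11g11b10(1.0f, 0.5f, 0.25f);
   r11g11b10_to_float(packed, out);
   assert(out[0] == 1.0f && out[1] == 0.5f && out[2] == 0.25f && out[3] == 1.0f);
   printf("r11g11b10(1.0, 0.5, 0.25) = 0x%08x\n", (unsigned)packed);
   return 0;
}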