1 /**************************************************************************
3 * Copyright 2013 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
31 * Format conversion code for srgb formats.
33 * Functions for converting from srgb to linear and vice versa.
34 * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
37 * cl = cs / 12.92, cs <= 0.04045
38 * cl = ((cs + 0.055)/1.055)^2.4, cs > 0.04045
 * linear->srgb:
 * if (isnan(cl)) {
 *    Map IEEE-754 Not-a-number to zero.
 *    cs = 0.0;
 * } else if (cl > 1.0) {
 *    cs = 1.0;
 * } else if (cl < 0.0) {
 *    cs = 0.0;
 * } else if (cl < 0.0031308) {
 *    cs = 12.92 * cl;
 * } else {
 *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
 * }
54 * This does not need to be accurate, however at least for d3d10
55 * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
56 * 1) For srgb->linear, it is required that the error on the srgb side is
57 * not larger than 0.5f, which I interpret that if you map the value back
58 * to srgb from linear using the ideal conversion, it would not be off by
59 * more than 0.5f (that is, it would map to the same 8-bit integer value
60 * as it was before conversion to linear).
 * 2) linear->srgb is permitted an error of 0.6f, which luckily is quite a
 *    large tolerance.
63 * 3) Additionally, all srgb values converted to linear and back must result
64 * in the same value as they were originally.
66 * @author Roland Scheidegger <sroland@vmware.com>
70 #include "util/u_debug.h"
72 #include "lp_bld_type.h"
73 #include "lp_bld_const.h"
74 #include "lp_bld_arit.h"
75 #include "lp_bld_bitarit.h"
76 #include "lp_bld_logic.h"
77 #include "lp_bld_format.h"
82 * Convert srgb int values to linear float values.
83 * Several possibilities how to do this, e.g.
85 * - doing the pow() with int-to-float and float-to-int tricks
86 * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
87 * - just using standard polynomial approximation
88 * (3rd order polynomial is required for crappy but just sufficient accuracy)
90 * @param src integer (vector) value(s) to convert
91 * (chan_bits bit values unpacked to 32 bit already).
94 lp_build_srgb_to_linear(struct gallivm_state
*gallivm
,
95 struct lp_type src_type
,
99 struct lp_type f32_type
= lp_type_float_vec(32, src_type
.length
* 32);
100 struct lp_build_context f32_bld
;
101 LLVMValueRef srcf
, part_lin
, part_pow
, is_linear
, lin_const
, lin_thresh
;
102 double coeffs
[4] = {0.0023f
,
104 0.6935f
/ (255.0f
* 255.0f
),
105 0.3012f
/ (255.0f
* 255.0f
* 255.0f
)
108 assert(src_type
.width
== 32);
109 /* Technically this would work with more bits too but would be inaccurate. */
110 assert(chan_bits
<= 8);
112 lp_build_context_init(&f32_bld
, gallivm
, f32_type
);
115 * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
116 * ( poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
117 * (found with octave polyfit and some magic as I couldn't get the error
118 * function right). Using the above mentioned error function, the values stay
119 * within +-0.35, except for the lowest values - hence tweaking linear segment
120 * to cover the first 16 instead of the first 11 values (the error stays
121 * just about acceptable there too).
122 * Hence: lin = src > 15 ? poly : src / 12.6
123 * This function really only makes sense for vectors, should use LUT otherwise.
124 * All in all (including float conversion) 11 instructions (with sse4.1),
125 * 6 constants (polynomial could be done with 1 instruction less at the cost
126 * of slightly worse dependency chain, fma should also help).
128 /* doing the 1/255 mul as part of the approximation */
129 srcf
= lp_build_int_to_float(&f32_bld
, src
);
130 if (chan_bits
!= 8) {
131 /* could adjust all the constants instead */
132 LLVMValueRef rescale_const
= lp_build_const_vec(gallivm
, f32_type
,
133 255.0f
/ ((1 << chan_bits
) - 1));
134 srcf
= lp_build_mul(&f32_bld
, srcf
, rescale_const
);
136 lin_const
= lp_build_const_vec(gallivm
, f32_type
, 1.0f
/ (12.6f
* 255.0f
));
137 part_lin
= lp_build_mul(&f32_bld
, srcf
, lin_const
);
139 part_pow
= lp_build_polynomial(&f32_bld
, srcf
, coeffs
, 4);
141 lin_thresh
= lp_build_const_vec(gallivm
, f32_type
, 15.0f
);
142 is_linear
= lp_build_compare(gallivm
, f32_type
, PIPE_FUNC_LEQUAL
, srcf
, lin_thresh
);
143 return lp_build_select(&f32_bld
, is_linear
, part_lin
, part_pow
);
148 * Convert linear float values to srgb int values.
149 * Several possibilities how to do this, e.g.
150 * - use table (based on exponent/highest order mantissa bits) and do
151 * linear interpolation (https://gist.github.com/rygorous/2203834)
152 * - Chebyshev polynomial
153 * - Approximation using reciprocals
154 * - using int-to-float and float-to-int tricks for pow()
155 * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
157 * @param src float (vector) value(s) to convert.
160 lp_build_linear_to_srgb(struct gallivm_state
*gallivm
,
161 struct lp_type src_type
,
165 LLVMBuilderRef builder
= gallivm
->builder
;
166 struct lp_build_context f32_bld
;
167 LLVMValueRef lin_thresh
, lin
, lin_const
, is_linear
, tmp
, pow_final
;
169 lp_build_context_init(&f32_bld
, gallivm
, src_type
);
171 src
= lp_build_clamp(&f32_bld
, src
, f32_bld
.zero
, f32_bld
.one
);
175 * using int-to-float and float-to-int trick for pow().
176 * This is much more accurate than necessary thanks to the correction,
177 * but it most certainly makes no sense without rsqrt available.
178 * Bonus points if you understand how this works...
179 * All in all (including min/max clamp, conversion) 19 instructions.
182 float exp_f
= 2.0f
/ 3.0f
;
183 /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
184 float exp2f_c
= 1.30438178253e+19f
;
185 float coeff_f
= 0.62996f
;
186 LLVMValueRef pow_approx
, coeff
, x2
, exponent
, pow_1
, pow_2
;
187 struct lp_type int_type
= lp_int_type(src_type
);
190 * First calculate approx x^8/12
192 exponent
= lp_build_const_vec(gallivm
, src_type
, exp_f
);
193 coeff
= lp_build_const_vec(gallivm
, src_type
,
194 exp2f_c
* powf(coeff_f
, 1.0f
/ exp_f
));
196 /* premultiply src */
197 tmp
= lp_build_mul(&f32_bld
, coeff
, src
);
199 tmp
= LLVMBuildBitCast(builder
, tmp
, lp_build_vec_type(gallivm
, int_type
), "");
200 tmp
= lp_build_int_to_float(&f32_bld
, tmp
);
201 /* multiply for pow */
202 tmp
= lp_build_mul(&f32_bld
, tmp
, exponent
);
204 pow_approx
= lp_build_itrunc(&f32_bld
, tmp
);
205 pow_approx
= LLVMBuildBitCast(builder
, pow_approx
,
206 lp_build_vec_type(gallivm
, src_type
), "");
209 * Since that pow was inaccurate (like 3 bits, though each sqrt step would
210 * give another bit), compensate the error (which is why we chose another
211 * exponent in the first place).
213 /* x * x^(8/12) = x^(20/12) */
214 pow_1
= lp_build_mul(&f32_bld
, pow_approx
, src
);
216 /* x * x * x^(-4/12) = x^(20/12) */
217 /* Should avoid using rsqrt if it's not available, but
218 * using x * x^(4/12) * x^(4/12) instead will change error weight */
219 tmp
= lp_build_fast_rsqrt(&f32_bld
, pow_approx
);
220 x2
= lp_build_mul(&f32_bld
, src
, src
);
221 pow_2
= lp_build_mul(&f32_bld
, x2
, tmp
);
223 /* average the values so the errors cancel out, compensate bias,
224 * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
225 * for conversion to int in here */
226 tmp
= lp_build_add(&f32_bld
, pow_1
, pow_2
);
227 coeff
= lp_build_const_vec(gallivm
, src_type
,
228 1.0f
/ (3.0f
* coeff_f
) * 0.999852f
*
229 powf(1.055f
* 255.0f
, 4.0f
));
230 pow_final
= lp_build_mul(&f32_bld
, tmp
, coeff
);
232 /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
233 if (lp_build_fast_rsqrt_available(src_type
)) {
234 pow_final
= lp_build_fast_rsqrt(&f32_bld
,
235 lp_build_fast_rsqrt(&f32_bld
, pow_final
));
238 pow_final
= lp_build_sqrt(&f32_bld
, lp_build_sqrt(&f32_bld
, pow_final
));
240 pow_final
= lp_build_add(&f32_bld
, pow_final
,
241 lp_build_const_vec(gallivm
, src_type
, -0.055f
* 255.0f
));
246 * using "rational polynomial" approximation here.
247 * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
248 * factoring in the 255.0 mul and the scaling mul.
249 * (a is closer to actual value so has higher weight than b.)
250 * Note: the constants are magic values. They were found empirically,
251 * possibly could be improved but good enough (be VERY careful with
252 * error metric if you'd want to tweak them, they also MUST fit with
253 * the crappy polynomial above for srgb->linear since it is required
254 * that each srgb value maps back to the same value).
255 * This function has an error of max +-0.17. Not sure this is actually
256 * enough, we require +-0.6 but that may include the +-0.5 from integer
257 * conversion. Seems to pass all relevant tests though...
258 * For the approximated srgb->linear values the error is naturally larger
259 * (+-0.42) but still accurate enough (required +-0.5 essentially).
260 * All in all (including min/max clamp, conversion) 15 instructions.
261 * FMA would help (minus 2 instructions).
264 LLVMValueRef x05
, x0375
, a_const
, b_const
, c_const
, tmp2
;
266 if (lp_build_fast_rsqrt_available(src_type
)) {
267 tmp
= lp_build_fast_rsqrt(&f32_bld
, src
);
268 x05
= lp_build_mul(&f32_bld
, src
, tmp
);
272 * I don't really expect this to be practical without rsqrt
273 * but there's no reason for triple punishment so at least
274 * save the otherwise resulting division and unnecessary mul...
276 x05
= lp_build_sqrt(&f32_bld
, src
);
279 tmp
= lp_build_mul(&f32_bld
, x05
, src
);
280 if (lp_build_fast_rsqrt_available(src_type
)) {
281 x0375
= lp_build_fast_rsqrt(&f32_bld
, lp_build_fast_rsqrt(&f32_bld
, tmp
));
284 x0375
= lp_build_sqrt(&f32_bld
, lp_build_sqrt(&f32_bld
, tmp
));
287 a_const
= lp_build_const_vec(gallivm
, src_type
, 0.675f
* 1.0622 * 255.0f
);
288 b_const
= lp_build_const_vec(gallivm
, src_type
, 0.325f
* 1.0622 * 255.0f
);
289 c_const
= lp_build_const_vec(gallivm
, src_type
, -0.0620f
* 255.0f
);
291 tmp
= lp_build_mul(&f32_bld
, a_const
, x0375
);
292 tmp2
= lp_build_mul(&f32_bld
, b_const
, x05
);
293 tmp2
= lp_build_add(&f32_bld
, tmp2
, c_const
);
294 pow_final
= lp_build_add(&f32_bld
, tmp
, tmp2
);
297 /* linear part is easy */
298 lin_const
= lp_build_const_vec(gallivm
, src_type
, 12.92f
* 255.0f
);
299 lin
= lp_build_mul(&f32_bld
, src
, lin_const
);
301 lin_thresh
= lp_build_const_vec(gallivm
, src_type
, 0.0031308f
);
302 is_linear
= lp_build_compare(gallivm
, src_type
, PIPE_FUNC_LEQUAL
, src
, lin_thresh
);
303 tmp
= lp_build_select(&f32_bld
, is_linear
, lin
, pow_final
);
305 if (chan_bits
!= 8) {
306 /* could adjust all the constants instead */
307 LLVMValueRef rescale_const
= lp_build_const_vec(gallivm
, src_type
,
308 ((1 << chan_bits
) - 1) / 255.0f
);
309 tmp
= lp_build_mul(&f32_bld
, tmp
, rescale_const
);
312 f32_bld
.type
.sign
= 0;
313 return lp_build_iround(&f32_bld
, tmp
);
318 * Convert linear float soa values to packed srgb AoS values.
319 * This only handles packed formats which are 4x8bit in size
320 * (rgba and rgbx plus swizzles), and 16bit 565-style formats
321 * with no alpha. (In the latter case the return values won't be
322 * fully packed, it will look like r5g6b5x16r5g6b5x16...)
324 * @param src float SoA (vector) values to convert.
327 lp_build_float_to_srgb_packed(struct gallivm_state
*gallivm
,
328 const struct util_format_description
*dst_fmt
,
329 struct lp_type src_type
,
332 LLVMBuilderRef builder
= gallivm
->builder
;
334 struct lp_build_context f32_bld
;
335 struct lp_type int32_type
= lp_int_type(src_type
);
336 LLVMValueRef tmpsrgb
[4], alpha
, dst
;
338 lp_build_context_init(&f32_bld
, gallivm
, src_type
);
340 /* rgb is subject to linear->srgb conversion, alpha is not */
341 for (chan
= 0; chan
< 3; chan
++) {
342 unsigned chan_bits
= dst_fmt
->channel
[dst_fmt
->swizzle
[chan
]].size
;
343 tmpsrgb
[chan
] = lp_build_linear_to_srgb(gallivm
, src_type
, chan_bits
, src
[chan
]);
346 * can't use lp_build_conv since we want to keep values as 32bit
347 * here so we can interleave with rgb to go from SoA->AoS.
349 alpha
= lp_build_clamp_zero_one_nanzero(&f32_bld
, src
[3]);
350 alpha
= lp_build_mul(&f32_bld
, alpha
,
351 lp_build_const_vec(gallivm
, src_type
, 255.0f
));
352 tmpsrgb
[3] = lp_build_iround(&f32_bld
, alpha
);
354 dst
= lp_build_zero(gallivm
, int32_type
);
355 for (chan
= 0; chan
< dst_fmt
->nr_channels
; chan
++) {
356 if (dst_fmt
->swizzle
[chan
] <= UTIL_FORMAT_SWIZZLE_W
) {
358 LLVMValueRef shifted
, shift_val
;
359 ls
= dst_fmt
->channel
[dst_fmt
->swizzle
[chan
]].shift
;
360 shift_val
= lp_build_const_int_vec(gallivm
, int32_type
, ls
);
361 shifted
= LLVMBuildShl(builder
, tmpsrgb
[chan
], shift_val
, "");
362 dst
= LLVMBuildOr(builder
, dst
, shifted
, "");