gallivm: fix rsqrt failures
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_defines.h"
30
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34
35 #include "lp_bld_type.h"
36 #include "lp_bld_const.h"
37 #include "lp_bld_conv.h"
38 #include "lp_bld_swizzle.h"
39 #include "lp_bld_gather.h"
40 #include "lp_bld_debug.h"
41 #include "lp_bld_format.h"
42
43
44 void
45 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
46 struct lp_build_context *bld,
47 const LLVMValueRef *unswizzled,
48 LLVMValueRef swizzled_out[4])
49 {
50 assert(UTIL_FORMAT_SWIZZLE_0 == PIPE_SWIZZLE_ZERO);
51 assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE);
52
53 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
54 /*
55 * Return zzz1 for depth-stencil formats.
56 *
57 * XXX: Allow to control the depth swizzle with an additional parameter,
58 * as the caller may wish another depth swizzle, or retain the stencil
59 * value.
60 */
61 enum util_format_swizzle swizzle = format_desc->swizzle[0];
62 LLVMValueRef depth = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
63 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth;
64 swizzled_out[3] = bld->one;
65 }
66 else {
67 unsigned chan;
68 for (chan = 0; chan < 4; ++chan) {
69 enum util_format_swizzle swizzle = format_desc->swizzle[chan];
70 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
71 }
72 }
73 }
74
75
76 /**
77 * Unpack several pixels in SoA.
78 *
79 * It takes a vector of packed pixels:
80 *
81 * packed = {P0, P1, P2, P3, ..., Pn}
82 *
83 * And will produce four vectors:
84 *
85 * red = {R0, R1, R2, R3, ..., Rn}
86 * green = {G0, G1, G2, G3, ..., Gn}
87 * blue = {B0, B1, B2, B3, ..., Bn}
88 * alpha = {A0, A1, A2, A3, ..., An}
89 *
90 * It requires that a packed pixel fits into an element of the output
91 * channels. The common case is when converting pixel with a depth of 32 bit or
92 * less into floats.
93 *
94 * \param format_desc the format of the 'packed' incoming pixel vector
95 * \param type the desired type for rgba_out (type.length = n, above)
96 * \param packed the incoming vector of packed pixels
97 * \param rgba_out returns the SoA R,G,B,A vectors
98 */
99 void
100 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
101 const struct util_format_description *format_desc,
102 struct lp_type type,
103 LLVMValueRef packed,
104 LLVMValueRef rgba_out[4])
105 {
106 LLVMBuilderRef builder = gallivm->builder;
107 struct lp_build_context bld;
108 LLVMValueRef inputs[4];
109 unsigned start;
110 unsigned chan;
111
112 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
113 assert(format_desc->block.width == 1);
114 assert(format_desc->block.height == 1);
115 assert(format_desc->block.bits <= type.width);
116 /* FIXME: Support more output types */
117 assert(type.floating);
118 assert(type.width == 32);
119
120 lp_build_context_init(&bld, gallivm, type);
121
122 /* Decode the input vector components */
123 start = 0;
124 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
125 const unsigned width = format_desc->channel[chan].size;
126 const unsigned stop = start + width;
127 LLVMValueRef input;
128
129 input = packed;
130
131 switch(format_desc->channel[chan].type) {
132 case UTIL_FORMAT_TYPE_VOID:
133 input = lp_build_undef(gallivm, type);
134 break;
135
136 case UTIL_FORMAT_TYPE_UNSIGNED:
137 /*
138 * Align the LSB
139 */
140
141 if (start) {
142 input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
143 }
144
145 /*
146 * Zero the MSBs
147 */
148
149 if (stop < format_desc->block.bits) {
150 unsigned mask = ((unsigned long long)1 << width) - 1;
151 input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
152 }
153
154 /*
155 * Type conversion
156 */
157
158 if (type.floating) {
159 if(format_desc->channel[chan].normalized)
160 input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
161 else
162 input = LLVMBuildSIToFP(builder, input,
163 lp_build_vec_type(gallivm, type), "");
164 }
165 else {
166 /* FIXME */
167 assert(0);
168 input = lp_build_undef(gallivm, type);
169 }
170
171 break;
172
173 case UTIL_FORMAT_TYPE_SIGNED:
174 /*
175 * Align the sign bit first.
176 */
177
178 if (stop < type.width) {
179 unsigned bits = type.width - stop;
180 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
181 input = LLVMBuildShl(builder, input, bits_val, "");
182 }
183
184 /*
185 * Align the LSB (with an arithmetic shift to preserve the sign)
186 */
187
188 if (format_desc->channel[chan].size < type.width) {
189 unsigned bits = type.width - format_desc->channel[chan].size;
190 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
191 input = LLVMBuildAShr(builder, input, bits_val, "");
192 }
193
194 /*
195 * Type conversion
196 */
197
198 if (type.floating) {
199 input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
200 if (format_desc->channel[chan].normalized) {
201 double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
202 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
203 input = LLVMBuildFMul(builder, input, scale_val, "");
204 }
205 }
206 else {
207 /* FIXME */
208 assert(0);
209 input = lp_build_undef(gallivm, type);
210 }
211
212 break;
213
214 case UTIL_FORMAT_TYPE_FLOAT:
215 if (type.floating) {
216 assert(start == 0);
217 assert(stop == 32);
218 assert(type.width == 32);
219 input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
220 }
221 else {
222 /* FIXME */
223 assert(0);
224 input = lp_build_undef(gallivm, type);
225 }
226 break;
227
228 case UTIL_FORMAT_TYPE_FIXED:
229 if (type.floating) {
230 double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
231 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
232 input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
233 input = LLVMBuildFMul(builder, input, scale_val, "");
234 }
235 else {
236 /* FIXME */
237 assert(0);
238 input = lp_build_undef(gallivm, type);
239 }
240 break;
241
242 default:
243 assert(0);
244 input = lp_build_undef(gallivm, type);
245 break;
246 }
247
248 inputs[chan] = input;
249
250 start = stop;
251 }
252
253 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
254 }
255
256
257 void
258 lp_build_rgba8_to_f32_soa(struct gallivm_state *gallivm,
259 struct lp_type dst_type,
260 LLVMValueRef packed,
261 LLVMValueRef *rgba)
262 {
263 LLVMBuilderRef builder = gallivm->builder;
264 LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
265 unsigned chan;
266
267 packed = LLVMBuildBitCast(builder, packed,
268 lp_build_int_vec_type(gallivm, dst_type), "");
269
270 /* Decode the input vector components */
271 for (chan = 0; chan < 4; ++chan) {
272 unsigned start = chan*8;
273 unsigned stop = start + 8;
274 LLVMValueRef input;
275
276 input = packed;
277
278 if (start)
279 input = LLVMBuildLShr(builder, input,
280 lp_build_const_int_vec(gallivm, dst_type, start), "");
281
282 if (stop < 32)
283 input = LLVMBuildAnd(builder, input, mask, "");
284
285 input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
286
287 rgba[chan] = input;
288 }
289 }
290
291
292
293 /**
294 * Fetch a texels from a texture, returning them in SoA layout.
295 *
296 * \param type the desired return type for 'rgba'. The vector length
297 * is the number of texels to fetch
298 *
299 * \param base_ptr points to start of the texture image block. For non-
300 * compressed formats, this simply points to the texel.
301 * For compressed formats, it points to the start of the
302 * compressed data block.
303 *
304 * \param i, j the sub-block pixel coordinates. For non-compressed formats
305 * these will always be (0,0). For compressed formats, i will
306 * be in [0, block_width-1] and j will be in [0, block_height-1].
307 */
308 void
309 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
310 const struct util_format_description *format_desc,
311 struct lp_type type,
312 LLVMValueRef base_ptr,
313 LLVMValueRef offset,
314 LLVMValueRef i,
315 LLVMValueRef j,
316 LLVMValueRef rgba_out[4])
317 {
318 LLVMBuilderRef builder = gallivm->builder;
319
320 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
321 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
322 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
323 format_desc->block.width == 1 &&
324 format_desc->block.height == 1 &&
325 format_desc->block.bits <= type.width &&
326 (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
327 format_desc->channel[0].size == 32))
328 {
329 /*
330 * The packed pixel fits into an element of the destination format. Put
331 * the packed pixels into a vector and extract each component for all
332 * vector elements in parallel.
333 */
334
335 LLVMValueRef packed;
336
337 /*
338 * gather the texels from the texture
339 * Ex: packed = {BGRA, BGRA, BGRA, BGRA}.
340 */
341 packed = lp_build_gather(gallivm,
342 type.length,
343 format_desc->block.bits,
344 type.width,
345 base_ptr, offset);
346
347 /*
348 * convert texels to float rgba
349 */
350 lp_build_unpack_rgba_soa(gallivm,
351 format_desc,
352 type,
353 packed, rgba_out);
354 return;
355 }
356
357 /*
358 * Try calling lp_build_fetch_rgba_aos for all pixels.
359 */
360
361 if (util_format_fits_8unorm(format_desc) &&
362 type.floating && type.width == 32 &&
363 (type.length == 1 || (type.length % 4 == 0))) {
364 struct lp_type tmp_type;
365 LLVMValueRef tmp;
366
367 memset(&tmp_type, 0, sizeof tmp_type);
368 tmp_type.width = 8;
369 tmp_type.length = type.length * 4;
370 tmp_type.norm = TRUE;
371
372 tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
373 base_ptr, offset, i, j);
374
375 lp_build_rgba8_to_f32_soa(gallivm,
376 type,
377 tmp,
378 rgba_out);
379
380 return;
381 }
382
383 /*
384 * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
385 *
386 * This is not the most efficient way of fetching pixels, as we
387 * miss some opportunities to do vectorization, but this is
388 * convenient for formats or scenarios for which there was no
389 * opportunity or incentive to optimize.
390 */
391
392 {
393 unsigned k, chan;
394 struct lp_type tmp_type;
395
396 if (gallivm_debug & GALLIVM_DEBUG_PERF) {
397 debug_printf("%s: scalar unpacking of %s\n",
398 __FUNCTION__, format_desc->short_name);
399 }
400
401 tmp_type = type;
402 tmp_type.length = 4;
403
404 for (chan = 0; chan < 4; ++chan) {
405 rgba_out[chan] = lp_build_undef(gallivm, type);
406 }
407
408 /* loop over number of pixels */
409 for(k = 0; k < type.length; ++k) {
410 LLVMValueRef index = lp_build_const_int32(gallivm, k);
411 LLVMValueRef offset_elem;
412 LLVMValueRef i_elem, j_elem;
413 LLVMValueRef tmp;
414
415 offset_elem = LLVMBuildExtractElement(builder, offset,
416 index, "");
417
418 i_elem = LLVMBuildExtractElement(builder, i, index, "");
419 j_elem = LLVMBuildExtractElement(builder, j, index, "");
420
421 /* Get a single float[4]={R,G,B,A} pixel */
422 tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
423 base_ptr, offset_elem,
424 i_elem, j_elem);
425
426 /*
427 * Insert the AoS tmp value channels into the SoA result vectors at
428 * position = 'index'.
429 */
430 for (chan = 0; chan < 4; ++chan) {
431 LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
432 tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
433 rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
434 tmp_chan, index, "");
435 }
436 }
437 }
438 }