gallivm: avoid unnecessary URem in linear wrap repeat case
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_defines.h"
30
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34
35 #include "lp_bld_type.h"
36 #include "lp_bld_const.h"
37 #include "lp_bld_conv.h"
38 #include "lp_bld_swizzle.h"
39 #include "lp_bld_gather.h"
40 #include "lp_bld_debug.h"
41 #include "lp_bld_format.h"
42
43
44 void
45 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
46 struct lp_build_context *bld,
47 const LLVMValueRef *unswizzled,
48 LLVMValueRef swizzled_out[4])
49 {
50 assert(UTIL_FORMAT_SWIZZLE_0 == PIPE_SWIZZLE_ZERO);
51 assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE);
52
53 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
54 /*
55 * Return zzz1 for depth-stencil formats.
56 *
57 * XXX: Allow to control the depth swizzle with an additional parameter,
58 * as the caller may wish another depth swizzle, or retain the stencil
59 * value.
60 */
61 enum util_format_swizzle swizzle = format_desc->swizzle[0];
62 LLVMValueRef depth = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
63 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth;
64 swizzled_out[3] = bld->one;
65 }
66 else {
67 unsigned chan;
68 for (chan = 0; chan < 4; ++chan) {
69 enum util_format_swizzle swizzle = format_desc->swizzle[chan];
70 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
71 }
72 }
73 }
74
75
76 /**
77 * Unpack several pixels in SoA.
78 *
79 * It takes a vector of packed pixels:
80 *
81 * packed = {P0, P1, P2, P3, ..., Pn}
82 *
83 * And will produce four vectors:
84 *
85 * red = {R0, R1, R2, R3, ..., Rn}
86 * green = {G0, G1, G2, G3, ..., Gn}
87 * blue = {B0, B1, B2, B3, ..., Bn}
88 * alpha = {A0, A1, A2, A3, ..., An}
89 *
90 * It requires that a packed pixel fits into an element of the output
91 * channels. The common case is when converting pixel with a depth of 32 bit or
92 * less into floats.
93 *
94 * \param format_desc the format of the 'packed' incoming pixel vector
95 * \param type the desired type for rgba_out (type.length = n, above)
96 * \param packed the incoming vector of packed pixels
97 * \param rgba_out returns the SoA R,G,B,A vectors
98 */
99 void
100 lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
101 const struct util_format_description *format_desc,
102 struct lp_type type,
103 LLVMValueRef packed,
104 LLVMValueRef rgba_out[4])
105 {
106 struct lp_build_context bld;
107 LLVMValueRef inputs[4];
108 unsigned start;
109 unsigned chan;
110
111 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
112 assert(format_desc->block.width == 1);
113 assert(format_desc->block.height == 1);
114 assert(format_desc->block.bits <= type.width);
115 /* FIXME: Support more output types */
116 assert(type.floating);
117 assert(type.width == 32);
118
119 lp_build_context_init(&bld, builder, type);
120
121 /* Decode the input vector components */
122 start = 0;
123 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
124 const unsigned width = format_desc->channel[chan].size;
125 const unsigned stop = start + width;
126 LLVMValueRef input;
127
128 input = packed;
129
130 switch(format_desc->channel[chan].type) {
131 case UTIL_FORMAT_TYPE_VOID:
132 input = lp_build_undef(type);
133 break;
134
135 case UTIL_FORMAT_TYPE_UNSIGNED:
136 /*
137 * Align the LSB
138 */
139
140 if (start) {
141 input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(type, start), "");
142 }
143
144 /*
145 * Zero the MSBs
146 */
147
148 if (stop < format_desc->block.bits) {
149 unsigned mask = ((unsigned long long)1 << width) - 1;
150 input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(type, mask), "");
151 }
152
153 /*
154 * Type conversion
155 */
156
157 if (type.floating) {
158 if(format_desc->channel[chan].normalized)
159 input = lp_build_unsigned_norm_to_float(builder, width, type, input);
160 else
161 input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
162 }
163 else {
164 /* FIXME */
165 assert(0);
166 input = lp_build_undef(type);
167 }
168
169 break;
170
171 case UTIL_FORMAT_TYPE_SIGNED:
172 /*
173 * Align the sign bit first.
174 */
175
176 if (stop < type.width) {
177 unsigned bits = type.width - stop;
178 LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
179 input = LLVMBuildShl(builder, input, bits_val, "");
180 }
181
182 /*
183 * Align the LSB (with an arithmetic shift to preserve the sign)
184 */
185
186 if (format_desc->channel[chan].size < type.width) {
187 unsigned bits = type.width - format_desc->channel[chan].size;
188 LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
189 input = LLVMBuildAShr(builder, input, bits_val, "");
190 }
191
192 /*
193 * Type conversion
194 */
195
196 if (type.floating) {
197 input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
198 if (format_desc->channel[chan].normalized) {
199 double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
200 LLVMValueRef scale_val = lp_build_const_vec(type, scale);
201 input = LLVMBuildFMul(builder, input, scale_val, "");
202 }
203 }
204 else {
205 /* FIXME */
206 assert(0);
207 input = lp_build_undef(type);
208 }
209
210 break;
211
212 case UTIL_FORMAT_TYPE_FLOAT:
213 if (type.floating) {
214 assert(start == 0);
215 assert(stop == 32);
216 assert(type.width == 32);
217 input = LLVMBuildBitCast(builder, input, lp_build_vec_type(type), "");
218 }
219 else {
220 /* FIXME */
221 assert(0);
222 input = lp_build_undef(type);
223 }
224 break;
225
226 case UTIL_FORMAT_TYPE_FIXED:
227 if (type.floating) {
228 double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
229 LLVMValueRef scale_val = lp_build_const_vec(type, scale);
230 input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
231 input = LLVMBuildFMul(builder, input, scale_val, "");
232 }
233 else {
234 /* FIXME */
235 assert(0);
236 input = lp_build_undef(type);
237 }
238 break;
239
240 default:
241 assert(0);
242 input = lp_build_undef(type);
243 break;
244 }
245
246 inputs[chan] = input;
247
248 start = stop;
249 }
250
251 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
252 }
253
254
255 void
256 lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
257 struct lp_type dst_type,
258 LLVMValueRef packed,
259 LLVMValueRef *rgba)
260 {
261 LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
262 unsigned chan;
263
264 packed = LLVMBuildBitCast(builder, packed,
265 lp_build_int_vec_type(dst_type), "");
266
267 /* Decode the input vector components */
268 for (chan = 0; chan < 4; ++chan) {
269 unsigned start = chan*8;
270 unsigned stop = start + 8;
271 LLVMValueRef input;
272
273 input = packed;
274
275 if (start)
276 input = LLVMBuildLShr(builder, input,
277 lp_build_const_int_vec(dst_type, start), "");
278
279 if (stop < 32)
280 input = LLVMBuildAnd(builder, input, mask, "");
281
282 input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
283
284 rgba[chan] = input;
285 }
286 }
287
288
289
290 /**
291 * Fetch a texels from a texture, returning them in SoA layout.
292 *
293 * \param type the desired return type for 'rgba'. The vector length
294 * is the number of texels to fetch
295 *
296 * \param base_ptr points to start of the texture image block. For non-
297 * compressed formats, this simply points to the texel.
298 * For compressed formats, it points to the start of the
299 * compressed data block.
300 *
301 * \param i, j the sub-block pixel coordinates. For non-compressed formats
302 * these will always be (0,0). For compressed formats, i will
303 * be in [0, block_width-1] and j will be in [0, block_height-1].
304 */
305 void
306 lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
307 const struct util_format_description *format_desc,
308 struct lp_type type,
309 LLVMValueRef base_ptr,
310 LLVMValueRef offset,
311 LLVMValueRef i,
312 LLVMValueRef j,
313 LLVMValueRef rgba_out[4])
314 {
315
316 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
317 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
318 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
319 format_desc->block.width == 1 &&
320 format_desc->block.height == 1 &&
321 format_desc->block.bits <= type.width &&
322 (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
323 format_desc->channel[0].size == 32))
324 {
325 /*
326 * The packed pixel fits into an element of the destination format. Put
327 * the packed pixels into a vector and extract each component for all
328 * vector elements in parallel.
329 */
330
331 LLVMValueRef packed;
332
333 /*
334 * gather the texels from the texture
335 * Ex: packed = {BGRA, BGRA, BGRA, BGRA}.
336 */
337 packed = lp_build_gather(builder,
338 type.length,
339 format_desc->block.bits,
340 type.width,
341 base_ptr, offset);
342
343 /*
344 * convert texels to float rgba
345 */
346 lp_build_unpack_rgba_soa(builder,
347 format_desc,
348 type,
349 packed, rgba_out);
350 return;
351 }
352
353 /*
354 * Try calling lp_build_fetch_rgba_aos for all pixels.
355 */
356
357 if (util_format_fits_8unorm(format_desc) &&
358 type.floating && type.width == 32 && type.length == 4) {
359 struct lp_type tmp_type;
360 LLVMValueRef tmp;
361
362 memset(&tmp_type, 0, sizeof tmp_type);
363 tmp_type.width = 8;
364 tmp_type.length = type.length * 4;
365 tmp_type.norm = TRUE;
366
367 tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
368 base_ptr, offset, i, j);
369
370 lp_build_rgba8_to_f32_soa(builder,
371 type,
372 tmp,
373 rgba_out);
374
375 return;
376 }
377
378 /*
379 * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
380 *
381 * This is not the most efficient way of fetching pixels, as we
382 * miss some opportunities to do vectorization, but this is
383 * convenient for formats or scenarios for which there was no
384 * opportunity or incentive to optimize.
385 */
386
387 {
388 unsigned k, chan;
389 struct lp_type tmp_type;
390
391 if (gallivm_debug & GALLIVM_DEBUG_PERF) {
392 debug_printf("%s: scalar unpacking of %s\n",
393 __FUNCTION__, format_desc->short_name);
394 }
395
396 tmp_type = type;
397 tmp_type.length = 4;
398
399 for (chan = 0; chan < 4; ++chan) {
400 rgba_out[chan] = lp_build_undef(type);
401 }
402
403 /* loop over number of pixels */
404 for(k = 0; k < type.length; ++k) {
405 LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
406 LLVMValueRef offset_elem;
407 LLVMValueRef i_elem, j_elem;
408 LLVMValueRef tmp;
409
410 offset_elem = LLVMBuildExtractElement(builder, offset, index, "");
411
412 i_elem = LLVMBuildExtractElement(builder, i, index, "");
413 j_elem = LLVMBuildExtractElement(builder, j, index, "");
414
415 /* Get a single float[4]={R,G,B,A} pixel */
416 tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
417 base_ptr, offset_elem,
418 i_elem, j_elem);
419
420 /*
421 * Insert the AoS tmp value channels into the SoA result vectors at
422 * position = 'index'.
423 */
424 for (chan = 0; chan < 4; ++chan) {
425 LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0),
426 tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
427 rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
428 tmp_chan, index, "");
429 }
430 }
431 }
432 }