gallivm: optimize yuv decoding
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_yuv.c
1 /**************************************************************************
2 *
3 * Copyright 2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * YUV pixel format manipulation.
32 *
33 * @author Jose Fonseca <jfonseca@vmware.com>
34 */
35
36
37 #include "util/u_format.h"
38 #include "util/u_cpu_detect.h"
39
40 #include "lp_bld_arit.h"
41 #include "lp_bld_type.h"
42 #include "lp_bld_const.h"
43 #include "lp_bld_conv.h"
44 #include "lp_bld_gather.h"
45 #include "lp_bld_format.h"
46 #include "lp_bld_logic.h"
47
48 /**
49 * Extract Y, U, V channels from packed UYVY.
50 * @param packed is a <n x i32> vector with the packed UYVY blocks
51 * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1)
52 */
53 static void
54 uyvy_to_yuv_soa(LLVMBuilderRef builder,
55 unsigned n,
56 LLVMValueRef packed,
57 LLVMValueRef i,
58 LLVMValueRef *y,
59 LLVMValueRef *u,
60 LLVMValueRef *v)
61 {
62 struct lp_type type;
63 LLVMValueRef mask;
64
65 memset(&type, 0, sizeof type);
66 type.width = 32;
67 type.length = n;
68
69 assert(lp_check_value(type, packed));
70 assert(lp_check_value(type, i));
71
72 /*
73 * y = (uyvy >> (16*i + 8)) & 0xff
74 * u = (uyvy ) & 0xff
75 * v = (uyvy >> 16 ) & 0xff
76 */
77
78 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
79 /*
80 * Avoid shift with per-element count.
81 * No support on x86, gets translated to roughly 5 instructions
82 * per element. Didn't measure performance but cuts shader size
83 * by quite a bit (less difference if cpu has no sse4.1 support).
84 */
85 if (util_cpu_caps.has_sse2 && n == 4) {
86 LLVMValueRef sel, tmp, tmp2;
87 struct lp_build_context bld32;
88
89 lp_build_context_init(&bld32, builder, type);
90
91 tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
92 tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(type, 16), "");
93 sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0));
94 *y = lp_build_select(&bld32, sel, tmp, tmp2);
95 } else
96 #endif
97 {
98 LLVMValueRef shift;
99 shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
100 shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
101 *y = LLVMBuildLShr(builder, packed, shift, "");
102 }
103
104 *u = packed;
105 *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
106
107 mask = lp_build_const_int_vec(type, 0xff);
108
109 *y = LLVMBuildAnd(builder, *y, mask, "y");
110 *u = LLVMBuildAnd(builder, *u, mask, "u");
111 *v = LLVMBuildAnd(builder, *v, mask, "v");
112 }
113
114
115 /**
116 * Extract Y, U, V channels from packed YUYV.
117 * @param packed is a <n x i32> vector with the packed YUYV blocks
118 * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1)
119 */
120 static void
121 yuyv_to_yuv_soa(LLVMBuilderRef builder,
122 unsigned n,
123 LLVMValueRef packed,
124 LLVMValueRef i,
125 LLVMValueRef *y,
126 LLVMValueRef *u,
127 LLVMValueRef *v)
128 {
129 struct lp_type type;
130 LLVMValueRef mask;
131
132 memset(&type, 0, sizeof type);
133 type.width = 32;
134 type.length = n;
135
136 assert(lp_check_value(type, packed));
137 assert(lp_check_value(type, i));
138
139 /*
140 * y = (yuyv >> 16*i) & 0xff
141 * u = (yuyv >> 8 ) & 0xff
142 * v = (yuyv >> 24 ) & 0xff
143 */
144
145 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
146 /*
147 * Avoid shift with per-element count.
148 * No support on x86, gets translated to roughly 5 instructions
149 * per element. Didn't measure performance but cuts shader size
150 * by quite a bit (less difference if cpu has no sse4.1 support).
151 */
152 if (util_cpu_caps.has_sse2 && n == 4) {
153 LLVMValueRef sel, tmp;
154 struct lp_build_context bld32;
155
156 lp_build_context_init(&bld32, builder, type);
157
158 tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
159 sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0));
160 *y = lp_build_select(&bld32, sel, packed, tmp);
161 } else
162 #endif
163 {
164 LLVMValueRef shift;
165 shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
166 *y = LLVMBuildLShr(builder, packed, shift, "");
167 }
168
169 *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
170 *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), "");
171
172 mask = lp_build_const_int_vec(type, 0xff);
173
174 *y = LLVMBuildAnd(builder, *y, mask, "y");
175 *u = LLVMBuildAnd(builder, *u, mask, "u");
176 *v = LLVMBuildAnd(builder, *v, mask, "v");
177 }
178
179
180 static INLINE void
181 yuv_to_rgb_soa(LLVMBuilderRef builder,
182 unsigned n,
183 LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
184 LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
185 {
186 struct lp_type type;
187 struct lp_build_context bld;
188
189 LLVMValueRef c0;
190 LLVMValueRef c8;
191 LLVMValueRef c16;
192 LLVMValueRef c128;
193 LLVMValueRef c255;
194
195 LLVMValueRef cy;
196 LLVMValueRef cug;
197 LLVMValueRef cub;
198 LLVMValueRef cvr;
199 LLVMValueRef cvg;
200
201 memset(&type, 0, sizeof type);
202 type.sign = TRUE;
203 type.width = 32;
204 type.length = n;
205
206 lp_build_context_init(&bld, builder, type);
207
208 assert(lp_check_value(type, y));
209 assert(lp_check_value(type, u));
210 assert(lp_check_value(type, v));
211
212 /*
213 * Constants
214 */
215
216 c0 = lp_build_const_int_vec(type, 0);
217 c8 = lp_build_const_int_vec(type, 8);
218 c16 = lp_build_const_int_vec(type, 16);
219 c128 = lp_build_const_int_vec(type, 128);
220 c255 = lp_build_const_int_vec(type, 255);
221
222 cy = lp_build_const_int_vec(type, 298);
223 cug = lp_build_const_int_vec(type, -100);
224 cub = lp_build_const_int_vec(type, 516);
225 cvr = lp_build_const_int_vec(type, 409);
226 cvg = lp_build_const_int_vec(type, -208);
227
228 /*
229 * y -= 16;
230 * u -= 128;
231 * v -= 128;
232 */
233
234 y = LLVMBuildSub(builder, y, c16, "");
235 u = LLVMBuildSub(builder, u, c128, "");
236 v = LLVMBuildSub(builder, v, c128, "");
237
238 /*
239 * r = 298 * _y + 409 * _v + 128;
240 * g = 298 * _y - 100 * _u - 208 * _v + 128;
241 * b = 298 * _y + 516 * _u + 128;
242 */
243
244 y = LLVMBuildMul(builder, y, cy, "");
245 y = LLVMBuildAdd(builder, y, c128, "");
246
247 *r = LLVMBuildMul(builder, v, cvr, "");
248 *g = LLVMBuildAdd(builder,
249 LLVMBuildMul(builder, u, cug, ""),
250 LLVMBuildMul(builder, v, cvg, ""),
251 "");
252 *b = LLVMBuildMul(builder, u, cub, "");
253
254 *r = LLVMBuildAdd(builder, *r, y, "");
255 *g = LLVMBuildAdd(builder, *g, y, "");
256 *b = LLVMBuildAdd(builder, *b, y, "");
257
258 /*
259 * r >>= 8;
260 * g >>= 8;
261 * b >>= 8;
262 */
263
264 *r = LLVMBuildAShr(builder, *r, c8, "r");
265 *g = LLVMBuildAShr(builder, *g, c8, "g");
266 *b = LLVMBuildAShr(builder, *b, c8, "b");
267
268 /*
269 * Clamp
270 */
271
272 *r = lp_build_clamp(&bld, *r, c0, c255);
273 *g = lp_build_clamp(&bld, *g, c0, c255);
274 *b = lp_build_clamp(&bld, *b, c0, c255);
275 }
276
277
278 static LLVMValueRef
279 rgb_to_rgba_aos(LLVMBuilderRef builder,
280 unsigned n,
281 LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
282 {
283 struct lp_type type;
284 LLVMValueRef a;
285 LLVMValueRef rgba;
286
287 memset(&type, 0, sizeof type);
288 type.sign = TRUE;
289 type.width = 32;
290 type.length = n;
291
292 assert(lp_check_value(type, r));
293 assert(lp_check_value(type, g));
294 assert(lp_check_value(type, b));
295
296 /*
297 * Make a 4 x unorm8 vector
298 */
299
300 r = r;
301 g = LLVMBuildShl(builder, g, lp_build_const_int_vec(type, 8), "");
302 b = LLVMBuildShl(builder, b, lp_build_const_int_vec(type, 16), "");
303 a = lp_build_const_int_vec(type, 0xff000000);
304
305 rgba = r;
306 rgba = LLVMBuildOr(builder, rgba, g, "");
307 rgba = LLVMBuildOr(builder, rgba, b, "");
308 rgba = LLVMBuildOr(builder, rgba, a, "");
309
310 rgba = LLVMBuildBitCast(builder, rgba,
311 LLVMVectorType(LLVMInt8Type(), 4*n), "");
312
313 return rgba;
314 }
315
316
317 /**
318 * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS
319 */
320 static LLVMValueRef
321 uyvy_to_rgba_aos(LLVMBuilderRef builder,
322 unsigned n,
323 LLVMValueRef packed,
324 LLVMValueRef i)
325 {
326 LLVMValueRef y, u, v;
327 LLVMValueRef r, g, b;
328 LLVMValueRef rgba;
329
330 uyvy_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
331 yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
332 rgba = rgb_to_rgba_aos(builder, n, r, g, b);
333
334 return rgba;
335 }
336
337
338 /**
339 * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS
340 */
341 static LLVMValueRef
342 yuyv_to_rgba_aos(LLVMBuilderRef builder,
343 unsigned n,
344 LLVMValueRef packed,
345 LLVMValueRef i)
346 {
347 LLVMValueRef y, u, v;
348 LLVMValueRef r, g, b;
349 LLVMValueRef rgba;
350
351 yuyv_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
352 yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
353 rgba = rgb_to_rgba_aos(builder, n, r, g, b);
354
355 return rgba;
356 }
357
358
359 /**
360 * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS
361 */
362 static LLVMValueRef
363 rgbg_to_rgba_aos(LLVMBuilderRef builder,
364 unsigned n,
365 LLVMValueRef packed,
366 LLVMValueRef i)
367 {
368 LLVMValueRef r, g, b;
369 LLVMValueRef rgba;
370
371 uyvy_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
372 rgba = rgb_to_rgba_aos(builder, n, r, g, b);
373
374 return rgba;
375 }
376
377
378 /**
379 * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS
380 */
381 static LLVMValueRef
382 grgb_to_rgba_aos(LLVMBuilderRef builder,
383 unsigned n,
384 LLVMValueRef packed,
385 LLVMValueRef i)
386 {
387 LLVMValueRef r, g, b;
388 LLVMValueRef rgba;
389
390 yuyv_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
391 rgba = rgb_to_rgba_aos(builder, n, r, g, b);
392
393 return rgba;
394 }
395
396
397 /**
398 * @param n is the number of pixels processed
399 * @param packed is a <n x i32> vector with the packed YUYV blocks
400 * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1)
401 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
402 */
403 LLVMValueRef
404 lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder,
405 const struct util_format_description *format_desc,
406 unsigned n,
407 LLVMValueRef base_ptr,
408 LLVMValueRef offset,
409 LLVMValueRef i,
410 LLVMValueRef j)
411 {
412 LLVMValueRef packed;
413 LLVMValueRef rgba;
414
415 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
416 assert(format_desc->block.bits == 32);
417 assert(format_desc->block.width == 2);
418 assert(format_desc->block.height == 1);
419
420 packed = lp_build_gather(builder, n, 32, 32, base_ptr, offset);
421
422 (void)j;
423
424 switch (format_desc->format) {
425 case PIPE_FORMAT_UYVY:
426 rgba = uyvy_to_rgba_aos(builder, n, packed, i);
427 break;
428 case PIPE_FORMAT_YUYV:
429 rgba = yuyv_to_rgba_aos(builder, n, packed, i);
430 break;
431 case PIPE_FORMAT_R8G8_B8G8_UNORM:
432 rgba = rgbg_to_rgba_aos(builder, n, packed, i);
433 break;
434 case PIPE_FORMAT_G8R8_G8B8_UNORM:
435 rgba = grgb_to_rgba_aos(builder, n, packed, i);
436 break;
437 default:
438 assert(0);
439 rgba = LLVMGetUndef(LLVMVectorType(LLVMInt8Type(), 4*n));
440 break;
441 }
442
443 return rgba;
444 }
445