mesa/gallium: Move u_bit_scan{,64} from gallium to util.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_cached.c
1 /**************************************************************************
2 *
3 * Copyright 2015 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "lp_bld_format.h"
29 #include "lp_bld_type.h"
30 #include "lp_bld_struct.h"
31 #include "lp_bld_const.h"
32 #include "lp_bld_flow.h"
33 #include "lp_bld_swizzle.h"
34
35 #include "util/u_math.h"
36
37
38 /**
39 * @file
40 * Complex block-compression based formats are handled here by using a cache,
41 * so re-decoding of every pixel is not required.
42 * Especially for bilinear filtering, texel reuse is very high hence even
43 * a small cache helps.
44 * The elements in the cache are the decoded blocks - currently things
45 * are restricted to formats which are 4x4 block based, and the decoded
46 * texels must fit into 4x8 bits.
47 * The cache is direct mapped so hitrates aren't all that great and cache
48 * thrashing could happen.
49 *
50 * @author Roland Scheidegger <sroland@vmware.com>
51 */
52
53
54 #if LP_BUILD_FORMAT_CACHE_DEBUG
55 static void
56 update_cache_access(struct gallivm_state *gallivm,
57 LLVMValueRef ptr,
58 unsigned count,
59 unsigned index)
60 {
61 LLVMBuilderRef builder = gallivm->builder;
62 LLVMValueRef member_ptr, cache_access;
63
64 assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
65 index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
66
67 member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
68 cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
69 cache_access = LLVMBuildAdd(builder, cache_access,
70 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
71 count, 0), "");
72 LLVMBuildStore(builder, cache_access, member_ptr);
73 }
74 #endif
75
76
77 static void
78 store_cached_block(struct gallivm_state *gallivm,
79 LLVMValueRef *col,
80 LLVMValueRef tag_value,
81 LLVMValueRef hash_index,
82 LLVMValueRef cache)
83 {
84 LLVMBuilderRef builder = gallivm->builder;
85 LLVMValueRef ptr, indices[3];
86 LLVMTypeRef type_ptr4x32;
87 unsigned count;
88
89 type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
90 indices[0] = lp_build_const_int32(gallivm, 0);
91 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
92 indices[2] = hash_index;
93 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
94 LLVMBuildStore(builder, tag_value, ptr);
95
96 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
97 hash_index = LLVMBuildMul(builder, hash_index,
98 lp_build_const_int32(gallivm, 16), "");
99 for (count = 0; count < 4; count++) {
100 indices[2] = hash_index;
101 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
102 ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
103 LLVMBuildStore(builder, col[count], ptr);
104 hash_index = LLVMBuildAdd(builder, hash_index,
105 lp_build_const_int32(gallivm, 4), "");
106 }
107 }
108
109
110 static LLVMValueRef
111 lookup_cached_pixel(struct gallivm_state *gallivm,
112 LLVMValueRef ptr,
113 LLVMValueRef index)
114 {
115 LLVMBuilderRef builder = gallivm->builder;
116 LLVMValueRef member_ptr, indices[3];
117
118 indices[0] = lp_build_const_int32(gallivm, 0);
119 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
120 indices[2] = index;
121 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
122 return LLVMBuildLoad(builder, member_ptr, "cache_data");
123 }
124
125
126 static LLVMValueRef
127 lookup_tag_data(struct gallivm_state *gallivm,
128 LLVMValueRef ptr,
129 LLVMValueRef index)
130 {
131 LLVMBuilderRef builder = gallivm->builder;
132 LLVMValueRef member_ptr, indices[3];
133
134 indices[0] = lp_build_const_int32(gallivm, 0);
135 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
136 indices[2] = index;
137 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
138 return LLVMBuildLoad(builder, member_ptr, "tag_data");
139 }
140
141
142 static void
143 update_cached_block(struct gallivm_state *gallivm,
144 const struct util_format_description *format_desc,
145 LLVMValueRef ptr_addr,
146 LLVMValueRef hash_index,
147 LLVMValueRef cache)
148
149 {
150 LLVMBuilderRef builder = gallivm->builder;
151 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
152 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
153 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
154 LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
155 LLVMValueRef function;
156 LLVMValueRef tag_value, tmp_ptr;
157 LLVMValueRef col[4];
158 unsigned i, j;
159
160 /*
161 * Use format_desc->fetch_rgba_8unorm() for each pixel in the block.
162 * This doesn't actually make any sense whatsoever, someone would need
163 * to write a function doing this for all pixels in a block (either as
164 * an external c function or with generated code). Don't ask.
165 */
166
167 {
168 /*
169 * Function to call looks like:
170 * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
171 */
172 LLVMTypeRef ret_type;
173 LLVMTypeRef arg_types[4];
174 LLVMTypeRef function_type;
175
176 assert(format_desc->fetch_rgba_8unorm);
177
178 ret_type = LLVMVoidTypeInContext(gallivm->context);
179 arg_types[0] = pi8t;
180 arg_types[1] = pi8t;
181 arg_types[2] = i32t;
182 arg_types[3] = i32t;
183 function_type = LLVMFunctionType(ret_type, arg_types,
184 ARRAY_SIZE(arg_types), 0);
185
186 /* make const pointer for the C fetch_rgba_8unorm function */
187 function = lp_build_const_int_pointer(gallivm,
188 func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
189
190 /* cast the callee pointer to the function's type */
191 function = LLVMBuildBitCast(builder, function,
192 LLVMPointerType(function_type, 0),
193 "cast callee");
194 }
195
196 tmp_ptr = lp_build_array_alloca(gallivm, i32x4,
197 lp_build_const_int32(gallivm, 16),
198 "tmp_decode_store");
199 tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
200
201 /*
202 * Invoke format_desc->fetch_rgba_8unorm() for each pixel.
203 * This is going to be really really slow.
204 * Note: the block store format is actually
205 * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ...
206 */
207 for (i = 0; i < 4; ++i) {
208 for (j = 0; j < 4; ++j) {
209 LLVMValueRef args[4];
210 LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4);
211
212 /*
213 * Note we actually supply a pointer to the start of the block,
214 * not the start of the texture.
215 */
216 args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, "");
217 args[1] = ptr_addr;
218 args[2] = LLVMConstInt(i32t, i, 0);
219 args[3] = LLVMConstInt(i32t, j, 0);
220 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
221 }
222 }
223
224 /* Finally store the block - pointless mem copy + update tag. */
225 tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), "");
226 for (i = 0; i < 4; ++i) {
227 LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i);
228 LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, "");
229 col[i] = LLVMBuildLoad(builder, ptr, "");
230 }
231
232 tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
233 LLVMInt64TypeInContext(gallivm->context), "");
234 store_cached_block(gallivm, col, tag_value, hash_index, cache);
235 }
236
237
238 /*
239 * Do a cached lookup.
240 *
241 * Returns (vectors of) 4x8 rgba aos value
242 */
243 LLVMValueRef
244 lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
245 const struct util_format_description *format_desc,
246 unsigned n,
247 LLVMValueRef base_ptr,
248 LLVMValueRef offset,
249 LLVMValueRef i,
250 LLVMValueRef j,
251 LLVMValueRef cache)
252
253 {
254 LLVMBuilderRef builder = gallivm->builder;
255 unsigned count, low_bit, log2size;
256 LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
257 LLVMValueRef ij_index, hash_index, hash_mask, block_index;
258 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
259 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
260 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
261 struct lp_type type;
262 struct lp_build_context bld32;
263 memset(&type, 0, sizeof type);
264 type.width = 32;
265 type.length = n;
266
267 assert(format_desc->block.width == 4);
268 assert(format_desc->block.height == 4);
269
270 lp_build_context_init(&bld32, gallivm, type);
271
272 /*
273 * compute hash - we use direct mapped cache, the hash function could
274 * be better but it needs to be simple
275 * per-element:
276 * compare offset with offset stored at tag (hash)
277 * if not equal decode/store block, update tag
278 * extract color from cache
279 * assemble result vector
280 */
281
282 /* TODO: not ideal with 32bit pointers... */
283
284 low_bit = util_logbase2(format_desc->block.bits / 8);
285 log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
286 addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
287 ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
288 ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
289 /* For the hash function, first mask off the unused lowest bits. Then just
290 do some xor with address bits - only use lower 32bits */
291 ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
292 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
293 lp_build_const_int_vec(gallivm, type, low_bit), "");
294 /* This only really makes sense for size 64,128,256 */
295 hash_index = ptr_addrtrunc;
296 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
297 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
298 hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
299 tmp = LLVMBuildLShr(builder, hash_index,
300 lp_build_const_int_vec(gallivm, type, log2size), "");
301 hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
302
303 hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
304 hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
305 ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
306 ij_index = LLVMBuildAdd(builder, ij_index, j, "");
307 block_index = LLVMBuildShl(builder, hash_index,
308 lp_build_const_int_vec(gallivm, type, 4), "");
309 block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
310
311 if (n > 1) {
312 color = LLVMGetUndef(LLVMVectorType(i32t, n));
313 for (count = 0; count < n; count++) {
314 LLVMValueRef index, cond, colorx;
315 LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
316 struct lp_build_if_state if_ctx;
317
318 index = lp_build_const_int32(gallivm, count);
319 offsetx = LLVMBuildExtractElement(builder, offset, index, "");
320 addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
321 addrx = LLVMBuildAdd(builder, addrx, addr, "");
322 block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
323 hash_indexx = LLVMBuildLShr(builder, block_indexx,
324 lp_build_const_int32(gallivm, 4), "");
325 offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
326 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
327
328 lp_build_if(&if_ctx, gallivm, cond);
329 {
330 ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
331 LLVMPointerType(i8t, 0), "");
332 update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
333 #if LP_BUILD_FORMAT_CACHE_DEBUG
334 update_cache_access(gallivm, cache, 1,
335 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
336 #endif
337 }
338 lp_build_endif(&if_ctx);
339
340 colorx = lookup_cached_pixel(gallivm, cache, block_indexx);
341
342 color = LLVMBuildInsertElement(builder, color, colorx,
343 lp_build_const_int32(gallivm, count), "");
344 }
345 }
346 else {
347 LLVMValueRef cond;
348 struct lp_build_if_state if_ctx;
349
350 tmp = LLVMBuildZExt(builder, offset, i64t, "");
351 addr = LLVMBuildAdd(builder, tmp, addr, "");
352 offset_stored = lookup_tag_data(gallivm, cache, hash_index);
353 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
354
355 lp_build_if(&if_ctx, gallivm, cond);
356 {
357 tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
358 update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
359 #if LP_BUILD_FORMAT_CACHE_DEBUG
360 update_cache_access(gallivm, cache, 1,
361 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
362 #endif
363 }
364 lp_build_endif(&if_ctx);
365
366 color = lookup_cached_pixel(gallivm, cache, block_index);
367 }
368 #if LP_BUILD_FORMAT_CACHE_DEBUG
369 update_cache_access(gallivm, cache, n,
370 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
371 #endif
372 return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
373 }
374