/* src/intel/vulkan/anv_pipeline_cache.c */
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/mesa-sha1.h"
#include "util/debug.h"
#include "anv_private.h"

/* Remaining work:
 *
 * - Compact binding table layout so it's tight and not dependent on
 *   descriptor set layout.
 *
 * - Review prog_data struct for size and cacheability: struct
 *   brw_stage_prog_data has binding_table which uses a lot of uint32_t for
 *   8-bit quantities, etc.; param, pull_param, and image_params are pointers,
 *   we just need the compaction map. Use bit fields for all bools, e.g.
 *   dual_src_blend.
 */

void
anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
                        struct anv_device *device)
{
   cache->device = device;
   anv_state_stream_init(&cache->program_stream,
                         &device->instruction_block_pool);
   pthread_mutex_init(&cache->mutex, NULL);

   cache->kernel_count = 0;
   cache->total_size = 0;
   cache->table_size = 1024;
   const size_t byte_size = cache->table_size * sizeof(cache->hash_table[0]);
   cache->hash_table = malloc(byte_size);

   /* We don't consider allocation failure fatal; we just start with a
    * 0-sized cache.
    */
   if (cache->hash_table == NULL ||
       !env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true))
      cache->table_size = 0;
   else
      memset(cache->hash_table, 0xff, byte_size);
}

void
anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
{
   anv_state_stream_finish(&cache->program_stream);
   pthread_mutex_destroy(&cache->mutex);
   free(cache->hash_table);
}

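/* A cache entry as it lives in the program stream: a SHA-1 key, the sizes of
 * the pieces that follow, the serialized brw_stage_prog_data, and then the
 * surface and sampler binding tables. The kernel itself is stored after the
 * entry, at the next 64-byte-aligned offset.
 */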
struct cache_entry {
   unsigned char sha1[20];
   uint32_t prog_data_size;
   uint32_t kernel_size;
   uint32_t surface_count;
   uint32_t sampler_count;
   uint32_t image_count;

   char prog_data[0];

   /* kernel follows prog_data at next 64-byte-aligned address */
};

static uint32_t
entry_size(struct cache_entry *entry)
{
   /* This returns the number of bytes needed to serialize an entry, which
    * doesn't include the alignment padding bytes.
    */

   const uint32_t map_size =
      entry->surface_count * sizeof(struct anv_pipeline_binding) +
      entry->sampler_count * sizeof(struct anv_pipeline_binding);

   return sizeof(*entry) + entry->prog_data_size + map_size;
}

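/* Compute the SHA-1 used to look a shader up in the cache. The hash covers
 * the compile key, the shader module's own SHA-1, the entrypoint name, and
 * any specialization constants.
 */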
void
anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
                struct anv_shader_module *module,
                const char *entrypoint,
                const VkSpecializationInfo *spec_info)
{
   struct mesa_sha1 *ctx;

   ctx = _mesa_sha1_init();
   _mesa_sha1_update(ctx, key, key_size);
   _mesa_sha1_update(ctx, module->sha1, sizeof(module->sha1));
   _mesa_sha1_update(ctx, entrypoint, strlen(entrypoint));
   /* hash in shader stage, pipeline layout? */
   if (spec_info) {
      _mesa_sha1_update(ctx, spec_info->pMapEntries,
                        spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
      _mesa_sha1_update(ctx, spec_info->pData, spec_info->dataSize);
   }
   _mesa_sha1_final(ctx, hash);
}

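/* Look up a shader by SHA-1 using open addressing with linear probing. The
 * hash table stores byte offsets into the program stream's block pool, with
 * ~0 marking an empty slot. On a hit, *prog_data and the bind map are pointed
 * at the cached copies and the kernel's offset is returned; on a miss this
 * returns NO_KERNEL.
 */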
static uint32_t
anv_pipeline_cache_search_unlocked(struct anv_pipeline_cache *cache,
                                   const unsigned char *sha1,
                                   const struct brw_stage_prog_data **prog_data,
                                   struct anv_pipeline_bind_map *map)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) sha1);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      const uint32_t offset = cache->hash_table[index];

      if (offset == ~0)
         return NO_KERNEL;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      if (memcmp(entry->sha1, sha1, sizeof(entry->sha1)) == 0) {
         if (prog_data) {
            assert(map);
            void *p = entry->prog_data;
            *prog_data = p;
            p += entry->prog_data_size;
            map->surface_count = entry->surface_count;
            map->sampler_count = entry->sampler_count;
            map->image_count = entry->image_count;
            map->surface_to_descriptor = p;
            p += map->surface_count * sizeof(struct anv_pipeline_binding);
            map->sampler_to_descriptor = p;
         }

         return offset + align_u32(entry_size(entry), 64);
      }
   }

   unreachable("hash table should never be full");
}

uint32_t
anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
                          const unsigned char *sha1,
                          const struct brw_stage_prog_data **prog_data,
                          struct anv_pipeline_bind_map *map)
{
   uint32_t kernel;

   pthread_mutex_lock(&cache->mutex);

   kernel = anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);

   pthread_mutex_unlock(&cache->mutex);

   return kernel;
}

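/* Insert an entry that already lives in the program stream into the hash
 * table. The caller must guarantee there is a free slot, i.e. that the table
 * is less than half full.
 */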
static void
anv_pipeline_cache_set_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) entry->sha1);

   /* We'll always be able to insert when we get here. */
   assert(cache->kernel_count < cache->table_size / 2);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      if (cache->hash_table[index] == ~0) {
         cache->hash_table[index] = entry_offset;
         break;
      }
   }

   cache->total_size += entry_size(entry) + entry->kernel_size;
   cache->kernel_count++;
}

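/* Double the hash table and re-insert every live offset. The entries and
 * kernels themselves stay where they are in the program stream; only the
 * table of offsets is reallocated.
 */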
static VkResult
anv_pipeline_cache_grow(struct anv_pipeline_cache *cache)
{
   const uint32_t table_size = cache->table_size * 2;
   const uint32_t old_table_size = cache->table_size;
   const size_t byte_size = table_size * sizeof(cache->hash_table[0]);
   uint32_t *table;
   uint32_t *old_table = cache->hash_table;

   table = malloc(byte_size);
   if (table == NULL)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   cache->hash_table = table;
   cache->table_size = table_size;
   cache->kernel_count = 0;
   cache->total_size = 0;

   memset(cache->hash_table, 0xff, byte_size);
   for (uint32_t i = 0; i < old_table_size; i++) {
      const uint32_t offset = old_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      anv_pipeline_cache_set_entry(cache, entry, offset);
   }

   free(old_table);

   return VK_SUCCESS;
}

static void
anv_pipeline_cache_add_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   if (cache->kernel_count == cache->table_size / 2)
      anv_pipeline_cache_grow(cache);

   /* Failing to grow the hash table isn't fatal, but it may mean we don't
    * have enough space to add this new kernel. Only add it if there's room.
    */
   if (cache->kernel_count < cache->table_size / 2)
      anv_pipeline_cache_set_entry(cache, entry, entry_offset);
}

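/* Copy a freshly compiled kernel into the program stream and, if a SHA-1 is
 * given, register it in the hash table. The allocation is laid out as a
 * cache_entry header, the prog_data, the surface and sampler binding tables,
 * padding up to a 64-byte boundary, and finally the kernel itself. On
 * return, *prog_data and the bind map point at the cached copies, and the
 * kernel's offset in the program stream is returned. If another thread
 * already uploaded the same shader, its existing kernel is returned instead.
 */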
uint32_t
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
                                 const unsigned char *sha1,
                                 const void *kernel, size_t kernel_size,
                                 const struct brw_stage_prog_data **prog_data,
                                 size_t prog_data_size,
                                 struct anv_pipeline_bind_map *map)
{
   pthread_mutex_lock(&cache->mutex);

   /* Before uploading, check again that another thread didn't upload this
    * shader while we were compiling it.
    */
   if (sha1) {
      uint32_t cached_kernel =
         anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);
      if (cached_kernel != NO_KERNEL) {
         pthread_mutex_unlock(&cache->mutex);
         return cached_kernel;
      }
   }

   struct cache_entry *entry;

   const uint32_t map_size =
      map->surface_count * sizeof(struct anv_pipeline_binding) +
      map->sampler_count * sizeof(struct anv_pipeline_binding);

   const uint32_t preamble_size =
      align_u32(sizeof(*entry) + prog_data_size + map_size, 64);

   const uint32_t size = preamble_size + kernel_size;

   assert(size < cache->program_stream.block_pool->block_size);
   const struct anv_state state =
      anv_state_stream_alloc(&cache->program_stream, size, 64);

   entry = state.map;
   entry->prog_data_size = prog_data_size;
   entry->surface_count = map->surface_count;
   entry->sampler_count = map->sampler_count;
   entry->image_count = map->image_count;
   entry->kernel_size = kernel_size;

   void *p = entry->prog_data;
   memcpy(p, *prog_data, prog_data_size);
   p += prog_data_size;

   memcpy(p, map->surface_to_descriptor,
          map->surface_count * sizeof(struct anv_pipeline_binding));
   map->surface_to_descriptor = p;
   p += map->surface_count * sizeof(struct anv_pipeline_binding);

   memcpy(p, map->sampler_to_descriptor,
          map->sampler_count * sizeof(struct anv_pipeline_binding));
   map->sampler_to_descriptor = p;

   if (sha1) {
      assert(anv_pipeline_cache_search_unlocked(cache, sha1,
                                                NULL, NULL) == NO_KERNEL);

      memcpy(entry->sha1, sha1, sizeof(entry->sha1));
      anv_pipeline_cache_add_entry(cache, entry, state.offset);
   }

   pthread_mutex_unlock(&cache->mutex);

   memcpy(state.map + preamble_size, kernel, kernel_size);

   if (!cache->device->info.has_llc)
      anv_state_clflush(state);

   *prog_data = (const struct brw_stage_prog_data *) entry->prog_data;

   return state.offset + preamble_size;
}

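/* Header for the data returned by vkGetPipelineCacheData, in the layout the
 * Vulkan spec requires: header size and version, vendor and device IDs, and
 * the driver's cache UUID.
 */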
struct cache_header {
   uint32_t header_size;
   uint32_t header_version;
   uint32_t vendor_id;
   uint32_t device_id;
   uint8_t uuid[VK_UUID_SIZE];
};

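/* Populate the cache from a blob previously produced by
 * anv_GetPipelineCacheData. The header is validated against this device and
 * driver build, then each serialized entry (entry header, prog_data, binding
 * tables, kernel) is replayed through anv_pipeline_cache_upload_kernel.
 */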
static void
anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
                        const void *data, size_t size)
{
   struct anv_device *device = cache->device;
   struct cache_header header;
   uint8_t uuid[VK_UUID_SIZE];

   if (size < sizeof(header))
      return;
   memcpy(&header, data, sizeof(header));
   if (header.header_size < sizeof(header))
      return;
   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
      return;
   if (header.vendor_id != 0x8086)
      return;
   if (header.device_id != device->chipset_id)
      return;
   anv_device_get_cache_uuid(uuid);
   if (memcmp(header.uuid, uuid, VK_UUID_SIZE) != 0)
      return;

   void *end = (void *) data + size;
   void *p = (void *) data + header.header_size;

   while (p < end) {
      struct cache_entry *entry = p;

      void *data = entry->prog_data;
      const struct brw_stage_prog_data *prog_data = data;
      data += entry->prog_data_size;

      struct anv_pipeline_binding *surface_to_descriptor = data;
      data += entry->surface_count * sizeof(struct anv_pipeline_binding);
      struct anv_pipeline_binding *sampler_to_descriptor = data;
      data += entry->sampler_count * sizeof(struct anv_pipeline_binding);
      void *kernel = data;

      struct anv_pipeline_bind_map map = {
         .surface_count = entry->surface_count,
         .sampler_count = entry->sampler_count,
         .image_count = entry->image_count,
         .surface_to_descriptor = surface_to_descriptor,
         .sampler_to_descriptor = sampler_to_descriptor
      };

      anv_pipeline_cache_upload_kernel(cache, entry->sha1,
                                       kernel, entry->kernel_size,
                                       &prog_data,
                                       entry->prog_data_size, &map);
      p = kernel + entry->kernel_size;
   }
}

VkResult anv_CreatePipelineCache(
    VkDevice                                    _device,
    const VkPipelineCacheCreateInfo*            pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipelineCache*                            pPipelineCache)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_pipeline_cache *cache;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
   assert(pCreateInfo->flags == 0);

   cache = anv_alloc2(&device->alloc, pAllocator,
                      sizeof(*cache), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cache == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   anv_pipeline_cache_init(cache, device);

   if (pCreateInfo->initialDataSize > 0)
      anv_pipeline_cache_load(cache,
                              pCreateInfo->pInitialData,
                              pCreateInfo->initialDataSize);

   *pPipelineCache = anv_pipeline_cache_to_handle(cache);

   return VK_SUCCESS;
}

void anv_DestroyPipelineCache(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);

   anv_pipeline_cache_finish(cache);

   anv_free2(&device->alloc, pAllocator, cache);
}

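/* Serialize the cache: a cache_header followed by, for every live entry, the
 * unpadded entry (header, prog_data, binding tables) and then its kernel.
 * When pData is NULL only the required size is reported; otherwise the loop
 * stops at the first entry that does not fit in the caller's buffer.
 */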
VkResult anv_GetPipelineCacheData(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    size_t*                                     pDataSize,
    void*                                       pData)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
   struct cache_header *header;

   const size_t size = sizeof(*header) + cache->total_size;

   if (pData == NULL) {
      *pDataSize = size;
      return VK_SUCCESS;
   }

   if (*pDataSize < sizeof(*header)) {
      *pDataSize = 0;
      return VK_INCOMPLETE;
   }

   void *p = pData, *end = pData + *pDataSize;
   header = p;
   header->header_size = sizeof(*header);
   header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
   header->vendor_id = 0x8086;
   header->device_id = device->chipset_id;
   anv_device_get_cache_uuid(header->uuid);
   p += header->header_size;

   struct cache_entry *entry;
   for (uint32_t i = 0; i < cache->table_size; i++) {
      if (cache->hash_table[i] == ~0)
         continue;

      entry = cache->program_stream.block_pool->map + cache->hash_table[i];
      const uint32_t size = entry_size(entry);
      if (end < p + size + entry->kernel_size)
         break;

      memcpy(p, entry, size);
      p += size;

      void *kernel = (void *) entry + align_u32(size, 64);

      memcpy(p, kernel, entry->kernel_size);
      p += entry->kernel_size;
   }

   *pDataSize = p - pData;

   return VK_SUCCESS;
}

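/* Copy the hash table references from src into dst. Both caches stream from
 * the same device-wide instruction block pool, so only the offsets need to
 * be inserted; the entries and kernels themselves are not copied.
 */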
static void
anv_pipeline_cache_merge(struct anv_pipeline_cache *dst,
                         struct anv_pipeline_cache *src)
{
   for (uint32_t i = 0; i < src->table_size; i++) {
      const uint32_t offset = src->hash_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         src->program_stream.block_pool->map + offset;

      if (anv_pipeline_cache_search(dst, entry->sha1, NULL, NULL) != NO_KERNEL)
         continue;

      anv_pipeline_cache_add_entry(dst, entry, offset);
   }
}

VkResult anv_MergePipelineCaches(
    VkDevice                                    _device,
    VkPipelineCache                             destCache,
    uint32_t                                    srcCacheCount,
    const VkPipelineCache*                      pSrcCaches)
{
   ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);

   for (uint32_t i = 0; i < srcCacheCount; i++) {
      ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);

      anv_pipeline_cache_merge(dst, src);
   }

   return VK_SUCCESS;
}