anv: Check if shader is present before uploading to cache
src/intel/vulkan/anv_pipeline_cache.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/mesa-sha1.h"
#include "util/debug.h"
#include "anv_private.h"

/* Remaining work:
 *
 * - Compact binding table layout so it's tight and not dependent on
 *   descriptor set layout.
 *
 * - Review prog_data struct for size and cacheability: struct
 *   brw_stage_prog_data has binding_table which uses a lot of uint32_t for
 *   8-bit quantities etc.; param, pull_param, and image_params are pointers,
 *   we just need the compaction map. Use bit fields for all bools, e.g.
 *   dual_src_blend.
 */

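/* The pipeline cache is an open-addressing hash table with linear probing,
 * keyed by the SHA-1 of the shader key, module, entrypoint and
 * specialization constants.  Entries (a struct cache_entry followed by the
 * compiled kernel) live in the cache's program_stream; hash_table[] holds
 * byte offsets into that stream's block pool, with ~0 marking an empty slot.
 */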
void
anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
                        struct anv_device *device)
{
   cache->device = device;
   anv_state_stream_init(&cache->program_stream,
                         &device->instruction_block_pool);
   pthread_mutex_init(&cache->mutex, NULL);

   cache->kernel_count = 0;
   cache->total_size = 0;
   cache->table_size = 1024;
   const size_t byte_size = cache->table_size * sizeof(cache->hash_table[0]);
   cache->hash_table = malloc(byte_size);

   /* We don't consider allocation failure fatal, we just start with a
    * 0-sized cache.
    */
   if (cache->hash_table == NULL)
      cache->table_size = 0;
   else
      memset(cache->hash_table, 0xff, byte_size);
}

void
anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
{
   anv_state_stream_finish(&cache->program_stream);
   pthread_mutex_destroy(&cache->mutex);
   free(cache->hash_table);
}

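/* Layout of a cache entry, both in the program stream and in the
 * VkPipelineCache data blob: the fixed-size header below, immediately
 * followed by prog_data, then the surface and sampler binding maps.  The
 * compiled kernel starts at the next 64-byte-aligned offset after that.
 */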
struct cache_entry {
   unsigned char sha1[20];
   uint32_t prog_data_size;
   uint32_t kernel_size;
   uint32_t surface_count;
   uint32_t sampler_count;
   uint32_t image_count;

   char prog_data[0];

   /* kernel follows prog_data at next 64 byte aligned address */
};

static uint32_t
entry_size(struct cache_entry *entry)
{
   /* This returns the number of bytes needed to serialize an entry, which
    * doesn't include the alignment padding bytes.
    */

   const uint32_t map_size =
      entry->surface_count * sizeof(struct anv_pipeline_binding) +
      entry->sampler_count * sizeof(struct anv_pipeline_binding);

   return sizeof(*entry) + entry->prog_data_size + map_size;
}

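/* Compute the SHA-1 that identifies a compiled shader: the caller-provided
 * compile key, the shader module's SHA-1, the entrypoint name and, when
 * present, the specialization map entries and data all feed into the hash.
 */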
void
anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
                struct anv_shader_module *module,
                const char *entrypoint,
                const VkSpecializationInfo *spec_info)
{
   struct mesa_sha1 *ctx;

   ctx = _mesa_sha1_init();
   _mesa_sha1_update(ctx, key, key_size);
   _mesa_sha1_update(ctx, module->sha1, sizeof(module->sha1));
   _mesa_sha1_update(ctx, entrypoint, strlen(entrypoint));
   /* hash in shader stage, pipeline layout? */
   if (spec_info) {
      _mesa_sha1_update(ctx, spec_info->pMapEntries,
                        spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
      _mesa_sha1_update(ctx, spec_info->pData, spec_info->dataSize);
   }
   _mesa_sha1_final(ctx, hash);
}

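/* Look up a kernel by SHA-1 using linear probing.  On a hit, prog_data and
 * the bind map are pointed at the cached copies and the return value is the
 * offset of the kernel itself (past the 64-byte-aligned entry header) in the
 * program stream; on a miss, NO_KERNEL is returned as soon as an empty slot
 * is found.  The caller must hold cache->mutex.
 */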
static uint32_t
anv_pipeline_cache_search_unlocked(struct anv_pipeline_cache *cache,
                                   const unsigned char *sha1,
                                   const struct brw_stage_prog_data **prog_data,
                                   struct anv_pipeline_bind_map *map)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) sha1);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      const uint32_t offset = cache->hash_table[index];

      if (offset == ~0)
         return NO_KERNEL;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      if (memcmp(entry->sha1, sha1, sizeof(entry->sha1)) == 0) {
         if (prog_data) {
            assert(map);
            void *p = entry->prog_data;
            *prog_data = p;
            p += entry->prog_data_size;
            map->surface_count = entry->surface_count;
            map->sampler_count = entry->sampler_count;
            map->image_count = entry->image_count;
            map->surface_to_descriptor = p;
            p += map->surface_count * sizeof(struct anv_pipeline_binding);
            map->sampler_to_descriptor = p;
         }

         return offset + align_u32(entry_size(entry), 64);
      }
   }

   unreachable("hash table should never be full");
}

uint32_t
anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
                          const unsigned char *sha1,
                          const struct brw_stage_prog_data **prog_data,
                          struct anv_pipeline_bind_map *map)
{
   uint32_t kernel;

   pthread_mutex_lock(&cache->mutex);

   kernel = anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);

   pthread_mutex_unlock(&cache->mutex);

   return kernel;
}

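/* Insert an already-written entry (at entry_offset in the program stream)
 * into the hash table.  The table is kept at most half full, so probing is
 * guaranteed to find a free slot.  The caller must hold cache->mutex.
 */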
static void
anv_pipeline_cache_set_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) entry->sha1);

   /* We'll always be able to insert when we get here. */
   assert(cache->kernel_count < cache->table_size / 2);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      if (cache->hash_table[index] == ~0) {
         cache->hash_table[index] = entry_offset;
         break;
      }
   }

   cache->total_size += entry_size(entry) + entry->kernel_size;
   cache->kernel_count++;
}

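/* Double the hash table and re-insert every live entry.  Only the table of
 * offsets is reallocated; the entries themselves stay where they are in the
 * program stream, so rehashing just resets kernel_count and total_size and
 * lets anv_pipeline_cache_set_entry re-accumulate them.
 */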
static VkResult
anv_pipeline_cache_grow(struct anv_pipeline_cache *cache)
{
   const uint32_t table_size = cache->table_size * 2;
   const uint32_t old_table_size = cache->table_size;
   const size_t byte_size = table_size * sizeof(cache->hash_table[0]);
   uint32_t *table;
   uint32_t *old_table = cache->hash_table;

   table = malloc(byte_size);
   if (table == NULL)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   cache->hash_table = table;
   cache->table_size = table_size;
   cache->kernel_count = 0;
   cache->total_size = 0;

   memset(cache->hash_table, 0xff, byte_size);
   for (uint32_t i = 0; i < old_table_size; i++) {
      const uint32_t offset = old_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      anv_pipeline_cache_set_entry(cache, entry, offset);
   }

   free(old_table);

   return VK_SUCCESS;
}

static void
anv_pipeline_cache_add_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   if (cache->kernel_count == cache->table_size / 2)
      anv_pipeline_cache_grow(cache);

   /* Failing to grow the hash table isn't fatal, but may mean we don't
    * have enough space to add this new kernel. Only add it if there's room.
    */
   if (cache->kernel_count < cache->table_size / 2)
      anv_pipeline_cache_set_entry(cache, entry, entry_offset);
}

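/* Write a freshly compiled kernel (plus its prog_data and bind map) into the
 * program stream and, when a SHA-1 is provided and the cache is enabled,
 * register it in the hash table.  Because compilation happens outside the
 * lock, another thread may have uploaded the same shader in the meantime, so
 * we first re-run the lookup under the mutex and return the existing kernel
 * if it is already present.  Returns the offset of the kernel in the program
 * stream and points *prog_data at the cached copy.
 */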
uint32_t
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
                                 const unsigned char *sha1,
                                 const void *kernel, size_t kernel_size,
                                 const struct brw_stage_prog_data **prog_data,
                                 size_t prog_data_size,
                                 struct anv_pipeline_bind_map *map)
{
   pthread_mutex_lock(&cache->mutex);

   /* Before uploading, check again that another thread didn't upload this
    * shader while we were compiling it.
    */
   if (sha1) {
      uint32_t cached_kernel =
         anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);
      if (cached_kernel != NO_KERNEL) {
         pthread_mutex_unlock(&cache->mutex);
         return cached_kernel;
      }
   }

   struct cache_entry *entry;

   const uint32_t map_size =
      map->surface_count * sizeof(struct anv_pipeline_binding) +
      map->sampler_count * sizeof(struct anv_pipeline_binding);

   const uint32_t preamble_size =
      align_u32(sizeof(*entry) + prog_data_size + map_size, 64);

   const uint32_t size = preamble_size + kernel_size;

   assert(size < cache->program_stream.block_pool->block_size);
   const struct anv_state state =
      anv_state_stream_alloc(&cache->program_stream, size, 64);

   entry = state.map;
   entry->prog_data_size = prog_data_size;
   entry->surface_count = map->surface_count;
   entry->sampler_count = map->sampler_count;
   entry->image_count = map->image_count;
   entry->kernel_size = kernel_size;

   void *p = entry->prog_data;
   memcpy(p, *prog_data, prog_data_size);
   p += prog_data_size;

   memcpy(p, map->surface_to_descriptor,
          map->surface_count * sizeof(struct anv_pipeline_binding));
   map->surface_to_descriptor = p;
   p += map->surface_count * sizeof(struct anv_pipeline_binding);

   memcpy(p, map->sampler_to_descriptor,
          map->sampler_count * sizeof(struct anv_pipeline_binding));
   map->sampler_to_descriptor = p;

   if (sha1 && env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", false)) {
      assert(anv_pipeline_cache_search_unlocked(cache, sha1,
                                                NULL, NULL) == NO_KERNEL);

      memcpy(entry->sha1, sha1, sizeof(entry->sha1));
      anv_pipeline_cache_add_entry(cache, entry, state.offset);
   }

   pthread_mutex_unlock(&cache->mutex);

   memcpy(state.map + preamble_size, kernel, kernel_size);

   if (!cache->device->info.has_llc)
      anv_state_clflush(state);

   *prog_data = (const struct brw_stage_prog_data *) entry->prog_data;

   return state.offset + preamble_size;
}

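/* Header of the blob returned by vkGetPipelineCacheData and accepted by
 * vkCreatePipelineCache, as laid out by the Vulkan spec: header size and
 * version, vendor and device IDs, and the driver's cache UUID.
 */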
struct cache_header {
   uint32_t header_size;
   uint32_t header_version;
   uint32_t vendor_id;
   uint32_t device_id;
   uint8_t uuid[VK_UUID_SIZE];
};

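/* Seed the cache from application-provided initial data.  The header is
 * validated (version, vendor/device ID and cache UUID must match this
 * driver), then each serialized entry is re-uploaded through
 * anv_pipeline_cache_upload_kernel; data that fails validation is silently
 * ignored.
 */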
static void
anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
                        const void *data, size_t size)
{
   struct anv_device *device = cache->device;
   struct cache_header header;
   uint8_t uuid[VK_UUID_SIZE];

   if (size < sizeof(header))
      return;
   memcpy(&header, data, sizeof(header));
   if (header.header_size < sizeof(header))
      return;
   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
      return;
   if (header.vendor_id != 0x8086)
      return;
   if (header.device_id != device->chipset_id)
      return;
   anv_device_get_cache_uuid(uuid);
   if (memcmp(header.uuid, uuid, VK_UUID_SIZE) != 0)
      return;

   void *end = (void *) data + size;
   void *p = (void *) data + header.header_size;

   while (p < end) {
      struct cache_entry *entry = p;

      void *data = entry->prog_data;
      const struct brw_stage_prog_data *prog_data = data;
      data += entry->prog_data_size;

      struct anv_pipeline_binding *surface_to_descriptor = data;
      data += entry->surface_count * sizeof(struct anv_pipeline_binding);
      struct anv_pipeline_binding *sampler_to_descriptor = data;
      data += entry->sampler_count * sizeof(struct anv_pipeline_binding);
      void *kernel = data;

      struct anv_pipeline_bind_map map = {
         .surface_count = entry->surface_count,
         .sampler_count = entry->sampler_count,
         .image_count = entry->image_count,
         .surface_to_descriptor = surface_to_descriptor,
         .sampler_to_descriptor = sampler_to_descriptor
      };

      anv_pipeline_cache_upload_kernel(cache, entry->sha1,
                                       kernel, entry->kernel_size,
                                       &prog_data,
                                       entry->prog_data_size, &map);
      p = kernel + entry->kernel_size;
   }
}

VkResult anv_CreatePipelineCache(
    VkDevice                                    _device,
    const VkPipelineCacheCreateInfo*            pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipelineCache*                            pPipelineCache)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_pipeline_cache *cache;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
   assert(pCreateInfo->flags == 0);

   cache = anv_alloc2(&device->alloc, pAllocator,
                      sizeof(*cache), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cache == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   anv_pipeline_cache_init(cache, device);

   if (pCreateInfo->initialDataSize > 0)
      anv_pipeline_cache_load(cache,
                              pCreateInfo->pInitialData,
                              pCreateInfo->initialDataSize);

   *pPipelineCache = anv_pipeline_cache_to_handle(cache);

   return VK_SUCCESS;
}

void anv_DestroyPipelineCache(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);

   anv_pipeline_cache_finish(cache);

   anv_free2(&device->alloc, pAllocator, cache);
}

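/* Serialize the cache: write the Vulkan cache header followed by each live
 * entry and its kernel, stopping once the caller-provided buffer cannot hold
 * the next complete entry.  With a NULL pData this just reports the size
 * needed for a full dump.
 */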
VkResult anv_GetPipelineCacheData(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    size_t*                                     pDataSize,
    void*                                       pData)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
   struct cache_header *header;

   const size_t size = sizeof(*header) + cache->total_size;

   if (pData == NULL) {
      *pDataSize = size;
      return VK_SUCCESS;
   }

   if (*pDataSize < sizeof(*header)) {
      *pDataSize = 0;
      return VK_INCOMPLETE;
   }

   void *p = pData, *end = pData + *pDataSize;
   header = p;
   header->header_size = sizeof(*header);
   header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
   header->vendor_id = 0x8086;
   header->device_id = device->chipset_id;
   anv_device_get_cache_uuid(header->uuid);
   p += header->header_size;

   struct cache_entry *entry;
   for (uint32_t i = 0; i < cache->table_size; i++) {
      if (cache->hash_table[i] == ~0)
         continue;

      entry = cache->program_stream.block_pool->map + cache->hash_table[i];
      const uint32_t size = entry_size(entry);
      if (end < p + size + entry->kernel_size)
         break;

      memcpy(p, entry, size);
      p += size;

      void *kernel = (void *) entry + align_u32(size, 64);

      memcpy(p, kernel, entry->kernel_size);
      p += entry->kernel_size;
   }

   *pDataSize = p - pData;

   return VK_SUCCESS;
}

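/* Copy every entry from src that dst doesn't already have into dst's hash
 * table.  Only the hash-table offsets are copied; since both caches'
 * program streams allocate from the device's instruction_block_pool, the
 * src entry offsets remain valid when referenced from dst.
 */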
static void
anv_pipeline_cache_merge(struct anv_pipeline_cache *dst,
                         struct anv_pipeline_cache *src)
{
   for (uint32_t i = 0; i < src->table_size; i++) {
      const uint32_t offset = src->hash_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         src->program_stream.block_pool->map + offset;

      if (anv_pipeline_cache_search(dst, entry->sha1, NULL, NULL) != NO_KERNEL)
         continue;

      anv_pipeline_cache_add_entry(dst, entry, offset);
   }
}

VkResult anv_MergePipelineCaches(
    VkDevice                                    _device,
    VkPipelineCache                             destCache,
    uint32_t                                    srcCacheCount,
    const VkPipelineCache*                      pSrcCaches)
{
   ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);

   for (uint32_t i = 0; i < srcCacheCount; i++) {
      ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);

      anv_pipeline_cache_merge(dst, src);
   }

   return VK_SUCCESS;
}