i965: Fix asynchronous mappings on !LLC platforms.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_bufmgr.c
1 /**************************************************************************
2 *
3 * Copyright © 2007 Red Hat Inc.
4 * Copyright © 2007-2012 Intel Corporation
5 * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
6 * All Rights Reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sub license, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * The above copyright notice and this permission notice (including the
25 * next paragraph) shall be included in all copies or substantial portions
26 * of the Software.
27 *
28 *
29 **************************************************************************/
30 /*
31 * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
32 * Keith Whitwell <keithw-at-tungstengraphics-dot-com>
33 * Eric Anholt <eric@anholt.net>
34 * Dave Airlie <airlied@linux.ie>
35 */
36
37 #ifdef HAVE_CONFIG_H
38 #include "config.h"
39 #endif
40
41 #include <xf86drm.h>
42 #include <util/u_atomic.h>
43 #include <fcntl.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <assert.h>
49 #include <pthread.h>
50 #include <sys/ioctl.h>
51 #include <sys/stat.h>
52 #include <sys/types.h>
53 #include <stdbool.h>
54
55 #include "errno.h"
56 #ifndef ETIME
57 #define ETIME ETIMEDOUT
58 #endif
59 #include "common/gen_clflush.h"
60 #include "common/gen_debug.h"
61 #include "common/gen_device_info.h"
62 #include "libdrm_macros.h"
63 #include "main/macros.h"
64 #include "util/macros.h"
65 #include "util/hash_table.h"
66 #include "util/list.h"
67 #include "brw_bufmgr.h"
68 #include "brw_context.h"
69 #include "string.h"
70
71 #include "i915_drm.h"
72
73 #ifdef HAVE_VALGRIND
74 #include <valgrind.h>
75 #include <memcheck.h>
76 #define VG(x) x
77 #else
78 #define VG(x)
79 #endif
80
81 #define memclear(s) memset(&s, 0, sizeof(s))
82
83 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
84
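/**
 * Atomically adds @add to @*v unless its current value equals @unless.
 * Returns true if the value was @unless (i.e. no add was performed).
 */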
85 static inline int
86 atomic_add_unless(int *v, int add, int unless)
87 {
88 int c, old;
89 c = p_atomic_read(v);
90 while (c != unless && (old = p_atomic_cmpxchg(v, c, c + add)) != c)
91 c = old;
92 return c == unless;
93 }
94
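/** A reuse-cache bucket: a list of free BOs sharing one rounded-up size. */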
95 struct bo_cache_bucket {
96 struct list_head head;
97 uint64_t size;
98 };
99
100 struct brw_bufmgr {
101 int fd;
102
103 pthread_mutex_t lock;
104
105 /** Array of lists of cached gem objects of power-of-two sizes */
106 struct bo_cache_bucket cache_bucket[14 * 4];
107 int num_buckets;
108 time_t time;
109
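   /** BOs indexed by flink (global) name and by GEM handle, respectively. */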
110 struct hash_table *name_table;
111 struct hash_table *handle_table;
112
113 bool has_llc:1;
114 bool bo_reuse:1;
115 };
116
117 static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
118 uint32_t stride);
119
120 static void bo_free(struct brw_bo *bo);
121
122 static uint32_t
123 key_hash_uint(const void *key)
124 {
125 return _mesa_hash_data(key, 4);
126 }
127
128 static bool
129 key_uint_equal(const void *a, const void *b)
130 {
131 return *((unsigned *) a) == *((unsigned *) b);
132 }
133
134 static struct brw_bo *
135 hash_find_bo(struct hash_table *ht, unsigned int key)
136 {
137 struct hash_entry *entry = _mesa_hash_table_search(ht, &key);
138 return entry ? (struct brw_bo *) entry->data : NULL;
139 }
140
141 static uint64_t
142 bo_tile_size(struct brw_bufmgr *bufmgr, uint64_t size, uint32_t tiling)
143 {
144 if (tiling == I915_TILING_NONE)
145 return size;
146
147 /* 965 and later just need multiples of the page size for tiling. */
148 return ALIGN(size, 4096);
149 }
150
151 /*
152 * Round a given pitch up to the minimum required for X tiling on a
153 * given chip. We use 512 as the minimum to allow for a later tiling
154 * change.
155 */
156 static uint32_t
157 bo_tile_pitch(struct brw_bufmgr *bufmgr, uint32_t pitch, uint32_t tiling)
158 {
159 unsigned long tile_width;
160
161 /* If untiled, then just align it so that we can do rendering
162 * to it with the 3D engine.
163 */
164 if (tiling == I915_TILING_NONE)
165 return ALIGN(pitch, 64);
166
167 if (tiling == I915_TILING_X)
168 tile_width = 512;
169 else
170 tile_width = 128;
171
172 /* 965 is flexible */
173 return ALIGN(pitch, tile_width);
174 }
175
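/**
 * Returns the smallest cache bucket whose size is at least @size, or NULL if
 * @size exceeds the largest bucket (in which case the BO is not cached for
 * reuse).
 */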
176 static struct bo_cache_bucket *
177 bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size)
178 {
179 int i;
180
181 for (i = 0; i < bufmgr->num_buckets; i++) {
182 struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i];
183 if (bucket->size >= size) {
184 return bucket;
185 }
186 }
187
188 return NULL;
189 }
190
191 inline void
192 brw_bo_reference(struct brw_bo *bo)
193 {
194 p_atomic_inc(&bo->refcount);
195 }
196
197 int
198 brw_bo_busy(struct brw_bo *bo)
199 {
200 struct brw_bufmgr *bufmgr = bo->bufmgr;
201 struct drm_i915_gem_busy busy;
202 int ret;
203
204 memclear(busy);
205 busy.handle = bo->gem_handle;
206
207 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
208 if (ret == 0) {
209 bo->idle = !busy.busy;
210 return busy.busy;
211 }
212 return false;
213 }
214
215 int
216 brw_bo_madvise(struct brw_bo *bo, int state)
217 {
218 struct drm_i915_gem_madvise madv;
219
220 memclear(madv);
221 madv.handle = bo->gem_handle;
222 madv.madv = state;
223 madv.retained = 1;
224 drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
225
226 return madv.retained;
227 }
228
229 /* drop the oldest entries that have been purged by the kernel */
230 static void
231 brw_bo_cache_purge_bucket(struct brw_bufmgr *bufmgr,
232 struct bo_cache_bucket *bucket)
233 {
234 list_for_each_entry_safe(struct brw_bo, bo, &bucket->head, head) {
235 if (brw_bo_madvise(bo, I915_MADV_DONTNEED))
236 break;
237
238 list_del(&bo->head);
239 bo_free(bo);
240 }
241 }
242
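/*
 * Common allocation path: round the requested size up to a reuse-cache bucket
 * size, try to recycle a cached BO of that size (re-validating it with madvise
 * and re-applying the requested tiling), and otherwise fall back to a fresh
 * DRM_IOCTL_I915_GEM_CREATE.
 */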
243 static struct brw_bo *
244 bo_alloc_internal(struct brw_bufmgr *bufmgr,
245 const char *name,
246 uint64_t size,
247 unsigned flags,
248 uint32_t tiling_mode,
249 uint32_t stride, uint64_t alignment)
250 {
251 struct brw_bo *bo;
252 unsigned int page_size = getpagesize();
253 int ret;
254 struct bo_cache_bucket *bucket;
255 bool alloc_from_cache;
256 uint64_t bo_size;
257 bool for_render = false;
258
259 if (flags & BO_ALLOC_FOR_RENDER)
260 for_render = true;
261
262 /* Round the allocated size up to the nearest reuse-cache bucket size. */
263 bucket = bucket_for_size(bufmgr, size);
264
265 /* If we don't have caching at this size, don't actually round the
266 * allocation up.
267 */
268 if (bucket == NULL) {
269 bo_size = size;
270 if (bo_size < page_size)
271 bo_size = page_size;
272 } else {
273 bo_size = bucket->size;
274 }
275
276 pthread_mutex_lock(&bufmgr->lock);
277 /* Get a buffer out of the cache if available */
278 retry:
279 alloc_from_cache = false;
280 if (bucket != NULL && !list_empty(&bucket->head)) {
281 if (for_render) {
282 /* Allocate new render-target BOs from the tail (MRU)
283 * of the list, as it will likely be hot in the GPU
284 * cache and in the aperture for us.
285 */
286 bo = LIST_ENTRY(struct brw_bo, bucket->head.prev, head);
287 list_del(&bo->head);
288 alloc_from_cache = true;
289 bo->align = alignment;
290 } else {
291 assert(alignment == 0);
292 /* For non-render-target BOs (where we're probably
293 * going to map it first thing in order to fill it
294 * with data), check if the last BO in the cache is
295 * unbusy, and only reuse in that case. Otherwise,
296 * allocating a new buffer is probably faster than
297 * waiting for the GPU to finish.
298 */
299 bo = LIST_ENTRY(struct brw_bo, bucket->head.next, head);
300 if (!brw_bo_busy(bo)) {
301 alloc_from_cache = true;
302 list_del(&bo->head);
303 }
304 }
305
306 if (alloc_from_cache) {
307 if (!brw_bo_madvise(bo, I915_MADV_WILLNEED)) {
308 bo_free(bo);
309 brw_bo_cache_purge_bucket(bufmgr, bucket);
310 goto retry;
311 }
312
313 if (bo_set_tiling_internal(bo, tiling_mode, stride)) {
314 bo_free(bo);
315 goto retry;
316 }
317 }
318 }
319
320 if (!alloc_from_cache) {
321 struct drm_i915_gem_create create;
322
323 bo = calloc(1, sizeof(*bo));
324 if (!bo)
325 goto err;
326
327 bo->size = bo_size;
328 bo->idle = true;
329
330 memclear(create);
331 create.size = bo_size;
332
333 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create);
334 if (ret != 0) {
335 free(bo);
336 goto err;
337 }
338
339 bo->gem_handle = create.handle;
340 _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
341
342 bo->bufmgr = bufmgr;
343 bo->align = alignment;
344
345 bo->tiling_mode = I915_TILING_NONE;
346 bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
347 bo->stride = 0;
348
349 if (bo_set_tiling_internal(bo, tiling_mode, stride))
350 goto err_free;
351 }
352
353 bo->name = name;
354 p_atomic_set(&bo->refcount, 1);
355 bo->reusable = true;
356 bo->cache_coherent = bufmgr->has_llc;
357
358 pthread_mutex_unlock(&bufmgr->lock);
359
360 DBG("bo_create: buf %d (%s) %ldb\n", bo->gem_handle, bo->name, size);
361
362 return bo;
363
364 err_free:
365 bo_free(bo);
366 err:
367 pthread_mutex_unlock(&bufmgr->lock);
368 return NULL;
369 }
370
371 struct brw_bo *
372 brw_bo_alloc(struct brw_bufmgr *bufmgr,
373 const char *name, uint64_t size, uint64_t alignment)
374 {
375 return bo_alloc_internal(bufmgr, name, size, 0, I915_TILING_NONE, 0, 0);
376 }
377
378 struct brw_bo *
379 brw_bo_alloc_tiled(struct brw_bufmgr *bufmgr, const char *name,
380 uint64_t size, uint32_t tiling_mode, uint32_t pitch,
381 unsigned flags)
382 {
383 return bo_alloc_internal(bufmgr, name, size, flags, tiling_mode, pitch, 0);
384 }
385
386 struct brw_bo *
387 brw_bo_alloc_tiled_2d(struct brw_bufmgr *bufmgr, const char *name,
388 int x, int y, int cpp, uint32_t tiling,
389 uint32_t *pitch, unsigned flags)
390 {
391 uint64_t size;
392 uint32_t stride;
393 unsigned long aligned_y, height_alignment;
394
395 /* If we're tiled, our allocations are in 8 or 32-row blocks,
396 * so failure to align our height means that we won't allocate
397 * enough pages.
398 *
399 * If we're untiled, we still have to align to 2 rows high
400 * because the data port accesses 2x2 blocks even if the
401 * bottom row isn't to be rendered, so failure to align means
402 * we could walk off the end of the GTT and fault. This is
403 * documented on 965, and may be the case on older chipsets
404 * too so we try to be careful.
405 */
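   /* For example (hypothetical values): a 256x64 X-tiled surface at 4 bytes
    * per pixel gives stride = ALIGN(256 * 4, 512) = 1024, aligned_y =
    * ALIGN(64, 8) = 64, and size = ALIGN(1024 * 64, 4096) = 65536 bytes.
    */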
406 aligned_y = y;
407 height_alignment = 2;
408
409 if (tiling == I915_TILING_X)
410 height_alignment = 8;
411 else if (tiling == I915_TILING_Y)
412 height_alignment = 32;
413 aligned_y = ALIGN(y, height_alignment);
414
415 stride = x * cpp;
416 stride = bo_tile_pitch(bufmgr, stride, tiling);
417 size = stride * aligned_y;
418 size = bo_tile_size(bufmgr, size, tiling);
419 *pitch = stride;
420
421 if (tiling == I915_TILING_NONE)
422 stride = 0;
423
424 return bo_alloc_internal(bufmgr, name, size, flags, tiling, stride, 0);
425 }
426
427 /**
428 * Returns a brw_bo wrapping the given buffer object handle.
429 *
430 * This can be used when one application needs to pass a buffer object
431 * to another.
432 */
433 struct brw_bo *
434 brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr,
435 const char *name, unsigned int handle)
436 {
437 struct brw_bo *bo;
438 int ret;
439 struct drm_gem_open open_arg;
440 struct drm_i915_gem_get_tiling get_tiling;
441
442 /* At the moment most applications only have a few named BOs.
443 * For instance, in a DRI client only the render buffers passed
444 * between X and the client are named, and since X returns the
445 * alternating names for the front/back buffer, the name-table
446 * lookup below is more than fast enough.
447 */
448 pthread_mutex_lock(&bufmgr->lock);
449 bo = hash_find_bo(bufmgr->name_table, handle);
450 if (bo) {
451 brw_bo_reference(bo);
452 goto out;
453 }
454
455 memclear(open_arg);
456 open_arg.name = handle;
457 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_GEM_OPEN, &open_arg);
458 if (ret != 0) {
459 DBG("Couldn't reference %s handle 0x%08x: %s\n",
460 name, handle, strerror(errno));
461 bo = NULL;
462 goto out;
463 }
464 /* Now see if someone has used a prime handle to get this
465 * object from the kernel before, by looking up the returned
466 * GEM handle in our handle table.
467 */
468 bo = hash_find_bo(bufmgr->handle_table, open_arg.handle);
469 if (bo) {
470 brw_bo_reference(bo);
471 goto out;
472 }
473
474 bo = calloc(1, sizeof(*bo));
475 if (!bo)
476 goto out;
477
478 p_atomic_set(&bo->refcount, 1);
479
480 bo->size = open_arg.size;
481 bo->offset64 = 0;
482 bo->bufmgr = bufmgr;
483 bo->gem_handle = open_arg.handle;
484 bo->name = name;
485 bo->global_name = handle;
486 bo->reusable = false;
487 bo->external = true;
488
489 _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
490 _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
491
492 memclear(get_tiling);
493 get_tiling.handle = bo->gem_handle;
494 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling);
495 if (ret != 0)
496 goto err_unref;
497
498 bo->tiling_mode = get_tiling.tiling_mode;
499 bo->swizzle_mode = get_tiling.swizzle_mode;
500 /* XXX stride is unknown */
501 DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name);
502
503 out:
504 pthread_mutex_unlock(&bufmgr->lock);
505 return bo;
506
507 err_unref:
508 bo_free(bo);
509 pthread_mutex_unlock(&bufmgr->lock);
510 return NULL;
511 }
512
513 static void
514 bo_free(struct brw_bo *bo)
515 {
516 struct brw_bufmgr *bufmgr = bo->bufmgr;
517 struct drm_gem_close close;
518 struct hash_entry *entry;
519 int ret;
520
521 if (bo->map_cpu) {
522 VG(VALGRIND_FREELIKE_BLOCK(bo->map_cpu, 0));
523 drm_munmap(bo->map_cpu, bo->size);
524 }
525 if (bo->map_wc) {
526 VG(VALGRIND_FREELIKE_BLOCK(bo->map_wc, 0));
527 drm_munmap(bo->map_wc, bo->size);
528 }
529 if (bo->map_gtt) {
530 drm_munmap(bo->map_gtt, bo->size);
531 }
532
533 if (bo->global_name) {
534 entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name);
535 _mesa_hash_table_remove(bufmgr->name_table, entry);
536 }
537 entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle);
538 _mesa_hash_table_remove(bufmgr->handle_table, entry);
539
540 /* Close this object */
541 memclear(close);
542 close.handle = bo->gem_handle;
543 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close);
544 if (ret != 0) {
545 DBG("DRM_IOCTL_GEM_CLOSE %d failed (%s): %s\n",
546 bo->gem_handle, bo->name, strerror(errno));
547 }
548 free(bo);
549 }
550
551 /** Frees all cached buffers significantly older than @time. */
552 static void
553 cleanup_bo_cache(struct brw_bufmgr *bufmgr, time_t time)
554 {
555 int i;
556
557 if (bufmgr->time == time)
558 return;
559
560 for (i = 0; i < bufmgr->num_buckets; i++) {
561 struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i];
562
563 list_for_each_entry_safe(struct brw_bo, bo, &bucket->head, head) {
564 if (time - bo->free_time <= 1)
565 break;
566
567 list_del(&bo->head);
568
569 bo_free(bo);
570 }
571 }
572
573 bufmgr->time = time;
574 }
575
576 static void
577 bo_unreference_final(struct brw_bo *bo, time_t time)
578 {
579 struct brw_bufmgr *bufmgr = bo->bufmgr;
580 struct bo_cache_bucket *bucket;
581
582 DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name);
583
584 bucket = bucket_for_size(bufmgr, bo->size);
585 /* Put the buffer into our internal cache for reuse if we can. */
586 if (bufmgr->bo_reuse && bo->reusable && bucket != NULL &&
587 brw_bo_madvise(bo, I915_MADV_DONTNEED)) {
588 bo->free_time = time;
589
590 bo->name = NULL;
591 bo->kflags = 0;
592
593 list_addtail(&bo->head, &bucket->head);
594 } else {
595 bo_free(bo);
596 }
597 }
598
599 void
600 brw_bo_unreference(struct brw_bo *bo)
601 {
602 if (bo == NULL)
603 return;
604
605 assert(p_atomic_read(&bo->refcount) > 0);
606
607 if (atomic_add_unless(&bo->refcount, -1, 1)) {
608 struct brw_bufmgr *bufmgr = bo->bufmgr;
609 struct timespec time;
610
611 clock_gettime(CLOCK_MONOTONIC, &time);
612
613 pthread_mutex_lock(&bufmgr->lock);
614
615 if (p_atomic_dec_zero(&bo->refcount)) {
616 bo_unreference_final(bo, time.tv_sec);
617 cleanup_bo_cache(bufmgr, time.tv_sec);
618 }
619
620 pthread_mutex_unlock(&bufmgr->lock);
621 }
622 }
623
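/*
 * Moves @bo into the given GEM read/write domains, which may stall until any
 * conflicting GPU access has completed; with perf_debug enabled, a noticeable
 * stall is reported.
 */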
624 static void
625 set_domain(struct brw_context *brw, const char *action,
626 struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
627 {
628 struct drm_i915_gem_set_domain sd = {
629 .handle = bo->gem_handle,
630 .read_domains = read_domains,
631 .write_domain = write_domain,
632 };
633
634 double elapsed = unlikely(brw && brw->perf_debug) ? -get_time() : 0.0;
635
636 if (drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) {
637 DBG("%s:%d: Error setting memory domains %d (%08x %08x): %s.\n",
638 __FILE__, __LINE__, bo->gem_handle, read_domains, write_domain,
639 strerror(errno));
640 }
641
642 if (unlikely(brw && brw->perf_debug)) {
643 elapsed += get_time();
644 if (elapsed > 1e-5) /* 0.01ms */
645 perf_debug("%s a busy \"%s\" BO stalled and took %.03f ms.\n",
646 action, bo->name, elapsed * 1000);
647 }
648 }
649
650 static void
651 print_flags(unsigned flags)
652 {
653 if (flags & MAP_READ)
654 DBG("READ ");
655 if (flags & MAP_WRITE)
656 DBG("WRITE ");
657 if (flags & MAP_ASYNC)
658 DBG("ASYNC ");
659 if (flags & MAP_PERSISTENT)
660 DBG("PERSISTENT ");
661 if (flags & MAP_COHERENT)
662 DBG("COHERENT ");
663 if (flags & MAP_RAW)
664 DBG("RAW ");
665 DBG("\n");
666 }
667
668 static void *
669 brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
670 {
671 struct brw_bufmgr *bufmgr = bo->bufmgr;
672
673 /* We disallow CPU maps for writing to non-coherent buffers, as the
674 * CPU map can become invalidated when a batch is flushed out, which
675 * can happen at unpredictable times. You should use WC maps instead.
676 */
677 assert(bo->cache_coherent || !(flags & MAP_WRITE));
678
679 if (!bo->map_cpu) {
680 struct drm_i915_gem_mmap mmap_arg;
681 void *map;
682
683 DBG("brw_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name);
684
685 memclear(mmap_arg);
686 mmap_arg.handle = bo->gem_handle;
687 mmap_arg.size = bo->size;
688 int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg);
689 if (ret != 0) {
690 ret = -errno;
691 DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
692 __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
693 return NULL;
694 }
695 VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, mmap_arg.size, 0, 1));
696 map = (void *) (uintptr_t) mmap_arg.addr_ptr;
697
698 if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) {
699 VG(VALGRIND_FREELIKE_BLOCK(map, 0));
700 drm_munmap(map, bo->size);
701 }
702 }
703 DBG("brw_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name,
704 bo->map_cpu);
705 print_flags(flags);
706
707 if (!(flags & MAP_ASYNC)) {
708 set_domain(brw, "CPU mapping", bo, I915_GEM_DOMAIN_CPU,
709 flags & MAP_WRITE ? I915_GEM_DOMAIN_CPU : 0);
710 }
711
712 if (!bo->cache_coherent) {
713 /* If we're reusing an existing CPU mapping, the CPU caches may
714 * contain stale data from the last time we read from that mapping.
715 * (With the BO cache, it might even be data from a previous buffer!)
716 * Even if it's a brand new mapping, the kernel may have zeroed the
717 * buffer via CPU writes.
718 *
719 * We need to invalidate those cachelines so that we see the latest
720 * contents, and so long as we only read from the CPU mmap we do not
721 * need to write those cachelines back afterwards.
722 */
723 gen_invalidate_range(bo->map_cpu, bo->size);
724 }
725
726 return bo->map_cpu;
727 }
728
729 static void *
730 brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
731 {
732 struct brw_bufmgr *bufmgr = bo->bufmgr;
733
734 /* Get a mapping of the buffer if we haven't before. */
735 if (bo->map_gtt == NULL) {
736 struct drm_i915_gem_mmap_gtt mmap_arg;
737 void *map;
738
739 DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name);
740
741 memclear(mmap_arg);
742 mmap_arg.handle = bo->gem_handle;
743
744 /* Get the fake offset back... */
745 int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg);
746 if (ret != 0) {
747 DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n",
748 __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
750 return NULL;
751 }
752
753 /* and mmap it. We don't need to use VALGRIND_MALLOCLIKE_BLOCK
754 * because Valgrind will already intercept this mmap call.
755 */
756 map = drm_mmap(0, bo->size, PROT_READ | PROT_WRITE,
757 MAP_SHARED, bufmgr->fd, mmap_arg.offset);
758 if (map == MAP_FAILED) {
759 DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
760 __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
761 return NULL;
762 }
763
764 if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) {
765 drm_munmap(map, bo->size);
766 }
767 }
768
769 DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt);
770 print_flags(flags);
771
772 if (!(flags & MAP_ASYNC)) {
773 set_domain(brw, "GTT mapping", bo,
774 I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
775 }
776
777 return bo->map_gtt;
778 }
779
780 static bool
781 can_map_cpu(struct brw_bo *bo, unsigned flags)
782 {
783 if (bo->cache_coherent)
784 return true;
785
786 /* If PERSISTENT or COHERENT are set, the mapping needs to remain valid
787 * across batch flushes, where the kernel will change the cache domain of
788 * the BO, invalidating continued access to the CPU mmap on non-LLC devices.
789 *
790 * Similarly, ASYNC typically means that the buffer will be accessed via
791 * both the CPU and the GPU simultaneously. Batches may be executed that
792 * use the BO even while it is mapped. While OpenGL technically disallows
793 * most drawing while non-persistent mappings are active, we may still use
794 * the GPU for blits or other operations, causing batches to happen at
795 * inconvenient times.
796 */
797 if (flags & (MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC))
798 return false;
799
800 return !(flags & MAP_WRITE);
801 }
802
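/*
 * Chooses a mapping strategy for @bo: tiled BOs (unless MAP_RAW is set) go
 * through the GTT so the hardware detiles on access; otherwise a CPU map is
 * used when it is safe (see can_map_cpu()), with the GTT map as the fallback.
 */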
803 void *
804 brw_bo_map(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
805 {
806 if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW))
807 return brw_bo_map_gtt(brw, bo, flags);
808 else if (can_map_cpu(bo, flags))
809 return brw_bo_map_cpu(brw, bo, flags);
810 else
811 return brw_bo_map_gtt(brw, bo, flags);
812 }
813
814 int
815 brw_bo_subdata(struct brw_bo *bo, uint64_t offset,
816 uint64_t size, const void *data)
817 {
818 struct brw_bufmgr *bufmgr = bo->bufmgr;
819 struct drm_i915_gem_pwrite pwrite;
820 int ret;
821
822 memclear(pwrite);
823 pwrite.handle = bo->gem_handle;
824 pwrite.offset = offset;
825 pwrite.size = size;
826 pwrite.data_ptr = (uint64_t) (uintptr_t) data;
827 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite);
828 if (ret != 0) {
829 ret = -errno;
830 DBG("%s:%d: Error writing data to buffer %d: "
831 "(%"PRIu64" %"PRIu64") %s .\n",
832 __FILE__, __LINE__, bo->gem_handle, offset, size, strerror(errno));
833 }
834
835 return ret;
836 }
837
838 /** Waits for all GPU rendering with the object to have completed. */
839 void
840 brw_bo_wait_rendering(struct brw_bo *bo)
841 {
842 /* We require a kernel recent enough for WAIT_IOCTL support.
843 * See intel_init_bufmgr()
844 */
845 brw_bo_wait(bo, -1);
846 }
847
848 /**
849 * Waits on a BO for the given amount of time.
850 *
851 * @bo: buffer object to wait for
852 * @timeout_ns: amount of time to wait in nanoseconds.
853 * If value is less than 0, an infinite wait will occur.
854 *
855 * Returns 0 if the wait was successful, i.e. the last batch referencing the
856 * object has completed within the allotted time; otherwise a negative errno
857 * value describes the error. Of particular interest is -ETIME, which
858 * indicates that the wait timed out before the object became idle.
859 *
860 * Similar to brw_bo_wait_rendering(), except that a timeout parameter allows
861 * the operation to give up after a certain amount of time. Another subtle
862 * difference is that the internal locking semantics differ (this variant does
863 * not hold the lock for the duration of the wait), which makes the wait
864 * subject to a larger userspace race window.
865 *
866 * The implementation shall wait until the object is no longer actively
867 * referenced within a batch buffer at the time of the call. The wait does
868 * not guarantee that the buffer will not be re-issued by another thread or
869 * through a flinked handle afterwards; userspace must guard against that
870 * race itself if such precision is important.
871 *
872 * Note that some kernels have broken the promise of an infinite wait for
873 * negative timeout values; upgrade to a recent stable kernel if this applies.
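 *
 * E.g. a caller might wait up to 1 ms and treat -ETIME as "still busy"
 * (hypothetical usage; fall_back_to_software_path() is not a function in
 * this driver):
 *
 *    if (brw_bo_wait(bo, 1000 * 1000) == -ETIME)
 *       fall_back_to_software_path();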
874 */
875 int
876 brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns)
877 {
878 struct brw_bufmgr *bufmgr = bo->bufmgr;
879 struct drm_i915_gem_wait wait;
880 int ret;
881
882 memclear(wait);
883 wait.bo_handle = bo->gem_handle;
884 wait.timeout_ns = timeout_ns;
885 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
886 if (ret == -1)
887 return -errno;
888
889 return ret;
890 }
891
892 void
893 brw_bufmgr_destroy(struct brw_bufmgr *bufmgr)
894 {
895 pthread_mutex_destroy(&bufmgr->lock);
896
897 /* Free any cached buffer objects we were going to reuse */
898 for (int i = 0; i < bufmgr->num_buckets; i++) {
899 struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i];
900
901 list_for_each_entry_safe(struct brw_bo, bo, &bucket->head, head) {
902 list_del(&bo->head);
903
904 bo_free(bo);
905 }
906 }
907
908 _mesa_hash_table_destroy(bufmgr->name_table, NULL);
909 _mesa_hash_table_destroy(bufmgr->handle_table, NULL);
910
911 free(bufmgr);
912 }
913
914 static int
915 bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
916 uint32_t stride)
917 {
918 struct brw_bufmgr *bufmgr = bo->bufmgr;
919 struct drm_i915_gem_set_tiling set_tiling;
920 int ret;
921
922 if (bo->global_name == 0 &&
923 tiling_mode == bo->tiling_mode && stride == bo->stride)
924 return 0;
925
926 memset(&set_tiling, 0, sizeof(set_tiling));
927 do {
928 /* set_tiling is slightly broken and overwrites the
929 * input on the error path, so we have to open code
930 * drmIoctl.
931 */
932 set_tiling.handle = bo->gem_handle;
933 set_tiling.tiling_mode = tiling_mode;
934 set_tiling.stride = stride;
935
936 ret = ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
937 } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
938 if (ret == -1)
939 return -errno;
940
941 bo->tiling_mode = set_tiling.tiling_mode;
942 bo->swizzle_mode = set_tiling.swizzle_mode;
943 bo->stride = set_tiling.stride;
944 return 0;
945 }
946
947 int
948 brw_bo_get_tiling(struct brw_bo *bo, uint32_t *tiling_mode,
949 uint32_t *swizzle_mode)
950 {
951 *tiling_mode = bo->tiling_mode;
952 *swizzle_mode = bo->swizzle_mode;
953 return 0;
954 }
955
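/**
 * Imports a PRIME (dma-buf) fd as a brw_bo. If the fd resolves to a GEM
 * handle we already track, the existing brw_bo is referenced and returned
 * rather than creating a duplicate wrapper.
 */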
956 struct brw_bo *
957 brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd)
958 {
959 int ret;
960 uint32_t handle;
961 struct brw_bo *bo;
962 struct drm_i915_gem_get_tiling get_tiling;
963
964 pthread_mutex_lock(&bufmgr->lock);
965 ret = drmPrimeFDToHandle(bufmgr->fd, prime_fd, &handle);
966 if (ret) {
967 DBG("create_from_prime: failed to obtain handle from fd: %s\n",
968 strerror(errno));
969 pthread_mutex_unlock(&bufmgr->lock);
970 return NULL;
971 }
972
973 /*
974 * See if the kernel has already returned this buffer to us. Just as
975 * for named buffers, we must not create two BOs pointing at the same
976 * kernel object.
977 */
978 bo = hash_find_bo(bufmgr->handle_table, handle);
979 if (bo) {
980 brw_bo_reference(bo);
981 goto out;
982 }
983
984 bo = calloc(1, sizeof(*bo));
985 if (!bo)
986 goto out;
987
988 p_atomic_set(&bo->refcount, 1);
989
990 /* Determine size of bo. The fd-to-handle ioctl really should
991 * return the size, but it doesn't. If we have kernel 3.12 or
992 * later, we can lseek on the prime fd to get the size. Older
993 * kernels will just fail, in which case the size is simply left at
994 * zero. */
995 ret = lseek(prime_fd, 0, SEEK_END);
996 if (ret != -1)
997 bo->size = ret;
998
999 bo->bufmgr = bufmgr;
1000
1001 bo->gem_handle = handle;
1002 _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
1003
1004 bo->name = "prime";
1005 bo->reusable = false;
1006 bo->external = true;
1007
1008 memclear(get_tiling);
1009 get_tiling.handle = bo->gem_handle;
1010 if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling))
1011 goto err;
1012
1013 bo->tiling_mode = get_tiling.tiling_mode;
1014 bo->swizzle_mode = get_tiling.swizzle_mode;
1015 /* XXX stride is unknown */
1016
1017 out:
1018 pthread_mutex_unlock(&bufmgr->lock);
1019 return bo;
1020
1021 err:
1022 bo_free(bo);
1023 pthread_mutex_unlock(&bufmgr->lock);
1024 return NULL;
1025 }
1026
1027 int
1028 brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd)
1029 {
1030 struct brw_bufmgr *bufmgr = bo->bufmgr;
1031
1032 if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle,
1033 DRM_CLOEXEC, prime_fd) != 0)
1034 return -errno;
1035
1036 bo->reusable = false;
1037 bo->external = true;
1038
1039 return 0;
1040 }
1041
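/**
 * Returns (creating on first use) a global "flink" name for @bo, so other
 * processes can open it via brw_bo_gem_create_from_name(). Flinked BOs are
 * marked external and are never returned to the reuse cache.
 */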
1042 int
1043 brw_bo_flink(struct brw_bo *bo, uint32_t *name)
1044 {
1045 struct brw_bufmgr *bufmgr = bo->bufmgr;
1046
1047 if (!bo->global_name) {
1048 struct drm_gem_flink flink;
1049
1050 memclear(flink);
1051 flink.handle = bo->gem_handle;
1052 if (drmIoctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink))
1053 return -errno;
1054
1055 pthread_mutex_lock(&bufmgr->lock);
1056 if (!bo->global_name) {
1057 bo->global_name = flink.name;
1058 bo->reusable = false;
1059 bo->external = true;
1060
1061 _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
1062 }
1063 pthread_mutex_unlock(&bufmgr->lock);
1064 }
1065
1066 *name = bo->global_name;
1067 return 0;
1068 }
1069
1070 /**
1071 * Enables unlimited caching of buffer objects for reuse.
1072 *
1073 * This is potentially very memory expensive, as the cache at each bucket
1074 * size is only bounded by how many buffers of that size we've managed to have
1075 * in flight at once.
1076 */
1077 void
1078 brw_bufmgr_enable_reuse(struct brw_bufmgr *bufmgr)
1079 {
1080 bufmgr->bo_reuse = true;
1081 }
1082
1083 static void
1084 add_bucket(struct brw_bufmgr *bufmgr, int size)
1085 {
1086 unsigned int i = bufmgr->num_buckets;
1087
1088 assert(i < ARRAY_SIZE(bufmgr->cache_bucket));
1089
1090 list_inithead(&bufmgr->cache_bucket[i].head);
1091 bufmgr->cache_bucket[i].size = size;
1092 bufmgr->num_buckets++;
1093 }
1094
1095 static void
1096 init_cache_buckets(struct brw_bufmgr *bufmgr)
1097 {
1098 uint64_t size, cache_max_size = 64 * 1024 * 1024;
1099
1100 /* OK, so power of two buckets was too wasteful of memory.
1101 * Give 3 other sizes between each power of two, to hopefully
1102 * cover things accurately enough. (The alternative is
1103 * probably to just go for exact matching of sizes, and assume
1104 * that for things like composited window resize the tiled
1105 * width/height alignment and rounding of sizes to pages will
1106 * get us useful cache hit rates anyway)
1107 */
1108 add_bucket(bufmgr, 4096);
1109 add_bucket(bufmgr, 4096 * 2);
1110 add_bucket(bufmgr, 4096 * 3);
1111
1112 /* Initialize the linked lists for BO reuse cache. */
1113 for (size = 4 * 4096; size <= cache_max_size; size *= 2) {
1114 add_bucket(bufmgr, size);
1115
1116 add_bucket(bufmgr, size + size * 1 / 4);
1117 add_bucket(bufmgr, size + size * 2 / 4);
1118 add_bucket(bufmgr, size + size * 3 / 4);
1119 }
1120 }
1121
1122 uint32_t
1123 brw_create_hw_context(struct brw_bufmgr *bufmgr)
1124 {
1125 struct drm_i915_gem_context_create create;
1126 int ret;
1127
1128 memclear(create);
1129 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
1130 if (ret != 0) {
1131 DBG("DRM_IOCTL_I915_GEM_CONTEXT_CREATE failed: %s\n", strerror(errno));
1132 return 0;
1133 }
1134
1135 return create.ctx_id;
1136 }
1137
1138 void
1139 brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
1140 {
1141 struct drm_i915_gem_context_destroy d = {.ctx_id = ctx_id };
1142
1143 if (ctx_id != 0 &&
1144 drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &d) != 0) {
1145 fprintf(stderr, "DRM_IOCTL_I915_GEM_CONTEXT_DESTROY failed: %s\n",
1146 strerror(errno));
1147 }
1148 }
1149
1150 int
1151 brw_reg_read(struct brw_bufmgr *bufmgr, uint32_t offset, uint64_t *result)
1152 {
1153 struct drm_i915_reg_read reg_read;
1154 int ret;
1155
1156 memclear(reg_read);
1157 reg_read.offset = offset;
1158
1159 ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_REG_READ, &reg_read);
1160
1161 *result = reg_read.val;
1162 return ret;
1163 }
1164
1165 /**
1166 * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
1167 * and manage buffer objects.
1168 *
1169 * \param fd File descriptor of the opened DRM device.
1170 */
1171 struct brw_bufmgr *
1172 brw_bufmgr_init(struct gen_device_info *devinfo, int fd, int batch_size)
1173 {
1174 struct brw_bufmgr *bufmgr;
1175
1176 bufmgr = calloc(1, sizeof(*bufmgr));
1177 if (bufmgr == NULL)
1178 return NULL;
1179
1180 /* Handles to buffer objects belong to the device fd and are not
1181 * reference counted by the kernel. If the same fd is used by
1182 * multiple parties (threads sharing the same screen bufmgr, or
1183 * even worse the same device fd passed to multiple libraries)
1184 * ownership of those handles is shared by those independent parties.
1185 *
1186 * Don't do this! Ensure that each library/bufmgr has its own device
1187 * fd so that its namespace does not clash with another.
1188 */
1189 bufmgr->fd = fd;
1190
1191 if (pthread_mutex_init(&bufmgr->lock, NULL) != 0) {
1192 free(bufmgr);
1193 return NULL;
1194 }
1195
1196 bufmgr->has_llc = devinfo->has_llc;
1197
1198 init_cache_buckets(bufmgr);
1199
1200 bufmgr->name_table =
1201 _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
1202 bufmgr->handle_table =
1203 _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
1204
1205 return bufmgr;
1206 }