src/panfrost/encoder/pan_bo.c

   1 /*
   2  * Copyright 2019 Collabora, Ltd.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors (Collabora):
  24  *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
  25  */
  26 #include <errno.h>
  27 #include <stdio.h>
  28 #include <fcntl.h>
  29 #include <xf86drm.h>
  30 #include <pthread.h>
  31 #include "drm-uapi/panfrost_drm.h"
  32
  33 #include "pan_bo.h"
  34 #include "pan_util.h"
  35 #include "../pandecode/public.h"
  36
  37 #include "os/os_mman.h"
  38
  39 #include "util/u_inlines.h"
  40 #include "util/u_math.h"
  41
/* This file implements a userspace BO cache. Allocating and freeing
 * GPU-visible buffers is very expensive, and even the extra kernel roundtrips
 * add more work than we would like at this point. So caching BOs in userspace
 * solves both of these problems and does not require kernel updates.
 *
 * Cached BOs are sorted into a bucket based on rounding their size down to the
 * nearest power-of-two. Each bucket contains a linked list of free panfrost_bo
 * objects. Putting a BO into the cache is accomplished by adding it to the
 * corresponding bucket. Getting a BO from the cache consists of finding the
 * appropriate bucket and searching it. A cache eviction is a kernel-level free
 * of a BO and removing it from the bucket. We special case evicting all BOs
 * from the cache, since that's what is helpful in practice and avoids extra
 * logic around the linked list.
 */
  56
  57 static struct panfrost_bo *
  58 panfrost_bo_alloc(struct panfrost_device *dev, size_t size,
  59                   uint32_t flags)
  60 {
  61         struct drm_panfrost_create_bo create_bo = { .size = size };
  62         struct panfrost_bo *bo;
  63         int ret;
  64
  65         if (dev->kernel_version->version_major > 1 ||
  66             dev->kernel_version->version_minor >= 1) {
  67                 if (flags & PAN_BO_GROWABLE)
  68                         create_bo.flags |= PANFROST_BO_HEAP;
  69                 if (!(flags & PAN_BO_EXECUTE))
  70                         create_bo.flags |= PANFROST_BO_NOEXEC;
  71         }
  72
  73         ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo);
  74         if (ret) {
  75                 fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n");
  76                 return NULL;
  77         }
  78
  79         bo = pan_lookup_bo(dev, create_bo.handle);
  80         assert(!memcmp(bo, &((struct panfrost_bo){}), sizeof(*bo)));
  81
  82         bo->size = create_bo.size;
  83         bo->gpu = create_bo.offset;
  84         bo->gem_handle = create_bo.handle;
  85         bo->flags = flags;
  86         bo->dev = dev;
  87         return bo;
  88 }
  89
  90 static void
  91 panfrost_bo_free(struct panfrost_bo *bo)
  92 {
  93         struct drm_gem_close gem_close = { .handle = bo->gem_handle };
  94         int ret;
  95
  96         ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
  97         if (ret) {
  98                 fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n");
  99                 assert(0);
 100         }
 101
 102         /* BO will be freed with the sparse array, but zero to indicate free */
 103         memset(bo, 0, sizeof(*bo));
 104 }
 105
 106 /* Returns true if the BO is ready, false otherwise.
 107  * access_type is encoding the type of access one wants to ensure is done.
 108  * Say you want to make sure all writers are done writing, you should pass
 109  * PAN_BO_ACCESS_WRITE.
 110  * If you want to wait for all users, you should pass PAN_BO_ACCESS_RW.
 111  * PAN_BO_ACCESS_READ would work too as waiting for readers implies
 112  * waiting for writers as well, but we want to make things explicit and waiting
 113  * only for readers is impossible.
 114  */
 115 bool
 116 panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns,
 117                  uint32_t access_type)
 118 {
 119         struct drm_panfrost_wait_bo req = {
 120                 .handle = bo->gem_handle,
 121                 .timeout_ns = timeout_ns,
 122         };
 123         int ret;
 124
 125         assert(access_type == PAN_BO_ACCESS_WRITE ||
 126                access_type == PAN_BO_ACCESS_RW);
 127
 128         /* If the BO has been exported or imported we can't rely on the cached
 129          * state, we need to call the WAIT_BO ioctl.
 130          */
 131         if (!(bo->flags & (PAN_BO_IMPORTED | PAN_BO_EXPORTED))) {
 132                 /* If ->gpu_access is 0, the BO is idle, no need to wait. */
 133                 if (!bo->gpu_access)
 134                         return true;
 135
 136                 /* If the caller only wants to wait for writers and no
 137                  * writes are pending, we don't have to wait.
 138                  */
 139                 if (access_type == PAN_BO_ACCESS_WRITE &&
 140                     !(bo->gpu_access & PAN_BO_ACCESS_WRITE))
 141                         return true;
 142         }
 143
 144         /* The ioctl returns >= 0 value when the BO we are waiting for is ready
 145          * -1 otherwise.
 146          */
 147         ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req);
 148         if (ret != -1) {
 149                 /* Set gpu_access to 0 so that the next call to bo_wait()
 150                  * doesn't have to call the WAIT_BO ioctl.
 151                  */
 152                 bo->gpu_access = 0;
 153                 return true;
 154         }
 155
 156         /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed
 157          * is invalid, which shouldn't happen here.
 158          */
 159         assert(errno == ETIMEDOUT || errno == EBUSY);
 160         return false;
 161 }
 162
 163 /* Helper to calculate the bucket index of a BO */
 164
 165 static unsigned
 166 pan_bucket_index(unsigned size)
 167 {
 168         /* Round down to POT to compute a bucket index */
 169
 170         unsigned bucket_index = util_logbase2(size);
 171
 172         /* Clamp the bucket index; all huge allocations will be
 173          * sorted into the largest bucket */
 174
 175         bucket_index = MIN2(bucket_index, MAX_BO_CACHE_BUCKET);
 176
 177         /* The minimum bucket size must equal the minimum allocation
 178          * size; the maximum we clamped */
 179
 180         assert(bucket_index >= MIN_BO_CACHE_BUCKET);
 181         assert(bucket_index <= MAX_BO_CACHE_BUCKET);
 182
 183         /* Reindex from 0 */
 184         return (bucket_index - MIN_BO_CACHE_BUCKET);
 185 }
 186
 187 static struct list_head *
 188 pan_bucket(struct panfrost_device *dev, unsigned size)
 189 {
 190         return &dev->bo_cache.buckets[pan_bucket_index(size)];
 191 }
 192
 193 /* Tries to fetch a BO of sufficient size with the appropriate flags from the
 194  * BO cache. If it succeeds, it returns that BO and removes the BO from the
 195  * cache. If it fails, it returns NULL signaling the caller to allocate a new
 196  * BO. */
 197
 198 static struct panfrost_bo *
 199 panfrost_bo_cache_fetch(struct panfrost_device *dev,
 200                         size_t size, uint32_t flags, bool dontwait)
 201 {
 202         pthread_mutex_lock(&dev->bo_cache.lock);
 203         struct list_head *bucket = pan_bucket(dev, size);
 204         struct panfrost_bo *bo = NULL;
 205
 206         /* Iterate the bucket looking for something suitable */
 207         list_for_each_entry_safe(struct panfrost_bo, entry, bucket,
 208                                  bucket_link) {
 209                 if (entry->size < size || entry->flags != flags)
 210                         continue;
 211
 212                 if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX,
 213                                       PAN_BO_ACCESS_RW))
 214                         continue;
 215
 216                 struct drm_panfrost_madvise madv = {
 217                         .handle = entry->gem_handle,
 218                         .madv = PANFROST_MADV_WILLNEED,
 219                 };
 220                 int ret;
 221
 222                 /* This one works, splice it out of the cache */
 223                 list_del(&entry->bucket_link);
 224                 list_del(&entry->lru_link);
 225
 226                 ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv);
 227                 if (!ret && !madv.retained) {
 228                         panfrost_bo_free(entry);
 229                         continue;
 230                 }
 231                 /* Let's go! */
 232                 bo = entry;
 233                 break;
 234         }
 235         pthread_mutex_unlock(&dev->bo_cache.lock);
 236
 237         return bo;
 238 }
 239
 240 static void
 241 panfrost_bo_cache_evict_stale_bos(struct panfrost_device *dev)
 242 {
 243         struct timespec time;
 244
 245         clock_gettime(CLOCK_MONOTONIC, &time);
 246         list_for_each_entry_safe(struct panfrost_bo, entry,
 247                                  &dev->bo_cache.lru, lru_link) {
 248                 /* We want all entries that have been used more than 1 sec
 249                  * ago to be dropped, others can be kept.
 250                  * Note the <= 2 check and not <= 1. It's here to account for
 251                  * the fact that we're only testing ->tv_sec, not ->tv_nsec.
 252                  * That means we might keep entries that are between 1 and 2
 253                  * seconds old, but we don't really care, as long as unused BOs
 254                  * are dropped at some point.
 255                  */
 256                 if (time.tv_sec - entry->last_used <= 2)
 257                         break;
 258
 259                 list_del(&entry->bucket_link);
 260                 list_del(&entry->lru_link);
 261                 panfrost_bo_free(entry);
 262         }
 263 }
 264
 265 /* Tries to add a BO to the cache. Returns if it was
 266  * successful */
 267
 268 static bool
 269 panfrost_bo_cache_put(struct panfrost_bo *bo)
 270 {
 271         struct panfrost_device *dev = bo->dev;
 272
 273         if (bo->flags & PAN_BO_DONT_REUSE)
 274                 return false;
 275
 276         pthread_mutex_lock(&dev->bo_cache.lock);
 277         struct list_head *bucket = pan_bucket(dev, MAX2(bo->size, 4096));
 278         struct drm_panfrost_madvise madv;
 279         struct timespec time;
 280
 281         madv.handle = bo->gem_handle;
 282         madv.madv = PANFROST_MADV_DONTNEED;
 283         madv.retained = 0;
 284
 285         drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv);
 286
 287         /* Add us to the bucket */
 288         list_addtail(&bo->bucket_link, bucket);
 289
 290         /* Add us to the LRU list and update the last_used field. */
 291         list_addtail(&bo->lru_link, &dev->bo_cache.lru);
 292         clock_gettime(CLOCK_MONOTONIC, &time);
 293         bo->last_used = time.tv_sec;
 294
 295         /* Let's do some cleanup in the BO cache while we hold the
 296          * lock.
 297          */
 298         panfrost_bo_cache_evict_stale_bos(dev);
 299         pthread_mutex_unlock(&dev->bo_cache.lock);
 300
 301         return true;
 302 }
 303
 304 /* Evicts all BOs from the cache. Called during context
 305  * destroy or during low-memory situations (to free up
 306  * memory that may be unused by us just sitting in our
 307  * cache, but still reserved from the perspective of the
 308  * OS) */
 309
 310 void
 311 panfrost_bo_cache_evict_all(
 312                 struct panfrost_device *dev)
 313 {
 314         pthread_mutex_lock(&dev->bo_cache.lock);
 315         for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) {
 316                 struct list_head *bucket = &dev->bo_cache.buckets[i];
 317
 318                 list_for_each_entry_safe(struct panfrost_bo, entry, bucket,
 319                                          bucket_link) {
 320                         list_del(&entry->bucket_link);
 321                         list_del(&entry->lru_link);
 322                         panfrost_bo_free(entry);
 323                 }
 324         }
 325         pthread_mutex_unlock(&dev->bo_cache.lock);
 326 }
 327
 328 void
 329 panfrost_bo_mmap(struct panfrost_bo *bo)
 330 {
 331         struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle };
 332         int ret;
 333
 334         if (bo->cpu)
 335                 return;
 336
 337         ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo);
 338         if (ret) {
 339                 fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n");
 340                 assert(0);
 341         }
 342
 343         bo->cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
 344                           bo->dev->fd, mmap_bo.offset);
 345         if (bo->cpu == MAP_FAILED) {
 346                 fprintf(stderr, "mmap failed: %p %m\n", bo->cpu);
 347                 assert(0);
 348         }
 349 }
 350
 351 static void
 352 panfrost_bo_munmap(struct panfrost_bo *bo)
 353 {
 354         if (!bo->cpu)
 355                 return;
 356
 357         if (os_munmap((void *) (uintptr_t)bo->cpu, bo->size)) {
 358                 perror("munmap");
 359                 abort();
 360         }
 361
 362         bo->cpu = NULL;
 363 }
 364
 365 struct panfrost_bo *
 366 panfrost_bo_create(struct panfrost_device *dev, size_t size,
 367                    uint32_t flags)
 368 {
 369         struct panfrost_bo *bo;
 370
 371         /* Kernel will fail (confusingly) with EPERM otherwise */
 372         assert(size > 0);
 373
 374         /* To maximize BO cache usage, don't allocate tiny BOs */
 375         size = MAX2(size, 4096);
 376
 377         /* GROWABLE BOs cannot be mmapped */
 378         if (flags & PAN_BO_GROWABLE)
 379                 assert(flags & PAN_BO_INVISIBLE);
 380
 381         /* Before creating a BO, we first want to check the cache but without
 382          * waiting for BO readiness (BOs in the cache can still be referenced
 383          * by jobs that are not finished yet).
 384          * If the cached allocation fails we fall back on fresh BO allocation,
 385          * and if that fails too, we try one more time to allocate from the
 386          * cache, but this time we accept to wait.
 387          */
 388         bo = panfrost_bo_cache_fetch(dev, size, flags, true);
 389         if (!bo)
 390                 bo = panfrost_bo_alloc(dev, size, flags);
 391         if (!bo)
 392                 bo = panfrost_bo_cache_fetch(dev, size, flags, false);
 393
 394         if (!bo)
 395                 fprintf(stderr, "BO creation failed\n");
 396
 397         assert(bo);
 398
 399         /* Only mmap now if we know we need to. For CPU-invisible buffers, we
 400          * never map since we don't care about their contents; they're purely
 401          * for GPU-internal use. But we do trace them anyway. */
 402
 403         if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP)))
 404                 panfrost_bo_mmap(bo);
 405
 406         p_atomic_set(&bo->refcnt, 1);
 407
 408         if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
 409                 if (flags & PAN_BO_INVISIBLE)
 410                         pandecode_inject_mmap(bo->gpu, NULL, bo->size, NULL);
 411                 else if (!(flags & PAN_BO_DELAY_MMAP))
 412                         pandecode_inject_mmap(bo->gpu, bo->cpu, bo->size, NULL);
 413         }
 414
 415         return bo;
 416 }
 417
 418 void
 419 panfrost_bo_reference(struct panfrost_bo *bo)
 420 {
 421         if (bo) {
 422                 ASSERTED int count = p_atomic_inc_return(&bo->refcnt);
 423                 assert(count != 1);
 424         }
 425 }
 426
 427 void
 428 panfrost_bo_unreference(struct panfrost_bo *bo)
 429 {
 430         if (!bo)
 431                 return;
 432
 433         /* Don't return to cache if there are still references */
 434         if (p_atomic_dec_return(&bo->refcnt))
 435                 return;
 436
 437         struct panfrost_device *dev = bo->dev;
 438
 439         pthread_mutex_lock(&dev->bo_map_lock);
 440
 441         /* Someone might have imported this BO while we were waiting for the
 442          * lock, let's make sure it's still not referenced before freeing it.
 443          */
 444         if (p_atomic_read(&bo->refcnt) == 0) {
 445                 /* When the reference count goes to zero, we need to cleanup */
 446                 panfrost_bo_munmap(bo);
 447
 448                 /* Rather than freeing the BO now, we'll cache the BO for later
 449                  * allocations if we're allowed to.
 450                  */
 451                 if (!panfrost_bo_cache_put(bo))
 452                         panfrost_bo_free(bo);
 453
 454         }
 455         pthread_mutex_unlock(&dev->bo_map_lock);
 456 }
 457
 458 struct panfrost_bo *
 459 panfrost_bo_import(struct panfrost_device *dev, int fd)
 460 {
 461         struct panfrost_bo *bo;
 462         struct drm_panfrost_get_bo_offset get_bo_offset = {0,};
 463         ASSERTED int ret;
 464         unsigned gem_handle;
 465
 466         ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle);
 467         assert(!ret);
 468
 469         pthread_mutex_lock(&dev->bo_map_lock);
 470         bo = pan_lookup_bo(dev, gem_handle);
 471
 472         if (!bo->dev) {
 473                 get_bo_offset.handle = gem_handle;
 474                 ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset);
 475                 assert(!ret);
 476
 477                 bo->dev = dev;
 478                 bo->gpu = (mali_ptr) get_bo_offset.offset;
 479                 bo->size = lseek(fd, 0, SEEK_END);
 480                 bo->flags = PAN_BO_DONT_REUSE | PAN_BO_IMPORTED;
 481                 bo->gem_handle = gem_handle;
 482                 assert(bo->size > 0);
 483                 p_atomic_set(&bo->refcnt, 1);
 484                 // TODO map and unmap on demand?
 485                 panfrost_bo_mmap(bo);
 486         } else {
 487                 /* bo->refcnt == 0 can happen if the BO
 488                  * was being released but panfrost_bo_import() acquired the
 489                  * lock before panfrost_bo_unreference(). In that case, refcnt
 490                  * is 0 and we can't use panfrost_bo_reference() directly, we
 491                  * have to re-initialize the refcnt().
 492                  * Note that panfrost_bo_unreference() checks
 493                  * refcnt value just after acquiring the lock to
 494                  * make sure the object is not freed if panfrost_bo_import()
 495                  * acquired it in the meantime.
 496                  */
 497                 if (p_atomic_read(&bo->refcnt) == 0)
 498                         p_atomic_set(&bo->refcnt, 1);
 499                 else
 500                         panfrost_bo_reference(bo);
 501                 assert(bo->cpu);
 502         }
 503         pthread_mutex_unlock(&dev->bo_map_lock);
 504
 505         return bo;
 506 }
 507
 508 int
 509 panfrost_bo_export(struct panfrost_bo *bo)
 510 {
 511         struct drm_prime_handle args = {
 512                 .handle = bo->gem_handle,
 513                 .flags = DRM_CLOEXEC,
 514         };
 515
 516         int ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args);
 517         if (ret == -1)
 518                 return -1;
 519
 520         bo->flags |= PAN_BO_DONT_REUSE | PAN_BO_EXPORTED;
 521         return args.fd;
 522 }
 523