src/gallium/drivers/radeon/r600_pipe_common.c

   1 /*
   2  * Copyright 2013 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors: Marek Olšák <maraeo@gmail.com>
  24  *
  25  */
  26
  27 #include "r600_pipe_common.h"
  28 #include "r600_cs.h"
  29 #include "tgsi/tgsi_parse.h"
  30 #include "util/list.h"
  31 #include "util/u_draw_quad.h"
  32 #include "util/u_memory.h"
  33 #include "util/u_format_s3tc.h"
  34 #include "util/u_upload_mgr.h"
  35 #include "os/os_time.h"
  36 #include "vl/vl_decoder.h"
  37 #include "vl/vl_video_buffer.h"
  38 #include "radeon/radeon_video.h"
  39 #include "amd/common/sid.h"
  40 #include <inttypes.h>
  41 #include <sys/utsname.h>
  42 #include <libsync.h>
  43
  44 #include <llvm-c/TargetMachine.h>
  45
  46
  47 struct r600_multi_fence {
  48         struct pipe_reference reference;
  49         struct pipe_fence_handle *gfx;
  50         struct pipe_fence_handle *sdma;
  51
  52         /* If the context wasn't flushed at fence creation, this is non-NULL. */
  53         struct {
  54                 struct r600_common_context *ctx;
  55                 unsigned ib_index;
  56         } gfx_unflushed;
  57 };
  58
  59 /*
  60  * shader binary helpers.
  61  */
  62 void si_radeon_shader_binary_init(struct ac_shader_binary *b)
  63 {
  64         memset(b, 0, sizeof(*b));
  65 }
  66
  67 void si_radeon_shader_binary_clean(struct ac_shader_binary *b)
  68 {
  69         if (!b)
  70                 return;
  71         FREE(b->code);
  72         FREE(b->config);
  73         FREE(b->rodata);
  74         FREE(b->global_symbol_offsets);
  75         FREE(b->relocs);
  76         FREE(b->disasm_string);
  77         FREE(b->llvm_ir_string);
  78 }
  79
  80 /*
  81  * pipe_context
  82  */
  83
  84 /**
  85  * Write an EOP event.
  86  *
  87  * \param event         EVENT_TYPE_*
  88  * \param event_flags   Optional cache flush flags (TC)
  89  * \param data_sel      1 = fence, 3 = timestamp
  90  * \param buf           Buffer
  91  * \param va            GPU address
  92  * \param old_value     Previous fence value (for a bug workaround)
  93  * \param new_value     Fence value to write for this event.
  94  */
  95 void si_gfx_write_event_eop(struct r600_common_context *ctx,
  96                             unsigned event, unsigned event_flags,
  97                             unsigned data_sel,
  98                             struct r600_resource *buf, uint64_t va,
  99                             uint32_t new_fence, unsigned query_type)
 100 {
 101         struct radeon_winsys_cs *cs = ctx->gfx.cs;
 102         unsigned op = EVENT_TYPE(event) |
 103                       EVENT_INDEX(5) |
 104                       event_flags;
 105         unsigned sel = EOP_DATA_SEL(data_sel);
 106
 107         /* Wait for write confirmation before writing data, but don't send
 108          * an interrupt. */
 109         if (data_sel != EOP_DATA_SEL_DISCARD)
 110                 sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);
 111
 112         if (ctx->chip_class >= GFX9) {
 113                 /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
 114                  * counters) must immediately precede every timestamp event to
 115                  * prevent a GPU hang on GFX9.
 116                  *
 117                  * Occlusion queries don't need to do it here, because they
 118                  * always do ZPASS_DONE before the timestamp.
 119                  */
 120                 if (ctx->chip_class == GFX9 &&
 121                     query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
 122                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
 123                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
 124                         struct r600_resource *scratch = ctx->eop_bug_scratch;
 125
 126                         assert(16 * ctx->screen->info.num_render_backends <=
 127                                scratch->b.b.width0);
 128                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 129                         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
 130                         radeon_emit(cs, scratch->gpu_address);
 131                         radeon_emit(cs, scratch->gpu_address >> 32);
 132
 133                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
 134                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 135                 }
 136
 137                 radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
 138                 radeon_emit(cs, op);
 139                 radeon_emit(cs, sel);
 140                 radeon_emit(cs, va);            /* address lo */
 141                 radeon_emit(cs, va >> 32);      /* address hi */
 142                 radeon_emit(cs, new_fence);     /* immediate data lo */
 143                 radeon_emit(cs, 0); /* immediate data hi */
 144                 radeon_emit(cs, 0); /* unused */
 145         } else {
 146                 if (ctx->chip_class == CIK ||
 147                     ctx->chip_class == VI) {
 148                         struct r600_resource *scratch = ctx->eop_bug_scratch;
 149                         uint64_t va = scratch->gpu_address;
 150
 151                         /* Two EOP events are required to make all engines go idle
 152                          * (and optional cache flushes executed) before the timestamp
 153                          * is written.
 154                          */
 155                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 156                         radeon_emit(cs, op);
 157                         radeon_emit(cs, va);
 158                         radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 159                         radeon_emit(cs, 0); /* immediate data */
 160                         radeon_emit(cs, 0); /* unused */
 161
 162                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
 163                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 164                 }
 165
 166                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 167                 radeon_emit(cs, op);
 168                 radeon_emit(cs, va);
 169                 radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 170                 radeon_emit(cs, new_fence); /* immediate data */
 171                 radeon_emit(cs, 0); /* unused */
 172         }
 173
 174         if (buf) {
 175                 radeon_add_to_buffer_list(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
 176                                           RADEON_PRIO_QUERY);
 177         }
 178 }
 179
 180 unsigned si_gfx_write_fence_dwords(struct r600_common_screen *screen)
 181 {
 182         unsigned dwords = 6;
 183
 184         if (screen->chip_class == CIK ||
 185             screen->chip_class == VI)
 186                 dwords *= 2;
 187
 188         if (!screen->info.has_virtual_memory)
 189                 dwords += 2;
 190
 191         return dwords;
 192 }
 193
 194 void si_gfx_wait_fence(struct r600_common_context *ctx,
 195                        uint64_t va, uint32_t ref, uint32_t mask)
 196 {
 197         struct radeon_winsys_cs *cs = ctx->gfx.cs;
 198
 199         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 200         radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
 201         radeon_emit(cs, va);
 202         radeon_emit(cs, va >> 32);
 203         radeon_emit(cs, ref); /* reference value */
 204         radeon_emit(cs, mask); /* mask */
 205         radeon_emit(cs, 4); /* poll interval */
 206 }
 207
 208 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 209 {
 210         struct radeon_winsys_cs *cs = rctx->dma.cs;
 211
 212         /* NOP waits for idle on Evergreen and later. */
 213         if (rctx->chip_class >= CIK)
 214                 radeon_emit(cs, 0x00000000); /* NOP */
 215         else
 216                 radeon_emit(cs, 0xf0000000); /* NOP */
 217 }
 218
 219 void si_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 220                        struct r600_resource *dst, struct r600_resource *src)
 221 {
 222         uint64_t vram = ctx->dma.cs->used_vram;
 223         uint64_t gtt = ctx->dma.cs->used_gart;
 224
 225         if (dst) {
 226                 vram += dst->vram_usage;
 227                 gtt += dst->gart_usage;
 228         }
 229         if (src) {
 230                 vram += src->vram_usage;
 231                 gtt += src->gart_usage;
 232         }
 233
 234         /* Flush the GFX IB if DMA depends on it. */
 235         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 236             ((dst &&
 237               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
 238                                                RADEON_USAGE_READWRITE)) ||
 239              (src &&
 240               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
 241                                                RADEON_USAGE_WRITE))))
 242                 ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 243
 244         /* Flush if there's not enough space, or if the memory usage per IB
 245          * is too large.
 246          *
 247          * IBs using too little memory are limited by the IB submission overhead.
 248          * IBs using too much memory are limited by the kernel/TTM overhead.
 249          * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 250          *
 251          * This heuristic makes sure that DMA requests are executed
 252          * very soon after the call is made and lowers memory usage.
 253          * It improves texture upload performance by keeping the DMA
 254          * engine busy while uploads are being submitted.
 255          */
 256         num_dw++; /* for emit_wait_idle below */
 257         if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
 258             ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
 259             !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
 260                 ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 261                 assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
 262         }
 263
 264         /* Wait for idle if either buffer has been used in the IB before to
 265          * prevent read-after-write hazards.
 266          */
 267         if ((dst &&
 268              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
 269                                               RADEON_USAGE_READWRITE)) ||
 270             (src &&
 271              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
 272                                               RADEON_USAGE_WRITE)))
 273                 r600_dma_emit_wait_idle(ctx);
 274
 275         /* If GPUVM is not supported, the CS checker needs 2 entries
 276          * in the buffer list per packet, which has to be done manually.
 277          */
 278         if (ctx->screen->info.has_virtual_memory) {
 279                 if (dst)
 280                         radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
 281                                                   RADEON_USAGE_WRITE,
 282                                                   RADEON_PRIO_SDMA_BUFFER);
 283                 if (src)
 284                         radeon_add_to_buffer_list(ctx, &ctx->dma, src,
 285                                                   RADEON_USAGE_READ,
 286                                                   RADEON_PRIO_SDMA_BUFFER);
 287         }
 288
 289         /* this function is called before all DMA calls, so increment this. */
 290         ctx->num_dma_calls++;
 291 }
 292
 293 static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 294 {
 295 }
 296
 297 void si_preflush_suspend_features(struct r600_common_context *ctx)
 298 {
 299         /* suspend queries */
 300         if (!LIST_IS_EMPTY(&ctx->active_queries))
 301                 si_suspend_queries(ctx);
 302 }
 303
 304 void si_postflush_resume_features(struct r600_common_context *ctx)
 305 {
 306         /* resume queries */
 307         if (!LIST_IS_EMPTY(&ctx->active_queries))
 308                 si_resume_queries(ctx);
 309 }
 310
 311 static void r600_add_fence_dependency(struct r600_common_context *rctx,
 312                                       struct pipe_fence_handle *fence)
 313 {
 314         struct radeon_winsys *ws = rctx->ws;
 315
 316         if (rctx->dma.cs)
 317                 ws->cs_add_fence_dependency(rctx->dma.cs, fence);
 318         ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
 319 }
 320
 321 static void r600_fence_server_sync(struct pipe_context *ctx,
 322                                    struct pipe_fence_handle *fence)
 323 {
 324         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 325         struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
 326
 327         /* Only amdgpu needs to handle fence dependencies (for fence imports).
 328          * radeon synchronizes all rings by default and will not implement
 329          * fence imports.
 330          */
 331         if (rctx->screen->info.drm_major == 2)
 332                 return;
 333
 334         /* Only imported fences need to be handled by fence_server_sync,
 335          * because the winsys handles synchronizations automatically for BOs
 336          * within the process.
 337          *
 338          * Simply skip unflushed fences here, and the winsys will drop no-op
 339          * dependencies (i.e. dependencies within the same ring).
 340          */
 341         if (rfence->gfx_unflushed.ctx)
 342                 return;
 343
 344         /* All unflushed commands will not start execution before
 345          * this fence dependency is signalled.
 346          *
 347          * Should we flush the context to allow more GPU parallelism?
 348          */
 349         if (rfence->sdma)
 350                 r600_add_fence_dependency(rctx, rfence->sdma);
 351         if (rfence->gfx)
 352                 r600_add_fence_dependency(rctx, rfence->gfx);
 353 }
 354
 355 static void r600_create_fence_fd(struct pipe_context *ctx,
 356                                  struct pipe_fence_handle **pfence, int fd)
 357 {
 358         struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
 359         struct radeon_winsys *ws = rscreen->ws;
 360         struct r600_multi_fence *rfence;
 361
 362         *pfence = NULL;
 363
 364         if (!rscreen->info.has_sync_file)
 365                 return;
 366
 367         rfence = CALLOC_STRUCT(r600_multi_fence);
 368         if (!rfence)
 369                 return;
 370
 371         pipe_reference_init(&rfence->reference, 1);
 372         rfence->gfx = ws->fence_import_sync_file(ws, fd);
 373         if (!rfence->gfx) {
 374                 FREE(rfence);
 375                 return;
 376         }
 377
 378         *pfence = (struct pipe_fence_handle*)rfence;
 379 }
 380
 381 static int r600_fence_get_fd(struct pipe_screen *screen,
 382                              struct pipe_fence_handle *fence)
 383 {
 384         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 385         struct radeon_winsys *ws = rscreen->ws;
 386         struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
 387         int gfx_fd = -1, sdma_fd = -1;
 388
 389         if (!rscreen->info.has_sync_file)
 390                 return -1;
 391
 392         /* Deferred fences aren't supported. */
 393         assert(!rfence->gfx_unflushed.ctx);
 394         if (rfence->gfx_unflushed.ctx)
 395                 return -1;
 396
 397         if (rfence->sdma) {
 398                 sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma);
 399                 if (sdma_fd == -1)
 400                         return -1;
 401         }
 402         if (rfence->gfx) {
 403                 gfx_fd = ws->fence_export_sync_file(ws, rfence->gfx);
 404                 if (gfx_fd == -1) {
 405                         if (sdma_fd != -1)
 406                                 close(sdma_fd);
 407                         return -1;
 408                 }
 409         }
 410
 411         /* If we don't have FDs at this point, it means we don't have fences
 412          * either. */
 413         if (sdma_fd == -1)
 414                 return gfx_fd;
 415         if (gfx_fd == -1)
 416                 return sdma_fd;
 417
 418         /* Get a fence that will be a combination of both fences. */
 419         sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
 420         close(sdma_fd);
 421         return gfx_fd;
 422 }
 423
 424 static void r600_flush_from_st(struct pipe_context *ctx,
 425                                struct pipe_fence_handle **fence,
 426                                unsigned flags)
 427 {
 428         struct pipe_screen *screen = ctx->screen;
 429         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 430         struct radeon_winsys *ws = rctx->ws;
 431         struct pipe_fence_handle *gfx_fence = NULL;
 432         struct pipe_fence_handle *sdma_fence = NULL;
 433         bool deferred_fence = false;
 434         unsigned rflags = RADEON_FLUSH_ASYNC;
 435
 436         if (flags & PIPE_FLUSH_END_OF_FRAME)
 437                 rflags |= RADEON_FLUSH_END_OF_FRAME;
 438
 439         /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
 440         if (rctx->dma.cs)
 441                 rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
 442
 443         if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
 444                 if (fence)
 445                         ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
 446                 if (!(flags & PIPE_FLUSH_DEFERRED))
 447                         ws->cs_sync_flush(rctx->gfx.cs);
 448         } else {
 449                 /* Instead of flushing, create a deferred fence. Constraints:
 450                  * - The state tracker must allow a deferred flush.
 451                  * - The state tracker must request a fence.
 452                  * - fence_get_fd is not allowed.
 453                  * Thread safety in fence_finish must be ensured by the state tracker.
 454                  */
 455                 if (flags & PIPE_FLUSH_DEFERRED &&
 456                     !(flags & PIPE_FLUSH_FENCE_FD) &&
 457                     fence) {
 458                         gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
 459                         deferred_fence = true;
 460                 } else {
 461                         rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
 462                 }
 463         }
 464
 465         /* Both engines can signal out of order, so we need to keep both fences. */
 466         if (fence) {
 467                 struct r600_multi_fence *multi_fence =
 468                         CALLOC_STRUCT(r600_multi_fence);
 469                 if (!multi_fence) {
 470                         ws->fence_reference(&sdma_fence, NULL);
 471                         ws->fence_reference(&gfx_fence, NULL);
 472                         goto finish;
 473                 }
 474
 475                 multi_fence->reference.count = 1;
 476                 /* If both fences are NULL, fence_finish will always return true. */
 477                 multi_fence->gfx = gfx_fence;
 478                 multi_fence->sdma = sdma_fence;
 479
 480                 if (deferred_fence) {
 481                         multi_fence->gfx_unflushed.ctx = rctx;
 482                         multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
 483                 }
 484
 485                 screen->fence_reference(screen, fence, NULL);
 486                 *fence = (struct pipe_fence_handle*)multi_fence;
 487         }
 488 finish:
 489         if (!(flags & PIPE_FLUSH_DEFERRED)) {
 490                 if (rctx->dma.cs)
 491                         ws->cs_sync_flush(rctx->dma.cs);
 492                 ws->cs_sync_flush(rctx->gfx.cs);
 493         }
 494 }
 495
 496 static void r600_flush_dma_ring(void *ctx, unsigned flags,
 497                                 struct pipe_fence_handle **fence)
 498 {
 499         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 500         struct radeon_winsys_cs *cs = rctx->dma.cs;
 501         struct radeon_saved_cs saved;
 502         bool check_vm =
 503                 (rctx->screen->debug_flags & DBG(CHECK_VM)) &&
 504                 rctx->check_vm_faults;
 505
 506         if (!radeon_emitted(cs, 0)) {
 507                 if (fence)
 508                         rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 509                 return;
 510         }
 511
 512         if (check_vm)
 513                 si_save_cs(rctx->ws, cs, &saved, true);
 514
 515         rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
 516         if (fence)
 517                 rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 518
 519         if (check_vm) {
 520                 /* Use conservative timeout 800ms, after which we won't wait any
 521                  * longer and assume the GPU is hung.
 522                  */
 523                 rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);
 524
 525                 rctx->check_vm_faults(rctx, &saved, RING_DMA);
 526                 si_clear_saved_cs(&saved);
 527         }
 528 }
 529
 530 /**
 531  * Store a linearized copy of all chunks of \p cs together with the buffer
 532  * list in \p saved.
 533  */
 534 void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
 535                 struct radeon_saved_cs *saved, bool get_buffer_list)
 536 {
 537         uint32_t *buf;
 538         unsigned i;
 539
 540         /* Save the IB chunks. */
 541         saved->num_dw = cs->prev_dw + cs->current.cdw;
 542         saved->ib = MALLOC(4 * saved->num_dw);
 543         if (!saved->ib)
 544                 goto oom;
 545
 546         buf = saved->ib;
 547         for (i = 0; i < cs->num_prev; ++i) {
 548                 memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
 549                 buf += cs->prev[i].cdw;
 550         }
 551         memcpy(buf, cs->current.buf, cs->current.cdw * 4);
 552
 553         if (!get_buffer_list)
 554                 return;
 555
 556         /* Save the buffer list. */
 557         saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
 558         saved->bo_list = CALLOC(saved->bo_count,
 559                                 sizeof(saved->bo_list[0]));
 560         if (!saved->bo_list) {
 561                 FREE(saved->ib);
 562                 goto oom;
 563         }
 564         ws->cs_get_buffer_list(cs, saved->bo_list);
 565
 566         return;
 567
 568 oom:
 569         fprintf(stderr, "%s: out of memory\n", __func__);
 570         memset(saved, 0, sizeof(*saved));
 571 }
 572
 573 void si_clear_saved_cs(struct radeon_saved_cs *saved)
 574 {
 575         FREE(saved->ib);
 576         FREE(saved->bo_list);
 577
 578         memset(saved, 0, sizeof(*saved));
 579 }
 580
 581 static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 582 {
 583         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 584         unsigned latest = rctx->ws->query_value(rctx->ws,
 585                                                 RADEON_GPU_RESET_COUNTER);
 586
 587         if (rctx->gpu_reset_counter == latest)
 588                 return PIPE_NO_RESET;
 589
 590         rctx->gpu_reset_counter = latest;
 591         return PIPE_UNKNOWN_CONTEXT_RESET;
 592 }
 593
 594 static void r600_set_debug_callback(struct pipe_context *ctx,
 595                                     const struct pipe_debug_callback *cb)
 596 {
 597         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 598
 599         if (cb)
 600                 rctx->debug = *cb;
 601         else
 602                 memset(&rctx->debug, 0, sizeof(rctx->debug));
 603 }
 604
 605 static void r600_set_device_reset_callback(struct pipe_context *ctx,
 606                                            const struct pipe_device_reset_callback *cb)
 607 {
 608         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 609
 610         if (cb)
 611                 rctx->device_reset_callback = *cb;
 612         else
 613                 memset(&rctx->device_reset_callback, 0,
 614                        sizeof(rctx->device_reset_callback));
 615 }
 616
 617 bool si_check_device_reset(struct r600_common_context *rctx)
 618 {
 619         enum pipe_reset_status status;
 620
 621         if (!rctx->device_reset_callback.reset)
 622                 return false;
 623
 624         if (!rctx->b.get_device_reset_status)
 625                 return false;
 626
 627         status = rctx->b.get_device_reset_status(&rctx->b);
 628         if (status == PIPE_NO_RESET)
 629                 return false;
 630
 631         rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
 632         return true;
 633 }
 634
 635 static void r600_dma_clear_buffer_fallback(struct pipe_context *ctx,
 636                                            struct pipe_resource *dst,
 637                                            uint64_t offset, uint64_t size,
 638                                            unsigned value)
 639 {
 640         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 641
 642         rctx->clear_buffer(ctx, dst, offset, size, value, R600_COHERENCY_NONE);
 643 }
 644
 645 static bool r600_resource_commit(struct pipe_context *pctx,
 646                                  struct pipe_resource *resource,
 647                                  unsigned level, struct pipe_box *box,
 648                                  bool commit)
 649 {
 650         struct r600_common_context *ctx = (struct r600_common_context *)pctx;
 651         struct r600_resource *res = r600_resource(resource);
 652
 653         /*
 654          * Since buffer commitment changes cannot be pipelined, we need to
 655          * (a) flush any pending commands that refer to the buffer we're about
 656          *     to change, and
 657          * (b) wait for threaded submit to finish, including those that were
 658          *     triggered by some other, earlier operation.
 659          */
 660         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 661             ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
 662                                              res->buf, RADEON_USAGE_READWRITE)) {
 663                 ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 664         }
 665         if (radeon_emitted(ctx->dma.cs, 0) &&
 666             ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
 667                                              res->buf, RADEON_USAGE_READWRITE)) {
 668                 ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 669         }
 670
 671         ctx->ws->cs_sync_flush(ctx->dma.cs);
 672         ctx->ws->cs_sync_flush(ctx->gfx.cs);
 673
 674         assert(resource->target == PIPE_BUFFER);
 675
 676         return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
 677 }
 678
 679 bool si_common_context_init(struct r600_common_context *rctx,
 680                             struct r600_common_screen *rscreen,
 681                             unsigned context_flags)
 682 {
 683         slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers);
 684         slab_create_child(&rctx->pool_transfers_unsync, &rscreen->pool_transfers);
 685
 686         rctx->screen = rscreen;
 687         rctx->ws = rscreen->ws;
 688         rctx->family = rscreen->family;
 689         rctx->chip_class = rscreen->chip_class;
 690
 691         rctx->b.invalidate_resource = si_invalidate_resource;
 692         rctx->b.resource_commit = r600_resource_commit;
 693         rctx->b.transfer_map = u_transfer_map_vtbl;
 694         rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
 695         rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
 696         rctx->b.texture_subdata = u_default_texture_subdata;
 697         rctx->b.memory_barrier = r600_memory_barrier;
 698         rctx->b.flush = r600_flush_from_st;
 699         rctx->b.set_debug_callback = r600_set_debug_callback;
 700         rctx->b.create_fence_fd = r600_create_fence_fd;
 701         rctx->b.fence_server_sync = r600_fence_server_sync;
 702         rctx->dma_clear_buffer = r600_dma_clear_buffer_fallback;
 703         rctx->b.buffer_subdata = si_buffer_subdata;
 704
 705         if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 706                 rctx->b.get_device_reset_status = r600_get_reset_status;
 707                 rctx->gpu_reset_counter =
 708                         rctx->ws->query_value(rctx->ws,
 709                                               RADEON_GPU_RESET_COUNTER);
 710         }
 711
 712         rctx->b.set_device_reset_callback = r600_set_device_reset_callback;
 713
 714         si_init_context_texture_functions(rctx);
 715         si_init_query_functions(rctx);
 716
 717         if (rctx->chip_class == CIK ||
 718             rctx->chip_class == VI ||
 719             rctx->chip_class == GFX9) {
 720                 rctx->eop_bug_scratch = (struct r600_resource*)
 721                         pipe_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
 722                                            16 * rscreen->info.num_render_backends);
 723                 if (!rctx->eop_bug_scratch)
 724                         return false;
 725         }
 726
 727         rctx->allocator_zeroed_memory =
 728                 u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
 729                                       0, PIPE_USAGE_DEFAULT, 0, true);
 730         if (!rctx->allocator_zeroed_memory)
 731                 return false;
 732
 733         rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
 734                                                   0, PIPE_USAGE_STREAM);
 735         if (!rctx->b.stream_uploader)
 736                 return false;
 737
 738         rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
 739                                                  0, PIPE_USAGE_DEFAULT);
 740         if (!rctx->b.const_uploader)
 741                 return false;
 742
 743         rctx->ctx = rctx->ws->ctx_create(rctx->ws);
 744         if (!rctx->ctx)
 745                 return false;
 746
 747         if (rscreen->info.num_sdma_rings && !(rscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
 748                 rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 749                                                    r600_flush_dma_ring,
 750                                                    rctx);
 751                 rctx->dma.flush = r600_flush_dma_ring;
 752         }
 753
 754         return true;
 755 }
 756
 757 void si_common_context_cleanup(struct r600_common_context *rctx)
 758 {
 759         unsigned i,j;
 760
 761         /* Release DCC stats. */
 762         for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
 763                 assert(!rctx->dcc_stats[i].query_active);
 764
 765                 for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
 766                         if (rctx->dcc_stats[i].ps_stats[j])
 767                                 rctx->b.destroy_query(&rctx->b,
 768                                                       rctx->dcc_stats[i].ps_stats[j]);
 769
 770                 r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
 771         }
 772
 773         if (rctx->query_result_shader)
 774                 rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);
 775
 776         if (rctx->gfx.cs)
 777                 rctx->ws->cs_destroy(rctx->gfx.cs);
 778         if (rctx->dma.cs)
 779                 rctx->ws->cs_destroy(rctx->dma.cs);
 780         if (rctx->ctx)
 781                 rctx->ws->ctx_destroy(rctx->ctx);
 782
 783         if (rctx->b.stream_uploader)
 784                 u_upload_destroy(rctx->b.stream_uploader);
 785         if (rctx->b.const_uploader)
 786                 u_upload_destroy(rctx->b.const_uploader);
 787
 788         slab_destroy_child(&rctx->pool_transfers);
 789         slab_destroy_child(&rctx->pool_transfers_unsync);
 790
 791         if (rctx->allocator_zeroed_memory) {
 792                 u_suballocator_destroy(rctx->allocator_zeroed_memory);
 793         }
 794         rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
 795         rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 796         r600_resource_reference(&rctx->eop_bug_scratch, NULL);
 797 }
 798
 799 /*
 800  * pipe_screen
 801  */
 802
/*
 * Named debug flags shared by the common screen code.
 *
 * Each entry maps an option name to a DBG() bit and a help string.  The
 * table is handed to debug_get_flags_option() together with the
 * "R600_DEBUG" option name, and it must stay terminated by
 * DEBUG_NAMED_VALUE_END.
 */
static const struct debug_named_value common_debug_options[] = {
        /* logging */
        { "tex", DBG(TEX), "Print texture info" },
        { "nir", DBG(NIR), "Enable experimental NIR shaders" },
        { "compute", DBG(COMPUTE), "Print compute info" },
        { "vm", DBG(VM), "Print virtual addresses when creating resources" },
        { "info", DBG(INFO), "Print driver information" },

        /* shaders */
        { "vs", DBG(VS), "Print vertex shaders" },
        { "gs", DBG(GS), "Print geometry shaders" },
        { "ps", DBG(PS), "Print pixel shaders" },
        { "cs", DBG(CS), "Print compute shaders" },
        { "tcs", DBG(TCS), "Print tessellation control shaders" },
        { "tes", DBG(TES), "Print tessellation evaluation shaders" },
        { "noir", DBG(NO_IR), "Don't print the LLVM IR"},
        { "notgsi", DBG(NO_TGSI), "Don't print the TGSI"},
        { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
        { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" },
        { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
        { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." },

        /* self-tests: each of these runs a test and then exits */
        { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." },
        { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." },
        { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." },
        { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." },

        /* features */
        { "nodma", DBG(NO_ASYNC_DMA), "Disable asynchronous DMA" },
        { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" },
        /* GL uses the word INVALIDATE, gallium uses the word DISCARD */
        { "noinvalrange", DBG(NO_DISCARD_RANGE), "Disable handling of INVALIDATE_RANGE map flags" },
        { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" },
        { "notiling", DBG(NO_TILING), "Disable tiling" },
        { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
        { "forcedma", DBG(FORCE_DMA), "Use asynchronous DMA for all operations when possible." },
        { "precompile", DBG(PRECOMPILE), "Compile one shader variant at shader creation." },
        { "nowc", DBG(NO_WC), "Disable GTT write combining" },
        { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." },
        { "nodcc", DBG(NO_DCC), "Disable DCC." },
        { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
        { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." },
        { "sisched", DBG(SI_SCHED), "Enable LLVM SI Machine Instruction Scheduler." },
        { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" },
        { "unsafemath", DBG(UNSAFE_MATH), "Enable unsafe math shader optimizations" },
        { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" },
        { "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
        { "nodfsm", DBG(NO_DFSM), "Disable DFSM." },
        { "dpbb", DBG(DPBB), "Enable DPBB." },
        { "dfsm", DBG(DFSM), "Enable DFSM." },
        { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },

        DEBUG_NAMED_VALUE_END /* must be last */
};
 857
 858 static const char* r600_get_vendor(struct pipe_screen* pscreen)
 859 {
 860         return "X.Org";
 861 }
 862
 863 static const char* r600_get_device_vendor(struct pipe_screen* pscreen)
 864 {
 865         return "AMD";
 866 }
 867
 868 static const char *r600_get_marketing_name(struct radeon_winsys *ws)
 869 {
 870         if (!ws->get_chip_name)
 871                 return NULL;
 872         return ws->get_chip_name(ws);
 873 }
 874
 875 static const char *r600_get_family_name(const struct r600_common_screen *rscreen)
 876 {
 877         switch (rscreen->info.family) {
 878         case CHIP_TAHITI: return "AMD TAHITI";
 879         case CHIP_PITCAIRN: return "AMD PITCAIRN";
 880         case CHIP_VERDE: return "AMD CAPE VERDE";
 881         case CHIP_OLAND: return "AMD OLAND";
 882         case CHIP_HAINAN: return "AMD HAINAN";
 883         case CHIP_BONAIRE: return "AMD BONAIRE";
 884         case CHIP_KAVERI: return "AMD KAVERI";
 885         case CHIP_KABINI: return "AMD KABINI";
 886         case CHIP_HAWAII: return "AMD HAWAII";
 887         case CHIP_MULLINS: return "AMD MULLINS";
 888         case CHIP_TONGA: return "AMD TONGA";
 889         case CHIP_ICELAND: return "AMD ICELAND";
 890         case CHIP_CARRIZO: return "AMD CARRIZO";
 891         case CHIP_FIJI: return "AMD FIJI";
 892         case CHIP_POLARIS10: return "AMD POLARIS10";
 893         case CHIP_POLARIS11: return "AMD POLARIS11";
 894         case CHIP_POLARIS12: return "AMD POLARIS12";
 895         case CHIP_STONEY: return "AMD STONEY";
 896         case CHIP_VEGA10: return "AMD VEGA10";
 897         case CHIP_RAVEN: return "AMD RAVEN";
 898         default: return "AMD unknown";
 899         }
 900 }
 901
 902 static void r600_disk_cache_create(struct r600_common_screen *rscreen)
 903 {
 904         /* Don't use the cache if shader dumping is enabled. */
 905         if (rscreen->debug_flags & DBG_ALL_SHADERS)
 906                 return;
 907
 908         uint32_t mesa_timestamp;
 909         if (disk_cache_get_function_timestamp(r600_disk_cache_create,
 910                                               &mesa_timestamp)) {
 911                 char *timestamp_str;
 912                 int res = -1;
 913                 uint32_t llvm_timestamp;
 914
 915                 if (disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo,
 916                                                       &llvm_timestamp)) {
 917                         res = asprintf(&timestamp_str, "%u_%u",
 918                                        mesa_timestamp, llvm_timestamp);
 919                 }
 920
 921                 if (res != -1) {
 922                         /* These flags affect shader compilation. */
 923                         uint64_t shader_debug_flags =
 924                                 rscreen->debug_flags &
 925                                 (DBG(FS_CORRECT_DERIVS_AFTER_KILL) |
 926                                  DBG(SI_SCHED) |
 927                                  DBG(UNSAFE_MATH));
 928
 929                         rscreen->disk_shader_cache =
 930                                 disk_cache_create(r600_get_family_name(rscreen),
 931                                                   timestamp_str,
 932                                                   shader_debug_flags);
 933                         free(timestamp_str);
 934                 }
 935         }
 936 }
 937
 938 static struct disk_cache *r600_get_disk_shader_cache(struct pipe_screen *pscreen)
 939 {
 940         struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
 941         return rscreen->disk_shader_cache;
 942 }
 943
 944 static const char* r600_get_name(struct pipe_screen* pscreen)
 945 {
 946         struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
 947
 948         return rscreen->renderer_string;
 949 }
 950
 951 static float r600_get_paramf(struct pipe_screen* pscreen,
 952                              enum pipe_capf param)
 953 {
 954         switch (param) {
 955         case PIPE_CAPF_MAX_LINE_WIDTH:
 956         case PIPE_CAPF_MAX_LINE_WIDTH_AA:
 957         case PIPE_CAPF_MAX_POINT_WIDTH:
 958         case PIPE_CAPF_MAX_POINT_WIDTH_AA:
 959                 return 8192.0f;
 960         case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
 961                 return 16.0f;
 962         case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
 963                 return 16.0f;
 964         case PIPE_CAPF_GUARD_BAND_LEFT:
 965         case PIPE_CAPF_GUARD_BAND_TOP:
 966         case PIPE_CAPF_GUARD_BAND_RIGHT:
 967         case PIPE_CAPF_GUARD_BAND_BOTTOM:
 968                 return 0.0f;
 969         }
 970         return 0.0f;
 971 }
 972
 973 static int r600_get_video_param(struct pipe_screen *screen,
 974                                 enum pipe_video_profile profile,
 975                                 enum pipe_video_entrypoint entrypoint,
 976                                 enum pipe_video_cap param)
 977 {
 978         switch (param) {
 979         case PIPE_VIDEO_CAP_SUPPORTED:
 980                 return vl_profile_supported(screen, profile, entrypoint);
 981         case PIPE_VIDEO_CAP_NPOT_TEXTURES:
 982                 return 1;
 983         case PIPE_VIDEO_CAP_MAX_WIDTH:
 984         case PIPE_VIDEO_CAP_MAX_HEIGHT:
 985                 return vl_video_buffer_max_size(screen);
 986         case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 987                 return PIPE_FORMAT_NV12;
 988         case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
 989                 return false;
 990         case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
 991                 return false;
 992         case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
 993                 return true;
 994         case PIPE_VIDEO_CAP_MAX_LEVEL:
 995                 return vl_level_supported(screen, profile);
 996         default:
 997                 return 0;
 998         }
 999 }
1000
1001 const char *si_get_llvm_processor_name(enum radeon_family family)
1002 {
1003         switch (family) {
1004         case CHIP_TAHITI: return "tahiti";
1005         case CHIP_PITCAIRN: return "pitcairn";
1006         case CHIP_VERDE: return "verde";
1007         case CHIP_OLAND: return "oland";
1008         case CHIP_HAINAN: return "hainan";
1009         case CHIP_BONAIRE: return "bonaire";
1010         case CHIP_KABINI: return "kabini";
1011         case CHIP_KAVERI: return "kaveri";
1012         case CHIP_HAWAII: return "hawaii";
1013         case CHIP_MULLINS:
1014                 return "mullins";
1015         case CHIP_TONGA: return "tonga";
1016         case CHIP_ICELAND: return "iceland";
1017         case CHIP_CARRIZO: return "carrizo";
1018         case CHIP_FIJI:
1019                 return "fiji";
1020         case CHIP_STONEY:
1021                 return "stoney";
1022         case CHIP_POLARIS10:
1023                 return "polaris10";
1024         case CHIP_POLARIS11:
1025         case CHIP_POLARIS12: /* same as polaris11 */
1026                 return "polaris11";
1027         case CHIP_VEGA10:
1028         case CHIP_RAVEN:
1029                 return "gfx900";
1030         default:
1031                 return "";
1032         }
1033 }
1034
1035 static unsigned get_max_threads_per_block(struct r600_common_screen *screen,
1036                                           enum pipe_shader_ir ir_type)
1037 {
1038         if (ir_type != PIPE_SHADER_IR_TGSI)
1039                 return 256;
1040
1041         /* Only 16 waves per thread-group on gfx9. */
1042         if (screen->chip_class >= GFX9)
1043                 return 1024;
1044
1045         /* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice
1046          * round number.
1047          */
1048         return 2048;
1049 }
1050
/**
 * pipe_screen::get_compute_param hook: report compute capabilities.
 *
 * Every handled capability follows the same pattern: when \p ret is
 * non-NULL the value is written through it, and the size in bytes of that
 * value is returned either way.  Calling with \p ret == NULL therefore only
 * yields the size.
 *
 * \param screen   screen, cast to struct r600_common_screen
 * \param ir_type  shader IR; affects the thread-per-block limits
 * \param param    capability being queried
 * \param ret      destination buffer, or NULL to query the size only.
 *                 The width of what is stored depends on the capability
 *                 (string, uint32_t, or one or three uint64_t values).
 * \return size in bytes of the value, or 0 after printing a message to
 *         stderr for capabilities that fall out of the switch (this
 *         includes PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE, which only breaks)
 */
static int r600_get_compute_param(struct pipe_screen *screen,
        enum pipe_shader_ir ir_type,
        enum pipe_compute_cap param,
        void *ret)
{
        struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;

        //TODO: select these params by asic
        switch (param) {
        case PIPE_COMPUTE_CAP_IR_TARGET: {
                const char *gpu;
                const char *triple;

                /* The triple string depends on the LLVM version built against. */
                if (HAVE_LLVM < 0x0400)
                        triple = "amdgcn--";
                else
                        triple = "amdgcn-mesa-mesa3d";

                gpu = si_get_llvm_processor_name(rscreen->family);
                /* The result has the form "<gpu>-<triple>". The write is
                 * unbounded, so the caller's buffer must hold at least the
                 * size returned below.
                 */
                if (ret) {
                        sprintf(ret, "%s-%s", gpu, triple);
                }
                /* +2 for dash and terminating NUL byte */
                return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
        }
        case PIPE_COMPUTE_CAP_GRID_DIMENSION:
                if (ret) {
                        uint64_t *grid_dimension = ret;
                        grid_dimension[0] = 3;
                }
                return 1 * sizeof(uint64_t);

        case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
                /* Three values, one per grid dimension. */
                if (ret) {
                        uint64_t *grid_size = ret;
                        grid_size[0] = 65535;
                        grid_size[1] = 65535;
                        grid_size[2] = 65535;
                }
                return 3 * sizeof(uint64_t) ;

        case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
                /* Each dimension gets the same per-block thread limit. */
                if (ret) {
                        uint64_t *block_size = ret;
                        unsigned threads_per_block = get_max_threads_per_block(rscreen, ir_type);
                        block_size[0] = threads_per_block;
                        block_size[1] = threads_per_block;
                        block_size[2] = threads_per_block;
                }
                return 3 * sizeof(uint64_t);

        case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
                if (ret) {
                        uint64_t *max_threads_per_block = ret;
                        *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type);
                }
                return sizeof(uint64_t);
        case PIPE_COMPUTE_CAP_ADDRESS_BITS:
                if (ret) {
                        uint32_t *address_bits = ret;
                        address_bits[0] = 64;
                }
                return 1 * sizeof(uint32_t);

        case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
                if (ret) {
                        uint64_t *max_global_size = ret;
                        uint64_t max_mem_alloc_size;

                        /* Reuse the MAX_MEM_ALLOC_SIZE case of this function. */
                        r600_get_compute_param(screen, ir_type,
                                PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
                                &max_mem_alloc_size);

                        /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
                         * 1/4 of the MAX_GLOBAL_SIZE.  Since the
                         * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
                         * make sure we never report more than
                         * 4 * MAX_MEM_ALLOC_SIZE.
                         */
                        *max_global_size = MIN2(4 * max_mem_alloc_size,
                                                MAX2(rscreen->info.gart_size,
                                                     rscreen->info.vram_size));
                }
                return sizeof(uint64_t);

        case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
                if (ret) {
                        uint64_t *max_local_size = ret;
                        /* Value reported by the closed source driver. */
                        *max_local_size = 32768;
                }
                return sizeof(uint64_t);

        case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
                if (ret) {
                        uint64_t *max_input_size = ret;
                        /* Value reported by the closed source driver. */
                        *max_input_size = 1024;
                }
                return sizeof(uint64_t);

        case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
                if (ret) {
                        uint64_t *max_mem_alloc_size = ret;

                        *max_mem_alloc_size = rscreen->info.max_alloc_size;
                }
                return sizeof(uint64_t);

        case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
                if (ret) {
                        uint32_t *max_clock_frequency = ret;
                        *max_clock_frequency = rscreen->info.max_shader_clock;
                }
                return sizeof(uint32_t);

        case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
                if (ret) {
                        uint32_t *max_compute_units = ret;
                        *max_compute_units = rscreen->info.num_good_compute_units;
                }
                return sizeof(uint32_t);

        case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
                /* Images are reported as unsupported. */
                if (ret) {
                        uint32_t *images_supported = ret;
                        *images_supported = 0;
                }
                return sizeof(uint32_t);
        case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
                /* Falls through to the "unknown" message and returns 0. */
                break; /* unused */
        case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
                if (ret) {
                        uint32_t *subgroup_size = ret;
                        *subgroup_size = 64;
                }
                return sizeof(uint32_t);
        case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
                /* Only TGSI reports a non-zero variable block size. */
                if (ret) {
                        uint64_t *max_variable_threads_per_block = ret;
                        if (ir_type == PIPE_SHADER_IR_TGSI)
                                *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
                        else
                                *max_variable_threads_per_block = 0;
                }
                return sizeof(uint64_t);
        }

        /* Reached for MAX_PRIVATE_SIZE and for values not listed above. */
        fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
        return 0;
}
1202
1203 static uint64_t r600_get_timestamp(struct pipe_screen *screen)
1204 {
1205         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
1206
1207         return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) /
1208                         rscreen->info.clock_crystal_freq;
1209 }
1210
1211 static void r600_fence_reference(struct pipe_screen *screen,
1212                                  struct pipe_fence_handle **dst,
1213                                  struct pipe_fence_handle *src)
1214 {
1215         struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
1216         struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst;
1217         struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src;
1218
1219         if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
1220                 ws->fence_reference(&(*rdst)->gfx, NULL);
1221                 ws->fence_reference(&(*rdst)->sdma, NULL);
1222                 FREE(*rdst);
1223         }
1224         *rdst = rsrc;
1225 }
1226
1227 static boolean r600_fence_finish(struct pipe_screen *screen,
1228                                  struct pipe_context *ctx,
1229                                  struct pipe_fence_handle *fence,
1230                                  uint64_t timeout)
1231 {
1232         struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
1233         struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
1234         struct r600_common_context *rctx;
1235         int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
1236
1237         ctx = threaded_context_unwrap_sync(ctx);
1238         rctx = ctx ? (struct r600_common_context*)ctx : NULL;
1239
1240         if (rfence->sdma) {
1241                 if (!rws->fence_wait(rws, rfence->sdma, timeout))
1242                         return false;
1243
1244                 /* Recompute the timeout after waiting. */
1245                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
1246                         int64_t time = os_time_get_nano();
1247                         timeout = abs_timeout > time ? abs_timeout - time : 0;
1248                 }
1249         }
1250
1251         if (!rfence->gfx)
1252                 return true;
1253
1254         /* Flush the gfx IB if it hasn't been flushed yet. */
1255         if (rctx &&
1256             rfence->gfx_unflushed.ctx == rctx &&
1257             rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) {
1258                 rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL);
1259                 rfence->gfx_unflushed.ctx = NULL;
1260
1261                 if (!timeout)
1262                         return false;
1263
1264                 /* Recompute the timeout after all that. */
1265                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
1266                         int64_t time = os_time_get_nano();
1267                         timeout = abs_timeout > time ? abs_timeout - time : 0;
1268                 }
1269         }
1270
1271         return rws->fence_wait(rws, rfence->gfx, timeout);
1272 }
1273
1274 static void r600_query_memory_info(struct pipe_screen *screen,
1275                                    struct pipe_memory_info *info)
1276 {
1277         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
1278         struct radeon_winsys *ws = rscreen->ws;
1279         unsigned vram_usage, gtt_usage;
1280
1281         info->total_device_memory = rscreen->info.vram_size / 1024;
1282         info->total_staging_memory = rscreen->info.gart_size / 1024;
1283
1284         /* The real TTM memory usage is somewhat random, because:
1285          *
1286          * 1) TTM delays freeing memory, because it can only free it after
1287          *    fences expire.
1288          *
1289          * 2) The memory usage can be really low if big VRAM evictions are
1290          *    taking place, but the real usage is well above the size of VRAM.
1291          *
1292          * Instead, return statistics of this process.
1293          */
1294         vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024;
1295         gtt_usage =  ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024;
1296
1297         info->avail_device_memory =
1298                 vram_usage <= info->total_device_memory ?
1299                                 info->total_device_memory - vram_usage : 0;
1300         info->avail_staging_memory =
1301                 gtt_usage <= info->total_staging_memory ?
1302                                 info->total_staging_memory - gtt_usage : 0;
1303
1304         info->device_memory_evicted =
1305                 ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
1306
1307         if (rscreen->info.drm_major == 3 && rscreen->info.drm_minor >= 4)
1308                 info->nr_device_memory_evictions =
1309                         ws->query_value(ws, RADEON_NUM_EVICTIONS);
1310         else
1311                 /* Just return the number of evicted 64KB pages. */
1312                 info->nr_device_memory_evictions = info->device_memory_evicted / 64;
1313 }
1314
1315 struct pipe_resource *si_resource_create_common(struct pipe_screen *screen,
1316                                                 const struct pipe_resource *templ)
1317 {
1318         if (templ->target == PIPE_BUFFER) {
1319                 return si_buffer_create(screen, templ, 256);
1320         } else {
1321                 return si_texture_create(screen, templ);
1322         }
1323 }
1324
1325 bool si_common_screen_init(struct r600_common_screen *rscreen,
1326                            struct radeon_winsys *ws)
1327 {
1328         char family_name[32] = {}, llvm_string[32] = {}, kernel_version[128] = {};
1329         struct utsname uname_data;
1330         const char *chip_name;
1331
1332         ws->query_info(ws, &rscreen->info);
1333         rscreen->ws = ws;
1334
1335         if ((chip_name = r600_get_marketing_name(ws)))
1336                 snprintf(family_name, sizeof(family_name), "%s / ",
1337                          r600_get_family_name(rscreen) + 4);
1338         else
1339                 chip_name = r600_get_family_name(rscreen);
1340
1341         if (uname(&uname_data) == 0)
1342                 snprintf(kernel_version, sizeof(kernel_version),
1343                          " / %s", uname_data.release);
1344
1345         if (HAVE_LLVM > 0) {
1346                 snprintf(llvm_string, sizeof(llvm_string),
1347                          ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
1348                          HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
1349         }
1350
1351         snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
1352                  "%s (%sDRM %i.%i.%i%s%s)",
1353                  chip_name, family_name, rscreen->info.drm_major,
1354                  rscreen->info.drm_minor, rscreen->info.drm_patchlevel,
1355                  kernel_version, llvm_string);
1356
1357         rscreen->b.get_name = r600_get_name;
1358         rscreen->b.get_vendor = r600_get_vendor;
1359         rscreen->b.get_device_vendor = r600_get_device_vendor;
1360         rscreen->b.get_disk_shader_cache = r600_get_disk_shader_cache;
1361         rscreen->b.get_compute_param = r600_get_compute_param;
1362         rscreen->b.get_paramf = r600_get_paramf;
1363         rscreen->b.get_timestamp = r600_get_timestamp;
1364         rscreen->b.fence_finish = r600_fence_finish;
1365         rscreen->b.fence_reference = r600_fence_reference;
1366         rscreen->b.resource_destroy = u_resource_destroy_vtbl;
1367         rscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
1368         rscreen->b.query_memory_info = r600_query_memory_info;
1369         rscreen->b.fence_get_fd = r600_fence_get_fd;
1370
1371         if (rscreen->info.has_hw_decode) {
1372                 rscreen->b.get_video_param = si_vid_get_video_param;
1373                 rscreen->b.is_video_format_supported = si_vid_is_format_supported;
1374         } else {
1375                 rscreen->b.get_video_param = r600_get_video_param;
1376                 rscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
1377         }
1378
1379         si_init_screen_texture_functions(rscreen);
1380         si_init_screen_query_functions(rscreen);
1381
1382         rscreen->family = rscreen->info.family;
1383         rscreen->chip_class = rscreen->info.chip_class;
1384         rscreen->debug_flags |= debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
1385         rscreen->has_rbplus = false;
1386         rscreen->rbplus_allowed = false;
1387
1388         r600_disk_cache_create(rscreen);
1389
1390         slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64);
1391
1392         rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
1393         if (rscreen->force_aniso >= 0) {
1394                 printf("radeon: Forcing anisotropy filter to %ix\n",
1395                        /* round down to a power of two */
1396                        1 << util_logbase2(rscreen->force_aniso));
1397         }
1398
1399         (void) mtx_init(&rscreen->aux_context_lock, mtx_plain);
1400         (void) mtx_init(&rscreen->gpu_load_mutex, mtx_plain);
1401
1402         if (rscreen->debug_flags & DBG(INFO)) {
1403                 printf("pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n",
1404                        rscreen->info.pci_domain, rscreen->info.pci_bus,
1405                        rscreen->info.pci_dev, rscreen->info.pci_func);
1406                 printf("pci_id = 0x%x\n", rscreen->info.pci_id);
1407                 printf("family = %i (%s)\n", rscreen->info.family,
1408                        r600_get_family_name(rscreen));
1409                 printf("chip_class = %i\n", rscreen->info.chip_class);
1410                 printf("pte_fragment_size = %u\n", rscreen->info.pte_fragment_size);
1411                 printf("gart_page_size = %u\n", rscreen->info.gart_page_size);
1412                 printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024));
1413                 printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024));
1414                 printf("vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_vis_size, 1024*1024));
1415                 printf("max_alloc_size = %i MB\n",
1416                        (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024));
1417                 printf("min_alloc_size = %u\n", rscreen->info.min_alloc_size);
1418                 printf("has_dedicated_vram = %u\n", rscreen->info.has_dedicated_vram);
1419                 printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory);
1420                 printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2);
1421                 printf("has_hw_decode = %u\n", rscreen->info.has_hw_decode);
1422                 printf("num_sdma_rings = %i\n", rscreen->info.num_sdma_rings);
1423                 printf("num_compute_rings = %u\n", rscreen->info.num_compute_rings);
1424                 printf("uvd_fw_version = %u\n", rscreen->info.uvd_fw_version);
1425                 printf("vce_fw_version = %u\n", rscreen->info.vce_fw_version);
1426                 printf("me_fw_version = %i\n", rscreen->info.me_fw_version);
1427                 printf("me_fw_feature = %i\n", rscreen->info.me_fw_feature);
1428                 printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version);
1429                 printf("pfp_fw_feature = %i\n", rscreen->info.pfp_fw_feature);
1430                 printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version);
1431                 printf("ce_fw_feature = %i\n", rscreen->info.ce_fw_feature);
1432                 printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config);
1433                 printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq);
1434                 printf("tcc_cache_line_size = %u\n", rscreen->info.tcc_cache_line_size);
1435                 printf("drm = %i.%i.%i\n", rscreen->info.drm_major,
1436                        rscreen->info.drm_minor, rscreen->info.drm_patchlevel);
1437                 printf("has_userptr = %i\n", rscreen->info.has_userptr);
1438                 printf("has_syncobj = %u\n", rscreen->info.has_syncobj);
1439                 printf("has_sync_file = %u\n", rscreen->info.has_sync_file);
1440
1441                 printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes);
1442                 printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock);
1443                 printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units);
1444                 printf("max_se = %i\n", rscreen->info.max_se);
1445                 printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se);
1446
1447                 printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map);
1448                 printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid);
1449                 printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks);
1450                 printf("num_render_backends = %i\n", rscreen->info.num_render_backends);
1451                 printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
1452                 printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
1453                 printf("enabled_rb_mask = 0x%x\n", rscreen->info.enabled_rb_mask);
1454                 printf("max_alignment = %u\n", (unsigned)rscreen->info.max_alignment);
1455         }
1456         return true;
1457 }
1458
1459 void si_destroy_common_screen(struct r600_common_screen *rscreen)
1460 {
1461         si_perfcounters_destroy(rscreen);
1462         si_gpu_load_kill_thread(rscreen);
1463
1464         mtx_destroy(&rscreen->gpu_load_mutex);
1465         mtx_destroy(&rscreen->aux_context_lock);
1466         rscreen->aux_context->destroy(rscreen->aux_context);
1467
1468         slab_destroy_parent(&rscreen->pool_transfers);
1469
1470         disk_cache_destroy(rscreen->disk_shader_cache);
1471         rscreen->ws->destroy(rscreen->ws);
1472         FREE(rscreen);
1473 }
1474
1475 bool si_can_dump_shader(struct r600_common_screen *rscreen,
1476                         unsigned processor)
1477 {
1478         return rscreen->debug_flags & (1 << processor);
1479 }
1480
1481 bool si_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor)
1482 {
1483         return (rscreen->debug_flags & DBG(CHECK_IR)) ||
1484                si_can_dump_shader(rscreen, processor);
1485 }
1486
1487 void si_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
1488                             uint64_t offset, uint64_t size, unsigned value)
1489 {
1490         struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context;
1491
1492         mtx_lock(&rscreen->aux_context_lock);
1493         rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
1494         rscreen->aux_context->flush(rscreen->aux_context, NULL, 0);
1495         mtx_unlock(&rscreen->aux_context_lock);
1496 }