src/gallium/drivers/radeon/r600_pipe_common.c

   1 /*
   2  * Copyright 2013 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors: Marek Olšák <maraeo@gmail.com>
  24  *
  25  */
  26
  27 #include "r600_pipe_common.h"
  28 #include "r600_cs.h"
  29 #include "tgsi/tgsi_parse.h"
  30 #include "util/list.h"
  31 #include "util/u_draw_quad.h"
  32 #include "util/u_memory.h"
  33 #include "util/u_format_s3tc.h"
  34 #include "util/u_upload_mgr.h"
  35 #include "os/os_time.h"
  36 #include "vl/vl_decoder.h"
  37 #include "vl/vl_video_buffer.h"
  38 #include "radeon/radeon_video.h"
  39 #include "amd/common/sid.h"
  40 #include <inttypes.h>
  41 #include <sys/utsname.h>
  42
  43 #include <llvm-c/TargetMachine.h>
  44
  45
  46 struct r600_multi_fence {
  47         struct pipe_reference reference;
  48         struct pipe_fence_handle *gfx;
  49         struct pipe_fence_handle *sdma;
  50
  51         /* If the context wasn't flushed at fence creation, this is non-NULL. */
  52         struct {
  53                 struct r600_common_context *ctx;
  54                 unsigned ib_index;
  55         } gfx_unflushed;
  56 };
  57
  58 /*
  59  * shader binary helpers.
  60  */
  61 void si_radeon_shader_binary_init(struct ac_shader_binary *b)
  62 {
  63         memset(b, 0, sizeof(*b));
  64 }
  65
  66 void si_radeon_shader_binary_clean(struct ac_shader_binary *b)
  67 {
  68         if (!b)
  69                 return;
  70         FREE(b->code);
  71         FREE(b->config);
  72         FREE(b->rodata);
  73         FREE(b->global_symbol_offsets);
  74         FREE(b->relocs);
  75         FREE(b->disasm_string);
  76         FREE(b->llvm_ir_string);
  77 }
  78
  79 /*
  80  * pipe_context
  81  */
  82
  83 /**
  84  * Write an EOP event.
  85  *
  86  * \param event         EVENT_TYPE_*
  87  * \param event_flags   Optional cache flush flags (TC)
  88  * \param data_sel      1 = fence, 3 = timestamp
  89  * \param buf           Buffer
  90  * \param va            GPU address
  91  * \param old_value     Previous fence value (for a bug workaround)
  92  * \param new_value     Fence value to write for this event.
  93  */
  94 void si_gfx_write_event_eop(struct r600_common_context *ctx,
  95                             unsigned event, unsigned event_flags,
  96                             unsigned data_sel,
  97                             struct r600_resource *buf, uint64_t va,
  98                             uint32_t new_fence, unsigned query_type)
  99 {
 100         struct radeon_winsys_cs *cs = ctx->gfx.cs;
 101         unsigned op = EVENT_TYPE(event) |
 102                       EVENT_INDEX(5) |
 103                       event_flags;
 104         unsigned sel = EOP_DATA_SEL(data_sel);
 105
 106         /* Wait for write confirmation before writing data, but don't send
 107          * an interrupt. */
 108         if (data_sel != EOP_DATA_SEL_DISCARD)
 109                 sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);
 110
 111         if (ctx->chip_class >= GFX9) {
 112                 /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
 113                  * counters) must immediately precede every timestamp event to
 114                  * prevent a GPU hang on GFX9.
 115                  *
 116                  * Occlusion queries don't need to do it here, because they
 117                  * always do ZPASS_DONE before the timestamp.
 118                  */
 119                 if (ctx->chip_class == GFX9 &&
 120                     query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
 121                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
 122                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
 123                         struct r600_resource *scratch = ctx->eop_bug_scratch;
 124
 125                         assert(16 * ctx->screen->info.num_render_backends <=
 126                                scratch->b.b.width0);
 127                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 128                         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
 129                         radeon_emit(cs, scratch->gpu_address);
 130                         radeon_emit(cs, scratch->gpu_address >> 32);
 131
 132                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
 133                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 134                 }
 135
 136                 radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
 137                 radeon_emit(cs, op);
 138                 radeon_emit(cs, sel);
 139                 radeon_emit(cs, va);            /* address lo */
 140                 radeon_emit(cs, va >> 32);      /* address hi */
 141                 radeon_emit(cs, new_fence);     /* immediate data lo */
 142                 radeon_emit(cs, 0); /* immediate data hi */
 143                 radeon_emit(cs, 0); /* unused */
 144         } else {
 145                 if (ctx->chip_class == CIK ||
 146                     ctx->chip_class == VI) {
 147                         struct r600_resource *scratch = ctx->eop_bug_scratch;
 148                         uint64_t va = scratch->gpu_address;
 149
 150                         /* Two EOP events are required to make all engines go idle
 151                          * (and optional cache flushes executed) before the timestamp
 152                          * is written.
 153                          */
 154                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 155                         radeon_emit(cs, op);
 156                         radeon_emit(cs, va);
 157                         radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 158                         radeon_emit(cs, 0); /* immediate data */
 159                         radeon_emit(cs, 0); /* unused */
 160
 161                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
 162                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 163                 }
 164
 165                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 166                 radeon_emit(cs, op);
 167                 radeon_emit(cs, va);
 168                 radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 169                 radeon_emit(cs, new_fence); /* immediate data */
 170                 radeon_emit(cs, 0); /* unused */
 171         }
 172
 173         if (buf) {
 174                 radeon_add_to_buffer_list(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
 175                                           RADEON_PRIO_QUERY);
 176         }
 177 }
 178
 179 unsigned si_gfx_write_fence_dwords(struct r600_common_screen *screen)
 180 {
 181         unsigned dwords = 6;
 182
 183         if (screen->chip_class == CIK ||
 184             screen->chip_class == VI)
 185                 dwords *= 2;
 186
 187         if (!screen->info.has_virtual_memory)
 188                 dwords += 2;
 189
 190         return dwords;
 191 }
 192
 193 void si_gfx_wait_fence(struct r600_common_context *ctx,
 194                        uint64_t va, uint32_t ref, uint32_t mask)
 195 {
 196         struct radeon_winsys_cs *cs = ctx->gfx.cs;
 197
 198         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 199         radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
 200         radeon_emit(cs, va);
 201         radeon_emit(cs, va >> 32);
 202         radeon_emit(cs, ref); /* reference value */
 203         radeon_emit(cs, mask); /* mask */
 204         radeon_emit(cs, 4); /* poll interval */
 205 }
 206
 207 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 208 {
 209         struct radeon_winsys_cs *cs = rctx->dma.cs;
 210
 211         /* NOP waits for idle on Evergreen and later. */
 212         if (rctx->chip_class >= CIK)
 213                 radeon_emit(cs, 0x00000000); /* NOP */
 214         else
 215                 radeon_emit(cs, 0xf0000000); /* NOP */
 216 }
 217
 218 void si_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 219                        struct r600_resource *dst, struct r600_resource *src)
 220 {
 221         uint64_t vram = ctx->dma.cs->used_vram;
 222         uint64_t gtt = ctx->dma.cs->used_gart;
 223
 224         if (dst) {
 225                 vram += dst->vram_usage;
 226                 gtt += dst->gart_usage;
 227         }
 228         if (src) {
 229                 vram += src->vram_usage;
 230                 gtt += src->gart_usage;
 231         }
 232
 233         /* Flush the GFX IB if DMA depends on it. */
 234         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 235             ((dst &&
 236               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
 237                                                RADEON_USAGE_READWRITE)) ||
 238              (src &&
 239               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
 240                                                RADEON_USAGE_WRITE))))
 241                 ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 242
 243         /* Flush if there's not enough space, or if the memory usage per IB
 244          * is too large.
 245          *
 246          * IBs using too little memory are limited by the IB submission overhead.
 247          * IBs using too much memory are limited by the kernel/TTM overhead.
 248          * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 249          *
 250          * This heuristic makes sure that DMA requests are executed
 251          * very soon after the call is made and lowers memory usage.
 252          * It improves texture upload performance by keeping the DMA
 253          * engine busy while uploads are being submitted.
 254          */
 255         num_dw++; /* for emit_wait_idle below */
 256         if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
 257             ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
 258             !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
 259                 ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 260                 assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
 261         }
 262
 263         /* Wait for idle if either buffer has been used in the IB before to
 264          * prevent read-after-write hazards.
 265          */
 266         if ((dst &&
 267              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
 268                                               RADEON_USAGE_READWRITE)) ||
 269             (src &&
 270              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
 271                                               RADEON_USAGE_WRITE)))
 272                 r600_dma_emit_wait_idle(ctx);
 273
 274         /* If GPUVM is not supported, the CS checker needs 2 entries
 275          * in the buffer list per packet, which has to be done manually.
 276          */
 277         if (ctx->screen->info.has_virtual_memory) {
 278                 if (dst)
 279                         radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
 280                                                   RADEON_USAGE_WRITE,
 281                                                   RADEON_PRIO_SDMA_BUFFER);
 282                 if (src)
 283                         radeon_add_to_buffer_list(ctx, &ctx->dma, src,
 284                                                   RADEON_USAGE_READ,
 285                                                   RADEON_PRIO_SDMA_BUFFER);
 286         }
 287
 288         /* this function is called before all DMA calls, so increment this. */
 289         ctx->num_dma_calls++;
 290 }
 291
 292 static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 293 {
 294 }
 295
 296 void si_preflush_suspend_features(struct r600_common_context *ctx)
 297 {
 298         /* suspend queries */
 299         if (!LIST_IS_EMPTY(&ctx->active_queries))
 300                 si_suspend_queries(ctx);
 301 }
 302
 303 void si_postflush_resume_features(struct r600_common_context *ctx)
 304 {
 305         /* resume queries */
 306         if (!LIST_IS_EMPTY(&ctx->active_queries))
 307                 si_resume_queries(ctx);
 308 }
 309
 310 static void r600_add_fence_dependency(struct r600_common_context *rctx,
 311                                       struct pipe_fence_handle *fence)
 312 {
 313         struct radeon_winsys *ws = rctx->ws;
 314
 315         if (rctx->dma.cs)
 316                 ws->cs_add_fence_dependency(rctx->dma.cs, fence);
 317         ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
 318 }
 319
 320 static void r600_fence_server_sync(struct pipe_context *ctx,
 321                                    struct pipe_fence_handle *fence)
 322 {
 323         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 324         struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
 325
 326         /* Only amdgpu needs to handle fence dependencies (for fence imports).
 327          * radeon synchronizes all rings by default and will not implement
 328          * fence imports.
 329          */
 330         if (rctx->screen->info.drm_major == 2)
 331                 return;
 332
 333         /* Only imported fences need to be handled by fence_server_sync,
 334          * because the winsys handles synchronizations automatically for BOs
 335          * within the process.
 336          *
 337          * Simply skip unflushed fences here, and the winsys will drop no-op
 338          * dependencies (i.e. dependencies within the same ring).
 339          */
 340         if (rfence->gfx_unflushed.ctx)
 341                 return;
 342
 343         /* All unflushed commands will not start execution before
 344          * this fence dependency is signalled.
 345          *
 346          * Should we flush the context to allow more GPU parallelism?
 347          */
 348         if (rfence->sdma)
 349                 r600_add_fence_dependency(rctx, rfence->sdma);
 350         if (rfence->gfx)
 351                 r600_add_fence_dependency(rctx, rfence->gfx);
 352 }
 353
 354 static void r600_flush_from_st(struct pipe_context *ctx,
 355                                struct pipe_fence_handle **fence,
 356                                unsigned flags)
 357 {
 358         struct pipe_screen *screen = ctx->screen;
 359         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 360         struct radeon_winsys *ws = rctx->ws;
 361         struct pipe_fence_handle *gfx_fence = NULL;
 362         struct pipe_fence_handle *sdma_fence = NULL;
 363         bool deferred_fence = false;
 364         unsigned rflags = RADEON_FLUSH_ASYNC;
 365
 366         if (flags & PIPE_FLUSH_END_OF_FRAME)
 367                 rflags |= RADEON_FLUSH_END_OF_FRAME;
 368
 369         /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
 370         if (rctx->dma.cs)
 371                 rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
 372
 373         if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
 374                 if (fence)
 375                         ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
 376                 if (!(flags & PIPE_FLUSH_DEFERRED))
 377                         ws->cs_sync_flush(rctx->gfx.cs);
 378         } else {
 379                 /* Instead of flushing, create a deferred fence. Constraints:
 380                  * - The state tracker must allow a deferred flush.
 381                  * - The state tracker must request a fence.
 382                  * Thread safety in fence_finish must be ensured by the state tracker.
 383                  */
 384                 if (flags & PIPE_FLUSH_DEFERRED && fence) {
 385                         gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
 386                         deferred_fence = true;
 387                 } else {
 388                         rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
 389                 }
 390         }
 391
 392         /* Both engines can signal out of order, so we need to keep both fences. */
 393         if (fence) {
 394                 struct r600_multi_fence *multi_fence =
 395                         CALLOC_STRUCT(r600_multi_fence);
 396                 if (!multi_fence) {
 397                         ws->fence_reference(&sdma_fence, NULL);
 398                         ws->fence_reference(&gfx_fence, NULL);
 399                         goto finish;
 400                 }
 401
 402                 multi_fence->reference.count = 1;
 403                 /* If both fences are NULL, fence_finish will always return true. */
 404                 multi_fence->gfx = gfx_fence;
 405                 multi_fence->sdma = sdma_fence;
 406
 407                 if (deferred_fence) {
 408                         multi_fence->gfx_unflushed.ctx = rctx;
 409                         multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
 410                 }
 411
 412                 screen->fence_reference(screen, fence, NULL);
 413                 *fence = (struct pipe_fence_handle*)multi_fence;
 414         }
 415 finish:
 416         if (!(flags & PIPE_FLUSH_DEFERRED)) {
 417                 if (rctx->dma.cs)
 418                         ws->cs_sync_flush(rctx->dma.cs);
 419                 ws->cs_sync_flush(rctx->gfx.cs);
 420         }
 421 }
 422
 423 static void r600_flush_dma_ring(void *ctx, unsigned flags,
 424                                 struct pipe_fence_handle **fence)
 425 {
 426         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 427         struct radeon_winsys_cs *cs = rctx->dma.cs;
 428         struct radeon_saved_cs saved;
 429         bool check_vm =
 430                 (rctx->screen->debug_flags & DBG(CHECK_VM)) &&
 431                 rctx->check_vm_faults;
 432
 433         if (!radeon_emitted(cs, 0)) {
 434                 if (fence)
 435                         rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 436                 return;
 437         }
 438
 439         if (check_vm)
 440                 si_save_cs(rctx->ws, cs, &saved, true);
 441
 442         rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
 443         if (fence)
 444                 rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 445
 446         if (check_vm) {
 447                 /* Use conservative timeout 800ms, after which we won't wait any
 448                  * longer and assume the GPU is hung.
 449                  */
 450                 rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);
 451
 452                 rctx->check_vm_faults(rctx, &saved, RING_DMA);
 453                 si_clear_saved_cs(&saved);
 454         }
 455 }
 456
 457 /**
 458  * Store a linearized copy of all chunks of \p cs together with the buffer
 459  * list in \p saved.
 460  */
 461 void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
 462                 struct radeon_saved_cs *saved, bool get_buffer_list)
 463 {
 464         uint32_t *buf;
 465         unsigned i;
 466
 467         /* Save the IB chunks. */
 468         saved->num_dw = cs->prev_dw + cs->current.cdw;
 469         saved->ib = MALLOC(4 * saved->num_dw);
 470         if (!saved->ib)
 471                 goto oom;
 472
 473         buf = saved->ib;
 474         for (i = 0; i < cs->num_prev; ++i) {
 475                 memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
 476                 buf += cs->prev[i].cdw;
 477         }
 478         memcpy(buf, cs->current.buf, cs->current.cdw * 4);
 479
 480         if (!get_buffer_list)
 481                 return;
 482
 483         /* Save the buffer list. */
 484         saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
 485         saved->bo_list = CALLOC(saved->bo_count,
 486                                 sizeof(saved->bo_list[0]));
 487         if (!saved->bo_list) {
 488                 FREE(saved->ib);
 489                 goto oom;
 490         }
 491         ws->cs_get_buffer_list(cs, saved->bo_list);
 492
 493         return;
 494
 495 oom:
 496         fprintf(stderr, "%s: out of memory\n", __func__);
 497         memset(saved, 0, sizeof(*saved));
 498 }
 499
 500 void si_clear_saved_cs(struct radeon_saved_cs *saved)
 501 {
 502         FREE(saved->ib);
 503         FREE(saved->bo_list);
 504
 505         memset(saved, 0, sizeof(*saved));
 506 }
 507
 508 static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 509 {
 510         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 511         unsigned latest = rctx->ws->query_value(rctx->ws,
 512                                                 RADEON_GPU_RESET_COUNTER);
 513
 514         if (rctx->gpu_reset_counter == latest)
 515                 return PIPE_NO_RESET;
 516
 517         rctx->gpu_reset_counter = latest;
 518         return PIPE_UNKNOWN_CONTEXT_RESET;
 519 }
 520
 521 static void r600_set_debug_callback(struct pipe_context *ctx,
 522                                     const struct pipe_debug_callback *cb)
 523 {
 524         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 525
 526         if (cb)
 527                 rctx->debug = *cb;
 528         else
 529                 memset(&rctx->debug, 0, sizeof(rctx->debug));
 530 }
 531
 532 static void r600_set_device_reset_callback(struct pipe_context *ctx,
 533                                            const struct pipe_device_reset_callback *cb)
 534 {
 535         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 536
 537         if (cb)
 538                 rctx->device_reset_callback = *cb;
 539         else
 540                 memset(&rctx->device_reset_callback, 0,
 541                        sizeof(rctx->device_reset_callback));
 542 }
 543
 544 bool si_check_device_reset(struct r600_common_context *rctx)
 545 {
 546         enum pipe_reset_status status;
 547
 548         if (!rctx->device_reset_callback.reset)
 549                 return false;
 550
 551         if (!rctx->b.get_device_reset_status)
 552                 return false;
 553
 554         status = rctx->b.get_device_reset_status(&rctx->b);
 555         if (status == PIPE_NO_RESET)
 556                 return false;
 557
 558         rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
 559         return true;
 560 }
 561
 562 static void r600_dma_clear_buffer_fallback(struct pipe_context *ctx,
 563                                            struct pipe_resource *dst,
 564                                            uint64_t offset, uint64_t size,
 565                                            unsigned value)
 566 {
 567         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 568
 569         rctx->clear_buffer(ctx, dst, offset, size, value, R600_COHERENCY_NONE);
 570 }
 571
 572 static bool r600_resource_commit(struct pipe_context *pctx,
 573                                  struct pipe_resource *resource,
 574                                  unsigned level, struct pipe_box *box,
 575                                  bool commit)
 576 {
 577         struct r600_common_context *ctx = (struct r600_common_context *)pctx;
 578         struct r600_resource *res = r600_resource(resource);
 579
 580         /*
 581          * Since buffer commitment changes cannot be pipelined, we need to
 582          * (a) flush any pending commands that refer to the buffer we're about
 583          *     to change, and
 584          * (b) wait for threaded submit to finish, including those that were
 585          *     triggered by some other, earlier operation.
 586          */
 587         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 588             ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
 589                                              res->buf, RADEON_USAGE_READWRITE)) {
 590                 ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 591         }
 592         if (radeon_emitted(ctx->dma.cs, 0) &&
 593             ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
 594                                              res->buf, RADEON_USAGE_READWRITE)) {
 595                 ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 596         }
 597
 598         ctx->ws->cs_sync_flush(ctx->dma.cs);
 599         ctx->ws->cs_sync_flush(ctx->gfx.cs);
 600
 601         assert(resource->target == PIPE_BUFFER);
 602
 603         return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
 604 }
 605
 606 bool si_common_context_init(struct r600_common_context *rctx,
 607                             struct r600_common_screen *rscreen,
 608                             unsigned context_flags)
 609 {
 610         slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers);
 611         slab_create_child(&rctx->pool_transfers_unsync, &rscreen->pool_transfers);
 612
 613         rctx->screen = rscreen;
 614         rctx->ws = rscreen->ws;
 615         rctx->family = rscreen->family;
 616         rctx->chip_class = rscreen->chip_class;
 617
 618         rctx->b.invalidate_resource = si_invalidate_resource;
 619         rctx->b.resource_commit = r600_resource_commit;
 620         rctx->b.transfer_map = u_transfer_map_vtbl;
 621         rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
 622         rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
 623         rctx->b.texture_subdata = u_default_texture_subdata;
 624         rctx->b.memory_barrier = r600_memory_barrier;
 625         rctx->b.flush = r600_flush_from_st;
 626         rctx->b.set_debug_callback = r600_set_debug_callback;
 627         rctx->b.fence_server_sync = r600_fence_server_sync;
 628         rctx->dma_clear_buffer = r600_dma_clear_buffer_fallback;
 629         rctx->b.buffer_subdata = si_buffer_subdata;
 630
 631         if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 632                 rctx->b.get_device_reset_status = r600_get_reset_status;
 633                 rctx->gpu_reset_counter =
 634                         rctx->ws->query_value(rctx->ws,
 635                                               RADEON_GPU_RESET_COUNTER);
 636         }
 637
 638         rctx->b.set_device_reset_callback = r600_set_device_reset_callback;
 639
 640         si_init_context_texture_functions(rctx);
 641         si_init_query_functions(rctx);
 642
 643         if (rctx->chip_class == CIK ||
 644             rctx->chip_class == VI ||
 645             rctx->chip_class == GFX9) {
 646                 rctx->eop_bug_scratch = (struct r600_resource*)
 647                         pipe_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
 648                                            16 * rscreen->info.num_render_backends);
 649                 if (!rctx->eop_bug_scratch)
 650                         return false;
 651         }
 652
 653         rctx->allocator_zeroed_memory =
 654                 u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
 655                                       0, PIPE_USAGE_DEFAULT, 0, true);
 656         if (!rctx->allocator_zeroed_memory)
 657                 return false;
 658
 659         rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
 660                                                   0, PIPE_USAGE_STREAM);
 661         if (!rctx->b.stream_uploader)
 662                 return false;
 663
 664         rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
 665                                                  0, PIPE_USAGE_DEFAULT);
 666         if (!rctx->b.const_uploader)
 667                 return false;
 668
 669         rctx->ctx = rctx->ws->ctx_create(rctx->ws);
 670         if (!rctx->ctx)
 671                 return false;
 672
 673         if (rscreen->info.num_sdma_rings && !(rscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
 674                 rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 675                                                    r600_flush_dma_ring,
 676                                                    rctx);
 677                 rctx->dma.flush = r600_flush_dma_ring;
 678         }
 679
 680         return true;
 681 }
 682
 683 void si_common_context_cleanup(struct r600_common_context *rctx)
 684 {
 685         unsigned i,j;
 686
 687         /* Release DCC stats. */
 688         for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
 689                 assert(!rctx->dcc_stats[i].query_active);
 690
 691                 for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
 692                         if (rctx->dcc_stats[i].ps_stats[j])
 693                                 rctx->b.destroy_query(&rctx->b,
 694                                                       rctx->dcc_stats[i].ps_stats[j]);
 695
 696                 r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
 697         }
 698
 699         if (rctx->query_result_shader)
 700                 rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);
 701
 702         if (rctx->gfx.cs)
 703                 rctx->ws->cs_destroy(rctx->gfx.cs);
 704         if (rctx->dma.cs)
 705                 rctx->ws->cs_destroy(rctx->dma.cs);
 706         if (rctx->ctx)
 707                 rctx->ws->ctx_destroy(rctx->ctx);
 708
 709         if (rctx->b.stream_uploader)
 710                 u_upload_destroy(rctx->b.stream_uploader);
 711         if (rctx->b.const_uploader)
 712                 u_upload_destroy(rctx->b.const_uploader);
 713
 714         slab_destroy_child(&rctx->pool_transfers);
 715         slab_destroy_child(&rctx->pool_transfers_unsync);
 716
 717         if (rctx->allocator_zeroed_memory) {
 718                 u_suballocator_destroy(rctx->allocator_zeroed_memory);
 719         }
 720         rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
 721         rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 722         r600_resource_reference(&rctx->eop_bug_scratch, NULL);
 723 }
 724
 725 /*
 726  * pipe_screen
 727  */
 728
 729 static const struct debug_named_value common_debug_options[] = {
 730         /* logging */
 731         { "tex", DBG(TEX), "Print texture info" },
 732         { "nir", DBG(NIR), "Enable experimental NIR shaders" },
 733         { "compute", DBG(COMPUTE), "Print compute info" },
 734         { "vm", DBG(VM), "Print virtual addresses when creating resources" },
 735         { "info", DBG(INFO), "Print driver information" },
 736
 737         /* shaders */
 738         { "vs", DBG(VS), "Print vertex shaders" },
 739         { "gs", DBG(GS), "Print geometry shaders" },
 740         { "ps", DBG(PS), "Print pixel shaders" },
 741         { "cs", DBG(CS), "Print compute shaders" },
 742         { "tcs", DBG(TCS), "Print tessellation control shaders" },
 743         { "tes", DBG(TES), "Print tessellation evaluation shaders" },
 744         { "noir", DBG(NO_IR), "Don't print the LLVM IR"},
 745         { "notgsi", DBG(NO_TGSI), "Don't print the TGSI"},
 746         { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
 747         { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" },
 748         { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
 749         { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." },
 750
 751         { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." },
 752         { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." },
 753         { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." },
 754         { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." },
 755
 756         /* features */
 757         { "nodma", DBG(NO_ASYNC_DMA), "Disable asynchronous DMA" },
 758         { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" },
 759         /* GL uses the word INVALIDATE, gallium uses the word DISCARD */
 760         { "noinvalrange", DBG(NO_DISCARD_RANGE), "Disable handling of INVALIDATE_RANGE map flags" },
 761         { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" },
 762         { "notiling", DBG(NO_TILING), "Disable tiling" },
 763         { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
 764         { "forcedma", DBG(FORCE_DMA), "Use asynchronous DMA for all operations when possible." },
 765         { "precompile", DBG(PRECOMPILE), "Compile one shader variant at shader creation." },
 766         { "nowc", DBG(NO_WC), "Disable GTT write combining" },
 767         { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." },
 768         { "nodcc", DBG(NO_DCC), "Disable DCC." },
 769         { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
 770         { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." },
 771         { "sisched", DBG(SI_SCHED), "Enable LLVM SI Machine Instruction Scheduler." },
 772         { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" },
 773         { "unsafemath", DBG(UNSAFE_MATH), "Enable unsafe math shader optimizations" },
 774         { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" },
 775         { "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
 776         { "nodfsm", DBG(NO_DFSM), "Disable DFSM." },
 777         { "dpbb", DBG(DPBB), "Enable DPBB." },
 778         { "dfsm", DBG(DFSM), "Enable DFSM." },
 779         { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
 780
 781         DEBUG_NAMED_VALUE_END /* must be last */
 782 };
 783
 784 static const char* r600_get_vendor(struct pipe_screen* pscreen)
 785 {
 786         return "X.Org";
 787 }
 788
 789 static const char* r600_get_device_vendor(struct pipe_screen* pscreen)
 790 {
 791         return "AMD";
 792 }
 793
 794 static const char *r600_get_marketing_name(struct radeon_winsys *ws)
 795 {
 796         if (!ws->get_chip_name)
 797                 return NULL;
 798         return ws->get_chip_name(ws);
 799 }
 800
 801 static const char *r600_get_family_name(const struct r600_common_screen *rscreen)
 802 {
 803         switch (rscreen->info.family) {
 804         case CHIP_TAHITI: return "AMD TAHITI";
 805         case CHIP_PITCAIRN: return "AMD PITCAIRN";
 806         case CHIP_VERDE: return "AMD CAPE VERDE";
 807         case CHIP_OLAND: return "AMD OLAND";
 808         case CHIP_HAINAN: return "AMD HAINAN";
 809         case CHIP_BONAIRE: return "AMD BONAIRE";
 810         case CHIP_KAVERI: return "AMD KAVERI";
 811         case CHIP_KABINI: return "AMD KABINI";
 812         case CHIP_HAWAII: return "AMD HAWAII";
 813         case CHIP_MULLINS: return "AMD MULLINS";
 814         case CHIP_TONGA: return "AMD TONGA";
 815         case CHIP_ICELAND: return "AMD ICELAND";
 816         case CHIP_CARRIZO: return "AMD CARRIZO";
 817         case CHIP_FIJI: return "AMD FIJI";
 818         case CHIP_POLARIS10: return "AMD POLARIS10";
 819         case CHIP_POLARIS11: return "AMD POLARIS11";
 820         case CHIP_POLARIS12: return "AMD POLARIS12";
 821         case CHIP_STONEY: return "AMD STONEY";
 822         case CHIP_VEGA10: return "AMD VEGA10";
 823         case CHIP_RAVEN: return "AMD RAVEN";
 824         default: return "AMD unknown";
 825         }
 826 }
 827
 828 static void r600_disk_cache_create(struct r600_common_screen *rscreen)
 829 {
 830         /* Don't use the cache if shader dumping is enabled. */
 831         if (rscreen->debug_flags & DBG_ALL_SHADERS)
 832                 return;
 833
 834         uint32_t mesa_timestamp;
 835         if (disk_cache_get_function_timestamp(r600_disk_cache_create,
 836                                               &mesa_timestamp)) {
 837                 char *timestamp_str;
 838                 int res = -1;
 839                 uint32_t llvm_timestamp;
 840
 841                 if (disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo,
 842                                                       &llvm_timestamp)) {
 843                         res = asprintf(&timestamp_str, "%u_%u",
 844                                        mesa_timestamp, llvm_timestamp);
 845                 }
 846
 847                 if (res != -1) {
 848                         /* These flags affect shader compilation. */
 849                         uint64_t shader_debug_flags =
 850                                 rscreen->debug_flags &
 851                                 (DBG(FS_CORRECT_DERIVS_AFTER_KILL) |
 852                                  DBG(SI_SCHED) |
 853                                  DBG(UNSAFE_MATH));
 854
 855                         rscreen->disk_shader_cache =
 856                                 disk_cache_create(r600_get_family_name(rscreen),
 857                                                   timestamp_str,
 858                                                   shader_debug_flags);
 859                         free(timestamp_str);
 860                 }
 861         }
 862 }
 863
 864 static struct disk_cache *r600_get_disk_shader_cache(struct pipe_screen *pscreen)
 865 {
 866         struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
 867         return rscreen->disk_shader_cache;
 868 }
 869
 870 static const char* r600_get_name(struct pipe_screen* pscreen)
 871 {
 872         struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
 873
 874         return rscreen->renderer_string;
 875 }
 876
 877 static float r600_get_paramf(struct pipe_screen* pscreen,
 878                              enum pipe_capf param)
 879 {
 880         switch (param) {
 881         case PIPE_CAPF_MAX_LINE_WIDTH:
 882         case PIPE_CAPF_MAX_LINE_WIDTH_AA:
 883         case PIPE_CAPF_MAX_POINT_WIDTH:
 884         case PIPE_CAPF_MAX_POINT_WIDTH_AA:
 885                 return 8192.0f;
 886         case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
 887                 return 16.0f;
 888         case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
 889                 return 16.0f;
 890         case PIPE_CAPF_GUARD_BAND_LEFT:
 891         case PIPE_CAPF_GUARD_BAND_TOP:
 892         case PIPE_CAPF_GUARD_BAND_RIGHT:
 893         case PIPE_CAPF_GUARD_BAND_BOTTOM:
 894                 return 0.0f;
 895         }
 896         return 0.0f;
 897 }
 898
 899 static int r600_get_video_param(struct pipe_screen *screen,
 900                                 enum pipe_video_profile profile,
 901                                 enum pipe_video_entrypoint entrypoint,
 902                                 enum pipe_video_cap param)
 903 {
 904         switch (param) {
 905         case PIPE_VIDEO_CAP_SUPPORTED:
 906                 return vl_profile_supported(screen, profile, entrypoint);
 907         case PIPE_VIDEO_CAP_NPOT_TEXTURES:
 908                 return 1;
 909         case PIPE_VIDEO_CAP_MAX_WIDTH:
 910         case PIPE_VIDEO_CAP_MAX_HEIGHT:
 911                 return vl_video_buffer_max_size(screen);
 912         case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 913                 return PIPE_FORMAT_NV12;
 914         case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
 915                 return false;
 916         case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
 917                 return false;
 918         case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
 919                 return true;
 920         case PIPE_VIDEO_CAP_MAX_LEVEL:
 921                 return vl_level_supported(screen, profile);
 922         default:
 923                 return 0;
 924         }
 925 }
 926
 927 const char *si_get_llvm_processor_name(enum radeon_family family)
 928 {
 929         switch (family) {
 930         case CHIP_TAHITI: return "tahiti";
 931         case CHIP_PITCAIRN: return "pitcairn";
 932         case CHIP_VERDE: return "verde";
 933         case CHIP_OLAND: return "oland";
 934         case CHIP_HAINAN: return "hainan";
 935         case CHIP_BONAIRE: return "bonaire";
 936         case CHIP_KABINI: return "kabini";
 937         case CHIP_KAVERI: return "kaveri";
 938         case CHIP_HAWAII: return "hawaii";
 939         case CHIP_MULLINS:
 940                 return "mullins";
 941         case CHIP_TONGA: return "tonga";
 942         case CHIP_ICELAND: return "iceland";
 943         case CHIP_CARRIZO: return "carrizo";
 944         case CHIP_FIJI:
 945                 return "fiji";
 946         case CHIP_STONEY:
 947                 return "stoney";
 948         case CHIP_POLARIS10:
 949                 return "polaris10";
 950         case CHIP_POLARIS11:
 951         case CHIP_POLARIS12: /* same as polaris11 */
 952                 return "polaris11";
 953         case CHIP_VEGA10:
 954         case CHIP_RAVEN:
 955                 return "gfx900";
 956         default:
 957                 return "";
 958         }
 959 }
 960
 961 static unsigned get_max_threads_per_block(struct r600_common_screen *screen,
 962                                           enum pipe_shader_ir ir_type)
 963 {
 964         if (ir_type != PIPE_SHADER_IR_TGSI)
 965                 return 256;
 966
 967         /* Only 16 waves per thread-group on gfx9. */
 968         if (screen->chip_class >= GFX9)
 969                 return 1024;
 970
 971         /* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice
 972          * round number.
 973          */
 974         return 2048;
 975 }
 976
 977 static int r600_get_compute_param(struct pipe_screen *screen,
 978         enum pipe_shader_ir ir_type,
 979         enum pipe_compute_cap param,
 980         void *ret)
 981 {
 982         struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
 983
 984         //TODO: select these params by asic
 985         switch (param) {
 986         case PIPE_COMPUTE_CAP_IR_TARGET: {
 987                 const char *gpu;
 988                 const char *triple;
 989
 990                 if (HAVE_LLVM < 0x0400)
 991                         triple = "amdgcn--";
 992                 else
 993                         triple = "amdgcn-mesa-mesa3d";
 994
 995                 gpu = si_get_llvm_processor_name(rscreen->family);
 996                 if (ret) {
 997                         sprintf(ret, "%s-%s", gpu, triple);
 998                 }
 999                 /* +2 for dash and terminating NIL byte */
1000                 return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
1001         }
1002         case PIPE_COMPUTE_CAP_GRID_DIMENSION:
1003                 if (ret) {
1004                         uint64_t *grid_dimension = ret;
1005                         grid_dimension[0] = 3;
1006                 }
1007                 return 1 * sizeof(uint64_t);
1008
1009         case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
1010                 if (ret) {
1011                         uint64_t *grid_size = ret;
1012                         grid_size[0] = 65535;
1013                         grid_size[1] = 65535;
1014                         grid_size[2] = 65535;
1015                 }
1016                 return 3 * sizeof(uint64_t) ;
1017
1018         case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
1019                 if (ret) {
1020                         uint64_t *block_size = ret;
1021                         unsigned threads_per_block = get_max_threads_per_block(rscreen, ir_type);
1022                         block_size[0] = threads_per_block;
1023                         block_size[1] = threads_per_block;
1024                         block_size[2] = threads_per_block;
1025                 }
1026                 return 3 * sizeof(uint64_t);
1027
1028         case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
1029                 if (ret) {
1030                         uint64_t *max_threads_per_block = ret;
1031                         *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type);
1032                 }
1033                 return sizeof(uint64_t);
1034         case PIPE_COMPUTE_CAP_ADDRESS_BITS:
1035                 if (ret) {
1036                         uint32_t *address_bits = ret;
1037                         address_bits[0] = 64;
1038                 }
1039                 return 1 * sizeof(uint32_t);
1040
1041         case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
1042                 if (ret) {
1043                         uint64_t *max_global_size = ret;
1044                         uint64_t max_mem_alloc_size;
1045
1046                         r600_get_compute_param(screen, ir_type,
1047                                 PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
1048                                 &max_mem_alloc_size);
1049
1050                         /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
1051                          * 1/4 of the MAX_GLOBAL_SIZE.  Since the
1052                          * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
1053                          * make sure we never report more than
1054                          * 4 * MAX_MEM_ALLOC_SIZE.
1055                          */
1056                         *max_global_size = MIN2(4 * max_mem_alloc_size,
1057                                                 MAX2(rscreen->info.gart_size,
1058                                                      rscreen->info.vram_size));
1059                 }
1060                 return sizeof(uint64_t);
1061
1062         case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
1063                 if (ret) {
1064                         uint64_t *max_local_size = ret;
1065                         /* Value reported by the closed source driver. */
1066                         *max_local_size = 32768;
1067                 }
1068                 return sizeof(uint64_t);
1069
1070         case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
1071                 if (ret) {
1072                         uint64_t *max_input_size = ret;
1073                         /* Value reported by the closed source driver. */
1074                         *max_input_size = 1024;
1075                 }
1076                 return sizeof(uint64_t);
1077
1078         case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
1079                 if (ret) {
1080                         uint64_t *max_mem_alloc_size = ret;
1081
1082                         *max_mem_alloc_size = rscreen->info.max_alloc_size;
1083                 }
1084                 return sizeof(uint64_t);
1085
1086         case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
1087                 if (ret) {
1088                         uint32_t *max_clock_frequency = ret;
1089                         *max_clock_frequency = rscreen->info.max_shader_clock;
1090                 }
1091                 return sizeof(uint32_t);
1092
1093         case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
1094                 if (ret) {
1095                         uint32_t *max_compute_units = ret;
1096                         *max_compute_units = rscreen->info.num_good_compute_units;
1097                 }
1098                 return sizeof(uint32_t);
1099
1100         case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
1101                 if (ret) {
1102                         uint32_t *images_supported = ret;
1103                         *images_supported = 0;
1104                 }
1105                 return sizeof(uint32_t);
1106         case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
1107                 break; /* unused */
1108         case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
1109                 if (ret) {
1110                         uint32_t *subgroup_size = ret;
1111                         *subgroup_size = 64;
1112                 }
1113                 return sizeof(uint32_t);
1114         case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
1115                 if (ret) {
1116                         uint64_t *max_variable_threads_per_block = ret;
1117                         if (ir_type == PIPE_SHADER_IR_TGSI)
1118                                 *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
1119                         else
1120                                 *max_variable_threads_per_block = 0;
1121                 }
1122                 return sizeof(uint64_t);
1123         }
1124
1125         fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
1126         return 0;
1127 }
1128
1129 static uint64_t r600_get_timestamp(struct pipe_screen *screen)
1130 {
1131         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
1132
1133         return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) /
1134                         rscreen->info.clock_crystal_freq;
1135 }
1136
1137 static void r600_fence_reference(struct pipe_screen *screen,
1138                                  struct pipe_fence_handle **dst,
1139                                  struct pipe_fence_handle *src)
1140 {
1141         struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
1142         struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst;
1143         struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src;
1144
1145         if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
1146                 ws->fence_reference(&(*rdst)->gfx, NULL);
1147                 ws->fence_reference(&(*rdst)->sdma, NULL);
1148                 FREE(*rdst);
1149         }
1150         *rdst = rsrc;
1151 }
1152
1153 static boolean r600_fence_finish(struct pipe_screen *screen,
1154                                  struct pipe_context *ctx,
1155                                  struct pipe_fence_handle *fence,
1156                                  uint64_t timeout)
1157 {
1158         struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
1159         struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
1160         struct r600_common_context *rctx;
1161         int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
1162
1163         ctx = threaded_context_unwrap_sync(ctx);
1164         rctx = ctx ? (struct r600_common_context*)ctx : NULL;
1165
1166         if (rfence->sdma) {
1167                 if (!rws->fence_wait(rws, rfence->sdma, timeout))
1168                         return false;
1169
1170                 /* Recompute the timeout after waiting. */
1171                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
1172                         int64_t time = os_time_get_nano();
1173                         timeout = abs_timeout > time ? abs_timeout - time : 0;
1174                 }
1175         }
1176
1177         if (!rfence->gfx)
1178                 return true;
1179
1180         /* Flush the gfx IB if it hasn't been flushed yet. */
1181         if (rctx &&
1182             rfence->gfx_unflushed.ctx == rctx &&
1183             rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) {
1184                 rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL);
1185                 rfence->gfx_unflushed.ctx = NULL;
1186
1187                 if (!timeout)
1188                         return false;
1189
1190                 /* Recompute the timeout after all that. */
1191                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
1192                         int64_t time = os_time_get_nano();
1193                         timeout = abs_timeout > time ? abs_timeout - time : 0;
1194                 }
1195         }
1196
1197         return rws->fence_wait(rws, rfence->gfx, timeout);
1198 }
1199
1200 static void r600_query_memory_info(struct pipe_screen *screen,
1201                                    struct pipe_memory_info *info)
1202 {
1203         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
1204         struct radeon_winsys *ws = rscreen->ws;
1205         unsigned vram_usage, gtt_usage;
1206
1207         info->total_device_memory = rscreen->info.vram_size / 1024;
1208         info->total_staging_memory = rscreen->info.gart_size / 1024;
1209
1210         /* The real TTM memory usage is somewhat random, because:
1211          *
1212          * 1) TTM delays freeing memory, because it can only free it after
1213          *    fences expire.
1214          *
1215          * 2) The memory usage can be really low if big VRAM evictions are
1216          *    taking place, but the real usage is well above the size of VRAM.
1217          *
1218          * Instead, return statistics of this process.
1219          */
1220         vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024;
1221         gtt_usage =  ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024;
1222
1223         info->avail_device_memory =
1224                 vram_usage <= info->total_device_memory ?
1225                                 info->total_device_memory - vram_usage : 0;
1226         info->avail_staging_memory =
1227                 gtt_usage <= info->total_staging_memory ?
1228                                 info->total_staging_memory - gtt_usage : 0;
1229
1230         info->device_memory_evicted =
1231                 ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
1232
1233         if (rscreen->info.drm_major == 3 && rscreen->info.drm_minor >= 4)
1234                 info->nr_device_memory_evictions =
1235                         ws->query_value(ws, RADEON_NUM_EVICTIONS);
1236         else
1237                 /* Just return the number of evicted 64KB pages. */
1238                 info->nr_device_memory_evictions = info->device_memory_evicted / 64;
1239 }
1240
1241 struct pipe_resource *si_resource_create_common(struct pipe_screen *screen,
1242                                                 const struct pipe_resource *templ)
1243 {
1244         if (templ->target == PIPE_BUFFER) {
1245                 return si_buffer_create(screen, templ, 256);
1246         } else {
1247                 return si_texture_create(screen, templ);
1248         }
1249 }
1250
/* Initialize the state shared by all screens of this driver.
 *
 * In order: queries the winsys for device info, builds the renderer string,
 * installs the common pipe_screen callbacks, parses the R600_DEBUG and
 * R600_TEX_ANISO options, creates the disk shader cache, the parent slab
 * pool for transfers and the screen mutexes, and finally prints the device
 * info when the "info" debug flag is set.
 *
 * Always returns true.
 */
bool si_common_screen_init(struct r600_common_screen *rscreen,
                           struct radeon_winsys *ws)
{
        char family_name[32] = {}, llvm_string[32] = {}, kernel_version[128] = {};
        struct utsname uname_data;
        const char *chip_name;

        /* Must come first: everything below reads rscreen->info. */
        ws->query_info(ws, &rscreen->info);
        rscreen->ws = ws;

        /* With a marketing name available, it becomes the primary name and
         * the family name is added as a "<family> / " prefix inside the
         * parentheses.  The "+ 4" skips the "AMD " prefix that every
         * r600_get_family_name() string starts with.  Otherwise the family
         * name itself is the primary name and family_name stays empty.
         */
        if ((chip_name = r600_get_marketing_name(ws)))
                snprintf(family_name, sizeof(family_name), "%s / ",
                         r600_get_family_name(rscreen) + 4);
        else
                chip_name = r600_get_family_name(rscreen);

        /* The kernel release is optional; it is left empty if uname() fails. */
        if (uname(&uname_data) == 0)
                snprintf(kernel_version, sizeof(kernel_version),
                         " / %s", uname_data.release);

        /* HAVE_LLVM is printed as major (bits 8-15) and minor (bits 0-7). */
        if (HAVE_LLVM > 0) {
                snprintf(llvm_string, sizeof(llvm_string),
                         ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
                         HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
        }

        /* The string handed out by r600_get_name(). */
        snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
                 "%s (%sDRM %i.%i.%i%s%s)",
                 chip_name, family_name, rscreen->info.drm_major,
                 rscreen->info.drm_minor, rscreen->info.drm_patchlevel,
                 kernel_version, llvm_string);

        /* Common pipe_screen callbacks. */
        rscreen->b.get_name = r600_get_name;
        rscreen->b.get_vendor = r600_get_vendor;
        rscreen->b.get_device_vendor = r600_get_device_vendor;
        rscreen->b.get_disk_shader_cache = r600_get_disk_shader_cache;
        rscreen->b.get_compute_param = r600_get_compute_param;
        rscreen->b.get_paramf = r600_get_paramf;
        rscreen->b.get_timestamp = r600_get_timestamp;
        rscreen->b.fence_finish = r600_fence_finish;
        rscreen->b.fence_reference = r600_fence_reference;
        rscreen->b.resource_destroy = u_resource_destroy_vtbl;
        rscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
        rscreen->b.query_memory_info = r600_query_memory_info;

        /* Video callbacks: hardware decode when the winsys reports it,
         * otherwise the generic vl-based fallbacks.
         */
        if (rscreen->info.has_hw_decode) {
                rscreen->b.get_video_param = si_vid_get_video_param;
                rscreen->b.is_video_format_supported = si_vid_is_format_supported;
        } else {
                rscreen->b.get_video_param = r600_get_video_param;
                rscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
        }

        si_init_screen_texture_functions(rscreen);
        si_init_screen_query_functions(rscreen);

        rscreen->family = rscreen->info.family;
        rscreen->chip_class = rscreen->info.chip_class;
        /* OR-ed in, so flags already set on the screen are kept. */
        rscreen->debug_flags |= debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
        rscreen->has_rbplus = false;
        rscreen->rbplus_allowed = false;

        /* Reads debug_flags and info.family, so it must follow the lines above. */
        r600_disk_cache_create(rscreen);

        slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64);

        /* R600_TEX_ANISO is capped at 16; the default of -1 stays negative
         * and therefore skips the message below.
         */
        rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
        if (rscreen->force_aniso >= 0) {
                printf("radeon: Forcing anisotropy filter to %ix\n",
                       /* round down to a power of two */
                       1 << util_logbase2(rscreen->force_aniso));
        }

        (void) mtx_init(&rscreen->aux_context_lock, mtx_plain);
        (void) mtx_init(&rscreen->gpu_load_mutex, mtx_plain);

        /* R600_DEBUG=info: dump the queried device info to stdout. */
        if (rscreen->debug_flags & DBG(INFO)) {
                printf("pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n",
                       rscreen->info.pci_domain, rscreen->info.pci_bus,
                       rscreen->info.pci_dev, rscreen->info.pci_func);
                printf("pci_id = 0x%x\n", rscreen->info.pci_id);
                printf("family = %i (%s)\n", rscreen->info.family,
                       r600_get_family_name(rscreen));
                printf("chip_class = %i\n", rscreen->info.chip_class);
                printf("pte_fragment_size = %u\n", rscreen->info.pte_fragment_size);
                printf("gart_page_size = %u\n", rscreen->info.gart_page_size);
                printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024));
                printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024));
                printf("vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_vis_size, 1024*1024));
                printf("max_alloc_size = %i MB\n",
                       (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024));
                printf("min_alloc_size = %u\n", rscreen->info.min_alloc_size);
                printf("has_dedicated_vram = %u\n", rscreen->info.has_dedicated_vram);
                printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory);
                printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2);
                printf("has_hw_decode = %u\n", rscreen->info.has_hw_decode);
                printf("num_sdma_rings = %i\n", rscreen->info.num_sdma_rings);
                printf("num_compute_rings = %u\n", rscreen->info.num_compute_rings);
                printf("uvd_fw_version = %u\n", rscreen->info.uvd_fw_version);
                printf("vce_fw_version = %u\n", rscreen->info.vce_fw_version);
                printf("me_fw_version = %i\n", rscreen->info.me_fw_version);
                printf("me_fw_feature = %i\n", rscreen->info.me_fw_feature);
                printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version);
                printf("pfp_fw_feature = %i\n", rscreen->info.pfp_fw_feature);
                printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version);
                printf("ce_fw_feature = %i\n", rscreen->info.ce_fw_feature);
                printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config);
                printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq);
                printf("tcc_cache_line_size = %u\n", rscreen->info.tcc_cache_line_size);
                printf("drm = %i.%i.%i\n", rscreen->info.drm_major,
                       rscreen->info.drm_minor, rscreen->info.drm_patchlevel);
                printf("has_userptr = %i\n", rscreen->info.has_userptr);
                printf("has_syncobj = %u\n", rscreen->info.has_syncobj);
                printf("has_sync_file = %u\n", rscreen->info.has_sync_file);

                printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes);
                printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock);
                printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units);
                printf("max_se = %i\n", rscreen->info.max_se);
                printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se);

                printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map);
                printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid);
                printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks);
                printf("num_render_backends = %i\n", rscreen->info.num_render_backends);
                printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
                printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
                printf("enabled_rb_mask = 0x%x\n", rscreen->info.enabled_rb_mask);
                printf("max_alignment = %u\n", (unsigned)rscreen->info.max_alignment);
        }
        return true;
}
1383
1384 void si_destroy_common_screen(struct r600_common_screen *rscreen)
1385 {
1386         si_perfcounters_destroy(rscreen);
1387         si_gpu_load_kill_thread(rscreen);
1388
1389         mtx_destroy(&rscreen->gpu_load_mutex);
1390         mtx_destroy(&rscreen->aux_context_lock);
1391         rscreen->aux_context->destroy(rscreen->aux_context);
1392
1393         slab_destroy_parent(&rscreen->pool_transfers);
1394
1395         disk_cache_destroy(rscreen->disk_shader_cache);
1396         rscreen->ws->destroy(rscreen->ws);
1397         FREE(rscreen);
1398 }
1399
1400 bool si_can_dump_shader(struct r600_common_screen *rscreen,
1401                         unsigned processor)
1402 {
1403         return rscreen->debug_flags & (1 << processor);
1404 }
1405
1406 bool si_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor)
1407 {
1408         return (rscreen->debug_flags & DBG(CHECK_IR)) ||
1409                si_can_dump_shader(rscreen, processor);
1410 }
1411
1412 void si_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
1413                             uint64_t offset, uint64_t size, unsigned value)
1414 {
1415         struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context;
1416
1417         mtx_lock(&rscreen->aux_context_lock);
1418         rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
1419         rscreen->aux_context->flush(rscreen->aux_context, NULL, 0);
1420         mtx_unlock(&rscreen->aux_context_lock);
1421 }