src/gallium/drivers/radeon/r600_pipe_common.c

   1 /*
   2  * Copyright 2013 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors: Marek Olšák <maraeo@gmail.com>
  24  *
  25  */
  26
  27 #include "r600_pipe_common.h"
  28 #include "r600_cs.h"
  29 #include "tgsi/tgsi_parse.h"
  30 #include "util/list.h"
  31 #include "util/u_draw_quad.h"
  32 #include "util/u_memory.h"
  33 #include "util/u_format_s3tc.h"
  34 #include "util/u_upload_mgr.h"
  35 #include "os/os_time.h"
  36 #include "vl/vl_decoder.h"
  37 #include "vl/vl_video_buffer.h"
  38 #include "radeon/radeon_video.h"
  39 #include <inttypes.h>
  40 #include <sys/utsname.h>
  41
  42 #include <llvm-c/TargetMachine.h>
  43
  44
/*
 * Fence object returned to the state tracker by r600_flush_from_st.
 * It keeps one fence per engine, because the GFX and SDMA engines can
 * signal out of order.
 */
struct r600_multi_fence {
	/* Reference count; r600_flush_from_st sets it to 1 on creation. */
	struct pipe_reference reference;
	/* Fence of the GFX ring (may be NULL). */
	struct pipe_fence_handle *gfx;
	/* Fence of the SDMA ring (may be NULL). */
	struct pipe_fence_handle *sdma;

	/* If the context wasn't flushed at fence creation, this is non-NULL. */
	struct {
		struct r600_common_context *ctx;
		/* Value of ctx->num_gfx_cs_flushes when the fence was created. */
		unsigned ib_index;
	} gfx_unflushed;
};
  56
  57 /*
  58  * shader binary helpers.
  59  */
  60 void si_radeon_shader_binary_init(struct ac_shader_binary *b)
  61 {
  62         memset(b, 0, sizeof(*b));
  63 }
  64
  65 void si_radeon_shader_binary_clean(struct ac_shader_binary *b)
  66 {
  67         if (!b)
  68                 return;
  69         FREE(b->code);
  70         FREE(b->config);
  71         FREE(b->rodata);
  72         FREE(b->global_symbol_offsets);
  73         FREE(b->relocs);
  74         FREE(b->disasm_string);
  75         FREE(b->llvm_ir_string);
  76 }
  77
  78 /*
  79  * pipe_context
  80  */
  81
  82 /**
  83  * Write an EOP event.
  84  *
  85  * \param event         EVENT_TYPE_*
  86  * \param event_flags   Optional cache flush flags (TC)
  87  * \param data_sel      1 = fence, 3 = timestamp
  88  * \param buf           Buffer
  89  * \param va            GPU address
  90  * \param old_value     Previous fence value (for a bug workaround)
  91  * \param new_value     Fence value to write for this event.
  92  */
  93 void si_gfx_write_event_eop(struct r600_common_context *ctx,
  94                             unsigned event, unsigned event_flags,
  95                             unsigned data_sel,
  96                             struct r600_resource *buf, uint64_t va,
  97                             uint32_t new_fence, unsigned query_type)
  98 {
  99         struct radeon_winsys_cs *cs = ctx->gfx.cs;
 100         unsigned op = EVENT_TYPE(event) |
 101                       EVENT_INDEX(5) |
 102                       event_flags;
 103         unsigned sel = EOP_DATA_SEL(data_sel);
 104
 105         /* Wait for write confirmation before writing data, but don't send
 106          * an interrupt. */
 107         if (data_sel != EOP_DATA_SEL_DISCARD)
 108                 sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);
 109
 110         if (ctx->chip_class >= GFX9) {
 111                 /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
 112                  * counters) must immediately precede every timestamp event to
 113                  * prevent a GPU hang on GFX9.
 114                  *
 115                  * Occlusion queries don't need to do it here, because they
 116                  * always do ZPASS_DONE before the timestamp.
 117                  */
 118                 if (ctx->chip_class == GFX9 &&
 119                     query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
 120                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
 121                     query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
 122                         struct r600_resource *scratch = ctx->eop_bug_scratch;
 123
 124                         assert(16 * ctx->screen->info.num_render_backends <=
 125                                scratch->b.b.width0);
 126                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 127                         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
 128                         radeon_emit(cs, scratch->gpu_address);
 129                         radeon_emit(cs, scratch->gpu_address >> 32);
 130
 131                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
 132                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 133                 }
 134
 135                 radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
 136                 radeon_emit(cs, op);
 137                 radeon_emit(cs, sel);
 138                 radeon_emit(cs, va);            /* address lo */
 139                 radeon_emit(cs, va >> 32);      /* address hi */
 140                 radeon_emit(cs, new_fence);     /* immediate data lo */
 141                 radeon_emit(cs, 0); /* immediate data hi */
 142                 radeon_emit(cs, 0); /* unused */
 143         } else {
 144                 if (ctx->chip_class == CIK ||
 145                     ctx->chip_class == VI) {
 146                         struct r600_resource *scratch = ctx->eop_bug_scratch;
 147                         uint64_t va = scratch->gpu_address;
 148
 149                         /* Two EOP events are required to make all engines go idle
 150                          * (and optional cache flushes executed) before the timestamp
 151                          * is written.
 152                          */
 153                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 154                         radeon_emit(cs, op);
 155                         radeon_emit(cs, va);
 156                         radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 157                         radeon_emit(cs, 0); /* immediate data */
 158                         radeon_emit(cs, 0); /* unused */
 159
 160                         radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
 161                                                   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 162                 }
 163
 164                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 165                 radeon_emit(cs, op);
 166                 radeon_emit(cs, va);
 167                 radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 168                 radeon_emit(cs, new_fence); /* immediate data */
 169                 radeon_emit(cs, 0); /* unused */
 170         }
 171
 172         if (buf)
 173                 r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
 174                                 RADEON_PRIO_QUERY);
 175 }
 176
 177 unsigned si_gfx_write_fence_dwords(struct r600_common_screen *screen)
 178 {
 179         unsigned dwords = 6;
 180
 181         if (screen->chip_class == CIK ||
 182             screen->chip_class == VI)
 183                 dwords *= 2;
 184
 185         if (!screen->info.has_virtual_memory)
 186                 dwords += 2;
 187
 188         return dwords;
 189 }
 190
 191 void si_gfx_wait_fence(struct r600_common_context *ctx,
 192                        uint64_t va, uint32_t ref, uint32_t mask)
 193 {
 194         struct radeon_winsys_cs *cs = ctx->gfx.cs;
 195
 196         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 197         radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
 198         radeon_emit(cs, va);
 199         radeon_emit(cs, va >> 32);
 200         radeon_emit(cs, ref); /* reference value */
 201         radeon_emit(cs, mask); /* mask */
 202         radeon_emit(cs, 4); /* poll interval */
 203 }
 204
 205 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 206 {
 207         struct radeon_winsys_cs *cs = rctx->dma.cs;
 208
 209         /* NOP waits for idle on Evergreen and later. */
 210         if (rctx->chip_class >= CIK)
 211                 radeon_emit(cs, 0x00000000); /* NOP */
 212         else
 213                 radeon_emit(cs, 0xf0000000); /* NOP */
 214 }
 215
 216 void si_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 217                        struct r600_resource *dst, struct r600_resource *src)
 218 {
 219         uint64_t vram = ctx->dma.cs->used_vram;
 220         uint64_t gtt = ctx->dma.cs->used_gart;
 221
 222         if (dst) {
 223                 vram += dst->vram_usage;
 224                 gtt += dst->gart_usage;
 225         }
 226         if (src) {
 227                 vram += src->vram_usage;
 228                 gtt += src->gart_usage;
 229         }
 230
 231         /* Flush the GFX IB if DMA depends on it. */
 232         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 233             ((dst &&
 234               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
 235                                                RADEON_USAGE_READWRITE)) ||
 236              (src &&
 237               ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
 238                                                RADEON_USAGE_WRITE))))
 239                 ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 240
 241         /* Flush if there's not enough space, or if the memory usage per IB
 242          * is too large.
 243          *
 244          * IBs using too little memory are limited by the IB submission overhead.
 245          * IBs using too much memory are limited by the kernel/TTM overhead.
 246          * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 247          *
 248          * This heuristic makes sure that DMA requests are executed
 249          * very soon after the call is made and lowers memory usage.
 250          * It improves texture upload performance by keeping the DMA
 251          * engine busy while uploads are being submitted.
 252          */
 253         num_dw++; /* for emit_wait_idle below */
 254         if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
 255             ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
 256             !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
 257                 ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 258                 assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
 259         }
 260
 261         /* Wait for idle if either buffer has been used in the IB before to
 262          * prevent read-after-write hazards.
 263          */
 264         if ((dst &&
 265              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
 266                                               RADEON_USAGE_READWRITE)) ||
 267             (src &&
 268              ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
 269                                               RADEON_USAGE_WRITE)))
 270                 r600_dma_emit_wait_idle(ctx);
 271
 272         /* If GPUVM is not supported, the CS checker needs 2 entries
 273          * in the buffer list per packet, which has to be done manually.
 274          */
 275         if (ctx->screen->info.has_virtual_memory) {
 276                 if (dst)
 277                         radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
 278                                                   RADEON_USAGE_WRITE,
 279                                                   RADEON_PRIO_SDMA_BUFFER);
 280                 if (src)
 281                         radeon_add_to_buffer_list(ctx, &ctx->dma, src,
 282                                                   RADEON_USAGE_READ,
 283                                                   RADEON_PRIO_SDMA_BUFFER);
 284         }
 285
 286         /* this function is called before all DMA calls, so increment this. */
 287         ctx->num_dma_calls++;
 288 }
 289
 290 static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 291 {
 292 }
 293
 294 void si_preflush_suspend_features(struct r600_common_context *ctx)
 295 {
 296         /* suspend queries */
 297         if (!LIST_IS_EMPTY(&ctx->active_queries))
 298                 si_suspend_queries(ctx);
 299
 300         ctx->streamout.suspended = false;
 301         if (ctx->streamout.begin_emitted) {
 302                 si_emit_streamout_end(ctx);
 303                 ctx->streamout.suspended = true;
 304         }
 305 }
 306
 307 void si_postflush_resume_features(struct r600_common_context *ctx)
 308 {
 309         if (ctx->streamout.suspended) {
 310                 ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
 311                 si_streamout_buffers_dirty(ctx);
 312         }
 313
 314         /* resume queries */
 315         if (!LIST_IS_EMPTY(&ctx->active_queries))
 316                 si_resume_queries(ctx);
 317 }
 318
 319 static void r600_add_fence_dependency(struct r600_common_context *rctx,
 320                                       struct pipe_fence_handle *fence)
 321 {
 322         struct radeon_winsys *ws = rctx->ws;
 323
 324         if (rctx->dma.cs)
 325                 ws->cs_add_fence_dependency(rctx->dma.cs, fence);
 326         ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
 327 }
 328
 329 static void r600_fence_server_sync(struct pipe_context *ctx,
 330                                    struct pipe_fence_handle *fence)
 331 {
 332         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 333         struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
 334
 335         /* Only amdgpu needs to handle fence dependencies (for fence imports).
 336          * radeon synchronizes all rings by default and will not implement
 337          * fence imports.
 338          */
 339         if (rctx->screen->info.drm_major == 2)
 340                 return;
 341
 342         /* Only imported fences need to be handled by fence_server_sync,
 343          * because the winsys handles synchronizations automatically for BOs
 344          * within the process.
 345          *
 346          * Simply skip unflushed fences here, and the winsys will drop no-op
 347          * dependencies (i.e. dependencies within the same ring).
 348          */
 349         if (rfence->gfx_unflushed.ctx)
 350                 return;
 351
 352         /* All unflushed commands will not start execution before
 353          * this fence dependency is signalled.
 354          *
 355          * Should we flush the context to allow more GPU parallelism?
 356          */
 357         if (rfence->sdma)
 358                 r600_add_fence_dependency(rctx, rfence->sdma);
 359         if (rfence->gfx)
 360                 r600_add_fence_dependency(rctx, rfence->gfx);
 361 }
 362
 363 static void r600_flush_from_st(struct pipe_context *ctx,
 364                                struct pipe_fence_handle **fence,
 365                                unsigned flags)
 366 {
 367         struct pipe_screen *screen = ctx->screen;
 368         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 369         struct radeon_winsys *ws = rctx->ws;
 370         struct pipe_fence_handle *gfx_fence = NULL;
 371         struct pipe_fence_handle *sdma_fence = NULL;
 372         bool deferred_fence = false;
 373         unsigned rflags = RADEON_FLUSH_ASYNC;
 374
 375         if (flags & PIPE_FLUSH_END_OF_FRAME)
 376                 rflags |= RADEON_FLUSH_END_OF_FRAME;
 377
 378         /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
 379         if (rctx->dma.cs)
 380                 rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
 381
 382         if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
 383                 if (fence)
 384                         ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
 385                 if (!(flags & PIPE_FLUSH_DEFERRED))
 386                         ws->cs_sync_flush(rctx->gfx.cs);
 387         } else {
 388                 /* Instead of flushing, create a deferred fence. Constraints:
 389                  * - The state tracker must allow a deferred flush.
 390                  * - The state tracker must request a fence.
 391                  * Thread safety in fence_finish must be ensured by the state tracker.
 392                  */
 393                 if (flags & PIPE_FLUSH_DEFERRED && fence) {
 394                         gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
 395                         deferred_fence = true;
 396                 } else {
 397                         rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
 398                 }
 399         }
 400
 401         /* Both engines can signal out of order, so we need to keep both fences. */
 402         if (fence) {
 403                 struct r600_multi_fence *multi_fence =
 404                         CALLOC_STRUCT(r600_multi_fence);
 405                 if (!multi_fence) {
 406                         ws->fence_reference(&sdma_fence, NULL);
 407                         ws->fence_reference(&gfx_fence, NULL);
 408                         goto finish;
 409                 }
 410
 411                 multi_fence->reference.count = 1;
 412                 /* If both fences are NULL, fence_finish will always return true. */
 413                 multi_fence->gfx = gfx_fence;
 414                 multi_fence->sdma = sdma_fence;
 415
 416                 if (deferred_fence) {
 417                         multi_fence->gfx_unflushed.ctx = rctx;
 418                         multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
 419                 }
 420
 421                 screen->fence_reference(screen, fence, NULL);
 422                 *fence = (struct pipe_fence_handle*)multi_fence;
 423         }
 424 finish:
 425         if (!(flags & PIPE_FLUSH_DEFERRED)) {
 426                 if (rctx->dma.cs)
 427                         ws->cs_sync_flush(rctx->dma.cs);
 428                 ws->cs_sync_flush(rctx->gfx.cs);
 429         }
 430 }
 431
 432 static void r600_flush_dma_ring(void *ctx, unsigned flags,
 433                                 struct pipe_fence_handle **fence)
 434 {
 435         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 436         struct radeon_winsys_cs *cs = rctx->dma.cs;
 437         struct radeon_saved_cs saved;
 438         bool check_vm =
 439                 (rctx->screen->debug_flags & DBG(CHECK_VM)) &&
 440                 rctx->check_vm_faults;
 441
 442         if (!radeon_emitted(cs, 0)) {
 443                 if (fence)
 444                         rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 445                 return;
 446         }
 447
 448         if (check_vm)
 449                 si_save_cs(rctx->ws, cs, &saved, true);
 450
 451         rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
 452         if (fence)
 453                 rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 454
 455         if (check_vm) {
 456                 /* Use conservative timeout 800ms, after which we won't wait any
 457                  * longer and assume the GPU is hung.
 458                  */
 459                 rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);
 460
 461                 rctx->check_vm_faults(rctx, &saved, RING_DMA);
 462                 si_clear_saved_cs(&saved);
 463         }
 464 }
 465
 466 /**
 467  * Store a linearized copy of all chunks of \p cs together with the buffer
 468  * list in \p saved.
 469  */
 470 void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
 471                 struct radeon_saved_cs *saved, bool get_buffer_list)
 472 {
 473         uint32_t *buf;
 474         unsigned i;
 475
 476         /* Save the IB chunks. */
 477         saved->num_dw = cs->prev_dw + cs->current.cdw;
 478         saved->ib = MALLOC(4 * saved->num_dw);
 479         if (!saved->ib)
 480                 goto oom;
 481
 482         buf = saved->ib;
 483         for (i = 0; i < cs->num_prev; ++i) {
 484                 memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
 485                 buf += cs->prev[i].cdw;
 486         }
 487         memcpy(buf, cs->current.buf, cs->current.cdw * 4);
 488
 489         if (!get_buffer_list)
 490                 return;
 491
 492         /* Save the buffer list. */
 493         saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
 494         saved->bo_list = CALLOC(saved->bo_count,
 495                                 sizeof(saved->bo_list[0]));
 496         if (!saved->bo_list) {
 497                 FREE(saved->ib);
 498                 goto oom;
 499         }
 500         ws->cs_get_buffer_list(cs, saved->bo_list);
 501
 502         return;
 503
 504 oom:
 505         fprintf(stderr, "%s: out of memory\n", __func__);
 506         memset(saved, 0, sizeof(*saved));
 507 }
 508
 509 void si_clear_saved_cs(struct radeon_saved_cs *saved)
 510 {
 511         FREE(saved->ib);
 512         FREE(saved->bo_list);
 513
 514         memset(saved, 0, sizeof(*saved));
 515 }
 516
 517 static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 518 {
 519         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 520         unsigned latest = rctx->ws->query_value(rctx->ws,
 521                                                 RADEON_GPU_RESET_COUNTER);
 522
 523         if (rctx->gpu_reset_counter == latest)
 524                 return PIPE_NO_RESET;
 525
 526         rctx->gpu_reset_counter = latest;
 527         return PIPE_UNKNOWN_CONTEXT_RESET;
 528 }
 529
 530 static void r600_set_debug_callback(struct pipe_context *ctx,
 531                                     const struct pipe_debug_callback *cb)
 532 {
 533         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 534
 535         if (cb)
 536                 rctx->debug = *cb;
 537         else
 538                 memset(&rctx->debug, 0, sizeof(rctx->debug));
 539 }
 540
 541 static void r600_set_device_reset_callback(struct pipe_context *ctx,
 542                                            const struct pipe_device_reset_callback *cb)
 543 {
 544         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 545
 546         if (cb)
 547                 rctx->device_reset_callback = *cb;
 548         else
 549                 memset(&rctx->device_reset_callback, 0,
 550                        sizeof(rctx->device_reset_callback));
 551 }
 552
 553 bool si_check_device_reset(struct r600_common_context *rctx)
 554 {
 555         enum pipe_reset_status status;
 556
 557         if (!rctx->device_reset_callback.reset)
 558                 return false;
 559
 560         if (!rctx->b.get_device_reset_status)
 561                 return false;
 562
 563         status = rctx->b.get_device_reset_status(&rctx->b);
 564         if (status == PIPE_NO_RESET)
 565                 return false;
 566
 567         rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
 568         return true;
 569 }
 570
 571 static void r600_dma_clear_buffer_fallback(struct pipe_context *ctx,
 572                                            struct pipe_resource *dst,
 573                                            uint64_t offset, uint64_t size,
 574                                            unsigned value)
 575 {
 576         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 577
 578         rctx->clear_buffer(ctx, dst, offset, size, value, R600_COHERENCY_NONE);
 579 }
 580
 581 static bool r600_resource_commit(struct pipe_context *pctx,
 582                                  struct pipe_resource *resource,
 583                                  unsigned level, struct pipe_box *box,
 584                                  bool commit)
 585 {
 586         struct r600_common_context *ctx = (struct r600_common_context *)pctx;
 587         struct r600_resource *res = r600_resource(resource);
 588
 589         /*
 590          * Since buffer commitment changes cannot be pipelined, we need to
 591          * (a) flush any pending commands that refer to the buffer we're about
 592          *     to change, and
 593          * (b) wait for threaded submit to finish, including those that were
 594          *     triggered by some other, earlier operation.
 595          */
 596         if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 597             ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
 598                                              res->buf, RADEON_USAGE_READWRITE)) {
 599                 ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 600         }
 601         if (radeon_emitted(ctx->dma.cs, 0) &&
 602             ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
 603                                              res->buf, RADEON_USAGE_READWRITE)) {
 604                 ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 605         }
 606
 607         ctx->ws->cs_sync_flush(ctx->dma.cs);
 608         ctx->ws->cs_sync_flush(ctx->gfx.cs);
 609
 610         assert(resource->target == PIPE_BUFFER);
 611
 612         return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
 613 }
 614
 615 bool si_common_context_init(struct r600_common_context *rctx,
 616                             struct r600_common_screen *rscreen,
 617                             unsigned context_flags)
 618 {
 619         slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers);
 620         slab_create_child(&rctx->pool_transfers_unsync, &rscreen->pool_transfers);
 621
 622         rctx->screen = rscreen;
 623         rctx->ws = rscreen->ws;
 624         rctx->family = rscreen->family;
 625         rctx->chip_class = rscreen->chip_class;
 626
 627         rctx->b.invalidate_resource = si_invalidate_resource;
 628         rctx->b.resource_commit = r600_resource_commit;
 629         rctx->b.transfer_map = u_transfer_map_vtbl;
 630         rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
 631         rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
 632         rctx->b.texture_subdata = u_default_texture_subdata;
 633         rctx->b.memory_barrier = r600_memory_barrier;
 634         rctx->b.flush = r600_flush_from_st;
 635         rctx->b.set_debug_callback = r600_set_debug_callback;
 636         rctx->b.fence_server_sync = r600_fence_server_sync;
 637         rctx->dma_clear_buffer = r600_dma_clear_buffer_fallback;
 638         rctx->b.buffer_subdata = si_buffer_subdata;
 639
 640         if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 641                 rctx->b.get_device_reset_status = r600_get_reset_status;
 642                 rctx->gpu_reset_counter =
 643                         rctx->ws->query_value(rctx->ws,
 644                                               RADEON_GPU_RESET_COUNTER);
 645         }
 646
 647         rctx->b.set_device_reset_callback = r600_set_device_reset_callback;
 648
 649         si_init_context_texture_functions(rctx);
 650         si_streamout_init(rctx);
 651         si_init_query_functions(rctx);
 652         si_init_msaa(&rctx->b);
 653
 654         if (rctx->chip_class == CIK ||
 655             rctx->chip_class == VI ||
 656             rctx->chip_class == GFX9) {
 657                 rctx->eop_bug_scratch = (struct r600_resource*)
 658                         pipe_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
 659                                            16 * rscreen->info.num_render_backends);
 660                 if (!rctx->eop_bug_scratch)
 661                         return false;
 662         }
 663
 664         rctx->allocator_zeroed_memory =
 665                 u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
 666                                       0, PIPE_USAGE_DEFAULT, 0, true);
 667         if (!rctx->allocator_zeroed_memory)
 668                 return false;
 669
 670         rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
 671                                                   0, PIPE_USAGE_STREAM);
 672         if (!rctx->b.stream_uploader)
 673                 return false;
 674
 675         rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
 676                                                  0, PIPE_USAGE_DEFAULT);
 677         if (!rctx->b.const_uploader)
 678                 return false;
 679
 680         rctx->ctx = rctx->ws->ctx_create(rctx->ws);
 681         if (!rctx->ctx)
 682                 return false;
 683
 684         if (rscreen->info.num_sdma_rings && !(rscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
 685                 rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 686                                                    r600_flush_dma_ring,
 687                                                    rctx);
 688                 rctx->dma.flush = r600_flush_dma_ring;
 689         }
 690
 691         return true;
 692 }
 693
 694 void si_common_context_cleanup(struct r600_common_context *rctx)
 695 {
 696         unsigned i,j;
 697
 698         /* Release DCC stats. */
 699         for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
 700                 assert(!rctx->dcc_stats[i].query_active);
 701
 702                 for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
 703                         if (rctx->dcc_stats[i].ps_stats[j])
 704                                 rctx->b.destroy_query(&rctx->b,
 705                                                       rctx->dcc_stats[i].ps_stats[j]);
 706
 707                 r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
 708         }
 709
 710         if (rctx->query_result_shader)
 711                 rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);
 712
 713         if (rctx->gfx.cs)
 714                 rctx->ws->cs_destroy(rctx->gfx.cs);
 715         if (rctx->dma.cs)
 716                 rctx->ws->cs_destroy(rctx->dma.cs);
 717         if (rctx->ctx)
 718                 rctx->ws->ctx_destroy(rctx->ctx);
 719
 720         if (rctx->b.stream_uploader)
 721                 u_upload_destroy(rctx->b.stream_uploader);
 722         if (rctx->b.const_uploader)
 723                 u_upload_destroy(rctx->b.const_uploader);
 724
 725         slab_destroy_child(&rctx->pool_transfers);
 726         slab_destroy_child(&rctx->pool_transfers_unsync);
 727
 728         if (rctx->allocator_zeroed_memory) {
 729                 u_suballocator_destroy(rctx->allocator_zeroed_memory);
 730         }
 731         rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
 732         rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 733         r600_resource_reference(&rctx->eop_bug_scratch, NULL);
 734 }
 735
 736 /*
 737  * pipe_screen
 738  */
 739
 740 static const struct debug_named_value common_debug_options[] = {
 741         /* logging */
 742         { "tex", DBG(TEX), "Print texture info" },
 743         { "nir", DBG(NIR), "Enable experimental NIR shaders" },
 744         { "compute", DBG(COMPUTE), "Print compute info" },
 745         { "vm", DBG(VM), "Print virtual addresses when creating resources" },
 746         { "info", DBG(INFO), "Print driver information" },
 747
 748         /* shaders */
 749         { "vs", DBG(VS), "Print vertex shaders" },
 750         { "gs", DBG(GS), "Print geometry shaders" },
 751         { "ps", DBG(PS), "Print pixel shaders" },
 752         { "cs", DBG(CS), "Print compute shaders" },
 753         { "tcs", DBG(TCS), "Print tessellation control shaders" },
 754         { "tes", DBG(TES), "Print tessellation evaluation shaders" },
 755         { "noir", DBG(NO_IR), "Don't print the LLVM IR"},
 756         { "notgsi", DBG(NO_TGSI), "Don't print the TGSI"},
 757         { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
 758         { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" },
 759         { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
 760         { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." },
 761
 762         { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." },
 763         { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." },
 764         { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." },
 765         { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." },
 766
 767         /* features */
 768         { "nodma", DBG(NO_ASYNC_DMA), "Disable asynchronous DMA" },
 769         { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" },
 770         /* GL uses the word INVALIDATE, gallium uses the word DISCARD */
 771         { "noinvalrange", DBG(NO_DISCARD_RANGE), "Disable handling of INVALIDATE_RANGE map flags" },
 772         { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" },
 773         { "notiling", DBG(NO_TILING), "Disable tiling" },
 774         { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
 775         { "forcedma", DBG(FORCE_DMA), "Use asynchronous DMA for all operations when possible." },
 776         { "precompile", DBG(PRECOMPILE), "Compile one shader variant at shader creation." },
 777         { "nowc", DBG(NO_WC), "Disable GTT write combining" },
 778         { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." },
 779         { "nodcc", DBG(NO_DCC), "Disable DCC." },
 780         { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
 781         { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." },
 782         { "sisched", DBG(SI_SCHED), "Enable LLVM SI Machine Instruction Scheduler." },
 783         { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" },
 784         { "unsafemath", DBG(UNSAFE_MATH), "Enable unsafe math shader optimizations" },
 785         { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" },
 786         { "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
 787         { "nodfsm", DBG(NO_DFSM), "Disable DFSM." },
 788         { "dpbb", DBG(DPBB), "Enable DPBB." },
 789         { "dfsm", DBG(DFSM), "Enable DFSM." },
 790         { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
 791
 792         DEBUG_NAMED_VALUE_END /* must be last */
 793 };
 794
 795 static const char* r600_get_vendor(struct pipe_screen* pscreen)
 796 {
 797         return "X.Org";
 798 }
 799
 800 static const char* r600_get_device_vendor(struct pipe_screen* pscreen)
 801 {
 802         return "AMD";
 803 }
 804
 805 static const char *r600_get_marketing_name(struct radeon_winsys *ws)
 806 {
 807         if (!ws->get_chip_name)
 808                 return NULL;
 809         return ws->get_chip_name(ws);
 810 }
 811
 812 static const char *r600_get_family_name(const struct r600_common_screen *rscreen)
 813 {
 814         switch (rscreen->info.family) {
 815         case CHIP_TAHITI: return "AMD TAHITI";
 816         case CHIP_PITCAIRN: return "AMD PITCAIRN";
 817         case CHIP_VERDE: return "AMD CAPE VERDE";
 818         case CHIP_OLAND: return "AMD OLAND";
 819         case CHIP_HAINAN: return "AMD HAINAN";
 820         case CHIP_BONAIRE: return "AMD BONAIRE";
 821         case CHIP_KAVERI: return "AMD KAVERI";
 822         case CHIP_KABINI: return "AMD KABINI";
 823         case CHIP_HAWAII: return "AMD HAWAII";
 824         case CHIP_MULLINS: return "AMD MULLINS";
 825         case CHIP_TONGA: return "AMD TONGA";
 826         case CHIP_ICELAND: return "AMD ICELAND";
 827         case CHIP_CARRIZO: return "AMD CARRIZO";
 828         case CHIP_FIJI: return "AMD FIJI";
 829         case CHIP_POLARIS10: return "AMD POLARIS10";
 830         case CHIP_POLARIS11: return "AMD POLARIS11";
 831         case CHIP_POLARIS12: return "AMD POLARIS12";
 832         case CHIP_STONEY: return "AMD STONEY";
 833         case CHIP_VEGA10: return "AMD VEGA10";
 834         case CHIP_RAVEN: return "AMD RAVEN";
 835         default: return "AMD unknown";
 836         }
 837 }
 838
 839 static void r600_disk_cache_create(struct r600_common_screen *rscreen)
 840 {
 841         /* Don't use the cache if shader dumping is enabled. */
 842         if (rscreen->debug_flags & DBG_ALL_SHADERS)
 843                 return;
 844
 845         uint32_t mesa_timestamp;
 846         if (disk_cache_get_function_timestamp(r600_disk_cache_create,
 847                                               &mesa_timestamp)) {
 848                 char *timestamp_str;
 849                 int res = -1;
 850                 uint32_t llvm_timestamp;
 851
 852                 if (disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo,
 853                                                       &llvm_timestamp)) {
 854                         res = asprintf(&timestamp_str, "%u_%u",
 855                                        mesa_timestamp, llvm_timestamp);
 856                 }
 857
 858                 if (res != -1) {
 859                         /* These flags affect shader compilation. */
 860                         uint64_t shader_debug_flags =
 861                                 rscreen->debug_flags &
 862                                 (DBG(FS_CORRECT_DERIVS_AFTER_KILL) |
 863                                  DBG(SI_SCHED) |
 864                                  DBG(UNSAFE_MATH));
 865
 866                         rscreen->disk_shader_cache =
 867                                 disk_cache_create(r600_get_family_name(rscreen),
 868                                                   timestamp_str,
 869                                                   shader_debug_flags);
 870                         free(timestamp_str);
 871                 }
 872         }
 873 }
 874
 875 static struct disk_cache *r600_get_disk_shader_cache(struct pipe_screen *pscreen)
 876 {
 877         struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
 878         return rscreen->disk_shader_cache;
 879 }
 880
 881 static const char* r600_get_name(struct pipe_screen* pscreen)
 882 {
 883         struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
 884
 885         return rscreen->renderer_string;
 886 }
 887
 888 static float r600_get_paramf(struct pipe_screen* pscreen,
 889                              enum pipe_capf param)
 890 {
 891         switch (param) {
 892         case PIPE_CAPF_MAX_LINE_WIDTH:
 893         case PIPE_CAPF_MAX_LINE_WIDTH_AA:
 894         case PIPE_CAPF_MAX_POINT_WIDTH:
 895         case PIPE_CAPF_MAX_POINT_WIDTH_AA:
 896                 return 8192.0f;
 897         case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
 898                 return 16.0f;
 899         case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
 900                 return 16.0f;
 901         case PIPE_CAPF_GUARD_BAND_LEFT:
 902         case PIPE_CAPF_GUARD_BAND_TOP:
 903         case PIPE_CAPF_GUARD_BAND_RIGHT:
 904         case PIPE_CAPF_GUARD_BAND_BOTTOM:
 905                 return 0.0f;
 906         }
 907         return 0.0f;
 908 }
 909
 910 static int r600_get_video_param(struct pipe_screen *screen,
 911                                 enum pipe_video_profile profile,
 912                                 enum pipe_video_entrypoint entrypoint,
 913                                 enum pipe_video_cap param)
 914 {
 915         switch (param) {
 916         case PIPE_VIDEO_CAP_SUPPORTED:
 917                 return vl_profile_supported(screen, profile, entrypoint);
 918         case PIPE_VIDEO_CAP_NPOT_TEXTURES:
 919                 return 1;
 920         case PIPE_VIDEO_CAP_MAX_WIDTH:
 921         case PIPE_VIDEO_CAP_MAX_HEIGHT:
 922                 return vl_video_buffer_max_size(screen);
 923         case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 924                 return PIPE_FORMAT_NV12;
 925         case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
 926                 return false;
 927         case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
 928                 return false;
 929         case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
 930                 return true;
 931         case PIPE_VIDEO_CAP_MAX_LEVEL:
 932                 return vl_level_supported(screen, profile);
 933         default:
 934                 return 0;
 935         }
 936 }
 937
 938 const char *si_get_llvm_processor_name(enum radeon_family family)
 939 {
 940         switch (family) {
 941         case CHIP_TAHITI: return "tahiti";
 942         case CHIP_PITCAIRN: return "pitcairn";
 943         case CHIP_VERDE: return "verde";
 944         case CHIP_OLAND: return "oland";
 945         case CHIP_HAINAN: return "hainan";
 946         case CHIP_BONAIRE: return "bonaire";
 947         case CHIP_KABINI: return "kabini";
 948         case CHIP_KAVERI: return "kaveri";
 949         case CHIP_HAWAII: return "hawaii";
 950         case CHIP_MULLINS:
 951                 return "mullins";
 952         case CHIP_TONGA: return "tonga";
 953         case CHIP_ICELAND: return "iceland";
 954         case CHIP_CARRIZO: return "carrizo";
 955         case CHIP_FIJI:
 956                 return "fiji";
 957         case CHIP_STONEY:
 958                 return "stoney";
 959         case CHIP_POLARIS10:
 960                 return "polaris10";
 961         case CHIP_POLARIS11:
 962         case CHIP_POLARIS12: /* same as polaris11 */
 963                 return "polaris11";
 964         case CHIP_VEGA10:
 965         case CHIP_RAVEN:
 966                 return "gfx900";
 967         default:
 968                 return "";
 969         }
 970 }
 971
 972 static unsigned get_max_threads_per_block(struct r600_common_screen *screen,
 973                                           enum pipe_shader_ir ir_type)
 974 {
 975         if (ir_type != PIPE_SHADER_IR_TGSI)
 976                 return 256;
 977
 978         /* Only 16 waves per thread-group on gfx9. */
 979         if (screen->chip_class >= GFX9)
 980                 return 1024;
 981
 982         /* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice
 983          * round number.
 984          */
 985         return 2048;
 986 }
 987
 988 static int r600_get_compute_param(struct pipe_screen *screen,
 989         enum pipe_shader_ir ir_type,
 990         enum pipe_compute_cap param,
 991         void *ret)
 992 {
 993         struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
 994
 995         //TODO: select these params by asic
 996         switch (param) {
 997         case PIPE_COMPUTE_CAP_IR_TARGET: {
 998                 const char *gpu;
 999                 const char *triple;
1000
1001                 if (HAVE_LLVM < 0x0400)
1002                         triple = "amdgcn--";
1003                 else
1004                         triple = "amdgcn-mesa-mesa3d";
1005
1006                 gpu = si_get_llvm_processor_name(rscreen->family);
1007                 if (ret) {
1008                         sprintf(ret, "%s-%s", gpu, triple);
1009                 }
1010                 /* +2 for dash and terminating NIL byte */
1011                 return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
1012         }
1013         case PIPE_COMPUTE_CAP_GRID_DIMENSION:
1014                 if (ret) {
1015                         uint64_t *grid_dimension = ret;
1016                         grid_dimension[0] = 3;
1017                 }
1018                 return 1 * sizeof(uint64_t);
1019
1020         case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
1021                 if (ret) {
1022                         uint64_t *grid_size = ret;
1023                         grid_size[0] = 65535;
1024                         grid_size[1] = 65535;
1025                         grid_size[2] = 65535;
1026                 }
1027                 return 3 * sizeof(uint64_t) ;
1028
1029         case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
1030                 if (ret) {
1031                         uint64_t *block_size = ret;
1032                         unsigned threads_per_block = get_max_threads_per_block(rscreen, ir_type);
1033                         block_size[0] = threads_per_block;
1034                         block_size[1] = threads_per_block;
1035                         block_size[2] = threads_per_block;
1036                 }
1037                 return 3 * sizeof(uint64_t);
1038
1039         case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
1040                 if (ret) {
1041                         uint64_t *max_threads_per_block = ret;
1042                         *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type);
1043                 }
1044                 return sizeof(uint64_t);
1045         case PIPE_COMPUTE_CAP_ADDRESS_BITS:
1046                 if (ret) {
1047                         uint32_t *address_bits = ret;
1048                         address_bits[0] = 64;
1049                 }
1050                 return 1 * sizeof(uint32_t);
1051
1052         case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
1053                 if (ret) {
1054                         uint64_t *max_global_size = ret;
1055                         uint64_t max_mem_alloc_size;
1056
1057                         r600_get_compute_param(screen, ir_type,
1058                                 PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
1059                                 &max_mem_alloc_size);
1060
1061                         /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
1062                          * 1/4 of the MAX_GLOBAL_SIZE.  Since the
1063                          * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
1064                          * make sure we never report more than
1065                          * 4 * MAX_MEM_ALLOC_SIZE.
1066                          */
1067                         *max_global_size = MIN2(4 * max_mem_alloc_size,
1068                                                 MAX2(rscreen->info.gart_size,
1069                                                      rscreen->info.vram_size));
1070                 }
1071                 return sizeof(uint64_t);
1072
1073         case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
1074                 if (ret) {
1075                         uint64_t *max_local_size = ret;
1076                         /* Value reported by the closed source driver. */
1077                         *max_local_size = 32768;
1078                 }
1079                 return sizeof(uint64_t);
1080
1081         case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
1082                 if (ret) {
1083                         uint64_t *max_input_size = ret;
1084                         /* Value reported by the closed source driver. */
1085                         *max_input_size = 1024;
1086                 }
1087                 return sizeof(uint64_t);
1088
1089         case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
1090                 if (ret) {
1091                         uint64_t *max_mem_alloc_size = ret;
1092
1093                         *max_mem_alloc_size = rscreen->info.max_alloc_size;
1094                 }
1095                 return sizeof(uint64_t);
1096
1097         case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
1098                 if (ret) {
1099                         uint32_t *max_clock_frequency = ret;
1100                         *max_clock_frequency = rscreen->info.max_shader_clock;
1101                 }
1102                 return sizeof(uint32_t);
1103
1104         case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
1105                 if (ret) {
1106                         uint32_t *max_compute_units = ret;
1107                         *max_compute_units = rscreen->info.num_good_compute_units;
1108                 }
1109                 return sizeof(uint32_t);
1110
1111         case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
1112                 if (ret) {
1113                         uint32_t *images_supported = ret;
1114                         *images_supported = 0;
1115                 }
1116                 return sizeof(uint32_t);
1117         case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
1118                 break; /* unused */
1119         case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
1120                 if (ret) {
1121                         uint32_t *subgroup_size = ret;
1122                         *subgroup_size = 64;
1123                 }
1124                 return sizeof(uint32_t);
1125         case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
1126                 if (ret) {
1127                         uint64_t *max_variable_threads_per_block = ret;
1128                         if (ir_type == PIPE_SHADER_IR_TGSI)
1129                                 *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
1130                         else
1131                                 *max_variable_threads_per_block = 0;
1132                 }
1133                 return sizeof(uint64_t);
1134         }
1135
1136         fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
1137         return 0;
1138 }
1139
1140 static uint64_t r600_get_timestamp(struct pipe_screen *screen)
1141 {
1142         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
1143
1144         return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) /
1145                         rscreen->info.clock_crystal_freq;
1146 }
1147
1148 static void r600_fence_reference(struct pipe_screen *screen,
1149                                  struct pipe_fence_handle **dst,
1150                                  struct pipe_fence_handle *src)
1151 {
1152         struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
1153         struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst;
1154         struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src;
1155
1156         if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
1157                 ws->fence_reference(&(*rdst)->gfx, NULL);
1158                 ws->fence_reference(&(*rdst)->sdma, NULL);
1159                 FREE(*rdst);
1160         }
1161         *rdst = rsrc;
1162 }
1163
1164 static boolean r600_fence_finish(struct pipe_screen *screen,
1165                                  struct pipe_context *ctx,
1166                                  struct pipe_fence_handle *fence,
1167                                  uint64_t timeout)
1168 {
1169         struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
1170         struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
1171         struct r600_common_context *rctx;
1172         int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
1173
1174         ctx = threaded_context_unwrap_sync(ctx);
1175         rctx = ctx ? (struct r600_common_context*)ctx : NULL;
1176
1177         if (rfence->sdma) {
1178                 if (!rws->fence_wait(rws, rfence->sdma, timeout))
1179                         return false;
1180
1181                 /* Recompute the timeout after waiting. */
1182                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
1183                         int64_t time = os_time_get_nano();
1184                         timeout = abs_timeout > time ? abs_timeout - time : 0;
1185                 }
1186         }
1187
1188         if (!rfence->gfx)
1189                 return true;
1190
1191         /* Flush the gfx IB if it hasn't been flushed yet. */
1192         if (rctx &&
1193             rfence->gfx_unflushed.ctx == rctx &&
1194             rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) {
1195                 rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL);
1196                 rfence->gfx_unflushed.ctx = NULL;
1197
1198                 if (!timeout)
1199                         return false;
1200
1201                 /* Recompute the timeout after all that. */
1202                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
1203                         int64_t time = os_time_get_nano();
1204                         timeout = abs_timeout > time ? abs_timeout - time : 0;
1205                 }
1206         }
1207
1208         return rws->fence_wait(rws, rfence->gfx, timeout);
1209 }
1210
1211 static void r600_query_memory_info(struct pipe_screen *screen,
1212                                    struct pipe_memory_info *info)
1213 {
1214         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
1215         struct radeon_winsys *ws = rscreen->ws;
1216         unsigned vram_usage, gtt_usage;
1217
1218         info->total_device_memory = rscreen->info.vram_size / 1024;
1219         info->total_staging_memory = rscreen->info.gart_size / 1024;
1220
1221         /* The real TTM memory usage is somewhat random, because:
1222          *
1223          * 1) TTM delays freeing memory, because it can only free it after
1224          *    fences expire.
1225          *
1226          * 2) The memory usage can be really low if big VRAM evictions are
1227          *    taking place, but the real usage is well above the size of VRAM.
1228          *
1229          * Instead, return statistics of this process.
1230          */
1231         vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024;
1232         gtt_usage =  ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024;
1233
1234         info->avail_device_memory =
1235                 vram_usage <= info->total_device_memory ?
1236                                 info->total_device_memory - vram_usage : 0;
1237         info->avail_staging_memory =
1238                 gtt_usage <= info->total_staging_memory ?
1239                                 info->total_staging_memory - gtt_usage : 0;
1240
1241         info->device_memory_evicted =
1242                 ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
1243
1244         if (rscreen->info.drm_major == 3 && rscreen->info.drm_minor >= 4)
1245                 info->nr_device_memory_evictions =
1246                         ws->query_value(ws, RADEON_NUM_EVICTIONS);
1247         else
1248                 /* Just return the number of evicted 64KB pages. */
1249                 info->nr_device_memory_evictions = info->device_memory_evicted / 64;
1250 }
1251
1252 struct pipe_resource *si_resource_create_common(struct pipe_screen *screen,
1253                                                 const struct pipe_resource *templ)
1254 {
1255         if (templ->target == PIPE_BUFFER) {
1256                 return si_buffer_create(screen, templ, 256);
1257         } else {
1258                 return si_texture_create(screen, templ);
1259         }
1260 }
1261
1262 bool si_common_screen_init(struct r600_common_screen *rscreen,
1263                            struct radeon_winsys *ws)
1264 {
1265         char family_name[32] = {}, llvm_string[32] = {}, kernel_version[128] = {};
1266         struct utsname uname_data;
1267         const char *chip_name;
1268
1269         ws->query_info(ws, &rscreen->info);
1270         rscreen->ws = ws;
1271
1272         if ((chip_name = r600_get_marketing_name(ws)))
1273                 snprintf(family_name, sizeof(family_name), "%s / ",
1274                          r600_get_family_name(rscreen) + 4);
1275         else
1276                 chip_name = r600_get_family_name(rscreen);
1277
1278         if (uname(&uname_data) == 0)
1279                 snprintf(kernel_version, sizeof(kernel_version),
1280                          " / %s", uname_data.release);
1281
1282         if (HAVE_LLVM > 0) {
1283                 snprintf(llvm_string, sizeof(llvm_string),
1284                          ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
1285                          HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
1286         }
1287
1288         snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
1289                  "%s (%sDRM %i.%i.%i%s%s)",
1290                  chip_name, family_name, rscreen->info.drm_major,
1291                  rscreen->info.drm_minor, rscreen->info.drm_patchlevel,
1292                  kernel_version, llvm_string);
1293
1294         rscreen->b.get_name = r600_get_name;
1295         rscreen->b.get_vendor = r600_get_vendor;
1296         rscreen->b.get_device_vendor = r600_get_device_vendor;
1297         rscreen->b.get_disk_shader_cache = r600_get_disk_shader_cache;
1298         rscreen->b.get_compute_param = r600_get_compute_param;
1299         rscreen->b.get_paramf = r600_get_paramf;
1300         rscreen->b.get_timestamp = r600_get_timestamp;
1301         rscreen->b.fence_finish = r600_fence_finish;
1302         rscreen->b.fence_reference = r600_fence_reference;
1303         rscreen->b.resource_destroy = u_resource_destroy_vtbl;
1304         rscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
1305         rscreen->b.query_memory_info = r600_query_memory_info;
1306
1307         if (rscreen->info.has_hw_decode) {
1308                 rscreen->b.get_video_param = si_vid_get_video_param;
1309                 rscreen->b.is_video_format_supported = si_vid_is_format_supported;
1310         } else {
1311                 rscreen->b.get_video_param = r600_get_video_param;
1312                 rscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
1313         }
1314
1315         si_init_screen_texture_functions(rscreen);
1316         si_init_screen_query_functions(rscreen);
1317
1318         rscreen->family = rscreen->info.family;
1319         rscreen->chip_class = rscreen->info.chip_class;
1320         rscreen->debug_flags |= debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
1321         rscreen->has_rbplus = false;
1322         rscreen->rbplus_allowed = false;
1323
1324         r600_disk_cache_create(rscreen);
1325
1326         slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64);
1327
1328         rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
1329         if (rscreen->force_aniso >= 0) {
1330                 printf("radeon: Forcing anisotropy filter to %ix\n",
1331                        /* round down to a power of two */
1332                        1 << util_logbase2(rscreen->force_aniso));
1333         }
1334
1335         (void) mtx_init(&rscreen->aux_context_lock, mtx_plain);
1336         (void) mtx_init(&rscreen->gpu_load_mutex, mtx_plain);
1337
1338         if (rscreen->debug_flags & DBG(INFO)) {
1339                 printf("pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n",
1340                        rscreen->info.pci_domain, rscreen->info.pci_bus,
1341                        rscreen->info.pci_dev, rscreen->info.pci_func);
1342                 printf("pci_id = 0x%x\n", rscreen->info.pci_id);
1343                 printf("family = %i (%s)\n", rscreen->info.family,
1344                        r600_get_family_name(rscreen));
1345                 printf("chip_class = %i\n", rscreen->info.chip_class);
1346                 printf("pte_fragment_size = %u\n", rscreen->info.pte_fragment_size);
1347                 printf("gart_page_size = %u\n", rscreen->info.gart_page_size);
1348                 printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024));
1349                 printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024));
1350                 printf("vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_vis_size, 1024*1024));
1351                 printf("max_alloc_size = %i MB\n",
1352                        (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024));
1353                 printf("min_alloc_size = %u\n", rscreen->info.min_alloc_size);
1354                 printf("has_dedicated_vram = %u\n", rscreen->info.has_dedicated_vram);
1355                 printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory);
1356                 printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2);
1357                 printf("has_hw_decode = %u\n", rscreen->info.has_hw_decode);
1358                 printf("num_sdma_rings = %i\n", rscreen->info.num_sdma_rings);
1359                 printf("num_compute_rings = %u\n", rscreen->info.num_compute_rings);
1360                 printf("uvd_fw_version = %u\n", rscreen->info.uvd_fw_version);
1361                 printf("vce_fw_version = %u\n", rscreen->info.vce_fw_version);
1362                 printf("me_fw_version = %i\n", rscreen->info.me_fw_version);
1363                 printf("me_fw_feature = %i\n", rscreen->info.me_fw_feature);
1364                 printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version);
1365                 printf("pfp_fw_feature = %i\n", rscreen->info.pfp_fw_feature);
1366                 printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version);
1367                 printf("ce_fw_feature = %i\n", rscreen->info.ce_fw_feature);
1368                 printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config);
1369                 printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq);
1370                 printf("tcc_cache_line_size = %u\n", rscreen->info.tcc_cache_line_size);
1371                 printf("drm = %i.%i.%i\n", rscreen->info.drm_major,
1372                        rscreen->info.drm_minor, rscreen->info.drm_patchlevel);
1373                 printf("has_userptr = %i\n", rscreen->info.has_userptr);
1374                 printf("has_syncobj = %u\n", rscreen->info.has_syncobj);
1375
1376                 printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes);
1377                 printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock);
1378                 printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units);
1379                 printf("max_se = %i\n", rscreen->info.max_se);
1380                 printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se);
1381
1382                 printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map);
1383                 printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid);
1384                 printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks);
1385                 printf("num_render_backends = %i\n", rscreen->info.num_render_backends);
1386                 printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
1387                 printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
1388                 printf("enabled_rb_mask = 0x%x\n", rscreen->info.enabled_rb_mask);
1389                 printf("max_alignment = %u\n", (unsigned)rscreen->info.max_alignment);
1390         }
1391         return true;
1392 }
1393
1394 void si_destroy_common_screen(struct r600_common_screen *rscreen)
1395 {
1396         si_perfcounters_destroy(rscreen);
1397         si_gpu_load_kill_thread(rscreen);
1398
1399         mtx_destroy(&rscreen->gpu_load_mutex);
1400         mtx_destroy(&rscreen->aux_context_lock);
1401         rscreen->aux_context->destroy(rscreen->aux_context);
1402
1403         slab_destroy_parent(&rscreen->pool_transfers);
1404
1405         disk_cache_destroy(rscreen->disk_shader_cache);
1406         rscreen->ws->destroy(rscreen->ws);
1407         FREE(rscreen);
1408 }
1409
1410 bool si_can_dump_shader(struct r600_common_screen *rscreen,
1411                         unsigned processor)
1412 {
1413         return rscreen->debug_flags & (1 << processor);
1414 }
1415
1416 bool si_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor)
1417 {
1418         return (rscreen->debug_flags & DBG(CHECK_IR)) ||
1419                si_can_dump_shader(rscreen, processor);
1420 }
1421
1422 void si_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
1423                             uint64_t offset, uint64_t size, unsigned value)
1424 {
1425         struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context;
1426
1427         mtx_lock(&rscreen->aux_context_lock);
1428         rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
1429         rscreen->aux_context->flush(rscreen->aux_context, NULL, 0);
1430         mtx_unlock(&rscreen->aux_context_lock);
1431 }